GopherProxy

	youtube: fix using the new layout and JSON extraction - frontends - front-ends …
	Log
	Files
	Refs
	README
	LICENSE
	---
	commit a9b8d9a25d11ec18fdee7fa98ad93db35325672a
	parent 6f3fa93b7099d8bf5df5ba3fc04958aedd1bb099
	Author: Hiltjo Posthuma <[email protected]>
	Date: Thu, 3 Sep 2020 11:23:10 +0200

	youtube: fix using the new layout and JSON extraction

	Instead of scraping HTML from the site it now extracts the initial JSON data
	and parses it.

	Diffstat:
	M youtube/youtube.c \| 399 ++++++++++++-----------------…

	1 file changed, 149 insertions(+), 250 deletions(-)
	---
	diff --git a/youtube/youtube.c b/youtube/youtube.c
	@@ -11,291 +11,192 @@
	#include <unistd.h>

	#include "https.h"
	+#include "json.h"
	#include "util.h"
	#include "youtube.h"
	-#include "xml.h"
	-
	-#define STRP(s) s,sizeof(s)-1
	-
	-/* temporary variables to copy for states */
	-static char id[256], userid[256];
	-
	-/* states */
	-static int metainfocount;
	-static enum ItemState {
	- None = 0,
	- Item = 1, Pager = 2,
	- Metainfo = 4, Title = 8, User = 16, Videotime = 32,
	-} state;
	-
	-static struct item *videos;
	-static size_t nvideos;

	static char *
	youtube_request(const char *path)
	{
	- return request("www.youtube.com", path,
	- "User-Agent: Mozilla/5.0 (compatible; Googlebot/2.1; +h…
	-}
	-
	-static int
	-isclassmatch(const char *classes, const char *clss, size_t len)
	-{
	- const char *p;
	-
	- if (!(p = strstr(classes, clss)))
	- return 0;
	- return (p == classes \|\| isspace((unsigned char)p[-1])) &&
	- (isspace((unsigned char)p[len]) \|\| !p[len]);
	-}
	-
	-/* XML/HTML entity conversion */
	-static const char *
	-entitytostr(const char *s)
	-{
	- static char buf[16];
	- ssize_t len;
	-
	- if ((len = xml_entitytostr(s, buf, sizeof(buf))) > 0)
	- return buf;
	-
	- return s;
	+ return request("www.youtube.com", path, "");
	}

	-static void
	-xmlattr(XMLParser *x, const char *t, size_t tl, const char *a, size_t al,
	- const char *v, size_t vl)
	+static char *
	+request_search(const char *s, const char *page, const char *order)
	{
	- /* grouped channel index, used for channelid and channel title */
	- static int grouped = -1;
	-
	- if (!strcmp(t, "div") && !strcmp(a, "class") && isclassmatch(v, STRP("…
	- /* last video */
	- if (nvideos < MAX_VIDEOS && videos[nvideos].linktype) {
	- if (grouped != -1 && !videos[nvideos].channelid[0]) {
	- strlcpy(videos[nvideos].channelid, videos[grou…
	- strlcpy(videos[nvideos].channeltitle, videos[g…
	- }
	- nvideos++;
	- }
	- state &= ~Item;
	- state \|= Pager;
	- }
	-
	- if (nvideos >= MAX_VIDEOS)
	- return;
	+ char path[4096];

	- if (!strcmp(t, "div") && !strcmp(a, "class") &&
	- isclassmatch(v, STRP("yt-lockup"))) {
	- state \|= Item;
	- if (videos[nvideos].linktype) {
	- if (videos[nvideos].channelid[0] \|\| videos[nvideos].us…
	- videos[nvideos].linktype != Video)
	- grouped = -1;
	- if (videos[nvideos].linktype == Channel)
	- grouped = nvideos;
	- if (grouped != -1 && !videos[nvideos].channelid[0]) {
	- strlcpy(videos[nvideos].channelid, videos[grou…
	- strlcpy(videos[nvideos].channeltitle, videos[g…
	- }
	- nvideos++;
	- }
	- if (strstr(v, " yt-lockup-channel "))
	- videos[nvideos].linktype = Channel;
	- else if (strstr(v, "yt-lockup-movie-"))
	- videos[nvideos].linktype = Movie;
	- else if (strstr(v, " yt-lockup-playlist "))
	- videos[nvideos].linktype = Playlist;
	- if (strstr(v, " yt-lockup-video "))
	- videos[nvideos].linktype = Video;
	- }
	- if (!(state & Item))
	- return;
	+ snprintf(path, sizeof(path), "/results?search_query=%s", s);

	- if (!strcmp(t, "span") && !strcmp(a, "class") && isclassmatch(v, STRP(…
	- state \|= Videotime;
	- if (!strcmp(t, "ul") && !strcmp(a, "class") && isclassmatch(v, STRP("y…
	- state \|= Metainfo;
	- metainfocount = 0;
	+ if (page[0]) {
	+ strlcat(path, "&page=", sizeof(path));
	+ strlcat(path, page, sizeof(path));
	}
	- if (!strcmp(t, "h3") && !strcmp(a, "class") && isclassmatch(v, STRP("y…
	- state \|= Title;
	- if (!strcmp(t, "div") && !strcmp(a, "class") && isclassmatch(v, STRP("…
	- state \|= User;

	- if ((state & Title) && !strcmp(t, "a") && !strcmp(a, "title")) {
	- if (videos[nvideos].linktype == Channel)
	- strlcat(videos[nvideos].channeltitle, v, sizeof(videos…
	- else
	- strlcat(videos[nvideos].title, v, sizeof(videos[nvideo…
	+ if (order[0]) {
	+ strlcat(path, "&search_sort=", sizeof(path));
	+ if (!strcmp(order, "date"))
	+ strlcat(path, "video_date_uploaded", sizeof(path));
	+ else if (!strcmp(order, "relevance"))
	+ strlcat(path, "video_relevance", sizeof(path));
	+ else if (!strcmp(order, "views"))
	+ strlcat(path, "video_view_count", sizeof(path));
	+ else if (!strcmp(order, "rating"))
	+ strlcat(path, "video_avg_rating", sizeof(path));
	}

	- if ((state & Title) && !strcmp(t, "a") && !strcmp(a, "href"))
	- strlcat(id, v, sizeof(id));
	-
	- if (!strcmp(t, "button") && !strcmp(a, "data-channel-external-id"))
	- strlcat(videos[nvideos].channelid, v, sizeof(videos[nvideos].c…
	+ /* check if request is too long (truncation) */
	+ if (strlen(path) >= sizeof(path) - 1)
	+ return NULL;

	- if ((state & User) && !strcmp(t, "a") && !strcmp(a, "href"))
	- strlcat(userid, v, sizeof(userid));
	+ return youtube_request(path);
	}

	-static void
	-xmlattrentity(XMLParser *x, const char *t, size_t tl, const char *a, size_t al,
	- const char *v, size_t vl)
	+int
	+extractjson(const char *s, char **start, char **end)
	{
	- const char *s;
	+ if (!(*start = strstr(s, "window[\"ytInitialData\"] = ")))
	+ return -1;
	+ if (!(*end = strstr(*start, "};\n")))
	+ return -1;

	- if (!(state & Pager) && nvideos >= MAX_VIDEOS)
	- return;
	+ (*start) += sizeof("window[\"ytInitialData\"] = ") - 1;
	+ (*end)++;

	- s = entitytostr(v);
	- xmlattr(x, t, tl, a, al, s, strlen(s));
	+ return 0;
	}

	-static void
	-xmldata(XMLParser *x, const char *d, size_t dl)
	+void
	+processnode(struct json_node *nodes, size_t depth, const char *value,
	+ void *pp)
	{
	- if ((state & Pager))
	- return;
	+ struct search_response *r = (struct search_response *)pp;
	+ static struct item *item;

	- /* optimization: no need to process and must not process videos after …
	- if (!state \|\| nvideos >= MAX_VIDEOS)
	+ if (r->nitems > MAX_VIDEOS)
	return;

	- /* use parsed link type for meta info since this metainfo differs per …
	- channel, playlist, video */
	- if ((state & Metainfo)) {
	- switch (videos[nvideos].linktype) {
	- case Playlist:
	- break; /* ignore */
	- case Channel:
	- if (metainfocount == 1)
	- strlcat(videos[nvideos].channelvideos, d, size…
	- break;
	- default:
	- if (metainfocount == 1)
	- strlcat(videos[nvideos].publishedat, d, sizeof…
	- else if (metainfocount == 2)
	- strlcat(videos[nvideos].viewcount, d, sizeof(v…
	- }
	+ /* new item, structures can be very deep, just check the end for:
	+ (items\|contents)[].videoRenderer objects */
	+ if (depth >= 3 &&
	+ nodes[depth - 3].type == TYPE_ARRAY &&
	+ nodes[depth - 2].type == TYPE_OBJECT &&
	+ nodes[depth - 1].type == TYPE_OBJECT &&
	+ (!strcmp(nodes[depth - 3].name, "items") \|\|
	+ !strcmp(nodes[depth - 3].name, "contents")) &&
	+ !strcmp(nodes[depth - 1].name, "videoRenderer")) {
	+ r->nitems++;
	+ return;
	}
	- if ((state & Videotime) && !strcmp(x->tag, "span"))
	- strlcat(videos[nvideos].duration, d, sizeof(videos[nvideos].du…
	- if ((state & User) && !strcmp(x->tag, "a"))
	- strlcat(videos[nvideos].channeltitle, d, sizeof(videos[nvideos…
	-}
	-
	-static void
	-xmldataentity(XMLParser *x, const char *d, size_t dl)
	-{
	- const char *s;

	- /* optimization: no need for entity conversion */
	- if (!state \|\| nvideos >= MAX_VIDEOS)
	+ if (r->nitems == 0)
	return;
	-
	- s = entitytostr(d);
	- xmldata(x, s, strlen(s));
	-}
	-
	-static void
	-xmltagend(XMLParser *x, const char *t, size_t tl, int isshort)
	-{
	- char *p;
	-
	- if ((state & Metainfo) && !strcmp(t, "ul"))
	- state &= ~Metainfo;
	- if ((state & Title) && !strcmp(t, "h3")) {
	- state &= ~Title;
	-
	- if (nvideos >= MAX_VIDEOS)
	- return;
	-
	- if (!strncmp(id, "/watch", sizeof("/watch") - 1)) {
	- if (!videos[nvideos].linktype)
	- videos[nvideos].linktype = Video;
	- if ((p = getparam(id, "v"))) {
	- if (decodeparam(videos[nvideos].id, sizeof(vid…
	- videos[nvideos].id[0] = '\0';
	- }
	- }
	-
	- id[0] = '\0';
	+ item = &(r->items[r->nitems - 1]);
	+
	+ if (depth >= 4 &&
	+ nodes[depth - 4].type == TYPE_ARRAY &&
	+ nodes[depth - 3].type == TYPE_OBJECT &&
	+ nodes[depth - 2].type == TYPE_OBJECT &&
	+ nodes[depth - 1].type == TYPE_STRING &&
	+ (!strcmp(nodes[depth - 4].name, "items") \|\|
	+ !strcmp(nodes[depth - 4].name, "contents")) &&
	+ !strcmp(nodes[depth - 2].name, "videoRenderer") &&
	+ !strcmp(nodes[depth - 1].name, "videoId")) {
	+ strlcpy(item->id, value, sizeof(item->id));
	}
	- if ((state & User)) {
	- state &= ~User;

	- if (nvideos >= MAX_VIDEOS)
	- return;
	+ if (depth >= 7 &&
	+ nodes[depth - 7].type == TYPE_ARRAY &&
	+ nodes[depth - 6].type == TYPE_OBJECT &&
	+ nodes[depth - 5].type == TYPE_OBJECT &&
	+ nodes[depth - 4].type == TYPE_OBJECT &&
	+ nodes[depth - 3].type == TYPE_ARRAY &&
	+ nodes[depth - 2].type == TYPE_OBJECT &&
	+ nodes[depth - 1].type == TYPE_STRING &&
	+ (!strcmp(nodes[depth - 7].name, "items") \|\|
	+ !strcmp(nodes[depth - 7].name, "contents")) &&
	+ !strcmp(nodes[depth - 5].name, "videoRenderer") &&
	+ !strcmp(nodes[depth - 4].name, "title") &&
	+ !strcmp(nodes[depth - 3].name, "runs") &&
	+ !strcmp(nodes[depth - 1].name, "text") &&
	+ !item->title[0]) {
	+ strlcpy(item->title, value, sizeof(item->title));
	+ }

	- /* can be user or channel */
	- if (!strncmp(userid, "/channel/", sizeof("/channel/") - 1)) {
	- strlcpy(videos[nvideos].channelid,
	- userid + sizeof("/channel/") - 1,
	- sizeof(videos[nvideos].channelid));
	- } else if (!strncmp(userid, "/user/", sizeof("/user/") - 1)) {
	- strlcpy(videos[nvideos].userid,
	- userid + sizeof("/user/") - 1,
	- sizeof(videos[nvideos].userid));
	+ if (depth >= 5 &&
	+ nodes[depth - 5].type == TYPE_ARRAY &&
	+ nodes[depth - 4].type == TYPE_OBJECT &&
	+ nodes[depth - 3].type == TYPE_OBJECT &&
	+ nodes[depth - 2].type == TYPE_OBJECT &&
	+ nodes[depth - 1].type == TYPE_STRING &&
	+ (!strcmp(nodes[depth - 5].name, "items") \|\|
	+ !strcmp(nodes[depth - 5].name, "contents")) &&
	+ !strcmp(nodes[depth - 3].name, "videoRenderer") &&
	+ !strcmp(nodes[depth - 1].name, "simpleText")) {
	+ if (!strcmp(nodes[depth - 2].name, "viewCountText") &&
	+ !item->viewcount[0]) {
	+ strlcpy(item->viewcount, value, sizeof(item->viewcount…
	+ } else if (!strcmp(nodes[depth - 2].name, "lengthText") &&
	+ !item->duration[0]) {
	+ strlcpy(item->duration, value, sizeof(item->duration));
	+ } else if (!strcmp(nodes[depth - 2].name, "publishedTimeText")…
	+ !item->publishedat[0]) {
	+ strlcpy(item->publishedat, value, sizeof(item->publish…
	}
	-
	- userid[0] = '\0';
	}
	- if ((state & Videotime))
	- state &= ~Videotime;
	-}

	-static void
	-xmltagstart(XMLParser *x, const char *t, size_t tl)
	-{
	- if ((state & Metainfo) && !strcmp(t, "li"))
	- metainfocount++;
	-}
	-
	-static char *
	-request_search(const char *s, const char *page, const char *order)
	-{
	- char path[4096];
	-
	- snprintf(path, sizeof(path), "/results?search_query=%s", s);
	- if (page[0]) {
	- strlcat(path, "&page=", sizeof(path));
	- strlcat(path, page, sizeof(path));
	+ if (depth >= 9 &&
	+ nodes[depth - 9].type == TYPE_ARRAY &&
	+ nodes[depth - 8].type == TYPE_OBJECT &&
	+ nodes[depth - 7].type == TYPE_OBJECT &&
	+ nodes[depth - 6].type == TYPE_OBJECT &&
	+ nodes[depth - 5].type == TYPE_ARRAY &&
	+ nodes[depth - 4].type == TYPE_OBJECT &&
	+ nodes[depth - 3].type == TYPE_OBJECT &&
	+ nodes[depth - 2].type == TYPE_OBJECT &&
	+ nodes[depth - 1].type == TYPE_STRING &&
	+ (!strcmp(nodes[depth - 9].name, "items") \|\|
	+ !strcmp(nodes[depth - 9].name, "contents")) &&
	+ !strcmp(nodes[depth - 7].name, "videoRenderer") &&
	+ !strcmp(nodes[depth - 6].name, "longBylineText") &&
	+ !strcmp(nodes[depth - 5].name, "runs") &&
	+ !strcmp(nodes[depth - 3].name, "navigationEndpoint") &&
	+ !strcmp(nodes[depth - 2].name, "browseEndpoint")) {
	+ if (!strcmp(nodes[depth - 1].name, "browseId")) {
	+ strlcpy(item->channelid, value, sizeof(item->channelid…
	+ }
	}

	- if (order[0]) {
	- strlcat(path, "&search_sort=", sizeof(path));
	- if (!strcmp(order, "date"))
	- strlcat(path, "video_date_uploaded", sizeof(path));
	- else if (!strcmp(order, "relevance"))
	- strlcat(path, "video_relevance", sizeof(path));
	- else if (!strcmp(order, "views"))
	- strlcat(path, "video_view_count", sizeof(path));
	- else if (!strcmp(order, "rating"))
	- strlcat(path, "video_avg_rating", sizeof(path));
	+ if (depth >= 7 &&
	+ nodes[depth - 7].type == TYPE_ARRAY &&
	+ nodes[depth - 6].type == TYPE_OBJECT &&
	+ nodes[depth - 5].type == TYPE_OBJECT &&
	+ nodes[depth - 4].type == TYPE_OBJECT &&
	+ nodes[depth - 3].type == TYPE_ARRAY &&
	+ nodes[depth - 2].type == TYPE_OBJECT &&
	+ nodes[depth - 1].type == TYPE_STRING &&
	+ (!strcmp(nodes[depth - 7].name, "items") \|\|
	+ !strcmp(nodes[depth - 7].name, "contents")) &&
	+ !strcmp(nodes[depth - 5].name, "videoRenderer") &&
	+ !strcmp(nodes[depth - 4].name, "longBylineText") &&
	+ !strcmp(nodes[depth - 3].name, "runs")) {
	+ if (!strcmp(nodes[depth - 1].name, "text") &&
	+ !item->channeltitle[0]) {
	+ strlcpy(item->channeltitle, value, sizeof(item->channe…
	+ }
	}
	-
	- /* force older youtube layout, else youtube will try to randomly serve
	- a new layout sometimes breaking the parsing */
	- strlcat(path, "&disable_polymer=1", sizeof(path));
	-
	- /* check if request is too long (truncation) */
	- if (strlen(path) >= sizeof(path) - 1)
	- return NULL;
	-
	- return youtube_request(path);
	}

	struct search_response *
	youtube_search(const char *rawsearch, const char *page, const char *order)
	{
	struct search_response *r;
	- XMLParser x = { 0 };
	- char *data, *s;
	+ char *data, *s, *start, *end;
	+ int ret;

	if (!(data = request_search(rawsearch, page, order)))
	return NULL;
	+
	if (!(s = strstr(data, "\r\n\r\n")))
	return NULL; /* invalid response */
	/* skip header */
	@@ -304,20 +205,18 @@ youtube_search(const char rawsearch, const char page, c…
	if (!(r = calloc(1, sizeof(*r))))
	return NULL;

	- nvideos = 0;
	- videos = r->items;
	-
	- x.xmlattr = xmlattr;
	- x.xmlattrentity = xmlattrentity;
	- x.xmldata = xmldata;
	- x.xmldataentity = xmldataentity;
	- x.xmltagend = xmltagend;
	- x.xmltagstart = xmltagstart;
	-
	- setxmldata(s, strlen(s));
	- xml_parse(&x);
	+ if (extractjson(s, &start, &end) == -1) {
	+// fprintf(stderr, "error extracting JSON");
	+ free(r);
	+ return NULL;
	+ }

	- r->nitems = nvideos;
	+ ret = parsejson(start, end - start, processnode, r);
	+ if (ret < 0) {
	+// fprintf(stderr, "error parsing JSON");
	+ free(r);
	+ return NULL;
	+ }

	return r;
	}