youtube: fix using the new layout and JSON extraction - frontends - front-ends … | |
Log | |
Files | |
Refs | |
README | |
LICENSE | |
--- | |
commit a9b8d9a25d11ec18fdee7fa98ad93db35325672a | |
parent 6f3fa93b7099d8bf5df5ba3fc04958aedd1bb099 | |
Author: Hiltjo Posthuma <[email protected]> | |
Date: Thu, 3 Sep 2020 11:23:10 +0200 | |
youtube: fix using the new layout and JSON extraction | |
Instead of scraping HTML from the site, it now extracts the initial JSON data | |
and parses it. | |
Diffstat: | |
M youtube/youtube.c | 399 ++++++++++++-----------------… | |
1 file changed, 149 insertions(+), 250 deletions(-) | |
--- | |
diff --git a/youtube/youtube.c b/youtube/youtube.c | |
@@ -11,291 +11,192 @@ | |
#include <unistd.h> | |
#include "https.h" | |
+#include "json.h" | |
#include "util.h" | |
#include "youtube.h" | |
-#include "xml.h" | |
- | |
-#define STRP(s) s,sizeof(s)-1 | |
- | |
-/* temporary variables to copy for states */ | |
-static char id[256], userid[256]; | |
- | |
-/* states */ | |
-static int metainfocount; | |
-static enum ItemState { | |
- None = 0, | |
- Item = 1, Pager = 2, | |
- Metainfo = 4, Title = 8, User = 16, Videotime = 32, | |
-} state; | |
- | |
-static struct item *videos; | |
-static size_t nvideos; | |
static char * | |
youtube_request(const char *path) | |
{ | |
- return request("www.youtube.com", path, | |
- "User-Agent: Mozilla/5.0 (compatible; Googlebot/2.1; +h… | |
-} | |
- | |
-static int | |
-isclassmatch(const char *classes, const char *clss, size_t len) | |
-{ | |
- const char *p; | |
- | |
- if (!(p = strstr(classes, clss))) | |
- return 0; | |
- return (p == classes || isspace((unsigned char)p[-1])) && | |
- (isspace((unsigned char)p[len]) || !p[len]); | |
-} | |
- | |
-/* XML/HTML entity conversion */ | |
-static const char * | |
-entitytostr(const char *s) | |
-{ | |
- static char buf[16]; | |
- ssize_t len; | |
- | |
- if ((len = xml_entitytostr(s, buf, sizeof(buf))) > 0) | |
- return buf; | |
- | |
- return s; | |
+ return request("www.youtube.com", path, ""); | |
} | |
-static void | |
-xmlattr(XMLParser *x, const char *t, size_t tl, const char *a, size_t al, | |
- const char *v, size_t vl) | |
+static char * | |
+request_search(const char *s, const char *page, const char *order) | |
{ | |
- /* grouped channel index, used for channelid and channel title */ | |
- static int grouped = -1; | |
- | |
- if (!strcmp(t, "div") && !strcmp(a, "class") && isclassmatch(v, STRP("… | |
- /* last video */ | |
- if (nvideos < MAX_VIDEOS && videos[nvideos].linktype) { | |
- if (grouped != -1 && !videos[nvideos].channelid[0]) { | |
- strlcpy(videos[nvideos].channelid, videos[grou… | |
- strlcpy(videos[nvideos].channeltitle, videos[g… | |
- } | |
- nvideos++; | |
- } | |
- state &= ~Item; | |
- state |= Pager; | |
- } | |
- | |
- if (nvideos >= MAX_VIDEOS) | |
- return; | |
+ char path[4096]; | |
- if (!strcmp(t, "div") && !strcmp(a, "class") && | |
- isclassmatch(v, STRP("yt-lockup"))) { | |
- state |= Item; | |
- if (videos[nvideos].linktype) { | |
- if (videos[nvideos].channelid[0] || videos[nvideos].us… | |
- videos[nvideos].linktype != Video) | |
- grouped = -1; | |
- if (videos[nvideos].linktype == Channel) | |
- grouped = nvideos; | |
- if (grouped != -1 && !videos[nvideos].channelid[0]) { | |
- strlcpy(videos[nvideos].channelid, videos[grou… | |
- strlcpy(videos[nvideos].channeltitle, videos[g… | |
- } | |
- nvideos++; | |
- } | |
- if (strstr(v, " yt-lockup-channel ")) | |
- videos[nvideos].linktype = Channel; | |
- else if (strstr(v, "yt-lockup-movie-")) | |
- videos[nvideos].linktype = Movie; | |
- else if (strstr(v, " yt-lockup-playlist ")) | |
- videos[nvideos].linktype = Playlist; | |
- if (strstr(v, " yt-lockup-video ")) | |
- videos[nvideos].linktype = Video; | |
- } | |
- if (!(state & Item)) | |
- return; | |
+ snprintf(path, sizeof(path), "/results?search_query=%s", s); | |
- if (!strcmp(t, "span") && !strcmp(a, "class") && isclassmatch(v, STRP(… | |
- state |= Videotime; | |
- if (!strcmp(t, "ul") && !strcmp(a, "class") && isclassmatch(v, STRP("y… | |
- state |= Metainfo; | |
- metainfocount = 0; | |
+ if (page[0]) { | |
+ strlcat(path, "&page=", sizeof(path)); | |
+ strlcat(path, page, sizeof(path)); | |
} | |
- if (!strcmp(t, "h3") && !strcmp(a, "class") && isclassmatch(v, STRP("y… | |
- state |= Title; | |
- if (!strcmp(t, "div") && !strcmp(a, "class") && isclassmatch(v, STRP("… | |
- state |= User; | |
- if ((state & Title) && !strcmp(t, "a") && !strcmp(a, "title")) { | |
- if (videos[nvideos].linktype == Channel) | |
- strlcat(videos[nvideos].channeltitle, v, sizeof(videos… | |
- else | |
- strlcat(videos[nvideos].title, v, sizeof(videos[nvideo… | |
+ if (order[0]) { | |
+ strlcat(path, "&search_sort=", sizeof(path)); | |
+ if (!strcmp(order, "date")) | |
+ strlcat(path, "video_date_uploaded", sizeof(path)); | |
+ else if (!strcmp(order, "relevance")) | |
+ strlcat(path, "video_relevance", sizeof(path)); | |
+ else if (!strcmp(order, "views")) | |
+ strlcat(path, "video_view_count", sizeof(path)); | |
+ else if (!strcmp(order, "rating")) | |
+ strlcat(path, "video_avg_rating", sizeof(path)); | |
} | |
- if ((state & Title) && !strcmp(t, "a") && !strcmp(a, "href")) | |
- strlcat(id, v, sizeof(id)); | |
- | |
- if (!strcmp(t, "button") && !strcmp(a, "data-channel-external-id")) | |
- strlcat(videos[nvideos].channelid, v, sizeof(videos[nvideos].c… | |
+ /* check if request is too long (truncation) */ | |
+ if (strlen(path) >= sizeof(path) - 1) | |
+ return NULL; | |
- if ((state & User) && !strcmp(t, "a") && !strcmp(a, "href")) | |
- strlcat(userid, v, sizeof(userid)); | |
+ return youtube_request(path); | |
} | |
-static void | |
-xmlattrentity(XMLParser *x, const char *t, size_t tl, const char *a, size_t al, | |
- const char *v, size_t vl) | |
+int | |
+extractjson(const char *s, char **start, char **end) | |
{ | |
- const char *s; | |
+ if (!(*start = strstr(s, "window[\"ytInitialData\"] = "))) | |
+ return -1; | |
+ if (!(*end = strstr(*start, "};\n"))) | |
+ return -1; | |
- if (!(state & Pager) && nvideos >= MAX_VIDEOS) | |
- return; | |
+ (*start) += sizeof("window[\"ytInitialData\"] = ") - 1; | |
+ (*end)++; | |
- s = entitytostr(v); | |
- xmlattr(x, t, tl, a, al, s, strlen(s)); | |
+ return 0; | |
} | |
-static void | |
-xmldata(XMLParser *x, const char *d, size_t dl) | |
+void | |
+processnode(struct json_node *nodes, size_t depth, const char *value, | |
+ void *pp) | |
{ | |
- if ((state & Pager)) | |
- return; | |
+ struct search_response *r = (struct search_response *)pp; | |
+ static struct item *item; | |
- /* optimization: no need to process and must not process videos after … | |
- if (!state || nvideos >= MAX_VIDEOS) | |
+ if (r->nitems > MAX_VIDEOS) | |
return; | |
- /* use parsed link type for meta info since this metainfo differs per … | |
- channel, playlist, video */ | |
- if ((state & Metainfo)) { | |
- switch (videos[nvideos].linktype) { | |
- case Playlist: | |
- break; /* ignore */ | |
- case Channel: | |
- if (metainfocount == 1) | |
- strlcat(videos[nvideos].channelvideos, d, size… | |
- break; | |
- default: | |
- if (metainfocount == 1) | |
- strlcat(videos[nvideos].publishedat, d, sizeof… | |
- else if (metainfocount == 2) | |
- strlcat(videos[nvideos].viewcount, d, sizeof(v… | |
- } | |
+ /* new item, structures can be very deep, just check the end for: | |
+ (items|contents)[].videoRenderer objects */ | |
+ if (depth >= 3 && | |
+ nodes[depth - 3].type == TYPE_ARRAY && | |
+ nodes[depth - 2].type == TYPE_OBJECT && | |
+ nodes[depth - 1].type == TYPE_OBJECT && | |
+ (!strcmp(nodes[depth - 3].name, "items") || | |
+ !strcmp(nodes[depth - 3].name, "contents")) && | |
+ !strcmp(nodes[depth - 1].name, "videoRenderer")) { | |
+ r->nitems++; | |
+ return; | |
} | |
- if ((state & Videotime) && !strcmp(x->tag, "span")) | |
- strlcat(videos[nvideos].duration, d, sizeof(videos[nvideos].du… | |
- if ((state & User) && !strcmp(x->tag, "a")) | |
- strlcat(videos[nvideos].channeltitle, d, sizeof(videos[nvideos… | |
-} | |
- | |
-static void | |
-xmldataentity(XMLParser *x, const char *d, size_t dl) | |
-{ | |
- const char *s; | |
- /* optimization: no need for entity conversion */ | |
- if (!state || nvideos >= MAX_VIDEOS) | |
+ if (r->nitems == 0) | |
return; | |
- | |
- s = entitytostr(d); | |
- xmldata(x, s, strlen(s)); | |
-} | |
- | |
-static void | |
-xmltagend(XMLParser *x, const char *t, size_t tl, int isshort) | |
-{ | |
- char *p; | |
- | |
- if ((state & Metainfo) && !strcmp(t, "ul")) | |
- state &= ~Metainfo; | |
- if ((state & Title) && !strcmp(t, "h3")) { | |
- state &= ~Title; | |
- | |
- if (nvideos >= MAX_VIDEOS) | |
- return; | |
- | |
- if (!strncmp(id, "/watch", sizeof("/watch") - 1)) { | |
- if (!videos[nvideos].linktype) | |
- videos[nvideos].linktype = Video; | |
- if ((p = getparam(id, "v"))) { | |
- if (decodeparam(videos[nvideos].id, sizeof(vid… | |
- videos[nvideos].id[0] = '\0'; | |
- } | |
- } | |
- | |
- id[0] = '\0'; | |
+ item = &(r->items[r->nitems - 1]); | |
+ | |
+ if (depth >= 4 && | |
+ nodes[depth - 4].type == TYPE_ARRAY && | |
+ nodes[depth - 3].type == TYPE_OBJECT && | |
+ nodes[depth - 2].type == TYPE_OBJECT && | |
+ nodes[depth - 1].type == TYPE_STRING && | |
+ (!strcmp(nodes[depth - 4].name, "items") || | |
+ !strcmp(nodes[depth - 4].name, "contents")) && | |
+ !strcmp(nodes[depth - 2].name, "videoRenderer") && | |
+ !strcmp(nodes[depth - 1].name, "videoId")) { | |
+ strlcpy(item->id, value, sizeof(item->id)); | |
} | |
- if ((state & User)) { | |
- state &= ~User; | |
- if (nvideos >= MAX_VIDEOS) | |
- return; | |
+ if (depth >= 7 && | |
+ nodes[depth - 7].type == TYPE_ARRAY && | |
+ nodes[depth - 6].type == TYPE_OBJECT && | |
+ nodes[depth - 5].type == TYPE_OBJECT && | |
+ nodes[depth - 4].type == TYPE_OBJECT && | |
+ nodes[depth - 3].type == TYPE_ARRAY && | |
+ nodes[depth - 2].type == TYPE_OBJECT && | |
+ nodes[depth - 1].type == TYPE_STRING && | |
+ (!strcmp(nodes[depth - 7].name, "items") || | |
+ !strcmp(nodes[depth - 7].name, "contents")) && | |
+ !strcmp(nodes[depth - 5].name, "videoRenderer") && | |
+ !strcmp(nodes[depth - 4].name, "title") && | |
+ !strcmp(nodes[depth - 3].name, "runs") && | |
+ !strcmp(nodes[depth - 1].name, "text") && | |
+ !item->title[0]) { | |
+ strlcpy(item->title, value, sizeof(item->title)); | |
+ } | |
- /* can be user or channel */ | |
- if (!strncmp(userid, "/channel/", sizeof("/channel/") - 1)) { | |
- strlcpy(videos[nvideos].channelid, | |
- userid + sizeof("/channel/") - 1, | |
- sizeof(videos[nvideos].channelid)); | |
- } else if (!strncmp(userid, "/user/", sizeof("/user/") - 1)) { | |
- strlcpy(videos[nvideos].userid, | |
- userid + sizeof("/user/") - 1, | |
- sizeof(videos[nvideos].userid)); | |
+ if (depth >= 5 && | |
+ nodes[depth - 5].type == TYPE_ARRAY && | |
+ nodes[depth - 4].type == TYPE_OBJECT && | |
+ nodes[depth - 3].type == TYPE_OBJECT && | |
+ nodes[depth - 2].type == TYPE_OBJECT && | |
+ nodes[depth - 1].type == TYPE_STRING && | |
+ (!strcmp(nodes[depth - 5].name, "items") || | |
+ !strcmp(nodes[depth - 5].name, "contents")) && | |
+ !strcmp(nodes[depth - 3].name, "videoRenderer") && | |
+ !strcmp(nodes[depth - 1].name, "simpleText")) { | |
+ if (!strcmp(nodes[depth - 2].name, "viewCountText") && | |
+ !item->viewcount[0]) { | |
+ strlcpy(item->viewcount, value, sizeof(item->viewcount… | |
+ } else if (!strcmp(nodes[depth - 2].name, "lengthText") && | |
+ !item->duration[0]) { | |
+ strlcpy(item->duration, value, sizeof(item->duration)); | |
+ } else if (!strcmp(nodes[depth - 2].name, "publishedTimeText")… | |
+ !item->publishedat[0]) { | |
+ strlcpy(item->publishedat, value, sizeof(item->publish… | |
} | |
- | |
- userid[0] = '\0'; | |
} | |
- if ((state & Videotime)) | |
- state &= ~Videotime; | |
-} | |
-static void | |
-xmltagstart(XMLParser *x, const char *t, size_t tl) | |
-{ | |
- if ((state & Metainfo) && !strcmp(t, "li")) | |
- metainfocount++; | |
-} | |
- | |
-static char * | |
-request_search(const char *s, const char *page, const char *order) | |
-{ | |
- char path[4096]; | |
- | |
- snprintf(path, sizeof(path), "/results?search_query=%s", s); | |
- if (page[0]) { | |
- strlcat(path, "&page=", sizeof(path)); | |
- strlcat(path, page, sizeof(path)); | |
+ if (depth >= 9 && | |
+ nodes[depth - 9].type == TYPE_ARRAY && | |
+ nodes[depth - 8].type == TYPE_OBJECT && | |
+ nodes[depth - 7].type == TYPE_OBJECT && | |
+ nodes[depth - 6].type == TYPE_OBJECT && | |
+ nodes[depth - 5].type == TYPE_ARRAY && | |
+ nodes[depth - 4].type == TYPE_OBJECT && | |
+ nodes[depth - 3].type == TYPE_OBJECT && | |
+ nodes[depth - 2].type == TYPE_OBJECT && | |
+ nodes[depth - 1].type == TYPE_STRING && | |
+ (!strcmp(nodes[depth - 9].name, "items") || | |
+ !strcmp(nodes[depth - 9].name, "contents")) && | |
+ !strcmp(nodes[depth - 7].name, "videoRenderer") && | |
+ !strcmp(nodes[depth - 6].name, "longBylineText") && | |
+ !strcmp(nodes[depth - 5].name, "runs") && | |
+ !strcmp(nodes[depth - 3].name, "navigationEndpoint") && | |
+ !strcmp(nodes[depth - 2].name, "browseEndpoint")) { | |
+ if (!strcmp(nodes[depth - 1].name, "browseId")) { | |
+ strlcpy(item->channelid, value, sizeof(item->channelid… | |
+ } | |
} | |
- if (order[0]) { | |
- strlcat(path, "&search_sort=", sizeof(path)); | |
- if (!strcmp(order, "date")) | |
- strlcat(path, "video_date_uploaded", sizeof(path)); | |
- else if (!strcmp(order, "relevance")) | |
- strlcat(path, "video_relevance", sizeof(path)); | |
- else if (!strcmp(order, "views")) | |
- strlcat(path, "video_view_count", sizeof(path)); | |
- else if (!strcmp(order, "rating")) | |
- strlcat(path, "video_avg_rating", sizeof(path)); | |
+ if (depth >= 7 && | |
+ nodes[depth - 7].type == TYPE_ARRAY && | |
+ nodes[depth - 6].type == TYPE_OBJECT && | |
+ nodes[depth - 5].type == TYPE_OBJECT && | |
+ nodes[depth - 4].type == TYPE_OBJECT && | |
+ nodes[depth - 3].type == TYPE_ARRAY && | |
+ nodes[depth - 2].type == TYPE_OBJECT && | |
+ nodes[depth - 1].type == TYPE_STRING && | |
+ (!strcmp(nodes[depth - 7].name, "items") || | |
+ !strcmp(nodes[depth - 7].name, "contents")) && | |
+ !strcmp(nodes[depth - 5].name, "videoRenderer") && | |
+ !strcmp(nodes[depth - 4].name, "longBylineText") && | |
+ !strcmp(nodes[depth - 3].name, "runs")) { | |
+ if (!strcmp(nodes[depth - 1].name, "text") && | |
+ !item->channeltitle[0]) { | |
+ strlcpy(item->channeltitle, value, sizeof(item->channe… | |
+ } | |
} | |
- | |
- /* force older youtube layout, else youtube will try to randomly serve | |
- a new layout sometimes breaking the parsing */ | |
- strlcat(path, "&disable_polymer=1", sizeof(path)); | |
- | |
- /* check if request is too long (truncation) */ | |
- if (strlen(path) >= sizeof(path) - 1) | |
- return NULL; | |
- | |
- return youtube_request(path); | |
} | |
struct search_response * | |
youtube_search(const char *rawsearch, const char *page, const char *order) | |
{ | |
struct search_response *r; | |
- XMLParser x = { 0 }; | |
- char *data, *s; | |
+ char *data, *s, *start, *end; | |
+ int ret; | |
if (!(data = request_search(rawsearch, page, order))) | |
return NULL; | |
+ | |
if (!(s = strstr(data, "\r\n\r\n"))) | |
return NULL; /* invalid response */ | |
/* skip header */ | |
@@ -304,20 +205,18 @@ youtube_search(const char *rawsearch, const char *page, c… | |
if (!(r = calloc(1, sizeof(*r)))) | |
return NULL; | |
- nvideos = 0; | |
- videos = r->items; | |
- | |
- x.xmlattr = xmlattr; | |
- x.xmlattrentity = xmlattrentity; | |
- x.xmldata = xmldata; | |
- x.xmldataentity = xmldataentity; | |
- x.xmltagend = xmltagend; | |
- x.xmltagstart = xmltagstart; | |
- | |
- setxmldata(s, strlen(s)); | |
- xml_parse(&x); | |
+ if (extractjson(s, &start, &end) == -1) { | |
+// fprintf(stderr, "error extracting JSON"); | |
+ free(r); | |
+ return NULL; | |
+ } | |
- r->nitems = nvideos; | |
+ ret = parsejson(start, end - start, processnode, r); | |
+ if (ret < 0) { | |
+// fprintf(stderr, "error parsing JSON"); | |
+ free(r); | |
+ return NULL; | |
+ } | |
return r; | |
} |