Introduction
Introduction Statistics Contact Development Disclaimer Help
youtube: fix using the new layout and JSON extraction - frontends - front-ends …
Log
Files
Refs
README
LICENSE
---
commit a9b8d9a25d11ec18fdee7fa98ad93db35325672a
parent 6f3fa93b7099d8bf5df5ba3fc04958aedd1bb099
Author: Hiltjo Posthuma <[email protected]>
Date: Thu, 3 Sep 2020 11:23:10 +0200
youtube: fix using the new layout and JSON extraction
Instead of scraping HTML from the site, it now extracts the initial JSON data
and parses it.
Diffstat:
M youtube/youtube.c | 399 ++++++++++++-----------------…
1 file changed, 149 insertions(+), 250 deletions(-)
---
diff --git a/youtube/youtube.c b/youtube/youtube.c
@@ -11,291 +11,192 @@
#include <unistd.h>
#include "https.h"
+#include "json.h"
#include "util.h"
#include "youtube.h"
-#include "xml.h"
-
-#define STRP(s) s,sizeof(s)-1
-
-/* temporary variables to copy for states */
-static char id[256], userid[256];
-
-/* states */
-static int metainfocount;
-static enum ItemState {
- None = 0,
- Item = 1, Pager = 2,
- Metainfo = 4, Title = 8, User = 16, Videotime = 32,
-} state;
-
-static struct item *videos;
-static size_t nvideos;
static char *
youtube_request(const char *path)
{
- return request("www.youtube.com", path,
- "User-Agent: Mozilla/5.0 (compatible; Googlebot/2.1; +h…
-}
-
-static int
-isclassmatch(const char *classes, const char *clss, size_t len)
-{
- const char *p;
-
- if (!(p = strstr(classes, clss)))
- return 0;
- return (p == classes || isspace((unsigned char)p[-1])) &&
- (isspace((unsigned char)p[len]) || !p[len]);
-}
-
-/* XML/HTML entity conversion */
-static const char *
-entitytostr(const char *s)
-{
- static char buf[16];
- ssize_t len;
-
- if ((len = xml_entitytostr(s, buf, sizeof(buf))) > 0)
- return buf;
-
- return s;
+ return request("www.youtube.com", path, "");
}
-static void
-xmlattr(XMLParser *x, const char *t, size_t tl, const char *a, size_t al,
- const char *v, size_t vl)
+static char *
+request_search(const char *s, const char *page, const char *order)
{
- /* grouped channel index, used for channelid and channel title */
- static int grouped = -1;
-
- if (!strcmp(t, "div") && !strcmp(a, "class") && isclassmatch(v, STRP("…
- /* last video */
- if (nvideos < MAX_VIDEOS && videos[nvideos].linktype) {
- if (grouped != -1 && !videos[nvideos].channelid[0]) {
- strlcpy(videos[nvideos].channelid, videos[grou…
- strlcpy(videos[nvideos].channeltitle, videos[g…
- }
- nvideos++;
- }
- state &= ~Item;
- state |= Pager;
- }
-
- if (nvideos >= MAX_VIDEOS)
- return;
+ char path[4096];
- if (!strcmp(t, "div") && !strcmp(a, "class") &&
- isclassmatch(v, STRP("yt-lockup"))) {
- state |= Item;
- if (videos[nvideos].linktype) {
- if (videos[nvideos].channelid[0] || videos[nvideos].us…
- videos[nvideos].linktype != Video)
- grouped = -1;
- if (videos[nvideos].linktype == Channel)
- grouped = nvideos;
- if (grouped != -1 && !videos[nvideos].channelid[0]) {
- strlcpy(videos[nvideos].channelid, videos[grou…
- strlcpy(videos[nvideos].channeltitle, videos[g…
- }
- nvideos++;
- }
- if (strstr(v, " yt-lockup-channel "))
- videos[nvideos].linktype = Channel;
- else if (strstr(v, "yt-lockup-movie-"))
- videos[nvideos].linktype = Movie;
- else if (strstr(v, " yt-lockup-playlist "))
- videos[nvideos].linktype = Playlist;
- if (strstr(v, " yt-lockup-video "))
- videos[nvideos].linktype = Video;
- }
- if (!(state & Item))
- return;
+ snprintf(path, sizeof(path), "/results?search_query=%s", s);
- if (!strcmp(t, "span") && !strcmp(a, "class") && isclassmatch(v, STRP(…
- state |= Videotime;
- if (!strcmp(t, "ul") && !strcmp(a, "class") && isclassmatch(v, STRP("y…
- state |= Metainfo;
- metainfocount = 0;
+ if (page[0]) {
+ strlcat(path, "&page=", sizeof(path));
+ strlcat(path, page, sizeof(path));
}
- if (!strcmp(t, "h3") && !strcmp(a, "class") && isclassmatch(v, STRP("y…
- state |= Title;
- if (!strcmp(t, "div") && !strcmp(a, "class") && isclassmatch(v, STRP("…
- state |= User;
- if ((state & Title) && !strcmp(t, "a") && !strcmp(a, "title")) {
- if (videos[nvideos].linktype == Channel)
- strlcat(videos[nvideos].channeltitle, v, sizeof(videos…
- else
- strlcat(videos[nvideos].title, v, sizeof(videos[nvideo…
+ if (order[0]) {
+ strlcat(path, "&search_sort=", sizeof(path));
+ if (!strcmp(order, "date"))
+ strlcat(path, "video_date_uploaded", sizeof(path));
+ else if (!strcmp(order, "relevance"))
+ strlcat(path, "video_relevance", sizeof(path));
+ else if (!strcmp(order, "views"))
+ strlcat(path, "video_view_count", sizeof(path));
+ else if (!strcmp(order, "rating"))
+ strlcat(path, "video_avg_rating", sizeof(path));
}
- if ((state & Title) && !strcmp(t, "a") && !strcmp(a, "href"))
- strlcat(id, v, sizeof(id));
-
- if (!strcmp(t, "button") && !strcmp(a, "data-channel-external-id"))
- strlcat(videos[nvideos].channelid, v, sizeof(videos[nvideos].c…
+ /* check if request is too long (truncation) */
+ if (strlen(path) >= sizeof(path) - 1)
+ return NULL;
- if ((state & User) && !strcmp(t, "a") && !strcmp(a, "href"))
- strlcat(userid, v, sizeof(userid));
+ return youtube_request(path);
}
-static void
-xmlattrentity(XMLParser *x, const char *t, size_t tl, const char *a, size_t al,
- const char *v, size_t vl)
+int
+extractjson(const char *s, char **start, char **end)
{
- const char *s;
+ if (!(*start = strstr(s, "window[\"ytInitialData\"] = ")))
+ return -1;
+ if (!(*end = strstr(*start, "};\n")))
+ return -1;
- if (!(state & Pager) && nvideos >= MAX_VIDEOS)
- return;
+ (*start) += sizeof("window[\"ytInitialData\"] = ") - 1;
+ (*end)++;
- s = entitytostr(v);
- xmlattr(x, t, tl, a, al, s, strlen(s));
+ return 0;
}
-static void
-xmldata(XMLParser *x, const char *d, size_t dl)
+void
+processnode(struct json_node *nodes, size_t depth, const char *value,
+ void *pp)
{
- if ((state & Pager))
- return;
+ struct search_response *r = (struct search_response *)pp;
+ static struct item *item;
- /* optimization: no need to process and must not process videos after …
- if (!state || nvideos >= MAX_VIDEOS)
+ if (r->nitems > MAX_VIDEOS)
return;
- /* use parsed link type for meta info since this metainfo differs per …
- channel, playlist, video */
- if ((state & Metainfo)) {
- switch (videos[nvideos].linktype) {
- case Playlist:
- break; /* ignore */
- case Channel:
- if (metainfocount == 1)
- strlcat(videos[nvideos].channelvideos, d, size…
- break;
- default:
- if (metainfocount == 1)
- strlcat(videos[nvideos].publishedat, d, sizeof…
- else if (metainfocount == 2)
- strlcat(videos[nvideos].viewcount, d, sizeof(v…
- }
+ /* new item, structures can be very deep, just check the end for:
+ (items|contents)[].videoRenderer objects */
+ if (depth >= 3 &&
+ nodes[depth - 3].type == TYPE_ARRAY &&
+ nodes[depth - 2].type == TYPE_OBJECT &&
+ nodes[depth - 1].type == TYPE_OBJECT &&
+ (!strcmp(nodes[depth - 3].name, "items") ||
+ !strcmp(nodes[depth - 3].name, "contents")) &&
+ !strcmp(nodes[depth - 1].name, "videoRenderer")) {
+ r->nitems++;
+ return;
}
- if ((state & Videotime) && !strcmp(x->tag, "span"))
- strlcat(videos[nvideos].duration, d, sizeof(videos[nvideos].du…
- if ((state & User) && !strcmp(x->tag, "a"))
- strlcat(videos[nvideos].channeltitle, d, sizeof(videos[nvideos…
-}
-
-static void
-xmldataentity(XMLParser *x, const char *d, size_t dl)
-{
- const char *s;
- /* optimization: no need for entity conversion */
- if (!state || nvideos >= MAX_VIDEOS)
+ if (r->nitems == 0)
return;
-
- s = entitytostr(d);
- xmldata(x, s, strlen(s));
-}
-
-static void
-xmltagend(XMLParser *x, const char *t, size_t tl, int isshort)
-{
- char *p;
-
- if ((state & Metainfo) && !strcmp(t, "ul"))
- state &= ~Metainfo;
- if ((state & Title) && !strcmp(t, "h3")) {
- state &= ~Title;
-
- if (nvideos >= MAX_VIDEOS)
- return;
-
- if (!strncmp(id, "/watch", sizeof("/watch") - 1)) {
- if (!videos[nvideos].linktype)
- videos[nvideos].linktype = Video;
- if ((p = getparam(id, "v"))) {
- if (decodeparam(videos[nvideos].id, sizeof(vid…
- videos[nvideos].id[0] = '\0';
- }
- }
-
- id[0] = '\0';
+ item = &(r->items[r->nitems - 1]);
+
+ if (depth >= 4 &&
+ nodes[depth - 4].type == TYPE_ARRAY &&
+ nodes[depth - 3].type == TYPE_OBJECT &&
+ nodes[depth - 2].type == TYPE_OBJECT &&
+ nodes[depth - 1].type == TYPE_STRING &&
+ (!strcmp(nodes[depth - 4].name, "items") ||
+ !strcmp(nodes[depth - 4].name, "contents")) &&
+ !strcmp(nodes[depth - 2].name, "videoRenderer") &&
+ !strcmp(nodes[depth - 1].name, "videoId")) {
+ strlcpy(item->id, value, sizeof(item->id));
}
- if ((state & User)) {
- state &= ~User;
- if (nvideos >= MAX_VIDEOS)
- return;
+ if (depth >= 7 &&
+ nodes[depth - 7].type == TYPE_ARRAY &&
+ nodes[depth - 6].type == TYPE_OBJECT &&
+ nodes[depth - 5].type == TYPE_OBJECT &&
+ nodes[depth - 4].type == TYPE_OBJECT &&
+ nodes[depth - 3].type == TYPE_ARRAY &&
+ nodes[depth - 2].type == TYPE_OBJECT &&
+ nodes[depth - 1].type == TYPE_STRING &&
+ (!strcmp(nodes[depth - 7].name, "items") ||
+ !strcmp(nodes[depth - 7].name, "contents")) &&
+ !strcmp(nodes[depth - 5].name, "videoRenderer") &&
+ !strcmp(nodes[depth - 4].name, "title") &&
+ !strcmp(nodes[depth - 3].name, "runs") &&
+ !strcmp(nodes[depth - 1].name, "text") &&
+ !item->title[0]) {
+ strlcpy(item->title, value, sizeof(item->title));
+ }
- /* can be user or channel */
- if (!strncmp(userid, "/channel/", sizeof("/channel/") - 1)) {
- strlcpy(videos[nvideos].channelid,
- userid + sizeof("/channel/") - 1,
- sizeof(videos[nvideos].channelid));
- } else if (!strncmp(userid, "/user/", sizeof("/user/") - 1)) {
- strlcpy(videos[nvideos].userid,
- userid + sizeof("/user/") - 1,
- sizeof(videos[nvideos].userid));
+ if (depth >= 5 &&
+ nodes[depth - 5].type == TYPE_ARRAY &&
+ nodes[depth - 4].type == TYPE_OBJECT &&
+ nodes[depth - 3].type == TYPE_OBJECT &&
+ nodes[depth - 2].type == TYPE_OBJECT &&
+ nodes[depth - 1].type == TYPE_STRING &&
+ (!strcmp(nodes[depth - 5].name, "items") ||
+ !strcmp(nodes[depth - 5].name, "contents")) &&
+ !strcmp(nodes[depth - 3].name, "videoRenderer") &&
+ !strcmp(nodes[depth - 1].name, "simpleText")) {
+ if (!strcmp(nodes[depth - 2].name, "viewCountText") &&
+ !item->viewcount[0]) {
+ strlcpy(item->viewcount, value, sizeof(item->viewcount…
+ } else if (!strcmp(nodes[depth - 2].name, "lengthText") &&
+ !item->duration[0]) {
+ strlcpy(item->duration, value, sizeof(item->duration));
+ } else if (!strcmp(nodes[depth - 2].name, "publishedTimeText")…
+ !item->publishedat[0]) {
+ strlcpy(item->publishedat, value, sizeof(item->publish…
}
-
- userid[0] = '\0';
}
- if ((state & Videotime))
- state &= ~Videotime;
-}
-static void
-xmltagstart(XMLParser *x, const char *t, size_t tl)
-{
- if ((state & Metainfo) && !strcmp(t, "li"))
- metainfocount++;
-}
-
-static char *
-request_search(const char *s, const char *page, const char *order)
-{
- char path[4096];
-
- snprintf(path, sizeof(path), "/results?search_query=%s", s);
- if (page[0]) {
- strlcat(path, "&page=", sizeof(path));
- strlcat(path, page, sizeof(path));
+ if (depth >= 9 &&
+ nodes[depth - 9].type == TYPE_ARRAY &&
+ nodes[depth - 8].type == TYPE_OBJECT &&
+ nodes[depth - 7].type == TYPE_OBJECT &&
+ nodes[depth - 6].type == TYPE_OBJECT &&
+ nodes[depth - 5].type == TYPE_ARRAY &&
+ nodes[depth - 4].type == TYPE_OBJECT &&
+ nodes[depth - 3].type == TYPE_OBJECT &&
+ nodes[depth - 2].type == TYPE_OBJECT &&
+ nodes[depth - 1].type == TYPE_STRING &&
+ (!strcmp(nodes[depth - 9].name, "items") ||
+ !strcmp(nodes[depth - 9].name, "contents")) &&
+ !strcmp(nodes[depth - 7].name, "videoRenderer") &&
+ !strcmp(nodes[depth - 6].name, "longBylineText") &&
+ !strcmp(nodes[depth - 5].name, "runs") &&
+ !strcmp(nodes[depth - 3].name, "navigationEndpoint") &&
+ !strcmp(nodes[depth - 2].name, "browseEndpoint")) {
+ if (!strcmp(nodes[depth - 1].name, "browseId")) {
+ strlcpy(item->channelid, value, sizeof(item->channelid…
+ }
}
- if (order[0]) {
- strlcat(path, "&search_sort=", sizeof(path));
- if (!strcmp(order, "date"))
- strlcat(path, "video_date_uploaded", sizeof(path));
- else if (!strcmp(order, "relevance"))
- strlcat(path, "video_relevance", sizeof(path));
- else if (!strcmp(order, "views"))
- strlcat(path, "video_view_count", sizeof(path));
- else if (!strcmp(order, "rating"))
- strlcat(path, "video_avg_rating", sizeof(path));
+ if (depth >= 7 &&
+ nodes[depth - 7].type == TYPE_ARRAY &&
+ nodes[depth - 6].type == TYPE_OBJECT &&
+ nodes[depth - 5].type == TYPE_OBJECT &&
+ nodes[depth - 4].type == TYPE_OBJECT &&
+ nodes[depth - 3].type == TYPE_ARRAY &&
+ nodes[depth - 2].type == TYPE_OBJECT &&
+ nodes[depth - 1].type == TYPE_STRING &&
+ (!strcmp(nodes[depth - 7].name, "items") ||
+ !strcmp(nodes[depth - 7].name, "contents")) &&
+ !strcmp(nodes[depth - 5].name, "videoRenderer") &&
+ !strcmp(nodes[depth - 4].name, "longBylineText") &&
+ !strcmp(nodes[depth - 3].name, "runs")) {
+ if (!strcmp(nodes[depth - 1].name, "text") &&
+ !item->channeltitle[0]) {
+ strlcpy(item->channeltitle, value, sizeof(item->channe…
+ }
}
-
- /* force older youtube layout, else youtube will try to randomly serve
- a new layout sometimes breaking the parsing */
- strlcat(path, "&disable_polymer=1", sizeof(path));
-
- /* check if request is too long (truncation) */
- if (strlen(path) >= sizeof(path) - 1)
- return NULL;
-
- return youtube_request(path);
}
struct search_response *
youtube_search(const char *rawsearch, const char *page, const char *order)
{
struct search_response *r;
- XMLParser x = { 0 };
- char *data, *s;
+ char *data, *s, *start, *end;
+ int ret;
if (!(data = request_search(rawsearch, page, order)))
return NULL;
+
if (!(s = strstr(data, "\r\n\r\n")))
return NULL; /* invalid response */
/* skip header */
@@ -304,20 +205,18 @@ youtube_search(const char *rawsearch, const char *page, c…
if (!(r = calloc(1, sizeof(*r))))
return NULL;
- nvideos = 0;
- videos = r->items;
-
- x.xmlattr = xmlattr;
- x.xmlattrentity = xmlattrentity;
- x.xmldata = xmldata;
- x.xmldataentity = xmldataentity;
- x.xmltagend = xmltagend;
- x.xmltagstart = xmltagstart;
-
- setxmldata(s, strlen(s));
- xml_parse(&x);
+ if (extractjson(s, &start, &end) == -1) {
+// fprintf(stderr, "error extracting JSON");
+ free(r);
+ return NULL;
+ }
- r->nitems = nvideos;
+ ret = parsejson(start, end - start, processnode, r);
+ if (ret < 0) {
+// fprintf(stderr, "error parsing JSON");
+ free(r);
+ return NULL;
+ }
return r;
}
You are viewing proxied material from codemadness.org. The copyright of proxied material belongs to its original authors. Any comments or complaints in relation to proxied material should be directed to the original authors of the content concerned. Please see the disclaimer for more details.