improvements - surf-adblock - Surf adblock web extension | |
git clone git://git.codemadness.org/surf-adblock | |
Log | |
Files | |
Refs | |
README | |
LICENSE | |
--- | |
commit d781090ae7718310fb13c83c1a8406be46a613b8 | |
parent b6cc76e9fcac3112086f2d2348ef53b16b59da9d | |
Author: Hiltjo Posthuma <[email protected]> | |
Date: Sat, 3 Jun 2017 21:54:05 +0200 | |
improvements | |
- WIP: faster matching for simple rules, just a test atm, but ~50ms max to ~20m… | |
machine. | |
- add support for exception rules. | |
- debug.sh: add a debug script for testing; it compiles surf-adblock.c with -DDEBUG as a standalone program with its own main(). | |
Diffstat: | |
M TODO | 14 +++++++++++++- | |
A debug.sh | 18 ++++++++++++++++++ | |
M surf-adblock.c | 279 +++++++++++++++++++++++++----… | |
3 files changed, 255 insertions(+), 56 deletions(-) | |
--- | |
diff --git a/TODO b/TODO | |
@@ -1,5 +1,15 @@ | |
- fix tweakers.net popup / rule. | |
-- benchmark rule matching (timing). | |
+ this is in an exception rule... | |
+ | |
+ make sure exception rules are always below in the list? modify awk scr… | |
+ | |
+- performance: | |
+ - benchmark rule matching (timing). | |
+ - bloom filters? some kind of cache? | |
+ - optimize simple filter case. | |
+ | |
+- support separator "^" = [/\?]? | |
+ - test it better. | |
=== | |
@@ -23,6 +33,8 @@ Docs: | |
and matchbegin or matchend set. | |
- make less CPU intensive. | |
- maybe even include it statically? | |
+ - optimize CSS rule matching (only per site?). | |
+ | |
- optimize memory allocation. | |
- optimize: pregenerate one global stylesheet that applies to all sites? | |
- separate adblocker into daemon? not sure. | |
diff --git a/debug.sh b/debug.sh | |
@@ -0,0 +1,18 @@ | |
+#!/bin/sh | |
+# ugly debug script: compile as standalone program for testing. | |
+ | |
+cc -std=c99 -pedantic -Wall -Os -I. -I/usr/include -I/usr/X11R6/include \ | |
+ `pkg-config --cflags gtk+-3.0 webkit2gtk-4.0 webkit2gtk-web-extension… | |
+ -DVERSION=\"0.1\" -DWEBEXTDIR=\"/usr/local/lib/surf\" -D_DEFAULT_SOUR… | |
+ -DWEBEXTDIR=\"/usr/local/lib/surf\" \ | |
+ `pkg-config --cflags gtk+-3.0 webkit2gtk-4.0 webkit2gtk-web-extension… | |
+ -DDEBUG \ | |
+ -c surf-adblock.c | |
+cc -s -L/usr/lib -lc -L/usr/X11R6/lib -lX11 \ | |
+ `pkg-config --libs gtk+-3.0 webkit2gtk-4.0 webkit2gtk-web-extension-4… | |
+ surf-adblock.o | |
+ | |
+chmod +x surf-adblock | |
+# NOTE: need to copy because of W^X. | |
+doas cp surf-adblock /usr/local/bin | |
+/usr/local/bin/surf-adblock | |
diff --git a/surf-adblock.c b/surf-adblock.c | |
@@ -305,11 +305,11 @@ match(const char *pat, const char *str, int fcase) | |
break; | |
default: | |
k = str_next(str, n, &sinc); | |
+ /* TODO: write a test-case */ | |
+ if (c == CARET && (k == '?' || k == '/' || k <= 0)) | |
+ return 1; | |
if (k <= 0) | |
return (c==END) ? 0 : 1; | |
- if (c == CARET && (iswdigit(k) || iswalpha(k) || | |
- strchr("_-.%", k))) | |
- return 1; | |
str += sinc; | |
n -= sinc; | |
kfold = fcase ? casefold(k) : k; | |
@@ -410,7 +410,6 @@ match(const char *pat, const char *str, int fcase) | |
return 0; | |
} | |
- | |
/* | |
domain=... if domain is prefixed with ~, ignore. | |
multiple domains can be separated with | | |
@@ -521,28 +520,63 @@ matchrule(struct filterrule *f, const char *uri, const ch… | |
/* NOTE: order matters, see FilterType enum values */ | |
struct filterdomain *d; | |
char pat[1024]; | |
- int r; | |
- | |
- /* ignore exception rules for now, these are usually paid | |
- * for by sites to allow advertisements. */ | |
- if (f->isexception) | |
- return 0; | |
+ int r, m; | |
- if (f->css) { | |
- r = f->domains ? 0 : 1; | |
- for (d = f->domains; d; d = d->next) { | |
- if (matchdomain(d->domain, domain)) { | |
- if (r && d->inverse) | |
- r = 0; | |
- else if (!r && !d->inverse) | |
- r = 1; | |
- } else if (r && !d->inverse) { | |
+ r = f->domains ? 0 : 1; | |
+ for (d = f->domains; d; d = d->next) { | |
+ if (matchdomain(d->domain, domain)) { | |
+ if (r && d->inverse) | |
r = 0; | |
- } | |
+ else if (!r && !d->inverse) | |
+ r = 1; | |
+ } else if (r && !d->inverse) { | |
+ r = 0; | |
} | |
+ } | |
+ if (f->css) { | |
+ /* DEBUG */ | |
+ if (f->isexception) | |
+ printf("DEBUG, exception rule, CSS: %s, match? %d\n", | |
+ f->css, r); | |
return r; | |
} | |
+#if 1 | |
+ /* skip allow rule, TODO: inverse? */ | |
+ if (!r) | |
+ return 0; | |
+#endif | |
+ | |
+#if 1 | |
+ /* DEBUG: test, match if it is a simple pattern */ | |
+ char *p; | |
+ p = strchr(f->uri, '*'); | |
+ if (!p) | |
+ p = strchr(f->uri, '^'); | |
+ if (!p) { | |
+ /* TODO: write a test-case */ | |
+ if (f->block & FilterTypeMatchCase) { | |
+ if (f->matchbegin) | |
+ m = strncmp(uri, f->uri, strlen(f->uri)) == 0; | |
+ else if (f->matchend) | |
+ m = strlen(f->uri) <= strlen(uri) && | |
+ strcmp(&uri[strlen(uri) - strlen(f->ur… | |
+ else | |
+ m = strstr(uri, f->uri) ? 1 : 0; | |
+ } else { | |
+ if (f->matchbegin) | |
+ m = strncasecmp(uri, f->uri, strlen(f->uri)) =… | |
+ else if (f->matchend) | |
+ m = strlen(f->uri) <= strlen(uri) && | |
+ strcasecmp(&uri[strlen(uri) - strlen(f… | |
+ else | |
+ m = strcasestr(uri, f->uri) ? 1 : 0; | |
+ } | |
+ /*m = r ? !m : m;*/ | |
+ return m; | |
+ } | |
+#endif | |
+ | |
r = snprintf(pat, sizeof(pat), "%s%s%s", | |
f->matchbegin ? "" : "*", | |
f->uri, | |
@@ -552,19 +586,8 @@ matchrule(struct filterrule *f, const char *uri, const cha… | |
return 0; | |
} | |
- r = f->domains ? 0 : 1; | |
- for (d = f->domains; d; d = d->next) { | |
- if (matchdomain(d->domain, domain)) { | |
- if (r && d->inverse) | |
- r = 0; | |
- else if (!r && !d->inverse) | |
- r = 1; | |
- } else if (r && !d->inverse) { | |
- r = 0; | |
- } | |
- } | |
- | |
- if (r && !match(pat, uri, (f->block & FilterTypeMatchCase) ? 0 : 1)) { | |
+ m = 0; | |
+ if (!match(pat, uri, (f->block & FilterTypeMatchCase) ? 0 : 1)) { | |
#if 0 | |
for (; *type; type++) { | |
for (i = 0; blockstr[i]; i++) { | |
@@ -575,11 +598,13 @@ matchrule(struct filterrule *f, const char *uri, const ch… | |
} | |
} | |
} | |
+ | |
return 0; | |
#endif | |
- return 1; | |
+ m = 1; | |
} | |
- return 0; | |
+ /*m = r ? !m : m;*/ | |
+ return m; | |
} | |
static int | |
@@ -695,6 +720,7 @@ end: | |
return 1; | |
} | |
+#if 0 | |
static void | |
debugrule(struct filterrule *r) | |
{ | |
@@ -702,6 +728,7 @@ debugrule(struct filterrule *r) | |
"%lu\n===\n", r->uri ? r->uri : "", r->css ? r->css : "", | |
r->isexception, r->block); | |
} | |
+#endif | |
static int | |
loadrules(FILE *fp) | |
@@ -775,6 +802,12 @@ documentloaded(WebKitWebPage *wp, Page *p) | |
printf("uri: %s\n", uri); | |
printf("domain: %s\n", domain); | |
+ /* DEBUG: timing */ | |
+ struct timespec tp_start, tp_end, tp_diff; | |
+ if (clock_gettime(CLOCK_MONOTONIC, &tp_start) == -1) { | |
+ fprintf(stderr, "clock_gettime: %s\n", strerror(errno)); | |
+ } | |
+ | |
/* site-specific CSS */ | |
memset(&sitecss, 0, sizeof(sitecss)); | |
for (r = rules; r; r = r->next) { | |
@@ -783,11 +816,38 @@ documentloaded(WebKitWebPage *wp, Page *p) | |
len = strlen(r->css); | |
if (string_append(&sitecss, r->css, len) < len) | |
return; | |
- len = sizeof("{display:none;}") -1; | |
- if (string_append(&sitecss, "{display:none;}", len) < len) | |
- return; | |
+ | |
+ if (r->isexception) { | |
+ len = sizeof("{display:initial;}") -1; | |
+ if (string_append(&sitecss, "{display:initial;}", len)… | |
+ return; | |
+ } else { | |
+ len = sizeof("{display:none;}") -1; | |
+ if (string_append(&sitecss, "{display:none;}", len) < … | |
+ return; | |
+ } | |
+ } | |
+/* printf("sitecss: %s\n", sitecss.data ? sitecss.data : "<empty>");*/ | |
+ | |
+ /* DEBUG: timing */ | |
+ if (clock_gettime(CLOCK_MONOTONIC, &tp_end) == -1) { | |
+ fprintf(stderr, "clock_gettime: %s\n", strerror(errno)); | |
} | |
- printf("sitecss: %s\n", sitecss.data ? sitecss.data : "<empty>"); | |
+ | |
+ tp_diff.tv_sec = tp_end.tv_sec - tp_start.tv_sec; | |
+ tp_diff.tv_nsec = tp_end.tv_nsec - tp_start.tv_nsec; | |
+ if (tp_diff.tv_nsec < 0) { | |
+ tp_diff.tv_sec--; | |
+ tp_diff.tv_nsec += 1000000000L; | |
+ } | |
+ | |
+ printf("timing: %zu sec, %.3f ms\n", | |
+ tp_diff.tv_sec, (float)tp_diff.tv_nsec / 1000000.0f); | |
+ | |
+ if (globalcss.data) | |
+ printf("global CSS length in bytes: %zu\n", strlen(globalcss.d… | |
+ if (sitecss.data) | |
+ printf("site CSS length in bytes: %zu\n", strlen(sitecss.data)… | |
p->view = webkit_dom_document_get_default_view(doc); | |
@@ -819,6 +879,7 @@ sendrequest(WebKitWebPage *wp, WebKitURIRequest *req, | |
const char *s, *uri = webkit_web_page_get_uri(p->webpage), | |
*requri = webkit_uri_request_get_uri(req); | |
size_t len; | |
+ gboolean status = FALSE; | |
if (!uri || !strcmp(requri, uri) || | |
(strncmp(uri, "http://", sizeof("http://") - 1) && | |
@@ -830,6 +891,12 @@ sendrequest(WebKitWebPage *wp, WebKitURIRequest *req, | |
memcpy(domain, s, len); | |
domain[len] = '\0'; | |
+ /* DEBUG: timing */ | |
+ struct timespec tp_start, tp_end, tp_diff; | |
+ if (clock_gettime(CLOCK_MONOTONIC, &tp_start) == -1) { | |
+ fprintf(stderr, "clock_gettime: %s\n", strerror(errno)); | |
+ } | |
+ | |
/* match rules */ | |
for (r = rules; r; r = r->next) { | |
if (!r->css && matchrule(r, requri, "csio^", domain)) { | |
@@ -839,30 +906,32 @@ sendrequest(WebKitWebPage *wp, WebKitURIRequest *req, | |
fprintf(stderr, "blocked: %s, %s\n", domain, requri); | |
- return TRUE; | |
+ status = TRUE; | |
+ goto end; | |
} | |
} | |
- return FALSE; | |
-} | |
- | |
-static void | |
-webpagecreated(WebKitWebExtension *e, WebKitWebPage *p, gpointer unused) | |
-{ | |
- Page *np; | |
+end: | |
+ /* DEBUG: timing */ | |
+ if (clock_gettime(CLOCK_MONOTONIC, &tp_end) == -1) { | |
+ fprintf(stderr, "clock_gettime: %s\n", strerror(errno)); | |
+ } | |
- if (!(np = newpage(p))) { | |
- weprintf("cannot associate webext with new page: %s\n", | |
- strerror(errno)); | |
- return; | |
+ tp_diff.tv_sec = tp_end.tv_sec - tp_start.tv_sec; | |
+ tp_diff.tv_nsec = tp_end.tv_nsec - tp_start.tv_nsec; | |
+ if (tp_diff.tv_nsec < 0) { | |
+ tp_diff.tv_sec--; | |
+ tp_diff.tv_nsec += 1000000000L; | |
} | |
- g_signal_connect(p, "document-loaded", G_CALLBACK(documentloaded), np); | |
- g_signal_connect(p, "send-request", G_CALLBACK(sendrequest), np); | |
+ printf("%s [%s] timing: %zu sec, %.3f ms\n", | |
+ requri, uri, tp_diff.tv_sec, (float)tp_diff.tv_nsec / 1000000.… | |
+ | |
+ return status; | |
} | |
-G_MODULE_EXPORT void | |
-webkit_web_extension_initialize(WebKitWebExtension *ext) | |
+void | |
+init(void) | |
{ | |
struct filterrule *r; | |
FILE *fp; | |
@@ -922,6 +991,106 @@ webkit_web_extension_initialize(WebKitWebExtension *ext) | |
return; | |
} | |
} | |
+} | |
+ | |
+static void | |
+webpagecreated(WebKitWebExtension *e, WebKitWebPage *p, gpointer unused) | |
+{ | |
+ Page *np; | |
+ | |
+ if (!(np = newpage(p))) { | |
+ weprintf("cannot associate webext with new page: %s\n", | |
+ strerror(errno)); | |
+ return; | |
+ } | |
+ | |
+ g_signal_connect(p, "document-loaded", G_CALLBACK(documentloaded), np); | |
+ g_signal_connect(p, "send-request", G_CALLBACK(sendrequest), np); | |
+} | |
+G_MODULE_EXPORT void | |
+webkit_web_extension_initialize(WebKitWebExtension *ext) | |
+{ | |
+ init(); | |
g_signal_connect(ext, "page-created", G_CALLBACK(webpagecreated), NULL… | |
} | |
+ | |
+#ifdef DEBUG | |
+int | |
+main(void) | |
+{ | |
+ char domain[256]; | |
+ String sitecss; | |
+ struct filterrule *r; | |
+ const char *s, *uri; | |
+ size_t len; | |
+ | |
+ /* TEST */ | |
+ uri = "https://tweakers.net/"; | |
+ | |
+ if (!uri || (strncmp(uri, "http://", sizeof("http://") - 1) && | |
+ strncmp(uri, "https://", sizeof("https://") - 1))) | |
+ return; | |
+ | |
+ init(); | |
+ | |
+ s = strstr(uri, "://") + sizeof("://") - 1; | |
+ len = strcspn(s, "/"); | |
+ memcpy(domain, s, len); | |
+ domain[len] = '\0'; | |
+ | |
+ printf("uri: %s\n", uri); | |
+ printf("domain: %s\n", domain); | |
+ | |
+ /* DEBUG: timing */ | |
+ struct timespec tp_start, tp_end, tp_diff; | |
+ if (clock_gettime(CLOCK_MONOTONIC, &tp_start) == -1) { | |
+ fprintf(stderr, "clock_gettime: %s\n", strerror(errno)); | |
+ } | |
+ | |
+ /* site-specific CSS */ | |
+ memset(&sitecss, 0, sizeof(sitecss)); | |
+ for (r = rules; r; r = r->next) { | |
+ if (!r->css || !r->domains || !matchrule(r, "", "", domain)) | |
+ continue; | |
+ len = strlen(r->css); | |
+ if (string_append(&sitecss, r->css, len) < len) | |
+ return; | |
+ if (r->isexception) { | |
+ len = sizeof("{display:initial;}") -1; | |
+ if (string_append(&sitecss, "{display:initial;}", len)… | |
+ return; | |
+ } else { | |
+ len = sizeof("{display:none;}") -1; | |
+ if (string_append(&sitecss, "{display:none;}", len) < … | |
+ return; | |
+ } | |
+ } | |
+ printf("sitecss: %s\n", sitecss.data ? sitecss.data : "<empty>"); | |
+ | |
+ /* DEBUG: timing */ | |
+ if (clock_gettime(CLOCK_MONOTONIC, &tp_end) == -1) { | |
+ fprintf(stderr, "clock_gettime: %s\n", strerror(errno)); | |
+ } | |
+ | |
+ tp_diff.tv_sec = tp_end.tv_sec - tp_start.tv_sec; | |
+ tp_diff.tv_nsec = tp_end.tv_nsec - tp_start.tv_nsec; | |
+ if (tp_diff.tv_nsec < 0) { | |
+ tp_diff.tv_sec--; | |
+ tp_diff.tv_nsec += 1000000000L; | |
+ } | |
+ | |
+ printf("timing: %zu sec, %.3f ms\n", | |
+ tp_diff.tv_sec, (float)tp_diff.tv_nsec / 1000000.0f); | |
+ | |
+ if (globalcss.data) | |
+ printf("global CSS length in bytes: %zu\n", strlen(globalcss.d… | |
+ if (sitecss.data) | |
+ printf("site CSS length in bytes: %zu\n", strlen(sitecss.data)… | |
+ | |
+ free(sitecss.data); | |
+ cleanup(); | |
+ | |
+ return 0; | |
+} | |
+#endif |