GopherProxy

	improvements - surf-adblock - Surf adblock web extension
	git clone git://git.codemadness.org/surf-adblock
	Log
	Files
	Refs
	README
	LICENSE
	---
	commit 3cc61dad61ee13b47cc3b6a2931de9413c4c6176
	parent 84d3f064e393f5856f4bbbfb519b267ed4a5aa0a
	Author: Hiltjo Posthuma <[email protected]>
	Date: Mon, 5 Jun 2017 17:36:10 +0200

	improvements

	- fix CARET in match()
	- fix matchbegin rules (starts with ||).

	Diffstat:
	M TODO | 2 ++
	M adblock.c | 233 ++++++++++++++++++++---------…
	M surf-adblock.c | 6 +++---
	M tests/tests.c | 36 ++++++++++++++---------------…

	4 files changed, 175 insertions(+), 102 deletions(-)
	---
	diff --git a/TODO b/TODO
	@@ -1,3 +1,5 @@
	+- simplify match, there are only a few rules with multiple *.
	+
	- loadrules: return struct rules* ?
	on error free rules.

	diff --git a/adblock.c b/adblock.c
	@@ -35,8 +35,8 @@ struct filterrule {
	int matchend;
	/* is exception rule: prefix @@ for ABP or #@# for CSS */
	int isexception;
	- char *css; /* if non-NULL is CSS rule / hide element rule */
	- char *uri;
	+ const char *css; /* if non-NULL is CSS rule / hide element rule */
	+ const char *uri;
	struct filterdomain *domains;
	struct filterrule *next;
	};
	@@ -184,14 +184,14 @@ string_append(String s, const char data, size_t len)
	memcpy(s->data + s->len, data, len);
	s->len = newlen;
	s->data[s->len] = '\0';
	+
	return len;
	}

	#define END 0
	#define UNMATCHABLE -2
	-#define BRACKET -3
	-#define CARET -4
	-#define STAR -5
	+#define CARET -3
	+#define STAR -4

	static int
	str_next(const char str, size_t n, size_t step)
	@@ -275,11 +275,19 @@ match(const char pat, const char str, int fcase)
	pat++;
	m--;
	break;
	- default:
	+ case CARET:
	k = str_next(str, n, &sinc);
	- /* TODO: write a test-case */
	- if (c == CARET && (k == '?' \|\| k == '/' \|\| k <= 0))
	+ if (k <= 0)
	+ return (c==END) ? 0 : 1;
	+ str += sinc;
	+ n -= sinc;
	+ if (k != '?' && k != '/')
	return 1;
	+ pat++;
	+ m--;
	+ break;
	+ default:
	+ k = str_next(str, n, &sinc);
	if (k <= 0)
	return (c==END) ? 0 : 1;
	str += sinc;
	@@ -341,9 +349,14 @@ match(const char pat, const char str, int fcase)
	break;
	}
	s += sinc;
	- kfold = fcase ? casefold(k) : k;
	- if (k != c && kfold != c)
	- return 1;
	+ if (c == CARET) {
	+ if (k != '/' && k != '?')
	+ return 1;
	+ } else {
	+ kfold = fcase ? casefold(k) : k;
	+ if (k != c && kfold != c)
	+ return 1;
	+ }
	}

	/* We're all done with the tails now, so throw them out */
	@@ -366,10 +379,16 @@ match(const char pat, const char str, int fcase)
	k = str_next(s, endstr-s, &sinc);
	if (!k)
	return 1;
	- kfold = fcase ? casefold(k) : k;
	- if (k != c && kfold != c)
	- break;
	s += sinc;
	+ if (c == CARET) {
	+ if (k != '/' && k != '?')
	+ break;
	+ } else {
	+ kfold = fcase ? casefold(k) : k;
	+ if (k != c && kfold != c)
	+ break;
	+ }
	+
	}
	if (c == STAR) continue;
	/* If we failed, advance str, by 1 char if it's a valid
	@@ -486,17 +505,20 @@ matchdomain(const char s, const char domain)
	}

	static int
	-matchrule(struct filterrule f, const char uri, const char *type,
	- const char *domain)
	+matchrule(struct filterrule f, const char fromuri, const char *fromdomain,
	+ const char *fromrel,
	+ const char requri, const char reqdomain, const char *reqrel,
	+ const char *type)
	{
	/* NOTE: order matters, see FilterType enum values */
	struct filterdomain *d;
	char pat[1024];
	- int r, m;
	+ const char *uri;
	+ int len, r;

	r = f->domains ? 0 : 1;
	for (d = f->domains; d; d = d->next) {
	- if (matchdomain(d->domain, domain)) {
	+ if (matchdomain(d->domain, fromdomain)) {
	if (r && d->inverse)
	r = 0;
	else if (!r && !d->inverse)
	@@ -521,39 +543,58 @@ matchrule(struct filterrule f, const char uri, const ch…
	return 0;
	#endif

	- r = snprintf(pat, sizeof(pat), "%s%s%s",
	- f->matchbegin ? "" : "*",
	- f->uri,
	- f->matchend ? "" : "*");
	- if (r == -1 \|\| (size_t)r >= sizeof(pat)) {
	- fprintf(stderr, "warning: pattern too large, ignoring\n");
	- return 0;
	- }
	-
	- /* DEBUG */
	+ /* match begin including domain */
	if (f->matchbegin) {
	- printf("pat: %s, uri: %s, domain: %s\n", pat, uri, domai…
	- }
	+ /* TODO: match domain part of pattern */
	+ /* TODO: preprocess pattern if it is matchbegin? */

	- m = 0;
	- if (!match(pat, uri, (f->block & FilterTypeMatchCase) ? 0 : 1)) {
	-#if 0
	- for (; *type; type++) {
	- for (i = 0; blockstr[i]; i++) {
	- if (blockstr[i] == *type &&
	- f->block & (1 << i))
	- printf("block type '%c'\n", blockstr[i…
	- return 1;
	- }
	+ len = strcspn(f->uri, "^/");
	+
	+ /* match domain without dot */
	+ r = snprintf(pat, sizeof(pat), "%.*s",
	+ len, f->uri);
	+ if (r == -1 \|\| (size_t)r >= sizeof(pat)) {
	+ fprintf(stderr, "warning: pattern too large, ignoring\…
	+ return 0;
	+ }
	+
	+ /* TODO: block type mask */
	+ if (match(pat, reqdomain, (f->block & FilterTypeMatchCase) ? 0…
	+ /* match domain with dot */
	+ r = snprintf(pat, sizeof(pat), ".%.*s",
	+ len, f->uri);
	+ if (r == -1 \|\| (size_t)r >= sizeof(pat)) {
	+ fprintf(stderr, "warning: pattern too large, i…
	+ return 0;
	}
	+
	+ /* TODO: block type mask */
	+ if (match(pat, reqdomain, (f->block & FilterTypeMatchC…
	+ return 0;
	}

	+ /* match on path */
	+ r = snprintf(pat, sizeof(pat), "*%s%s",
	+ f->uri + len,
	+ f->matchend ? "" : "*");
	+ uri = reqrel;
	+ } else {
	+ r = snprintf(pat, sizeof(pat), "*%s%s",
	+ f->uri,
	+ f->matchend ? "" : "*");
	+ uri = requri;
	+
	+ }
	+ if (r == -1 \|\| (size_t)r >= sizeof(pat)) {
	+ fprintf(stderr, "warning: pattern too large, ignoring\n");
	return 0;
	-#endif
	- m = 1;
	}
	- /*m = r ? !m : m;*/
	- return m;
	+
	+ /* TODO: block type mask */
	+ if (!match(pat, uri, (f->block & FilterTypeMatchCase) ? 0 : 1))
	+ return 1;
	+
	+ return 0;
	}

	static int
	@@ -619,6 +660,7 @@ parserule(struct filterrule f, char s)
	/* has options */
	if (!(f->uri = westrndup(s, p - s)))
	return -1;
	+
	s = ++p;

	/* blockmask, has options? default: allow all options, case-sensitive
	@@ -723,27 +765,23 @@ getglobalcss(void)
	}

	char *
	-getdocumentcss(const char *uri)
	+getdocumentcss(const char *fromuri)
	{
	const char *s;
	- char domain[256];
	+ char fromdomain[256];
	String sitecss;
	struct filterrule *r;
	size_t len;

	- if (!uri)
	- return NULL;
	-
	- if ((s = strstr(uri, "://")))
	- s += sizeof("://") - 1;
	- else
	- s = uri;
	- len = strcspn(s, "/"); /* TODO: ":/" */
	- memcpy(domain, s, len);
	- domain[len] = '\0';
	+ /* skip protocol */
	+ if ((s = strstr(fromuri, "://")))
	+ fromuri = s + sizeof("://") - 1;
	+ len = strcspn(fromuri, "/"); /* TODO: ":/" */
	+ memcpy(fromdomain, s, len);
	+ fromdomain[len] = '\0';

	- printf("uri: %s\n", uri);
	- printf("domain: %s\n", domain);
	+ printf("fromuri: %s\n", fromuri);
	+ printf("fromdomain: %s\n", fromdomain);

	/* DEBUG: timing */
	struct timespec tp_start, tp_end, tp_diff;
	@@ -754,7 +792,8 @@ getdocumentcss(const char *uri)
	/* site-specific CSS */
	memset(&sitecss, 0, sizeof(sitecss));
	for (r = rules; r; r = r->next) {
	- if (!r->css \|\| !r->domains \|\| !matchrule(r, "", "", domain))
	+ if (!r->css \|\| !r->domains \|\|
	+ !matchrule(r, "", fromdomain, "", "", "", "", ""))
	continue;

	len = strlen(r->css);
	@@ -792,28 +831,39 @@ getdocumentcss(const char *uri)

	err:
	free(sitecss.data);
	+ /*memset(&sitecss, 0, sizeof(sitecss));*/
	+
	return NULL;
	}

	int
	-allowrequest(const char uri, const char requri)
	+allowrequest(const char fromuri, const char requri)
	{
	- char domain[256];
	struct filterrule *r;
	- const char *s;
	+ char fromdomain[256], reqdomain[256];
	+ const char s, reqrel, *fromrel;
	size_t len;
	int status = 1;

	- if (!uri \|\| !strcmp(requri, uri))
	- return 1;
	+ /* skip protocol part */
	+ if ((s = strstr(fromuri, "://")))
	+ fromuri = s + sizeof("://") - 1;
	+ if ((s = strstr(requri, "://")))
	+ requri = s + sizeof("://") - 1;

	- if ((s = strstr(uri, "://")))
	- s += sizeof("://") - 1;
	- else
	- s = uri;
	- len = strcspn(s, "/"); /* TODO: ":/" */
	- memcpy(domain, s, len);
	- domain[len] = '\0';
	+ len = strcspn(fromuri, ":/"); /* TODO: ":/", but support IPV6... */
	+ memcpy(fromdomain, fromuri, len);
	+ fromdomain[len] = '\0';
	+
	+ len = strcspn(requri, ":/"); /* TODO: ":/", but support IPV6... */
	+ memcpy(reqdomain, requri, len);
	+ reqdomain[len] = '\0';
	+
	+ fromrel = &fromuri[strcspn(fromuri, "/")];
	+ reqrel = &requri[strcspn(requri, "/")];
	+
	+ printf("req %s = %s\n", requri, reqrel);
	+ printf("from %s = %s\n", fromuri, fromrel);

	/* DEBUG: timing */
	struct timespec tp_start, tp_end, tp_diff;
	@@ -823,12 +873,15 @@ allowrequest(const char uri, const char requri)

	/* match rules */
	for (r = rules; r; r = r->next) {
	- if (!r->css && matchrule(r, requri, "csio^", domain)) {
	- printf("requri: %s\n", requri);
	- printf("uri: %s\n", uri);
	- printf("domain: %s\n", domain);
	+ if (!r->css && matchrule(r, fromuri, fromdomain,
	+ fromrel, requri, reqdomain, reqrel, "…
	+ printf("reqrel: %s\n", reqrel);
	+ printf("reqdomain: %s\n", reqdomain);
	+ printf("requri: %s\n", requri);
	+ printf("from uri: %s\n", fromuri);
	+ printf("from domain: %s\n", fromdomain);

	- fprintf(stderr, "blocked: %s, %s\n", domain, requri);
	+ fprintf(stderr, "blocked: %s, %s\n", fromdomain, requr…

	/* DEBUG: for showing the timing */
	status = 0;
	@@ -851,13 +904,36 @@ end:
	}

	printf("%s [%s] timing: %lld sec, %.3f ms\n",
	- requri, uri, (long long)tp_diff.tv_sec,
	+ requri, fromuri, (long long)tp_diff.tv_sec,
	(float)tp_diff.tv_nsec / 1000000.0f);

	return status;
	}

	void
	+cleanup(void)
	+{
	+ struct filterrule *r;
	+ struct filterdomain *d;
	+
	+ free(globalcss.data);
	+ memset(&globalcss, 0, sizeof(globalcss));
	+
	+ for (r = rules; r; r = rules) {
	+ for (d = r->domains; d; d = r->domains) {
	+ free(d->domain);
	+ r->domains = d->next;
	+ free(d);
	+ }
	+ free(r->css);
	+ free(r->uri);
	+ rules = r->next;
	+ free(r);
	+ }
	+ rules = NULL;
	+}
	+
	+void
	init(void)
	{
	struct filterrule *r;
	@@ -906,8 +982,7 @@ init(void)

	len = strlen(r->css);
	if (string_append(&globalcss, r->css, strlen(r->css)) < len) {
	- weprintf("cannot load global css selectors "
	- "in memory\n");
	+ weprintf("cannot load global css selectors in memory\n…
	cleanup();
	return;
	}
	diff --git a/surf-adblock.c b/surf-adblock.c
	@@ -67,14 +67,14 @@ static gboolean
	sendrequest(WebKitWebPage wp, WebKitURIRequest req,
	WebKitURIResponse res, Page p)
	{
	- const char uri, requri;
	+ const char fromuri, requri;

	if (!webkit_uri_request_get_http_method(req))
	return TRUE; /* TRUE = don't handle any more events */
	- uri = webkit_web_page_get_uri(p->webpage);
	+ fromuri = webkit_web_page_get_uri(p->webpage);
	requri = webkit_uri_request_get_uri(req);

	- return allowrequest(uri, requri) ? FALSE : TRUE;
	+ return allowrequest(fromuri, requri) ? FALSE : TRUE;
	}

	static void
	diff --git a/tests/tests.c b/tests/tests.c
	@@ -1,25 +1,21 @@
	#include "../adblock.c"

	-void
	-cleanup(void)
	-{
	- struct filterrule *r;
	- struct filterdomain *d;
	-
	- free(globalcss.data);
	-
	- for (r = rules; r; r = rules) {
	- for (d = r->domains; d; d = r->domains) {
	- free(d->domain);
	- r->domains = d->next;
	- free(d);
	- }
	- free(r->css);
	- free(r->uri);
	- rules = r->next;
	- free(r);
	- }
	-}
	+/*
	+
	+TODO: add tests:
	+
	+||example.com/banner.gif will block all these addresses
	+
	+ http://example.com/banner.gif
	+ https://example.com/banner.gif
	+ http://www.example.com/banner.gif
	+
	+while not blocking:
	+
	+ http://badexample.com/banner.gif
	+ http://gooddomain.example/analyze?http://example.com/banner.gif
	+
	+*/

	int
	main(void)