GopherProxy

	lex.c - bmf - bmf (Bayesian Mail Filter) 0.9.4 fork + patches
	git clone git://git.codemadness.org/bmf
	Log
	Files
	Refs
	README
	LICENSE
	---
	lex.c (11934B)
	---
	1 /* $Id: lex.c,v 1.18 2002/10/20 20:29:15 tommy Exp $ */
	2
	3 /*
	4 * Copyright (c) 2002 Tom Marshall <[email protected]>
	5 *
	6 * This program is free software. It may be distributed under the terms
	7 * in the file LICENSE, found in the top level of the distribution.
	8 *
	9 * lex.c: generate token stream for bmf.
	10 */
	11
	12 #include "config.h"
	13 #include "dbg.h"
	14 #include "str.h"
	15 #include "lex.h"
	16
	/*
	 * Lowercase HTML-related keywords.  is_htmltag() binary-searches this
	 * table with strncmp(), so the entries must stay in ascending byte order;
	 * lex_nexttoken() skips any span that is_htmltag() accepts.
	 */
	17 static cpchar g_htmltags[] =
	18 {
	19 "abbr",
	20 "above",
	21 "accesskey",
	22 "acronym",
	23 "align",
	24 "alink",
	25 "all",
	26 "alt",
	27 "applet",
	28 "archive",
	29 "axis",
	30 "basefont",
	31 "baseline",
	32 "below",
	33 "bgcolor",
	34 "big",
	35 "body",
	36 "border",
	37 "bottom",
	38 "box",
	39 "button",
	40 "cellpadding",
	41 "cellspacing",
	42 "center",
	43 "char",
	44 "charoff",
	45 "charset",
	46 "circle",
	47 "cite",
	48 "class",
	49 "classid",
	50 "clear",
	51 "codebase",
	52 "codetype",
	53 "color",
	54 "cols",
	55 "colspan",
	56 "compact",
	57 "content",
	58 "coords",
	59 "data",
	60 "datetime",
	61 "declare",
	62 "default",
	63 "defer",
	64 "dfn",
	65 "dir",
	66 "disabled",
	67 "face",
	68 "font",
	69 "frameborder",
	70 "groups",
	71 "head",
	72 "headers",
	73 "height",
	74 "href",
	75 "hreflang",
	76 "hsides",
	77 "hspace",
	78 "http-equiv",
	79 "iframe",
	80 "img",
	81 "input",
	82 "ismap",
	83 "justify",
	84 "kbd",
	85 "label",
	86 "lang",
	87 "language",
	88 "left",
	89 "lhs",
	90 "link",
	91 "longdesc",
	92 "map",
	93 "marginheight",
	94 "marginwidth",
	95 "media",
	96 "meta",
	97 "middle",
	98 "multiple",
	99 "name",
	100 "nohref",
	101 "none",
	102 "noresize",
	103 "noshade",
	104 "nowrap",
	105 "object",
	106 "onblur",
	107 "onchange",
	108 "onclick",
	109 "ondblclick",
	110 "onfocus",
	111 "onkeydown",
	112 "onkeypress",
	113 "onkeyup",
	114 "onload",
	115 "onmousedown",
	116 "onmousemove",
	117 "onmouseout",
	118 "onmouseover",
	119 "onmouseup",
	120 "onselect",
	121 "onunload",
	122 "param",
	123 "poly",
	124 "profile",
	125 "prompt",
	126 "readonly",
	127 "rect",
	128 "rel",
	129 "rev",
	130 "rhs",
	131 "right",
	132 "rows",
	133 "rowspan",
	134 "rules",
	135 "samp",
	136 "scheme",
	137 "scope",
	138 "script",
	139 "scrolling",
	140 "select",
	141 "selected",
	142 "shape",
	143 "size",
	144 "small",
	145 "span",
	146 "src",
	147 "standby",
	148 "strike",
	149 "strong",
	150 "style",
	151 "sub",
	152 "summary",
	153 "sup",
	154 "tabindex",
	155 "table",
	156 "target",
	157 "textarea",
	158 "title",
	159 "top",
	160 "type",
	161 "usemap",
	162 "valign",
	163 "value",
	164 "valuetype",
	165 "var",
	166 "vlink",
	167 "void",
	168 "vsides",
	169 "vspace",
	170 "width"
	171 };
	/* Number of entries in g_htmltags. */
	172 static const uint g_nhtmltags = sizeof(g_htmltags) / sizeof(cpchar);
	173
	/*
	 * Header-name prefixes whose lines lex_nextline() drops completely.
	 * is_ignoredheader() binary-searches this table with strncasecmp(), so
	 * the entries must stay sorted case-insensitively.
	 */
	174 static cpchar g_ignoredheaders[] =
	175 {
	176 "Date:",
	177 "Delivery-date:",
	178 "Message-ID:",
	179 "X-Sorted:",
	180 "X-Spam-"
	181 };
	/*
	 * Entry count for g_ignoredheaders.
	 * NOTE(review): the divisor is cut off in this listing; presumably
	 * sizeof(cpchar), as for g_nhtmltags -- confirm against the real file.
	 */
	182 static const uint g_nignoredheaders = sizeof(g_ignoredheaders) / sizeof(…
	183
	184 static inline bool_t
	185 is_whitespace(int c)
	186 {
	187 return (c == ' ' \|\| c == '\t' \|\| c == '\r');
	188 }
	189
	/*
	 * is_wordmidchar: true if c may appear inside a word token.
	 *
	 * Alphanumerics, '$', the apostrophe and '.' are accepted in the part of
	 * the expression visible here.
	 * NOTE(review): the end of the return expression is cut off in this
	 * listing, so further accepted characters may exist -- confirm against
	 * the real file.
	 */
	190 static inline bool_t
	191 is_wordmidchar(int c)
	192 {
	193 return (isalnum(c) \|\| c == '$' \|\| c == '\'' \|\| c == '.' \|\| c == …
	194 }
	195
	196 static inline bool_t
	197 is_wordendchar(int c)
	198 {
	199 return (isalnum(c) \|\| c == '$');
	200 }
	201
	/*
	 * is_htmltag: check whether p starts with an entry of g_htmltags.
	 *
	 * p        - candidate text
	 * len      - number of bytes available at p
	 * ptoklen  - set to 0 on entry; on success set to the length of the
	 *            matched table entry
	 *
	 * Returns true only when a table entry is a proper prefix of p (len must
	 * exceed the entry length) and the text following it would not let
	 * is_word() produce a longer token.
	 */
	202 static inline bool_t
	203 is_htmltag(cpchar p, uint len, uint * ptoklen)
	204 {
	205 int lo, hi, mid, minlen, cmp;
	206
	207 *ptoklen = 0;
	208
	/* binary search: lo starts below the table, hi at its last entry */
	209 hi = g_nhtmltags - 1;
	210 lo = -1;
	211 while (hi - lo > 1) {
	212 mid = (hi + lo) / 2;
	/* compare only the bytes both strings have */
	213 minlen = min(strlen(g_htmltags[mid]), len);
	214 cmp = strncmp(g_htmltags[mid], p, minlen);
	/* NOTE(review): the tail of this condition is cut off in this listing */
	215 if (cmp > 0 \|\| (cmp == 0 && minlen < len && !islower(p[m…
	216 hi = mid;
	217 else
	218 lo = mid;
	219 }
	/* verify the candidate the search converged on */
	220 minlen = min(strlen(g_htmltags[hi]), len);
	221 if (len == minlen \|\| strncmp(g_htmltags[hi], p, minlen) != 0) {
	222 return false;
	223 }
	224 /* check if is_word() will have a longer match */
	225 if (is_wordendchar((unsigned char)p[minlen])) {
	226 return false;
	227 }
	/*
	 * NOTE(review): when minlen + 1 == len, p[minlen + 1] is p[len], one byte
	 * past the span described by len -- confirm the caller's buffer makes
	 * that read safe.
	 */
	228 if (is_wordmidchar((unsigned char)p[minlen]) &&
	229 is_wordendchar((unsigned char)p[minlen + 1])) {
	230 return false;
	231 }
	232 *ptoklen = strlen(g_htmltags[hi]);
	233
	234 return true;
	235 }
	236
	237 static inline bool_t
	238 is_htmlcomment(cpchar p, uint len, uint * ptoklen)
	239 {
	240 *ptoklen = 0;
	241
	242 if (len >= 4 && memcmp(p, "<!--", 4) == 0) {
	243 *ptoklen = 4;
	244 return true;
	245 }
	246 if (len >= 3 && memcmp(p, "-->", 3) == 0) {
	247 *ptoklen = 3;
	248 return true;
	249 }
	250 return false;
	251 }
	252
	253 static inline bool_t
	254 is_base64char(int c)
	255 {
	256 return (isalnum(c) \|\| (c == '/' \|\| c == '+'));
	257 }
	258
	/*
	 * is_base64: check whether the whole span p[0..len) looks like base64.
	 *
	 * p        - candidate text
	 * len      - number of bytes available at p
	 * ptoklen  - set to 0 on entry and incremented once per accepted byte
	 *
	 * Returns false at the first rejected byte and true when every byte
	 * (including the case len == 0) was accepted.
	 */
	259 static inline bool_t
	260 is_base64(cpchar p, uint len, uint * ptoklen)
	261 {
	262 *ptoklen = 0;
	263 while (len > 0) {
	/*
	 * NOTE(review): as shown, p (the pointer) is compared with '\n' and '\r';
	 * presumably the listing lost a dereference and *p is meant.  The end of
	 * the condition is also cut off -- confirm against the real file.
	 */
	264 if (p != '\n' && p != '\r' && !is_base64char((unsigned…
	265 return false;
	266 }
	267 p++;
	268 len--;
	269 (*ptoklen)++;
	270 }
	271 return true;
	272 }
	273
	274 static inline bool_t
	275 is_mimeboundary(cpchar p, uint len, uint * ptoklen)
	276 {
	277 *ptoklen = 0;
	278
	279 if (len < 3 \|\| p[0] != '-' \|\| p[1] != '-') {
	280 return false;
	281 }
	282 p += 2;
	283 len -= 2;
	284 *ptoklen += 2;
	285 while (len > 0) {
	286 if (is_whitespace(*p)) {
	287 return false;
	288 }
	289 if (p == '\n' \|\| p == '\r') {
	290 break;
	291 }
	292 p++;
	293 len--;
	294 (*ptoklen)++;
	295 }
	296 return true;
	297 }
	298
	299 static inline bool_t
	300 is_ipaddr(cpchar p, uint len, uint * ptoklen)
	301 {
	302 uint noctets, ndigits;
	303
	304 *ptoklen = 0;
	305
	306 noctets = 0;
	307 while (len > 0 && noctets < 4) {
	308 ndigits = 0;
	309 while (len > 0 && isdigit((unsigned char)*p)) {
	310 ndigits++;
	311 p++;
	312 len--;
	313 (*ptoklen)++;
	314 }
	315 if (ndigits == 0 \|\| ndigits > 3) {
	316 return false;
	317 }
	318 noctets++;
	319 if (noctets < 4) {
	320 if (*p != '.') {
	321 return false;
	322 }
	323 p++;
	324 len--;
	325 (*ptoklen)++;
	326 }
	327 }
	328 if (noctets < 4) {
	329 return false;
	330 }
	331 return true;
	332 }
	333
	/*
	 * is_word: recognise a word token at the start of p.
	 *
	 * p        - candidate text
	 * len      - number of bytes available at p
	 * ptoklen  - receives the token length; not written when len < 3
	 *
	 * A word starts with one leading character, continues through
	 * is_wordmidchar() characters, is trimmed back while its last character
	 * fails is_wordendchar(), and must end up at least three bytes long.
	 */
	334 static inline bool_t
	335 is_word(cpchar p, uint len, uint * ptoklen)
	336 {
	337 if (len < 3) {
	338 return false;
	339 }
	/*
	 * NOTE(review): as shown, the pointer p itself is cast and compared;
	 * presumably the listing lost a dereference and *p is meant -- confirm.
	 */
	340 if (!(isalpha((unsigned char)p) \|\| p == '$')) {
	341 return false;
	342 }
	343 *ptoklen = 1;
	344 p++;
	345 len--;
	/* extend over every character allowed inside a word */
	346 while (len > 0) {
	347 if (!is_wordmidchar((unsigned char)*p)) {
	348 break;
	349 }
	350 (*ptoklen)++;
	351 p++;
	352 len--;
	353 }
	/*
	 * Trim trailing characters that may not end a word.
	 * NOTE(review): "ptoklen >= 3" compares the pointer and "(p - 1)" is not
	 * dereferenced as shown; the line is also cut off.  Presumably *ptoklen
	 * and *(p - 1) are meant -- confirm against the real file.
	 */
	354 while (ptoklen >= 3 && !is_wordendchar((unsigned char)(p - 1))…
	355 (*ptoklen)--;
	356 p--;
	357 len++;
	358 }
	359 if (*ptoklen < 3) {
	360 return false;
	361 }
	362 return true;
	363 }
	364
	/*
	 * is_ignoredheader: check whether the line at p starts with one of the
	 * prefixes in g_ignoredheaders (compared case-insensitively).
	 *
	 * p        - start of the line
	 * len      - length of the line
	 * ptoklen  - set to len on a match; not written otherwise
	 *
	 * Returns true on a match, which makes the caller skip the whole line.
	 */
	365 static inline bool_t
	366 is_ignoredheader(cpchar p, uint len, uint * ptoklen)
	367 {
	368 int lo, hi, mid, minlen, cmp;
	369
	/* binary search: lo starts below the table, hi at its last entry */
	370 hi = g_nignoredheaders - 1;
	371 lo = -1;
	372 while (hi - lo > 1) {
	373 mid = (hi + lo) / 2;
	374 minlen = min(strlen(g_ignoredheaders[mid]), len);
	375 cmp = strncasecmp(g_ignoredheaders[mid], p, minlen);
	376 if (cmp >= 0)
	377 hi = mid;
	378 else
	379 lo = mid;
	380 }
	/*
	 * Verify the candidate; a line no longer than the prefix never matches.
	 * NOTE(review): the end of this condition is cut off in this listing.
	 */
	381 minlen = min(strlen(g_ignoredheaders[hi]), len);
	382 if (len == minlen \|\| strncasecmp(g_ignoredheaders[hi], p, minlen…
	383 return false;
	384 }
	385 *ptoklen = len;
	386 return true;
	387 }
	388
	389 static inline bool_t
	390 is_mailerid(cpchar p, uint len, uint * ptoklen)
	391 {
	392 if (len < 4 \|\| strncmp(p, "\tid ", 4) != 0) {
	393 return false;
	394 }
	395 *ptoklen = len;
	396 return true;
	397 }
	398
	399 static inline bool_t
	400 is_spamtext(cpchar p, uint len, uint * ptoklen)
	401 {
	402 if (len < 5 \|\| strncmp(p, "SPAM:", 5) != 0) {
	403 return false;
	404 }
	405 *ptoklen = len;
	406 return true;
	407 }
	408
	409 static inline bool_t
	410 is_smtpid(cpchar p, uint len, uint * ptoklen)
	411 {
	412 if (len < 8 \|\| strncmp(p, "SMTP id ", 8) != 0) {
	413 return false;
	414 }
	415 *ptoklen = len;
	416 return true;
	417 }
	418
	419 static inline bool_t
	420 is_boundaryequal(cpchar p, uint len, uint * ptoklen)
	421 {
	422 if (len < 9 \|\| strncmp(p, "boundary=", 9) != 0) {
	423 return false;
	424 }
	425 *ptoklen = len;
	426 return true;
	427 }
	428
	429 static inline bool_t
	430 is_nameequal(cpchar p, uint len, uint * ptoklen)
	431 {
	432 if (len < 6 \|\| strncmp(p, "name=\"", 6) != 0) {
	433 return false;
	434 }
	435 *ptoklen = 6;
	436 return true;
	437 }
	438
	439 static inline bool_t
	440 is_filenameequal(cpchar p, uint len, uint * ptoklen)
	441 {
	442 if (len < 10 \|\| strncmp(p, "filename=\"", 10) != 0) {
	443 return false;
	444 }
	445 *ptoklen = 10;
	446 return true;
	447 }
	448
	449 static inline bool_t
	450 is_from(cpchar p, uint len, uint * ptoklen)
	451 {
	452 if (len < 5 \|\| strncmp(p, "From ", 5) != 0) {
	453 return false;
	454 }
	455 *ptoklen = 5;
	456 return true;
	457 }
	458
	459 void
	460 lex_create(lex_t * pthis, mbox_t mboxtype)
	461 {
	462 pthis->mboxtype = mboxtype;
	463 pthis->section = envelope;
	464 pthis->pos = 0;
	465 pthis->bom = 0;
	466 pthis->eom = 0;
	467 pthis->lineend = 0;
	468 pthis->buflen = 0;
	469 pthis->pbuf = NULL;
	470 }
	471
	472 void
	473 lex_destroy(lex_t * pthis)
	474 {
	475 free(pthis->pbuf);
	476 }
	477
	/*
	 * lex_load: read everything available from fd into a heap buffer owned
	 * by the lexer.
	 *
	 * pthis - lexer that receives the data (pbuf / buflen)
	 * fd    - descriptor to read from
	 *
	 * Returns false when an allocation fails or when the final read()
	 * reports an error; on the realloc and read failure paths the buffer is
	 * freed and pbuf is reset to NULL.  Returns true otherwise.  If the
	 * lexer's mboxtype is detect, it is replaced by mbox or maildir based on
	 * the start of the data.
	 */
	478 bool_t
	479 lex_load(lex_t * pthis, int fd)
	480 {
	481 uint nalloc;
	482 ssize_t nread;
	483
	484 nalloc = IOBUFSIZE;
	485 if ((pthis->pbuf = malloc(IOBUFSIZE)) == NULL)
	486 return false;
	487
	/*
	 * Data is appended at offset pthis->buflen.
	 * NOTE(review): the end of the read() call and the loop condition are
	 * cut off in this listing.
	 */
	488 while ((nread = read(fd, pthis->pbuf + pthis->buflen, nalloc - p…
	489 pthis->buflen += nread;
	/* buffer full: grow it by another IOBUFSIZE bytes */
	490 if (pthis->buflen == nalloc) {
	491 char *pnewbuf;
	492
	493 nalloc += IOBUFSIZE;
	494 pnewbuf = (char *) realloc(pthis->pbuf, nalloc);
	495 if (pnewbuf == NULL) {
	496 free(pthis->pbuf);
	497 pthis->pbuf = NULL;
	498 return false;
	499 }
	500 pthis->pbuf = pnewbuf;
	501 }
	502 }
	/* a negative result from the last read() is treated as failure */
	503 if (nread < 0) {
	504 free(pthis->pbuf);
	505 pthis->pbuf = NULL;
	506 return false;
	507 }
	508 if (pthis->mboxtype == detect) {
	/*
	 * The first five bytes are compared with "From ".
	 * NOTE(review): the comparison result test is cut off in this listing.
	 */
	509 if (pthis->buflen > 5 && memcmp(pthis->pbuf, "From ", 5)…
	510 verbose(1, "Input looks like an mbox\n");
	511 pthis->mboxtype = mbox;
	512 } else {
	513 verbose(1, "Input looks like a maildir\n");
	514 pthis->mboxtype = maildir;
	515 }
	516 }
	517 return true;
	518 }
	519
	520 static bool_t
	521 lex_nextline(lex_t * pthis)
	522 {
	523 cpchar pbuf;
	524 uint len;
	525 uint toklen;
	526
	527 again:
	528 /* XXX: use and update pthis->section */
	529 pthis->pos = pthis->lineend;
	530 if (pthis->lineend == pthis->buflen) {
	531 return false;
	532 }
	533 pbuf = pthis->pbuf + pthis->pos;
	534 len = 0;
	535 while (pthis->pos + len < pthis->buflen && pbuf[len] != '\n') {
	536 len++;
	537 }
	538 if (pthis->pos + len < pthis->buflen) {
	539 len++; /* bump past the LF */
	540 }
	541 pthis->lineend = pthis->pos + len;
	542
	543 /* check beginning-of-line patterns */
	544 if (is_base64(pbuf, len, &toklen) \|\|
	545 is_ignoredheader(pbuf, len, &toklen) \|\|
	546 is_mailerid(pbuf, len, &toklen) \|\|
	547 is_mimeboundary(pbuf, len, &toklen) \|\|
	548 is_spamtext(pbuf, len, &toklen)) {
	549 /* ignore line */
	550 pthis->pos += toklen;
	551 goto again;
	552 }
	553 return true;
	554 }
	555
	/*
	 * lex_nexttoken: produce the next token from the loaded input.
	 *
	 * pthis - lexer state; pos, bom and eom are updated
	 * ptok  - receives the token: tt always, p and len for from/word tokens
	 *
	 * Whitespace between tokens is skipped.  When the current line is used
	 * up, lex_nextline() supplies the next one; if there is none, eom is set
	 * to the current position and an eof token is returned.  In mbox mode a
	 * line starting with "From " yields a from token and marks eom.  Spans
	 * accepted by is_htmltag(), is_htmlcomment(), is_smtpid(),
	 * is_boundaryequal(), is_nameequal() or is_filenameequal() are skipped.
	 * IP addresses and words are both returned with tt = word; words longer
	 * than MAXWORDLEN are skipped.  Any other byte is discarded.
	 */
	556 void
	557 lex_nexttoken(lex_t * pthis, tok_t * ptok)
	558 {
	559 cpchar pbuf;
	560 uint len;
	561 uint toklen;
	562
	/* the previous call ended a message: the next one begins here */
	563 if (pthis->pos == pthis->eom) {
	564 pthis->bom = pthis->pos;
	565 }
	566 again:
	567 /* skip whitespace between tokens */
	/* NOTE(review): the end of this loop condition is cut off in this listing */
	568 while (pthis->pos != pthis->lineend && is_whitespace(pthis->pbuf…
	569 pthis->pos++;
	570 }
	571
	572 pbuf = pthis->pbuf + pthis->pos;
	573 len = pthis->lineend - pthis->pos;
	574
	575 /* possibilities: end-of-line, html-comment, ipaddr, word, junk …
	576
	577 if (pthis->pos == pthis->lineend) {
	578 if (!lex_nextline(pthis)) {
	579 pthis->eom = pthis->pos;
	580 ptok->tt = eof;
	581 return;
	582 }
	/* lex_nextline() moved pos and lineend: recompute the working span */
	583 pbuf = pthis->pbuf + pthis->pos;
	584 len = pthis->lineend - pthis->pos;
	585
	586 if (pthis->mboxtype == mbox) {
	587 if (is_from(pbuf, len, &toklen)) {
	588 pthis->eom = pthis->pos;
	589 ptok->tt = from;
	590 ptok->p = pthis->pbuf + pthis->pos;
	591 ptok->len = toklen;
	592 pthis->pos += toklen;
	593 return;
	594 }
	595 }
	596 goto again; /* skip lws */
	597 }
	598 if (is_htmltag(pbuf, len, &toklen) \|\|
	599 is_htmlcomment(pbuf, len, &toklen) \|\|
	600 is_smtpid(pbuf, len, &toklen) \|\|
	601 is_boundaryequal(pbuf, len, &toklen) \|\|
	602 is_nameequal(pbuf, len, &toklen) \|\|
	603 is_filenameequal(pbuf, len, &toklen)) {
	604 /* ignore it */
	605 pthis->pos += toklen;
	606 goto again;
	607 }
	/* IP addresses are reported with the same token type as words */
	608 if (is_ipaddr(pbuf, len, &toklen)) {
	609 ptok->tt = word;
	610 ptok->p = pthis->pbuf + pthis->pos;
	611 ptok->len = toklen;
	612 pthis->pos += toklen;
	613 return;
	614 }
	615 if (is_word(pbuf, len, &toklen)) {
	616 ptok->tt = word;
	617 ptok->p = pthis->pbuf + pthis->pos;
	618 ptok->len = toklen;
	619 pthis->pos += toklen;
	/* over-long words are consumed but not returned */
	620 if (toklen > MAXWORDLEN) {
	621 goto again;
	622 }
	623 return;
	624 }
	625 /* junk */
	626 pthis->pos++;
	627 goto again;
	628 }
	629
	630 /* SpamAssassin style passthru */
	/*
	 * lex_passthru: copy the current message (bom up to eom) to standard
	 * output with a freshly generated X-Spam-Status header.
	 *
	 * pthis    - lexer holding the message; pos is rewound to bom and bom is
	 *            set to eom when done
	 * is_spam  - selects the "Yes" (plus X-Spam-Flag) or "No" header text
	 * hits     - score printed in the header next to SPAM_CUTOFF
	 *
	 * Existing header lines are written unchanged except those starting with
	 * "X-Spam-" (compared case-insensitively), which are dropped.  The new
	 * header is written where the header block ends, followed by the rest of
	 * the message.
	 *
	 * NOTE(review): sprintf() writes into the 256-byte szbuf without a size
	 * limit, and "%f" of a very large double can be hundreds of characters
	 * long -- consider snprintf().  The return values of write() are not
	 * checked, so short or failed writes go unnoticed.
	 */
	631 void
	632 lex_passthru(lex_t * pthis, bool_t is_spam, double hits)
	633 {
	634 char szbuf[256];
	635 bool_t in_headers = true;
	636
	637 pthis->pos = pthis->bom;
	/* NOTE(review): both format strings are cut off in this listing */
	638 if (is_spam) {
	639 sprintf(szbuf, "X-Spam-Status: Yes, hits=%f required=%f,…
	640 "X-Spam-Flag: YES\n",
	641 hits, SPAM_CUTOFF);
	642 } else {
	643 sprintf(szbuf, "X-Spam-Status: No, hits=%f required=%f\n…
	644 hits, SPAM_CUTOFF);
	645 }
	646
	647 /* existing headers */
	648 while (in_headers && pthis->pos < pthis->eom) {
	649 cpchar pbuf = pthis->pbuf + pthis->pos;
	650 uint len = 0;
	651
	/* measure the current line; the condition is cut off in this listing */
	652 while (pthis->pos + len < pthis->buflen && pbuf[len] != …
	653 len++;
	654 }
	655 if (pthis->pos + len < pthis->buflen) {
	656 len++; /* bump past the LF */
	657 }
	658 /* check for end of headers */
	659 if (pbuf[0] == '\n' \|\| (pbuf[0] == '\r' && pbuf[1] == '\…
	660 /* end of headers */
	661 break;
	662 }
	663 /* write header, ignoring existing spam headers */
	664 if (strncasecmp(pbuf, "X-Spam-", 7) != 0) {
	665 write(STDOUT_FILENO, pbuf, len);
	666 }
	667 pthis->pos += len;
	668 }
	669
	670 /* new headers */
	671 write(STDOUT_FILENO, szbuf, strlen(szbuf));
	672
	673 /* remainder */
	674 if (pthis->pos < pthis->eom) {
	/* NOTE(review): the length argument is cut off in this listing */
	675 write(STDOUT_FILENO, pthis->pbuf + pthis->pos, pthis->eo…
	676 }
	/* the next message starts where this one ended */
	677 pthis->bom = pthis->eom;
	678 }