lex.c - bmf - bmf (Bayesian Mail Filter) 0.9.4 fork + patches | |
git clone git://git.codemadness.org/bmf | |
Log | |
Files | |
Refs | |
README | |
LICENSE | |
--- | |
lex.c (11934B) | |
--- | |
1 /* $Id: lex.c,v 1.18 2002/10/20 20:29:15 tommy Exp $ */ | |
2 | |
3 /* | |
4 * Copyright (c) 2002 Tom Marshall <[email protected]> | |
5 * | |
6 * This program is free software. It may be distributed under the terms | |
7 * in the file LICENSE, found in the top level of the distribution. | |
8 * | |
9 * lex.c: generate token stream for bmf. | |
10 */ | |
11 | |
12 #include "config.h" | |
13 #include "dbg.h" | |
14 #include "str.h" | |
15 #include "lex.h" | |
16 | |
17 static cpchar g_htmltags[] = | |
18 { | |
19 "abbr", | |
20 "above", | |
21 "accesskey", | |
22 "acronym", | |
23 "align", | |
24 "alink", | |
25 "all", | |
26 "alt", | |
27 "applet", | |
28 "archive", | |
29 "axis", | |
30 "basefont", | |
31 "baseline", | |
32 "below", | |
33 "bgcolor", | |
34 "big", | |
35 "body", | |
36 "border", | |
37 "bottom", | |
38 "box", | |
39 "button", | |
40 "cellpadding", | |
41 "cellspacing", | |
42 "center", | |
43 "char", | |
44 "charoff", | |
45 "charset", | |
46 "circle", | |
47 "cite", | |
48 "class", | |
49 "classid", | |
50 "clear", | |
51 "codebase", | |
52 "codetype", | |
53 "color", | |
54 "cols", | |
55 "colspan", | |
56 "compact", | |
57 "content", | |
58 "coords", | |
59 "data", | |
60 "datetime", | |
61 "declare", | |
62 "default", | |
63 "defer", | |
64 "dfn", | |
65 "dir", | |
66 "disabled", | |
67 "face", | |
68 "font", | |
69 "frameborder", | |
70 "groups", | |
71 "head", | |
72 "headers", | |
73 "height", | |
74 "href", | |
75 "hreflang", | |
76 "hsides", | |
77 "hspace", | |
78 "http-equiv", | |
79 "iframe", | |
80 "img", | |
81 "input", | |
82 "ismap", | |
83 "justify", | |
84 "kbd", | |
85 "label", | |
86 "lang", | |
87 "language", | |
88 "left", | |
89 "lhs", | |
90 "link", | |
91 "longdesc", | |
92 "map", | |
93 "marginheight", | |
94 "marginwidth", | |
95 "media", | |
96 "meta", | |
97 "middle", | |
98 "multiple", | |
99 "name", | |
100 "nohref", | |
101 "none", | |
102 "noresize", | |
103 "noshade", | |
104 "nowrap", | |
105 "object", | |
106 "onblur", | |
107 "onchange", | |
108 "onclick", | |
109 "ondblclick", | |
110 "onfocus", | |
111 "onkeydown", | |
112 "onkeypress", | |
113 "onkeyup", | |
114 "onload", | |
115 "onmousedown", | |
116 "onmousemove", | |
117 "onmouseout", | |
118 "onmouseover", | |
119 "onmouseup", | |
120 "onselect", | |
121 "onunload", | |
122 "param", | |
123 "poly", | |
124 "profile", | |
125 "prompt", | |
126 "readonly", | |
127 "rect", | |
128 "rel", | |
129 "rev", | |
130 "rhs", | |
131 "right", | |
132 "rows", | |
133 "rowspan", | |
134 "rules", | |
135 "samp", | |
136 "scheme", | |
137 "scope", | |
138 "script", | |
139 "scrolling", | |
140 "select", | |
141 "selected", | |
142 "shape", | |
143 "size", | |
144 "small", | |
145 "span", | |
146 "src", | |
147 "standby", | |
148 "strike", | |
149 "strong", | |
150 "style", | |
151 "sub", | |
152 "summary", | |
153 "sup", | |
154 "tabindex", | |
155 "table", | |
156 "target", | |
157 "textarea", | |
158 "title", | |
159 "top", | |
160 "type", | |
161 "usemap", | |
162 "valign", | |
163 "value", | |
164 "valuetype", | |
165 "var", | |
166 "vlink", | |
167 "void", | |
168 "vsides", | |
169 "vspace", | |
170 "width" | |
171 }; | |
172 static const uint g_nhtmltags = sizeof(g_htmltags) / sizeof(cpchar); | |
173 | |
174 static cpchar g_ignoredheaders[] = | |
175 { | |
176 "Date:", | |
177 "Delivery-date:", | |
178 "Message-ID:", | |
179 "X-Sorted:", | |
180 "X-Spam-" | |
181 }; | |
182 static const uint g_nignoredheaders = sizeof(g_ignoredheaders) / sizeof(… | |
183 | |
184 static inline bool_t | |
185 is_whitespace(int c) | |
186 { | |
187 return (c == ' ' || c == '\t' || c == '\r'); | |
188 } | |
189 | |
190 static inline bool_t | |
191 is_wordmidchar(int c) | |
192 { | |
193 return (isalnum(c) || c == '$' || c == '\'' || c == '.' || c == … | |
194 } | |
195 | |
196 static inline bool_t | |
197 is_wordendchar(int c) | |
198 { | |
199 return (isalnum(c) || c == '$'); | |
200 } | |
201 | |
202 static inline bool_t | |
203 is_htmltag(cpchar p, uint len, uint * ptoklen) | |
204 { | |
205 int lo, hi, mid, minlen, cmp; | |
206 | |
207 *ptoklen = 0; | |
208 | |
209 hi = g_nhtmltags - 1; | |
210 lo = -1; | |
211 while (hi - lo > 1) { | |
212 mid = (hi + lo) / 2; | |
213 minlen = min(strlen(g_htmltags[mid]), len); | |
214 cmp = strncmp(g_htmltags[mid], p, minlen); | |
215 if (cmp > 0 || (cmp == 0 && minlen < len && !islower(p[m… | |
216 hi = mid; | |
217 else | |
218 lo = mid; | |
219 } | |
220 minlen = min(strlen(g_htmltags[hi]), len); | |
221 if (len == minlen || strncmp(g_htmltags[hi], p, minlen) != 0) { | |
222 return false; | |
223 } | |
224 /* check if is_word() will have a longer match */ | |
225 if (is_wordendchar((unsigned char)p[minlen])) { | |
226 return false; | |
227 } | |
228 if (is_wordmidchar((unsigned char)p[minlen]) && | |
229 is_wordendchar((unsigned char)p[minlen + 1])) { | |
230 return false; | |
231 } | |
232 *ptoklen = strlen(g_htmltags[hi]); | |
233 | |
234 return true; | |
235 } | |
236 | |
237 static inline bool_t | |
238 is_htmlcomment(cpchar p, uint len, uint * ptoklen) | |
239 { | |
240 *ptoklen = 0; | |
241 | |
242 if (len >= 4 && memcmp(p, "<!--", 4) == 0) { | |
243 *ptoklen = 4; | |
244 return true; | |
245 } | |
246 if (len >= 3 && memcmp(p, "-->", 3) == 0) { | |
247 *ptoklen = 3; | |
248 return true; | |
249 } | |
250 return false; | |
251 } | |
252 | |
253 static inline bool_t | |
254 is_base64char(int c) | |
255 { | |
256 return (isalnum(c) || (c == '/' || c == '+')); | |
257 } | |
258 | |
259 static inline bool_t | |
260 is_base64(cpchar p, uint len, uint * ptoklen) | |
261 { | |
262 *ptoklen = 0; | |
263 while (len > 0) { | |
264 if (*p != '\n' && *p != '\r' && !is_base64char((unsigned… | |
265 return false; | |
266 } | |
267 p++; | |
268 len--; | |
269 (*ptoklen)++; | |
270 } | |
271 return true; | |
272 } | |
273 | |
274 static inline bool_t | |
275 is_mimeboundary(cpchar p, uint len, uint * ptoklen) | |
276 { | |
277 *ptoklen = 0; | |
278 | |
279 if (len < 3 || p[0] != '-' || p[1] != '-') { | |
280 return false; | |
281 } | |
282 p += 2; | |
283 len -= 2; | |
284 *ptoklen += 2; | |
285 while (len > 0) { | |
286 if (is_whitespace(*p)) { | |
287 return false; | |
288 } | |
289 if (*p == '\n' || *p == '\r') { | |
290 break; | |
291 } | |
292 p++; | |
293 len--; | |
294 (*ptoklen)++; | |
295 } | |
296 return true; | |
297 } | |
298 | |
299 static inline bool_t | |
300 is_ipaddr(cpchar p, uint len, uint * ptoklen) | |
301 { | |
302 uint noctets, ndigits; | |
303 | |
304 *ptoklen = 0; | |
305 | |
306 noctets = 0; | |
307 while (len > 0 && noctets < 4) { | |
308 ndigits = 0; | |
309 while (len > 0 && isdigit((unsigned char)*p)) { | |
310 ndigits++; | |
311 p++; | |
312 len--; | |
313 (*ptoklen)++; | |
314 } | |
315 if (ndigits == 0 || ndigits > 3) { | |
316 return false; | |
317 } | |
318 noctets++; | |
319 if (noctets < 4) { | |
320 if (*p != '.') { | |
321 return false; | |
322 } | |
323 p++; | |
324 len--; | |
325 (*ptoklen)++; | |
326 } | |
327 } | |
328 if (noctets < 4) { | |
329 return false; | |
330 } | |
331 return true; | |
332 } | |
333 | |
334 static inline bool_t | |
335 is_word(cpchar p, uint len, uint * ptoklen) | |
336 { | |
337 if (len < 3) { | |
338 return false; | |
339 } | |
340 if (!(isalpha((unsigned char)*p) || *p == '$')) { | |
341 return false; | |
342 } | |
343 *ptoklen = 1; | |
344 p++; | |
345 len--; | |
346 while (len > 0) { | |
347 if (!is_wordmidchar((unsigned char)*p)) { | |
348 break; | |
349 } | |
350 (*ptoklen)++; | |
351 p++; | |
352 len--; | |
353 } | |
354 while (*ptoklen >= 3 && !is_wordendchar((unsigned char)*(p - 1))… | |
355 (*ptoklen)--; | |
356 p--; | |
357 len++; | |
358 } | |
359 if (*ptoklen < 3) { | |
360 return false; | |
361 } | |
362 return true; | |
363 } | |
364 | |
365 static inline bool_t | |
366 is_ignoredheader(cpchar p, uint len, uint * ptoklen) | |
367 { | |
368 int lo, hi, mid, minlen, cmp; | |
369 | |
370 hi = g_nignoredheaders - 1; | |
371 lo = -1; | |
372 while (hi - lo > 1) { | |
373 mid = (hi + lo) / 2; | |
374 minlen = min(strlen(g_ignoredheaders[mid]), len); | |
375 cmp = strncasecmp(g_ignoredheaders[mid], p, minlen); | |
376 if (cmp >= 0) | |
377 hi = mid; | |
378 else | |
379 lo = mid; | |
380 } | |
381 minlen = min(strlen(g_ignoredheaders[hi]), len); | |
382 if (len == minlen || strncasecmp(g_ignoredheaders[hi], p, minlen… | |
383 return false; | |
384 } | |
385 *ptoklen = len; | |
386 return true; | |
387 } | |
388 | |
389 static inline bool_t | |
390 is_mailerid(cpchar p, uint len, uint * ptoklen) | |
391 { | |
392 if (len < 4 || strncmp(p, "\tid ", 4) != 0) { | |
393 return false; | |
394 } | |
395 *ptoklen = len; | |
396 return true; | |
397 } | |
398 | |
399 static inline bool_t | |
400 is_spamtext(cpchar p, uint len, uint * ptoklen) | |
401 { | |
402 if (len < 5 || strncmp(p, "SPAM:", 5) != 0) { | |
403 return false; | |
404 } | |
405 *ptoklen = len; | |
406 return true; | |
407 } | |
408 | |
409 static inline bool_t | |
410 is_smtpid(cpchar p, uint len, uint * ptoklen) | |
411 { | |
412 if (len < 8 || strncmp(p, "SMTP id ", 8) != 0) { | |
413 return false; | |
414 } | |
415 *ptoklen = len; | |
416 return true; | |
417 } | |
418 | |
419 static inline bool_t | |
420 is_boundaryequal(cpchar p, uint len, uint * ptoklen) | |
421 { | |
422 if (len < 9 || strncmp(p, "boundary=", 9) != 0) { | |
423 return false; | |
424 } | |
425 *ptoklen = len; | |
426 return true; | |
427 } | |
428 | |
429 static inline bool_t | |
430 is_nameequal(cpchar p, uint len, uint * ptoklen) | |
431 { | |
432 if (len < 6 || strncmp(p, "name=\"", 6) != 0) { | |
433 return false; | |
434 } | |
435 *ptoklen = 6; | |
436 return true; | |
437 } | |
438 | |
439 static inline bool_t | |
440 is_filenameequal(cpchar p, uint len, uint * ptoklen) | |
441 { | |
442 if (len < 10 || strncmp(p, "filename=\"", 10) != 0) { | |
443 return false; | |
444 } | |
445 *ptoklen = 10; | |
446 return true; | |
447 } | |
448 | |
449 static inline bool_t | |
450 is_from(cpchar p, uint len, uint * ptoklen) | |
451 { | |
452 if (len < 5 || strncmp(p, "From ", 5) != 0) { | |
453 return false; | |
454 } | |
455 *ptoklen = 5; | |
456 return true; | |
457 } | |
458 | |
459 void | |
460 lex_create(lex_t * pthis, mbox_t mboxtype) | |
461 { | |
462 pthis->mboxtype = mboxtype; | |
463 pthis->section = envelope; | |
464 pthis->pos = 0; | |
465 pthis->bom = 0; | |
466 pthis->eom = 0; | |
467 pthis->lineend = 0; | |
468 pthis->buflen = 0; | |
469 pthis->pbuf = NULL; | |
470 } | |
471 | |
472 void | |
473 lex_destroy(lex_t * pthis) | |
474 { | |
475 free(pthis->pbuf); | |
476 } | |
477 | |
478 bool_t | |
479 lex_load(lex_t * pthis, int fd) | |
480 { | |
481 uint nalloc; | |
482 ssize_t nread; | |
483 | |
484 nalloc = IOBUFSIZE; | |
485 if ((pthis->pbuf = malloc(IOBUFSIZE)) == NULL) | |
486 return false; | |
487 | |
488 while ((nread = read(fd, pthis->pbuf + pthis->buflen, nalloc - p… | |
489 pthis->buflen += nread; | |
490 if (pthis->buflen == nalloc) { | |
491 char *pnewbuf; | |
492 | |
493 nalloc += IOBUFSIZE; | |
494 pnewbuf = (char *) realloc(pthis->pbuf, nalloc); | |
495 if (pnewbuf == NULL) { | |
496 free(pthis->pbuf); | |
497 pthis->pbuf = NULL; | |
498 return false; | |
499 } | |
500 pthis->pbuf = pnewbuf; | |
501 } | |
502 } | |
503 if (nread < 0) { | |
504 free(pthis->pbuf); | |
505 pthis->pbuf = NULL; | |
506 return false; | |
507 } | |
508 if (pthis->mboxtype == detect) { | |
509 if (pthis->buflen > 5 && memcmp(pthis->pbuf, "From ", 5)… | |
510 verbose(1, "Input looks like an mbox\n"); | |
511 pthis->mboxtype = mbox; | |
512 } else { | |
513 verbose(1, "Input looks like a maildir\n"); | |
514 pthis->mboxtype = maildir; | |
515 } | |
516 } | |
517 return true; | |
518 } | |
519 | |
520 static bool_t | |
521 lex_nextline(lex_t * pthis) | |
522 { | |
523 cpchar pbuf; | |
524 uint len; | |
525 uint toklen; | |
526 | |
527 again: | |
528 /* XXX: use and update pthis->section */ | |
529 pthis->pos = pthis->lineend; | |
530 if (pthis->lineend == pthis->buflen) { | |
531 return false; | |
532 } | |
533 pbuf = pthis->pbuf + pthis->pos; | |
534 len = 0; | |
535 while (pthis->pos + len < pthis->buflen && pbuf[len] != '\n') { | |
536 len++; | |
537 } | |
538 if (pthis->pos + len < pthis->buflen) { | |
539 len++; /* bump past the LF */ | |
540 } | |
541 pthis->lineend = pthis->pos + len; | |
542 | |
543 /* check beginning-of-line patterns */ | |
544 if (is_base64(pbuf, len, &toklen) || | |
545 is_ignoredheader(pbuf, len, &toklen) || | |
546 is_mailerid(pbuf, len, &toklen) || | |
547 is_mimeboundary(pbuf, len, &toklen) || | |
548 is_spamtext(pbuf, len, &toklen)) { | |
549 /* ignore line */ | |
550 pthis->pos += toklen; | |
551 goto again; | |
552 } | |
553 return true; | |
554 } | |
555 | |
556 void | |
557 lex_nexttoken(lex_t * pthis, tok_t * ptok) | |
558 { | |
559 cpchar pbuf; | |
560 uint len; | |
561 uint toklen; | |
562 | |
563 if (pthis->pos == pthis->eom) { | |
564 pthis->bom = pthis->pos; | |
565 } | |
566 again: | |
567 /* skip whitespace between tokens */ | |
568 while (pthis->pos != pthis->lineend && is_whitespace(pthis->pbuf… | |
569 pthis->pos++; | |
570 } | |
571 | |
572 pbuf = pthis->pbuf + pthis->pos; | |
573 len = pthis->lineend - pthis->pos; | |
574 | |
575 /* possibilities: end-of-line, html-comment, ipaddr, word, junk … | |
576 | |
577 if (pthis->pos == pthis->lineend) { | |
578 if (!lex_nextline(pthis)) { | |
579 pthis->eom = pthis->pos; | |
580 ptok->tt = eof; | |
581 return; | |
582 } | |
583 pbuf = pthis->pbuf + pthis->pos; | |
584 len = pthis->lineend - pthis->pos; | |
585 | |
586 if (pthis->mboxtype == mbox) { | |
587 if (is_from(pbuf, len, &toklen)) { | |
588 pthis->eom = pthis->pos; | |
589 ptok->tt = from; | |
590 ptok->p = pthis->pbuf + pthis->pos; | |
591 ptok->len = toklen; | |
592 pthis->pos += toklen; | |
593 return; | |
594 } | |
595 } | |
596 goto again; /* skip lws */ | |
597 } | |
598 if (is_htmltag(pbuf, len, &toklen) || | |
599 is_htmlcomment(pbuf, len, &toklen) || | |
600 is_smtpid(pbuf, len, &toklen) || | |
601 is_boundaryequal(pbuf, len, &toklen) || | |
602 is_nameequal(pbuf, len, &toklen) || | |
603 is_filenameequal(pbuf, len, &toklen)) { | |
604 /* ignore it */ | |
605 pthis->pos += toklen; | |
606 goto again; | |
607 } | |
608 if (is_ipaddr(pbuf, len, &toklen)) { | |
609 ptok->tt = word; | |
610 ptok->p = pthis->pbuf + pthis->pos; | |
611 ptok->len = toklen; | |
612 pthis->pos += toklen; | |
613 return; | |
614 } | |
615 if (is_word(pbuf, len, &toklen)) { | |
616 ptok->tt = word; | |
617 ptok->p = pthis->pbuf + pthis->pos; | |
618 ptok->len = toklen; | |
619 pthis->pos += toklen; | |
620 if (toklen > MAXWORDLEN) { | |
621 goto again; | |
622 } | |
623 return; | |
624 } | |
625 /* junk */ | |
626 pthis->pos++; | |
627 goto again; | |
628 } | |
629 | |
630 /* SpamAssassin style passthru */ | |
631 void | |
632 lex_passthru(lex_t * pthis, bool_t is_spam, double hits) | |
633 { | |
634 char szbuf[256]; | |
635 bool_t in_headers = true; | |
636 | |
637 pthis->pos = pthis->bom; | |
638 if (is_spam) { | |
639 sprintf(szbuf, "X-Spam-Status: Yes, hits=%f required=%f,… | |
640 "X-Spam-Flag: YES\n", | |
641 hits, SPAM_CUTOFF); | |
642 } else { | |
643 sprintf(szbuf, "X-Spam-Status: No, hits=%f required=%f\n… | |
644 hits, SPAM_CUTOFF); | |
645 } | |
646 | |
647 /* existing headers */ | |
648 while (in_headers && pthis->pos < pthis->eom) { | |
649 cpchar pbuf = pthis->pbuf + pthis->pos; | |
650 uint len = 0; | |
651 | |
652 while (pthis->pos + len < pthis->buflen && pbuf[len] != … | |
653 len++; | |
654 } | |
655 if (pthis->pos + len < pthis->buflen) { | |
656 len++; /* bump past the LF */ | |
657 } | |
658 /* check for end of headers */ | |
659 if (pbuf[0] == '\n' || (pbuf[0] == '\r' && pbuf[1] == '\… | |
660 /* end of headers */ | |
661 break; | |
662 } | |
663 /* write header, ignoring existing spam headers */ | |
664 if (strncasecmp(pbuf, "X-Spam-", 7) != 0) { | |
665 write(STDOUT_FILENO, pbuf, len); | |
666 } | |
667 pthis->pos += len; | |
668 } | |
669 | |
670 /* new headers */ | |
671 write(STDOUT_FILENO, szbuf, strlen(szbuf)); | |
672 | |
673 /* remainder */ | |
674 if (pthis->pos < pthis->eom) { | |
675 write(STDOUT_FILENO, pthis->pbuf + pthis->pos, pthis->eo… | |
676 } | |
677 pthis->bom = pthis->eom; | |
678 } |