Introduction
Introduction Statistics Contact Development Disclaimer Help
lex.c - bmf - bmf (Bayesian Mail Filter) 0.9.4 fork + patches
git clone git://git.codemadness.org/bmf
Log
Files
Refs
README
LICENSE
---
lex.c (11934B)
---
1 /* $Id: lex.c,v 1.18 2002/10/20 20:29:15 tommy Exp $ */
2
3 /*
4 * Copyright (c) 2002 Tom Marshall <[email protected]>
5 *
6 * This program is free software. It may be distributed under the terms
7 * in the file LICENSE, found in the top level of the distribution.
8 *
9 * lex.c: generate token stream for bmf.
10 */
11
12 #include "config.h"
13 #include "dbg.h"
14 #include "str.h"
15 #include "lex.h"
16
/*
 * Names that is_htmltag() recognizes so the lexer can drop them instead of
 * emitting them as words.
 *
 * is_htmltag() binary-searches this table with strncmp(), so the entries
 * must stay in ascending byte order.  Every entry is lower case.
 */
17 static cpchar g_htmltags[] =
18 {
19 "abbr",
20 "above",
21 "accesskey",
22 "acronym",
23 "align",
24 "alink",
25 "all",
26 "alt",
27 "applet",
28 "archive",
29 "axis",
30 "basefont",
31 "baseline",
32 "below",
33 "bgcolor",
34 "big",
35 "body",
36 "border",
37 "bottom",
38 "box",
39 "button",
40 "cellpadding",
41 "cellspacing",
42 "center",
43 "char",
44 "charoff",
45 "charset",
46 "circle",
47 "cite",
48 "class",
49 "classid",
50 "clear",
51 "codebase",
52 "codetype",
53 "color",
54 "cols",
55 "colspan",
56 "compact",
57 "content",
58 "coords",
59 "data",
60 "datetime",
61 "declare",
62 "default",
63 "defer",
64 "dfn",
65 "dir",
66 "disabled",
67 "face",
68 "font",
69 "frameborder",
70 "groups",
71 "head",
72 "headers",
73 "height",
74 "href",
75 "hreflang",
76 "hsides",
77 "hspace",
78 "http-equiv",
79 "iframe",
80 "img",
81 "input",
82 "ismap",
83 "justify",
84 "kbd",
85 "label",
86 "lang",
87 "language",
88 "left",
89 "lhs",
90 "link",
91 "longdesc",
92 "map",
93 "marginheight",
94 "marginwidth",
95 "media",
96 "meta",
97 "middle",
98 "multiple",
99 "name",
100 "nohref",
101 "none",
102 "noresize",
103 "noshade",
104 "nowrap",
105 "object",
106 "onblur",
107 "onchange",
108 "onclick",
109 "ondblclick",
110 "onfocus",
111 "onkeydown",
112 "onkeypress",
113 "onkeyup",
114 "onload",
115 "onmousedown",
116 "onmousemove",
117 "onmouseout",
118 "onmouseover",
119 "onmouseup",
120 "onselect",
121 "onunload",
122 "param",
123 "poly",
124 "profile",
125 "prompt",
126 "readonly",
127 "rect",
128 "rel",
129 "rev",
130 "rhs",
131 "right",
132 "rows",
133 "rowspan",
134 "rules",
135 "samp",
136 "scheme",
137 "scope",
138 "script",
139 "scrolling",
140 "select",
141 "selected",
142 "shape",
143 "size",
144 "small",
145 "span",
146 "src",
147 "standby",
148 "strike",
149 "strong",
150 "style",
151 "sub",
152 "summary",
153 "sup",
154 "tabindex",
155 "table",
156 "target",
157 "textarea",
158 "title",
159 "top",
160 "type",
161 "usemap",
162 "valign",
163 "value",
164 "valuetype",
165 "var",
166 "vlink",
167 "void",
168 "vsides",
169 "vspace",
170 "width"
171 };
/* number of entries in g_htmltags */
172 static const uint g_nhtmltags = sizeof(g_htmltags) / sizeof(cpchar);
173
/*
 * Header prefixes whose whole line is_ignoredheader() reports as ignorable.
 *
 * is_ignoredheader() binary-searches this table with strncasecmp(), so the
 * entries must stay sorted in case-insensitive order.  Entries are matched
 * as prefixes of the line; the last one has no trailing colon and therefore
 * covers every header name that starts with it.
 */
174 static cpchar g_ignoredheaders[] =
175 {
176 "Date:",
177 "Delivery-date:",
178 "Message-ID:",
179 "X-Sorted:",
180 "X-Spam-"
181 };
/* number of entries in g_ignoredheaders (rest of the expression is cut off in this view) */
182 static const uint g_nignoredheaders = sizeof(g_ignoredheaders) / sizeof(…
183
184 static inline bool_t
185 is_whitespace(int c)
186 {
187 return (c == ' ' || c == '\t' || c == '\r');
188 }
189
/*
 * True when c may appear in the interior of a word: an alphanumeric, a
 * dollar sign, an apostrophe, a period, or whatever further characters the
 * expression lists past the point where this view cuts the line off.
 * Callers cast to unsigned char before passing c, as isalnum() requires.
 */
190 static inline bool_t
191 is_wordmidchar(int c)
192 {
193 return (isalnum(c) || c == '$' || c == '\'' || c == '.' || c == …
194 }
195
196 static inline bool_t
197 is_wordendchar(int c)
198 {
199 return (isalnum(c) || c == '$');
200 }
201
/*
 * Test whether the text at p starts with one of the names in g_htmltags.
 *
 * p       - start of the candidate text
 * len     - number of bytes available at p
 * ptoklen - receives the length of the matched name, or 0 when nothing
 *           matched
 *
 * Returns true only when a table entry is a proper prefix of the input
 * (the input must be longer than the entry) and the characters following
 * the entry would not let is_word() produce a longer token.
 */
202 static inline bool_t
203 is_htmltag(cpchar p, uint len, uint * ptoklen)
204 {
205 int lo, hi, mid, minlen, cmp;
206
207 *ptoklen = 0;
208
/* binary search over the sorted table; the candidate ends up at index hi */
209 hi = g_nhtmltags - 1;
210 lo = -1;
211 while (hi - lo > 1) {
212 mid = (hi + lo) / 2;
/* compare only as many bytes as both the entry and the input have */
213 minlen = min(strlen(g_htmltags[mid]), len);
214 cmp = strncmp(g_htmltags[mid], p, minlen);
215 if (cmp > 0 || (cmp == 0 && minlen < len && !islower(p[m…
216 hi = mid;
217 else
218 lo = mid;
219 }
220 minlen = min(strlen(g_htmltags[hi]), len);
/* reject when the input is no longer than the entry or does not start with it */
221 if (len == minlen || strncmp(g_htmltags[hi], p, minlen) != 0) {
222 return false;
223 }
224 /* check if is_word() will have a longer match */
225 if (is_wordendchar((unsigned char)p[minlen])) {
226 return false;
227 }
/* NOTE(review): p[minlen + 1] can be index len, one past the span handed in - confirm the buffer always has that byte */
228 if (is_wordmidchar((unsigned char)p[minlen]) &&
229 is_wordendchar((unsigned char)p[minlen + 1])) {
230 return false;
231 }
232 *ptoklen = strlen(g_htmltags[hi]);
233
234 return true;
235 }
236
237 static inline bool_t
238 is_htmlcomment(cpchar p, uint len, uint * ptoklen)
239 {
240 *ptoklen = 0;
241
242 if (len >= 4 && memcmp(p, "<!--", 4) == 0) {
243 *ptoklen = 4;
244 return true;
245 }
246 if (len >= 3 && memcmp(p, "-->", 3) == 0) {
247 *ptoklen = 3;
248 return true;
249 }
250 return false;
251 }
252
253 static inline bool_t
254 is_base64char(int c)
255 {
256 return (isalnum(c) || (c == '/' || c == '+'));
257 }
258
/*
 * Decide whether the span p[0..len) looks like a line of base64 data.
 *
 * ptoklen receives the number of bytes examined before the function
 * returned.  An empty span (len == 0) returns true with *ptoklen == 0.
 *
 * The per-byte condition is cut off in this view; the visible part lets LF
 * and CR through and otherwise consults is_base64char().
 */
259 static inline bool_t
260 is_base64(cpchar p, uint len, uint * ptoklen)
261 {
262 *ptoklen = 0;
263 while (len > 0) {
264 if (*p != '\n' && *p != '\r' && !is_base64char((unsigned…
265 return false;
266 }
267 p++;
268 len--;
269 (*ptoklen)++;
270 }
271 return true;
272 }
273
274 static inline bool_t
275 is_mimeboundary(cpchar p, uint len, uint * ptoklen)
276 {
277 *ptoklen = 0;
278
279 if (len < 3 || p[0] != '-' || p[1] != '-') {
280 return false;
281 }
282 p += 2;
283 len -= 2;
284 *ptoklen += 2;
285 while (len > 0) {
286 if (is_whitespace(*p)) {
287 return false;
288 }
289 if (*p == '\n' || *p == '\r') {
290 break;
291 }
292 p++;
293 len--;
294 (*ptoklen)++;
295 }
296 return true;
297 }
298
299 static inline bool_t
300 is_ipaddr(cpchar p, uint len, uint * ptoklen)
301 {
302 uint noctets, ndigits;
303
304 *ptoklen = 0;
305
306 noctets = 0;
307 while (len > 0 && noctets < 4) {
308 ndigits = 0;
309 while (len > 0 && isdigit((unsigned char)*p)) {
310 ndigits++;
311 p++;
312 len--;
313 (*ptoklen)++;
314 }
315 if (ndigits == 0 || ndigits > 3) {
316 return false;
317 }
318 noctets++;
319 if (noctets < 4) {
320 if (*p != '.') {
321 return false;
322 }
323 p++;
324 len--;
325 (*ptoklen)++;
326 }
327 }
328 if (noctets < 4) {
329 return false;
330 }
331 return true;
332 }
333
/*
 * Recognize a word token at the start of p.
 *
 * A word starts with a letter or a dollar sign, continues with characters
 * accepted by is_wordmidchar(), and must be at least three bytes long once
 * trailing characters rejected by is_wordendchar() have been trimmed.
 *
 * p       - start of the candidate text
 * len     - number of bytes available at p
 * ptoklen - receives the word length on success; it is left untouched when
 *           one of the first two checks fails
 */
334 static inline bool_t
335 is_word(cpchar p, uint len, uint * ptoklen)
336 {
337 if (len < 3) {
338 return false;
339 }
340 if (!(isalpha((unsigned char)*p) || *p == '$')) {
341 return false;
342 }
343 *ptoklen = 1;
344 p++;
345 len--;
/* extend over every following word-interior character */
346 while (len > 0) {
347 if (!is_wordmidchar((unsigned char)*p)) {
348 break;
349 }
350 (*ptoklen)++;
351 p++;
352 len--;
353 }
/* back up over trailing characters that cannot end a word (loop condition is cut off in this view) */
354 while (*ptoklen >= 3 && !is_wordendchar((unsigned char)*(p - 1))…
355 (*ptoklen)--;
356 p--;
357 len++;
358 }
359 if (*ptoklen < 3) {
360 return false;
361 }
362 return true;
363 }
364
/*
 * Test whether the line at p starts with one of the prefixes listed in
 * g_ignoredheaders, compared without regard to case.
 *
 * p       - start of the line
 * len     - number of bytes in the line
 * ptoklen - set to len (the whole line) on a match; left untouched
 *           otherwise
 */
365 static inline bool_t
366 is_ignoredheader(cpchar p, uint len, uint * ptoklen)
367 {
368 int lo, hi, mid, minlen, cmp;
369
/* binary search over the sorted table; the candidate ends up at index hi */
370 hi = g_nignoredheaders - 1;
371 lo = -1;
372 while (hi - lo > 1) {
373 mid = (hi + lo) / 2;
374 minlen = min(strlen(g_ignoredheaders[mid]), len);
375 cmp = strncasecmp(g_ignoredheaders[mid], p, minlen);
376 if (cmp >= 0)
377 hi = mid;
378 else
379 lo = mid;
380 }
381 minlen = min(strlen(g_ignoredheaders[hi]), len);
/* the line must be longer than the entry and begin with it (rest of the condition is cut off in this view) */
382 if (len == minlen || strncasecmp(g_ignoredheaders[hi], p, minlen…
383 return false;
384 }
385 *ptoklen = len;
386 return true;
387 }
388
389 static inline bool_t
390 is_mailerid(cpchar p, uint len, uint * ptoklen)
391 {
392 if (len < 4 || strncmp(p, "\tid ", 4) != 0) {
393 return false;
394 }
395 *ptoklen = len;
396 return true;
397 }
398
399 static inline bool_t
400 is_spamtext(cpchar p, uint len, uint * ptoklen)
401 {
402 if (len < 5 || strncmp(p, "SPAM:", 5) != 0) {
403 return false;
404 }
405 *ptoklen = len;
406 return true;
407 }
408
409 static inline bool_t
410 is_smtpid(cpchar p, uint len, uint * ptoklen)
411 {
412 if (len < 8 || strncmp(p, "SMTP id ", 8) != 0) {
413 return false;
414 }
415 *ptoklen = len;
416 return true;
417 }
418
419 static inline bool_t
420 is_boundaryequal(cpchar p, uint len, uint * ptoklen)
421 {
422 if (len < 9 || strncmp(p, "boundary=", 9) != 0) {
423 return false;
424 }
425 *ptoklen = len;
426 return true;
427 }
428
429 static inline bool_t
430 is_nameequal(cpchar p, uint len, uint * ptoklen)
431 {
432 if (len < 6 || strncmp(p, "name=\"", 6) != 0) {
433 return false;
434 }
435 *ptoklen = 6;
436 return true;
437 }
438
439 static inline bool_t
440 is_filenameequal(cpchar p, uint len, uint * ptoklen)
441 {
442 if (len < 10 || strncmp(p, "filename=\"", 10) != 0) {
443 return false;
444 }
445 *ptoklen = 10;
446 return true;
447 }
448
449 static inline bool_t
450 is_from(cpchar p, uint len, uint * ptoklen)
451 {
452 if (len < 5 || strncmp(p, "From ", 5) != 0) {
453 return false;
454 }
455 *ptoklen = 5;
456 return true;
457 }
458
459 void
460 lex_create(lex_t * pthis, mbox_t mboxtype)
461 {
462 pthis->mboxtype = mboxtype;
463 pthis->section = envelope;
464 pthis->pos = 0;
465 pthis->bom = 0;
466 pthis->eom = 0;
467 pthis->lineend = 0;
468 pthis->buflen = 0;
469 pthis->pbuf = NULL;
470 }
471
472 void
473 lex_destroy(lex_t * pthis)
474 {
475 free(pthis->pbuf);
476 }
477
/*
 * Read the contents of fd into a heap buffer stored in pthis->pbuf.
 *
 * pthis - lexer that receives the buffer; pbuf is overwritten without
 *         being freed first, and data is appended at offset pthis->buflen,
 *         which this function does not reset
 * fd    - descriptor to read from
 *
 * Returns false when the initial allocation fails, when growing the buffer
 * fails, or when the last read() result is negative; in the latter two
 * cases the buffer is freed and pbuf is set to NULL.  Returns true
 * otherwise.
 *
 * When the mailbox type is detect, the start of the buffer is compared
 * with the text From followed by a space to choose between mbox and
 * maildir.
 */
478 bool_t
479 lex_load(lex_t * pthis, int fd)
480 {
481 uint nalloc;
482 ssize_t nread;
483
484 nalloc = IOBUFSIZE;
485 if ((pthis->pbuf = malloc(IOBUFSIZE)) == NULL)
486 return false;
487
/* read into the free tail of the buffer (loop condition is cut off in this view) */
488 while ((nread = read(fd, pthis->pbuf + pthis->buflen, nalloc - p…
489 pthis->buflen += nread;
/* buffer full: grow by another IOBUFSIZE before the next read */
490 if (pthis->buflen == nalloc) {
491 char *pnewbuf;
492
493 nalloc += IOBUFSIZE;
494 pnewbuf = (char *) realloc(pthis->pbuf, nalloc);
495 if (pnewbuf == NULL) {
496 free(pthis->pbuf);
497 pthis->pbuf = NULL;
498 return false;
499 }
500 pthis->pbuf = pnewbuf;
501 }
502 }
/* a negative result from the final read() is treated as failure */
503 if (nread < 0) {
504 free(pthis->pbuf);
505 pthis->pbuf = NULL;
506 return false;
507 }
508 if (pthis->mboxtype == detect) {
509 if (pthis->buflen > 5 && memcmp(pthis->pbuf, "From ", 5)…
510 verbose(1, "Input looks like an mbox\n");
511 pthis->mboxtype = mbox;
512 } else {
513 verbose(1, "Input looks like a maildir\n");
514 pthis->mboxtype = maildir;
515 }
516 }
517 return true;
518 }
519
520 static bool_t
521 lex_nextline(lex_t * pthis)
522 {
523 cpchar pbuf;
524 uint len;
525 uint toklen;
526
527 again:
528 /* XXX: use and update pthis->section */
529 pthis->pos = pthis->lineend;
530 if (pthis->lineend == pthis->buflen) {
531 return false;
532 }
533 pbuf = pthis->pbuf + pthis->pos;
534 len = 0;
535 while (pthis->pos + len < pthis->buflen && pbuf[len] != '\n') {
536 len++;
537 }
538 if (pthis->pos + len < pthis->buflen) {
539 len++; /* bump past the LF */
540 }
541 pthis->lineend = pthis->pos + len;
542
543 /* check beginning-of-line patterns */
544 if (is_base64(pbuf, len, &toklen) ||
545 is_ignoredheader(pbuf, len, &toklen) ||
546 is_mailerid(pbuf, len, &toklen) ||
547 is_mimeboundary(pbuf, len, &toklen) ||
548 is_spamtext(pbuf, len, &toklen)) {
549 /* ignore line */
550 pthis->pos += toklen;
551 goto again;
552 }
553 return true;
554 }
555
/*
 * Produce the next token from the loaded buffer.
 *
 * pthis - lexer state; pos, lineend, bom and eom are updated as input is
 *         consumed
 * ptok  - receives the token.  tt is eof at the end of input, from for a
 *         From line when the mailbox type is mbox, and word for IP
 *         addresses and words.  For from and word tokens, p points into
 *         pthis->pbuf (no copy is made) and len is the token length; for
 *         eof only tt is written.
 *
 * Whitespace, recognized HTML tag names and comment delimiters, a few
 * header fragments, and any byte that starts no known pattern are skipped.
 */
556 void
557 lex_nexttoken(lex_t * pthis, tok_t * ptok)
558 {
559 cpchar pbuf;
560 uint len;
561 uint toklen;
562
/* the previous message was fully consumed, so a new message begins here */
563 if (pthis->pos == pthis->eom) {
564 pthis->bom = pthis->pos;
565 }
566 again:
567 /* skip whitespace between tokens */
568 while (pthis->pos != pthis->lineend && is_whitespace(pthis->pbuf…
569 pthis->pos++;
570 }
571
572 pbuf = pthis->pbuf + pthis->pos;
573 len = pthis->lineend - pthis->pos;
574
575 /* possibilities: end-of-line, html-comment, ipaddr, word, junk …
576
577 if (pthis->pos == pthis->lineend) {
578 if (!lex_nextline(pthis)) {
579 pthis->eom = pthis->pos;
580 ptok->tt = eof;
581 return;
582 }
583 pbuf = pthis->pbuf + pthis->pos;
584 len = pthis->lineend - pthis->pos;
585
586 if (pthis->mboxtype == mbox) {
587 if (is_from(pbuf, len, &toklen)) {
588 pthis->eom = pthis->pos;
589 ptok->tt = from;
590 ptok->p = pthis->pbuf + pthis->pos;
591 ptok->len = toklen;
592 pthis->pos += toklen;
593 return;
594 }
595 }
596 goto again; /* skip lws */
597 }
/* patterns that are consumed without producing a token */
598 if (is_htmltag(pbuf, len, &toklen) ||
599 is_htmlcomment(pbuf, len, &toklen) ||
600 is_smtpid(pbuf, len, &toklen) ||
601 is_boundaryequal(pbuf, len, &toklen) ||
602 is_nameequal(pbuf, len, &toklen) ||
603 is_filenameequal(pbuf, len, &toklen)) {
604 /* ignore it */
605 pthis->pos += toklen;
606 goto again;
607 }
/* dotted-quad addresses are reported with the same token type as words */
608 if (is_ipaddr(pbuf, len, &toklen)) {
609 ptok->tt = word;
610 ptok->p = pthis->pbuf + pthis->pos;
611 ptok->len = toklen;
612 pthis->pos += toklen;
613 return;
614 }
615 if (is_word(pbuf, len, &toklen)) {
616 ptok->tt = word;
617 ptok->p = pthis->pbuf + pthis->pos;
618 ptok->len = toklen;
619 pthis->pos += toklen;
/* over-long words are consumed but not returned; ptok is overwritten by the next match */
620 if (toklen > MAXWORDLEN) {
621 goto again;
622 }
623 return;
624 }
625 /* junk */
626 pthis->pos++;
627 goto again;
628 }
629
630 /*
 * SpamAssassin style passthru: copy the current message to standard
 * output with a fresh X-Spam-Status header added.
 *
 * pthis   - lexer state; pos is rewound to bom, and bom is set to eom
 *           before returning
 * is_spam - selects the Yes or the No form of the status header
 * hits    - score printed in the header next to SPAM_CUTOFF
 *
 * Existing header lines are written through up to the first empty line,
 * except those that begin with X-Spam- in any letter case.  The new header
 * text is written next, followed by whatever remains of the message.
 *
 * NOTE(review): the header is formatted with sprintf() into a 256 byte
 * stack buffer; %f applied to a very large double can produce several
 * hundred characters, so a bounded snprintf() looks safer - confirm the
 * range of hits.
 * NOTE(review): the results of the write() calls are not checked, so short
 * or failed writes go unnoticed.
 */
631 void
632 lex_passthru(lex_t * pthis, bool_t is_spam, double hits)
633 {
634 char szbuf[256];
635 bool_t in_headers = true;
636
/* rewind to the beginning of the message being passed through */
637 pthis->pos = pthis->bom;
638 if (is_spam) {
639 sprintf(szbuf, "X-Spam-Status: Yes, hits=%f required=%f,…
640 "X-Spam-Flag: YES\n",
641 hits, SPAM_CUTOFF);
642 } else {
643 sprintf(szbuf, "X-Spam-Status: No, hits=%f required=%f\n…
644 hits, SPAM_CUTOFF);
645 }
646
647 /* existing headers */
648 while (in_headers && pthis->pos < pthis->eom) {
649 cpchar pbuf = pthis->pbuf + pthis->pos;
650 uint len = 0;
651
652 while (pthis->pos + len < pthis->buflen && pbuf[len] != …
653 len++;
654 }
655 if (pthis->pos + len < pthis->buflen) {
656 len++; /* bump past the LF */
657 }
658 /* check for end of headers */
659 if (pbuf[0] == '\n' || (pbuf[0] == '\r' && pbuf[1] == '\…
660 /* end of headers */
661 break;
662 }
663 /* write header, ignoring existing spam headers */
664 if (strncasecmp(pbuf, "X-Spam-", 7) != 0) {
665 write(STDOUT_FILENO, pbuf, len);
666 }
667 pthis->pos += len;
668 }
669
670 /* new headers */
671 write(STDOUT_FILENO, szbuf, strlen(szbuf));
672
673 /* remainder */
674 if (pthis->pos < pthis->eom) {
675 write(STDOUT_FILENO, pthis->pbuf + pthis->pos, pthis->eo…
676 }
677 pthis->bom = pthis->eom;
678 }
You are viewing proxied material from codemadness.org. The copyright of proxied material belongs to its original authors. Any comments or complaints in relation to proxied material should be directed to the original authors of the content concerned. Please see the disclaimer for more details.