bmf.c - bmf - bmf (Bayesian Mail Filter) 0.9.4 fork + patches | |
git clone git://git.codemadness.org/bmf | |
Log | |
Files | |
Refs | |
README | |
LICENSE | |
--- | |
bmf.c (9060B) | |
--- | |
1 /* $Id: bmf.c,v 1.20 2002/10/20 18:19:17 tommy Exp $ */ | |
2 | |
3 /* | |
4 * Copyright (c) 2002 Tom Marshall <[email protected]> | |
5 * | |
6 * This program is free software. It may be distributed under the terms | |
7 * in the file LICENSE, found in the top level of the distribution. | |
8 * | |
9 * bmf.c: top level Bayesian mail filter app. | |
10 */ | |
11 | |
12 #include "config.h" | |
13 #include "dbg.h" | |
14 #include "str.h" | |
15 #include "lex.h" | |
16 #include "vec.h" | |
17 #include "dbh.h" | |
18 #include "filt.h" | |
19 | |
20 #define PACKAGE "bmf" | |
21 | |
22 /* modes of operation (mutually exclusive) */ | |
/*
 * runmode_t selects what main() does with its input.  main() starts in
 * mode_normal and its option switch assigns the other values:
 *   -t mode_test    -b mode_bulk     -n mode_reg_n
 *   -s mode_reg_s   -N mode_s_to_n   -S mode_n_to_s
 */
23 typedef enum { | |
24 mode_test, /* test and produce report */ | |
25 mode_normal, /* test and register result */ | |
26 mode_reg_s, /* register as spam */ | |
27 mode_reg_n, /* register as non-spam */ | |
28 mode_n_to_s, /* undo non-spam registration and re… | |
29 * spam */ | |
30 mode_s_to_n, /* undo spam registration and regist… | |
31 * non-spam */ | |
32 /* test and product report in bulk, read file list from stdin, o… | |
33 mode_bulk | |
34 } runmode_t; | |
35 | |
/*
 * usage: print the command-line help text to stdout, then terminate the
 * process with exit status 2.  This function never returns to its caller.
 *
 * main() calls it for -h, for an unrecognised option, for an -m argument
 * other than mbox or maildir, and from the default arm of its mode switch.
 */
36 static void | |
37 usage(void) | |
38 { | |
39 printf("\n" | |
40 "Usage: " PACKAGE " [mode] [options]\n" | |
41 "\n" | |
42 "Modes of operation (mutually exclusive; the last one spe… | |
43 "\t\tRegister message using historical data if no mode is… | |
44 "\t-b\tBulk test mode, read file list from stdin, output … | |
45 "\t-n\tRegister message as non-spam.\n" | |
46 "\t-s\tRegister message as spam.\n" | |
47 "\t-N\tRegister message as non-spam and undo prior regist… | |
48 "\t-S\tRegister message as spam and undo prior registrati… | |
49 "\t-t\tTest mode, print report and do not save results.\n" | |
50 "\n" | |
51 "Other options:\n" | |
52 "\t-d db\tSpecify database or directory name.\n" | |
53 "\t-k n\tSpecify count of extrema to use (keepers), defau… | |
54 "\t-m type\t[DEPRECATED] Specify mail storage format (mbo… | |
55 "\t-p\tPassthrough mode, like SpamAssassin.\n" | |
56 "\t-v\tIncrease verbosity level.\n" | |
57 "\t-V\tShow version information and exit.\n" | |
58 "\t-h\tShow this message and exit.\n" | |
59 "\n"); | |
60 exit(2); | |
61 } | |
62 | |
/*
 * version: print the package name, version string, copyright line and
 * licence notice to stdout, then terminate the process with exit status 2.
 * This function never returns to its caller; main() calls it for -V.
 *
 * NOTE(review): VERSION is not defined in this file; presumably it comes
 * from config.h - confirm.
 */
63 static void | |
64 version(void) | |
65 { | |
66 printf("\n" | |
67 PACKAGE " version " VERSION " - a Bayesian mail filter\n" | |
68 "Copyright (c) 2002 Tom Marshall\n" | |
69 "\n" | |
70 PACKAGE " comes with ABSOLUTELY NO WARRANTY.\n" | |
71 "This is free software. You are welcome to redistribute … | |
72 "of the GNU General Public License. See the file LICENSE… | |
73 "distribution, or visit http://www.gnu.org/licenses/gpl.h… | |
74 "\n"); | |
75 exit(2); | |
76 } | |
77 | |
/*
 * main: parse the command line, open the token database and process mail.
 *
 * Flow, as visible in this function:
 *  1. pledge(2) an initial promise set, seed rand() from the clock and
 *     parse options with getopt().  Mode flags overwrite each other, so
 *     the last one given wins; with no mode flag the mode is mode_normal.
 *  2. Allocate stats.extrema with stats.keepers entries and open the
 *     database through dbtext_db_open(); rdonly is set only for mode_test.
 *  3. mode_bulk: read file names from stdin one per line, lex each file,
 *     run bayesfilt() on every message in it and print the file name and
 *     stats.spamicity separated by a tab.  Returns 0 when stdin is drained.
 *  4. All other modes: lex the message stream on standard input and handle
 *     each message according to the mode.  mode_test opens the spamlist and
 *     goodlist tables once and then reduces the pledge to stdio; every
 *     other mode reopens both tables for each message and closes them again
 *     at the end of that iteration.
 *
 * Return value outside bulk mode: 0 when passthrough (-p) is enabled or the
 * last message handled was classified or registered as spam, otherwise 1.
 * Failures call exit(2), or err(1, ...) for pledge() and open() errors.
 *
 * NOTE(review): the buffer that getline() allocates in bulk mode (line) is
 * never passed to free() in this function.
 */
78 int | |
79 main(int argc, char **argv) | |
80 { | |
81 char *dbname = NULL; | |
82 bool_t rdonly; | |
83 runmode_t mode = mode_normal; | |
84 mbox_t mboxtype = detect; | |
85 bool_t do_passthru = false; | |
86 dbhtext_t *pdb; | |
87 dbt_t *pblist, *pglist, *ptable; | |
88 vec_t mlist; | |
89 stats_t stats; | |
90 lex_t lex; | |
91 tok_t tok; | |
92 bool_t is_spam = false; | |
93 int ch; | |
94 char *line = NULL; | |
95 size_t linesiz = 0; | |
96 ssize_t n; | |
97 | |
98 int fd = STDIN_FILENO; | |
99 | |
/* Initial pledge(2) promise set; -t narrows it during option parsing. */
100 if (pledge("stdio rpath wpath cpath flock unveil", NULL) == -1) | |
101 err(1, "pledge"); | |
102 | |
103 srand(time(NULL)); | |
104 | |
/* Default extrema count; -k replaces it. */
105 stats.keepers = DEF_KEEPERS; | |
/* Every mode flag just assigns mode, so the last one given wins. */
106 while ((ch = getopt(argc, argv, "NSVd:hk:m:bnpstv")) != EOF) { | |
107 switch (ch) { | |
108 case 'b': | |
109 mode = mode_bulk; | |
110 break; | |
111 case 'N': | |
112 mode = mode_s_to_n; | |
113 break; | |
114 case 'S': | |
115 mode = mode_n_to_s; | |
116 break; | |
117 case 'V': | |
118 version(); | |
119 break; /* NOTREACHED */ | |
120 case 'd': | |
/* Release the name from any earlier -d (dbname starts out NULL). */
121 free(dbname); | |
122 if (!(dbname = strdup(optarg))) { | |
123 perror("strdup()"); | |
124 exit(2); | |
125 } | |
126 break; | |
127 case 'h': | |
128 usage(); | |
129 break; /* NOTREACHED */ | |
130 case 'k': | |
/* NOTE(review): atoi() reports no errors and the value is not range-checked here. */
131 stats.keepers = atoi(optarg); | |
132 break; | |
133 case 'm': | |
/* Anything other than mbox or maildir ends in usage(), which exits. */
134 if (strcasecmp(optarg, "mbox") == 0) { | |
135 mboxtype = mbox; | |
136 } else if (strcasecmp(optarg, "maildir") == 0) { | |
137 mboxtype = maildir; | |
138 } else { | |
139 usage(); | |
140 } | |
141 break; | |
142 case 'n': | |
143 mode = mode_reg_n; | |
144 break; | |
145 case 'p': | |
146 do_passthru = true; | |
147 break; | |
148 case 's': | |
149 mode = mode_reg_s; | |
150 break; | |
151 case 't': | |
152 mode = mode_test; | |
/* Test mode re-pledges with the initial promises minus wpath. */
153 if (pledge("stdio rpath cpath flock unveil", NUL… | |
154 err(1, "pledge"); | |
155 break; | |
156 case 'v': | |
157 g_verbose++; | |
158 verbose(1, "Verbose level now %u\n", g_verbose); | |
159 break; | |
160 default: | |
161 usage(); | |
162 } | |
163 } | |
/* NOTE(review): no NULL check on this malloc() result follows in this function. */
164 stats.extrema = (discrim_t *) malloc(stats.keepers * sizeof(disc… | |
165 | |
/* Only mode_test yields a non-zero rdonly; mode_bulk also runs with rdonly == 0. */
166 rdonly = (mode == mode_test) ? 1 : 0; | |
167 | |
168 /* create directory if it doesn't exist yet, when dbname is NULL… | |
169 pdb = dbtext_db_open(dbname, rdonly); | |
170 if (pdb == NULL) { | |
171 fprintf(stderr, "%s: cannot open database\n", argv[0]); | |
172 exit(2); | |
173 } | |
174 | |
175 /* bulk mode */ | |
176 if (mode == mode_bulk) { | |
/* Both tables are opened once here and closed after the file-name loop. */
177 pblist = pdb->opentable(pdb, "spamlist", rdonly); | |
178 if (pblist == NULL) { | |
179 fprintf(stderr, "%s: cannot open spamlist\n", ar… | |
180 exit(2); | |
181 } | |
182 pglist = pdb->opentable(pdb, "goodlist", rdonly); | |
183 if (pglist == NULL) { | |
184 fprintf(stderr, "%s: cannot open goodlist\n", ar… | |
185 exit(2); | |
186 } | |
187 | |
/* One file name per input line; a trailing newline is stripped before open(). */
188 while ((n = getline(&line, &linesiz, stdin)) > 0) { | |
189 if (line[n - 1] == '\n') | |
190 line[--n] = '\0'; | |
191 | |
/* err() ends the whole run on the first file that cannot be opened. */
192 if ((fd = open(line, O_RDONLY)) == -1) | |
193 err(1, "open: %s", line); | |
194 | |
/* Zero the extrema array before each file is scored. */
195 memset(stats.extrema, 0, stats.keepers * sizeof(… | |
196 | |
197 lex_create(&lex, mboxtype); | |
198 if (!lex_load(&lex, fd)) { | |
199 fprintf(stderr, "%s: cannot read input\n… | |
200 exit(2); | |
201 } | |
202 lex_nexttoken(&lex, &tok); | |
203 if (tok.tt == eof) { | |
204 fprintf(stderr, "%s: no input available\… | |
205 exit(2); | |
206 } | |
207 | |
208 while (tok.tt != eof) { | |
209 /* TODO: vec_create at top, vec->nitems … | |
210 vec_create(&mlist); | |
211 | |
212 bvec_loadmsg(&mlist, &lex, &tok); | |
213 bayesfilt(pglist, pblist, &mlist, &stats… | |
214 | |
215 vec_destroy(&mlist); | |
216 | |
217 printf("%s\t%f\n", line, stats.spamicity… | |
218 } | |
219 | |
220 lex_destroy(&lex); | |
221 | |
222 close(fd); | |
223 } | |
224 | |
225 pglist->close(pglist); | |
226 free(pglist); | |
227 pblist->close(pblist); | |
228 free(pblist); | |
229 | |
230 pdb->close(pdb); | |
231 free(pdb); | |
232 | |
233 free(stats.extrema); | |
234 | |
235 return 0; | |
236 } | |
237 | |
238 lex_create(&lex, mboxtype); | |
239 if (!lex_load(&lex, fd)) { | |
240 fprintf(stderr, "%s: cannot read input\n", argv[0]); | |
241 exit(2); | |
242 } | |
243 lex_nexttoken(&lex, &tok); | |
244 if (tok.tt == eof) { | |
245 fprintf(stderr, "%s: no input available\n", argv[0]); | |
246 exit(2); | |
247 } | |
248 | |
249 if (mode == mode_test) { | |
250 pblist = pdb->opentable(pdb, "spamlist", rdonly); | |
251 if (pblist == NULL) { | |
252 fprintf(stderr, "%s: cannot open spamlist\n", ar… | |
253 exit(2); | |
254 } | |
255 pglist = pdb->opentable(pdb, "goodlist", rdonly); | |
256 if (pglist == NULL) { | |
257 fprintf(stderr, "%s: cannot open goodlist\n", ar… | |
258 exit(2); | |
259 } | |
260 if (pledge("stdio", NULL) == -1) | |
261 err(1, "pledge"); | |
262 } | |
263 while (tok.tt != eof) { | |
264 if (mboxtype == mbox && tok.tt != from) { | |
265 fprintf(stderr, "%s: input does not look like an… | |
266 exit(2); | |
267 } | |
268 if (mode != mode_test) { | |
269 pblist = pdb->opentable(pdb, "spamlist", rdonly); | |
270 if (pblist == NULL) { | |
271 fprintf(stderr, "%s: cannot open spamlis… | |
272 exit(2); | |
273 } | |
274 pglist = pdb->opentable(pdb, "goodlist", rdonly); | |
275 if (pglist == NULL) { | |
276 fprintf(stderr, "%s: cannot open goodlis… | |
277 exit(2); | |
278 } | |
279 } | |
280 vec_create(&mlist); | |
281 bvec_loadmsg(&mlist, &lex, &tok); | |
282 | |
283 switch (mode) { | |
284 case mode_test: | |
285 bayesfilt(pglist, pblist, &mlist, &stats); | |
286 is_spam = (stats.spamicity > SPAM_CUTOFF); | |
287 break; | |
288 case mode_normal: | |
289 bayesfilt(pglist, pblist, &mlist, &stats); | |
290 is_spam = (stats.spamicity > SPAM_CUTOFF); | |
291 ptable = (is_spam ? pblist : pglist); | |
292 svec_sort(&mlist); | |
293 if (!ptable->mergeclose(ptable, &mlist)) { | |
294 fprintf(stderr, "%s: cannot merge/save l… | |
295 exit(2); | |
296 } | |
297 break; | |
298 case mode_reg_s: | |
299 stats.spamicity = 1.0; | |
300 is_spam = true; | |
301 svec_sort(&mlist); | |
302 if (!pblist->mergeclose(pblist, &mlist)) { | |
303 fprintf(stderr, "%s: cannot merge/save l… | |
304 exit(2); | |
305 } | |
306 break; | |
307 case mode_reg_n: | |
308 stats.spamicity = 0.0; | |
309 is_spam = false; | |
310 svec_sort(&mlist); | |
311 if (!pglist->mergeclose(pglist, &mlist)) { | |
312 fprintf(stderr, "%s: cannot merge/save l… | |
313 exit(2); | |
314 } | |
315 break; | |
316 case mode_n_to_s: | |
317 stats.spamicity = 1.0; | |
318 is_spam = true; | |
319 svec_sort(&mlist); | |
320 if (!pblist->mergeclose(pblist, &mlist) || | |
321 !pglist->unmergeclose(pglist, &mlist)) { | |
322 fprintf(stderr, "%s: cannot merge/save l… | |
323 exit(2); | |
324 } | |
325 break; | |
326 case mode_s_to_n: | |
327 stats.spamicity = 0.0; | |
328 is_spam = false; | |
329 svec_sort(&mlist); | |
330 if (!pblist->unmergeclose(pblist, &mlist) || | |
331 !pglist->mergeclose(pglist, &mlist)) { | |
332 fprintf(stderr, "%s: cannot merge/save l… | |
333 exit(2); | |
334 } | |
335 break; | |
336 default: | |
337 usage(); | |
338 } | |
339 | |
340 if (mode == mode_test) { | |
341 statdump(&stats, stdout); | |
342 } | |
343 if (do_passthru) { | |
344 lex_passthru(&lex, is_spam, stats.spamicity); | |
345 } | |
346 vec_destroy(&mlist); | |
347 | |
348 if (mode != mode_test) { | |
349 pglist->close(pglist); | |
350 free(pglist); | |
351 pblist->close(pblist); | |
352 free(pblist); | |
353 } | |
354 } | |
355 | |
356 if (mode == mode_test) { | |
357 pglist->close(pglist); | |
358 free(pglist); | |
359 pblist->close(pblist); | |
360 free(pblist); | |
361 } | |
362 lex_destroy(&lex); | |
363 | |
364 pdb->close(pdb); | |
365 free(pdb); | |
366 | |
367 free(stats.extrema); | |
368 | |
369 return ((do_passthru || is_spam) ? 0 : 1); | |
370 } | |