add a bulk test mode option (-b) - bmf - bmf (Bayesian Mail Filter) 0.9.4 fork … | |
git clone git://git.codemadness.org/bmf | |
Log | |
Files | |
Refs | |
README | |
LICENSE | |
--- | |
commit 8a316864887a48a5fd2867b6bde5d5e3b215e288 | |
parent da5b33ffd35e25649614ac678df293afcffb3f35 | |
Author: Hiltjo Posthuma <[email protected]> | |
Date: Tue, 4 Feb 2020 22:59:04 +0100 | |
add a bulk test mode option (-b) | |
Much more efficient for my use-case: bulk testing a directory full of maildir | |
files quickly. | |
find "$d/new" -type f | bmf -b | awk -F '\t' '$2 > 0.9 { print $1; }' | while r… | |
...move ugly spam here... | |
done | |
Diffstat: | |
M bmf.1 | 5 ++++- | |
M bmf.c | 78 +++++++++++++++++++++++++++++… | |
M dbh.c | 3 +++ | |
3 files changed, 83 insertions(+), 3 deletions(-) | |
--- | |
diff --git a/bmf.1 b/bmf.1 | |
@@ -23,7 +23,7 @@ bmf \- efficient Bayesian mail filter | |
.SH "SYNOPSIS" | |
.nf | |
-\fBbmf\fR [-t] [-n] [-s] [-N] [-S] [-d db] [-k n] [-m type] [-p] | |
+\fBbmf\fR [-b] [-t] [-n] [-s] [-N] [-S] [-d db] [-k n] [-m type] [-p] | |
[-v] [-V] [-h] | |
.fi | |
@@ -41,6 +41,9 @@ bmf supports both mbox and maildir mail storage formats. It w… | |
Without command-line options, bmf processes the input, registers it as either … | |
.PP | |
+\fB-b\fR Bulk test mode, read file list from stdin, output file, TAB, spamicit… | |
+ | |
+.PP | |
\fB-t\fR Test to see if the input is spam. The word lists are not updated. A r… | |
.PP | |
diff --git a/bmf.c b/bmf.c | |
@@ -27,8 +27,10 @@ typedef enum { | |
mode_reg_n, /* register as non-spam */ | |
mode_n_to_s, /* undo non-spam registration and register… | |
* spam */ | |
- mode_s_to_n /* undo spam registration and register as | |
+ mode_s_to_n, /* undo spam registration and register as | |
* non-spam */ | |
+ /* test and produce a report in bulk, read file list from stdin, output … | |
+ mode_bulk | |
} runmode_t; | |
static void | |
@@ -39,6 +41,7 @@ usage(void) | |
"\n" | |
"Modes of operation (mutually exclusive; the last one specified… | |
"\t\tRegister message using historical data if no mode is speci… | |
+ "\t-b\tBulk test mode, read file list from stdin, output file, … | |
"\t-n\tRegister message as non-spam.\n" | |
"\t-s\tRegister message as spam.\n" | |
"\t-N\tRegister message as non-spam and undo prior registration… | |
@@ -88,6 +91,9 @@ main(int argc, char **argv) | |
tok_t tok; | |
bool_t is_spam = false; | |
int ch; | |
+ char *line = NULL; | |
+ size_t linesiz = 0; | |
+ ssize_t n; | |
int fd = STDIN_FILENO; | |
@@ -97,8 +103,11 @@ main(int argc, char **argv) | |
srand(time(NULL)); | |
stats.keepers = DEF_KEEPERS; | |
- while ((ch = getopt(argc, argv, "NSVd:hk:m:npstv")) != EOF) { | |
+ while ((ch = getopt(argc, argv, "NSVd:hk:m:bnpstv")) != EOF) { | |
switch (ch) { | |
+ case 'b': | |
+ mode = mode_bulk; | |
+ break; | |
case 'N': | |
mode = mode_s_to_n; | |
break; | |
@@ -162,6 +171,70 @@ main(int argc, char **argv) | |
fprintf(stderr, "%s: cannot open database\n", argv[0]); | |
exit(2); | |
} | |
+ | |
+ /* bulk mode */ | |
+ if (mode == mode_bulk) { | |
+ pblist = pdb->opentable(pdb, "spamlist", rdonly); | |
+ if (pblist == NULL) { | |
+ fprintf(stderr, "%s: cannot open spamlist\n", argv[0]); | |
+ exit(2); | |
+ } | |
+ pglist = pdb->opentable(pdb, "goodlist", rdonly); | |
+ if (pglist == NULL) { | |
+ fprintf(stderr, "%s: cannot open goodlist\n", argv[0]); | |
+ exit(2); | |
+ } | |
+ | |
+ while ((n = getline(&line, &linesiz, stdin)) > 0) { | |
+ if (line[n - 1] == '\n') | |
+ line[--n] = '\0'; | |
+ | |
+ if ((fd = open(line, O_RDONLY)) == -1) | |
+ err(1, "open: %s", line); | |
+ | |
+ memset(stats.extrema, 0, stats.keepers * sizeof(discri… | |
+ | |
+ lex_create(&lex, mboxtype); | |
+ if (!lex_load(&lex, fd)) { | |
+ fprintf(stderr, "%s: cannot read input\n", arg… | |
+ exit(2); | |
+ } | |
+ lex_nexttoken(&lex, &tok); | |
+ if (tok.tt == eof) { | |
+ fprintf(stderr, "%s: no input available\n", ar… | |
+ exit(2); | |
+ } | |
+ | |
+ while (tok.tt != eof) { | |
+ /* TODO: vec_create at top, vec->nitems = 0, b… | |
+ vec_create(&mlist); | |
+ | |
+ bvec_loadmsg(&mlist, &lex, &tok); | |
+ bayesfilt(pglist, pblist, &mlist, &stats); | |
+ | |
+ vec_destroy(&mlist); | |
+ | |
+ printf("%s\t%f\n", line, stats.spamicity); | |
+ } | |
+ | |
+ lex_destroy(&lex); | |
+ | |
+ close(fd); | |
+ } | |
+ | |
+ pglist->close(pglist); | |
+ free(pglist); | |
+ pblist->close(pblist); | |
+ free(pblist); | |
+ | |
+ pdb->close(pdb); | |
+ free(pdb); | |
+ | |
+ free(stats.extrema); | |
+ | |
+ return 0; | |
+ } | |
+ | |
lex_create(&lex, mboxtype); | |
if (!lex_load(&lex, fd)) { | |
fprintf(stderr, "%s: cannot read input\n", argv[0]); | |
@@ -172,6 +245,7 @@ main(int argc, char **argv) | |
fprintf(stderr, "%s: no input available\n", argv[0]); | |
exit(2); | |
} | |
+ | |
if (mode == mode_test) { | |
pblist = pdb->opentable(pdb, "spamlist", rdonly); | |
if (pblist == NULL) { | |
diff --git a/dbh.c b/dbh.c | |
@@ -95,6 +95,8 @@ dbtext_db_open(cpchar dbname, bool_t rdonly) | |
goto bail; | |
} | |
+/* TODO: handle unveil for bulk mode */ | |
+#if 0 | |
/* unveil(2), TODO: rework later */ | |
char listpath[PATH_MAX]; | |
snprintf(listpath, sizeof(listpath), "%s/%s", pthis->dir, "goodlist.tx… | |
@@ -111,6 +113,7 @@ dbtext_db_open(cpchar dbname, bool_t rdonly) | |
perror("unveil()"); | |
exit(2); | |
} | |
+#endif | |
return pthis; | |