Introduction
Introduction Statistics Contact Development Disclaimer Help
add a bulk test mode option (-b) - bmf - bmf (Bayesian Mail Filter) 0.9.4 fork …
git clone git://git.codemadness.org/bmf
Log
Files
Refs
README
LICENSE
---
commit 8a316864887a48a5fd2867b6bde5d5e3b215e288
parent da5b33ffd35e25649614ac678df293afcffb3f35
Author: Hiltjo Posthuma <[email protected]>
Date: Tue, 4 Feb 2020 22:59:04 +0100
add a bulk test mode option (-b)
Much more efficient for my use-case: quickly bulk testing a directory full of
maildir files.
find "$d/new" -type f | bmf -b | awk -F '\t' '$2 > 0.9 { print $1; }' | while r…
...move ugly spam here...
done
Diffstat:
M bmf.1 | 5 ++++-
M bmf.c | 78 +++++++++++++++++++++++++++++…
M dbh.c | 3 +++
3 files changed, 83 insertions(+), 3 deletions(-)
---
diff --git a/bmf.1 b/bmf.1
@@ -23,7 +23,7 @@ bmf \- efficient Bayesian mail filter
.SH "SYNOPSIS"
.nf
-\fBbmf\fR [-t] [-n] [-s] [-N] [-S] [-d db] [-k n] [-m type] [-p]
+\fBbmf\fR [-b] [-t] [-n] [-s] [-N] [-S] [-d db] [-k n] [-m type] [-p]
[-v] [-V] [-h]
.fi
@@ -41,6 +41,9 @@ bmf supports both mbox and maildir mail storage formats. It w…
Without command-line options, bmf processes the input, registers it as either …
.PP
+\fB-b\fR Bulk test mode, read file list from stdin, output file, TAB, spamicit…
+
+.PP
\fB-t\fR Test to see if the input is spam. The word lists are not updated. A r…
.PP
diff --git a/bmf.c b/bmf.c
@@ -27,8 +27,10 @@ typedef enum {
mode_reg_n, /* register as non-spam */
mode_n_to_s, /* undo non-spam registration and register…
* spam */
- mode_s_to_n /* undo spam registration and register as
+ mode_s_to_n, /* undo spam registration and register as
* non-spam */
+ /* test and produce report in bulk, read file list from stdin, output …
+ mode_bulk
} runmode_t;
static void
@@ -39,6 +41,7 @@ usage(void)
"\n"
"Modes of operation (mutually exclusive; the last one specified…
"\t\tRegister message using historical data if no mode is speci…
+ "\t-b\tBulk test mode, read file list from stdin, output file, …
"\t-n\tRegister message as non-spam.\n"
"\t-s\tRegister message as spam.\n"
"\t-N\tRegister message as non-spam and undo prior registration…
@@ -88,6 +91,9 @@ main(int argc, char **argv)
tok_t tok;
bool_t is_spam = false;
int ch;
+ char *line = NULL;
+ size_t linesiz = 0;
+ ssize_t n;
int fd = STDIN_FILENO;
@@ -97,8 +103,11 @@ main(int argc, char **argv)
srand(time(NULL));
stats.keepers = DEF_KEEPERS;
- while ((ch = getopt(argc, argv, "NSVd:hk:m:npstv")) != EOF) {
+ while ((ch = getopt(argc, argv, "NSVd:hk:m:bnpstv")) != EOF) {
switch (ch) {
+ case 'b':
+ mode = mode_bulk;
+ break;
case 'N':
mode = mode_s_to_n;
break;
@@ -162,6 +171,70 @@ main(int argc, char **argv)
fprintf(stderr, "%s: cannot open database\n", argv[0]);
exit(2);
}
+
+ /* bulk mode */
+ if (mode == mode_bulk) {
+ pblist = pdb->opentable(pdb, "spamlist", rdonly);
+ if (pblist == NULL) {
+ fprintf(stderr, "%s: cannot open spamlist\n", argv[0]);
+ exit(2);
+ }
+ pglist = pdb->opentable(pdb, "goodlist", rdonly);
+ if (pglist == NULL) {
+ fprintf(stderr, "%s: cannot open goodlist\n", argv[0]);
+ exit(2);
+ }
+
+ while ((n = getline(&line, &linesiz, stdin)) > 0) {
+ if (line[n - 1] == '\n')
+ line[--n] = '\0';
+
+ if ((fd = open(line, O_RDONLY)) == -1)
+ err(1, "open: %s", line);
+
+ memset(stats.extrema, 0, stats.keepers * sizeof(discri…
+
+ lex_create(&lex, mboxtype);
+ if (!lex_load(&lex, fd)) {
+ fprintf(stderr, "%s: cannot read input\n", arg…
+ exit(2);
+ }
+ lex_nexttoken(&lex, &tok);
+ if (tok.tt == eof) {
+ fprintf(stderr, "%s: no input available\n", ar…
+ exit(2);
+ }
+
+ while (tok.tt != eof) {
+ /* TODO: vec_create at top, vec->nitems = 0, b…
+ vec_create(&mlist);
+
+ bvec_loadmsg(&mlist, &lex, &tok);
+ bayesfilt(pglist, pblist, &mlist, &stats);
+
+ vec_destroy(&mlist);
+
+ printf("%s\t%f\n", line, stats.spamicity);
+ }
+
+ lex_destroy(&lex);
+
+ close(fd);
+ }
+
+ pglist->close(pglist);
+ free(pglist);
+ pblist->close(pblist);
+ free(pblist);
+
+ pdb->close(pdb);
+ free(pdb);
+
+ free(stats.extrema);
+
+ return 0;
+ }
+
lex_create(&lex, mboxtype);
if (!lex_load(&lex, fd)) {
fprintf(stderr, "%s: cannot read input\n", argv[0]);
@@ -172,6 +245,7 @@ main(int argc, char **argv)
fprintf(stderr, "%s: no input available\n", argv[0]);
exit(2);
}
+
if (mode == mode_test) {
pblist = pdb->opentable(pdb, "spamlist", rdonly);
if (pblist == NULL) {
diff --git a/dbh.c b/dbh.c
@@ -95,6 +95,8 @@ dbtext_db_open(cpchar dbname, bool_t rdonly)
goto bail;
}
+/* TODO: handle unveil for bulk mode */
+#if 0
/* unveil(2), TODO: rework later */
char listpath[PATH_MAX];
snprintf(listpath, sizeof(listpath), "%s/%s", pthis->dir, "goodlist.tx…
@@ -111,6 +113,7 @@ dbtext_db_open(cpchar dbname, bool_t rdonly)
perror("unveil()");
exit(2);
}
+#endif
return pthis;
You are viewing proxied material from codemadness.org. The copyright of proxied material belongs to its original authors. Any comments or complaints in relation to proxied material should be directed to the original authors of the content concerned. Please see the disclaimer for more details.