Markup.
michael at optusnet.com.au
michael at optusnet.com.au
Fri May 9 09:00:13 CEST 2003
I finally got around to starting to experiment with
marking up some email features that bogofilter doesn't
currently see.
To start with, I added counting of the number
of html tags and html comments in an email.
Using a corpus of 75,122 spams, and 48,042 hams,
I divided them randomly into 4 sets. I then
used one set for training, and tested it against
all four sets. Repeat until each set has been
used for training.
Using a virgin bogofilter 0.12.2 I get these results.
(Sets are numbered 0 through 3).
CONFIG : Mindev 0.100, RobX 0.415
0 against 0 --> false pos 0 false neg 1309
0 against 1 --> false pos 2 false neg 3194
0 against 2 --> false pos 2 false neg 3271
0 against 3 --> false pos 3 false neg 3326
1 against 0 --> false pos 0 false neg 3476
1 against 1 --> false pos 0 false neg 1222
1 against 2 --> false pos 0 false neg 3259
1 against 3 --> false pos 1 false neg 3370
2 against 0 --> false pos 3 false neg 3370
2 against 1 --> false pos 3 false neg 3141
2 against 2 --> false pos 0 false neg 1190
2 against 3 --> false pos 3 false neg 3274
3 against 0 --> false pos 3 false neg 3388
3 against 1 --> false pos 6 false neg 3184
3 against 2 --> false pos 2 false neg 3288
3 against 3 --> false pos 0 false neg 1206
I.e., when using set 0 to train, it recorded 2 false positives
in set 1, and 3194 false negatives. (With about 18,700 spams
in set 1 that's an OK result. Not great, but OK.)
Using the patch below, I get these results:
CONFIG : Mindev 0.100, RobX 0.415
0 against 0 --> false pos 0 false neg 1226
0 against 1 --> false pos 2 false neg 3027
0 against 2 --> false pos 2 false neg 3082
0 against 3 --> false pos 3 false neg 3134
1 against 0 --> false pos 0 false neg 3264
1 against 1 --> false pos 0 false neg 1160
1 against 2 --> false pos 0 false neg 3069
1 against 3 --> false pos 1 false neg 3167
2 against 0 --> false pos 3 false neg 3190
2 against 1 --> false pos 3 false neg 2947
2 against 2 --> false pos 0 false neg 1110
2 against 3 --> false pos 3 false neg 3062
3 against 0 --> false pos 3 false neg 3193
3 against 1 --> false pos 7 false neg 3029
3 against 2 --> false pos 2 false neg 3081
3 against 3 --> false pos 0 false neg 1134
As you can see, there's a single false positive increase (Looking at
the email, I can't really tell if it's spam or not).
On the other hand, there are definitely fewer false negatives.
(Someone with a statistics background want to tell me
how significant this result is?)
On the whole, it looks like markup might be an interesting
direction to experiment in.
Patch to do the above. Adds the '-A' flag which turns on
markup tokens. Currently the only tokens added are
(along with the counts and probabilities from one
of the training dbases)
m:html_comment100
m:html_comment20
m:html_comment10
m:html_comment4
m:html_tag20
m:html_tag10
m:html_tag4
The existence of the token 'm:html_tag10' says that
there are at least 10 html tags in the email.
Michael, experimenting.
diff --exclude=.* -ur bogofilter-0.12.2/src/bogoconfig.c bogofilter-0.12.2.dev/src/bogoconfig.c
--- bogofilter-0.12.2/src/bogoconfig.c Tue Apr 29 00:30:20 2003
+++ bogofilter-0.12.2.dev/src/bogoconfig.c Fri May 9 11:02:31 2003
@@ -159,6 +159,8 @@
{ "db_cachesize", CP_INTEGER, { (void *) &db_cachesize } },
{ "terse", CP_BOOLEAN, { (void *) &terse } },
+ { "markup", CP_BOOLEAN, { (void *) &markup } },
+
{ NULL, CP_NONE, { (void *) NULL } },
};
@@ -428,7 +430,7 @@
#if HAVE_DECL_OPTRESET
optreset = 1;
#endif
- while ((option = getopt(argc, argv, ":23bBc:Cd:DefFghI:lL:m:MnNo:O:pqQRrsStTuvVx:y:" G R F)) != -1)
+ while ((option = getopt(argc, argv, ":23AbBc:Cd:DefFghI:lL:m:MnNo:O:pqQRrsStTuvVx:y:" G R F)) != -1)
{
#if 0
if (getenv("BOGOFILTER_DEBUG_OPTIONS")) {
@@ -443,6 +445,10 @@
case '3':
twostate = option == '2';
threestate = option == '3';
+ break;
+
+ case 'A':
+ markup = 1;
break;
case 'b':
diff --exclude=.* -ur bogofilter-0.12.2/src/globals.c bogofilter-0.12.2.dev/src/globals.c
--- bogofilter-0.12.2/src/globals.c Tue Apr 22 03:03:17 2003
+++ bogofilter-0.12.2.dev/src/globals.c Fri May 9 11:05:43 2003
@@ -36,6 +36,7 @@
bool terse; /* '-t' */
int test = 0; /* '-T' */
int verbose; /* '-v' */
+bool markup = 0; /* '-A' */
/* config file options */
int max_repeats;
diff --exclude=.* -ur bogofilter-0.12.2/src/globals.h bogofilter-0.12.2.dev/src/globals.h
--- bogofilter-0.12.2/src/globals.h Tue Apr 22 03:03:17 2003
+++ bogofilter-0.12.2.dev/src/globals.h Fri May 9 11:05:51 2003
@@ -33,6 +33,7 @@
extern bool terse; /* '-t' */
extern int test; /* '-T' */
extern int verbose; /* '-v' */
+extern bool markup; /* '-A' */
/* config file options */
extern int max_repeats;
Only in bogofilter-0.12.2.dev/src: globals.o
Only in bogofilter-0.12.2.dev/src: graham.o
diff --exclude=.* -ur bogofilter-0.12.2/src/html.c bogofilter-0.12.2.dev/src/html.c
--- bogofilter-0.12.2/src/html.c Wed Apr 2 00:35:35 2003
+++ bogofilter-0.12.2.dev/src/html.c Fri May 9 11:19:23 2003
@@ -119,6 +119,7 @@
if (memcmp(tmp, start, start_len) != 0)
tmp += 1;
else {
+ html_comment(1);
comment = tmp;
level += 1;
tmp += start_len;
@@ -136,6 +137,7 @@
buff_shift(buff, comment, tmp - comment);
tmp = comment;
level -= 1;
+ html_comment(-1);
/* If not followed by a comment, there is no need to keep reading */
if (level == 0 && isalnum(*tmp))
done = true;
Only in bogofilter-0.12.2.dev/src: textblock.o
diff --exclude=.* -ur bogofilter-0.12.2/src/token.c bogofilter-0.12.2.dev/src/token.c
--- bogofilter-0.12.2/src/token.c Wed Apr 23 03:32:27 2003
+++ bogofilter-0.12.2.dev/src/token.c Fri May 9 11:31:40 2003
@@ -23,7 +23,7 @@
#include "word.h"
#include "token.h"
#include "xmemrchr.h"
-
+#include "xmalloc.h"
/* Local Variables */
word_t *yylval = NULL;
@@ -31,8 +31,14 @@
static token_t save_class = NONE;
static word_t *ipsave = NULL;
+static word_t **token_buffer = NULL;
+static int token_buffer_size = 0;
+static int token_buffer_used = 0;
+
static int html_tag_level = 0;
static int html_comment_level = 0;
+static int html_tag_count = 0;
+static int html_comment_count = 0;
/* Global Variables */
@@ -57,6 +63,17 @@
void html_tag(int level)
{
html_tag_level = level;
+
+ if (!markup)
+ return;
+ html_tag_count++;
+ if (html_tag_count == 4)
+ push_token(word_make("m:html_tag4"));
+ if (html_tag_count == 10)
+ push_token(word_make("m:html_tag10"));
+ if (html_tag_count == 20)
+ push_token(word_make("m:html_tag20"));
+
}
void html_comment(int level)
@@ -64,6 +81,54 @@
html_comment_level += level;
if (html_comment_level < 0)
html_comment_level = 0;
+
+ if (!markup || level < 1)
+ return;
+
+ html_comment_count++;
+ if (html_comment_count == 4)
+ push_token(word_make("m:html_comment4"));
+ if (html_comment_count == 10)
+ push_token(word_make("m:html_comment10"));
+ if (html_comment_count == 20)
+ push_token(word_make("m:html_comment20"));
+ if (html_comment_count == 100)
+ push_token(word_make("m:html_comment100"));
+}
+
+/*
+ * Add a token to the stack. Each time
+ * get_token() is called, a token from this
+ * stack will be returned until the stack is
+ * empty, at which time get_token() will then
+ * go back to the lexer.
+ */
+void push_token(word_t * w)
+{
+ if (!token_buffer) {
+ token_buffer = xmalloc(sizeof(*token_buffer) * 20);
+ token_buffer_size = 20;
+ token_buffer_used = 0;
+ }
+
+ if (token_buffer_used == token_buffer_size) {
+ token_buffer_size = token_buffer_size * 2 + 5;
+ token_buffer = xrealloc(token_buffer , sizeof(*token_buffer) * token_buffer_size);
+ }
+
+ token_buffer[token_buffer_used++] = w;
+}
+
+/*
+ * If there's a token in the stack, pop it
+ * from the stack and return it, else return
+ * NULL.
+ */
+static word_t * pop_token(void) {
+ if (!token_buffer || !token_buffer_used)
+ return NULL;
+
+ return token_buffer[--token_buffer_used];
}
token_t get_token(void)
@@ -72,6 +137,13 @@
unsigned char *cp;
bool done = false;
+
+ /* Are there words queued up to be used? */
+ yylval = pop_token();
+ if (yylval) {
+ return TOKEN;
+ }
+
/* If saved IPADDR, truncate last octet */
if ( block_on_subnets && save_class == IPADDR )
{
@@ -193,18 +265,30 @@
}
/* Need separate loop so lexer can see "From", "Date", etc */
- for (cp = yylval->text; cp < yylval->text+yylval->leng; cp += 1)
- *cp = casefold_table[*cp];
+ /* only casefold for normal tokens. Not for 'Subject' line tags. */
+ if (!markup || (markup && !token_prefix))
+ for (cp = yylval->text; cp < yylval->text+yylval->leng; cp += 1)
+ *cp = casefold_table[*cp];
return(class);
}
void token_init(void)
{
+ word_t * w;
+
msg_header = true;
+ html_tag_count = 0;
+ html_comment_count = 0;
+
yyinit();
mime_reset();
reset_html_level();
+
+ /* Free any words that are queued up. */
+ for (w = pop_token() ; w ; w = pop_token() ) {
+ word_free(w);
+ }
}
void got_from(void)
diff --exclude=.* -ur bogofilter-0.12.2/src/token.h bogofilter-0.12.2.dev/src/token.h
--- bogofilter-0.12.2/src/token.h Thu Apr 17 02:21:26 2003
+++ bogofilter-0.12.2.dev/src/token.h Fri May 9 10:44:40 2003
@@ -14,6 +14,7 @@
extern word_t *yylval;
+extern void push_token(word_t *);
extern token_t get_token(void);
extern void got_from(void);
Only in bogofilter-0.12.2.dev/src: token.o
Only in bogofilter-0.12.2.dev/src: uudecode.o
Only in bogofilter-0.12.2.dev/src: version.c
Only in bogofilter-0.12.2.dev/src: version.o
diff --exclude=.* -ur bogofilter-0.12.2/src/word.c bogofilter-0.12.2.dev/src/word.c
--- bogofilter-0.12.2/src/word.c Mon Mar 31 00:12:20 2003
+++ bogofilter-0.12.2.dev/src/word.c Fri May 9 11:05:08 2003
@@ -31,6 +31,11 @@
return self;
}
+word_t *word_make(const byte *text)
+{
+ return word_new(text, strlen(text));
+}
+
void word_free(word_t *self)
{
xfree(self);
diff --exclude=.* -ur bogofilter-0.12.2/src/word.h bogofilter-0.12.2.dev/src/word.h
--- bogofilter-0.12.2/src/word.h Thu Feb 27 14:22:49 2003
+++ bogofilter-0.12.2.dev/src/word.h Fri May 9 11:04:02 2003
@@ -24,6 +24,7 @@
} word_t;
extern word_t *word_new(const byte *text, size_t leng);
+extern word_t *word_make(const byte *text);
extern void word_free(word_t *self);
extern word_t *word_dup(const word_t *self);
extern word_t *word_cpy(word_t *dst, const word_t *src);
More information about the Bogofilter
mailing list