Markup.

Fri May 9 09:00:13 CEST 2003

I finally got around to starting to experiment with
marking up some email features that bogofilter doesn't
currently see.

To start with, I added counting of the number
of html tags and html comments in an email.

Using a corpus of 75,122 spams and 48,042 hams,
I divided them randomly into 4 sets. I then
used one set for training, and tested it against
all four sets. Repeat until each set has been
used for training.

Using a virgin bogofilter 0.12.2 I get these results.
(Sets are numbered 0 through 3.)
CONFIG : Mindev 0.100, RobX 0.415
         0 against 0   --> false pos     0 false neg  1309
         0 against 1   --> false pos     2 false neg  3194
         0 against 2   --> false pos     2 false neg  3271
         0 against 3   --> false pos     3 false neg  3326
         1 against 0   --> false pos     0 false neg  3476
         1 against 1   --> false pos     0 false neg  1222
         1 against 2   --> false pos     0 false neg  3259
         1 against 3   --> false pos     1 false neg  3370
         2 against 0   --> false pos     3 false neg  3370
         2 against 1   --> false pos     3 false neg  3141
         2 against 2   --> false pos     0 false neg  1190
         2 against 3   --> false pos     3 false neg  3274
         3 against 0   --> false pos     3 false neg  3388
         3 against 1   --> false pos     6 false neg  3184
         3 against 2   --> false pos     2 false neg  3288
         3 against 3   --> false pos     0 false neg  1206

I.e., when using set 0 to train, it recorded 2 false positives
and 3194 false negatives in set 1. (With about 18,700 spams
in set 1 that's an OK result. Not great, but OK.)

Using the patch below, I get these results:

CONFIG : Mindev 0.100, RobX 0.415
         0 against 0   --> false pos     0 false neg  1226
         0 against 1   --> false pos     2 false neg  3027
         0 against 2   --> false pos     2 false neg  3082
         0 against 3   --> false pos     3 false neg  3134
         1 against 0   --> false pos     0 false neg  3264
         1 against 1   --> false pos     0 false neg  1160
         1 against 2   --> false pos     0 false neg  3069
         1 against 3   --> false pos     1 false neg  3167
         2 against 0   --> false pos     3 false neg  3190
         2 against 1   --> false pos     3 false neg  2947
         2 against 2   --> false pos     0 false neg  1110
         2 against 3   --> false pos     3 false neg  3062
         3 against 0   --> false pos     3 false neg  3193
         3 against 1   --> false pos     7 false neg  3029
         3 against 2   --> false pos     2 false neg  3081
         3 against 3   --> false pos     0 false neg  1134

As you can see, there's a single false positive increase (in "3 against 1",
from 6 to 7). Looking at the email, I can't really tell if it's spam or not.

On the other hand, there are definitely fewer false negatives.
(Would someone with a statistics background like to tell me
how significant this result is?)

On the whole, it looks like markup might be an interesting
direction to experiment in.

Patch to do the above. Adds the '-A' flag (and a matching 'markup'
config file option) which turns on markup tokens. Currently the
only tokens added are the following (along with the counts and
probabilities from one of the training databases):

m:html_comment100
m:html_comment20
m:html_comment10
m:html_comment4
m:html_tag20
m:html_tag10
m:html_tag4

The existence of the token 'm:html_tag10' says that
there are at least 10 HTML tags in the email.

Michael, experimenting.

diff --exclude=.* -ur bogofilter-0.12.2/src/bogoconfig.c bogofilter-0.12.2.dev/src/bogoconfig.c

--- bogofilter-0.12.2/src/bogoconfig.c	Tue Apr 29 00:30:20 2003
+++ bogofilter-0.12.2.dev/src/bogoconfig.c	Fri May  9 11:02:31 2003
@@ -159,6 +159,8 @@
     { "db_cachesize",	  	     CP_INTEGER, { (void *) &db_cachesize } },
     { "terse",	 	  	     CP_BOOLEAN, { (void *) &terse } },
 
+    { "markup",			     CP_BOOLEAN, { (void *) &markup } },
+
     { NULL,		  	     CP_NONE,	 { (void *) NULL } },
 };
 
@@ -428,7 +430,7 @@
 #if HAVE_DECL_OPTRESET
     optreset = 1;
 #endif
-    while ((option = getopt(argc, argv, ":23bBc:Cd:DefFghI:lL:m:MnNo:O:pqQRrsStTuvVx:y:" G R F)) != -1)
+    while ((option = getopt(argc, argv, ":23AbBc:Cd:DefFghI:lL:m:MnNo:O:pqQRrsStTuvVx:y:" G R F)) != -1)
     {
 #if 0
 	if (getenv("BOGOFILTER_DEBUG_OPTIONS")) {
@@ -443,6 +445,10 @@
 	case '3':
 	    twostate = option == '2';
 	    threestate = option == '3';
+	    break;
+
+	case 'A':
+	    markup = 1;
 	    break;
 
 	case 'b':
diff --exclude=.* -ur bogofilter-0.12.2/src/globals.c bogofilter-0.12.2.dev/src/globals.c
--- bogofilter-0.12.2/src/globals.c	Tue Apr 22 03:03:17 2003
+++ bogofilter-0.12.2.dev/src/globals.c	Fri May  9 11:05:43 2003
@@ -36,6 +36,7 @@
 bool	terse;				/* '-t' */
 int	test = 0;			/* '-T' */
 int	verbose;			/* '-v' */
+bool	markup = 0;			/* '-A' */
 
 /* config file options */
 int	max_repeats;
diff --exclude=.* -ur bogofilter-0.12.2/src/globals.h bogofilter-0.12.2.dev/src/globals.h
--- bogofilter-0.12.2/src/globals.h	Tue Apr 22 03:03:17 2003
+++ bogofilter-0.12.2.dev/src/globals.h	Fri May  9 11:05:51 2003
@@ -33,6 +33,7 @@
 extern	bool	terse;			/* '-t' */
 extern	int	test;			/* '-T' */
 extern	int	verbose;		/* '-v' */
+extern  bool	markup;			/* '-A' */
 
 /* config file options */
 extern	int	max_repeats;
Only in bogofilter-0.12.2.dev/src: globals.o
Only in bogofilter-0.12.2.dev/src: graham.o
diff --exclude=.* -ur bogofilter-0.12.2/src/html.c bogofilter-0.12.2.dev/src/html.c
--- bogofilter-0.12.2/src/html.c	Wed Apr  2 00:35:35 2003
+++ bogofilter-0.12.2.dev/src/html.c	Fri May  9 11:19:23 2003
@@ -119,6 +119,7 @@
 	    if (memcmp(tmp, start, start_len) != 0)
 		tmp += 1;
 	    else {
+		html_comment(1);
 		comment = tmp;
 		level += 1;
 		tmp += start_len;
@@ -136,6 +137,7 @@
 		buff_shift(buff, comment, tmp - comment);
 		tmp = comment;
 		level -= 1;
+		html_comment(-1);
 		/* If not followed by a comment, there is no need to keep reading */
 		if (level == 0 && isalnum(*tmp))
 		    done = true;
Only in bogofilter-0.12.2.dev/src: textblock.o
diff --exclude=.* -ur bogofilter-0.12.2/src/token.c bogofilter-0.12.2.dev/src/token.c
--- bogofilter-0.12.2/src/token.c	Wed Apr 23 03:32:27 2003
+++ bogofilter-0.12.2.dev/src/token.c	Fri May  9 11:31:40 2003
@@ -23,7 +23,7 @@
 #include "word.h"
 #include "token.h"
 #include "xmemrchr.h"
-
+#include "xmalloc.h"
 /* Local Variables */
 
 word_t *yylval = NULL;
@@ -31,8 +31,14 @@
 static token_t save_class = NONE;
 static word_t *ipsave = NULL;
 
+static word_t **token_buffer = NULL;
+static int token_buffer_size = 0;
+static int token_buffer_used = 0;
+
 static int html_tag_level = 0;
 static int html_comment_level = 0;
+static int html_tag_count = 0;
+static int html_comment_count = 0;
 
 /* Global Variables */
 
@@ -57,6 +63,17 @@
 void html_tag(int level)
 {
     html_tag_level = level;
+
+    if (!markup)
+	return;
+    html_tag_count++;
+    if (html_tag_count == 4)
+	push_token(word_make("m:html_tag4"));
+    if (html_tag_count == 10)
+	push_token(word_make("m:html_tag10"));
+    if (html_tag_count == 20)
+	push_token(word_make("m:html_tag20"));
+    
 }
 
 void html_comment(int level)
@@ -64,6 +81,54 @@
     html_comment_level += level;
     if (html_comment_level < 0)
 	html_comment_level = 0;
+
+    if (!markup || level < 1)
+	return;
+
+    html_comment_count++;
+    if (html_comment_count == 4)
+	push_token(word_make("m:html_comment4"));
+    if (html_comment_count == 10)
+	push_token(word_make("m:html_comment10"));
+    if (html_comment_count == 20)
+	push_token(word_make("m:html_comment20"));
+    if (html_comment_count == 100)
+	push_token(word_make("m:html_comment100"));
+}
+
+/*
+ * Add a token to the stack. Each time
+ * get_token() is called, a token from this
+ * queue will be returned until the queue is
+ * empty, at which time get_token() will then
+ * go back to the lexer.
+ */
+void push_token(word_t * w)
+{
+	if (!token_buffer) {
+		token_buffer = xmalloc(sizeof(*token_buffer) * 20);
+		token_buffer_size = 20;
+		token_buffer_used = 0;
+	}
+
+	if (token_buffer_used == token_buffer_size) {
+		token_buffer_size = token_buffer_size * 2 + 5;
+		token_buffer = xrealloc(token_buffer , sizeof(*token_buffer) * token_buffer_size);
+	}
+
+	token_buffer[token_buffer_used++] = w;
+}
+
+/*
+ * If there's a token in the stack, pop it
+ * from the stack and return it, else return
+ * NULL.
+ */
+static word_t * pop_token(void) {
+	if (!token_buffer || !token_buffer_used)
+		return NULL;
+	
+	return token_buffer[--token_buffer_used];
 }
 
 token_t get_token(void)
@@ -72,6 +137,13 @@
     unsigned char *cp;
     bool done = false;
 
+
+	/* Are there words queued up to be used? */
+    yylval = pop_token();
+    if (yylval) {
+	return TOKEN;
+    }
+
     /* If saved IPADDR, truncate last octet */
     if ( block_on_subnets && save_class == IPADDR )
     {
@@ -193,18 +265,30 @@
     }
 
     /* Need separate loop so lexer can see "From", "Date", etc */
-    for (cp = yylval->text; cp < yylval->text+yylval->leng; cp += 1)
-	*cp = casefold_table[*cp];
+    /* only casefold for normal tokens. Not for 'Subject' line tags. */
+    if (!markup || (markup && !token_prefix))
+	for (cp = yylval->text; cp < yylval->text+yylval->leng; cp += 1)
+	    *cp = casefold_table[*cp];
 
     return(class);
 }
 
 void token_init(void)
 {
+    word_t * w;
+
     msg_header = true;
+    html_tag_count = 0;
+    html_comment_count = 0;
+
     yyinit();
     mime_reset(); 
     reset_html_level();
+ 
+	/* Free any words that are queued up. */
+    for (w = pop_token() ; w ; w = pop_token() ) {
+	word_free(w);
+    }   
 }
 
 void got_from(void)
diff --exclude=.* -ur bogofilter-0.12.2/src/token.h bogofilter-0.12.2.dev/src/token.h
--- bogofilter-0.12.2/src/token.h	Thu Apr 17 02:21:26 2003
+++ bogofilter-0.12.2.dev/src/token.h	Fri May  9 10:44:40 2003
@@ -14,6 +14,7 @@
 
 extern word_t *yylval;
 
+extern void push_token(word_t *);
 extern token_t get_token(void);
 
 extern void got_from(void);
Only in bogofilter-0.12.2.dev/src: token.o
Only in bogofilter-0.12.2.dev/src: uudecode.o
Only in bogofilter-0.12.2.dev/src: version.c
Only in bogofilter-0.12.2.dev/src: version.o
diff --exclude=.* -ur bogofilter-0.12.2/src/word.c bogofilter-0.12.2.dev/src/word.c
--- bogofilter-0.12.2/src/word.c	Mon Mar 31 00:12:20 2003
+++ bogofilter-0.12.2.dev/src/word.c	Fri May  9 11:05:08 2003
@@ -31,6 +31,11 @@
     return self;
 }
 
+word_t *word_make(const byte *text) 
+{
+    return word_new(text, strlen(text));
+}
+
 void word_free(word_t *self)
 {
     xfree(self);
diff --exclude=.* -ur bogofilter-0.12.2/src/word.h bogofilter-0.12.2.dev/src/word.h
--- bogofilter-0.12.2/src/word.h	Thu Feb 27 14:22:49 2003
+++ bogofilter-0.12.2.dev/src/word.h	Fri May  9 11:04:02 2003
@@ -24,6 +24,7 @@
 } word_t;
 
 extern word_t  *word_new(const byte *text, size_t leng);
+extern word_t  *word_make(const byte *text);
 extern void 	word_free(word_t *self);
 extern word_t  *word_dup(const word_t *self);
 extern word_t  *word_cpy(word_t *dst, const word_t *src);