[PATCH] Better tagging.

Sat Sep 13 14:02:52 CEST 2003

Bogofilter 0.15.3 gets the header tagging slightly wrong in
my not so humble opinion. :)

In particular, tokens discovered in the headers aren't
marked as being in the header (e.g. tokens from 'Received'
and suchlike).

Leaving these tokens untagged degrades the performance of
bogofilter by about 4 percentage points on my dataset: the
false-negative rate is approx 14% before this patch, and
approx 10% after. False positives aren't changed (they _may_
be better, but the numbers are too small to be reliable).

This is pretty much what I'd expect. The more info
you feed it, the better it is at discrimination.


Details:
1) Change the 'set_tag' function to take a 'char *' rather
than a 'word_t *'. Every user starts with a char *, and it's a bit
silly to convert to a word_t, just to convert it back again. :)

2) Keep the header-name from the various id headers. 'Resent-Message-ID'
is actually a powerful discriminator for example.

$ echo h:Resent-Message-ID | bogoutil  -W -p ./d/a0
                       spam    good  Gra prob  Rob/Fis
h:Resent-Message-ID       1     500  0.002132  0.143563

3) Return NONE on <<EOF>>. At the moment, bogofilter works
by 'accident' (the enum NONE happens to be zero, which is what
the lexer returns by default at end of input). This change
doesn't actually change any functionality, it just makes it
explicit as to what's happening.

4) Explicitly tag tokens inside 'Received' headers. These headers
contain tokens that are likely to appear in the body of the email
but with a different purpose. Let's not mix them. :)


Michael.

diff -ur --exclude lexer_v3.c --exclude '*.log' bogofilter-0.15.3/src/lexer.h bogofilter-dev/src/lexer.h

--- bogofilter-0.15.3/src/lexer.h	Wed Sep 10 11:04:15 2003
+++ bogofilter-dev/src/lexer.h	Sat Sep 13 20:16:09 2003
@@ -32,13 +32,17 @@
     BOUNDARY,	/* MIME multipart boundary line */
     IPADDR,	/* ip address */
     MSG_COUNT_LINE,
-    BOGO_LEX_LINE
+    BOGO_LEX_LINE,
+//    BREAK,
 } token_t;
 
 /* in lexer.c */
 extern int yylineno;
 extern bool msg_header;
 
+/* in token.c */
+void add_hint(char *);
+
 /* Define a struct for interfacing to a lexer */
 
 typedef token_t yylex_t(void);
Binary files bogofilter-0.15.3/src/lexer.o and bogofilter-dev/src/lexer.o differ
diff -ur --exclude lexer_v3.c --exclude '*.log' bogofilter-0.15.3/src/lexer_v3.l bogofilter-dev/src/lexer_v3.l
--- bogofilter-0.15.3/src/lexer_v3.l	Wed Sep 10 11:04:15 2003
+++ bogofilter-dev/src/lexer_v3.l	Sat Sep 13 20:22:35 2003
@@ -94,6 +94,7 @@
 static void reorder_html(void);
 
 static void skip_to(char chr);
+static void use_to(char chr);
 
 /* Function Definitions */
 
@@ -213,20 +214,19 @@
 						    unput(w->text[size]);
 						}
 
-<INITIAL>^(To|From|Return-Path|Subject):	{ if (header_line_markup) set_tag(yy_text()); }
-<INITIAL>^Content-(Transfer-Encoding|Type|Disposition):{MTYPE}	{ mime_content(yy_text()); skip_to(':'); return TOKEN; }
-<INITIAL>^MIME-Version:.*			{ mime_version(yy_text()); skip_to(':'); return TOKEN; }
-
-<INITIAL>^(Delivery-)?Date:.*			/* ignore */
-<INITIAL>^(Resent-)?Message-ID:.*		/* ignore */
+<INITIAL>^(To|From|Return-Path|Subject|Received):	{ if (header_line_markup) set_tag(yytext); }
+<INITIAL>^Content-(Transfer-Encoding|Type|Disposition):{MTYPE}	{ mime_content(yy_text()); return TOKEN; }
+<INITIAL>^MIME-Version:.*			{ mime_version(yy_text()); return TOKEN; }
 
+<INITIAL>^(Delivery-)?Date:.*			{ return HEADKEY; }
+<INITIAL>^(Resent-)?Message-ID:.*		{ return HEADKEY; }
 <INITIAL>^(In-Reply-To|References):.* 		{ return HEADKEY; }
 
 <INITIAL>boundary=[ ]*\"?{MIME_BOUNDARY}\"?	{ mime_boundary_set(yy_text()); }
-<INITIAL>charset=\"?{CHARSET}\"?		{ got_charset(yytext); skip_to('='); return TOKEN; }
+<INITIAL>charset=\"?{CHARSET}\"?		{ got_charset(yytext); return TOKEN; }
 
 <INITIAL>(file)?name=\"?			/* ignore */
-<INITIAL>(ESMTP|SMTP)+[ \t\n]+id\ {ID}		/* ignore */
+<INITIAL>(ESMTP|SMTP)+[ \t\n]+id\ {ID}		{ use_to(' '); return TOKEN; }
 <INITIAL>[:blank:]*id\ {ID}			/* ignore */
 
 <INITIAL>\n[ \t]				{ lineno += 1; }
@@ -238,6 +238,8 @@
 						  clr_tag();
 						  return EOH;
 						}
+<INITIAL>\n					{ if (header_line_markup) set_tag("Header"); lineno += 1; }
+<INITIAL><<EOF>>				{ add_hint("no_body:"); return NONE; }
 ^--{MIME_BOUNDARY}(--)?$			{ if (got_mime_boundary(yy_text())) {
 						      BEGIN INITIAL;
 						      msg_header = true;
@@ -278,6 +280,7 @@
 
 .						/* ignore character */
 \n						{ lineno += 1; clr_tag(); }
+<<EOF>>						{ return NONE; }
 %%
 
 void lexer_v3_init(FILE *fp)
@@ -294,6 +297,15 @@
     yyless(len);
 }
 
+static void use_to(char chr)
+{
+    char * p = memchr(yytext, chr, yyleng);
+    if (p) {
+	*p = 0;
+	yyleng = (p - yytext);
+    }
+}
+
 static void reorder_html(void)
 {
     char *chr = memchr(yytext, '<', yyleng);	/* find start of html tag */
diff -ur --exclude lexer_v3.c --exclude '*.log' bogofilter-0.15.3/src/token.c bogofilter-dev/src/token.c
--- bogofilter-0.15.3/src/token.c	Wed Sep 10 11:04:15 2003
+++ bogofilter-dev/src/token.c	Sat Sep 13 20:19:33 2003
@@ -37,6 +37,8 @@
 static word_t *w_from = NULL;	/* From: */
 static word_t *w_rtrn = NULL;	/* Return-Path: */
 static word_t *w_subj = NULL;	/* Subject: */
+static word_t *w_recv = NULL;	/* Received: */
+static word_t *w_header = NULL;	/* Received: */
 
 /* Global Variables */
 
@@ -83,7 +85,7 @@
 	    fputc('\n', dbgout);
 	}
 	    
-	if (cls == NONE)
+	if (cls == NONE) /* End of message */
 	    break;
 
 	switch (cls) {
@@ -220,6 +222,8 @@
 	w_from = word_new((const byte *) "from:", 0);	/* From: */
 	w_rtrn = word_new((const byte *) "rtrn:", 0);	/* Return-Path: */
 	w_subj = word_new((const byte *) "subj:", 0);	/* Subject: */
+	w_recv = word_new((const byte *) "rv:", 0);
+	w_header = word_new((const byte *) "h:", 0);
     }
 
     return;
@@ -230,17 +234,23 @@
     token_prefix = NULL;
 }
 
-void set_tag(word_t *text)
+void set_tag(char *text)
 {
-    switch (tolower(*text->text)) {
+    switch (tolower(*text)) {
     case 't':			/* To: */
 	token_prefix = w_to;
 	break;
     case 'f':			/* From: */
 	token_prefix = w_from;
 	break;
+    case 'h':
+	token_prefix = w_header;
+	break;
     case 'r':			/* Return-Path: */
-	token_prefix = w_rtrn;
+	if (tolower(text[2]) == 't')
+	    token_prefix = w_rtrn;
+	else
+	    token_prefix = w_recv;
 	break;
     case 's':			/* Subject: */
 	token_prefix = w_subj;
@@ -248,12 +258,17 @@
     default:
 	fprintf(stderr, "%s:%d  invalid tag - '%s'\n", 
 		__FILE__, __LINE__, 
-		(char *)text->text);
+		text);
 	exit(EX_ERROR);
     }
     return;
 }
 
+void add_hint(char *h)
+{
+	(void*)h;
+}
+
 /* Cleanup storage allocation */
 void token_cleanup()
 {
diff -ur --exclude lexer_v3.c --exclude '*.log' bogofilter-0.15.3/src/token.h bogofilter-dev/src/token.h
--- bogofilter-0.15.3/src/token.h	Sun Sep  7 01:30:09 2003
+++ bogofilter-dev/src/token.h	Sat Sep 13 20:18:29 2003
@@ -18,7 +18,7 @@
 
 extern void got_from(void);
 extern void clr_tag(void);
-extern void set_tag(word_t *text);
+extern void set_tag(char *text);
 
 extern void token_init(void);
 extern void token_cleanup(void);
Binary files bogofilter-0.15.3/src/token.o and bogofilter-dev/src/token.o differ