[patch] word pairs

michael at optusnet.com.au michael at optusnet.com.au
Sat Jul 19 13:33:12 CEST 2003


This is the first cut of a patch that lets bogofilter see word pairs.

(I know there's been various hackish pre-parsers to do similar things
previously, but they weren't fast enough for me to usefully play with
them).

This one is integrated but still hackish (it's missing the option to
turn off word pairs for example).

This patch changes get_token() to generate twice as many tokens. One
is the normal token, unchanged.  The other is that token concat'ed to
the previous token.

It won't generate tokens that would cross token classes. (i.e. it
won't join 'subj:fred' with the first token on the next line).  It
will never join tokens that are considered 'uniques' (i.e. mime
version et al).


Initial indications are that this is a HUGE win in accuracy.

In my ~200,000 item corpus, it pulled out 400 spams that had been
mis-filed as hams (that bogofilter had never previously noticed). The
false negative rate also dropped from ~3.5% of all spams down to
~2.2%.

I'll post some more numbers after I've cleaned the misfiles out of the
corpus.

Michael, impressed.

PS: The database size after training on ~50,000 emails is ~200 MB for
the good list, and ~50 MB for the spam list. Much better than I
expected.

PPS: This patch includes the Received: line tagging I posted
earlier.

diff -ur bogofilter-0.13.7.3/src/lexer.h bogofilter-dev/src/lexer.h
--- bogofilter-0.13.7.3/src/lexer.h	Mon Jun 16 21:56:12 2003
+++ bogofilter-dev/src/lexer.h	Sat Jul 19 18:39:14 2003
@@ -30,7 +30,9 @@
     BOUNDARY,	/* MIME multipart boundary line */
     IPADDR,	/* ip address */
     MSG_COUNT_LINE,
-    BOGO_LEX_LINE
+    BOGO_LEX_LINE,
+    TAGGED,
+    UNIQUE,
 } token_t;
 
 /* in lexer.c */
diff -ur bogofilter-0.13.7.3/src/lexer_v3.l bogofilter-dev/src/lexer_v3.l
--- bogofilter-0.13.7.3/src/lexer_v3.l	Wed Jul 16 10:40:53 2003
+++ bogofilter-dev/src/lexer_v3.l	Sat Jul 19 18:55:20 2003
@@ -180,15 +180,16 @@
 <BOGO_LEX>^\"{BOGOLEX_TOKEN}\"{NUM_NUM}		{ return BOGO_LEX_LINE; }
 <BOGO_LEX>\n					{ lineno += 1; }
 
-<INITIAL>charset=\"?{ID}\"?			{ got_charset(yytext); yyless(strlen("charset")); return TOKEN; }
-<INITIAL>^MIME-Version:.*			{ mime_version(yy_text()); yyless(strlen("MIME-Version:")); return(TOKEN); }
-<INITIAL>^Content-Transfer-Encoding:{MTYPE}	{ mime_encoding(yy_text()); yyless(strlen("Content-Transfer-Encoding:")); return(TOKEN); }
-<INITIAL>^Content-Type:{MTYPE};?		{ mime_type(yy_text()); yyless(strlen("Content-Type:")); return TOKEN; }
-<INITIAL>^Content-Disposition:{MTYPE}		{ mime_disposition(yy_text()); yyless(strlen("Content-Disposition:")); return TOKEN; }
+<INITIAL>charset=\"?{ID}\"?			{ got_charset(yytext); yyless(strlen("charset")); return UNIQUE; }
+<INITIAL>^MIME-Version:.*			{ mime_version(yy_text()); yyless(strlen("MIME-Version:")); return(UNIQUE); }
+<INITIAL>^Content-Transfer-Encoding:{MTYPE}	{ mime_encoding(yy_text()); yyless(strlen("Content-Transfer-Encoding:")); return(UNIQUE); }
+<INITIAL>^Content-Type:{MTYPE};?		{ mime_type(yy_text()); yyless(strlen("Content-Type:")); return UNIQUE; }
+<INITIAL>^Content-Disposition:{MTYPE}		{ mime_disposition(yy_text()); yyless(strlen("Content-Disposition:")); return UNIQUE; }
 <INITIAL>^To/:\.*				{ if (header_line_markup) set_tag("to:"); }
 <INITIAL>^From/:\.*				{ if (header_line_markup) set_tag("from:");  else return TOKEN; }
 <INITIAL>^Return-Path/:\.*			{ if (header_line_markup) set_tag("rtrn:");  else return TOKEN; }
 <INITIAL>^Subject/:\.*				{ if (header_line_markup) set_tag("subj:");  else return TOKEN; }
+<INITIAL>^Received:\.*				{ if (header_line_markup) set_tag("rcvd:");  else return TOKEN; }
 <INITIAL>^Date:.*|Delivery-Date:.*		;
 <INITIAL>^Message-ID:.*				;
 <INITIAL>^\tid\ {ID}				;
@@ -197,6 +198,7 @@
 <INITIAL>boundary=[ ]*\"?{MIME_BOUNDARY}\"?	{ mime_boundary_set(yy_text()); }
 <INITIAL>name=\"?				;
 <INITIAL>filename=\"?				;
+<INITIAL>\n\t					; /* Eat the newline as it's a multi-line header */
 <INITIAL>^[ \b\t]*\n				{ got_emptyline(); 
 						  lineno += 1;
 						  switch (get_content_type()) 
--- bogofilter-0.13.7.3/src/token.c	Mon Jun 16 21:56:12 2003
+++ bogofilter-dev/src/token.c	Sat Jul 19 20:19:34 2003
@@ -38,9 +38,76 @@
 static word_t *token_prefix = NULL;
 static word_t *nonblank_line = NULL;
 
+#define STACK_SIZE 20
+static word_t * stack_word[STACK_SIZE];
+static token_t stack_class[STACK_SIZE];
+static int sp = 0;
+
+static word_t * lastword = NULL;
+static token_t lastclass = -1;
+
+static word_t *spaceword = NULL;
 /* Function Definitions */
+token_t raw_get_token(void);
 
-token_t get_token(void)
+token_t get_token(void) 
+{
+    token_t c, class;
+    word_t * w;
+
+//printf("%d: %d: %s\n", sp, lastclass, lastword ? lastword->text : "(null)");
+    if (sp > 0) {
+	--sp;
+	if (yylval)
+		word_free(yylval);
+	yylval = stack_word[sp];
+	return stack_class[sp];
+    }
+
+again:
+    class = raw_get_token();
+
+    if (!class && lastword) { /* EOF */
+	word_free(lastword);
+	lastword = NULL;
+	return class;
+    }
+
+    if (!class)
+	return class;
+
+    if ((lastclass != class || class == UNIQUE) && lastword) {	/* Can't merge. */
+	if (lastword) {
+	    word_free(lastword);
+	    lastword = NULL;
+	}
+
+	lastword = word_dup(yylval);
+	lastclass = class;
+
+	return class;
+    }
+	
+    if (!spaceword)  spaceword = word_new(" ", 1); /* spacer. */
+
+    if (lastword) {
+	w = word_concat(lastword, spaceword);
+	stack_word[sp] = word_concat(w, yylval);
+	stack_class[sp] = class;
+	sp++;
+
+	word_free(w);
+	word_free(lastword);
+	lastword = NULL;
+    }
+
+    lastword = word_dup(yylval);
+    lastclass = class;
+
+    return class;
+}
+
+token_t raw_get_token(void)
 {
     token_t class = NONE;
     unsigned char *cp;
@@ -50,13 +117,17 @@
     if ( block_on_subnets && save_class == IPADDR )
     {
 	byte *t = xmemrchr(ipsave->text, '.', ipsave->leng);
-	if (t == NULL)
+	if (t == NULL) {
 	    save_class = NONE;
+	    word_free(ipsave);
+	}
 	else
 	{
 	    *t = '\0';	
 	    ipsave->leng = t - ipsave->text;
-	    yylval = ipsave;
+	    if (yylval)
+		word_free(yylval);
+	    yylval = word_dup(ipsave);
 	    return save_class;
 	}
     }
@@ -97,6 +168,7 @@
 		    word_t *w = word_concat(token_prefix, yylval);
 		    word_free(yylval);
 		    yylval = w;
+		    class = TAGGED;
 		}
 		break;
 	    }
@@ -139,7 +211,7 @@
 		memcpy(ipsave->text, prefix, plen);
 		memcpy(ipsave->text+plen, yylval->text, yylval->leng+1);
 		word_free(yylval);
-		yylval = ipsave;
+		yylval = word_dup(ipsave);
 		save_class = IPADDR;
 		return (class);
 	    }
@@ -226,7 +298,7 @@
     return;
 }
 
-const char *prefixes = "|to:|from:|rtrn:|subj:";
+const char *prefixes = "|to:|from:|rtrn:|subj:|rcvd:";
 
 void set_tag(const char *tag)
 {
@@ -257,4 +329,14 @@
     if (nonblank_line)
 	word_free(nonblank_line);
     nonblank_line = NULL;
+
+    while (sp > 0) {
+	--sp;
+	if (stack_word[sp])
+	    word_free(stack_word[sp]);
+    }
+    if (lastword)
+	word_free(lastword);
+    lastword = NULL;
+
 }




More information about the Bogofilter mailing list