[patch] tag Received lines and parse headers better.

Sat Jul 19 11:04:44 CEST 2003

Trivial patch to do two things.

#1. Tag the data in 'Received' headers as per
From/Subject/To et al.

#2. Handle continued header lines. E-mail headers
are continued if the next line starts with
whitespace; this patch handles the case of a leading
tab character. In this event, the lexer now eats
'\n\t' as whitespace and continues tagging.

These trivial changes reduced false positives by
nearly 6% on my (admittedly bizarre) corpus.

Michael.

PS: I edited this patch by hand to remove other
things I've been playing with, so be careful when
applying it. :)

diff -ur bogofilter-0.13.7.3/src/token.c bogofilter-dev/src/token.c

--- bogofilter-0.13.7.3/src/token.c	Mon Jun 16 21:56:12 2003
+++ bogofilter-dev/src/token.c	Sat Jul 19 18:46:13 2003
@@ -97,6 +97,7 @@
 		    word_t *w = word_concat(token_prefix, yylval);
 		    word_free(yylval);
 		    yylval = w;
+
 		}
 		break;
 	    }
@@ -226,7 +227,7 @@
     return;
 }
 
-const char *prefixes = "|to:|from:|rtrn:|subj:";
+const char *prefixes = "|to:|from:|rtrn:|subj:|rcvd:";
 
 void set_tag(const char *tag)
 {
diff -ur bogofilter-0.13.7.3/src/lexer_v3.l bogofilter-dev/src/lexer_v3.l
--- bogofilter-0.13.7.3/src/lexer_v3.l	Wed Jul 16 10:40:53 2003
+++ bogofilter-dev/src/lexer_v3.l	Sat Jul 19 18:55:20 2003
@@ -180,15 +180,16 @@
 <BOGO_LEX>^\"{BOGOLEX_TOKEN}\"{NUM_NUM}		{ return BOGO_LEX_LINE; }
 <BOGO_LEX>\n					{ lineno += 1; }
 
 <INITIAL>charset=\"?{ID}\"?			{ got_charset(yytext); yyless(strlen("charset")); return TOKEN; }
 <INITIAL>^MIME-Version:.*			{ mime_version(yy_text()); yyless(strlen("MIME-Version:")); return(TOKEN); }
 <INITIAL>^Content-Transfer-Encoding:{MTYPE}	{ mime_encoding(yy_text()); yyless(strlen("Content-Transfer-Encoding:")); return(TOKEN); }
 <INITIAL>^Content-Type:{MTYPE};?		{ mime_type(yy_text()); yyless(strlen("Content-Type:")); return TOKEN; }
 <INITIAL>^Content-Disposition:{MTYPE}		{ mime_disposition(yy_text()); yyless(strlen("Content-Disposition:")); return TOKEN; }
 <INITIAL>^To/:\.*				{ if (header_line_markup) set_tag("to:"); }
 <INITIAL>^From/:\.*				{ if (header_line_markup) set_tag("from:");  else return TOKEN; }
 <INITIAL>^Return-Path/:\.*			{ if (header_line_markup) set_tag("rtrn:");  else return TOKEN; }
 <INITIAL>^Subject/:\.*				{ if (header_line_markup) set_tag("subj:");  else return TOKEN; }
+<INITIAL>^Received:\.*				{ if (header_line_markup) set_tag("rcvd:");  else return TOKEN; }
 <INITIAL>^Date:.*|Delivery-Date:.*		;
 <INITIAL>^Message-ID:.*				;
 <INITIAL>^\tid\ {ID}				;
@@ -197,6 +198,7 @@
 <INITIAL>boundary=[ ]*\"?{MIME_BOUNDARY}\"?	{ mime_boundary_set(yy_text()); }
 <INITIAL>name=\"?				;
 <INITIAL>filename=\"?				;
+<INITIAL>\n\t					; /* Eat the newline as it's a multi-line header */
 <INITIAL>^[ \b\t]*\n				{ got_emptyline(); 
 						  lineno += 1;
 						  switch (get_content_type()) 
Only in bogofilter-dev/src: lexer_v3.o
Only in bogofilter-dev/src: libbogofilter.a