[PATCH] experimenting with new parsing rules

David Relson relson at osagesoftware.com
Sun Sep 14 19:54:29 EDT 2003


Matthias,

If you're interested in experimenting with Michael's parsing rules, I
have attached a patch that can enable/disable them using the environment
variable BOGOTEST.  Sample uses would be:

BOGOTEST=0 bogolexer -p < message > 0.out
BOGOTEST=1 bogolexer -p < message > 1.out

where BOGOTEST=0 gives the current results and BOGOTEST=1 uses Michael's
rules.  I often run with redirected stdout (as shown above) so I can run
"gtkdiff 0.out 1.out" to see what's different.

Actually, BOGOTEST is bit-coded, with 1 enabling the modified rules and 2
enabling the lexer's debug mode (so one can see which rules are being
applied).

David
-------------- next part --------------
Index: bogoconfig.c
===================================================================
RCS file: /cvsroot/bogofilter/bogofilter/src/bogoconfig.c,v
retrieving revision 1.106
diff -u -r1.106 bogoconfig.c
--- bogoconfig.c	8 Sep 2003 12:16:02 -0000	1.106
+++ bogoconfig.c	14 Sep 2003 23:51:46 -0000
@@ -107,6 +107,8 @@
 extern double robx, robs;
 extern wl_t wl_default;
 
+extern void lexer_set_debug(int v);
+
 /*---------------------------------------------------------------------------*/
 
 /* Notes:
@@ -164,9 +166,16 @@
 
 void process_args_and_config_file(int argc, char **argv, bool warn_on_error)
 {
+    const char *bogotest = getenv("BOGOTEST");
+
     process_args_1(argc, argv);
     process_config_files(warn_on_error);
     process_args_2(argc, argv);
+
+    if (bogotest)
+	test = atoi(bogotest);
+
+    lexer_set_debug(test);	/* 1 - INITEST, 2 - lexer states */
 
     if (!twostate && !threestate) {
 	twostate = ham_cutoff < EPS;
Index: bogolexer.c
===================================================================
RCS file: /cvsroot/bogofilter/bogofilter/src/bogolexer.c,v
retrieving revision 1.44
diff -u -r1.44 bogolexer.c
--- bogolexer.c	8 Sep 2003 12:16:02 -0000	1.44
+++ bogolexer.c	14 Sep 2003 23:51:46 -0000
@@ -239,15 +239,24 @@
 
 int count=0;
 
+extern void lexer_set_debug(int v);
+
 int main(int argc, char **argv)
 {
     token_t t;
 
-    mbox_mode = true;		/* to allow multiple messages */
+    const char *bogotest = getenv("BOGOTEST");
 
     process_args_1(argc, argv);
     process_config_files(false);
     process_args_2(argc, argv);
+
+    if (bogotest)
+	test = atoi(bogotest);
+
+    lexer_set_debug(test);	/* 1 - INITEST, 2 - lexer states */
+
+    mbox_mode = true;		/* to allow multiple messages */
 
     textblock_init();
 
Index: lexer_v3.l
===================================================================
RCS file: /cvsroot/bogofilter/bogofilter/src/lexer_v3.l,v
retrieving revision 1.84
diff -u -r1.84 lexer_v3.l
--- lexer_v3.l	14 Sep 2003 21:17:52 -0000	1.84
+++ lexer_v3.l	14 Sep 2003 23:51:47 -0000
@@ -94,6 +94,10 @@
 static void reorder_html(void);
 
 static void skip_to(char chr);
+static void use_to(char chr);
+
+void yy_set_state_initial(void);
+void lexer_set_debug(int v);
 
 /* Function Definitions */
 
@@ -107,7 +111,7 @@
 %}
 
 %option warn
-%option debug nodebug
+%option nodebug debug
 %option align caseless 8bit
 %option never-interactive
 %option noreject noyywrap
@@ -194,6 +198,7 @@
 BREAKHTML	"<"({HBREAK}([ \n\t][^>]*|""))">"
 
 %s TEXT HTML BOGO_LEX
+%s INITEST
 %s HTOKEN HDISCARD SCOMMENT LCOMMENT HSCRIPT
 
 %%
@@ -207,6 +212,7 @@
 <BOGO_LEX>^\"{BOGOLEX_TOKEN}\"{NUM_NUM}		{ return BOGO_LEX_LINE; }
 <BOGO_LEX>\n					{ lineno += 1; }
 
+<INITEST>{ENCODED_TOKEN}			|
 <INITIAL>{ENCODED_TOKEN}			{ word_t *w = yy_text();
 						  size_t size = decode_text(w);
 						  while (size-- > 0)
@@ -214,22 +220,39 @@
 						}
 
 <INITIAL>^(To|From|Return-Path|Subject):	{ set_tag(yytext); }
+<INITEST>^(To|From|Return-Path|Subject):	{ set_tag(yytext); }
+<INITEST>^Received:				{ set_tag(yytext); return TOKEN; }
+
 <INITIAL>^Content-(Transfer-Encoding|Type|Disposition):{MTYPE}	{ mime_content(yy_text()); skip_to(':'); return TOKEN; }
+<INITEST>^Content-(Transfer-Encoding|Type|Disposition):{MTYPE}	{ mime_content(yy_text()); return TOKEN; }
+
 <INITIAL>^MIME-Version:.*			{ mime_version(yy_text()); skip_to(':'); return TOKEN; }
+<INITEST>^MIME-Version:.*			{ mime_version(yy_text()); return HEADKEY; }
 
 <INITIAL>^(Delivery-)?Date:.*			/* ignore */
+<INITEST>^(Delivery-)?Date:.*			{ return HEADKEY; }
 <INITIAL>^(Resent-)?Message-ID:.*		/* ignore */
+<INITEST>^(Resent-)?Message-ID:.*		{ return HEADKEY; }
 
+<INITEST>^(In-Reply-To|References):.* 		|
 <INITIAL>^(In-Reply-To|References):.* 		{ return HEADKEY; }
 
+<INITEST>boundary=[ ]*\"?{MIME_BOUNDARY}\"?	|
 <INITIAL>boundary=[ ]*\"?{MIME_BOUNDARY}\"?	{ mime_boundary_set(yy_text()); }
+
 <INITIAL>charset=\"?{CHARSET}\"?		{ got_charset(yytext); skip_to('='); return TOKEN; }
 
+<INITEST>(file)?name=\"?			|
 <INITIAL>(file)?name=\"?			/* ignore */
 <INITIAL>(ESMTP|SMTP)+[ \t\n]+id\ {ID}		/* ignore */
+<INITEST>(ESMTP|SMTP)+[ \t\n]+id\ {ID}		{ if (header_line_markup) { use_to(' '); return TOKEN; } }
+<INITEST>[:blank:]*id\ {ID}			|
 <INITIAL>[:blank:]*id\ {ID}			/* ignore */
 
+<INITEST>\n[ \t]				|
 <INITIAL>\n[ \t]				{ lineno += 1; }
+
+<INITEST>\n\n					|
 <INITIAL>\n\n					{ if (get_content_type() == MIME_TEXT_HTML)
 						      BEGIN HTML;
 						  else
@@ -240,10 +263,10 @@
 						}
 
 <INITIAL><<EOF>>				{ return NONE; }
+<INITEST><<EOF>>				{ add_hint("no_body:"); return NONE; }
 
 ^--{MIME_BOUNDARY}(--)?$			{ if (got_mime_boundary(yy_text())) {
-						      BEGIN INITIAL;
-						      msg_header = true;
+						      yy_set_state_initial();
 						      return BOUNDARY;
 						  } else {
 						      yyless(2);
@@ -287,8 +310,7 @@
 void lexer_v3_init(FILE *fp)
 {
     lineno = 0;
-    BEGIN INITIAL;
-    msg_header = true;
+    yy_set_state_initial();
     yyrestart(fp);
 }
 
@@ -298,6 +320,15 @@
     yyless(len);
 }
 
+static void use_to(char chr)
+{
+    char * p = memchr(yytext, chr, yyleng);
+    if (p) {
+	*p = 0;
+	yyleng = (p - yytext);
+    }
+}
+
 static void reorder_html(void)
 {
     char *chr = memchr(yytext, '<', yyleng);	/* find start of html tag */
@@ -315,11 +346,11 @@
 }
 
 char yy_get_state(void);
-void yy_set_state_initial(void);
 
 char yy_get_state()
 {
     switch (YYSTATE) {
+    case INITEST:
     case INITIAL:  return 'i';
     case TEXT:     return 't';
     case HTML:
@@ -330,12 +361,30 @@
     }
 }
 
-void yy_set_state_initial()
+void yy_set_state_initial(void)
 {
-    BEGIN INITIAL;
-    if (DEBUG_LEXER(1)) fprintf(dbgout, "%s:%d %s\n", __FILE__, __LINE__, "BEGIN INITIAL");
+    if (! (test & 1))		/* 1 - INITEST, 2 - debug (display lexer states) */
+	BEGIN INITIAL;
+    else
+	BEGIN INITEST;
+
+    msg_header = true;
+
+    if (DEBUG_LEXER(1))
+	fprintf(dbgout, "%s:%d BEGIN %s\n", 
+		__FILE__, __LINE__,
+		!test ? "INITIAL" : "INITEST" );
 }
 
+void lexer_set_debug(int v)
+{
+#ifndef	FLEX_DEBUG
+    (void) v;
+#else
+    yy_flex_debug = v;
+#endif
+}
+ 
 /*
  * The following sets edit modes for GNU EMACS
  * Local Variables:

-------------- next part --------------



More information about the Bogofilter-dev mailing list