[PATCH] experimenting with new parsing rules
David Relson
relson at osagesoftware.com
Sun Sep 14 19:54:29 EDT 2003
Matthias,
If you're interested in experimenting with Michael's parsing rules, I
have attached a patch that can enable/disable them using the environment
variable BOGOTEST. Sample uses would be:
BOGOTEST=0 bogolexer -p < message > 0.out
BOGOTEST=1 bogolexer -p < message > 1.out
where BOGOTEST=0 gives the current results and BOGOTEST=1 uses Michael's
rules. I often run with redirected stdout (as shown above) so I can run
"gtkdiff 0.out 1.out" to see what's different.
Actually, BOGOTEST is bit-coded, with 1 enabling the modified rules and 2
enabling the lexer's debug mode (so one can see which rules are being
applied).
David
-------------- next part --------------
Index: bogoconfig.c
===================================================================
RCS file: /cvsroot/bogofilter/bogofilter/src/bogoconfig.c,v
retrieving revision 1.106
diff -u -r1.106 bogoconfig.c
--- bogoconfig.c 8 Sep 2003 12:16:02 -0000 1.106
+++ bogoconfig.c 14 Sep 2003 23:51:46 -0000
@@ -107,6 +107,8 @@
extern double robx, robs;
extern wl_t wl_default;
+extern void lexer_set_debug(int v);
+
/*---------------------------------------------------------------------------*/
/* Notes:
@@ -164,9 +166,16 @@
void process_args_and_config_file(int argc, char **argv, bool warn_on_error)
{
+ const char *bogotest = getenv("BOGOTEST");
+
process_args_1(argc, argv);
process_config_files(warn_on_error);
process_args_2(argc, argv);
+
+ if (bogotest)
+ test = atoi(bogotest);
+
+ lexer_set_debug(test); /* 1 - INITEST, 2 - lexer states */
if (!twostate && !threestate) {
twostate = ham_cutoff < EPS;
Index: bogolexer.c
===================================================================
RCS file: /cvsroot/bogofilter/bogofilter/src/bogolexer.c,v
retrieving revision 1.44
diff -u -r1.44 bogolexer.c
--- bogolexer.c 8 Sep 2003 12:16:02 -0000 1.44
+++ bogolexer.c 14 Sep 2003 23:51:46 -0000
@@ -239,15 +239,24 @@
int count=0;
+extern void lexer_set_debug(int v);
+
int main(int argc, char **argv)
{
token_t t;
- mbox_mode = true; /* to allow multiple messages */
+ const char *bogotest = getenv("BOGOTEST");
process_args_1(argc, argv);
process_config_files(false);
process_args_2(argc, argv);
+
+ if (bogotest)
+ test = atoi(bogotest);
+
+ lexer_set_debug(test); /* 1 - INITEST, 2 - lexer states */
+
+ mbox_mode = true; /* to allow multiple messages */
textblock_init();
Index: lexer_v3.l
===================================================================
RCS file: /cvsroot/bogofilter/bogofilter/src/lexer_v3.l,v
retrieving revision 1.84
diff -u -r1.84 lexer_v3.l
--- lexer_v3.l 14 Sep 2003 21:17:52 -0000 1.84
+++ lexer_v3.l 14 Sep 2003 23:51:47 -0000
@@ -94,6 +94,10 @@
static void reorder_html(void);
static void skip_to(char chr);
+static void use_to(char chr);
+
+void yy_set_state_initial(void);
+void lexer_set_debug(int v);
/* Function Definitions */
@@ -107,7 +111,7 @@
%}
%option warn
-%option debug nodebug
+%option nodebug debug
%option align caseless 8bit
%option never-interactive
%option noreject noyywrap
@@ -194,6 +198,7 @@
BREAKHTML "<"({HBREAK}([ \n\t][^>]*|""))">"
%s TEXT HTML BOGO_LEX
+%s INITEST
%s HTOKEN HDISCARD SCOMMENT LCOMMENT HSCRIPT
%%
@@ -207,6 +212,7 @@
<BOGO_LEX>^\"{BOGOLEX_TOKEN}\"{NUM_NUM} { return BOGO_LEX_LINE; }
<BOGO_LEX>\n { lineno += 1; }
+<INITEST>{ENCODED_TOKEN} |
<INITIAL>{ENCODED_TOKEN} { word_t *w = yy_text();
size_t size = decode_text(w);
while (size-- > 0)
@@ -214,22 +220,39 @@
}
<INITIAL>^(To|From|Return-Path|Subject): { set_tag(yytext); }
+<INITEST>^(To|From|Return-Path|Subject): { set_tag(yytext); }
+<INITEST>^Received: { set_tag(yytext); return TOKEN; }
+
<INITIAL>^Content-(Transfer-Encoding|Type|Disposition):{MTYPE} { mime_content(yy_text()); skip_to(':'); return TOKEN; }
+<INITEST>^Content-(Transfer-Encoding|Type|Disposition):{MTYPE} { mime_content(yy_text()); return TOKEN; }
+
<INITIAL>^MIME-Version:.* { mime_version(yy_text()); skip_to(':'); return TOKEN; }
+<INITEST>^MIME-Version:.* { mime_version(yy_text()); return HEADKEY; }
<INITIAL>^(Delivery-)?Date:.* /* ignore */
+<INITEST>^(Delivery-)?Date:.* { return HEADKEY; }
<INITIAL>^(Resent-)?Message-ID:.* /* ignore */
+<INITEST>^(Resent-)?Message-ID:.* { return HEADKEY; }
+<INITEST>^(In-Reply-To|References):.* |
<INITIAL>^(In-Reply-To|References):.* { return HEADKEY; }
+<INITEST>boundary=[ ]*\"?{MIME_BOUNDARY}\"? |
<INITIAL>boundary=[ ]*\"?{MIME_BOUNDARY}\"? { mime_boundary_set(yy_text()); }
+
<INITIAL>charset=\"?{CHARSET}\"? { got_charset(yytext); skip_to('='); return TOKEN; }
+<INITEST>(file)?name=\"? |
<INITIAL>(file)?name=\"? /* ignore */
<INITIAL>(ESMTP|SMTP)+[ \t\n]+id\ {ID} /* ignore */
+<INITEST>(ESMTP|SMTP)+[ \t\n]+id\ {ID} { if (header_line_markup) { use_to(' '); return TOKEN; } }
+<INITEST>[:blank:]*id\ {ID} |
<INITIAL>[:blank:]*id\ {ID} /* ignore */
+<INITEST>\n[ \t] |
<INITIAL>\n[ \t] { lineno += 1; }
+
+<INITEST>\n\n |
<INITIAL>\n\n { if (get_content_type() == MIME_TEXT_HTML)
BEGIN HTML;
else
@@ -240,10 +263,10 @@
}
<INITIAL><<EOF>> { return NONE; }
+<INITEST><<EOF>> { add_hint("no_body:"); return NONE; }
^--{MIME_BOUNDARY}(--)?$ { if (got_mime_boundary(yy_text())) {
- BEGIN INITIAL;
- msg_header = true;
+ yy_set_state_initial();
return BOUNDARY;
} else {
yyless(2);
@@ -287,8 +310,7 @@
void lexer_v3_init(FILE *fp)
{
lineno = 0;
- BEGIN INITIAL;
- msg_header = true;
+ yy_set_state_initial(); /* selects INITIAL or INITEST from the test flag and sets msg_header */
yyrestart(fp);
}
@@ -298,6 +320,15 @@
yyless(len);
}
+static void use_to(char chr) /* cut the current token off just before the first chr, if any */
+{
+ char *hit = memchr(yytext, chr, yyleng);
+ if (hit == NULL)
+ return;
+ *hit = '\0';
+ yyleng = hit - yytext;
+}
+
static void reorder_html(void)
{
char *chr = memchr(yytext, '<', yyleng); /* find start of html tag */
@@ -315,11 +346,11 @@
}
char yy_get_state(void);
-void yy_set_state_initial(void);
char yy_get_state()
{
switch (YYSTATE) {
+ case INITEST:
case INITIAL: return 'i';
case TEXT: return 't';
case HTML:
@@ -330,12 +361,30 @@
}
}
-void yy_set_state_initial()
+void yy_set_state_initial(void) /* enter the header start state and set msg_header */
{
- BEGIN INITIAL;
- if (DEBUG_LEXER(1)) fprintf(dbgout, "%s:%d %s\n", __FILE__, __LINE__, "BEGIN INITIAL");
+ if (! (test & 1)) /* 1 - INITEST, 2 - debug (display lexer states) */
+ BEGIN INITIAL;
+ else
+ BEGIN INITEST;
+
+ msg_header = true;
+
+ if (DEBUG_LEXER(1))
+ fprintf(dbgout, "%s:%d BEGIN %s\n",
+ __FILE__, __LINE__,
+ !(test & 1) ? "INITIAL" : "INITEST" ); /* same bit test as the BEGIN above */
}
+void lexer_set_debug(int v) /* v: BOGOTEST bit mask; only bit 2 turns on flex tracing */
+{
+#ifndef FLEX_DEBUG
+ (void) v;
+#else
+ yy_flex_debug = (v & 2) != 0;
+#endif
+}
+
/*
* The following sets edit modes for GNU EMACS
* Local Variables:
-------------- next part --------------
More information about the Bogofilter-dev
mailing list