[PATCH] Message Count

David Relson relson at osagesoftware.com
Sun Jan 26 19:22:20 CET 2003


Greetings,

The patch below is intended to address the problems people have been having 
with bogofilter-0.10.x getting wrong message counts when training on 
mailboxes.  I have tested it with Matt Armstrong's problem messages.  It 
gets the counts right for them.  I have also compared the counts of the new 
version with those of 0.9.1.2 for this month's daily mailboxes at 
osagesoftware.  The two versions of bogofilter give the same message 
counts.  Because of parser changes, the word counts are different (which is 
appropriate).

Please test it and let me know what it fixes, what it breaks, and which 
problems you have that it doesn't affect at all.

David

Index: lexer.c
===================================================================
RCS file: /cvsroot/bogofilter/bogofilter/lexer.c,v
retrieving revision 1.11
diff -u -r1.11 lexer.c
--- lexer.c	25 Jan 2003 21:20:56 -0000	1.11
+++ lexer.c	26 Jan 2003 18:12:03 -0000
@@ -20,6 +20,7 @@
  #include "mime.h"
  #include "fgetsl.h"
  #include "textblock.h"
+#include "token.h"
  #include "xmalloc.h"
  #include "xstrdup.h"

@@ -49,6 +50,14 @@
      size_t count = fgetsl((char *)buf, size, fpin);
      yylineno += 1;
      if (DEBUG_LEXER(0)) fprintf(dbgout, "*** %2d %d %s\n", yylineno, 
msg_header, buf);
+
+    /* Special check for message separator.
+       If found, handle it immediately.
+    */
+
+    if (memcmp(buf, "From ", 5) == 0)
+	got_from();
+
      return count;
  }

Index: lexer_head.l
===================================================================
RCS file: /cvsroot/bogofilter/bogofilter/lexer_head.l,v
retrieving revision 1.6
diff -u -r1.6 lexer_head.l
--- lexer_head.l	25 Jan 2003 20:05:59 -0000	1.6
+++ lexer_head.l	26 Jan 2003 18:12:03 -0000
@@ -47,7 +47,7 @@
  #include "lexer.h"
  #include "mime.h"		/* for mime_*() */
  #include "textblock.h"
-#include "token.h"		/* for got_from() and got_newline() */
+#include "token.h"		/* for got_newline() */

  #define YY_DECL token_t yylex(void)
  #define YY_INPUT(buf,result,max_size) result = yyinput((byte *)buf, max_size)
@@ -72,7 +72,7 @@
  ^Content-Transfer-Encoding:{MTYPE}		{ mime_encoding((byte *)yytext, 
yyleng); yyredo((byte *)yytext, ':'); }
  ^Content-Type:{MTYPE};?				{ mime_type((byte *)yytext, yyleng); 
yyredo((byte *)yytext, ':'); }
  ^Content-Disposition:{MTYPE}			{ mime_disposition((byte *)yytext, 
yyleng); yyredo((byte *)yytext, ':'); }
-^From\ 						{ return got_from(yytext); }
+^From\ 						{ return (msg_header ? FROM : TOKEN); }
  ^Date:.*|Delivery-Date:.*			;
  ^Message-ID:.*					;
  ^\tid\ {ID}					;
Index: lexer_text_html.l
===================================================================
RCS file: /cvsroot/bogofilter/bogofilter/lexer_text_html.l,v
retrieving revision 1.5
diff -u -r1.5 lexer_text_html.l
--- lexer_text_html.l	21 Jan 2003 01:20:49 -0000	1.5
+++ lexer_text_html.l	26 Jan 2003 18:12:03 -0000
@@ -16,7 +16,7 @@

  #include "lexer.h"
  #include "mime.h"		/* for mime_*() */
-#include "token.h"		/* for got_from() and got_newline() */
+#include "token.h"		/* for got_newline() */

  #define YY_DECL token_t yylex(void)
  #define YY_INPUT(buf,result,max_size) result = yyinput((byte *)buf, max_size)
@@ -38,7 +38,7 @@
  \<						{ html_tag(1); }
  \>						{ html_tag(0); }

-^From\ 						{ return got_from(yytext); }
+^From\ 						{ return (msg_header ? FROM : TOKEN); }
  ^--{MIME_BOUNDARY}(--)?$			{ got_mime_boundary((byte *)yytext, yyleng); 
return (BOUNDARY); }

  {IPADDR}					{ return(IPADDR);}
Index: lexer_text_plain.l
===================================================================
RCS file: /cvsroot/bogofilter/bogofilter/lexer_text_plain.l,v
retrieving revision 1.5
diff -u -r1.5 lexer_text_plain.l
--- lexer_text_plain.l	21 Jan 2003 01:20:49 -0000	1.5
+++ lexer_text_plain.l	26 Jan 2003 18:12:03 -0000
@@ -16,7 +16,7 @@

  #include "lexer.h"
  #include "mime.h"		/* for mime_*() */
-#include "token.h"		/* for got_from() and got_newline() */
+#include "token.h"		/* for got_newline() */

  #define YY_DECL token_t yylex(void)
  #define YY_INPUT(buf,result,max_size) result = yyinput((byte *)buf, max_size)
@@ -33,7 +33,7 @@

  %%

-^From\ 						{ return got_from(yytext); }
+^From\ 						{ return (msg_header ? FROM : TOKEN); }
  ^--{MIME_BOUNDARY}(--)?$			{ got_mime_boundary((byte *)yytext, yyleng); 
return (BOUNDARY); }

  {IPADDR}					{ return(IPADDR);}
Index: token.c
===================================================================
RCS file: /cvsroot/bogofilter/bogofilter/token.c,v
retrieving revision 1.14
diff -u -r1.14 token.c
--- token.c	26 Jan 2003 01:07:27 -0000	1.14
+++ token.c	26 Jan 2003 18:12:03 -0000
@@ -212,16 +212,11 @@
      return(class);
  }

-token_t got_from(const char *text)
+void got_from(void)
  {
-    if (memcmp(text, "From ", 5) != 0 )
-	return(TOKEN);
-    else {
-	change_lexer_state(LEXER_HEAD);
-	mime_reset();
-	reset_html_level();
-	return(FROM);
-    }
+    change_lexer_state(LEXER_HEAD);
+    mime_reset();
+    reset_html_level();
  }

  void got_newline()
Index: token.h
===================================================================
RCS file: /cvsroot/bogofilter/bogofilter/token.h,v
retrieving revision 1.3
diff -u -r1.3 token.h
--- token.h	12 Jan 2003 14:02:00 -0000	1.3
+++ token.h	26 Jan 2003 18:12:03 -0000
@@ -14,7 +14,7 @@

  extern token_t get_token(void);

-extern token_t got_from(const char *text);
+extern void got_from(void);
  extern void got_newline(void);

  /* used by lexer_text_html.l */





More information about the Bogofilter mailing list