token pairs [was: Algorithm limitations]

David Relson relson at osagesoftware.com
Tue Apr 13 14:06:34 CEST 2004


On 13 Apr 2004 07:37:01 -0400
Tom Anderson wrote:

> On Mon, 2004-04-12 at 00:07, michael at optusnet.com.au wrote:
> > I'm already doing word pairs. You might have seen the patch I posted
> > previously for a lossy token database. That was designed to support
> > exactly what you're talking about. Basically it would allow
> > bogofilter to generate a vast array of tokens and only keep the
> > ones that occur 'frequently'. 'frequently' here means "at a
> > frequency high enough that a second instance comes along before the
> > first has been discarded from the database". :)
> 
> Would it be possible to consider this patch for inclusion in the
> stable bogofilter branch, turned on via a switch?
> 
> Tom

Tom,

I'm not willing to include word pairs until after the 1.0 release, but
am willing to let users experiment with the technique.  Attached is a
patch from a couple of months ago and updated to work with 0.17.5. 
Below is a sample of the output using it:

[relson at osage src]$ echo this is a test of word pairs | bogofilter -C -H -vvv
X-Bogosity: No, tests=bogofilter, spamicity=0.100088, version=0.17.5.cvs
                                     n    pgood     pbad      fw     U
"test"                            5637  0.069284  0.007706  0.100088 +
"pairs"                            247  0.002963  0.000426  0.125785 -
"word"                            4374  0.040844  0.021773  0.347716 -
"this"                           71436  0.469233  0.597469  0.560108 -
N_P_Q_S_s_x_md                       1  0.899912  0.100088  0.100088
                                        0.017800  0.520000  0.375000

[relson at osage src]$ echo this is a test of word pairs | bogofilter -C -H -vvv -P
X-Bogosity: No, tests=bogofilter, spamicity=0.100088, version=0.17.5.cvs
                                     n    pgood     pbad      fw     U
"test"                            5637  0.069284  0.007706  0.100088 +
"pairs"                            247  0.002963  0.000426  0.125785 -
"word"                            4374  0.040844  0.021773  0.347716 -
"test:word"                          0  0.000000  0.000000  0.520000 -
"this:test"                          0  0.000000  0.000000  0.520000 -
"word:pairs"                         0  0.000000  0.000000  0.520000 -
"this"                           71436  0.469233  0.597469  0.560108 -
N_P_Q_S_s_x_md                       1  0.899912  0.100088  0.100088
                                        0.017800  0.520000  0.375000
-------------- next part --------------
Index: bogoconfig.c
===================================================================
RCS file: /cvsroot/bogofilter/bogofilter/src/bogoconfig.c,v
retrieving revision 1.170
diff -u -r1.170 bogoconfig.c
--- bogoconfig.c	18 Mar 2004 21:05:56 -0000	1.170
+++ bogoconfig.c	13 Apr 2004 11:50:53 -0000
@@ -380,7 +380,7 @@
 		  progtype, version, ds_version_str(), PACKAGE);
 }
 
-#define	OPTIONS	":-:bBc:Cd:DefFghHI:k:lL:m:MnNo:O:pqQRrsStTuUvVx:X:y:"
+#define	OPTIONS	":-:bBc:Cd:DefFghHI:k:lL:m:MnNo:O:pPqQRrsStTuUvVx:X:y:"
 
 /** These functions process command line arguments.
  **
@@ -640,6 +640,10 @@
 	    get_double(name, val, &ham_cutoff);
 	break;
 
+    case 'P':
+	pairs = true;
+	break;
+
     case 't':
 	terse = true;
 	break;
Index: globals.c
===================================================================
RCS file: /cvsroot/bogofilter/bogofilter/src/globals.c,v
retrieving revision 1.57
diff -u -r1.57 globals.c
--- globals.c	3 Apr 2004 00:29:02 -0000	1.57
+++ globals.c	13 Apr 2004 11:50:53 -0000
@@ -27,6 +27,7 @@
 bool	mbox_mode;			/* '-M' */
 bool	replace_nonascii_characters;	/* '-n' */
 bool	passthrough;			/* '-p' */
+bool	pairs = false;			/* '-P' */
 bool	quiet;				/* '-q' */
 bool	query;				/* '-Q' */
 int	Rtable = 0;			/* '-R' */
Index: globals.h
===================================================================
RCS file: /cvsroot/bogofilter/bogofilter/src/globals.h,v
retrieving revision 1.64
diff -u -r1.64 globals.h
--- globals.h	3 Apr 2004 00:29:02 -0000	1.64
+++ globals.h	13 Apr 2004 11:50:53 -0000
@@ -23,6 +23,7 @@
 extern	bool	mbox_mode;		/* '-M' */
 extern	char	outfname[PATH_LEN];	/* '-O' */
 extern	bool	passthrough;		/* '-p' */
+extern	bool	pairs;			/* '-P' */
 extern	bool	quiet;			/* '-q' */
 extern	bool	query;			/* '-Q' */
 extern	int	Rtable;			/* '-R' */
Index: token.c
===================================================================
RCS file: /cvsroot/bogofilter/bogofilter/src/token.c,v
retrieving revision 1.78
diff -u -r1.78 token.c
--- token.c	20 Mar 2004 21:57:52 -0000	1.78
+++ token.c	13 Apr 2004 11:50:54 -0000
@@ -40,6 +40,7 @@
 static word_t *w_recv = NULL;	/* Received:    */
 static word_t *w_head = NULL;	/* Header:      */
 static word_t *w_mime = NULL;	/* Mime:        */
+static word_t *w_colon = NULL;	/* :            */
 
 /* Global Variables */
 
@@ -48,8 +49,12 @@
 static word_t *token_prefix = NULL;
 static word_t *nonblank_line = NULL;
 
-/* Function Prototypes */
+static word_t *prv_word = NULL;
+static word_t *cur_word = NULL;
 
+typedef enum state_e state_t;
+enum state_e { GET_NEW_TOKEN, RETURN_WORD_PAIR };
+  
 /* Function Definitions */
 
 token_t get_token(void)
@@ -57,6 +62,9 @@
     token_t cls = NONE;
     unsigned char *cp;
     bool done = false;
+    static state_t state_flag = GET_NEW_TOKEN;
+    static token_t prv_cls = NONE;
+    static token_t cur_cls = NONE;
 
     /* If saved IPADDR, truncate last octet */
     if ( block_on_subnets && save_class == IPADDR )
@@ -77,9 +85,27 @@
 	yylval = word_new(NULL, 0);
 
     while (!done) {
-	cls = (*lexer->yylex)();
-	yylval->leng = (uint) *lexer->yyleng;
-	yylval->text = (unsigned char *)(*lexer->yytext);
+	switch (state_flag) {
+	case GET_NEW_TOKEN:
+	    if (pairs)
+		state_flag = RETURN_WORD_PAIR;
+	    if (prv_word)
+		word_free(prv_word);
+	    prv_cls = cur_cls;
+	    prv_word = cur_word;
+	    cls = (*lexer->yylex)();
+	    yylval->leng = (uint) *lexer->yyleng;
+	    yylval->text = (unsigned char *)(*lexer->yytext);
+	    cur_cls = cls;
+	    cur_word = word_dup(yylval);
+	    break;
+	case RETURN_WORD_PAIR:
+	    state_flag = GET_NEW_TOKEN;
+	    if (prv_word == NULL)
+		continue;
+	    cls = cur_cls;
+	    yylval = word_multicat(prv_word, w_colon, cur_word, NULL);
+	}
 
 	if (DEBUG_TEXT(2)) {
 	    word_puts(yylval, 0, dbgout);
@@ -277,6 +303,7 @@
 	w_recv = word_new((const byte *) "rcvd:", 0);	/* Received:    */
 	w_head = word_new((const byte *) "head:", 0);	/* Header:      */
 	w_mime = word_new((const byte *) "mime:", 0);	/* Mime:        */
+ 	w_colon = word_new((const byte *) ":", 0);	/* :            */
     }
 
     return;
@@ -289,6 +316,8 @@
 
 void set_tag(const char *text)
 {
+    word_t *new_prefix;
+
     if (!header_line_markup)
 	return;
 
@@ -301,25 +330,25 @@
     switch (tolower(*text)) {
     case 'c':				/* CC: */
     case 't':
-	token_prefix = w_to;		/* To: */
+	new_prefix = w_to;		/* To: */
 	break;
     case 'f':
-	token_prefix = w_from;		/* From: */
+	new_prefix = w_from;		/* From: */
 	break;
     case 'h':
 	if (msg_state == msg_state->parent)
-	    token_prefix = w_head;	/* Header: */
+	    new_prefix = w_head;	/* Header: */
 	else
-	    token_prefix = w_mime;	/* Mime:   */
+	    new_prefix = w_mime;	/* Mime:   */
 	break;
     case 'r':
 	if (tolower(text[2]) == 't')
-	    token_prefix = w_rtrn;	/* Return-Path: */
+	    new_prefix = w_rtrn;	/* Return-Path: */
 	else
-	    token_prefix = w_recv;	/* Received: */
+	    new_prefix = w_recv;	/* Received: */
 	break;
     case 's':
-	token_prefix = w_subj;		/* Subject: */
+	new_prefix = w_subj;		/* Subject: */
 	break;
     default:
 	fprintf(stderr, "%s:%d  invalid tag - '%s'\n",
@@ -327,6 +356,11 @@
 		text);
 	exit(EX_ERROR);
     }
+    if (new_prefix != token_prefix) {
+	word_free(prv_word);
+	prv_word = NULL;
+    }
+    token_prefix = new_prefix;
     return;
 }
 
Index: word.c
===================================================================
RCS file: /cvsroot/bogofilter/bogofilter/src/word.c,v
retrieving revision 1.13
diff -u -r1.13 word.c
--- word.c	31 Oct 2003 22:26:51 -0000	1.13
+++ word.c	13 Apr 2004 11:50:54 -0000
@@ -12,6 +12,8 @@
 
 #include "common.h"
 
+#include <stdarg.h>
+
 #include "word.h"
 #include "xmalloc.h"
 
@@ -86,6 +88,35 @@
     return ans;
 }
 
+/* Join a NULL-terminated argument list of words into one word_new() word. */
+word_t  *word_multicat(const word_t *arg, ...)
+{
+    uint n = 0;
+    va_list ap;
+    word_t *val;
+    const word_t *t;
+
+    /* Pass 1: sum the lengths; va_list is opaque, so walk it with va_arg(). */
+    va_start(ap, arg);
+    for (t = arg; t != NULL; t = va_arg(ap, const word_t *))
+	n += t->leng;
+    va_end(ap);
+
+    val = word_new(NULL, n);
+
+    /* Pass 2: restart the traversal and copy each word's text in order. */
+    n = 0;
+    va_start(ap, arg);
+    for (t = arg; t != NULL; t = va_arg(ap, const word_t *)) {
+	memcpy(val->text+n, t->text, t->leng);
+	n += t->leng;
+    }
+    va_end(ap);
+    Z(val->text[n]);
+
+    return val;
+}
+
 void word_puts(const word_t *self, uint width, FILE *fp)
 {
     /* width = 0 - output all of the word
Index: word.h
===================================================================
RCS file: /cvsroot/bogofilter/bogofilter/src/word.h,v
retrieving revision 1.12
diff -u -r1.12 word.h
--- word.h	22 Feb 2004 20:34:26 -0000	1.12
+++ word.h	13 Apr 2004 11:50:54 -0000
@@ -26,6 +26,7 @@
 extern word_t  *word_cpy(word_t *dst, const word_t *src);
 extern int 	word_cmp(const word_t *w1, const word_t *w2);
 extern word_t  *word_concat(const word_t *w1, const word_t *w2);
+extern word_t  *word_multicat(const word_t *w, ...);
 extern void 	word_puts(const word_t *self, uint width, FILE *fp);
 
 #endif	/* WORD_H */



More information about the bogofilter mailing list