token pairs [was: Algorithm limitations]
David Relson
relson at osagesoftware.com
Tue Apr 13 14:06:34 CEST 2004
On 13 Apr 2004 07:37:01 -0400
Tom Anderson wrote:
> On Mon, 2004-04-12 at 00:07, michael at optusnet.com.au wrote:
> > I'm already doing word pairs. You might have seen the patch I posted
> > previously for a lossy token database. That was designed to support
> > exactly what you're talking about. Basically it would allow
> > bogofilter to generate a vast array of tokens and only keep the
> > ones that occur 'frequently'. 'frequently' here means "at a
> > frequency high enough that a second instance comes along before the
> > first has been discarded from the database". :)
>
> Would it be possible to consider this patch for inclusion in the
> stable bogofilter branch, turned on via a switch?
>
> Tom
Tom,
I'm not willing to include word pairs until after the 1.0 release, but
am willing to let users experiment with the technique. Attached is a
patch from a couple of months ago, updated to work with 0.17.5.
Below is a sample of the output using it:
[relson at osage src]$ echo this is a test of word pairs | bogofilter -C -H -vvv
X-Bogosity: No, tests=bogofilter, spamicity=0.100088, version=0.17.5.cvs
n pgood pbad fw U
"test" 5637 0.069284 0.007706 0.100088 +
"pairs" 247 0.002963 0.000426 0.125785 -
"word" 4374 0.040844 0.021773 0.347716 -
"this" 71436 0.469233 0.597469 0.560108 -
N_P_Q_S_s_x_md 1 0.899912 0.100088 0.100088
0.017800 0.520000 0.375000
[relson at osage src]$ echo this is a test of word pairs | bogofilter -C -H -vvv -P
X-Bogosity: No, tests=bogofilter, spamicity=0.100088, version=0.17.5.cvs
n pgood pbad fw U
"test" 5637 0.069284 0.007706 0.100088 +
"pairs" 247 0.002963 0.000426 0.125785 -
"word" 4374 0.040844 0.021773 0.347716 -
"test:word" 0 0.000000 0.000000 0.520000 -
"this:test" 0 0.000000 0.000000 0.520000 -
"word:pairs" 0 0.000000 0.000000 0.520000 -
"this" 71436 0.469233 0.597469 0.560108 -
N_P_Q_S_s_x_md 1 0.899912 0.100088 0.100088
0.017800 0.520000 0.375000
-------------- next part --------------
Index: bogoconfig.c
===================================================================
RCS file: /cvsroot/bogofilter/bogofilter/src/bogoconfig.c,v
retrieving revision 1.170
diff -u -r1.170 bogoconfig.c
--- bogoconfig.c 18 Mar 2004 21:05:56 -0000 1.170
+++ bogoconfig.c 13 Apr 2004 11:50:53 -0000
@@ -380,7 +380,7 @@
progtype, version, ds_version_str(), PACKAGE);
}
-#define OPTIONS ":-:bBc:Cd:DefFghHI:k:lL:m:MnNo:O:pqQRrsStTuUvVx:X:y:"
+#define OPTIONS ":-:bBc:Cd:DefFghHI:k:lL:m:MnNo:O:pPqQRrsStTuUvVx:X:y:"
/** These functions process command line arguments.
**
@@ -640,6 +640,10 @@
get_double(name, val, &ham_cutoff);
break;
+ case 'P':
+ pairs = true;
+ break;
+
case 't':
terse = true;
break;
Index: globals.c
===================================================================
RCS file: /cvsroot/bogofilter/bogofilter/src/globals.c,v
retrieving revision 1.57
diff -u -r1.57 globals.c
--- globals.c 3 Apr 2004 00:29:02 -0000 1.57
+++ globals.c 13 Apr 2004 11:50:53 -0000
@@ -27,6 +27,7 @@
bool mbox_mode; /* '-M' */
bool replace_nonascii_characters; /* '-n' */
bool passthrough; /* '-p' */
+bool pairs = false; /* '-P' */
bool quiet; /* '-q' */
bool query; /* '-Q' */
int Rtable = 0; /* '-R' */
Index: globals.h
===================================================================
RCS file: /cvsroot/bogofilter/bogofilter/src/globals.h,v
retrieving revision 1.64
diff -u -r1.64 globals.h
--- globals.h 3 Apr 2004 00:29:02 -0000 1.64
+++ globals.h 13 Apr 2004 11:50:53 -0000
@@ -23,6 +23,7 @@
extern bool mbox_mode; /* '-M' */
extern char outfname[PATH_LEN]; /* '-O' */
extern bool passthrough; /* '-p' */
+extern bool pairs; /* '-P' */
extern bool quiet; /* '-q' */
extern bool query; /* '-Q' */
extern int Rtable; /* '-R' */
Index: token.c
===================================================================
RCS file: /cvsroot/bogofilter/bogofilter/src/token.c,v
retrieving revision 1.78
diff -u -r1.78 token.c
--- token.c 20 Mar 2004 21:57:52 -0000 1.78
+++ token.c 13 Apr 2004 11:50:54 -0000
@@ -40,6 +40,7 @@
static word_t *w_recv = NULL; /* Received: */
static word_t *w_head = NULL; /* Header: */
static word_t *w_mime = NULL; /* Mime: */
+static word_t *w_colon = NULL; /* : */
/* Global Variables */
@@ -48,8 +49,12 @@
static word_t *token_prefix = NULL;
static word_t *nonblank_line = NULL;
-/* Function Prototypes */
+static word_t *prv_word = NULL;
+static word_t *cur_word = NULL;
+/* States of the word-pair generator in get_token(): either fetch a new
+ * token from the lexer, or return the pair built from the previous and
+ * current words.  Defined together with the typedef because ISO C does
+ * not allow "enum state_e" to be named before its enumerator list. */
+typedef enum state_e { GET_NEW_TOKEN, RETURN_WORD_PAIR } state_t;
+
/* Function Definitions */
token_t get_token(void)
@@ -57,6 +62,9 @@
token_t cls = NONE;
unsigned char *cp;
bool done = false;
+ static state_t state_flag = GET_NEW_TOKEN;
+ static token_t prv_cls = NONE;
+ static token_t cur_cls = NONE;
/* If saved IPADDR, truncate last octet */
if ( block_on_subnets && save_class == IPADDR )
@@ -77,9 +85,27 @@
yylval = word_new(NULL, 0);
while (!done) {
- cls = (*lexer->yylex)();
- yylval->leng = (uint) *lexer->yyleng;
- yylval->text = (unsigned char *)(*lexer->yytext);
+ switch (state_flag) {
+ case GET_NEW_TOKEN:
+ if (pairs)
+ state_flag = RETURN_WORD_PAIR;
+ if (prv_word)
+ word_free(prv_word);
+ prv_cls = cur_cls;
+ prv_word = cur_word;
+ cls = (*lexer->yylex)();
+ yylval->leng = (uint) *lexer->yyleng;
+ yylval->text = (unsigned char *)(*lexer->yytext);
+ cur_cls = cls;
+ cur_word = word_dup(yylval);
+ break;
+ case RETURN_WORD_PAIR:
+ state_flag = GET_NEW_TOKEN;
+ if (prv_word == NULL)
+ continue;
+ cls = cur_cls;
+ yylval = word_multicat(prv_word, w_colon, cur_word, NULL);
+ }
if (DEBUG_TEXT(2)) {
word_puts(yylval, 0, dbgout);
@@ -277,6 +303,7 @@
w_recv = word_new((const byte *) "rcvd:", 0); /* Received: */
w_head = word_new((const byte *) "head:", 0); /* Header: */
w_mime = word_new((const byte *) "mime:", 0); /* Mime: */
+ w_colon = word_new((const byte *) ":", 0); /* : */
}
return;
@@ -289,6 +316,8 @@
void set_tag(const char *text)
{
+ word_t *new_prefix;
+
if (!header_line_markup)
return;
@@ -301,25 +330,25 @@
switch (tolower(*text)) {
case 'c': /* CC: */
case 't':
- token_prefix = w_to; /* To: */
+ new_prefix = w_to; /* To: */
break;
case 'f':
- token_prefix = w_from; /* From: */
+ new_prefix = w_from; /* From: */
break;
case 'h':
if (msg_state == msg_state->parent)
- token_prefix = w_head; /* Header: */
+ new_prefix = w_head; /* Header: */
else
- token_prefix = w_mime; /* Mime: */
+ new_prefix = w_mime; /* Mime: */
break;
case 'r':
if (tolower(text[2]) == 't')
- token_prefix = w_rtrn; /* Return-Path: */
+ new_prefix = w_rtrn; /* Return-Path: */
else
- token_prefix = w_recv; /* Received: */
+ new_prefix = w_recv; /* Received: */
break;
case 's':
- token_prefix = w_subj; /* Subject: */
+ new_prefix = w_subj; /* Subject: */
break;
default:
fprintf(stderr, "%s:%d invalid tag - '%s'\n",
@@ -327,6 +356,11 @@
text);
exit(EX_ERROR);
}
+ if (new_prefix != token_prefix) {
+ word_free(prv_word);
+ prv_word = NULL;
+ }
+ token_prefix = new_prefix;
return;
}
Index: word.c
===================================================================
RCS file: /cvsroot/bogofilter/bogofilter/src/word.c,v
retrieving revision 1.13
diff -u -r1.13 word.c
--- word.c 31 Oct 2003 22:26:51 -0000 1.13
+++ word.c 13 Apr 2004 11:50:54 -0000
@@ -12,6 +12,8 @@
#include "common.h"
+#include <stdarg.h>
+
#include "word.h"
#include "xmalloc.h"
@@ -86,6 +88,35 @@
return ans;
}
+/** Concatenate a NULL-terminated list of words into one new word.
+ **
+ ** \param arg  first word to concatenate; further const word_t *
+ **             arguments follow, and the list MUST end with NULL.
+ ** \return     a word obtained from word_new() whose text is the
+ **             texts of all arguments joined in order.
+ **
+ ** The arguments are walked with va_arg().  va_list is an opaque type;
+ ** doing pointer arithmetic on it only works on ABIs where it happens
+ ** to be a plain stack pointer.
+ **/
+word_t *word_multicat(const word_t *arg, ...)
+{
+    uint n = 0;
+    va_list ap;
+    word_t *val;
+    const word_t *t;
+
+    /* Pass 1: add up the lengths so the result can be sized once. */
+    va_start(ap, arg);
+    for (t = arg; t != NULL; t = va_arg(ap, const word_t *))
+	n += t->leng;
+    va_end(ap);
+
+    val = word_new(NULL, n);
+
+    /* Pass 2: copy each piece behind the previous one. */
+    n = 0;
+    va_start(ap, arg);
+    for (t = arg; t != NULL; t = va_arg(ap, const word_t *)) {
+	memcpy(val->text + n, t->text, t->leng);
+	n += t->leng;
+    }
+    va_end(ap);
+
+    Z(val->text[n]);
+
+    return val;
+}
+
void word_puts(const word_t *self, uint width, FILE *fp)
{
/* width = 0 - output all of the word
Index: word.h
===================================================================
RCS file: /cvsroot/bogofilter/bogofilter/src/word.h,v
retrieving revision 1.12
diff -u -r1.12 word.h
--- word.h 22 Feb 2004 20:34:26 -0000 1.12
+++ word.h 13 Apr 2004 11:50:54 -0000
@@ -26,6 +26,7 @@
extern word_t *word_cpy(word_t *dst, const word_t *src);
extern int word_cmp(const word_t *w1, const word_t *w2);
extern word_t *word_concat(const word_t *w1, const word_t *w2);
+extern word_t *word_multicat(const word_t *w, ...);
extern void word_puts(const word_t *self, uint width, FILE *fp);
#endif /* WORD_H */
More information about the bogofilter
mailing list