[patch] try Gary Robinson's ideas in bogofilter

Fri Sep 20 16:27:07 CEST 2002

In case anyone's interested, here is a patch against version 0.7.4 that
does the spamicity calculation as proposed by Gary Robinson (see
http://radio.weblogs.com/0101454/stories/2002/09/16/spamDetection.html
for information).  I've implemented his S and f(w) calculations but not
his g(w) improvement.  There's a link in his essay to some results I
got that seem to indicate a significant improvement over the Graham
calculations done in current bogofilter.

I found by experiment that saving extrema didn't perform significantly
better than just setting a deviation threshold and allowing bogofilter to
consider all words with deviations greater than the threshold.  My
current version, therefore, doesn't save extrema and doesn't list the
keywords used in the spamicity calculation when the -v option is given;
it would be easy to add back code to do the latter, though.

It seemed to make sense, for this calculation method, to recombine
select_indicators() and compute_spamicity(), since the selection involves
accumulating product and invproduct on the fly instead of saving the
words and probablilties and acting on them later.

This patch is _very_ lightly tested; I'm mostly working with an earlier
version derived from Eric's 0.7.  If anyone tries it out, I'd be really
interested in the results.  Gary says you may need to play with the
SPAM_CUTOFF value for S; 0.5 works well for me but some people see too
many false positives at that level.

--- bogofilter.c.orig	2002-09-19 13:22:21.000000000 -0400
+++ bogofilter.c	2002-09-19 13:36:16.000000000 -0400
@@ -61,6 +61,17 @@
 
 I do the lexical analysis slightly differently, however.
 
+MOD: (Greg Louis <glouis at dynamicro.on.ca>) This version implements Gary
+    Robinson's proposed modifications to the "spamicity" calculation and
+    uses his f(w) individual probability calculation (but not his g(w)
+    improvement).  See
+
+    http://radio.weblogs.com/0101454/stories/2002/09/16/spamDetection.html
+    
+    In addition, this version does not store "extrema."  Instead it accumulates
+    Robinson's P and Q using all words deemed "characteristic," i.e. having
+    a deviation (fabs (0.5f - prob)) >= MIN_DEV, currently set to 0.4.
+
 ******************************************************************************/
 #include <stdio.h>
 #include <math.h>
@@ -78,14 +89,12 @@
 #define HEADER		"# bogofilter email-count (format version B): %lu\n"
 
 // constants for the Graham formula 
-#define HAM_BIAS	2		// give ham words more weight
-#define KEEPERS		15		// how many extrema to keep
-#define MINIMUM_FREQ	5		// minimum freq
-#define UNKNOWN_WORD	0.4f		// odds that unknown word is spammish
-#define SPAM_CUTOFF	0.9f		// if it's spammier than this...
+#define SPAM_CUTOFF	0.5f		// if it's spammier than this...
+#define MIN_DEV		0.4f		// look for characteristic words
 #define MAX_REPEATS	4		// cap on word frequency per message
 
-#define DEVIATION(n)	fabs((n) - 0.5f)	// deviation from average
+#define ROBA 1.0f			// Robinson's a
+#define ROBX 0.38f			// Robinson's x
 
 #define max(x, y)	(((x) > (y)) ? (x) : (y))
 #define min(x, y)	(((x) < (y)) ? (x) : (y))
@@ -410,43 +419,6 @@
     }
 }
 
-#ifdef __UNUSED__
-void logprintf(const char *fmt, ... )
-// log data from server
-{
-    char buf[BUFSIZ];
-    va_list ap;
-    int fd;
-
-    va_start(ap, fmt);
-    vsnprintf(buf, sizeof(buf), fmt, ap);
-    va_end(ap);
-    
-    fd=open("/tmp/bogolog", O_RDWR|O_CREAT|O_APPEND,0700);
-    write(fd,buf,strlen(buf));
-    close(fd);
-}
-#endif // __UNUSED__
-
-typedef struct 
-{
-    char        key[MAXWORDLEN+1];
-    double      prob;
-}
-discrim_t;
-
-typedef struct
-{
-    discrim_t extrema[KEEPERS];
-}
-bogostat_t;
-
-int compare_stats(discrim_t *d1, discrim_t *d2)
-{ 
-    return ( (d1->prob > d2->prob) ||
-	     ((d1->prob == d2->prob) && (strcmp(d1->key, d2->key) > 0)));
-}
-
 void *collect_words(int fd)
 // tokenize input text and save words in a Judy array.
 // returns:  the Judy array
@@ -472,35 +444,25 @@
     return PArray;
 }
 
-bogostat_t *select_indicators(void  *PArray)
+double compute_spamicity(void  *PArray)
 // selects the best spam/nonspam indicators and
-// populates the stats structure.
+// calculates Robinson's S.
 {
     void	**loc;
     char	tokenbuffer[BUFSIZ];
 
-    discrim_t *pp, *hit;
-    static bogostat_t stats;
-    
-#ifdef NON_EQUIPROBABLE
-    // There is an argument that we should by by number of *words* here.
-    double	msg_prob = (spam_list.msgcount / ham_list.msgcount);
-#endif // NON_EQUIPROBABLE
+    double invproduct = 1.0;	// Robinson's P
+    double product = 1.0;	// Robinson's Q
+    double spamicity, invn;
+    int robn = 0;
 
-    for (pp = stats.extrema; pp < stats.extrema+sizeof(stats.extrema)/sizeof(*stats.extrema); pp++)
-    {
- 	pp->prob = 0.5f;
- 	pp->key[0] = '\0';
-    }
- 
     yytext = tokenbuffer;
     for (loc  = JudySLFirst(PArray, tokenbuffer, 0);
 	 loc != (void *) NULL;
 	 loc  = JudySLNext(PArray, tokenbuffer, 0))
     {
 	double prob;
-	double dev;
-	double hamness, spamness, slotdev, hitdev;
+	double hamness, spamness;
 
 	hamness = getcount(yytext, &ham_list);
 	spamness  = getcount(yytext, &spam_list);
@@ -514,81 +476,34 @@
 	//  	    (double (/ 
 	// 		    (min 1 (/ b nspam)) 
 	// 		    (+ (min 1 (/ g nham)) (min 1 (/ b nspam)))))))))
-	// This assumes that spam and non-spam are equiprobable.
-	hamness *= HAM_BIAS;
-	if (hamness + spamness < MINIMUM_FREQ)
-#ifdef NON_EQUIPROBABLE
-	    // In the absence of evidence, the probability that a new word
-	    // will be spam is the historical ratio of spam words to
-	    // nonspam words.
-	    prob = msg_prob;
-#else
-	    prob = UNKNOWN_WORD;
-#endif // NON_EQUIPROBABLE
-	else
 	{
-	    register double pb = min(1, (spamness / spam_list.msgcount));
-	    register double pg = min(1, (hamness / ham_list.msgcount));
-
-#ifdef NON_EQUIPROBABLE
-	    prob = (pb * msg_prob) / ((pg * (1 - msg_prob)) + (pb * msg_prob));
-#else
-	    prob = pb / (pg + pb);
-#endif // NON_EQUIPROBABLE
-	    prob = min(prob, 0.99);
-	    prob = max(prob, 0.01);
+	    register double pb = spamness;
+	    register double pg = spam_list.msgcount * hamness / ham_list.msgcount;
+            // Robinson's f(w) = (a + n * p(w)) / ((a / x) + n)
+            // p(w) is pb / (pb + pg); n is pb + pg; so n * p(w) is pb
+	    prob = (ROBA + pb) / (ROBA / ROBX + pg + pb);
 	}
 
-	// update the list of tokens with maximum deviation
-	dev = DEVIATION(prob);
-        hit = NULL;
-        hitdev=1;
-	for (pp = stats.extrema; pp < stats.extrema+sizeof(stats.extrema)/sizeof(*stats.extrema); pp++)
-        {
-	    slotdev=DEVIATION(pp->prob);
-	    if (dev>slotdev && hitdev>slotdev)
-	    {
-		hit=pp;
-		hitdev=slotdev;
-            }
-        }
-        if (hit) 
-	{ 
-	    hit->prob = prob;
-	    strncpy(hit->key, yytext, MAXWORDLEN);
+        // Robinson's P and Q; accumulation step
+        // P = 1 - ((1-p1)*(1-p2)*...*(1-pn))^(1/n)     [spamminess]
+        // Q = 1 - (p1*p2*...*pn)^(1/n)                 [non-spamminess]
+        if (fabs(0.5 - prob) >= MIN_DEV) {
+            invproduct *= 1.0 - prob;
+            product *= prob;
+            robn ++;
 	}
     }
-    return (&stats);
-}
-
-double compute_spamicity(bogostat_t *stats)
-// computes the spamicity of the words in the bogostat structure
-// returns:  the spamicity
-{
-    double product, invproduct;
-    double spamicity = 0.0;
-
-    discrim_t *pp;
-
-    if (verbose)
-    {
-	// put the stats in ascending order by probability and alphabet
-	qsort(stats->extrema, KEEPERS, sizeof(discrim_t), compare_stats);
-    }
-
-    // Bayes' theorem.
-    // For discussion, see <http://www.mathpages.com/home/kmath267.htm>.
-    product = invproduct = 1.0f;
-    for (pp = stats->extrema; pp < stats->extrema+sizeof(stats->extrema)/sizeof(*stats->extrema); pp++)
-	if (pp->prob != 0)
-	{
-	    product *= pp->prob;
-	    invproduct *= (1 - pp->prob);
-	    spamicity = product / (product + invproduct);
-	    if (verbose)
-		printf("#  %f  %f  %s\n", pp->prob, spamicity, pp->key);
-	}
 
+    // Robinson's P, Q and S
+    // S = (P - Q) / (P + Q)                        [combined indicator]
+    if (robn) {
+        invn = 1.0 / (double)robn;
+        invproduct = 1.0 - pow(invproduct, invn);
+        product = 1.0 - pow(product, invn);
+        spamicity =
+            (1.0 + (invproduct - product) / (invproduct + product)) / 2.0;
+    } else spamicity = 0.5;
+                                                        
     if (verbose)
 	printf("#  Spamicity of %f\n", spamicity);
 
@@ -601,16 +516,12 @@
     rc_t	status;
     double 	spamicity;
     void	*PArray = (Pvoid_t) NULL;	// JudySL array.
-    bogostat_t	*stats;
 
 //  tokenize input text and save words in a Judy array.
     PArray = collect_words(fd);
     
-//  select the best spam/nonspam indicators.
-    stats = select_indicators(PArray);
-    
 //  computes the spamicity of the spam/nonspam indicators.
-    spamicity = compute_spamicity(stats);
+    spamicity = compute_spamicity(PArray);
 
     status = (spamicity > SPAM_CUTOFF) ? RC_SPAM : RC_NONSPAM;
 


-- 
| G r e g  L o u i s          | gpg public key:      |
|   http://www.bgl.nu/~glouis |   finger greg at bgl.nu |