[patch] try Gary Robinson's ideas in bogofilter

Sat Sep 21 16:33:42 CEST 2002

On 20020920 (Fri) at 1542:00 -0400, David Relson wrote:
> I'd be glad to run your Robinson's computation and compare results with the 
> other algorithms in my testbed.  However, your patch doesn't install 
> cleanly for me.  Can you send me the whole file?
> 
Did that offlist.  Here is the patch that produced it (this one has the
calculation fixed); run it in a freshly unpacked bogofilter-0.7.4
source tree and it should apply cleanly.

Note: setting MIN_DEV to 0 may actually improve performance of this
version.  Also, the value of ROBX should be calculated to match your
training data (see the URL below for instructions) or just set to 0.5.

--- bogofilter.c.orig	2002-09-19 13:22:21.000000000 -0400
+++ bogofilter.c	2002-09-19 13:36:16.000000000 -0400
@@ -61,6 +61,17 @@
 
 I do the lexical analysis slightly differently, however.
 
+MOD: (Greg Louis <glouis at dynamicro.on.ca>) This version implements Gary
+    Robinson's proposed modifications to the "spamicity" calculation and
+    uses his f(w) individual probability calculation (but not his g(w)
+    improvement).  See
+
+    http://radio.weblogs.com/0101454/stories/2002/09/16/spamDetection.html
+    
+    In addition, this version does not store "extrema."  Instead it accumulates
+    Robinson's P and Q using all words deemed "characteristic," i.e. having
+    a deviation (fabs (0.5f - prob)) >= MIN_DEV, currently set to 0.4.
+
 ******************************************************************************/
 #include <stdio.h>
 #include <math.h>
@@ -78,14 +89,12 @@
 #define HEADER		"# bogofilter email-count (format version B): %lu\n"
 
 // constants for the Graham formula 
-#define HAM_BIAS	2		// give ham words more weight
-#define KEEPERS		15		// how many extrema to keep
-#define MINIMUM_FREQ	5		// minimum freq
-#define UNKNOWN_WORD	0.4f		// odds that unknown word is spammish
-#define SPAM_CUTOFF	0.9f		// if it's spammier than this...
+#define SPAM_CUTOFF	0.5f		// if it's spammier than this...
+#define MIN_DEV		0.4f		// look for characteristic words
 #define MAX_REPEATS	4		// cap on word frequency per message
 
-#define DEVIATION(n)	fabs((n) - 0.5f)	// deviation from average
+#define ROBA 1.0f			// Robinson's a
+#define ROBX 0.38f			// Robinson's x
 
 #define max(x, y)	(((x) > (y)) ? (x) : (y))
 #define min(x, y)	(((x) < (y)) ? (x) : (y))
@@ -410,43 +419,6 @@
     }
 }
 
-#ifdef __UNUSED__
-void logprintf(const char *fmt, ... )
-// log data from server
-{
-    char buf[BUFSIZ];
-    va_list ap;
-    int fd;
-
-    va_start(ap, fmt);
-    vsnprintf(buf, sizeof(buf), fmt, ap);
-    va_end(ap);
-    
-    fd=open("/tmp/bogolog", O_RDWR|O_CREAT|O_APPEND,0700);
-    write(fd,buf,strlen(buf));
-    close(fd);
-}
-#endif // __UNUSED__
-
-typedef struct 
-{
-    char        key[MAXWORDLEN+1];
-    double      prob;
-}
-discrim_t;
-
-typedef struct
-{
-    discrim_t extrema[KEEPERS];
-}
-bogostat_t;
-
-int compare_stats(discrim_t *d1, discrim_t *d2)
-{ 
-    return ( (d1->prob > d2->prob) ||
-	     ((d1->prob == d2->prob) && (strcmp(d1->key, d2->key) > 0)));
-}
-
 void *collect_words(int fd)
 // tokenize input text and save words in a Judy array.
 // returns:  the Judy array
@@ -472,35 +444,25 @@
     return PArray;
 }
 
-bogostat_t *select_indicators(void  *PArray)
+double compute_spamicity(void  *PArray)
 // selects the best spam/nonspam indicators and
-// populates the stats structure.
+// calculates Robinson's S.
 {
     void	**loc;
     char	tokenbuffer[BUFSIZ];
 
-    discrim_t *pp, *hit;
-    static bogostat_t stats;
-    
-#ifdef NON_EQUIPROBABLE
-    // There is an argument that we should by by number of *words* here.
-    double	msg_prob = (spam_list.msgcount / ham_list.msgcount);
-#endif // NON_EQUIPROBABLE
+    double invproduct = 0.0;	// Robinson's P
+    double product = 0.0;	// Robinson's Q
+    double spamicity, invn;
+    int robn = 0;
 
-    for (pp = stats.extrema; pp < stats.extrema+sizeof(stats.extrema)/sizeof(*stats.extrema); pp++)
-    {
- 	pp->prob = 0.5f;
- 	pp->key[0] = '\0';
-    }
- 
     yytext = tokenbuffer;
     for (loc  = JudySLFirst(PArray, tokenbuffer, 0);
 	 loc != (void *) NULL;
 	 loc  = JudySLNext(PArray, tokenbuffer, 0))
     {
 	double prob;
-	double dev;
-	double hamness, spamness, slotdev, hitdev;
+	double hamness, spamness;
 
 	hamness = getcount(yytext, &ham_list);
 	spamness  = getcount(yytext, &spam_list);
@@ -514,81 +476,34 @@
 	//  	    (double (/ 
 	// 		    (min 1 (/ b nspam)) 
 	// 		    (+ (min 1 (/ g nham)) (min 1 (/ b nspam)))))))))
-	// This assumes that spam and non-spam are equiprobable.
-	hamness *= HAM_BIAS;
-	if (hamness + spamness < MINIMUM_FREQ)
-#ifdef NON_EQUIPROBABLE
-	    // In the absence of evidence, the probability that a new word
-	    // will be spam is the historical ratio of spam words to
-	    // nonspam words.
-	    prob = msg_prob;
-#else
-	    prob = UNKNOWN_WORD;
-#endif // NON_EQUIPROBABLE
-	else
 	{
-	    register double pb = min(1, (spamness / spam_list.msgcount));
-	    register double pg = min(1, (hamness / ham_list.msgcount));
-
-#ifdef NON_EQUIPROBABLE
-	    prob = (pb * msg_prob) / ((pg * (1 - msg_prob)) + (pb * msg_prob));
-#else
-	    prob = pb / (pg + pb);
-#endif // NON_EQUIPROBABLE
-	    prob = min(prob, 0.99);
-	    prob = max(prob, 0.01);
+	    register double pb = spamness;
+	    register double pg = spam_list.msgcount * hamness / ham_list.msgcount;
+            // Robinson's f(w) = (a + n * p(w)) / ((a / x) + n)
+            // p(w) is pb / (pb + pg); n is pb + pg; so n * p(w) is pb
+	    prob = (ROBA + pb) / (ROBA / ROBX + pg + pb);
 	}
 
-	// update the list of tokens with maximum deviation
-	dev = DEVIATION(prob);
-        hit = NULL;
-        hitdev=1;
-	for (pp = stats.extrema; pp < stats.extrema+sizeof(stats.extrema)/sizeof(*stats.extrema); pp++)
-        {
-	    slotdev=DEVIATION(pp->prob);
-	    if (dev>slotdev && hitdev>slotdev)
-	    {
-		hit=pp;
-		hitdev=slotdev;
-            }
-        }
-        if (hit) 
-	{ 
-	    hit->prob = prob;
-	    strncpy(hit->key, yytext, MAXWORDLEN);
+        // Robinson's P and Q; accumulation step
+        // P = 1 - ((1-p1)*(1-p2)*...*(1-pn))^(1/n)     [spamminess]
+        // Q = 1 - (p1*p2*...*pn)^(1/n)                 [non-spamminess]
+        if (fabs(0.5 - prob) >= MIN_DEV) {
+            invproduct += log(1.0 - prob);
+            product += log(prob);
+            robn ++;
 	}
     }
-    return (&stats);
-}
-
-double compute_spamicity(bogostat_t *stats)
-// computes the spamicity of the words in the bogostat structure
-// returns:  the spamicity
-{
-    double product, invproduct;
-    double spamicity = 0.0;
-
-    discrim_t *pp;
-
-    if (verbose)
-    {
-	// put the stats in ascending order by probability and alphabet
-	qsort(stats->extrema, KEEPERS, sizeof(discrim_t), compare_stats);
-    }
-
-    // Bayes' theorem.
-    // For discussion, see <http://www.mathpages.com/home/kmath267.htm>.
-    product = invproduct = 1.0f;
-    for (pp = stats->extrema; pp < stats->extrema+sizeof(stats->extrema)/sizeof(*stats->extrema); pp++)
-	if (pp->prob != 0)
-	{
-	    product *= pp->prob;
-	    invproduct *= (1 - pp->prob);
-	    spamicity = product / (product + invproduct);
-	    if (verbose)
-		printf("#  %f  %f  %s\n", pp->prob, spamicity, pp->key);
-	}
 
+    // Robinson's P, Q and S
+    // S = (P - Q) / (P + Q)                        [combined indicator]
+    if (robn) {
+        invn = (double)robn;
+        invproduct = 1.0 - exp(invproduct / invn);
+        product = 1.0 - exp(product / invn);
+        spamicity =
+            (1.0 + (invproduct - product) / (invproduct + product)) / 2.0;
+    } else spamicity = 0.5;
+                                                        
     if (verbose)
 	printf("#  Spamicity of %f\n", spamicity);
 
@@ -601,16 +516,12 @@
     rc_t	status;
     double 	spamicity;
     void	*PArray = (Pvoid_t) NULL;	// JudySL array.
-    bogostat_t	*stats;
 
 //  tokenize input text and save words in a Judy array.
     PArray = collect_words(fd);
     
-//  select the best spam/nonspam indicators.
-    stats = select_indicators(PArray);
-    
 //  computes the spamicity of the spam/nonspam indicators.
-    spamicity = compute_spamicity(stats);
+    spamicity = compute_spamicity(PArray);
 
     status = (spamicity > SPAM_CUTOFF) ? RC_SPAM : RC_NONSPAM;
 


-- 
| G r e g  L o u i s          | gpg public key:      |
|   http://www.bgl.nu/~glouis |   finger greg at bgl.nu |