Robinson's new f(w) calculation

Greg Louis glouis at dynamicro.on.ca
Sun Sep 22 15:12:25 EDT 2002


Here is another patch against 0.7.4, this time implementing a new form
of the f(w) calculation that Gary described in a posting on the
spambayes list.

--- bogofilter.c.orig	2002-09-19 13:22:21.000000000 -0400
+++ bogofilter.c	2002-09-19 13:36:16.000000000 -0400
@@ -61,6 +61,17 @@
 
 I do the lexical analysis slightly differently, however.
 
+MOD: (Greg Louis <glouis at dynamicro.on.ca>) This version implements Gary
+    Robinson's proposed modifications to the "spamicity" calculation and
+    uses his f(w) individual probability calculation.
+    See
+
+    http://radio.weblogs.com/0101454/stories/2002/09/16/spamDetection.html
+    
+    In addition, this version does not store "extrema."  Instead it accumulates
+    Robinson's P and Q using all words deemed "characteristic," i.e. having
+    a deviation (fabs (0.5f - prob)) >= MIN_DEV, currently set to 0.0.
+
 ******************************************************************************/
 #include <stdio.h>
 #include <math.h>
@@ -78,14 +89,12 @@
 #define HEADER		"# bogofilter email-count (format version B): %lu\n"
 
 // constants for the Graham formula 
-#define HAM_BIAS	2		// give ham words more weight
-#define KEEPERS		15		// how many extrema to keep
-#define MINIMUM_FREQ	5		// minimum freq
-#define UNKNOWN_WORD	0.4f		// odds that unknown word is spammish
-#define SPAM_CUTOFF	0.9f		// if it's spammier than this...
+#define SPAM_CUTOFF	0.52f		// if it's spammier than this...
+#define MIN_DEV		0.0f		// look for characteristic words
 #define MAX_REPEATS	4		// cap on word frequency per message
 
-#define DEVIATION(n)	fabs((n) - 0.5f)	// deviation from average
+#define ROBS 0.001f			// Robinson's a
+#define ROBX 0.415f			// Robinson's x
 
 #define max(x, y)	(((x) > (y)) ? (x) : (y))
 #define min(x, y)	(((x) < (y)) ? (x) : (y))
@@ -410,43 +419,6 @@
     }
 }
 
-#ifdef __UNUSED__
-void logprintf(const char *fmt, ... )
-// log data from server
-{
-    char buf[BUFSIZ];
-    va_list ap;
-    int fd;
-
-    va_start(ap, fmt);
-    vsnprintf(buf, sizeof(buf), fmt, ap);
-    va_end(ap);
-    
-    fd=open("/tmp/bogolog", O_RDWR|O_CREAT|O_APPEND,0700);
-    write(fd,buf,strlen(buf));
-    close(fd);
-}
-#endif // __UNUSED__
-
-typedef struct 
-{
-    char        key[MAXWORDLEN+1];
-    double      prob;
-}
-discrim_t;
-
-typedef struct
-{
-    discrim_t extrema[KEEPERS];
-}
-bogostat_t;
-
-int compare_stats(discrim_t *d1, discrim_t *d2)
-{ 
-    return ( (d1->prob > d2->prob) ||
-	     ((d1->prob == d2->prob) && (strcmp(d1->key, d2->key) > 0)));
-}
-
 void *collect_words(int fd)
 // tokenize input text and save words in a Judy array.
 // returns:  the Judy array
@@ -472,35 +444,25 @@
     return PArray;
 }
 
-bogostat_t *select_indicators(void  *PArray)
+double compute_spamicity(void  *PArray)
 // selects the best spam/nonspam indicators and
-// populates the stats structure.
+// calculates Robinson's S.
 {
     void	**loc;
     char	tokenbuffer[BUFSIZ];
 
-    discrim_t *pp, *hit;
-    static bogostat_t stats;
-    
-#ifdef NON_EQUIPROBABLE
-    // There is an argument that we should by by number of *words* here.
-    double	msg_prob = (spam_list.msgcount / ham_list.msgcount);
-#endif // NON_EQUIPROBABLE
+    double invproduct = 0.0;	// Robinson's P
+    double product = 0.0;	// Robinson's Q
+    double spamicity, invn;
+    int robn = 0;
 
-    for (pp = stats.extrema; pp < stats.extrema+sizeof(stats.extrema)/sizeof(*stats.extrema); pp++)
-    {
- 	pp->prob = 0.5f;
- 	pp->key[0] = '\0';
-    }
- 
     yytext = tokenbuffer;
     for (loc  = JudySLFirst(PArray, tokenbuffer, 0);
 	 loc != (void *) NULL;
 	 loc  = JudySLNext(PArray, tokenbuffer, 0))
     {
 	double prob;
-	double dev;
-	double hamness, spamness, slotdev, hitdev;
+	double hamness, spamness;
 
 	hamness = getcount(yytext, &ham_list);
 	spamness  = getcount(yytext, &spam_list);
@@ -514,81 +476,34 @@
 	//  	    (double (/ 
 	// 		    (min 1 (/ b nspam)) 
 	// 		    (+ (min 1 (/ g nham)) (min 1 (/ b nspam)))))))))
-	// This assumes that spam and non-spam are equiprobable.
-	hamness *= HAM_BIAS;
-	if (hamness + spamness < MINIMUM_FREQ)
-#ifdef NON_EQUIPROBABLE
-	    // In the absence of evidence, the probability that a new word
-	    // will be spam is the historical ratio of spam words to
-	    // nonspam words.
-	    prob = msg_prob;
-#else
-	    prob = UNKNOWN_WORD;
-#endif // NON_EQUIPROBABLE
-	else
 	{
-	    register double pb = min(1, (spamness / spam_list.msgcount));
-	    register double pg = min(1, (hamness / ham_list.msgcount));
-
-#ifdef NON_EQUIPROBABLE
-	    prob = (pb * msg_prob) / ((pg * (1 - msg_prob)) + (pb * msg_prob));
-#else
-	    prob = pb / (pg + pb);
-#endif // NON_EQUIPROBABLE
-	    prob = min(prob, 0.99);
-	    prob = max(prob, 0.01);
+	    register double pb = spamness;
+	    register double pg = spam_list.msgcount * hamness / ham_list.msgcount;
+            // Robinson's f(w) = (s * x + n * p(w)) / (s + n)
+            // p(w) is pb / (pb + pg); n is pb + pg; so n * p(w) is pb
+	    prob = (ROBS * ROBX + pb) / (ROBS + pg + pb);
 	}
 
-	// update the list of tokens with maximum deviation
-	dev = DEVIATION(prob);
-        hit = NULL;
-        hitdev=1;
-	for (pp = stats.extrema; pp < stats.extrema+sizeof(stats.extrema)/sizeof(*stats.extrema); pp++)
-        {
-	    slotdev=DEVIATION(pp->prob);
-	    if (dev>slotdev && hitdev>slotdev)
-	    {
-		hit=pp;
-		hitdev=slotdev;
-            }
-        }
-        if (hit) 
-	{ 
-	    hit->prob = prob;
-	    strncpy(hit->key, yytext, MAXWORDLEN);
+        // Robinson's P and Q; accumulation step
+        // P = 1 - ((1-p1)*(1-p2)*...*(1-pn))^(1/n)     [spamminess]
+        // Q = 1 - (p1*p2*...*pn)^(1/n)                 [non-spamminess]
+        if (fabs(0.5 - prob) >= MIN_DEV) {
+            invproduct += log(1.0 - prob);
+            product += log(prob);
+            robn ++;
 	}
     }
-    return (&stats);
-}
-
-double compute_spamicity(bogostat_t *stats)
-// computes the spamicity of the words in the bogostat structure
-// returns:  the spamicity
-{
-    double product, invproduct;
-    double spamicity = 0.0;
-
-    discrim_t *pp;
-
-    if (verbose)
-    {
-	// put the stats in ascending order by probability and alphabet
-	qsort(stats->extrema, KEEPERS, sizeof(discrim_t), compare_stats);
-    }
-
-    // Bayes' theorem.
-    // For discussion, see <http://www.mathpages.com/home/kmath267.htm>.
-    product = invproduct = 1.0f;
-    for (pp = stats->extrema; pp < stats->extrema+sizeof(stats->extrema)/sizeof(*stats->extrema); pp++)
-	if (pp->prob != 0)
-	{
-	    product *= pp->prob;
-	    invproduct *= (1 - pp->prob);
-	    spamicity = product / (product + invproduct);
-	    if (verbose)
-		printf("#  %f  %f  %s\n", pp->prob, spamicity, pp->key);
-	}
 
+    // Robinson's P, Q and S
+    // S = (P - Q) / (P + Q)                        [combined indicator]
+    if (robn) {
+        invn = (double)robn;
+        invproduct = 1.0 - exp(invproduct / invn);
+        product = 1.0 - exp(product / invn);
+        spamicity =
+            (1.0 + (invproduct - product) / (invproduct + product)) / 2.0;
+    } else spamicity = ROBX;
+                                                        
     if (verbose)
 	printf("#  Spamicity of %f\n", spamicity);
 
@@ -601,16 +516,12 @@
     rc_t	status;
     double 	spamicity;
     void	*PArray = (Pvoid_t) NULL;	// JudySL array.
-    bogostat_t	*stats;
 
 //  tokenize input text and save words in a Judy array.
     PArray = collect_words(fd);
     
-//  select the best spam/nonspam indicators.
-    stats = select_indicators(PArray);
-    
 //  computes the spamicity of the spam/nonspam indicators.
-    spamicity = compute_spamicity(stats);
+    spamicity = compute_spamicity(PArray);
 
     status = (spamicity > SPAM_CUTOFF) ? RC_SPAM : RC_NONSPAM;
 


-- 
| G r e g  L o u i s          | gpg public key:      |
|   http://www.bgl.nu/~glouis |   finger greg at bgl.nu |



More information about the Bogofilter mailing list