Robinson's new f(w) calculation
Greg Louis
glouis at dynamicro.on.ca
Sun Sep 22 15:12:25 EDT 2002
Here is another patch against 0.7.4, this time implementing a new form
of the f(w) calculation that Gary described in a posting on the
spambayes list.
--- bogofilter.c.orig 2002-09-19 13:22:21.000000000 -0400
+++ bogofilter.c 2002-09-19 13:36:16.000000000 -0400
@@ -61,6 +61,17 @@
I do the lexical analysis slightly differently, however.
+MOD: (Greg Louis <glouis at dynamicro.on.ca>) This version implements Gary
+ Robinson's proposed modifications to the "spamicity" calculation and
+ uses his f(w) individual probability calculation.
+ See
+
+ http://radio.weblogs.com/0101454/stories/2002/09/16/spamDetection.html
+
+ In addition, this version does not store "extrema." Instead it accumulates
+ Robinson's P and Q using all words deemed "characteristic," i.e. having
+ a deviation (fabs (0.5f - prob)) >= MIN_DEV, currently set to 0.0.
+
******************************************************************************/
#include <stdio.h>
#include <math.h>
@@ -78,14 +89,12 @@
#define HEADER "# bogofilter email-count (format version B): %lu\n"
// constants for the Graham formula
-#define HAM_BIAS 2 // give ham words more weight
-#define KEEPERS 15 // how many extrema to keep
-#define MINIMUM_FREQ 5 // minimum freq
-#define UNKNOWN_WORD 0.4f // odds that unknown word is spammish
-#define SPAM_CUTOFF 0.9f // if it's spammier than this...
+#define SPAM_CUTOFF 0.52f // if it's spammier than this...
+#define MIN_DEV 0.0f // look for characteristic words
#define MAX_REPEATS 4 // cap on word frequency per message
-#define DEVIATION(n) fabs((n) - 0.5f) // deviation from average
+#define ROBS 0.001f // Robinson's a
+#define ROBX 0.415f // Robinson's x
#define max(x, y) (((x) > (y)) ? (x) : (y))
#define min(x, y) (((x) < (y)) ? (x) : (y))
@@ -410,43 +419,6 @@
}
}
-#ifdef __UNUSED__
-void logprintf(const char *fmt, ... )
-// log data from server
-{
- char buf[BUFSIZ];
- va_list ap;
- int fd;
-
- va_start(ap, fmt);
- vsnprintf(buf, sizeof(buf), fmt, ap);
- va_end(ap);
-
- fd=open("/tmp/bogolog", O_RDWR|O_CREAT|O_APPEND,0700);
- write(fd,buf,strlen(buf));
- close(fd);
-}
-#endif // __UNUSED__
-
-typedef struct
-{
- char key[MAXWORDLEN+1];
- double prob;
-}
-discrim_t;
-
-typedef struct
-{
- discrim_t extrema[KEEPERS];
-}
-bogostat_t;
-
-int compare_stats(discrim_t *d1, discrim_t *d2)
-{
- return ( (d1->prob > d2->prob) ||
- ((d1->prob == d2->prob) && (strcmp(d1->key, d2->key) > 0)));
-}
-
void *collect_words(int fd)
// tokenize input text and save words in a Judy array.
// returns: the Judy array
@@ -472,35 +444,25 @@
return PArray;
}
-bogostat_t *select_indicators(void *PArray)
+double compute_spamicity(void *PArray)
// selects the best spam/nonspam indicators and
-// populates the stats structure.
+// calculates Robinson's S.
{
void **loc;
char tokenbuffer[BUFSIZ];
- discrim_t *pp, *hit;
- static bogostat_t stats;
-
-#ifdef NON_EQUIPROBABLE
- // There is an argument that we should by by number of *words* here.
- double msg_prob = (spam_list.msgcount / ham_list.msgcount);
-#endif // NON_EQUIPROBABLE
+ double invproduct = 0.0; // Robinson's P
+ double product = 0.0; // Robinson's Q
+ double spamicity, invn;
+ int robn = 0;
- for (pp = stats.extrema; pp < stats.extrema+sizeof(stats.extrema)/sizeof(*stats.extrema); pp++)
- {
- pp->prob = 0.5f;
- pp->key[0] = '\0';
- }
-
yytext = tokenbuffer;
for (loc = JudySLFirst(PArray, tokenbuffer, 0);
loc != (void *) NULL;
loc = JudySLNext(PArray, tokenbuffer, 0))
{
double prob;
- double dev;
- double hamness, spamness, slotdev, hitdev;
+ double hamness, spamness;
hamness = getcount(yytext, &ham_list);
spamness = getcount(yytext, &spam_list);
@@ -514,81 +476,34 @@
// (double (/
// (min 1 (/ b nspam))
// (+ (min 1 (/ g nham)) (min 1 (/ b nspam)))))))))
- // This assumes that spam and non-spam are equiprobable.
- hamness *= HAM_BIAS;
- if (hamness + spamness < MINIMUM_FREQ)
-#ifdef NON_EQUIPROBABLE
- // In the absence of evidence, the probability that a new word
- // will be spam is the historical ratio of spam words to
- // nonspam words.
- prob = msg_prob;
-#else
- prob = UNKNOWN_WORD;
-#endif // NON_EQUIPROBABLE
- else
{
- register double pb = min(1, (spamness / spam_list.msgcount));
- register double pg = min(1, (hamness / ham_list.msgcount));
-
-#ifdef NON_EQUIPROBABLE
- prob = (pb * msg_prob) / ((pg * (1 - msg_prob)) + (pb * msg_prob));
-#else
- prob = pb / (pg + pb);
-#endif // NON_EQUIPROBABLE
- prob = min(prob, 0.99);
- prob = max(prob, 0.01);
+ register double pb = spamness;
+ register double pg = spam_list.msgcount * hamness / ham_list.msgcount;
+ // Robinson's f(w) = (s * x + n * p(w)) / (s + n)
+ // p(w) is pb / (pb + pg); n is pb + pg; so n * p(w) is pb
+ prob = (ROBS * ROBX + pb) / (ROBS + pg + pb);
}
- // update the list of tokens with maximum deviation
- dev = DEVIATION(prob);
- hit = NULL;
- hitdev=1;
- for (pp = stats.extrema; pp < stats.extrema+sizeof(stats.extrema)/sizeof(*stats.extrema); pp++)
- {
- slotdev=DEVIATION(pp->prob);
- if (dev>slotdev && hitdev>slotdev)
- {
- hit=pp;
- hitdev=slotdev;
- }
- }
- if (hit)
- {
- hit->prob = prob;
- strncpy(hit->key, yytext, MAXWORDLEN);
+ // Robinson's P and Q; accumulation step
+ // P = 1 - ((1-p1)*(1-p2)*...*(1-pn))^(1/n) [spamminess]
+ // Q = 1 - (p1*p2*...*pn)^(1/n) [non-spamminess]
+ if (fabs(0.5 - prob) >= MIN_DEV) {
+ invproduct += log(1.0 - prob);
+ product += log(prob);
+ robn ++;
}
}
- return (&stats);
-}
-
-double compute_spamicity(bogostat_t *stats)
-// computes the spamicity of the words in the bogostat structure
-// returns: the spamicity
-{
- double product, invproduct;
- double spamicity = 0.0;
-
- discrim_t *pp;
-
- if (verbose)
- {
- // put the stats in ascending order by probability and alphabet
- qsort(stats->extrema, KEEPERS, sizeof(discrim_t), compare_stats);
- }
-
- // Bayes' theorem.
- // For discussion, see <http://www.mathpages.com/home/kmath267.htm>.
- product = invproduct = 1.0f;
- for (pp = stats->extrema; pp < stats->extrema+sizeof(stats->extrema)/sizeof(*stats->extrema); pp++)
- if (pp->prob != 0)
- {
- product *= pp->prob;
- invproduct *= (1 - pp->prob);
- spamicity = product / (product + invproduct);
- if (verbose)
- printf("# %f %f %s\n", pp->prob, spamicity, pp->key);
- }
+ // Robinson's P, Q and S
+ // S = (P - Q) / (P + Q) [combined indicator]
+ if (robn) {
+ invn = (double)robn;
+ invproduct = 1.0 - exp(invproduct / invn);
+ product = 1.0 - exp(product / invn);
+ spamicity =
+ (1.0 + (invproduct - product) / (invproduct + product)) / 2.0;
+ } else spamicity = ROBX;
+
if (verbose)
printf("# Spamicity of %f\n", spamicity);
@@ -601,16 +516,12 @@
rc_t status;
double spamicity;
void *PArray = (Pvoid_t) NULL; // JudySL array.
- bogostat_t *stats;
// tokenize input text and save words in a Judy array.
PArray = collect_words(fd);
-// select the best spam/nonspam indicators.
- stats = select_indicators(PArray);
-
// computes the spamicity of the spam/nonspam indicators.
- spamicity = compute_spamicity(stats);
+ spamicity = compute_spamicity(PArray);
status = (spamicity > SPAM_CUTOFF) ? RC_SPAM : RC_NONSPAM;
--
| G r e g L o u i s | gpg public key: |
| http://www.bgl.nu/~glouis | finger greg at bgl.nu |
More information about the Bogofilter
mailing list