Robinson's new f(w) calculation

Greg Louis glouis at dynamicro.on.ca
Sun Sep 22 21:12:25 CEST 2002


Here is another patch against 0.7.4, this time implementing a new form
of the f(w) calculation that Gary described in a posting on the
spambayes list.

--- bogofilter.c.orig	2002-09-19 13:22:21.000000000 -0400
+++ bogofilter.c	2002-09-19 13:36:16.000000000 -0400
@@ -61,6 +61,17 @@
 
 I do the lexical analysis slightly differently, however.
 
+MOD: (Greg Louis <glouis at dynamicro.on.ca>) This version implements Gary
+    Robinson's proposed modifications to the "spamicity" calculation and
+    uses his f(w) individual probability calculation.
+    See
+
+    http://radio.weblogs.com/0101454/stories/2002/09/16/spamDetection.html
+    
+    In addition, this version does not store "extrema."  Instead it accumulates
+    Robinson's P and Q using all words deemed "characteristic," i.e. having
+    a deviation (fabs (0.5f - prob)) >= MIN_DEV, currently set to 0.0.
+
 ******************************************************************************/
 #include <stdio.h>
 #include <math.h>
@@ -78,14 +89,12 @@
 #define HEADER		"# bogofilter email-count (format version B): %lu\n"
 
 // constants for the Graham formula 
-#define HAM_BIAS	2		// give ham words more weight
-#define KEEPERS		15		// how many extrema to keep
-#define MINIMUM_FREQ	5		// minimum freq
-#define UNKNOWN_WORD	0.4f		// odds that unknown word is spammish
-#define SPAM_CUTOFF	0.9f		// if it's spammier than this...
+#define SPAM_CUTOFF	0.52f		// if it's spammier than this...
+#define MIN_DEV		0.0f		// look for characteristic words
 #define MAX_REPEATS	4		// cap on word frequency per message
 
-#define DEVIATION(n)	fabs((n) - 0.5f)	// deviation from average
+#define ROBS 0.001f			// Robinson's a
+#define ROBX 0.415f			// Robinson's x
 
 #define max(x, y)	(((x) > (y)) ? (x) : (y))
 #define min(x, y)	(((x) < (y)) ? (x) : (y))
@@ -410,43 +419,6 @@
     }
 }
 
-#ifdef __UNUSED__
-void logprintf(const char *fmt, ... )
-// log data from server
-{
-    char buf[BUFSIZ];
-    va_list ap;
-    int fd;
-
-    va_start(ap, fmt);
-    vsnprintf(buf, sizeof(buf), fmt, ap);
-    va_end(ap);
-    
-    fd=open("/tmp/bogolog", O_RDWR|O_CREAT|O_APPEND,0700);
-    write(fd,buf,strlen(buf));
-    close(fd);
-}
-#endif // __UNUSED__
-
-typedef struct 
-{
-    char        key[MAXWORDLEN+1];
-    double      prob;
-}
-discrim_t;
-
-typedef struct
-{
-    discrim_t extrema[KEEPERS];
-}
-bogostat_t;
-
-int compare_stats(discrim_t *d1, discrim_t *d2)
-{ 
-    return ( (d1->prob > d2->prob) ||
-	     ((d1->prob == d2->prob) && (strcmp(d1->key, d2->key) > 0)));
-}
-
 void *collect_words(int fd)
 // tokenize input text and save words in a Judy array.
 // returns:  the Judy array
@@ -472,35 +444,25 @@
     return PArray;
 }
 
-bogostat_t *select_indicators(void  *PArray)
+double compute_spamicity(void  *PArray)
 // selects the best spam/nonspam indicators and
-// populates the stats structure.
+// calculates Robinson's S.
 {
     void	**loc;
     char	tokenbuffer[BUFSIZ];
 
-    discrim_t *pp, *hit;
-    static bogostat_t stats;
-    
-#ifdef NON_EQUIPROBABLE
-    // There is an argument that we should by by number of *words* here.
-    double	msg_prob = (spam_list.msgcount / ham_list.msgcount);
-#endif // NON_EQUIPROBABLE
+    double invproduct = 0.0;	// Robinson's P
+    double product = 0.0;	// Robinson's Q
+    double spamicity, invn;
+    int robn = 0;
 
-    for (pp = stats.extrema; pp < stats.extrema+sizeof(stats.extrema)/sizeof(*stats.extrema); pp++)
-    {
- 	pp->prob = 0.5f;
- 	pp->key[0] = '\0';
-    }
- 
     yytext = tokenbuffer;
     for (loc  = JudySLFirst(PArray, tokenbuffer, 0);
 	 loc != (void *) NULL;
 	 loc  = JudySLNext(PArray, tokenbuffer, 0))
     {
 	double prob;
-	double dev;
-	double hamness, spamness, slotdev, hitdev;
+	double hamness, spamness;
 
 	hamness = getcount(yytext, &ham_list);
 	spamness  = getcount(yytext, &spam_list);
@@ -514,81 +476,34 @@
 	//  	    (double (/ 
 	// 		    (min 1 (/ b nspam)) 
 	// 		    (+ (min 1 (/ g nham)) (min 1 (/ b nspam)))))))))
-	// This assumes that spam and non-spam are equiprobable.
-	hamness *= HAM_BIAS;
-	if (hamness + spamness < MINIMUM_FREQ)
-#ifdef NON_EQUIPROBABLE
-	    // In the absence of evidence, the probability that a new word
-	    // will be spam is the historical ratio of spam words to
-	    // nonspam words.
-	    prob = msg_prob;
-#else
-	    prob = UNKNOWN_WORD;
-#endif // NON_EQUIPROBABLE
-	else
 	{
-	    register double pb = min(1, (spamness / spam_list.msgcount));
-	    register double pg = min(1, (hamness / ham_list.msgcount));
-
-#ifdef NON_EQUIPROBABLE
-	    prob = (pb * msg_prob) / ((pg * (1 - msg_prob)) + (pb * msg_prob));
-#else
-	    prob = pb / (pg + pb);
-#endif // NON_EQUIPROBABLE
-	    prob = min(prob, 0.99);
-	    prob = max(prob, 0.01);
+	    register double pb = spamness;
+	    register double pg = spam_list.msgcount * hamness / ham_list.msgcount;
+            // Robinson's f(w) = (s * x + n * p(w)) / (s + n)
+            // p(w) is pb / (pb + pg); n is pb + pg; so n * p(w) is pb
+	    prob = (ROBS * ROBX + pb) / (ROBS + pg + pb);
 	}
 
-	// update the list of tokens with maximum deviation
-	dev = DEVIATION(prob);
-        hit = NULL;
-        hitdev=1;
-	for (pp = stats.extrema; pp < stats.extrema+sizeof(stats.extrema)/sizeof(*stats.extrema); pp++)
-        {
-	    slotdev=DEVIATION(pp->prob);
-	    if (dev>slotdev && hitdev>slotdev)
-	    {
-		hit=pp;
-		hitdev=slotdev;
-            }
-        }
-        if (hit) 
-	{ 
-	    hit->prob = prob;
-	    strncpy(hit->key, yytext, MAXWORDLEN);
+        // Robinson's P and Q; accumulation step
+        // P = 1 - ((1-p1)*(1-p2)*...*(1-pn))^(1/n)     [spamminess]
+        // Q = 1 - (p1*p2*...*pn)^(1/n)                 [non-spamminess]
+        if (fabs(0.5 - prob) >= MIN_DEV) {
+            invproduct += log(1.0 - prob);
+            product += log(prob);
+            robn ++;
 	}
     }
-    return (&stats);
-}
-
-double compute_spamicity(bogostat_t *stats)
-// computes the spamicity of the words in the bogostat structure
-// returns:  the spamicity
-{
-    double product, invproduct;
-    double spamicity = 0.0;
-
-    discrim_t *pp;
-
-    if (verbose)
-    {
-	// put the stats in ascending order by probability and alphabet
-	qsort(stats->extrema, KEEPERS, sizeof(discrim_t), compare_stats);
-    }
-
-    // Bayes' theorem.
-    // For discussion, see <http://www.mathpages.com/home/kmath267.htm>.
-    product = invproduct = 1.0f;
-    for (pp = stats->extrema; pp < stats->extrema+sizeof(stats->extrema)/sizeof(*stats->extrema); pp++)
-	if (pp->prob != 0)
-	{
-	    product *= pp->prob;
-	    invproduct *= (1 - pp->prob);
-	    spamicity = product / (product + invproduct);
-	    if (verbose)
-		printf("#  %f  %f  %s\n", pp->prob, spamicity, pp->key);
-	}
 
+    // Robinson's P, Q and S
+    // S = (P - Q) / (P + Q)                        [combined indicator]
+    if (robn) {
+        invn = (double)robn;
+        invproduct = 1.0 - exp(invproduct / invn);
+        product = 1.0 - exp(product / invn);
+        spamicity =
+            (1.0 + (invproduct - product) / (invproduct + product)) / 2.0;
+    } else spamicity = ROBX;
+                                                        
     if (verbose)
 	printf("#  Spamicity of %f\n", spamicity);
 
@@ -601,16 +516,12 @@
     rc_t	status;
     double 	spamicity;
     void	*PArray = (Pvoid_t) NULL;	// JudySL array.
-    bogostat_t	*stats;
 
 //  tokenize input text and save words in a Judy array.
     PArray = collect_words(fd);
     
-//  select the best spam/nonspam indicators.
-    stats = select_indicators(PArray);
-    
 //  computes the spamicity of the spam/nonspam indicators.
-    spamicity = compute_spamicity(stats);
+    spamicity = compute_spamicity(PArray);
 
     status = (spamicity > SPAM_CUTOFF) ? RC_SPAM : RC_NONSPAM;
 


-- 
| G r e g  L o u i s          | gpg public key:      |
|   http://www.bgl.nu/~glouis |   finger greg at bgl.nu |

>From  Thu 6 May 2004 12:51:12 2004
Return-Path: bogofilter-return-70-relson=osagesoftware.com at aotto.com
Return-Path: <bogofilter-return-70-relson=osagesoftware.com at aotto.com>
X-Original-To: relson at osagesoftware.com
Delivered-To: relson at osagesoftware.com
Received: from lists.refdesk.com (ns1.drudgereport.com [216.40.241.219])
	by mail.osagesoftware.com (Postfix) with SMTP id E75A62FEA6
	for <relson at osagesoftware.com>; Thu,  6 May 2004 12:51:12 -0400 (EDT)
Received: (qmail 19760 invoked by alias); 6 May 2004 16:50:15 -0000
Mailing-List: contact bogofilter-help at aotto.com; run by ezmlm
Precedence: bulk
List-Id: <bogofilter at aotto.com>
List-Post: <mailto:bogofilter at aotto.com>
List-Help: <mailto:bogofilter-help at aotto.com>
List-Unsubscribe: <mailto:bogofilter-unsubscribe at aotto.com>
List-Subscribe: <mailto:bogofilter-subscribe at aotto.com>
Delivered-To: mailing list bogofilter at aotto.com
X-Ezauth: gfjmbclieapgjocacikg
Received: (qmail 19750 invoked from network); 6 May 2004 16:50:15 -0000
Date: Thu, 6 May 2004 09:51:41 -0700
From: Ben Damm <bdamm-bogofilter at dammfine.com>
To: bogofilter <bogofilter at aotto.com>
Message-ID: <20040506165141.GO28747 at f00f.net>
References: <409A0A55.9020805 at tacocat.net>
Mime-Version: 1.0
Content-Type: text/plain; charset=us-ascii
Content-Disposition: inline
In-Reply-To: <409A0A55.9020805 at tacocat.net>
User-Agent: Mutt/1.3.28i
X-Uptime: 09:36:55 up 57 days, 10:10, 20 users,  load average: 0.01, 0.02, 0.00
Subject: [bogofilter] Re: redundancy
X-Bogosity: Ham, tests=bogofilter, spamicity=0.000000, version=0.17.5.cvs
Status:   

This brings up something that has been on my mind lately.  I've noticed
that some spammers are adding many copies of my own email address to
random locations in the body of the email.  What this says to me is that
my email address will become a spammy token.  I suspect that this will
not significantly change the bogosity score of otherwise valid messages,
but can someone confirm this?

Thanks,
-Ben

On Thu, May 06, 2004 at 05:50:13AM -0400, Tom Allison wrote:
> I'm not sure I follow all of this, but it seems that the article on ESF 
> made the following assumptions:
> 
> That the spamminess of an email with one incident of the word, "foo" 
> tends to score the same as two incidents of the word, "foo".
> So he goes through this sqrt()/2 argument to show that you can reduce 
> the "effective size" of the email message and thereby correct for spam.
> 
> It occurred to me that since he's taking this from a model of protein 
> selection he's assuming a degree of assumption in what the item under 
> inspection is.
> 
> But under email, we can decisively remove all of the duplication for 
> each token that occurs within an email with ease and certainty. 
> Therefore, I'm not certain that the statistical estimation of the email's 
> size using ESF is going to be any more accurate or simpler than 
> explicitly removing all duplicate appearances of a token from spam scoring.
> 
> That, or I really don't understand the article that well and didn't get 
> enough sleep last night.

-- 
Just because your doctor has a name for your condition doesn't mean he
knows what it is.
>From  Tue 4 May 2004 13:59:12 2004
Return-Path: tallison at tacocat.net
Return-Path: <tallison at tacocat.net>
X-Original-To: relson at osagesoftware.com
Delivered-To: relson at osagesoftware.com
Received: from ms-smtp-04-eri0.ohiordc.rr.com (ms-smtp-04-smtplb.ohiordc.rr.com [65.24.5.138])
	by mail.osagesoftware.com (Postfix) with ESMTP id CB88E2FEA6
	for <relson at osagesoftware.com>; Tue,  4 May 2004 13:59:12 -0400 (EDT)
Received: from janus.tacocat.net (cpe-069-133-095-206.twmi.rr.com [69.133.95.206])
	by ms-smtp-04-eri0.ohiordc.rr.com (8.12.10/8.12.7) with ESMTP id i44Hx9Or025642
	for <relson at osagesoftware.com>; Tue, 4 May 2004 13:59:09 -0400 (EDT)
Received: from localhost (localhost [127.0.0.1])
	by janus.tacocat.net (Postfix) with ESMTP id 04B5821315D
	for <relson at osagesoftware.com>; Tue,  4 May 2004 13:59:09 -0400 (EDT)
Received: from janus.tacocat.net ([127.0.0.1])
 by localhost (janus [127.0.0.1]) (amavisd-new, port 10024) with ESMTP
 id 29751-07 for <relson at osagesoftware.com>;
 Tue,  4 May 2004 13:59:08 -0400 (EDT)
Received: from www.tacocat.net (localhost [127.0.0.1])
	by janus.tacocat.net (Postfix) with SMTP id 7C38321315C
	for <relson at osagesoftware.com>; Tue,  4 May 2004 13:59:08 -0400 (EDT)
Received: from 198.208.159.14 (proxying for unknown)
        (SquirrelMail authenticated user tallison)
        by www.tacocat.net with HTTP;
        Tue, 4 May 2004 13:59:08 -0400 (EDT)
Message-ID: <53195.198.208.159.14.1083693548.squirrel at www.tacocat.net>
In-Reply-To: <20040504135240.32935df8 at osage.osagesoftware.com>
References: 
    <40961B08.7040602 at tacocat.net><31604.198.208.159.14.1083692708.squirrel at www.tacocat.net>
    <20040504135240.32935df8 at osage.osagesoftware.com>
Date: Tue, 4 May 2004 13:59:08 -0400 (EDT)
Subject: Re: [bogofilter] test
From: tallison at tacocat.net
To: "David Relson" <relson at osagesoftware.com>
User-Agent: SquirrelMail/1.4.1
MIME-Version: 1.0
Content-Type: text/plain;charset=iso-8859-1
Content-Transfer-Encoding: 8bit
X-Priority: 3
Importance: Normal
X-Virus-Scanned: Symantec AntiVirus Scan Engine
X-Virus-Scanned: by amavisd-new at tacocat.net
X-Bogosity: Ham, tests=bogofilter, spamicity=0.000001, version=0.17.5.cvs
Status:   

> On Tue, 4 May 2004 13:45:08 -0400 (EDT)
> tallison at tacocat.net wrote:
>
>> > no mail in a weekend..
>> >
>> >
>> Took a while.
>> Must have been a bogon headwind.
>
> Indeed, a slow message...  The lists _were_ down.
>
>

Cool!
If I set it up correctly, I'll have my first data set from my tests.

I think I'm going to try something similar to what you suggested on the
folding, but with a simpleton's approach for now.
I have 4000 spam, 7000 ham.
I was going to run four data sets
first 1000 from each
first 2000 from each
last 1000 from each
last 2000 from each

Maybe I'll have it all done by Friday.
>From  May 2004 13:52:40 -040 Tue 4 May 2004 13:45:08 2004
Return-Path: tallison at tacocat.net
Date: Tue, 4 May 2004 13:52:40 -0400
From: David Relson <relson at osagesoftware.com>
To: Tom Allison <tallison at tacocat.net>
Subject: Re: [bogofilter] test
Message-Id: <20040504135240.32935df8 at osage.osagesoftware.com>
In-Reply-To: <31604.198.208.159.14.1083692708.squirrel at www.tacocat.net>
References: <40961B08.7040602 at tacocat.net>
	<31604.198.208.159.14.1083692708.squirrel at www.tacocat.net>
Organization: Osage Software Systems, Inc.
X-Mailer: Sylpheed version 0.9.10claws51 (GTK+ 1.2.10; i686-pc-linux-gnu)
Mime-Version: 1.0
Content-Type: text/plain; charset=US-ASCII
Content-Transfer-Encoding: 7bit

On Tue, 4 May 2004 13:45:08 -0400 (EDT)
tallison at tacocat.net wrote:

> > no mail in a weekend..
> >
> >
> Took a while.
> Must have been a bogon headwind.

Indeed, a slow message...  The lists _were_ down.
>From  Tue 4 May 2004 13:45:19 2004
Return-Path: bogofilter-return-50-relson=osagesoftware.com at aotto.com
Return-Path: <bogofilter-return-50-relson=osagesoftware.com at aotto.com>
X-Original-To: relson at osagesoftware.com
Delivered-To: relson at osagesoftware.com
Received: from lists.refdesk.com (ns1.drudgereport.com [216.40.241.219])
	by mail.osagesoftware.com (Postfix) with SMTP id AB8652FEA6
	for <relson at osagesoftware.com>; Tue,  4 May 2004 13:45:19 -0400 (EDT)
Received: (qmail 5367 invoked by alias); 4 May 2004 17:44:34 -0000
Mailing-List: contact bogofilter-help at aotto.com; run by ezmlm
Precedence: bulk
List-Id: <bogofilter at aotto.com>
List-Post: <mailto:bogofilter at aotto.com>
List-Help: <mailto:bogofilter-help at aotto.com>
List-Unsubscribe: <mailto:bogofilter-unsubscribe at aotto.com>
List-Subscribe: <mailto:bogofilter-subscribe at aotto.com>
Delivered-To: mailing list bogofilter at aotto.com
X-Ezauth: egfpbpkiodkdhfffjmlf
Received: (qmail 5355 invoked from network); 4 May 2004 17:44:34 -0000
Message-ID: <31604.198.208.159.14.1083692708.squirrel at www.tacocat.net>
In-Reply-To: <40961B08.7040602 at tacocat.net>
References: <40961B08.7040602 at tacocat.net>
Date: Tue, 4 May 2004 13:45:08 -0400 (EDT)
From: tallison at tacocat.net
To: "bogofilter" <bogofilter at aotto.com>
User-Agent: SquirrelMail/1.4.1
MIME-Version: 1.0
Content-Type: text/plain;charset=iso-8859-1
Content-Transfer-Encoding: 8bit
X-Priority: 3
Importance: Normal
X-Virus-Scanned: Symantec AntiVirus Scan Engine
X-Virus-Scanned: by amavisd-new at tacocat.net
Subject: Re: [bogofilter] test
X-Bogosity: Ham, tests=bogofilter, spamicity=0.000000, version=0.17.5.cvs
Status:   

> no mail in a weekend..
>
>
Took a while.
Must have been a bogon headwind.
>From  Tue 4 May 2004 09:55:05 2004
Return-Path: bogofilter-return-49-relson=osagesoftware.com at aotto.com
Return-Path: <bogofilter-return-49-relson=osagesoftware.com at aotto.com>
X-Original-To: relson at osagesoftware.com
Delivered-To: relson at osagesoftware.com
Received: from lists.refdesk.com (ns1.drudgereport.com [216.40.241.219])
	by mail.osagesoftware.com (Postfix) with SMTP id DE9752FEA6
	for <relson at osagesoftware.com>; Tue,  4 May 2004 09:55:05 -0400 (EDT)
Received: (qmail 24936 invoked by alias); 4 May 2004 13:54:23 -0000
Mailing-List: contact bogofilter-help at aotto.com; run by ezmlm
Precedence: bulk
List-Id: <bogofilter at aotto.com>
List-Post: <mailto:bogofilter at aotto.com>
List-Help: <mailto:bogofilter-help at aotto.com>
List-Unsubscribe: <mailto:bogofilter-unsubscribe at aotto.com>
List-Subscribe: <mailto:bogofilter-subscribe at aotto.com>
Delivered-To: mailing list bogofilter at aotto.com
X-Ezauth: fegbnaioodombbkbdapl
Received: (qmail 24925 invoked from network); 4 May 2004 13:54:23 -0000
Message-ID: <40961B08.7040602 at tacocat.net>
Date: Mon, 03 May 2004 06:12:24 -0400
From: Tom Allison <tallison at tacocat.net>
User-Agent: Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.4) Gecko/20030821
X-Accept-Language: en-us, en
MIME-Version: 1.0
To: bogofilter <bogofilter at aotto.com>
X-Enigmail-Version: 0.76.5.0
X-Enigmail-Supports: pgp-inline, pgp-mime
Content-Type: text/plain; charset=us-ascii; format=flowed
Content-Transfer-Encoding: 7bit
X-Virus-Scanned: Symantec AntiVirus Scan Engine
X-Virus-Scanned: by amavisd-new at tacocat.net
Subject: [bogofilter] test
X-Bogosity: Ham, tests=bogofilter, spamicity=0.000000, version=0.17.5.cvs
Status:   

no mail in a weekend..
>From  Sat 1 May 2004 15:00:32 2004
Return-Path: bogofilter-return-48-relson=osagesoftware.com at aotto.com
Return-Path: <bogofilter-return-48-relson=osagesoftware.com at aotto.com>
X-Original-To: relson at osagesoftware.com
Delivered-To: relson at osagesoftware.com
Received: from lists.refdesk.com (ns1.drudgereport.com [216.40.241.219])
	by mail.osagesoftware.com (Postfix) with SMTP id 55E051FE59
	for <relson at osagesoftware.com>; Sat,  1 May 2004 15:00:32 -0400 (EDT)
Received: (qmail 17521 invoked by alias); 1 May 2004 19:00:09 -0000
Mailing-List: contact bogofilter-help at aotto.com; run by ezmlm
Precedence: bulk
List-Id: <bogofilter at aotto.com>
List-Post: <mailto:bogofilter at aotto.com>
List-Help: <mailto:bogofilter-help at aotto.com>
List-Unsubscribe: <mailto:bogofilter-unsubscribe at aotto.com>
List-Subscribe: <mailto:bogofilter-subscribe at aotto.com>
Delivered-To: mailing list bogofilter at aotto.com
X-Ezauth: eaohmdeleekbagljjhbi
Received: (qmail 17511 invoked from network); 1 May 2004 19:00:09 -0000
Message-ID: <4093F3C1.60901 at tacocat.net>
Date: Sat, 01 May 2004 15:00:17 -0400
From: Tom Allison <tallison at tacocat.net>
User-Agent: Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.4.2) Gecko/20040220
X-Accept-Language: en-us, en
MIME-Version: 1.0
To: bogofilter <bogofilter at aotto.com>
References: <20040428085614.50d48857 at atipa.local>	 <40922B8E.9090502 at tacocat.net> <1083327404.14629.615.camel at linuxpc>	 <409381A7.1070306 at tacocat.net> <1083432750.14620.673.camel at linuxpc>
In-Reply-To: <1083432750.14620.673.camel at linuxpc>
X-Enigmail-Version: 0.76.8.0
X-Enigmail-Supports: pgp-inline, pgp-mime
Content-Type: text/plain; charset=us-ascii; format=flowed
Content-Transfer-Encoding: 7bit
X-Virus-Scanned: Symantec AntiVirus Scan Engine
X-Virus-Scanned: by amavisd-new at tacocat.net
Subject: Re: [bogofilter] using block_on_subnets
X-Bogosity: Ham, tests=bogofilter, spamicity=0.000000, version=0.17.5
Status:   

Tom Anderson wrote:
> 
>>Then I am going to repeat this test without spamitarium involved in 
>>either the creation of the database or the testing of the archives.
>>Will '-w' suffice?
>>Any other recommendations?
> 
> 
> Only to keep your bogofilter configuration the same between the tests,
> and it would be nice if you published it with your results as well.  In
> your non-spamitarium run, do you plan on using block_on_subnets?  It
> would actually be interesting to see that as a separate test.  Wordlist
> size from each test set would be a welcome report as well.
> 

Hmmm....

I think I'm already looking at ~8 hours to get all the tests done that I 
have right now.  Adding variations for block_on_subnets should push it 
to 16 hours for the raw test results.
>From  Sat 1 May 2004 13:31:45 2004
Return-Path: bogofilter-return-47-relson=osagesoftware.com at aotto.com
Return-Path: <bogofilter-return-47-relson=osagesoftware.com at aotto.com>
X-Original-To: relson at osagesoftware.com
Delivered-To: relson at osagesoftware.com
Received: from lists.refdesk.com (ns1.drudgereport.com [216.40.241.219])
	by mail.osagesoftware.com (Postfix) with SMTP id 9AA701FE59
	for <relson at osagesoftware.com>; Sat,  1 May 2004 13:31:45 -0400 (EDT)
Received: (qmail 20434 invoked by alias); 1 May 2004 17:31:25 -0000
Mailing-List: contact bogofilter-help at aotto.com; run by ezmlm
Precedence: bulk
List-Id: <bogofilter at aotto.com>
List-Post: <mailto:bogofilter at aotto.com>
List-Help: <mailto:bogofilter-help at aotto.com>
List-Unsubscribe: <mailto:bogofilter-unsubscribe at aotto.com>
List-Subscribe: <mailto:bogofilter-subscribe at aotto.com>
Delivered-To: mailing list bogofilter at aotto.com
X-Ezauth: dnhmlhlojjdpblggkibn
Received: (qmail 20424 invoked from network); 1 May 2004 17:31:25 -0000
From: Tom Anderson <tanderso at oac-design.com>
To: bogofilter <bogofilter at aotto.com>
In-Reply-To: <409381A7.1070306 at tacocat.net> 
References: <20040428085614.50d48857 at atipa.local>
	 <40922B8E.9090502 at tacocat.net> <1083327404.14629.615.camel at linuxpc>
	 <409381A7.1070306 at tacocat.net> 
Content-Type: multipart/signed; micalg=pgp-sha1; protocol="application/pgp-signature"; boundary="=-yTsRWdwBaFgb8fIykxMu"
Organization: 
Message-Id: <1083432750.14620.673.camel at linuxpc>
Mime-Version: 1.0
X-Mailer: Ximian Evolution 1.2.4 
Date: 01 May 2004 13:33:06 -0400
Subject: Re: [bogofilter] using block_on_subnets
X-Bogosity: Ham, tests=bogofilter, spamicity=0.000000, version=0.17.5
Status:   

--=-yTsRWdwBaFgb8fIykxMu
Content-Type: text/plain
Content-Transfer-Encoding: quoted-printable

On Sat, 2004-05-01 at 06:53, Tom Allison wrote:
> I want to run some tests on this but I need to confirm something first.

That's great.  I look forward to seeing the results.

> My test plan was to read maildir format message from stdin and use them=20
> to both build and then test my spam on the fly.  Without creating new=20
> messages all the time.
>=20
> In order to do that, I was hoping I could do something like this:
>=20
> cat $F | spamitarium.pl -sread | bogofilter -v

That looks fine to me, except that you want to pass the "w" flag to
spamitarium and the appropriate flags for registration/classification to
bogofilter.  I usually use the below format in testing...

 spamitarium -sreadw < $F | bogofilter -vvv

But I don't see anything wrong with piping from "cat".

> I believe that I will loose the BODY of every message doing this.
> if I include '-w' will I be able to capture the complete email body=20
> consistently?

Yes, the "w" flag will output the body.  Normally spamitarium reads the
header only (until the first blank line or end-of-input), as procmail
can deal with just filtering the header, so including the body would be
extra useless overhead.  I included the "w" option for command-line
testing, just as you'd like to do.  After the normal header filtering,
if "w" was passed in, it'll continue to read STDIN and output it
untouched so that the whole email can be passed to bogofilter. =20

By all means, test a few emails first to be sure there are no
inconsistencies that you can see.  It has only been tested by myself, so
I can't guarantee there are zero bugs.  However, I've been running it on
my own email for something like a month now, and I haven't seen any
problems.

The only part of the whole thing I'm not confident with is the CRLF line
termination.  RFC 822, 2822, etc., specify ASCII 13 followed by ASCII 10
(decimal), but when I used "\015\012" (octal form of the
aforementioned), Outlook Express didn't like it.  I changed it to just
"\n" and it worked OK in both OE and Evolution, so I left it like that.=20
I've considered also "\cM\cJ", but haven't tried it yet.  Please let me
know if you discover any problems due to the line termination
characters.

> I am going to use this approach to both build a word database and then=20
> to test my archives against that word database, all using the ASN=20
> approach you mention.

Sounds terrific.

> Then I am going to repeat this test without spamitarium involved in=20
> either the creation of the database or the testing of the archives.
> Will '-w' suffice?
> Any other recommendations?

Only to keep your bogofilter configuration the same between the tests,
and it would be nice if you published it with your results as well.  In
your non-spamitarium run, do you plan on using block_on_subnets?  It
would actually be interesting to see that as a seperate test.  Wordlist
size from each test set would be a welcome report as well.

Tom


--=-yTsRWdwBaFgb8fIykxMu
Content-Type: application/pgp-signature; name=signature.asc
Content-Description: This is a digitally signed message part

-----BEGIN PGP SIGNATURE-----
Version: GnuPG v1.2.1 (GNU/Linux)

iD8DBQBAk98uaTUjGSdb2eIRAudFAJ9mryFBcX+GWlFfjU2TKF29P6AJrgCZARY1
UVOTb8XtfopM9lLyzguLWYM=
=O8fO
-----END PGP SIGNATURE-----

--=-yTsRWdwBaFgb8fIykxMu--
>From  Sat 8 May 2004 14:05:30 2004
Return-Path: bogofilter-return-82-relson=osagesoftware.com at aotto.com
Return-Path: <bogofilter-return-82-relson=osagesoftware.com at aotto.com>
X-Original-To: relson at osagesoftware.com
Delivered-To: relson at osagesoftware.com
Received: from lists.refdesk.com (ns1.drudgereport.com [216.40.241.219])
	by mail.osagesoftware.com (Postfix) with SMTP id 5ABE230899
	for <relson at osagesoftware.com>; Sat,  8 May 2004 14:05:30 -0400 (EDT)
Received: (qmail 19454 invoked by alias); 8 May 2004 17:04:08 -0000
Mailing-List: contact bogofilter-help at aotto.com; run by ezmlm
Precedence: bulk
List-Id: <bogofilter at aotto.com>
List-Post: <mailto:bogofilter at aotto.com>
List-Help: <mailto:bogofilter-help at aotto.com>
List-Unsubscribe: <mailto:bogofilter-unsubscribe at aotto.com>
List-Subscribe: <mailto:bogofilter-subscribe at aotto.com>
Delivered-To: mailing list bogofilter at aotto.com
X-Ezauth: dpkagbipdkkaeceddifh
Received: (qmail 19432 invoked from network); 8 May 2004 17:04:08 -0000
Message-ID: <409D1349.40103 at tacocat.net>
Date: Sat, 08 May 2004 13:05:13 -0400
From: Tom Allison <tallison at tacocat.net>
User-Agent: Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.4.2) Gecko/20040220
X-Accept-Language: en-us, en
MIME-Version: 1.0
To: bogofilter <bogofilter at aotto.com>
X-Enigmail-Version: 0.76.8.0
X-Enigmail-Supports: pgp-inline, pgp-mime
Content-Type: multipart/mixed;
 boundary="------------010907060305050000060207"
X-Virus-Scanned: Symantec AntiVirus Scan Engine
X-Virus-Scanned: by amavisd-new at tacocat.net
Subject: [bogofilter] spamitarium test results
X-Bogosity: Ham, tests=bogofilter, spamicity=0.000000, version=0.17.5.cvs
Status:   

--------------010907060305050000060207
Content-Type: text/plain; charset=us-ascii; format=flowed
Content-Transfer-Encoding: 7bit

OK, I think I finished after 30+ hours of crunching the following.

I broke up my 11,000 emails into three paired groups for spam/ham using 
one third of my total archive for each group.

Test variations were:
robs=(0.01, 0.1, 1.0)
block_on_subnets=(yes, no)
spamitarium => (not used, radw, sradw, sreadw, readw)
with a three folded data set (or however you're supposed to describe it) 
for a total of some 90 test runs.

Method for final scoring:
For all tests given one set of corpus (ie: set one for training, set two 
and three for testing) I determined the best scores with respect to:
highest count of correct readings (ham corpus, score=no)
lowest count of unsure readings (ham corpus, score= unsure)
lowest count of incorrect readings (ham corpus, score = yes)

I then tallied up which test parameters resulted in how many of these 
matches.  That is to say: for the test corpus and a given set of test 
parameters (robs, subnets...) how many times did the resulting scores 
match the best scores seen for that test corpus for all test parameters.

Results were interesting:
For each group of emails, the results were consistent and the values 
themselves were reasonably grouped.  There was a distinct difference 
between how a final score was achieved with spamitarium showing the most 
accurate positive results when using block_on_subnets=yes and robs=1.0 
for all spamitarium arguments.  In fact, the differences between the 
raw scores of the different settings were negligible.  My guess is the 
DNS information added is more important than the Header information that 
is stripped.  I did not run any tests to simply strip the headers 
without DNS/ASN information being added.

However, spamitarium also had the worst false positives (not counting 
Unsure) in the groups tested.

Without running spamitarium, the best results were achieved with 
block_on_subnets=yes and robs=0.01 or robs=1.0.  While they did not show 
the best positive scores, they showed the best values for false 
positives and Unsure scores.  The very best settings found for 
minimizing false-positives was robs=0.01, block_on_subnets=yes

I guess it's a matter of how you would like to judge the accuracy of the 
process, most positives or least false positives.

--------------010907060305050000060207
Content-Type: text/plain;
 name="spamitariumtest.csv"
Content-Transfer-Encoding: 7bit
Content-Disposition: inline;
 filename="spamitariumtest.csv"

ham,Yes,1.0,yes,none,1,4857,5
ham,Unsure,1.0,yes,none,1,4857,9
ham,No,1.0,yes,none,1,4857,4843
spam,Yes,1.0,yes,none,1,2892,2766
spam,Unsure,1.0,yes,none,1,2892,77
spam,No,1.0,yes,none,1,2892,49
ham,Yes,1.0,yes,none,2,4849,5
ham,Unsure,1.0,yes,none,2,4849,12
ham,No,1.0,yes,none,2,4849,4832
spam,Yes,1.0,yes,none,2,2889,2752
spam,Unsure,1.0,yes,none,2,2889,70
spam,No,1.0,yes,none,2,2889,67
ham,Yes,1.0,yes,none,3,4854,7
ham,Unsure,1.0,yes,none,3,4854,20
ham,No,1.0,yes,none,3,4854,4827
spam,Yes,1.0,yes,none,3,2891,2777
spam,Unsure,1.0,yes,none,3,2891,63
spam,No,1.0,yes,none,3,2891,51
ham,Yes,1.0,yes,radw,1,4898,6
ham,Unsure,1.0,yes,radw,1,4898,27
ham,No,1.0,yes,radw,1,4898,4865
spam,Yes,1.0,yes,radw,1,2908,2777
spam,Unsure,1.0,yes,radw,1,2908,78
spam,No,1.0,yes,radw,1,2908,53
ham,Yes,1.0,yes,radw,2,4886,6
ham,Unsure,1.0,yes,radw,2,4886,19
ham,No,1.0,yes,radw,2,4886,4861
spam,Yes,1.0,yes,radw,2,2921,2765
spam,Unsure,1.0,yes,radw,2,2921,80
spam,No,1.0,yes,radw,2,2921,76
ham,Yes,1.0,yes,radw,3,4898,7
ham,Unsure,1.0,yes,radw,3,4898,28
ham,No,1.0,yes,radw,3,4898,4863
spam,Yes,1.0,yes,radw,3,2917,2787
spam,Unsure,1.0,yes,radw,3,2917,76
spam,No,1.0,yes,radw,3,2917,54
ham,Yes,1.0,yes,sradw,1,4898,6
ham,Unsure,1.0,yes,sradw,1,4898,27
ham,No,1.0,yes,sradw,1,4898,4865
spam,Yes,1.0,yes,sradw,1,2908,2777
spam,Unsure,1.0,yes,sradw,1,2908,78
spam,No,1.0,yes,sradw,1,2908,53
ham,Yes,1.0,yes,sradw,2,4886,6
ham,Unsure,1.0,yes,sradw,2,4886,19
ham,No,1.0,yes,sradw,2,4886,4861
spam,Yes,1.0,yes,sradw,2,2921,2765
spam,Unsure,1.0,yes,sradw,2,2921,80
spam,No,1.0,yes,sradw,2,2921,76
ham,Yes,1.0,yes,sradw,3,4898,7
ham,Unsure,1.0,yes,sradw,3,4898,28
ham,No,1.0,yes,sradw,3,4898,4863
spam,Yes,1.0,yes,sradw,3,2917,2787
spam,Unsure,1.0,yes,sradw,3,2917,76
spam,No,1.0,yes,sradw,3,2917,54
ham,Yes,1.0,yes,sreadw,1,4898,6
ham,Unsure,1.0,yes,sreadw,1,4898,27
ham,No,1.0,yes,sreadw,1,4898,4865
spam,Yes,1.0,yes,sreadw,1,2908,2777
spam,Unsure,1.0,yes,sreadw,1,2908,78
spam,No,1.0,yes,sreadw,1,2908,53
ham,Yes,1.0,yes,sreadw,2,4886,6
ham,Unsure,1.0,yes,sreadw,2,4886,19
ham,No,1.0,yes,sreadw,2,4886,4861
spam,Yes,1.0,yes,sreadw,2,2921,2765
spam,Unsure,1.0,yes,sreadw,2,2921,80
spam,No,1.0,yes,sreadw,2,2921,76
ham,Yes,1.0,yes,sreadw,3,4898,7
ham,Unsure,1.0,yes,sreadw,3,4898,28
ham,No,1.0,yes,sreadw,3,4898,4863
spam,Yes,1.0,yes,sreadw,3,2917,2787
spam,Unsure,1.0,yes,sreadw,3,2917,76
spam,No,1.0,yes,sreadw,3,2917,54
ham,Yes,1.0,yes,readw,1,4898,6
ham,Unsure,1.0,yes,readw,1,4898,27
ham,No,1.0,yes,readw,1,4898,4865
spam,Yes,1.0,yes,readw,1,2908,2777
spam,Unsure,1.0,yes,readw,1,2908,78
spam,No,1.0,yes,readw,1,2908,53
ham,Yes,1.0,yes,readw,2,4886,6
ham,Unsure,1.0,yes,readw,2,4886,19
ham,No,1.0,yes,readw,2,4886,4861
spam,Yes,1.0,yes,readw,2,2921,2765
spam,Unsure,1.0,yes,readw,2,2921,80
spam,No,1.0,yes,readw,2,2921,76
ham,Yes,1.0,yes,readw,3,4898,7
ham,Unsure,1.0,yes,readw,3,4898,28
ham,No,1.0,yes,readw,3,4898,4863
spam,Yes,1.0,yes,readw,3,2917,2787
spam,Unsure,1.0,yes,readw,3,2917,76
spam,No,1.0,yes,readw,3,2917,54
ham,Yes,1.0,no,none,1,4857,4
ham,Unsure,1.0,no,none,1,4857,10
ham,No,1.0,no,none,1,4857,4843
spam,Yes,1.0,no,none,1,2892,2750
spam,Unsure,1.0,no,none,1,2892,93
spam,No,1.0,no,none,1,2892,49
ham,Yes,1.0,no,none,2,4849,6
ham,Unsure,1.0,no,none,2,4849,12
ham,No,1.0,no,none,2,4849,4831
spam,Yes,1.0,no,none,2,2889,2742
spam,Unsure,1.0,no,none,2,2889,81
spam,No,1.0,no,none,2,2889,66
ham,Yes,1.0,no,none,3,4854,7
ham,Unsure,1.0,no,none,3,4854,23
ham,No,1.0,no,none,3,4854,4824
spam,Yes,1.0,no,none,3,2891,2770
spam,Unsure,1.0,no,none,3,2891,70
spam,No,1.0,no,none,3,2891,51
ham,Yes,1.0,no,radw,1,4898,6
ham,Unsure,1.0,no,radw,1,4898,27
ham,No,1.0,no,radw,1,4898,4865
spam,Yes,1.0,no,radw,1,2908,2772
spam,Unsure,1.0,no,radw,1,2908,84
spam,No,1.0,no,radw,1,2908,52
ham,Yes,1.0,no,radw,2,4886,6
ham,Unsure,1.0,no,radw,2,4886,21
ham,No,1.0,no,radw,2,4886,4859
spam,Yes,1.0,no,radw,2,2921,2755
spam,Unsure,1.0,no,radw,2,2921,91
spam,No,1.0,no,radw,2,2921,75
ham,Yes,1.0,no,radw,3,4898,7
ham,Unsure,1.0,no,radw,3,4898,29
ham,No,1.0,no,radw,3,4898,4862
spam,Yes,1.0,no,radw,3,2917,2780
spam,Unsure,1.0,no,radw,3,2917,83
spam,No,1.0,no,radw,3,2917,54
ham,Yes,1.0,no,sradw,1,4898,6
ham,Unsure,1.0,no,sradw,1,4898,27
ham,No,1.0,no,sradw,1,4898,4865
spam,Yes,1.0,no,sradw,1,2908,2772
spam,Unsure,1.0,no,sradw,1,2908,84
spam,No,1.0,no,sradw,1,2908,52
ham,Yes,1.0,no,sradw,2,4886,6
ham,Unsure,1.0,no,sradw,2,4886,21
ham,No,1.0,no,sradw,2,4886,4859
spam,Yes,1.0,no,sradw,2,2921,2755
spam,Unsure,1.0,no,sradw,2,2921,91
spam,No,1.0,no,sradw,2,2921,75
ham,Yes,1.0,no,sradw,3,4898,7
ham,Unsure,1.0,no,sradw,3,4898,29
ham,No,1.0,no,sradw,3,4898,4862
spam,Yes,1.0,no,sradw,3,2917,2780
spam,Unsure,1.0,no,sradw,3,2917,83
spam,No,1.0,no,sradw,3,2917,54
ham,Yes,1.0,no,sreadw,1,4898,6
ham,Unsure,1.0,no,sreadw,1,4898,27
ham,No,1.0,no,sreadw,1,4898,4865
spam,Yes,1.0,no,sreadw,1,2908,2772
spam,Unsure,1.0,no,sreadw,1,2908,84
spam,No,1.0,no,sreadw,1,2908,52
ham,Yes,1.0,no,sreadw,2,4886,6
ham,Unsure,1.0,no,sreadw,2,4886,21
ham,No,1.0,no,sreadw,2,4886,4859
spam,Yes,1.0,no,sreadw,2,2921,2755
spam,Unsure,1.0,no,sreadw,2,2921,91
spam,No,1.0,no,sreadw,2,2921,75
ham,Yes,1.0,no,sreadw,3,4898,7
ham,Unsure,1.0,no,sreadw,3,4898,29
ham,No,1.0,no,sreadw,3,4898,4862
spam,Yes,1.0,no,sreadw,3,2917,2780
spam,Unsure,1.0,no,sreadw,3,2917,83
spam,No,1.0,no,sreadw,3,2917,54
ham,Yes,1.0,no,readw,1,4898,6
ham,Unsure,1.0,no,readw,1,4898,27
ham,No,1.0,no,readw,1,4898,4865
spam,Yes,1.0,no,readw,1,2908,2772
spam,Unsure,1.0,no,readw,1,2908,84
spam,No,1.0,no,readw,1,2908,52
ham,Yes,1.0,no,readw,2,4886,6
ham,Unsure,1.0,no,readw,2,4886,21
ham,No,1.0,no,readw,2,4886,4859
spam,Yes,1.0,no,readw,2,2921,2755
spam,Unsure,1.0,no,readw,2,2921,91
spam,No,1.0,no,readw,2,2921,75
ham,Yes,1.0,no,readw,3,4898,7
ham,Unsure,1.0,no,readw,3,4898,29
ham,No,1.0,no,readw,3,4898,4862
spam,Yes,1.0,no,readw,3,2917,2780
spam,Unsure,1.0,no,readw,3,2917,83
spam,No,1.0,no,readw,3,2917,54
ham,Yes,0.1,yes,none,1,4857,3
ham,Unsure,0.1,yes,none,1,4857,22
ham,No,0.1,yes,none,1,4857,4832
spam,Yes,0.1,yes,none,1,2892,2765
spam,Unsure,0.1,yes,none,1,2892,88
spam,No,0.1,yes,none,1,2892,39
ham,Yes,0.1,yes,none,2,4849,4
ham,Unsure,0.1,yes,none,2,4849,19
ham,No,0.1,yes,none,2,4849,4826
spam,Yes,0.1,yes,none,2,2889,2721
spam,Unsure,0.1,yes,none,2,2889,118
spam,No,0.1,yes,none,2,2889,50
ham,Yes,0.1,yes,none,3,4854,4
ham,Unsure,0.1,yes,none,3,4854,33
ham,No,0.1,yes,none,3,4854,4817
spam,Yes,0.1,yes,none,3,2891,2774
spam,Unsure,0.1,yes,none,3,2891,80
spam,No,0.1,yes,none,3,2891,37
ham,Yes,0.1,yes,radw,1,4898,3
ham,Unsure,0.1,yes,radw,1,4898,41
ham,No,0.1,yes,radw,1,4898,4854
spam,Yes,0.1,yes,radw,1,2908,2773
spam,Unsure,0.1,yes,radw,1,2908,92
spam,No,0.1,yes,radw,1,2908,43
ham,Yes,0.1,yes,radw,2,4886,4
ham,Unsure,0.1,yes,radw,2,4886,27
ham,No,0.1,yes,radw,2,4886,4855
spam,Yes,0.1,yes,radw,2,2921,2733
spam,Unsure,0.1,yes,radw,2,2921,130
spam,No,0.1,yes,radw,2,2921,58
ham,Yes,0.1,yes,radw,3,4898,5
ham,Unsure,0.1,yes,radw,3,4898,42
ham,No,0.1,yes,radw,3,4898,4851
spam,Yes,0.1,yes,radw,3,2917,2786
spam,Unsure,0.1,yes,radw,3,2917,90
spam,No,0.1,yes,radw,3,2917,41
ham,Yes,0.1,yes,sradw,1,4898,3
ham,Unsure,0.1,yes,sradw,1,4898,41
ham,No,0.1,yes,sradw,1,4898,4854
spam,Yes,0.1,yes,sradw,1,2908,2773
spam,Unsure,0.1,yes,sradw,1,2908,92
spam,No,0.1,yes,sradw,1,2908,43
ham,Yes,0.1,yes,sradw,2,4886,4
ham,Unsure,0.1,yes,sradw,2,4886,27
ham,No,0.1,yes,sradw,2,4886,4855
spam,Yes,0.1,yes,sradw,2,2921,2733
spam,Unsure,0.1,yes,sradw,2,2921,130
spam,No,0.1,yes,sradw,2,2921,58
ham,Yes,0.1,yes,sradw,3,4898,5
ham,Unsure,0.1,yes,sradw,3,4898,42
ham,No,0.1,yes,sradw,3,4898,4851
spam,Yes,0.1,yes,sradw,3,2917,2786
spam,Unsure,0.1,yes,sradw,3,2917,90
spam,No,0.1,yes,sradw,3,2917,41
ham,Yes,0.1,yes,sreadw,1,4898,3
ham,Unsure,0.1,yes,sreadw,1,4898,41
ham,No,0.1,yes,sreadw,1,4898,4854
spam,Yes,0.1,yes,sreadw,1,2908,2773
spam,Unsure,0.1,yes,sreadw,1,2908,92
spam,No,0.1,yes,sreadw,1,2908,43
ham,Yes,0.1,yes,sreadw,2,4886,4
ham,Unsure,0.1,yes,sreadw,2,4886,27
ham,No,0.1,yes,sreadw,2,4886,4855
spam,Yes,0.1,yes,sreadw,2,2921,2733
spam,Unsure,0.1,yes,sreadw,2,2921,130
spam,No,0.1,yes,sreadw,2,2921,58
ham,Yes,0.1,yes,sreadw,3,4898,5
ham,Unsure,0.1,yes,sreadw,3,4898,42
ham,No,0.1,yes,sreadw,3,4898,4851
spam,Yes,0.1,yes,sreadw,3,2917,2786
spam,Unsure,0.1,yes,sreadw,3,2917,90
spam,No,0.1,yes,sreadw,3,2917,41
ham,Yes,0.1,yes,readw,1,4898,3
ham,Unsure,0.1,yes,readw,1,4898,41
ham,No,0.1,yes,readw,1,4898,4854
spam,Yes,0.1,yes,readw,1,2908,2773
spam,Unsure,0.1,yes,readw,1,2908,92
spam,No,0.1,yes,readw,1,2908,43
ham,Yes,0.1,yes,readw,2,4886,4
ham,Unsure,0.1,yes,readw,2,4886,27
ham,No,0.1,yes,readw,2,4886,4855
spam,Yes,0.1,yes,readw,2,2921,2733
spam,Unsure,0.1,yes,readw,2,2921,130
spam,No,0.1,yes,readw,2,2921,58
ham,Yes,0.1,yes,readw,3,4898,5
ham,Unsure,0.1,yes,readw,3,4898,42
ham,No,0.1,yes,readw,3,4898,4851
spam,Yes,0.1,yes,readw,3,2917,2786
spam,Unsure,0.1,yes,readw,3,2917,90
spam,No,0.1,yes,readw,3,2917,41
ham,Yes,0.1,no,none,1,4857,3
ham,Unsure,0.1,no,none,1,4857,24
ham,No,0.1,no,none,1,4857,4830
spam,Yes,0.1,no,none,1,2892,2762
spam,Unsure,0.1,no,none,1,2892,91
spam,No,0.1,no,none,1,2892,39
ham,Yes,0.1,no,none,2,4849,4
ham,Unsure,0.1,no,none,2,4849,19
ham,No,0.1,no,none,2,4849,4826
spam,Yes,0.1,no,none,2,2889,2722
spam,Unsure,0.1,no,none,2,2889,118
spam,No,0.1,no,none,2,2889,49
ham,Yes,0.1,no,none,3,4854,4
ham,Unsure,0.1,no,none,3,4854,35
ham,No,0.1,no,none,3,4854,4815
spam,Yes,0.1,no,none,3,2891,2766
spam,Unsure,0.1,no,none,3,2891,87
spam,No,0.1,no,none,3,2891,38
ham,Yes,0.1,no,radw,1,4898,4
ham,Unsure,0.1,no,radw,1,4898,40
ham,No,0.1,no,radw,1,4898,4854
spam,Yes,0.1,no,radw,1,2908,2770
spam,Unsure,0.1,no,radw,1,2908,96
spam,No,0.1,no,radw,1,2908,42
ham,Yes,0.1,no,radw,2,4886,4
ham,Unsure,0.1,no,radw,2,4886,27
ham,No,0.1,no,radw,2,4886,4855
spam,Yes,0.1,no,radw,2,2921,2735
spam,Unsure,0.1,no,radw,2,2921,129
spam,No,0.1,no,radw,2,2921,57
ham,Yes,0.1,no,radw,3,4898,5
ham,Unsure,0.1,no,radw,3,4898,43
ham,No,0.1,no,radw,3,4898,4850
spam,Yes,0.1,no,radw,3,2917,2778
spam,Unsure,0.1,no,radw,3,2917,97
spam,No,0.1,no,radw,3,2917,42
ham,Yes,0.1,no,sradw,1,4898,4
ham,Unsure,0.1,no,sradw,1,4898,40
ham,No,0.1,no,sradw,1,4898,4854
spam,Yes,0.1,no,sradw,1,2908,2770
spam,Unsure,0.1,no,sradw,1,2908,96
spam,No,0.1,no,sradw,1,2908,42
ham,Yes,0.1,no,sradw,2,4886,4
ham,Unsure,0.1,no,sradw,2,4886,27
ham,No,0.1,no,sradw,2,4886,4855
spam,Yes,0.1,no,sradw,2,2921,2735
spam,Unsure,0.1,no,sradw,2,2921,129
spam,No,0.1,no,sradw,2,2921,57
ham,Yes,0.1,no,sradw,3,4898,5
ham,Unsure,0.1,no,sradw,3,4898,43
ham,No,0.1,no,sradw,3,4898,4850
spam,Yes,0.1,no,sradw,3,2917,2778
spam,Unsure,0.1,no,sradw,3,2917,97
spam,No,0.1,no,sradw,3,2917,42
ham,Yes,0.1,no,sreadw,1,4898,4
ham,Unsure,0.1,no,sreadw,1,4898,40
ham,No,0.1,no,sreadw,1,4898,4854
spam,Yes,0.1,no,sreadw,1,2908,2770
spam,Unsure,0.1,no,sreadw,1,2908,96
spam,No,0.1,no,sreadw,1,2908,42
ham,Yes,0.1,no,sreadw,2,4886,4
ham,Unsure,0.1,no,sreadw,2,4886,27
ham,No,0.1,no,sreadw,2,4886,4855
spam,Yes,0.1,no,sreadw,2,2921,2735
spam,Unsure,0.1,no,sreadw,2,2921,129
spam,No,0.1,no,sreadw,2,2921,57
ham,Yes,0.1,no,sreadw,3,4898,5
ham,Unsure,0.1,no,sreadw,3,4898,43
ham,No,0.1,no,sreadw,3,4898,4850
spam,Yes,0.1,no,sreadw,3,2917,2778
spam,Unsure,0.1,no,sreadw,3,2917,97
spam,No,0.1,no,sreadw,3,2917,42
ham,Yes,0.1,no,readw,1,4898,4
ham,Unsure,0.1,no,readw,1,4898,41
ham,No,0.1,no,readw,1,4898,4853
spam,Yes,0.1,no,readw,1,2908,2770
spam,Unsure,0.1,no,readw,1,2908,96
spam,No,0.1,no,readw,1,2908,42
ham,Yes,0.1,no,readw,2,4886,4
ham,Unsure,0.1,no,readw,2,4886,27
ham,No,0.1,no,readw,2,4886,4855
spam,Yes,0.1,no,readw,2,2921,2735
spam,Unsure,0.1,no,readw,2,2921,130
spam,No,0.1,no,readw,2,2921,56
ham,Yes,0.1,no,readw,3,4898,5
ham,Unsure,0.1,no,readw,3,4898,43
ham,No,0.1,no,readw,3,4898,4850
spam,Yes,0.1,no,readw,3,2917,2778
spam,Unsure,0.1,no,readw,3,2917,97
spam,No,0.1,no,readw,3,2917,42
ham,Yes,0.01,yes,none,1,4857,3
ham,Unsure,0.01,yes,none,1,4857,48
ham,No,0.01,yes,none,1,4857,4806
spam,Yes,0.01,yes,none,1,2892,2728
spam,Unsure,0.01,yes,none,1,2892,132
spam,No,0.01,yes,none,1,2892,32
ham,Yes,0.01,yes,none,2,4849,3
ham,Unsure,0.01,yes,none,2,4849,36
ham,No,0.01,yes,none,2,4849,4810
spam,Yes,0.01,yes,none,2,2889,2654
spam,Unsure,0.01,yes,none,2,2889,194
spam,No,0.01,yes,none,2,2889,41
ham,Yes,0.01,yes,none,3,4854,2
ham,Unsure,0.01,yes,none,3,4854,57
ham,No,0.01,yes,none,3,4854,4795
spam,Yes,0.01,yes,none,3,2891,2733
spam,Unsure,0.01,yes,none,3,2891,123
spam,No,0.01,yes,none,3,2891,35
ham,Yes,0.01,yes,radw,1,4898,3
ham,Unsure,0.01,yes,radw,1,4898,74
ham,No,0.01,yes,radw,1,4898,4821
spam,Yes,0.01,yes,radw,1,2908,2746
spam,Unsure,0.01,yes,radw,1,2908,126
spam,No,0.01,yes,radw,1,2908,36
ham,Yes,0.01,yes,radw,2,4886,3
ham,Unsure,0.01,yes,radw,2,4886,44
ham,No,0.01,yes,radw,2,4886,4839
spam,Yes,0.01,yes,radw,2,2921,2664
spam,Unsure,0.01,yes,radw,2,2921,210
spam,No,0.01,yes,radw,2,2921,47
ham,Yes,0.01,yes,radw,3,4898,4
ham,Unsure,0.01,yes,radw,3,4898,70
ham,No,0.01,yes,radw,3,4898,4824
spam,Yes,0.01,yes,radw,3,2917,2745
spam,Unsure,0.01,yes,radw,3,2917,133
spam,No,0.01,yes,radw,3,2917,39
ham,Yes,0.01,yes,sradw,1,4898,3
ham,Unsure,0.01,yes,sradw,1,4898,74
ham,No,0.01,yes,sradw,1,4898,4821
spam,Yes,0.01,yes,sradw,1,2908,2746
spam,Unsure,0.01,yes,sradw,1,2908,126
spam,No,0.01,yes,sradw,1,2908,36
ham,Yes,0.01,yes,sradw,2,4886,3
ham,Unsure,0.01,yes,sradw,2,4886,44
ham,No,0.01,yes,sradw,2,4886,4839
spam,Yes,0.01,yes,sradw,2,2921,2664
spam,Unsure,0.01,yes,sradw,2,2921,210
spam,No,0.01,yes,sradw,2,2921,47
ham,Yes,0.01,yes,sradw,3,4898,4
ham,Unsure,0.01,yes,sradw,3,4898,70
ham,No,0.01,yes,sradw,3,4898,4824
spam,Yes,0.01,yes,sradw,3,2917,2745
spam,Unsure,0.01,yes,sradw,3,2917,133
spam,No,0.01,yes,sradw,3,2917,39
ham,Yes,0.01,yes,sreadw,1,4898,3
ham,Unsure,0.01,yes,sreadw,1,4898,74
ham,No,0.01,yes,sreadw,1,4898,4821
spam,Yes,0.01,yes,sreadw,1,2908,2746
spam,Unsure,0.01,yes,sreadw,1,2908,126
spam,No,0.01,yes,sreadw,1,2908,36
ham,Yes,0.01,yes,sreadw,2,4886,3
ham,Unsure,0.01,yes,sreadw,2,4886,44
ham,No,0.01,yes,sreadw,2,4886,4839
spam,Yes,0.01,yes,sreadw,2,2921,2664
spam,Unsure,0.01,yes,sreadw,2,2921,210
spam,No,0.01,yes,sreadw,2,2921,47
ham,Yes,0.01,yes,sreadw,3,4898,4
ham,Unsure,0.01,yes,sreadw,3,4898,70
ham,No,0.01,yes,sreadw,3,4898,4824
spam,Yes,0.01,yes,sreadw,3,2917,2745
spam,Unsure,0.01,yes,sreadw,3,2917,133
spam,No,0.01,yes,sreadw,3,2917,39
ham,Yes,0.01,yes,readw,1,4898,3
ham,Unsure,0.01,yes,readw,1,4898,74
ham,No,0.01,yes,readw,1,4898,4821
spam,Yes,0.01,yes,readw,1,2908,2746
spam,Unsure,0.01,yes,readw,1,2908,126
spam,No,0.01,yes,readw,1,2908,36
ham,Yes,0.01,yes,readw,2,4886,3
ham,Unsure,0.01,yes,readw,2,4886,44
ham,No,0.01,yes,readw,2,4886,4839
spam,Yes,0.01,yes,readw,2,2921,2664
spam,Unsure,0.01,yes,readw,2,2921,210
spam,No,0.01,yes,readw,2,2921,47
ham,Yes,0.01,yes,readw,3,4898,4
ham,Unsure,0.01,yes,readw,3,4898,70
ham,No,0.01,yes,readw,3,4898,4824
spam,Yes,0.01,yes,readw,3,2917,2745
spam,Unsure,0.01,yes,readw,3,2917,133
spam,No,0.01,yes,readw,3,2917,39
ham,Yes,0.01,no,none,1,4857,3
ham,Unsure,0.01,no,none,1,4857,51
ham,No,0.01,no,none,1,4857,4803
spam,Yes,0.01,no,none,1,2892,2722
spam,Unsure,0.01,no,none,1,2892,138
spam,No,0.01,no,none,1,2892,32
ham,Yes,0.01,no,none,2,4849,3
ham,Unsure,0.01,no,none,2,4849,37
ham,No,0.01,no,none,2,4849,4809
spam,Yes,0.01,no,none,2,2889,2650
spam,Unsure,0.01,no,none,2,2889,200
spam,No,0.01,no,none,2,2889,39
ham,Yes,0.01,no,none,3,4854,2
ham,Unsure,0.01,no,none,3,4854,60
ham,No,0.01,no,none,3,4854,4792
spam,Yes,0.01,no,none,3,2891,2731
spam,Unsure,0.01,no,none,3,2891,124
spam,No,0.01,no,none,3,2891,36
ham,Yes,0.01,no,radw,1,4898,4
ham,Unsure,0.01,no,radw,1,4898,74
ham,No,0.01,no,radw,1,4898,4820
spam,Yes,0.01,no,radw,1,2908,2733
spam,Unsure,0.01,no,radw,1,2908,139
spam,No,0.01,no,radw,1,2908,36
ham,Yes,0.01,no,radw,2,4886,3
ham,Unsure,0.01,no,radw,2,4886,45
ham,No,0.01,no,radw,2,4886,4838
spam,Yes,0.01,no,radw,2,2921,2662
spam,Unsure,0.01,no,radw,2,2921,215
spam,No,0.01,no,radw,2,2921,44
ham,Yes,0.01,no,radw,3,4898,4
ham,Unsure,0.01,no,radw,3,4898,71
ham,No,0.01,no,radw,3,4898,4823
spam,Yes,0.01,no,radw,3,2917,2744
spam,Unsure,0.01,no,radw,3,2917,133
spam,No,0.01,no,radw,3,2917,40
ham,Yes,0.01,no,sradw,1,4898,4
ham,Unsure,0.01,no,sradw,1,4898,74
ham,No,0.01,no,sradw,1,4898,4820
spam,Yes,0.01,no,sradw,1,2908,2733
spam,Unsure,0.01,no,sradw,1,2908,139
spam,No,0.01,no,sradw,1,2908,36
ham,Yes,0.01,no,sradw,2,4886,3
ham,Unsure,0.01,no,sradw,2,4886,45
ham,No,0.01,no,sradw,2,4886,4838
spam,Yes,0.01,no,sradw,2,2921,2662
spam,Unsure,0.01,no,sradw,2,2921,215
spam,No,0.01,no,sradw,2,2921,44
ham,Yes,0.01,no,sradw,3,4898,4
ham,Unsure,0.01,no,sradw,3,4898,71
ham,No,0.01,no,sradw,3,4898,4823
spam,Yes,0.01,no,sradw,3,2917,2744
spam,Unsure,0.01,no,sradw,3,2917,133
spam,No,0.01,no,sradw,3,2917,40
ham,Yes,0.01,no,sreadw,1,4898,4
ham,Unsure,0.01,no,sreadw,1,4898,74
ham,No,0.01,no,sreadw,1,4898,4820
spam,Yes,0.01,no,sreadw,1,2908,2733
spam,Unsure,0.01,no,sreadw,1,2908,139
spam,No,0.01,no,sreadw,1,2908,36
ham,Yes,0.01,no,sreadw,2,4886,3
ham,Unsure,0.01,no,sreadw,2,4886,45
ham,No,0.01,no,sreadw,2,4886,4838
spam,Yes,0.01,no,sreadw,2,2921,2662
spam,Unsure,0.01,no,sreadw,2,2921,215
spam,No,0.01,no,sreadw,2,2921,44
ham,Yes,0.01,no,sreadw,3,4898,4
ham,Unsure,0.01,no,sreadw,3,4898,71
ham,No,0.01,no,sreadw,3,4898,4823
spam,Yes,0.01,no,sreadw,3,2917,2744
spam,Unsure,0.01,no,sreadw,3,2917,133
spam,No,0.01,no,sreadw,3,2917,40
ham,Yes,0.01,no,readw,1,4898,4
ham,Unsure,0.01,no,readw,1,4898,74
ham,No,0.01,no,readw,1,4898,4820
spam,Yes,0.01,no,readw,1,2908,2733
spam,Unsure,0.01,no,readw,1,2908,139
spam,No,0.01,no,readw,1,2908,36
ham,Yes,0.01,no,readw,2,4886,3
ham,Unsure,0.01,no,readw,2,4886,45
ham,No,0.01,no,readw,2,4886,4838
spam,Yes,0.01,no,readw,2,2921,2662
spam,Unsure,0.01,no,readw,2,2921,215
spam,No,0.01,no,readw,2,2921,44
ham,Yes,0.01,no,readw,3,4898,4
ham,Unsure,0.01,no,readw,3,4898,71
ham,No,0.01,no,readw,3,4898,4823
spam,Yes,0.01,no,readw,3,2917,2744
spam,Unsure,0.01,no,readw,3,2917,133
spam,No,0.01,no,readw,3,2917,40

--------------010907060305050000060207--
>From  Thu 6 May 2004 05:27:20 2004
Return-Path: bogofilter-return-59-relson=osagesoftware.com at aotto.com
Return-Path: <bogofilter-return-59-relson=osagesoftware.com at aotto.com>
X-Original-To: relson at osagesoftware.com
Delivered-To: relson at osagesoftware.com
Received: from lists.refdesk.com (ns1.drudgereport.com [216.40.241.219])
	by mail.osagesoftware.com (Postfix) with SMTP id E2EB02FEA6
	for <relson at osagesoftware.com>; Thu,  6 May 2004 05:27:20 -0400 (EDT)
Received: (qmail 31430 invoked by alias); 6 May 2004 09:26:24 -0000
Mailing-List: contact bogofilter-help at aotto.com; run by ezmlm
Precedence: bulk
List-Id: <bogofilter at aotto.com>
List-Post: <mailto:bogofilter at aotto.com>
List-Help: <mailto:bogofilter-help at aotto.com>
List-Unsubscribe: <mailto:bogofilter-unsubscribe at aotto.com>
List-Subscribe: <mailto:bogofilter-subscribe at aotto.com>
Delivered-To: mailing list bogofilter at aotto.com
X-Ezauth: giabgadecobcecjjijha
Received: (qmail 31420 invoked from network); 6 May 2004 09:26:24 -0000
Message-ID: <409A04EF.90109 at tacocat.net>
Date: Thu, 06 May 2004 05:27:11 -0400
From: Tom Allison <tallison at tacocat.net>
User-Agent: Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.4.2) Gecko/20040220
X-Accept-Language: en-us, en
MIME-Version: 1.0
To: bogofilter <bogofilter at aotto.com>,
	Tom Anderson <tanderso at oac-design.com>
X-Enigmail-Version: 0.76.8.0
X-Enigmail-Supports: pgp-inline, pgp-mime
Content-Type: multipart/mixed;
 boundary="------------090508080503070809020103"
X-Virus-Scanned: Symantec AntiVirus Scan Engine
X-Virus-Scanned: by amavisd-new at tacocat.net
Subject: [bogofilter] spamitarium & block_on_subnets results
X-Bogosity: Ham, tests=bogofilter, spamicity=0.000000, version=0.17.5.cvs
Status:   

--------------090508080503070809020103
Content-Type: text/plain; charset=us-ascii; format=flowed
Content-Transfer-Encoding: 7bit

First, my apologies for posting with attachments, but I thought this 
might be the easiest way to get the data out.

The following tests were performed to evaluate the effective 
contribution of the option block_on_subnets and the external program 
spamitarium.pl (Tom Anderson).

CORPUS
7226 ham emails
4261 spam emails

CONFIGURATION
This was taken from my existing configuration
# bogofilter version 0.17.5

robx        = 0.600000  # (6.00e-01)
robs        = 0.100000  # (1.00e-01)
min_dev     = 0.050000  # (5.00e-02)
ham_cutoff  = 0.150000  # (1.50e-01)
spam_cutoff = 0.850000  # (8.50e-01)

block_on_subnets  = yes
charset_default   = us-ascii
replace_nonascii_characters = no
stats_in_header   = yes
thresh_update     = 0.000000
timestamp         = yes

TRAINING
For each test configuration, two tests were run.
The first test used the first 2000 emails (head) from both spam and ham 
for a total of 4,000 emails for training.  The subsequent testing or 
evaluation was based on the remaining emails and did not include the 
training bodies.
The second test was identical except that the last 2000 emails (tail) 
from both groups were used for the training.

TEST CONFIGURATIONS
Six test configurations were evaluated, each with two tests for 
head/tail training.  The specific details of the spamitarium.pl 
arguments are presented later in this email.

yes.basic: configuration file was overwritten by the command line 
argument, --block_on_subnets=yes, spamitarium.pl was not used

no.basic: configuration file was overwritten by the command line 
argument, --block_on_subnets=no, spamitarium.pl was not used.

yes.sreadw: --block_on_subnets=yes, spamitarium.pl was run with the 
option to strip out non-required headers (X-Headers et al) prior to the 
training and testing to bogofilter (spamitarium.pl -sreadw | bogofilter...)

yes.readw:  --block_on_subnets=yes, spamitarium.pl was run without the 
option to strip out the non-required headers.

Similarly, the last two test configurations were:
no.sreadw
no.readw

The wordlist database was only added to during the initial 2000 corpus 
training and no additional updates were applied during each run.  At the 
beginning of each run, the wordlist was recreated using the same test 
configurations of spamitarium.pl and bogofilter as the evaluation portion.

Time to complete: 639 minutes.

I have included all the raw data in the attached file in addition to 
some summary results (DataPilot_Sheet3_1).  I have even rawer data on 
file (actual scores).

----------

I'm going to attempt minimal conclusions since that's why I provided the 
raw data and would rather have a discussion than an "And that's the way 
it is."

spamitarium.pl, in either configuration, tends to push email towards the 
spamminess of the scoring.  This applied to all categories of scoring 
(HAM: Yes/No/Unsure | SPAM: Yes/No/Unsure).  Since this is consistent 
across both the HAM and SPAM corpora, I personally am not certain that 
the configurations tested will be optimal, as accuracy for both 
populations, ham and spam, is what becomes really important.

The use of --block_on_subnets=yes appears to have marginal test results 
and may be an area of continued debate.

Considering the number of configuration options used in spamitarium.pl, 
it may have merit to retest using a different group of settings to 
determine their individual impacts (assume independence) of using the 
following options:

Of most interest to myself would be to retest using -aw options only.  I 
think this would be the least intrusive into the original email and may 
provide valuable information.  NOTE: '-w' is required in testing and is 
included in the discussion for consistency between code and conversation.

        s   allow standard fields only (RFC
            822/2822/1049/1341/1521/2183/1864) ...  all others are
            stripped from the email

        r   insert new received line containing verified received-
            line tokens

        e   include helo string in received line

        d   allow DNS lookups (forward and reverse) to help fill
            in all necessary received fields

        f   force rDNS lookups even when provided already by the
            MTA

        a   perform ASN lookups and include in received lines

I'm open to suggestions on future testing or techniques.

File Size
For each test configuration, I dumped the data (bogoutil -d...) and 
counted the number of rows in each.  Results are as follows:
  154497 wordlist.2000.no.basic
  151964 wordlist.2000.no.readw
  144766 wordlist.2000.no.sreadw
  163823 wordlist.2000.yes.basic
  159749 wordlist.2000.yes.readw
  151685 wordlist.2000.yes.sreadw
Actual file size ranged from 4.9MB to 5.7MB

--------------090508080503070809020103
Content-Type: application/vnd.sun.xml.calc;
 name="spamtest2000.sxc"
Content-Transfer-Encoding: base64
Content-Disposition: inline;
 filename="spamtest2000.sxc"

UEsDBBQAAAAAALlFpjBFvBOUHAAAABwAAAAIAAAAbWltZXR5cGVhcHBsaWNhdGlvbi92bmQu
c3VuLnhtbC5jYWxjUEsDBBQACAAIALlFpjAAAAAAAAAAAAAAAAALAAAAY29udGVudC54bWzt
XW2T4jiS/n6/guNu5xvGepd6untjtrsn9iLmLaJnLm5jY2KCBlcVsRRmwXRN//tLGygk2jYg
WVimmA9dI2zrNVOZelKZ+fqvfz7Oep+T5Wqazt/0URT3e8l8nE6m8/s3/d9+/X4g+399+x+v
//P9z+9+/ccvH3rp3d10nLyapOP1YzLPBuN0nsHf3i+//e2H/3nX6w+Gw58Xyfzn4rUoXd4P
h+9/fd/blN9vv+pBO8Phh5/6vf6mvmiSTfpvX1dVDn2cr15tnr7pP2TZ4tVwmEIz6b4ZHMfx
cFPubz9YZV9m9e8Xb+xez5I/s9q38xeeXx59OlJ38cbu9cly9FT7dv4CzPnu/bv0+e2np6fo
iRRvIqXU8P8+/jD8Pl0+jp778udsOv9X5fvF092r8/Xjp2RZ35NRNjLmZfX5vqzyzQR+fu7y
+GG0rJ+/4o39jJDJkRkhk93LMNiHigHK4Y/wsPjnxx/207d8rK08f+F5fOPldFHf880r/R31
j2ej1epNf7VYJqPJ6iFJ9o8MVnqm6M33w+fyHRD2YJKMZ6u3r4u53v/S25Tno0cgrx/W4+lk
tBrNV/3eXbp56W70OJ19MZ9pdSym2Rjm6vNoOS0IcFjfwk/Tx0/rVe8jVNP7AX3dyjejRbr6
1nhr81Njjdq2uXl7cJ/Mk+V0DKvxNF0dnYphyQJsfxqtsxTobDoeFFU8r0zxr9H9cYqe29l2
uWB22K9m68d5f/flYgm0tMymySof4icgln8NPiVAelBH3tquks1ng6fpJKfyOFJS0Ol8/FB0
WOtEXY+w1x5Jhdm5PSIee4QibDFH1OscMc7O7hHz2CMSxThW5/VoWUXZy/SprDubH+Dh4CGZ
3j9k+TwgJXDRan1n16tkkC6y6eNoNtAryJbr5PQOZ6PyDu9+fARRliwHi9F9Mth88T65G61n
Wdloii9fTaarxWz0paonGwn6Cr4Zg2qS1/t1r35CX/Uql6qbN/vPdWz+9LYl2I2KyYDGxwlI
F+Do7ZPH6XwwBU3oHkYymd5PM3iK8p5tn+d6ydu/vB7qxefSYUdryDGp2taS2Wz3ZDFa5ppZ
UTiY0e0b+5EO9rMxrGu2au860iy088t0lma9d+kSZEAVu6TLCczbpzSD7f1Nf57Ok772+yy5
K6g2xjIn2t4qnU0nvf+Ki//0F5cb8jz8PksX1Z+fvhMkVbvlqXPw/TSZTU6bAugtIkdGe+60
nFLnbqrKXzxjrqr28ZPpZQS8lC6/XIZiLjo1VQLFz9S0SknFupwxNbz1XebwV89bivC+pTRD
szIAdq5YmovyrgqUd13n5lxGRXGgsvDomxZU48rkX2uagVDNGfNVtQE6cxRy1Sz/dzRbJ7WK
rZsK0f4MueqdDc3Q5ejq7N3ostqm87bsj1RcdUuvzHTx2XBVJy/OOOfM0NlM4qprelOuTzh4
NCaKL6vINqHAnTo17uziqty2sXlccoKwq9Ibyn5y6qSVbzHDSmPM9sGndPLl7esNUlv8u0Vt
N8P8mNviQKXZ/KZPQA4UG99tQfaSVwvzzrbWAkPdvroaLJNFAjw6KcDZLVq8AT+LdSpFRYcn
t4obaDXHcA9aXKZPJc3lSP9Bz6Au+CX5M3u1ePsund9N79dLWIJ0/nq4/fX1sOSDqio+jtNl
Yvfpr8kq6wF9OHydPaW1Xw8PpqiBOfuSrKJPo9V0bNftv3/3Y+8fyeqsj7ed/Jzz/iD7soBO
7vH+vv60YMw4ZoLGDEuEJZNCkv5z6/CU/8Vn2whzrBBHUmKpCN63jCJ8pOUw1+q3+Wp9JoGf
u1wMcU4FI4gJrGKmrxbDXldLcikIwYwLhRWT+5ZlxFUXV+un+u3AbbqUopgpKpAUiCKNtpWK
KPG5UCqOqRIMYUqELGT8rmVYRNa1hfr4i/c9UGGGGSUIY5g0OMTtJwxHzO9SCRGDboW5pBy2
YK1lEYm4k0vlfwsEeaU4B5EhCWxF2qSxSBwhb8eWMUbAy7GUMfxhmqTEEUadXC2vW2CMBKb5
biQ5QzFRunwXfmVVDItEGaeE4Zhxru+/8LQdabXKbwY+BawICs5oTASLKaiBQhhTJv0yFuIY
c8YU0AmiWGcs1I4i6LpY/rfBrXyPFWVYEG4oz9TnaiHOlKa6aw3ziHWTtfxqghLEFBeSkZhi
qgt5UJy98hWc6ajgMSiBBBuSUgKNtKMJuqzUJVRBioGfmIITDiZUk1iKRvgIbbtSCY2FkBSO
d1hJZFAJbUe5cF4r/5sgZQhh2AUVo1IKTXDQiB2ZNNftl1FgKcERj5WKhb79so6ylmdlECma
CwwB4goUM33CkFfOAs2GUowlRrAXYklibmg2R2Slp8UKXRdUjAtGORx1lFDCUJ/RkYOpM2Nx
4CcBTVMZc5OxRBfX6gKgoFIsBtVCMI6RNM5ZsdfFQsBSMpaYc8JjpgktBCe8Ti6WZ01QxaBZ
KDjpMK4j7SDjlVdxJTmoFZIoDkIL6Zuv5BFv54QVuCJYKICSIUlA0OvqGIZnLwwTDF8PZExg
wRhFAgQX1wQ8Oyo0HFvGGHiZAF8LQal+DMdt2bACVwNZDjEpAucdxTE1xLtfxSI0NXCe3kzD
NegWppzyLSppgFtHdt8gV+oCOqCIhQKm4gopTJXS10r4PQrDxisJxzFhShAdZhIRu/wW6L5Y
nu3CoFewfOtTzDQ2qgj7NTbmxk0F5MFBudB3P9VBprqIChgTCf9Igqhx0wJUwGPiwrFlISTs
uAohBX+YqQJeHmFvYKX8739wvIlpzCWcrmKktDnjEfZtvMJYMQr/KQJi0jBedZKtPCuAPId1
MJxvFEgOw27l1xoSnFEYlip4m3B7OCDOlXSkFJGCC4Or4iNiMsyluoBFmCIG2x/MGNGFBoqo
V8ACwXlOIoIpopIa9mASiVb0v8DNwTBJBMMZGCQW0y8Gyoh6NYRIilGOZsG5CkhEoxBJ27i/
5LhQl9AACY2VUowJgYTUl4octcm6EgmiBUOBom7e4ZQRagWq6IAxmBHFgMaBwhllREMrWEQ9
26xg/yOMK4wEMo5WKJLdZKybLfhia3VTASvJBM5TTCHOBGzDyPAPIa0oFuGrgAiOVjQWwF9C
MZOxvNpBEALdAnGqKGWxLioRakm3CFsHFETEoKYjEBj65SEQ78IrtCQFhd2PEqIkUVhrWYqj
binBrdNlfEO44gzIGgNjmSCgX8uiEjzn5hwoplTfdpWIeNc46lIaoMIczqKISSGUoQAqr9B6
jLHKr/kSxAiFzVdHK8jlbfYNrNX1GoFbwAC3P299uOE93YGbIJiCktU0P/ra8ZsWnvKNEY9l
c/ovRqW7+syQBH9svPH/eJ4Ta3/8U/ze6Rme9uSE+qSN536dXz5uNBQAPaVJ1vQYlEtwgZIX
98F4t5z4/XSWJUubTaGs2mOTKLywlU1PpPee5GF2D6mh6k1+8pso3q+dfcyIik6ctE2d1k3/
K52H8N2L1bNjX5TWqKns+a7qXqGOLloZD8trpUat56sjp9RqgXIdrdbOq/akapvqrTSrPWEW
POiYFmfk52+/+5wsQTvrDXpWUV06c+GmtVN+ezamFsNetHa53sXB07P8+Vrbq3pTO7MesOex
sEmnbVqkf+akOtxWO61H/MweuUBnnnrkYND10yOX2DOe5sjB8+WkHuVR+s7rUVhe2ZaGiXaE
eZvXnFuzrbR1safNYGOtuTS6xNW4CXMPLgJ+xIKLvdJTjxyu0fnpkUvEP1+rZu9v7EeYBxYM
x+ZCSFuS3IN32fEouV249tKm312743aIQHaTvc27UnoSKw7Xujz1yN7XwNcU2UdG9tUj+7As
fiRvYEEDbW5NtCR52/PAau/eR3s3jtsLgtrJ2b7JcS9XlzxJhTD75HDF/zZPDc7T+TI9NPcM
23urLUn29pzgW7t5257TV3tx6NuLduUQb/Um2D0EkPAkGhyus3vqkYOHpbc5sk4G4a1H1qHo
PInzsIIkW7kLtSPJW4w91J7DU3ve9u0lAGovzKhDpPubKPcQuMuPUHDxIvTUI4fQFhU9Om72
c55F+5i8fkRrs+kiTvPiKny3Sqe04Uyqx32xzs+ketw56vw6mYc6rfK5OtV3oT426YxY6rb0
zAJZLgCWyQp6lqczPs91rK7i4UP6mEAds9l0lc6HeTvDbTsRjuM4enYeicarz43uZOSy3p5H
JjiFOeg9jB5dZ7Zkz7ubpaPsYLtjGOsgB5Q603BYS7ZaXGzNYK40qZSXOtOwh6Onk1cZbGGP
69noTf/NP6N39PdvRo+Lb7/59zrNvu1t/hS//DN6T3/fC+fldH4/2E7J9lbYflacronBx3Yf
NhCpo2zBNeiF+MQ+fV/LKhubDu7y8zafc6GaK8jFXcYvrIZfWA2/bIw+Jsu4GIKsuaaZqCml
O6UONQuvrOPXkaBscFS/6gYFn8O7hszoZbzDa3iH1/DOT6nJN7bXoqx5xj16Tak2iJSuDULJ
Jz7n15G2lGWEbljJSz7Hdw1Z6suYRtQwjahgmp1lQosW4WSryL8OSkfDsdIvaEKpu3B++aEj
1q2kUPI5vvAyhDXDOrKGdWQd6xwqa87XduwZyJ+6hogWWAcKPjUav3dbSoWrNjjmdWwOhvug
uUfVcI+q4x5dXXO6xm7PNX4UNv22NfG5I3u+5F0qcTS69Yt6BJE0pRoPNHgAxTVMAA/LuaC3
H9Pm76pAAZu0Q1b3/1IBHk+dQVQ3g6hiBg9NLtqMfvXIbodo16izibEVulXHECr2wdFMesB1
9IBfJOKsI0t+914Hh1jLsSGku+sgr4h6WB58TTINqWMa8rJhZ677JTC/Ng3rUC6WY5O6ZywU
PI4usMgTTXJPnY0T1Rk5rxZ4ZhpZ5SWf6JJDNENbeYqpDjxDyeP4Agu/1iTf1Nk6UZWx8/qx
Z6Sbs/OST+5xuFZtOz6MNYmal7zuDk3eOw2Ke+qsnajK3PmC4OdYvxQSewWdHNwMLUenR08l
fpXSoHyimmSgOssnqjV9Xi0CfbkrNn4DnJRuCFobflGQIGICnIqf1hkxUZUV84ZAazNYZ8hC
VZasa0eguwdANyNVcJ1FB1dZdK4bf9Z0Fa+qiu9gkKUImtaI9KpkOkS0C5tj6ix4uMqC90LA
Zz0ZCvNrOPcawbwUWEc6so58bg0ugZjDZp46cyeuM3deLfbMNdrNS17RJa85fMr9BLA2vrzk
E3t2SEYSNtvUGTxxlcHzBUDPse6LCCWfzOM3lsjt2rM35qmzd+Iqe+cLQp51J2x0pg/2mVqb
39B65QJWF68+x+YQHyxs/qmze+Jau+fVAs9Eu6NGvF5S8xtl+gY8nwqb4jr7Ja6yX96AZ20G
6wxYuMqAdc3A8y4fdGdwZ5cE1iYt1JlxcJUZ57ph56sOtKEJLu5XXlqn4wmaX+qMdrjKaPdC
QGeiGReI30AbftOjljvZ6VdnlFeDlEMayZCZh9TZOEmdjfNqQWdpRNqQfiNtYEJZfnZRzAwX
oCLsK9KGbm/MSz7H55BJPWiuqbNzkio75/VjzrF+LTcvecWcvcYpL8ec9Wgxeckv5mwd0Dlo
3qkzc5IqM+cLgpx1b0Qo+FRq/CbtKRU/muZEpVfI2T61SdDsU2fuJLXmzutFnDWiIl6Jym/+
yluwjVPxUlJntyRVdssb4qzNYJ3lilRZrq4cce5YrI3nHrtLlToLDqkLuHm9mPM1X3VGRh4z
r9E2HTKch80ydSY7UmWyeyGws9CsC8KvTYMiBscXoC6iH5JRRP3cOzNyEkLB49gQEUwiUmQs
okaYDRKJVjDnplinzsJJ6iycVws6UyPKBvUbZQOIiWDBFJz+daeE3I/ej78mpfpl1LzkcXyS
YpRfQ+WUw6ag7QmSthFls0G2qTN0kto4m1eNOiNkBNlAXoNs+M0UWYE6GzedvcoclxR5QTMP
rTN00ipD5wuCnbEOO2OvWJPfxOmlAkgXP15vbjpkiw6bf+pMnrTW5Hm1uPMtxEYT6EeXbjrT
OuMlrTJe3nBnbQbr7Fe0yn515bhz52DnhmRKbUbBl5lS8JpRZ90zVPjVwgiTTCHOBJzWkJFS
kLSCnDXEMHVGO/rCcwrquVfPzLx6tjbGBaaxAKVMKGZqY548NY1LqV7vpCKEEUGcKkpZrCMc
CLUEnzXEO3UmTvoicwpyPfERlLziSoKIWBKJ4Fysh55UMhJ+bmtSxo3wGl7Dh0hB4RBDCVGS
KP1cKMXRnIlBc02dlZO+5KSCevYuKPnkHQy0yxlsyBjEjnnT2VNkAIx1Hs1LPscneC5Rcy8I
SnVFVImId1ni1Jk56S2rINIjBSC/oQKYwpyB8GFSCGUgzsqPn83lor1hrPLECQQxQo3bCTgi
l49N0xz31Fk76QvNKnjFoTWu+qKzCdXBewZOh9QmBWn5wCrxPdksvmfZnP6LUemuvqLJjw9J
kqF+SWey0VedKZoueXW8n8lJcjdaz7Ki4wP9nfebB18B4tV1Ug91Mg914qN1jhPUaH0X6qOV
nGk3JHa3QpNkKQyn9zB6dJ2kk3UfPWxaXupMw95mf7W42PTDsA031TPB0DYb9jD9oRl4XEPZ
mEry70P497/3hufTtaAWAt1ovMmtIGx97B+KsX+wGPs+cyGWihjGoRaCEQZoSXG/12IQKbMm
UoY4p4LB0VZgw8c/jpif6/1Uj4sEBWcqZbZUKrkUhGDGhcKK6ckCI3559//gbBfKiNKinLeT
d9ySSpWimCkqkBSI6qdJpY4i/7ZEKvRrUnnJmUy5JZmqOKZKsG26W/2+cA66XZxMQzQU6MAG
lJwJVWwIlZy9VJhhRgnCmCEz2BCOmB9CbSZIt0GoYkOo548+rBDeYeLxmj0YCs6UKi0pNddN
FeegnknCjUS5LBJHNhVbgaKbwp1H/kFaUmmcBxWiKJaycGzUbYxtXL4IDv/WdhPivJe8U7b0
iQSmueSTnKGYGAl2hKd0B47ouEGdypY6AwkScgxH475wtFYxx44FpwgNd9HPdc47B8L2wEse
ezc3rsVUSKIHRMgPfX58OZB+RxE5w04fNsO3OdWC4okxZ0zBxmlEOsMRumEvOUimR2GzugFu
EiqxJdTtkS5WlGFBDEc27OfCntQjk0q7wKQmmRJLMkWcKQ0m1MbOI3YDXxAzohUwd4gQ2QLZ
+c0XxYVkJKZYDy6opK+IbNCOjr5AyZ1QbbFsCVwqeCxhU8fGQUkCl97glz5GxIgNQNzhF8Rs
8ReKYS9lapMsTTs0KBphP06cGGOm4y/YWaB8QMwWgJE0FkJSxGFb1eME52FHunS09QjAxLox
Km6AVrntCZcyhDCIf8WolPqdXBoxT8H89Vj+DdAptz3kIkZhKxUc8Vip2EzO3KUtNVSfd5NC
bdHsDrrEm/RpC2SH4jH/MkGYGwbjdHJwdG428/7E9hjM5X2fdU9K6Q7hbgZvdfuFcZBqAsZO
81jWhni7fPr0ABEYd88Nk0yRNZkqxWI41ArGMZKGlSH2Q6Qc6egTctfCNoO3QWBAuslYYs4J
j/U8tQhFokNk6s11V+kIjJ1rq0mmtoi2kiqGI63ijDGu3yOEc53yc1SgDBuevdj9WgG2RbQl
h/OsJIrDoQHpaqjkEb88oh0gAtNEHnqTVIn1DZiOpqk3SZXcrsCEnO/dpFVqfQemg+ngTTql
9pdgQsoWHxoC4+4EepjC3haB6Vz6dZM+bZHsGwJzSwreUQCmUecjLDvlfeScZtvcPaS9+1FA
WbgDxF/cU1Yfpge3pdJuZrQ2yVTZkmlYCa+DA2CayBJ9mInb1gGpo0mkDUIltoB2YDmmwwNg
GknKfJj+2haA6WjOZpNUkT0AE1JK5yABmAayHx+mm7Y83nYxObJJp9gagAkqd3JwAIxzuuHD
jM62AEznshGb9GkLZN/8kG45cruKwDR6BYbYBi3obNJZcwOxjVoQVk7aADEY9/yth7lybb2Q
Opje1aRRYXsDJqzsr8EBME1kTD3MSmt7A6ajCVVNQrUFtAPLtxoeANNIftLDTLCWAExX05ea
pGobwCCw7KZBAjAN5AE9TLxqewOmg2lCzQiFsbUPUlBZREMDYBr2QaK2cHbXfZCoLZB9uwFz
SxZ5w1+KBJYdwl/c0y+au4d9AN6gsjMGCL+4pzI8TBtpC790MtOhSaa2EXgDS4QYHADTRPbA
wwyNtgBMR5MLmoRqi2YHlnswPACmkWR9h2kRrYPwdjOXn0mq9lF4g0r1FyQA00BWvMM0hLYA
TPeS5plk6hCGN6SceqHhLw17IFH7OLydy1Jn0me3AvGaEEJDSewahmgsm9N/2VU6GWWjwWI6
S7NB8cOq6sGuzU3ONHj4S/7seb5Gi8VsOh5l03Q+yD990989yUbL+yQbLEfz+2QwmkyWyWql
1fBHkTSP/IGi79Crkl//jviupk/rLEvnVR/3yn6lZb/+rfTXd2TXzj30dTIoMlrlK/D0vAKr
dA3UWizBZjzbD/Y/7Ae4qTcf1Pb/3hOxzwmnze7dNJlNekYDxU9bCniXzu+m9+tlMbW7HqbL
ac40+U95Qrp8/XeP1qtkMniYJsvRcvzw5U1/8LxEd+v5ePPFaJ2l/ZKuzJLPyY64JtPVYjb6
MkgeFxlUczearRKNrA4HcP7APo7TZVI6oHzKuzaaXbemq4L8B9BYut5+9aafLddtDbVousmR
/prDwum8fDx5NWcN6DM8vk9aX75iUNlTGvSgvn6w3corHsBePkzv7qZj2DrTyZd9aZKO148w
PhAd8wz+vv1/UEsHCGysMKWQGQAAk5QBAFBLAwQUAAgACAC5RaYwAAAAAAAAAAAAAAAACgAA
AHN0eWxlcy54bWzVWW1v2zYQ/r5foalb0QJTJDtOGqe2AixpsQFpG7TpsH2kJUomKpECRcXJ
fv2Ob3qzLLdJs7YN0Fa8R8fnnjseSWVxdptnzg3mJWF06U4OAtfBNGIxoenS/Xj92jtxz8Kf
Fj9fvDu//ufqlcOShET4NGZRlWMqvFLcZbh0rj7+fvnnueN6vv+uwPSdQh0wnvr+xfWFo58v
zEsOTOP7r966jqvdHcQidsPFDt/AkJan2rh010IUp77PYBbWzDINgsDXz655Qb09ilcICxf4
VoyiJaAGo9Ue3wph4TFHm1G0BIDiFp+wGr3ZbA42hwo5mc/n/t8fLv3XjOeo5nKbEfppJ15Z
LZRW+QrzcSZIoI4u5U065FwLeFNTjtaIj+unEI0ih/EeRQ5jC4Zg1zsCPPHfgFH99eaykY/n
o84loI4v4qQYZ64hrq39zmqpqzZhULExjrIyXCgBmxFHP1OUQ81cVhGJUYlo6ToJ06AE5SS7
69paPgoiIhDgBnGiqsofn+EtyVdV6XwAN87lZHuWp6hg5csOSg99tUnvO6dGeymmmJNo6ZYb
Uu6Vwh9IgBnSDcQSj3GCqsy0FevU8FPL1YtwlrkWXnCoBS4IdCD7fkRylHlFhiJcLt1ph9ku
GTJE0wqlYMNUDUSsooLDnB8/bDnwUEkQ3U6hhllXFkUZxdZmvA6ZGu8Ry4sM3w4VWu27xgx5
HzSCdqApK7yYlAJR2aSDgyNCo7VKz4D44UJ3ItOQOhkxOgZuL0OyMWmg23vdMU85oR6hAqfg
MSYpEZCjiaIwMFvtI6o4hx3vbojEJJhdjRExlhuWIUHkhiB41aLXuL7LVyyzPLsVYbGtqgh/
qSn3XOyIfKAyRzWx1pSzqlA7vSLekqqryueKNZqzrVWl1kLGYD96kiTQZwNJwEwkN9vQq+mo
xx9YVh17jop6PdGYCLWJ3KCsws+eP03Fy2Vda6goMiOy16nF/SkCdSM4PMmlPJik4y9fWH0d
Dvfo0Evjr700+juYfnYILx4ewuwbh3Dy8BAmjxWCJrZN+0K38D7z1tbp7375PS7H3zWWAvH6
6O915x1uIWp3M2d9IlBGonpfgji9isaYwxkYrCUsyaZpd62e6UTKm/p/c4bZYJKu4YQIjSJu
7Wh71NIBT+8VcVespjRavWBM6z8witVt4quLrUQDjVNoXLJqMO/IqSxeySouDwEJuW1ELMm/
MDQ5lufoHVl7mN4m6Mm9oraKDUcdE44j3a0zUUfMmUBy0EOyrJbuPPh8sheQ0iuSMeGcM04b
Fe+VqpFKaOb5S24zjz/Na4IhaY8+zTmCbsf43SOXOBxnudhT4ffI+TUR2cNy8XWoP3DFNQE9
bnMfZ+f37nrmEVWC5bA+I88aLAXY6HJUQuvqBFPkk903vw2HwxpNvZzFAM24J1ZNna6hdzS3
iqEo5P68NkEEB9P50Qt1OVImxFOwZjiRtv4wN+/0x1dMQHTSWTA/mfVuWkN8EsbE/8wRLoQ7
CHbJ+Ft52Z+q6Q+dKmldMR7LL3EwfDg7lsNOyTISO08C9UeBChTr77CACl5MmpdR9EneMWhs
jy1PokD+1LK0EHBmTHEjfhPu914w35tM+8vW39l7jEEDex3JDEpPY8ft1kzedsvSOQwXstOf
Fubfco2x/vITnp2dLfz+oBkpepXQWy0yjfYkSkq4fUBrT1BWdj4JMtGa/UqGYh4ka33TCCd2
vtbYFgXrqqP5KAV/S8Z9yr7HBeOjwk63hDVnPpzKE5/k84VaO880Tsi9vwXRz8+3dOjM1BlS
66c3O1wRbIjbtwW4g9QgT31uWLrTIJh5wZEXHLthcOQHx/JL98ywkMDwN8cSBvbB7HR2dHoU
1KSHqqfL75uVlOO3geoDUDift4F67JFKzx9e6/7wL7fC/wBQSwcI0xaKTJQFAACBGwAAUEsD
BBQAAAAAALlFpjAJ9QpItQMAALUDAAAIAAAAbWV0YS54bWw8P3htbCB2ZXJzaW9uPSIxLjAi
IGVuY29kaW5nPSJVVEYtOCI/Pgo8IURPQ1RZUEUgb2ZmaWNlOmRvY3VtZW50LW1ldGEgUFVC
TElDICItLy9PcGVuT2ZmaWNlLm9yZy8vRFREIE9mZmljZURvY3VtZW50IDEuMC8vRU4iICJv
ZmZpY2UuZHRkIj48b2ZmaWNlOmRvY3VtZW50LW1ldGEgeG1sbnM6b2ZmaWNlPSJodHRwOi8v
b3Blbm9mZmljZS5vcmcvMjAwMC9vZmZpY2UiIHhtbG5zOnhsaW5rPSJodHRwOi8vd3d3Lncz
Lm9yZy8xOTk5L3hsaW5rIiB4bWxuczpkYz0iaHR0cDovL3B1cmwub3JnL2RjL2VsZW1lbnRz
LzEuMS8iIHhtbG5zOm1ldGE9Imh0dHA6Ly9vcGVub2ZmaWNlLm9yZy8yMDAwL21ldGEiIG9m
ZmljZTp2ZXJzaW9uPSIxLjAiPjxvZmZpY2U6bWV0YT48bWV0YTpnZW5lcmF0b3I+T3Blbk9m
ZmljZS5vcmcgMS4xLjAgKExpbnV4KTwvbWV0YTpnZW5lcmF0b3I+PCEtLTY0NShCdWlsZDo4
Njg3KS0tPjxtZXRhOmNyZWF0aW9uLWRhdGU+MjAwNC0wNS0wNlQwNDoyOTo1MzwvbWV0YTpj
cmVhdGlvbi1kYXRlPjxkYzpkYXRlPjIwMDQtMDUtMDZUMDQ6NDU6NTA8L2RjOmRhdGU+PGRj
Omxhbmd1YWdlPmVuLVVTPC9kYzpsYW5ndWFnZT48bWV0YTplZGl0aW5nLWN5Y2xlcz4zPC9t
ZXRhOmVkaXRpbmctY3ljbGVzPjxtZXRhOmVkaXRpbmctZHVyYXRpb24+UFQxNk0xOFM8L21l
dGE6ZWRpdGluZy1kdXJhdGlvbj48bWV0YTp1c2VyLWRlZmluZWQgbWV0YTpuYW1lPSJJbmZv
IDEiLz48bWV0YTp1c2VyLWRlZmluZWQgbWV0YTpuYW1lPSJJbmZvIDIiLz48bWV0YTp1c2Vy
LWRlZmluZWQgbWV0YTpuYW1lPSJJbmZvIDMiLz48bWV0YTp1c2VyLWRlZmluZWQgbWV0YTpu
YW1lPSJJbmZvIDQiLz48bWV0YTpkb2N1bWVudC1zdGF0aXN0aWMgbWV0YTp0YWJsZS1jb3Vu
dD0iNCIgbWV0YTpjZWxsLWNvdW50PSI4MTMiLz48L29mZmljZTptZXRhPjwvb2ZmaWNlOmRv
Y3VtZW50LW1ldGE+UEsDBBQACAAIALlFpjAAAAAAAAAAAAAAAAAMAAAAc2V0dGluZ3MueG1s
7Vpbc9o6EH4/v4Lj105qA0lO6CR0ZHMNgWCugZeOsBWjIkuOL3HcX38kG9LGxbmAmel04CGx
tKtvVyvpk7T25dcnmxQekethRq+k4mdFKiBqMBNT60oajxonF9LX6j+X/9ZutdGsXy+w+3ts
oC8mMwIbUf/EQ77Pdb1Cf6zetLWCdCLLtw6it7HeZ+Zaslwb1QpJubZuVuCGZLnekwpSAvjZ
9E2pepmJzr2k3pdEfCUtfd/5IsuM22E/7ZQURZGTsrRu8EQwXT3rh2H4OSzHusVKpSLH0o2q
weg9tl7BLsqJirSJwYuoPfu+cbl6maivgU+wj2zRn8K6mkKb9+QRo/C5l9K2Ni/1J9jDC4KA
i+CIOdJG6EcOF2LqS1XlUv4d5EPAN+jePwzyFJv+cht05ax4ujd6C2FrudXz4sXp+dl78U9s
6JxgaqInZKZtoXD7IMVt+IR1o/d4jMK2mXLT810+A6SqEBY/5KkATfk5gjwibzn6skkN+rCP
CfO/DZcI+eVvxXdMRi1wPeb2mYd9vg7ucpwzL5FnW8f0fDfoFnPxD0Z9SIYOwX6XmSg9Fkvm
7jHdketj41DoKe83Ecpzuf7q/wHwgeHjRxSjDyC1MsJT2g1842/OFLaBHWQRzJ64+VL5BlVl
vs/sDwBvqXyd2LLYJOaQP55B8lmCRwI5EsiRQA5BIKV8CGTHmfAOAilfHBnkyCBHBtmFQUoZ
t5x8KaT8p59Byu++7B0p5EghRwp5kQHICO8bFJIljhMp7x35OMWSkcNJDi97rwzDZYQsoJuZ
NCv9t2No54zZI46T65oWoBNIgjRqMlTKrrMAWkjkxF5FP98RfLhkoTCgugiu+i4SydgU/oIx
giCVqveQeGh3M3Pksth/L8uA7wZ74PeYfyjopovT2cockAWqxghzU9CEiSVULJ2XS6WzHVN8
L8b1AFFpQY+7Hth0wMIWgiZyD2MkJhLONAdAb3u3gU8wRcPIXjDiDVF6R8jFyJBCZ8QG0PNR
eqDzWFkJcNtbvwQ4mIUB8vh4Zx4PizszcRp+e6Z7T/hhsDDxI/Yy3c8JfLvzu06dBB48YW8Y
UWPpMop/oB2IaMfTwPoF0HYFD/nve72XFAIXisH9yHu+457xl+0ZN5iuxo4JfZR9myoft6Pj
dnTcjv7q7egtSyDwmQaJERBOFfnD9/kVlfejxwsZt9cmosjFRmGtuZcZvoyDdE5gAT10fqpi
Ct1IqgbWzSdZMcnCnkRw2rXGrWtnQQfEsMAf+RsrZmNE1OHkbdUpAF3+rx0XZPmiARrzMX9c
LXQAhnYFD5oNZTYETxpVed/PlPlduzIoTYL53bUzi1TdsElgNieRZle4fMKfGwqcVoL+RH00
6CCaTYmi2b1Ho0mI8UPhOL3vs+kT6Y9AZzFtRPMSCebNxoN511MWon1NCW9qwOtqYSjaLZqV
aN6chGbTqnS/6x2j1fPmd3Oy4Hq/yEtw2osxxw21rpcqwqfT2Hfd0WdTk4xL5Hw+crrzu4Ho
AwCa0xoAdQlaOpjrwAKaBXi5Bxor0NGT4CigDkQ9/xOWh04rHbxAyFUhrzEhP+8pDwNFBzNd
VUBzLKLKQF0RuHOgrZJG10AFop7/LoR+mAJtWUI+E4+d3unDIPqOOiuzDpp1YAKVAI3XC7wY
FzhJI1UHuqgXrtS4vpF29VTIm+KpGeOxMnZaE10dCbyBrgagaYAFUB3Q6gK4blR/7r8aCv00
6KfYdByfGO9B+KtYAOrqSvT/568W9yeeZTdp/9TOzfg+for9uxXye2ubvajHeDxA5CjA6j6Y
oF3r0PVQXe1IZI5DorGHXPFpUf5Epi2hCw3OMRqzHRd5gvFzz2m1vQ5yKfAwpP2AGn4At6SR
c0kRwUc0ST4nvKUaYd4hDg/J8bPhMnuEbOe1/WUPI4AQfpAW9H/NFhqkBiI7WMm8dMm/fWUp
Z30zWv0fUEsHCMee8a24BQAA3CoAAFBLAwQUAAgACAC5RaYwAAAAAAAAAAAAAAAAFQAAAE1F
VEEtSU5GL21hbmlmZXN0LnhtbK3SsWrDMBAG4L1Pod5un9OpBDuBxikU2tqDM3QU8jkRyJKx
ziF++6qD40KyBLxJ4u67X3Dp9tIacabea2czWMUJCLLK1doeMzhU79ErbDdP6XNe7Kqfci9a
aXVDntfTQZSHt8+PnYAIsejIFk2jFcWuPyLmVS6+prpgI+6/QcD0FNdcQ8BvzRDK+us1gxNz
t0Z0wXez/5IkK5yKAiRmqdGGIrLcj/8SU61lxGNHGciuM1pJDr/Gs61jP9g4DI2VNArmlmYw
JuoknzJAwIcm3FdKrXjoyT+qMV0YQ8D7qnKWQ/PfDxZ1PY+G/OJsSyyXz0rMYWmvaVO8WavN
L1BLBwhTyxNT/AAAAO4CAABQSwECFAAUAAAAAAC5RaYwRbwTlBwAAAAcAAAACAAAAAAAAAAA
AAAAAAAAAAAAbWltZXR5cGVQSwECFAAUAAgACAC5RaYwbKwwpZAZAACTlAEACwAAAAAAAAAA
AAAAAABCAAAAY29udGVudC54bWxQSwECFAAUAAgACAC5RaYw0xaKTJQFAACBGwAACgAAAAAA
AAAAAAAAAAALGgAAc3R5bGVzLnhtbFBLAQIUABQAAAAAALlFpjAJ9QpItQMAALUDAAAIAAAA
AAAAAAAAAAAAANcfAABtZXRhLnhtbFBLAQIUABQACAAIALlFpjDHnvGtuAUAANwqAAAMAAAA
AAAAAAAAAAAAALIjAABzZXR0aW5ncy54bWxQSwECFAAUAAgACAC5RaYwU8sTU/wAAADuAgAA
FQAAAAAAAAAAAAAAAACkKQAATUVUQS1JTkYvbWFuaWZlc3QueG1sUEsFBgAAAAAGAAYAWgEA
AOMqAAAAAA==
--------------090508080503070809020103--
>From  Thu 6 May 2004 09:14:27 2004
Return-Path: bogofilter-return-62-relson=osagesoftware.com at aotto.com
Return-Path: <bogofilter-return-62-relson=osagesoftware.com at aotto.com>
X-Original-To: relson at osagesoftware.com
Delivered-To: relson at osagesoftware.com
Received: from lists.refdesk.com (ns1.drudgereport.com [216.40.241.219])
	by mail.osagesoftware.com (Postfix) with SMTP id D933B2FEA6
	for <relson at osagesoftware.com>; Thu,  6 May 2004 09:14:27 -0400 (EDT)
Received: (qmail 10241 invoked by alias); 6 May 2004 13:13:29 -0000
Mailing-List: contact bogofilter-help at aotto.com; run by ezmlm
Precedence: bulk
List-Id: <bogofilter at aotto.com>
List-Post: <mailto:bogofilter at aotto.com>
List-Help: <mailto:bogofilter-help at aotto.com>
List-Unsubscribe: <mailto:bogofilter-unsubscribe at aotto.com>
List-Subscribe: <mailto:bogofilter-subscribe at aotto.com>
Delivered-To: mailing list bogofilter at aotto.com
X-Ezauth: ffnmpffnhkoopnmflkcl
Received: (qmail 10231 invoked from network); 6 May 2004 13:13:29 -0000
Message-ID: <21835.198.208.159.14.1083849259.squirrel at www.tacocat.net>
In-Reply-To: <1083845605.14628.918.camel at linuxpc>
References: <409A04EF.90109 at tacocat.net>
    <1083845605.14628.918.camel at linuxpc>
Date: Thu, 6 May 2004 09:14:19 -0400 (EDT)
From: tallison at tacocat.net
To: "Tom Anderson" <tanderso at oac-design.com>
Cc: "bogofilter" <bogofilter at aotto.com>
User-Agent: SquirrelMail/1.4.1
MIME-Version: 1.0
Content-Type: text/plain;charset=iso-8859-1
Content-Transfer-Encoding: 8bit
X-Priority: 3
Importance: Normal
X-Virus-Scanned: Symantec AntiVirus Scan Engine
X-Virus-Scanned: by amavisd-new at tacocat.net
Subject: Re: [bogofilter] spamitarium & block_on_subnets results
X-Bogosity: Ham, tests=bogofilter, spamicity=0.000000, version=0.17.5.cvs
Status:   

> On Thu, 2004-05-06 at 05:27, Tom Allison wrote:
>> spamitarium.pl, in either configuration, tends to push email towards the
>> spamminess of the scoring.  This applied to all categories of scoring
>> (HAM: Yes/No/Unsure | SPAM: Yes/No/Unsure).  Since this is consistent
>> across both HAM and SPAM corpus I personally am not certain that this is
>>   the configurations tested will be optimal as accuracy for both
>> populations, ham and spam, are what becomes really important.
>
> I think your ham results were unusual in all of the tests.  I've gotten
> zero false positives in months, with my first ham unsure a few days ago
> when someone forwarded me a commercial advertisement.  I think the
> difference stems from your configuration:
>
> robx=0.60, robs=0.1, min_dev=0.05, spam_cutoff=0.850, ham_cutoff=0.15
>
> vs. mine:
>
> robx=0.46, robs=0.2, min_dev=0.20, spam_cutoff=0.465, ham_cutoff=0.10
>

Considering that I used the same configurations through all of the tests
and training, the percentage of various scores may not be ideal when
compared to other configurations.  However, the important thing to note
here is not the absolute value of the accuracy (or lack thereof) but the
comparative differences between which one is better/worse.

For me, these settings give me a field experience of ~1-2 Unsure a week
that are all spam and nothing that's incorrectly classified beyond that. 
This works for me as I'm very cautious about losing real email.

I very intentionally leave my spam_cutoff and ham_cutoff closer to 1.0 and
0.0 respectively than bogotune recommends because I would rather have spam
scored as Unsure than Ham scored as Yes.  I consider Unsure scores to be a
minor error and false scores to be very major errors.

> The robx value plays a big role in my opinion, as you always want to
> give previously unseen tokens the benefit of the doubt.  Since you are
> predisposing unknown tokens toward spam, and spamitarium likely reduces
> some redundancy in tokens seen, I think this is why you're seeing hams
> pushed in the spammy direction, albeit not by that much.  My experience
> has been the opposite... hams become hammier, and spams also become
> spammier, but perhaps not as much as you've seen in your tests.  Note
> also that your robx is outside of your min_dev zone.
>
>> The use of --block_on_subnets=yes appears to have marginal test results
>> and may be an area of continued debate.
>

<snip>
>> Considering the number of configuration options used in spamitarium.pl,
>> it may have merit to retest using a different group of settings to
>> determine their individual impacts (assume independence) of using the
>> following options:
>
> e,d,a,f are all dependent on r.  f is dependent on d.  Other than that,
> you can assume some independence.  Turning ASN lookups on and off (with
> everything else enabled except perhaps s) vs. block-on-subnets on and
> off would be a good test.
>

They may be functionally independent of r and other values.  But there may
be a statistical dependency between these values that we are not going to
find so easily.  It's the statistical relationships that I'm concerned
with.

>> I'm open to suggestions on future testing or techniques.
>
> Primarily, I'd like to see results with a different configuration,
> perhaps the default one.  I think your robx and min_dev played a large
> role in these results.
>

I'm not sure that I agree with this based on my earlier comments.

The important thing in these measurements is not achieving a score of
>99.9% but achieving scores that provide a convincing difference or lack
thereof between different settings and parameters.  For the samples that I
ran, the ideal argument would be to run all the training based on a
configuration file that was generated by bogotune exclusively.  But I'm
not convinced that this is going to make much of a difference in the end.  I
believe we are looking for statistically significant "shifts" in the data
more than we are looking for specific target values of attribute/variable
data.  This perspective removes a dependency on the clause "YMMV".
>From  Thu 6 May 2004 12:15:32 2004
Return-Path: bogofilter-return-69-relson=osagesoftware.com at aotto.com
Return-Path: <bogofilter-return-69-relson=osagesoftware.com at aotto.com>
X-Original-To: relson at osagesoftware.com
Delivered-To: relson at osagesoftware.com
Received: from lists.refdesk.com (ns1.drudgereport.com [216.40.241.219])
	by mail.osagesoftware.com (Postfix) with SMTP id C67EE2FEA6
	for <relson at osagesoftware.com>; Thu,  6 May 2004 12:15:32 -0400 (EDT)
Received: (qmail 7569 invoked by alias); 6 May 2004 16:14:34 -0000
Mailing-List: contact bogofilter-help at aotto.com; run by ezmlm
Precedence: bulk
List-Id: <bogofilter at aotto.com>
List-Post: <mailto:bogofilter at aotto.com>
List-Help: <mailto:bogofilter-help at aotto.com>
List-Unsubscribe: <mailto:bogofilter-unsubscribe at aotto.com>
List-Subscribe: <mailto:bogofilter-subscribe at aotto.com>
Delivered-To: mailing list bogofilter at aotto.com
X-Ezauth: ghkbnfnlmdndpelbeacp
Received: (qmail 7559 invoked from network); 6 May 2004 16:14:34 -0000
Message-ID: <40038.198.208.159.14.1083860124.squirrel at www.tacocat.net>
In-Reply-To: <0e1701c43377$432f88d0$6ecfcfcf at Betson110>
References: 
    <409A04EF.90109 at tacocat.net><1083845605.14628.918.camel at linuxpc><21835.198.208.159.14.1083849259.squirrel at www.tacocat.net><0de501c43371$14540c30$6ecfcfcf at Betson110>
       <20040506100346.539ba11e at osage.osagesoftware.com> 
    <26973.198.208.159.14.1083852681.squirrel at www.tacocat.net>
    <0e1701c43377$432f88d0$6ecfcfcf at Betson110>
Date: Thu, 6 May 2004 12:15:24 -0400 (EDT)
From: tallison at tacocat.net
To: "bogofilter" <bogofilter at aotto.com>
User-Agent: SquirrelMail/1.4.1
MIME-Version: 1.0
Content-Type: text/plain;charset=iso-8859-1
Content-Transfer-Encoding: 8bit
X-Priority: 3
Importance: Normal
X-Virus-Scanned: Symantec AntiVirus Scan Engine
X-Virus-Scanned: by amavisd-new at tacocat.net
Subject: Re: [bogofilter] spamitarium & block_on_subnets results
X-Bogosity: Ham, tests=bogofilter, spamicity=0.000000, version=0.17.5.cvs
Status:   

> From: <tallison at tacocat.net>
>> How does
>> robx=0.60 (based on some recent adjustments proposed to the default
>> bogofilter.cf file)
>> min_dev = 0.10
>> strike everyone?  Or should min_dev > 0.10 to exclude robx skewing?
>
> I don't like the idea at all.  Assume you have a completely empty
> wordlist,
> and you start training on error.  You'll get loads of false positives
> since
> bogofilter will think everything is spammy!  I think that robx should be
> on
> the low side of the min_dev range with a fairly significant robs so that
> it
> doesn't get out of that range too quickly.  It has to have seen a token
> several times in my opinion before it can conclusively say it is
> indicative
> of spam.  This way, in the event you are training on error with a virgin
> wordlist, all of your initial classifications will be "unsure", as it
> should
> be, and you'll slowly start filtering hams and spams appropriately after
> you've done some training and bogofilter becomes more confident about
> those
> tokens.  Overconfidence is just as likely to lead to mistakes in a spam
> filter as it is in people, so making bogofilter more timid I think is
> useful.
>

As long as robx is within the min_dev range, it would not contribute to
the score.  I think setting robx = 0.60 would tend to pick up new spammy
words much faster than new hammy words, giving you a bias towards spam. 
To effectively negate this we could decide upon: robx=0.50, min_dev=0.10,
robs=1.0.

While this might be the best approach for testing the effect of your
script, I couldn't recommend it as the best solution in practice.  Most of
the new spam that I get has these huge blocks of dictionary spew which are
99% new words, and I do want those to be considered spam.

Eventually, with enough training (getting to your 75% train, 25% test
approach) moving robx outside of the min_dev range might not be too
unstable.

But I can only run so many tests in a day.
I think I need to set up my other box for this stuff.
>From  May 2004 10:43:17 -040 Thu 6 May 2004 10:11:21 2004
Return-Path: bogofilter-return-69-relson=osagesoftware.com at aotto.com
Date: Thu, 6 May 2004 10:43:17 -0400
From: David Relson <relson at osagesoftware.com>
Cc: "bogofilter" <bogofilter at aotto.com>
Subject: Re: [bogofilter] spamitarium & block_on_subnets results
Message-Id: <20040506104317.419362b6 at osage.osagesoftware.com>
In-Reply-To: <26973.198.208.159.14.1083852681.squirrel at www.tacocat.net>
References: <409A04EF.90109 at tacocat.net>
	<1083845605.14628.918.camel at linuxpc>
	<21835.198.208.159.14.1083849259.squirrel at www.tacocat.net>
	<0de501c43371$14540c30$6ecfcfcf at Betson110>
	<20040506100346.539ba11e at osage.osagesoftware.com>
	<26973.198.208.159.14.1083852681.squirrel at www.tacocat.net>
Organization: Osage Software Systems, Inc.
X-Mailer: Sylpheed version 0.9.10claws51 (GTK+ 1.2.10; i686-pc-linux-gnu)
Mime-Version: 1.0
Content-Type: text/plain; charset=US-ASCII
Content-Transfer-Encoding: 7bit

On Thu, 6 May 2004 10:11:21 -0400 (EDT)
tallison at tacocat.net wrote:


> I see the errors of my ways....

You're a smart young man to listen to your elders :-)

> I guess I'll make some adjustments and try it again.
> 
> How does
> robx=0.60 (based on some recent adjustments proposed to the default
> bogofilter.cf file)
> min_dev = 0.10
> strike everyone?  Or should min_dev > 0.10 to exclude robx skewing?

0.60 and 0.10 are fine to exclude robx skewing.  I verified that with:

   mkdir test
   echo junk | bogofilter -d test -s
   echo junk | bogofilter -d test -n
   echo this is a test | bogofilter -d test -vvv

and all looks fine.  (Note: the final column is "+" or "-", indicating
if the token is included in the score or not.)

> While we're doing tweaks and I'm on a learning curve:
> Currently I do only two tests with 2000 training emails from ham and
> spam each.  Is there any reason to think the test results would be
> improved if I went to four tests using 25% of the total for each
> training set?  This would change the training sizes to approximately
> 1100 spam and 1800 ham and the remaining 75% would be used for
> testing.

Bogofilter is fast enough that you can quickly run any of them -- both
50/50 splits, all four 25/75 splits, or even three 33/67 splits.

I predict that the numbers of fp/fn will differ with every run, which
just goes to show that no two message sets are identical.  Patterns
(trends) should, however, be apparent.

David
>From  Thu 6 May 2004 10:35:53 2004
Return-Path: bogofilter-return-67-relson=osagesoftware.com at aotto.com
Return-Path: <bogofilter-return-67-relson=osagesoftware.com at aotto.com>
X-Original-To: relson at osagesoftware.com
Delivered-To: relson at osagesoftware.com
Received: from lists.refdesk.com (ns1.drudgereport.com [216.40.241.219])
	by mail.osagesoftware.com (Postfix) with SMTP id 289E02FEA6
	for <relson at osagesoftware.com>; Thu,  6 May 2004 10:35:53 -0400 (EDT)
Received: (qmail 6176 invoked by alias); 6 May 2004 14:34:55 -0000
Mailing-List: contact bogofilter-help at aotto.com; run by ezmlm
Precedence: bulk
List-Id: <bogofilter at aotto.com>
List-Post: <mailto:bogofilter at aotto.com>
List-Help: <mailto:bogofilter-help at aotto.com>
List-Unsubscribe: <mailto:bogofilter-unsubscribe at aotto.com>
List-Subscribe: <mailto:bogofilter-subscribe at aotto.com>
Delivered-To: mailing list bogofilter at aotto.com
X-Ezauth: ekjkijkojgilcpnhmdmh
Received: (qmail 6166 invoked from network); 6 May 2004 14:34:55 -0000
Message-ID: <0e1701c43377$432f88d0$6ecfcfcf at Betson110>
Reply-To: "Tom Anderson" <tanderso at oac-design.com>
From: "Tom Anderson" <tanderso at oac-design.com>
To: "bogofilter" <bogofilter at aotto.com>
References:    <409A04EF.90109 at tacocat.net><1083845605.14628.918.camel at linuxpc><21835.198.208.159.14.1083849259.squirrel at www.tacocat.net><0de501c43371$14540c30$6ecfcfcf at Betson110>    <20040506100346.539ba11e at osage.osagesoftware.com>  <26973.198.208.159.14.1083852681.squirrel at www.tacocat.net> 
Date: Thu, 6 May 2004 10:34:08 -0400
MIME-Version: 1.0
Content-Type: text/plain;
	charset="iso-8859-1"
Content-Transfer-Encoding: 7bit
X-Priority: 3
X-MSMail-Priority: Normal
X-Mailer: Microsoft Outlook Express 6.00.2800.1409
X-MimeOLE: Produced By Microsoft MimeOLE V6.00.2800.1409
Subject: Re: [bogofilter] spamitarium & block_on_subnets results 
X-Bogosity: Ham, tests=bogofilter, spamicity=0.000000, version=0.17.5.cvs
Status:   

From: <tallison at tacocat.net>
> How does
> robx=0.60 (based on some recent adjustments proposed to the default
> bogofilter.cf file)
> min_dev = 0.10
> strike everyone?  Or should min_dev > 0.10 to exclude robx skewing?

I don't like the idea at all.  Assume you have a completely empty wordlist,
and you start training on error.  You'll get loads of false positives since
bogofilter will think everything is spammy!  I think that robx should be on
the low side of the min_dev range with a fairly significant robs so that it
doesn't get out of that range too quickly.  It has to have seen a token
several times in my opinion before it can conclusively say it is indicative
of spam.  This way, in the event you are training on error with a virgin
wordlist, all of your initial classifications will be "unsure", as it should
be, and you'll slowly start filtering hams and spams appropriately after
you've done some training and bogofilter becomes more confident about those
tokens.  Overconfidence is just as likely to lead to mistakes in a spam
filter as it is in people, so making bogofilter more timid I think is
useful.

> Currently I do only two tests with 2000 training emails from ham and spam
> each.  Is there any reason to think the test results would be improved if
> I went to four tests using 25% of the total for each training set?  This
> would change the training sizes to approximately 1100 spam and 1800 ham
> and the remaining 75% would be used for testing.

I'd imagine that the unknown bias would be multiplied.  If you're testing
the effectiveness of block-on-subnets and ASNs, then you probably want to
have a decent population of these in your wordlist with which to test.  If
your testing set is significantly larger than your training set, then most
of your tested emails will not contain those same IP ranges and ASNs as the
training set.  And those that do will not be very significant in the
classifications if they are mostly hapaxes.  Therefore, I would suggest
exactly the opposite... train with 75% and test with the remaining 25%.
There would of course be overlap if you performed more than one test, but if
choosing the sets randomly each time, you would still provide meaningful
results.

Tom
>From  Thu 6 May 2004 08:12:19 2004
Return-Path: bogofilter-return-61-relson=osagesoftware.com at aotto.com
Return-Path: <bogofilter-return-61-relson=osagesoftware.com at aotto.com>
X-Original-To: relson at osagesoftware.com
Delivered-To: relson at osagesoftware.com
Received: from lists.refdesk.com (ns1.drudgereport.com [216.40.241.219])
	by mail.osagesoftware.com (Postfix) with SMTP id 327D22FEA6
	for <relson at osagesoftware.com>; Thu,  6 May 2004 08:12:19 -0400 (EDT)
Received: (qmail 21725 invoked by alias); 6 May 2004 12:11:22 -0000
Mailing-List: contact bogofilter-help at aotto.com; run by ezmlm
Precedence: bulk
List-Id: <bogofilter at aotto.com>
List-Post: <mailto:bogofilter at aotto.com>
List-Help: <mailto:bogofilter-help at aotto.com>
List-Unsubscribe: <mailto:bogofilter-unsubscribe at aotto.com>
List-Subscribe: <mailto:bogofilter-subscribe at aotto.com>
Delivered-To: mailing list bogofilter at aotto.com
X-Ezauth: hcaffnnneojjcfnjmloi
Received: (qmail 21715 invoked from network); 6 May 2004 12:11:22 -0000
From: Tom Anderson <tanderso at oac-design.com>
To: bogofilter <bogofilter at aotto.com>
In-Reply-To: <409A04EF.90109 at tacocat.net> 
References: <409A04EF.90109 at tacocat.net> 
Content-Type: multipart/signed; micalg=pgp-sha1; protocol="application/pgp-signature"; boundary="=-3AIgtRcT0jBQ0BQj75u3"
Organization: 
Message-Id: <1083845605.14628.918.camel at linuxpc>
Mime-Version: 1.0
X-Mailer: Ximian Evolution 1.2.4 
Date: 06 May 2004 08:14:00 -0400
Subject: Re: [bogofilter] spamitarium & block_on_subnets results
X-Bogosity: Ham, tests=bogofilter, spamicity=0.000000, version=0.17.5.cvs
Status:   

--=-3AIgtRcT0jBQ0BQj75u3
Content-Type: text/plain
Content-Transfer-Encoding: quoted-printable

On Thu, 2004-05-06 at 05:27, Tom Allison wrote:
> spamitarium.pl, in either configuration, tends to push email towards the=20
> spamminess of the scoring.  This applied to all categories of scoring=20
> (HAM: Yes/No/Unsure | SPAM: Yes/No/Unsure).  Since this is consistent=20
> across both HAM and SPAM corpus I personally am not certain that this is=20
>   the configurations tested will be optimal as accuracy for both=20
> populations, ham and spam, are what becomes really important.

Just to clarify... the program is called "spamitarium"... you must have
added the ".pl" when you downloaded it.  I prefer not to include an
extension, as that is a Redmondian thing to do, and it makes me think
you've got the Polish version or something ;)

I think your ham results were unusual in all of the tests.  I've gotten
zero false positives in months, with my first ham unsure a few days ago
when someone forwarded me a commercial advertisement.  I think the
difference stems from your configuration:

robx=3D0.60, robs=3D0.1, min_dev=3D0.05, spam_cutoff=3D0.850, ham_cutoff=3D=
0.15

vs. mine:

robx=3D0.46, robs=3D0.2, min_dev=3D0.20, spam_cutoff=3D0.465, ham_cutoff=3D=
0.10

The robx value plays a big role in my opinion, as you always want to
give previously unseen tokens the benefit of the doubt.  Since you are
predisposing unknown tokens toward spam, and spamitarium likely reduces
some redundancy in tokens seen, I think this is why you're seeing hams
pushed in the spammy direction, albeit not by that much.  My experience
has been the opposite... hams become hammier, and spams also become
spammier, but perhaps not as much as you've seen in your tests.  Note
also that your robx is outside of your min_dev zone.

> The use of --block_on_subnets=3Dyes appears to have marginal test results=
=20
> and may be an area of continued debate.

Indeed, your test shows that it may have actually been worse.  But, I
think this also stems from your configuration, as all hapax IPs will be
considered spammy in your tests.  Using a more conservative
configuration, I think, would improve the results all around.

> Considering the number of configuration options used in spamitarium.pl,=20
> it may have merit to retest using a different group of settings to=20
> determine their individual impacts (assume independence) of using the=20
> following options:

e,d,a,f are all dependent on r.  f is dependent on d.  Other than that,
you can assume some independence.  Turning ASN lookups on and off (with
everything else enabled except perhaps s) vs. block-on-subnets on and
off would be a good test.

> I'm open to suggestions on future testing or techniques.

Primarily, I'd like to see results with a different configuration,
perhaps the default one.  I think your robx and min_dev played a large
role in these results.

>   154497 wordlist.2000.no.basic
>   151964 wordlist.2000.no.readw
>   144766 wordlist.2000.no.sreadw
>   163823 wordlist.2000.yes.basic
>   159749 wordlist.2000.yes.readw
>   151685 wordlist.2000.yes.sreadw
> Actual file size ranged from 4.9MB to 5.7MB

This is fairly significant... a roughly 15% difference in filesize
between block-on-subnets and spamitarium with all the options.

Tom


--=-3AIgtRcT0jBQ0BQj75u3
Content-Type: application/pgp-signature; name=signature.asc
Content-Description: This is a digitally signed message part

-----BEGIN PGP SIGNATURE-----
Version: GnuPG v1.2.1 (GNU/Linux)

iD8DBQBAmivlaTUjGSdb2eIRAi8AAJ0fhqB5N7s19tpbHohVv5+PYj0QYwCfYKHm
uohk8iVA1nkxHufuFGXRYDw=
=eRdk
-----END PGP SIGNATURE-----

--=-3AIgtRcT0jBQ0BQj75u3--
>From  Fri 7 May 2004 08:45:58 2004
Return-Path: bogofilter-return-79-relson=osagesoftware.com at aotto.com
Return-Path: <bogofilter-return-79-relson=osagesoftware.com at aotto.com>
X-Original-To: relson at osagesoftware.com
Delivered-To: relson at osagesoftware.com
Received: from lists.refdesk.com (ns1.drudgereport.com [216.40.241.219])
	by mail.osagesoftware.com (Postfix) with SMTP id 2569A2FEA6
	for <relson at osagesoftware.com>; Fri,  7 May 2004 08:45:58 -0400 (EDT)
Received: (qmail 17900 invoked by alias); 7 May 2004 12:44:53 -0000
Mailing-List: contact bogofilter-help at aotto.com; run by ezmlm
Precedence: bulk
List-Id: <bogofilter at aotto.com>
List-Post: <mailto:bogofilter at aotto.com>
List-Help: <mailto:bogofilter-help at aotto.com>
List-Unsubscribe: <mailto:bogofilter-unsubscribe at aotto.com>
List-Subscribe: <mailto:bogofilter-subscribe at aotto.com>
Delivered-To: mailing list bogofilter at aotto.com
X-Ezauth: agkljohhbeeggeffgmie
Received: (qmail 17890 invoked from network); 7 May 2004 12:44:53 -0000
Message-ID: <409B84FD.1040101 at tacocat.net>
Date: Fri, 07 May 2004 08:45:49 -0400
From: Tom Allison <tallison at tacocat.net>
User-Agent: Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.4.2) Gecko/20040220
X-Accept-Language: en-us, en
MIME-Version: 1.0
To: bogofilter <bogofilter at aotto.com>
References: <409A04EF.90109 at tacocat.net>	 <1083845605.14628.918.camel at linuxpc>	 <21835.198.208.159.14.1083849259.squirrel at www.tacocat.net>	 <0de501c43371$14540c30$6ecfcfcf at Betson110>	 <20040506100346.539ba11e at osage.osagesoftware.com>	 <26973.198.208.159.14.1083852681.squirrel at www.tacocat.net>	 <20040506104317.419362b6 at osage.osagesoftware.com>	 <409B6DAA.6000309 at tacocat.net> <1083930760.14629.970.camel at linuxpc>
In-Reply-To: <1083930760.14629.970.camel at linuxpc>
X-Enigmail-Version: 0.76.8.0
X-Enigmail-Supports: pgp-inline, pgp-mime
Content-Type: text/plain; charset=us-ascii; format=flowed
Content-Transfer-Encoding: 7bit
X-Virus-Scanned: Symantec AntiVirus Scan Engine
X-Virus-Scanned: by amavisd-new at tacocat.net
Subject: Re: [bogofilter] spamitarium & block_on_subnets results
X-Bogosity: Ham, tests=bogofilter, spamicity=0.000000, version=0.17.5.cvs
Status:   

Tom Anderson wrote:
 > On Fri, 2004-05-07 at 07:06, Tom Allison wrote:
 >
 >>Well, I decided to re-run the tests using
 >>robx=0.55
 >>robx=(1.0, 0.1 0.01)
 >
 >
 >   ^^^^
 > Do you mean robs here?
 >
YES

 >
 >>block_on_subnets=(yes no)
 >>and various implementations of spamitarium for a total of something like
 >>90 test sets.
 >
 >
 > Wow, that's ambitious!
 >

yeah, well bogofilter in bulk mode (-M) runs these tests pretty nicely,
but the single-mode of spamitarium adds some time.  But this will be
true with any type of "munging script" that we might put in front of
bogofilter to improve the data.  No fault of spamitarium, but a feature 
that will exist with anything that precedes bogofilter.

I'm guessing it will take something under 48 hours.

 > I'd still go lower with robx, but as long as min_dev > 0.05 in this
 > case, then it should be an adequate test.

With robx = 0.55 and min_dev=0.10 it should be sufficient to keep things
out of the test schema until they've been seen a few times.

Try running some tests on just a few words and see how long it takes to
move out of the min_dev arena with various robs values.  In all cases
that I tested it's not exactly 1 or 2 hits that will affect you.

 >
 >
 >>I need a course in remedial statistics.
 >
 >
 > Don't we all?
>From  Thu 6 May 2004 10:11:27 2004
Return-Path: bogofilter-return-66-relson=osagesoftware.com at aotto.com
Return-Path: <bogofilter-return-66-relson=osagesoftware.com at aotto.com>
X-Original-To: relson at osagesoftware.com
Delivered-To: relson at osagesoftware.com
Received: from lists.refdesk.com (ns1.drudgereport.com [216.40.241.219])
	by mail.osagesoftware.com (Postfix) with SMTP id 8D6DE2FEA6
	for <relson at osagesoftware.com>; Thu,  6 May 2004 10:11:27 -0400 (EDT)
Received: (qmail 30058 invoked by alias); 6 May 2004 14:10:30 -0000
Mailing-List: contact bogofilter-help at aotto.com; run by ezmlm
Precedence: bulk
List-Id: <bogofilter at aotto.com>
List-Post: <mailto:bogofilter at aotto.com>
List-Help: <mailto:bogofilter-help at aotto.com>
List-Unsubscribe: <mailto:bogofilter-unsubscribe at aotto.com>
List-Subscribe: <mailto:bogofilter-subscribe at aotto.com>
Delivered-To: mailing list bogofilter at aotto.com
X-Ezauth: cblokkdiagedimolimfh
Received: (qmail 30048 invoked from network); 6 May 2004 14:10:30 -0000
Message-ID: <26973.198.208.159.14.1083852681.squirrel at www.tacocat.net>
In-Reply-To: <20040506100346.539ba11e at osage.osagesoftware.com>
References: 
    <409A04EF.90109 at tacocat.net><1083845605.14628.918.camel at linuxpc><21835.198.208.159.14.1083849259.squirrel at www.tacocat.net><0de501c43371$14540c30$6ecfcfcf at Betson110>
    <20040506100346.539ba11e at osage.osagesoftware.com>
Date: Thu, 6 May 2004 10:11:21 -0400 (EDT)
From: tallison at tacocat.net
To: "bogofilter" <bogofilter at aotto.com>
User-Agent: SquirrelMail/1.4.1
MIME-Version: 1.0
Content-Type: text/plain;charset=iso-8859-1
Content-Transfer-Encoding: 8bit
X-Priority: 3
Importance: Normal
X-Virus-Scanned: Symantec AntiVirus Scan Engine
X-Virus-Scanned: by amavisd-new at tacocat.net
Subject: Re: [bogofilter] spamitarium & block_on_subnets results
X-Bogosity: Ham, tests=bogofilter, spamicity=0.000000, version=0.17.5.cvs
Status:   

> Greetings Tom A. and Tom A.,
>
> Tom Anderson is correct in his comments on robx and min_dev.  robx
> provides a ham/spam bias for unknown and little used tokens.  min_dev
> provides an "exclusion interval" for neutral tokens, with range of
> 0.5-min_dev to 0.5+min_dev.  It is generally wise to have robx be inside
> the exclusion interval.  The exception to this recommendation is when
> bogotune recommends a low min_dev, like 0.02 or 0.05, which it does for
> some people's messages.
>
> David
>
>

I see the errors of my ways....

I guess I'll make some adjustments and try it again.

How does
robx=0.60 (based on some recent adjustments proposed to the default
bogofilter.cf file)
min_dev = 0.10
strike everyone?  Or should min_dev be > 0.10 to exclude robx skewing?

While we're doing tweaks and I'm on a learning curve:
Currently I do only two tests with 2000 training emails from ham and spam
each.  Is there any reason to think the test results would be improved if
I went to four tests using 25% of the total for each training set?  This
would change the training sizes to approximately 1100 spam and 1800 ham
and the remaining 75% would be used for testing.
>From  Fri 7 May 2004 07:51:13 2004
Return-Path: bogofilter-return-77-relson=osagesoftware.com at aotto.com
Return-Path: <bogofilter-return-77-relson=osagesoftware.com at aotto.com>
X-Original-To: relson at osagesoftware.com
Delivered-To: relson at osagesoftware.com
Received: from lists.refdesk.com (ns1.drudgereport.com [216.40.241.219])
	by mail.osagesoftware.com (Postfix) with SMTP id 44F582FEA6
	for <relson at osagesoftware.com>; Fri,  7 May 2004 07:51:13 -0400 (EDT)
Received: (qmail 31655 invoked by alias); 7 May 2004 11:49:47 -0000
Mailing-List: contact bogofilter-help at aotto.com; run by ezmlm
Precedence: bulk
List-Id: <bogofilter at aotto.com>
List-Post: <mailto:bogofilter at aotto.com>
List-Help: <mailto:bogofilter-help at aotto.com>
List-Unsubscribe: <mailto:bogofilter-unsubscribe at aotto.com>
List-Subscribe: <mailto:bogofilter-subscribe at aotto.com>
Delivered-To: mailing list bogofilter at aotto.com
X-Ezauth: eonplmppbhjcldeiccep
Received: (qmail 31645 invoked from network); 7 May 2004 11:49:47 -0000
From: Tom Anderson <tanderso at oac-design.com>
To: bogofilter <bogofilter at aotto.com>
In-Reply-To: <409B6DAA.6000309 at tacocat.net> 
References: <409A04EF.90109 at tacocat.net>
	 <1083845605.14628.918.camel at linuxpc>
	 <21835.198.208.159.14.1083849259.squirrel at www.tacocat.net>
	 <0de501c43371$14540c30$6ecfcfcf at Betson110>
	 <20040506100346.539ba11e at osage.osagesoftware.com>
	 <26973.198.208.159.14.1083852681.squirrel at www.tacocat.net>
	 <20040506104317.419362b6 at osage.osagesoftware.com>
	 <409B6DAA.6000309 at tacocat.net> 
Content-Type: multipart/signed; micalg=pgp-sha1; protocol="application/pgp-signature"; boundary="=-zS9+WdAdSEJAzHvtIU+R"
Organization: 
Message-Id: <1083930760.14629.970.camel at linuxpc>
Mime-Version: 1.0
X-Mailer: Ximian Evolution 1.2.4 
Date: 07 May 2004 07:52:40 -0400
Subject: Re: [bogofilter] spamitarium & block_on_subnets results
X-Bogosity: Ham, tests=bogofilter, spamicity=0.000000, version=0.17.5.cvs
Status:   

--=-zS9+WdAdSEJAzHvtIU+R
Content-Type: text/plain
Content-Transfer-Encoding: quoted-printable

On Fri, 2004-05-07 at 07:06, Tom Allison wrote:
> Well, I decided to re-run the tests using
> robx=3D0.55
> robx=3D(1.0, 0.1 0.01)

  ^^^^
Do you mean robs here?

> block_on_subnets=3D(yes no)
> and varous implimentations of spamitarium for a total of something like=20
> 90 test sets.

Wow, that's ambitious!

> After I finish these, I am hoping there's enough information at this=20
> point to have some concensus.  I made some honest mistakes before, but=20
> am of the opinion that the issues with robx/robs have been addressed.

I'd still go lower with robx, but as long as min_dev > 0.05 in this
case, then it should be an adequate test.

> I need a course in remedial statistics.

Don't we all?

Tom



--=-zS9+WdAdSEJAzHvtIU+R
Content-Type: application/pgp-signature; name=signature.asc
Content-Description: This is a digitally signed message part

-----BEGIN PGP SIGNATURE-----
Version: GnuPG v1.2.1 (GNU/Linux)

iD8DBQBAm3iHaTUjGSdb2eIRAv/bAJwKu0O/ak1kjbC3uySZuvkqyl8dMwCeLh8/
0wjjI5ionmm01JrPhtg/HnU=
=QSO6
-----END PGP SIGNATURE-----

--=-zS9+WdAdSEJAzHvtIU+R--
>From  Fri 7 May 2004 07:06:38 2004
Return-Path: bogofilter-return-75-relson=osagesoftware.com at aotto.com
Return-Path: <bogofilter-return-75-relson=osagesoftware.com at aotto.com>
X-Original-To: relson at osagesoftware.com
Delivered-To: relson at osagesoftware.com
Received: from lists.refdesk.com (ns1.drudgereport.com [216.40.241.219])
	by mail.osagesoftware.com (Postfix) with SMTP id 31B752FEA6
	for <relson at osagesoftware.com>; Fri,  7 May 2004 07:06:38 -0400 (EDT)
Received: (qmail 16409 invoked by alias); 7 May 2004 11:05:22 -0000
Mailing-List: contact bogofilter-help at aotto.com; run by ezmlm
Precedence: bulk
List-Id: <bogofilter at aotto.com>
List-Post: <mailto:bogofilter at aotto.com>
List-Help: <mailto:bogofilter-help at aotto.com>
List-Unsubscribe: <mailto:bogofilter-unsubscribe at aotto.com>
List-Subscribe: <mailto:bogofilter-subscribe at aotto.com>
Delivered-To: mailing list bogofilter at aotto.com
X-Ezauth: lfiejglolplgpbjagnoh
Received: (qmail 16397 invoked from network); 7 May 2004 11:05:22 -0000
Message-ID: <409B6DAA.6000309 at tacocat.net>
Date: Fri, 07 May 2004 07:06:18 -0400
From: Tom Allison <tallison at tacocat.net>
User-Agent: Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.4.2) Gecko/20040220
X-Accept-Language: en-us, en
MIME-Version: 1.0
To: bogofilter <bogofilter at aotto.com>
References: <409A04EF.90109 at tacocat.net>	<1083845605.14628.918.camel at linuxpc>	<21835.198.208.159.14.1083849259.squirrel at www.tacocat.net>	<0de501c43371$14540c30$6ecfcfcf at Betson110>	<20040506100346.539ba11e at osage.osagesoftware.com>	<26973.198.208.159.14.1083852681.squirrel at www.tacocat.net> <20040506104317.419362b6 at osage.osagesoftware.com>
In-Reply-To: <20040506104317.419362b6 at osage.osagesoftware.com>
X-Enigmail-Version: 0.76.8.0
X-Enigmail-Supports: pgp-inline, pgp-mime
Content-Type: text/plain; charset=us-ascii; format=flowed
Content-Transfer-Encoding: 7bit
X-Virus-Scanned: Symantec AntiVirus Scan Engine
X-Virus-Scanned: by amavisd-new at tacocat.net
Subject: Re: [bogofilter] spamitarium & block_on_subnets results
X-Bogosity: Ham, tests=bogofilter, spamicity=0.000000, version=0.17.5.cvs
Status:   

David Relson wrote:
> On Thu, 6 May 2004 10:11:21 -0400 (EDT)
> tallison at tacocat.net wrote:
> 
> 
> 
>>I see the errors of my ways....
> 
> 
> You're a smart young man to listen to your elders :-)
> 
> 
>>I guess I'll make some adjustments and try it again.
>>
> 
> 0.60 and 0.10 are fine to exclude robx skewing.  I verified that with:
> 
>    mkdir test
>    echo junk | bogofilter -d test -s
>    echo junk | bogofilter -d test -n
>    echo this is a test | bogofilter -d test -vvv
> 
> and all looks fine.  (Note: the final column is "+" or "-", indicating
> if the token is included in the score or not.)
> 
> 

Well, I decided to re-run the tests using
robx=0.55
robx=(1.0, 0.1 0.01)
block_on_subnets=(yes no)
and various implementations of spamitarium for a total of something like 
90 test sets.

I've converted everything into three corpus bodies each for spam/ham 
using a quickie script I wrote up (Dave, I couldn't get yours to behave 
so went with what I knew):
## dist.sh ##
X=0
LIM=2

rsync -e ssh -r 192.168.xxx.yyy:~/Maildir/.training* ~/Maildir/
rm -f dist*.mbx

BOX="ham spam"
for B in $BOX
do
   D="$HOME/Maildir/.training.$B/cur/"
   for F in `find $D -type f`;
     do
     formail -bz -I X-Spam-Status -I X-Spam-Level -I X-Bogosity -I 
X-Razor-Check < $F >> dist$B$X.mbx
     X=$(($X+1));
     [ $X -gt $LIM ] && X=0
   done
done
########

This should have the added improvement over my last tests in how the 
email is distributed.  Previously training and test bodies were 
differentiated by chronology.  Not intentionally.  This would distribute 
the emails the same way you would deal out a deck of cards.

SIDE NOTE: I noticed that at least one email was rather corrupted and 
couldn't read correctly (characters were not normal ascii) and even a 
stdout print would cause the term windows to go into that annoying 
graphics mode.  Any suggestions on how to detect these and remove them 
out of 11,000 emails?

After I finish these, I am hoping there's enough information at this 
point to have some consensus.  I made some honest mistakes before, but 
am of the opinion that the issues with robx/robs have been addressed.

I suppose the next trick would be to actually understand what ESF is and 
how it actually works.  But that's going to take some more reading.  I 
haven't finished the first article yet and I'm already going all cross-eyed.

I need a course in remedial statistics.
>From  Thu 6 May 2004 09:51:36 2004
Return-Path: bogofilter-return-64-relson=osagesoftware.com at aotto.com
Return-Path: <bogofilter-return-64-relson=osagesoftware.com at aotto.com>
X-Original-To: relson at osagesoftware.com
Delivered-To: relson at osagesoftware.com
Received: from lists.refdesk.com (ns1.drudgereport.com [216.40.241.219])
	by mail.osagesoftware.com (Postfix) with SMTP id AD2EB2FEA6
	for <relson at osagesoftware.com>; Thu,  6 May 2004 09:51:36 -0400 (EDT)
Received: (qmail 23100 invoked by alias); 6 May 2004 13:50:40 -0000
Mailing-List: contact bogofilter-help at aotto.com; run by ezmlm
Precedence: bulk
List-Id: <bogofilter at aotto.com>
List-Post: <mailto:bogofilter at aotto.com>
List-Help: <mailto:bogofilter-help at aotto.com>
List-Unsubscribe: <mailto:bogofilter-unsubscribe at aotto.com>
List-Subscribe: <mailto:bogofilter-subscribe at aotto.com>
Delivered-To: mailing list bogofilter at aotto.com
X-Ezauth: aealchcikdeoojlfefdj
Received: (qmail 23090 invoked from network); 6 May 2004 13:50:39 -0000
Message-ID: <0de501c43371$14540c30$6ecfcfcf at Betson110>
Reply-To: "Tom Anderson" <tanderso at oac-design.com>
From: "Tom Anderson" <tanderso at oac-design.com>
To: "bogofilter" <bogofilter at aotto.com>
References: <409A04EF.90109 at tacocat.net>    <1083845605.14628.918.camel at linuxpc>  <21835.198.208.159.14.1083849259.squirrel at www.tacocat.net> 
Date: Thu, 6 May 2004 09:49:53 -0400
MIME-Version: 1.0
Content-Type: text/plain;
	charset="iso-8859-1"
Content-Transfer-Encoding: 7bit
X-Priority: 3
X-MSMail-Priority: Normal
X-Mailer: Microsoft Outlook Express 6.00.2800.1409
X-MimeOLE: Produced By Microsoft MimeOLE V6.00.2800.1409
Subject: Re: [bogofilter] spamitarium & block_on_subnets results 
X-Bogosity: Ham, tests=bogofilter, spamicity=0.000000, version=0.17.5.cvs
Status:   

From: <tallison at tacocat.net>
> Considering that I used the same configurations through all of the tests
> and training, the percentage of various scores may not be ideal when
> compared to other configurations.  However, the important thing to note
> here is not the absolute value of the accuracy (or lack thereof) but the
> comparative differences between which one is better/worse.

The robx and min_dev play a role in the comparative differences.  In your
configuration, if bogofilter has not seen a token before, then it will
assign it a value of 0.6.  Normally we want such tokens to not play a role
in classification since we have no basis in experience on which to classify
them.  However, your min_dev range is 0.45 to 0.55, so you're telling
bogofilter to classify every single new token as spammy.  This is pushing
all of your scores toward the spam direction pretty drastically.  And since
spamitarium reduces certain unwanted redundancy and introduces new helo-,
ASN, rDNS, and IP tokens, the spamitarium results likely have more hapaxes.
This is the core purpose of spamitarium, as when these tokens are seen
again, they help filter the email appropriately.  However, when these tokens
are unknowns/hapaxes (being classified as robx for the first time), your
config values are causing them to push all emails (notably hams) into the
spam direction.  This is not ideal behavior.  A better test of spamitarium
(and a better usage of bogofilter, IMHO) is to keep your robx value within
the min_dev exclusion range.  Otherwise, you're biasing the test against
spamitarium's core purpose.  This same phenomenon is responsible for your
poor results with block-on-subnets, as again, there are more unknown tokens
under those conditions I believe.

> scored as Unsure than Ham scored as Yes.  I consider Unsure scores to be a
> minor error and false scores to be very major errors.

I agree with this.  However, your settings do not follow your philosophy.
Right now, if you get a ham email from someone discussing a topic with lots
of words unknown to your wordlist, and just a few spammy words thrown in,
then you'll probably classify it as spam due to the bias you've set up with
your robx and min_dev values.  I would give such an email the benefit of the
doubt by not deciding on the unknown words until I've seen them more than
once.

> ran, the ideal argument would be to run all the training based on a
> configuration file that was generated by bogotune exclusively.  But I'm
> not convinced that this is going to make much a difference in the end.  I
> believe we are looking for statistically significant "shifts" in the data
> more than we are looking for specific target values of attribute/variable
> data.  This perspective removes a dependency on the clause "YMMV".

You don't need to run bogotune.  Just reduce your robx closer to 0.5,
preferably favoring ham slightly.  And make sure it is closer to 0.5 than
min_dev.  This is important to properly test spamitarium.  The cutoffs don't
really matter from a comparative perspective, but classifying unknowns as
spam is detrimental.

Tom
>From  Thu 6 May 2004 12:52:34 2004
Return-Path: bogofilter-return-71-relson=osagesoftware.com at aotto.com
Return-Path: <bogofilter-return-71-relson=osagesoftware.com at aotto.com>
X-Original-To: relson at osagesoftware.com
Delivered-To: relson at osagesoftware.com
Received: from lists.refdesk.com (ns1.drudgereport.com [216.40.241.219])
	by mail.osagesoftware.com (Postfix) with SMTP id 5D68E2FEA6
	for <relson at osagesoftware.com>; Thu,  6 May 2004 12:52:34 -0400 (EDT)
Received: (qmail 20473 invoked by alias); 6 May 2004 16:51:37 -0000
Mailing-List: contact bogofilter-help at aotto.com; run by ezmlm
Precedence: bulk
List-Id: <bogofilter at aotto.com>
List-Post: <mailto:bogofilter at aotto.com>
List-Help: <mailto:bogofilter-help at aotto.com>
List-Unsubscribe: <mailto:bogofilter-unsubscribe at aotto.com>
List-Subscribe: <mailto:bogofilter-subscribe at aotto.com>
Delivered-To: mailing list bogofilter at aotto.com
X-Ezauth: iijocopmgapllifmdacn
Received: (qmail 20463 invoked from network); 6 May 2004 16:51:37 -0000
Message-ID: <0e7301c4338a$5c16ad70$6ecfcfcf at Betson110>
Reply-To: "Tom Anderson" <tanderso at oac-design.com>
From: "Tom Anderson" <tanderso at oac-design.com>
To: "bogofilter" <bogofilter at aotto.com>
References:    <409A04EF.90109 at tacocat.net><1083845605.14628.918.camel at linuxpc><21835.198.208.159.14.1083849259.squirrel at www.tacocat.net><0de501c43371$14540c30$6ecfcfcf at Betson110>       <20040506100346.539ba11e at osage.osagesoftware.com>     <26973.198.208.159.14.1083852681.squirrel at www.tacocat.net>    <0e1701c43377$432f88d0$6ecfcfcf at Betson110>  <40038.198.208.159.14.1083860124.squirrel at www.tacocat.net> 
Date: Thu, 6 May 2004 12:50:51 -0400
MIME-Version: 1.0
Content-Type: text/plain;
	charset="iso-8859-1"
Content-Transfer-Encoding: 7bit
X-Priority: 3
X-MSMail-Priority: Normal
X-Mailer: Microsoft Outlook Express 6.00.2800.1409
X-MimeOLE: Produced By Microsoft MimeOLE V6.00.2800.1409
Subject: Re: [bogofilter] spamitarium & block_on_subnets results 
X-Bogosity: Ham, tests=bogofilter, spamicity=0.000000, version=0.17.5.cvs
Status:   

From: <tallison at tacocat.net>
> As long as robx is within the min_dev range, it would not contribute to
> the score.  I think setting robx = 0.60 would tend to pick up new spammy
> words much faster than new hammy words, giving you a bias towards spam.
> To effectively negate this we could decide upon: robx=0.50, min_dev=0.10,
> robs=1.0.

I think you mean robs=0.1, right?  You don't want tokens to be ignored for
too long.  They should start contributing to classifications after 2-3
registrations.  Mine is set at 0.2, but I'm not entirely certain if that
achieves my goal.  Does anyone know the effect of robs=1.0?

> While this might be the best approach for testing the effect of your
> script, I couldn't recommend it as the best solution in practice.  Most of
> the new spam that I get has these huge blocks of dictionary spew which are
> 99% new words and, for me, I do want to consider as spam.

On the contrary, it works very well in practice.  If you get an email with
large dictionary spews, then all of those tokens are completely ignored, and
only the important ones like the IP, domain, ASN, subject line, spam
payload, etc., (assuming they were seen before) are used to classify the
message.  This is exactly what we want.  If you set robx above min_dev, then
you run the risk of false positives when you receive an email about a new
subject or from a new contact.  I would want these to definitely be "unsure"
rather than have bogofilter assume they are spam.  I receive relatively few
dictionary spew spam as unsures.  Mine are mostly "long story" spams and
virii with just an attachment (2-3 per day perhaps).  But these are
decreasing as I add ASNs to my wordlist over time.

> Eventually, with enough training (getting to your 75% train, 25% test
> approach) moving robx outside of the min_dev range might not be too
> unstable.

I still wouldn't do it for the above reason.  I want almost zero possibility
of a false positive.  Spamitarium was written so that I could still be
conservative with my robx and min_dev (thus producing near-zero false
positives), but also more aggressively filter the header info so that I get
less false negatives and unsures.

Tom
>From  Thu 6 May 2004 21:03:41 2004
Return-Path: bogofilter-return-73-relson=osagesoftware.com at aotto.com
Return-Path: <bogofilter-return-73-relson=osagesoftware.com at aotto.com>
X-Original-To: relson at osagesoftware.com
Delivered-To: relson at osagesoftware.com
Received: from lists.refdesk.com (ns1.drudgereport.com [216.40.241.219])
	by mail.osagesoftware.com (Postfix) with SMTP id EF5C42FEA6
	for <relson at osagesoftware.com>; Thu,  6 May 2004 21:03:41 -0400 (EDT)
Received: (qmail 21165 invoked by alias); 7 May 2004 01:02:39 -0000
Mailing-List: contact bogofilter-help at aotto.com; run by ezmlm
Precedence: bulk
List-Id: <bogofilter at aotto.com>
List-Post: <mailto:bogofilter at aotto.com>
List-Help: <mailto:bogofilter-help at aotto.com>
List-Unsubscribe: <mailto:bogofilter-unsubscribe at aotto.com>
List-Subscribe: <mailto:bogofilter-subscribe at aotto.com>
Delivered-To: mailing list bogofilter at aotto.com
X-Ezauth: iklbiipcahcnjcfcleok
Received: (qmail 21155 invoked from network); 7 May 2004 01:02:39 -0000
Message-ID: <409AE064.8000309 at tacocat.net>
Date: Thu, 06 May 2004 21:03:32 -0400
From: Tom Allison <tallison at tacocat.net>
User-Agent: Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.4.2) Gecko/20040220
X-Accept-Language: en-us, en
MIME-Version: 1.0
To: bogofilter <bogofilter at aotto.com>
References: <409A0A55.9020805 at tacocat.net> <20040506165141.GO28747 at f00f.net>
In-Reply-To: <20040506165141.GO28747 at f00f.net>
X-Enigmail-Version: 0.76.8.0
X-Enigmail-Supports: pgp-inline, pgp-mime
Content-Type: text/plain; charset=us-ascii; format=flowed
Content-Transfer-Encoding: 7bit
X-Virus-Scanned: Symantec AntiVirus Scan Engine
X-Virus-Scanned: by amavisd-new at tacocat.net
Subject: Re: [bogofilter] Re: redundancy
X-Bogosity: Ham, tests=bogofilter, spamicity=0.000000, version=0.17.5.cvs
Status:   

Ben Damm wrote:
> This brings up something that has been on my mind lately.  I've noticed
> that some spammers are adding many copies of my own email address to
> random locations in the body of the email.  What this says to me is that
> my email address will become a spammy token.  I suspect that this will
> not significantly change the bogosity score of otherwise valid messages,
> but can someone confirm this?
> 
> Thanks,
> -Ben
> 

header identifiers will accommodate this.

As a test, do this:

bogoutil -d .bogofilter/wordlist.db | grep dammfine

And you'll see there is generally a difference between the header and 
the body information.
>From  Thu 6 May 2004 13:02:54 2004
Return-Path: bogofilter-return-72-relson=osagesoftware.com at aotto.com
Return-Path: <bogofilter-return-72-relson=osagesoftware.com at aotto.com>
X-Original-To: relson at osagesoftware.com
Delivered-To: relson at osagesoftware.com
Received: from lists.refdesk.com (ns1.drudgereport.com [216.40.241.219])
	by mail.osagesoftware.com (Postfix) with SMTP id 67D052FEA6
	for <relson at osagesoftware.com>; Thu,  6 May 2004 13:02:54 -0400 (EDT)
Received: (qmail 24284 invoked by alias); 6 May 2004 17:01:55 -0000
Mailing-List: contact bogofilter-help at aotto.com; run by ezmlm
Precedence: bulk
List-Id: <bogofilter at aotto.com>
List-Post: <mailto:bogofilter at aotto.com>
List-Help: <mailto:bogofilter-help at aotto.com>
List-Unsubscribe: <mailto:bogofilter-unsubscribe at aotto.com>
List-Subscribe: <mailto:bogofilter-subscribe at aotto.com>
Delivered-To: mailing list bogofilter at aotto.com
X-Ezauth: aplbknknpehomdjnelda
Received: (qmail 24274 invoked from network); 6 May 2004 17:01:55 -0000
Message-ID: <0e8e01c4338b$cbe093e0$6ecfcfcf at Betson110>
Reply-To: "Tom Anderson" <tanderso at oac-design.com>
From: "Tom Anderson" <tanderso at oac-design.com>
To: "bogofilter" <bogofilter at aotto.com>
References: <409A0A55.9020805 at tacocat.net>  <20040506165141.GO28747 at f00f.net> 
Date: Thu, 6 May 2004 13:01:08 -0400
MIME-Version: 1.0
Content-Type: text/plain;
	charset="iso-8859-1"
Content-Transfer-Encoding: 7bit
X-Priority: 3
X-MSMail-Priority: Normal
X-Mailer: Microsoft Outlook Express 6.00.2800.1409
X-MimeOLE: Produced By Microsoft MimeOLE V6.00.2800.1409
Subject: Re: [bogofilter] Re: redundancy 
X-Bogosity: Ham, tests=bogofilter, spamicity=0.000000, version=0.17.5.cvs
Status:   

From: "Ben Damm" <bdamm-bogofilter at dammfine.com>
> This brings up something that has been on my mind lately.  I've noticed
> that some spammers are adding many copies of my own email address to
> random locations in the body of the email.  What this says to me is that
> my email address will become a spammy token.  I suspect that this will
> not significantly change the bogosity score of otherwise valid messages,
> but can someone confirm this?

Bogofilter uses @ as a separator (wrongly IMHO), so your full email address
will not show up in your wordlist at all, unless the @ has been replaced by
another character, as is often done.  Sometimes instead of user at domain, spam
software will turn this into domain-user, user!domain, domainuser,
userdomain, etc.  Most of these are decisively spammy in my wordlist, as it
should be since my email address is not often referenced in the body of
hams.  "rcvd:username" is roughly neutral.  Some combinations are hammy.
The point is, this is just like any other token.  Bogofilter will handle it
appropriately.

Tom
>From  Wed 5 May 2004 10:00:10 2004
Return-Path: bogofilter-return-54-relson=osagesoftware.com at aotto.com
Return-Path: <bogofilter-return-54-relson=osagesoftware.com at aotto.com>
X-Original-To: relson at osagesoftware.com
Delivered-To: relson at osagesoftware.com
Received: from lists.refdesk.com (ns1.drudgereport.com [216.40.241.219])
	by mail.osagesoftware.com (Postfix) with SMTP id 3AB1B2FEA6
	for <relson at osagesoftware.com>; Wed,  5 May 2004 10:00:10 -0400 (EDT)
Received: (qmail 9697 invoked by alias); 5 May 2004 13:59:20 -0000
Mailing-List: contact bogofilter-help at aotto.com; run by ezmlm
Precedence: bulk
List-Id: <bogofilter at aotto.com>
List-Post: <mailto:bogofilter at aotto.com>
List-Help: <mailto:bogofilter-help at aotto.com>
List-Unsubscribe: <mailto:bogofilter-unsubscribe at aotto.com>
List-Subscribe: <mailto:bogofilter-subscribe at aotto.com>
Delivered-To: mailing list bogofilter at aotto.com
X-Ezauth: gbnljfdchjdjajdfjfck
Received: (qmail 9687 invoked from network); 5 May 2004 13:59:20 -0000
Date: Wed, 5 May 2004 15:00:00 +0100
From: Richard Kimber <rkimber at ntlworld.com>
To: bogofilter <bogofilter at aotto.com>
Message-Id: <20040505150000.63e1a966.rkimber at ntlworld.com>
X-Mailer: Sylpheed version 0.9.10 (GTK+ 1.2.10; i686-pc-linux-gnu)
Mime-Version: 1.0
Content-Type: text/plain; charset=US-ASCII
Content-Transfer-Encoding: 7bit
Sender: Political Science Resources <rkimber at ntlworld.com>
Subject: [bogofilter] interesting paper
X-Bogosity: Ham, tests=bogofilter, spamicity=0.000000, version=0.17.5.cvs
Status:   

There's an interesting paper at:

http://crm114.sourceforge.net/Plateau_Paper.html

entitled: The Spam-Filtering Accuracy Plateau at 99.9% Accuracy and How
to Get Past It.

- Richard.
-- 
Richard Kimber
http://www.psr.keele.ac.uk/
>From  Wed 5 May 2004 17:47:31 2004
Return-Path: bogofilter-return-56-relson=osagesoftware.com at aotto.com
Return-Path: <bogofilter-return-56-relson=osagesoftware.com at aotto.com>
X-Original-To: relson at osagesoftware.com
Delivered-To: relson at osagesoftware.com
Received: from lists.refdesk.com (ns1.drudgereport.com [216.40.241.219])
	by mail.osagesoftware.com (Postfix) with SMTP id 45BE62FEA6
	for <relson at osagesoftware.com>; Wed,  5 May 2004 17:47:31 -0400 (EDT)
Received: (qmail 4472 invoked by alias); 5 May 2004 21:46:39 -0000
Mailing-List: contact bogofilter-help at aotto.com; run by ezmlm
Precedence: bulk
List-Id: <bogofilter at aotto.com>
List-Post: <mailto:bogofilter at aotto.com>
List-Help: <mailto:bogofilter-help at aotto.com>
List-Unsubscribe: <mailto:bogofilter-unsubscribe at aotto.com>
List-Subscribe: <mailto:bogofilter-subscribe at aotto.com>
Delivered-To: mailing list bogofilter at aotto.com
X-Ezauth: mbhkjennfoghcfgloaof
Received: (qmail 4460 invoked from network); 5 May 2004 21:46:39 -0000
Message-ID: <409960EB.5080106 at tacocat.net>
Date: Wed, 05 May 2004 17:47:23 -0400
From: Tom Allison <tallison at tacocat.net>
User-Agent: Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.4.2) Gecko/20040220
X-Accept-Language: en-us, en
MIME-Version: 1.0
To: Tom Anderson <tanderso at oac-design.com>
Cc: bogofilter <bogofilter at aotto.com>
References: <20040505150000.63e1a966.rkimber at ntlworld.com> <0cd601c432b5$2746fe90$6ecfcfcf at Betson110>
In-Reply-To: <0cd601c432b5$2746fe90$6ecfcfcf at Betson110>
X-Enigmail-Version: 0.76.8.0
X-Enigmail-Supports: pgp-inline, pgp-mime
Content-Type: text/plain; charset=us-ascii; format=flowed
Content-Transfer-Encoding: 7bit
X-Virus-Scanned: Symantec AntiVirus Scan Engine
X-Virus-Scanned: by amavisd-new at tacocat.net
Subject: Re: [bogofilter] interesting paper
X-Bogosity: Ham, tests=bogofilter, spamicity=0.000000, version=0.17.5.cvs
Status:   

Tom Anderson wrote:
> From: "Richard Kimber" <rkimber at ntlworld.com>
> 
>>http://crm114.sourceforge.net/Plateau_Paper.html
>>
>>entitled: The Spam-Filtering Accuracy Plateau at 99.9% Accuracy and How
>>to Get Past It.
> 
> 
> I agree with this idea that shared resources is useful, but not in a
> blacklisting sense.  It's odd that the author would "solve" the statistical
> plateau with a non-statistical approach.  Instead, I'd like to see the
> ability for bogofilter to accept several URLs (perhaps with passwords) in
> its configuration for corroborative or supplementary wordlists.  That is,
> look up all of the tokens in your own local wordlist first, but if you
> haven't seen a token before, then instead of using robx (unless you have no
> URLs specified or no connection), look up that particular token in your
> first supplementary wordlist.  If it isn't there, or you timeout, then move
> to the next one.  This way, you could share with your friends or colleagues,
> but in the order in which you think your email is most similar, with
> failover.  This would also allow an organization to maintain a global
> wordlist, perhaps from a spam trap, and keep local wordlists for each user
> that only represent the deviation from the global one, thus significantly
> reducing their size.
> 
> Tom
> 

I could imagine that sharing the IP/ASN information between different 
users might make a contribution that is more independent of one's 
personal interpretation of spam.  Recall that bogofilter is highly tuned 
to one person's email and context.  But it might be possible that IP/ASN 
information is more independent than other data.
>From  Wed 5 May 2004 11:26:26 2004
Return-Path: bogofilter-return-55-relson=osagesoftware.com at aotto.com
Return-Path: <bogofilter-return-55-relson=osagesoftware.com at aotto.com>
X-Original-To: relson at osagesoftware.com
Delivered-To: relson at osagesoftware.com
Received: from lists.refdesk.com (ns1.drudgereport.com [216.40.241.219])
	by mail.osagesoftware.com (Postfix) with SMTP id C3FD62FEA6
	for <relson at osagesoftware.com>; Wed,  5 May 2004 11:26:26 -0400 (EDT)
Received: (qmail 6341 invoked by alias); 5 May 2004 15:25:34 -0000
Mailing-List: contact bogofilter-help at aotto.com; run by ezmlm
Precedence: bulk
List-Id: <bogofilter at aotto.com>
List-Post: <mailto:bogofilter at aotto.com>
List-Help: <mailto:bogofilter-help at aotto.com>
List-Unsubscribe: <mailto:bogofilter-unsubscribe at aotto.com>
List-Subscribe: <mailto:bogofilter-subscribe at aotto.com>
Delivered-To: mailing list bogofilter at aotto.com
X-Ezauth: jlecpnhafifepdamkofe
Received: (qmail 6331 invoked from network); 5 May 2004 15:25:34 -0000
Message-ID: <0cd601c432b5$2746fe90$6ecfcfcf at Betson110>
Reply-To: "Tom Anderson" <tanderso at oac-design.com>
From: "Tom Anderson" <tanderso at oac-design.com>
To: "bogofilter" <bogofilter at aotto.com>
References: <20040505150000.63e1a966.rkimber at ntlworld.com> 
Date: Wed, 5 May 2004 11:24:39 -0400
MIME-Version: 1.0
Content-Type: text/plain;
	charset="iso-8859-1"
Content-Transfer-Encoding: 7bit
X-Priority: 3
X-MSMail-Priority: Normal
X-Mailer: Microsoft Outlook Express 6.00.2800.1409
X-MimeOLE: Produced By Microsoft MimeOLE V6.00.2800.1409
Subject: Re: [bogofilter] interesting paper 
X-Bogosity: Ham, tests=bogofilter, spamicity=0.000000, version=0.17.5.cvs
Status:   

From: "Richard Kimber" <rkimber at ntlworld.com>
> http://crm114.sourceforge.net/Plateau_Paper.html
>
> entitled: The Spam-Filtering Accuracy Plateau at 99.9% Accuracy and How
> to Get Past It.

I agree with this idea that shared resources is useful, but not in a
blacklisting sense.  It's odd that the author would "solve" the statistical
plateau with a non-statistical approach.  Instead, I'd like to see the
ability for bogofilter to accept several URLs (perhaps with passwords) in
its configuration for corroborative or supplementary wordlists.  That is,
look up all of the tokens in your own local wordlist first, but if you
haven't seen a token before, then instead of using robx (unless you have no
URLs specified or no connection), look up that particular token in your
first supplementary wordlist.  If it isn't there, or you timeout, then move
to the next one.  This way, you could share with your friends or colleagues,
but in the order in which you think your email is most similar, with
failover.  This would also allow an organization to maintain a global
wordlist, perhaps from a spam trap, and keep local wordlists for each user
that only represent the deviation from the global one, thus significantly
reducing their size.

Tom
>From  Wed 5 May 2004 21:31:42 2004
Return-Path: tallison at tacocat.net
Return-Path: <tallison at tacocat.net>
X-Original-To: relson at osagesoftware.com
Delivered-To: relson at osagesoftware.com
Received: from ms-smtp-01-eri0.ohiordc.rr.com (ms-smtp-01-smtplb.ohiordc.rr.com [65.24.5.135])
	by mail.osagesoftware.com (Postfix) with ESMTP id 62A4C2FEA6
	for <relson at osagesoftware.com>; Wed,  5 May 2004 21:31:42 -0400 (EDT)
Received: from janus.tacocat.net (cpe-069-133-095-206.twmi.rr.com [69.133.95.206])
	by ms-smtp-01-eri0.ohiordc.rr.com (8.12.10/8.12.7) with ESMTP id i461VcUK012158
	for <relson at osagesoftware.com>; Wed, 5 May 2004 21:31:39 -0400 (EDT)
Received: from localhost (localhost [127.0.0.1])
	by janus.tacocat.net (Postfix) with ESMTP id 5D76321315D
	for <relson at osagesoftware.com>; Wed,  5 May 2004 21:31:38 -0400 (EDT)
Received: from janus.tacocat.net ([127.0.0.1])
 by localhost (janus [127.0.0.1]) (amavisd-new, port 10024) with ESMTP
 id 06029-02 for <relson at osagesoftware.com>;
 Wed,  5 May 2004 21:31:38 -0400 (EDT)
Received: from tacocat.net (unknown [192.168.1.10])
	by janus.tacocat.net (Postfix) with ESMTP id DD80121311C
	for <relson at osagesoftware.com>; Wed,  5 May 2004 21:31:37 -0400 (EDT)
Message-ID: <40999578.7000702 at tacocat.net>
Date: Wed, 05 May 2004 21:31:36 -0400
From: Tom Allison <tallison at tacocat.net>
User-Agent: Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.4.2) Gecko/20040220
X-Accept-Language: en-us, en
MIME-Version: 1.0
To: David Relson <relson at osagesoftware.com>
Subject: Re: [bogofilter] Improved Calculations
References: <20040504182252.4b87349a at osage.osagesoftware.com>	<1083757382.14620.846.camel at linuxpc>	<20040505075432.399f3ad9 at osage.osagesoftware.com>	<409969D0.1020509 at tacocat.net> <20040505190347.6d1300c9 at osage.osagesoftware.com>
In-Reply-To: <20040505190347.6d1300c9 at osage.osagesoftware.com>
X-Enigmail-Version: 0.76.8.0
X-Enigmail-Supports: pgp-inline, pgp-mime
Content-Type: text/plain; charset=us-ascii; format=flowed
Content-Transfer-Encoding: 7bit
X-Virus-Scanned: Symantec AntiVirus Scan Engine
X-Virus-Scanned: by amavisd-new at tacocat.net
X-Bogosity: Ham, tests=bogofilter, spamicity=0.000000, version=0.17.5.cvs
Status:   

David Relson wrote:
> On Wed, 05 May 2004 18:25:20 -0400
> Tom Allison wrote:
> 
> ...[snip]...
> 
> 
>>I see this getting really ugly really fast.
>>How does spam_cutoff/ham_cutoff and sp_esf/ns_esf inter-relate to each
>>
>>other?
> 
> 
> In some ways this is definitely so.  In its first scan bogotune uses 5
> values of robs, 5 of robx, and 9 min_dev for a total of 225
> combinations.  Adding 5 for sp_esf and 5 for ns_esf increases the count
> by a factor of 25, i.e. to 5625.  So the comprehensive scan becomes time
> consuming.
> 
> spam_cutoff and ham_cutoff are pretty much separate from one another,
> except for the obvious -- ham_cutoff must be less than or equal to
> spam_cutoff.  The two sp_esf and ns_esf factors are separate from one
> another, and separate from robs, robx, and min_dev.  However, as we've
> learned, the various factors interact with one another in complex
> non-obvious ways, which means that we can't arbitrarily change one and
> still expect the best performance.  A parameter tester, like bogotune,
> is needed to test the combinations and determine which combo works best
> for the data (messages) being tested.
> 
> Initially, at least, bogofilter will just use 1.0 for both sp_esf and
> ns_esf.  That will give the same answers as not using ESF.  Gary
> Robinson has shown that this new idea has merit, and Greg Louis has
> confirmed the merit.  Somewhere down the road, as experiments are run
> and we learn more, other values are likely to be values.  
> 
> 
>>I got through part of the article this morning but haven't had a
>>chance to complete it.  I'm not very good at statistics.  At least,
>>not that good.
> 
> 
> I'm not a statistician either.  I took an introductory statistics course
> in college in the '60's, have had little need since then, and have
> forgotten virtually all of it.  Fortunately there are others, like Gary
> Robinson and Greg Louis, who have the knowledge and the skill to apply
> it to the problem at hand -- identifying spam.
> 
> Hope this helps!
> 
> David
> 

It does, on several levels.

I'll help run tests...
I can do that!
>From  May 2004 21:23:07 -040 Wed 5 May 2004 19:17:54 2004
Return-Path: bogofilter-return-74-relson=osagesoftware.com at aotto.com
Date: Wed, 5 May 2004 21:23:07 -0400
From: David Relson <relson at osagesoftware.com>
To: Greg Louis <glouis at dynamicro.on.ca>
Subject: Re: [bogofilter] Improved Calculations
Message-Id: <20040505212307.1545dda2 at osage.osagesoftware.com>
In-Reply-To: <20040505231754.GA16417 at athame.dynamicro.on.ca>
References: <20040504182252.4b87349a at osage.osagesoftware.com>
	<1083757382.14620.846.camel at linuxpc>
	<20040505075432.399f3ad9 at osage.osagesoftware.com>
	<409969D0.1020509 at tacocat.net>
	<20040505190347.6d1300c9 at osage.osagesoftware.com>
	<20040505231754.GA16417 at athame.dynamicro.on.ca>
Organization: Osage Software Systems, Inc.
X-Mailer: Sylpheed version 0.9.10claws51 (GTK+ 1.2.10; i686-pc-linux-gnu)
Mime-Version: 1.0
Content-Type: text/plain; charset=US-ASCII
Content-Transfer-Encoding: 7bit

On Wed, 5 May 2004 19:17:54 -0400
Greg Louis wrote:

> On 20040505 (Wed) at 1903:47 -0400, David Relson wrote:
> > On Wed, 05 May 2004 18:25:20 -0400
> > Tom Allison wrote:
> 
> This is a problem: those who know little should keep quiet, and
> haven't the sense to do so.  I left the bogofilter lists primarily for
> this reason.  The spamfilt list is much more rewarding: the people
> there know what they're talking about in statistical matters.  (They
> know so much more than I do that I tend to lurk there, but that's
> another story.)

I look at Tom's question as that of a seeker wanting to know more.  I
certainly find myself in that role, as you do.  I've been reading the
spamfilt list of late and the math quickly leaves me scratching my head.
>From  May 2004 19:03:47 -040 Wed 05 May 2004 18:25:20 2004
Return-Path: bogofilter-return-74-relson=osagesoftware.com at aotto.com
Date: Wed, 5 May 2004 19:03:47 -0400
From: David Relson <relson at osagesoftware.com>
To: bogofilter <bogofilter at aotto.com>
Cc: Greg Louis <glouis at dynamicro.on.ca>
Subject: Re: [bogofilter] Improved Calculations
Message-Id: <20040505190347.6d1300c9 at osage.osagesoftware.com>
In-Reply-To: <409969D0.1020509 at tacocat.net>
References: <20040504182252.4b87349a at osage.osagesoftware.com>
	<1083757382.14620.846.camel at linuxpc>
	<20040505075432.399f3ad9 at osage.osagesoftware.com>
	<409969D0.1020509 at tacocat.net>
Organization: Osage Software Systems, Inc.
X-Mailer: Sylpheed version 0.9.10claws51 (GTK+ 1.2.10; i686-pc-linux-gnu)
Mime-Version: 1.0
Content-Type: text/plain; charset=US-ASCII
Content-Transfer-Encoding: 7bit

On Wed, 05 May 2004 18:25:20 -0400
Tom Allison wrote:

...[snip]...

> I see this getting really ugly really fast.
> How does spam_cutoff/ham_cutoff and sp_esf/ns_esf inter-relate to each
> 
> other?

In some ways this is definitely so.  In its first scan bogotune uses 5
values of robs, 5 of robx, and 9 min_dev for a total of 225
combinations.  Adding 5 for sp_esf and 5 for ns_esf increases the count
by a factor of 25, i.e. to 5625.  So the comprehensive scan becomes time
consuming.

spam_cutoff and ham_cutoff are pretty much separate from one another,
except for the obvious -- ham_cutoff must be less than or equal to
spam_cutoff.  The two sp_esf and ns_esf factors are separate from one
another, and separate from robs, robx, and min_dev.  However, as we've
learned, the various factors interact with one another in complex
non-obvious ways, which means that we can't arbitrarily change one and
still expect the best performance.  A parameter tester, like bogotune,
is needed to test the combinations and determine which combo works best
for the data (messages) being tested.

Initially, at least, bogofilter will just use 1.0 for both sp_esf and
ns_esf.  That will give the same answers as not using ESF.  Gary
Robinson has shown that this new idea has merit, and Greg Louis has
confirmed the merit.  Somewhere down the road, as experiments are run
and we learn more, other values are likely to be values.  

> I got through part of the article this morning but haven't had a
> chance to complete it.  I'm not very good at statistics.  At least,
> not that good.

I'm not a statistician either.  I took an introductory statistics course
in college in the '60's, have had little need since then, and have
forgotten virtually all of it.  Fortunately there are others, like Gary
Robinson and Greg Louis, who have the knowledge and the skill to apply
it to the problem at hand -- identifying spam.

Hope this helps!

David
>From  Wed 5 May 2004 07:41:24 2004
Return-Path: bogofilter-return-52-relson=osagesoftware.com at aotto.com
Return-Path: <bogofilter-return-52-relson=osagesoftware.com at aotto.com>
X-Original-To: relson at osagesoftware.com
Delivered-To: relson at osagesoftware.com
Received: from lists.refdesk.com (ns1.drudgereport.com [216.40.241.219])
	by mail.osagesoftware.com (Postfix) with SMTP id D96DF2FEA6
	for <relson at osagesoftware.com>; Wed,  5 May 2004 07:41:24 -0400 (EDT)
Received: (qmail 27872 invoked by alias); 5 May 2004 11:40:35 -0000
Mailing-List: contact bogofilter-help at aotto.com; run by ezmlm
Precedence: bulk
List-Id: <bogofilter at aotto.com>
List-Post: <mailto:bogofilter at aotto.com>
List-Help: <mailto:bogofilter-help at aotto.com>
List-Unsubscribe: <mailto:bogofilter-unsubscribe at aotto.com>
List-Subscribe: <mailto:bogofilter-subscribe at aotto.com>
Delivered-To: mailing list bogofilter at aotto.com
X-Ezauth: opjhfogbgmcdigjdmbbe
Received: (qmail 27862 invoked from network); 5 May 2004 11:40:35 -0000
From: Tom Anderson <tanderso at oac-design.com>
To: bogofilter <bogofilter at aotto.com>
In-Reply-To: <20040504182252.4b87349a at osage.osagesoftware.com> 
References: <20040504182252.4b87349a at osage.osagesoftware.com> 
Content-Type: multipart/signed; micalg=pgp-sha1; protocol="application/pgp-signature"; boundary="=-U9ej2lwMSWOTj5JnqJB5"
Organization: 
Message-Id: <1083757382.14620.846.camel at linuxpc>
Mime-Version: 1.0
X-Mailer: Ximian Evolution 1.2.4 
Date: 05 May 2004 07:43:02 -0400
Subject: Re: [bogofilter] Improved Calculations
X-Bogosity: Ham, tests=bogofilter, spamicity=0.000000, version=0.17.5.cvs
Status:   

--=-U9ej2lwMSWOTj5JnqJB5
Content-Type: text/plain
Content-Transfer-Encoding: quoted-printable

On Tue, 2004-05-04 at 18:22, David Relson wrote:
> Later this week, bogofilter-0.17.6 will be released with ESF support.=20
> Greg modified bogotune to scan ESF values and used the modified version
> in his test.  His mods will be in the release.

Will using ESFs invalidate our current cutoffs?

Tom


--=-U9ej2lwMSWOTj5JnqJB5
Content-Type: application/pgp-signature; name=signature.asc
Content-Description: This is a digitally signed message part

-----BEGIN PGP SIGNATURE-----
Version: GnuPG v1.2.1 (GNU/Linux)

iD8DBQBAmNNGaTUjGSdb2eIRAksJAJ43supUfiFfABnFsJjDhGrVPmq+JgCfXpeW
cxDawAEJjsJhMBwXibG+i4A=
=fnXZ
-----END PGP SIGNATURE-----

--=-U9ej2lwMSWOTj5JnqJB5--
>From  Wed 5 May 2004 19:17:55 2004
Return-Path: glouis at dynamicro.on.ca
Return-Path: <glouis at dynamicro.on.ca>
X-Original-To: relson at osagesoftware.com
Delivered-To: relson at osagesoftware.com
Received: from csl2.consultronics.on.ca (csl2.consultronics.on.ca [204.138.93.2])
	by mail.osagesoftware.com (Postfix) with ESMTP id 787632FEA6
	for <relson at osagesoftware.com>; Wed,  5 May 2004 19:17:55 -0400 (EDT)
Received: from [127.0.0.1] (helo=athame.dynamicro.internal ident=nobody)
	by csl2.consultronics.on.ca with esmtp (Exim 4.32)
	id 1BLVeE-0008KG-LL
	for relson at osagesoftware.com; Wed, 05 May 2004 19:17:54 -0400
Received: from root by athame.dynamicro.internal with local (Exim 4.32)
	id 1BLVeE-0004Hh-42
	for relson at osagesoftware.com; Wed, 05 May 2004 19:17:54 -0400
Date: Wed, 5 May 2004 19:17:54 -0400
From: Greg Louis <glouis at dynamicro.on.ca>
To: David Relson <relson at osagesoftware.com>
Subject: Re: [bogofilter] Improved Calculations
Message-ID: <20040505231754.GA16417 at athame.dynamicro.on.ca>
Reply-To: Greg Louis <glouis at dynamicro.on.ca>
References: <20040504182252.4b87349a at osage.osagesoftware.com> <1083757382.14620.846.camel at linuxpc> <20040505075432.399f3ad9 at osage.osagesoftware.com> <409969D0.1020509 at tacocat.net> <20040505190347.6d1300c9 at osage.osagesoftware.com>
Mime-Version: 1.0
Content-Type: text/plain; charset=iso-8859-1
Content-Disposition: inline
In-Reply-To: <20040505190347.6d1300c9 at osage.osagesoftware.com>
Organization: Dynamicro Consulting Limited
X-Bogosity: Ham, tests=bogofilter, spamicity=0.000000, version=0.17.5.cvs
Status:   

On 20040505 (Wed) at 1903:47 -0400, David Relson wrote:
> On Wed, 05 May 2004 18:25:20 -0400
> Tom Allison wrote:

This is a problem: those who know little should keep quiet, and
haven't the sense to do so.  I left the bogofilter lists primarily for
this reason.  The spamfilt list is much more rewarding: the people
there know what they're talking about in statistical matters.  (They
know so much more than I do that I tend to lurk there, but that's
another story.)

Piet Hein had it right:
  Knowing what
    Thou knowest not
  Is in a sense
    Omniscience.

Take it at a level one deeper than face value...

-- 
| G r e g  L o u i s         | gpg public key: 0x400B1AA86D9E3E64 |
|  http://www.bgl.nu/~glouis |   (on my website or any keyserver) |
|  http://wecanstopspam.org in signatures helps fight junk email. |
>From  May 2004 07:54:32 -040 On 05 May 2004 07:43:02 2004
Return-Path: bogofilter-return-74-relson=osagesoftware.com at aotto.com
Date: Wed, 5 May 2004 07:54:32 -0400
From: David Relson <relson at osagesoftware.com>
Cc: bogofilter <bogofilter at aotto.com>
Subject: Re: [bogofilter] Improved Calculations
Message-Id: <20040505075432.399f3ad9 at osage.osagesoftware.com>
In-Reply-To: <1083757382.14620.846.camel at linuxpc>
References: <20040504182252.4b87349a at osage.osagesoftware.com>
	<1083757382.14620.846.camel at linuxpc>
Organization: Osage Software Systems, Inc.
X-Mailer: Sylpheed version 0.9.10claws51 (GTK+ 1.2.10; i686-pc-linux-gnu)
Mime-Version: 1.0
Content-Type: text/plain; charset=US-ASCII
Content-Transfer-Encoding: 7bit

On 05 May 2004 07:43:02 -0400
Tom Anderson wrote:

> On Tue, 2004-05-04 at 18:22, David Relson wrote:
> > Later this week, bogofilter-0.17.6 will be released with ESF
> > support. Greg modified bogotune to scan ESF values and used the
> > modified version in his test.  His mods will be in the release.
> 
> Will using ESFs invalidate our current cutoffs?
> 
> Tom

Hi Tom,

A good question.  The short answer is "No".

The default ESFs will be 1.0, which will give the exact same results
bogofilter now gets.  Bogotune's task is increased as the list of
tunable parameters increases from 4 to 6, i.e. from robs, robx, min_dev,
and spam_cutoff to also include sp_esf and ns_esf.

David
>From  Thu 6 May 2004 09:15:58 2004
Return-Path: bogofilter-return-63-relson=osagesoftware.com at aotto.com
Return-Path: <bogofilter-return-63-relson=osagesoftware.com at aotto.com>
X-Original-To: relson at osagesoftware.com
Delivered-To: relson at osagesoftware.com
Received: from lists.refdesk.com (ns1.drudgereport.com [216.40.241.219])
	by mail.osagesoftware.com (Postfix) with SMTP id DF85D2FEA6
	for <relson at osagesoftware.com>; Thu,  6 May 2004 09:15:58 -0400 (EDT)
Received: (qmail 11079 invoked by alias); 6 May 2004 13:14:56 -0000
Mailing-List: contact bogofilter-help at aotto.com; run by ezmlm
Precedence: bulk
List-Id: <bogofilter at aotto.com>
List-Post: <mailto:bogofilter at aotto.com>
List-Help: <mailto:bogofilter-help at aotto.com>
List-Unsubscribe: <mailto:bogofilter-unsubscribe at aotto.com>
List-Subscribe: <mailto:bogofilter-subscribe at aotto.com>
Delivered-To: mailing list bogofilter at aotto.com
X-Ezauth: ebmhjenlbaafligaokig
Received: (qmail 11069 invoked from network); 6 May 2004 13:14:56 -0000
Message-ID: <0dd001c4336c$1643e790$6ecfcfcf at Betson110>
Reply-To: "Tom Anderson" <tanderso at oac-design.com>
From: "Tom Anderson" <tanderso at oac-design.com>
To: "bogofilter" <bogofilter at aotto.com>
References: <20040504182252.4b87349a at osage.osagesoftware.com> <1083757382.14620.846.camel at linuxpc> <20040505075432.399f3ad9 at osage.osagesoftware.com>  <409969D0.1020509 at tacocat.net> 
Date: Thu, 6 May 2004 09:14:09 -0400
MIME-Version: 1.0
Content-Type: text/plain;
	charset="iso-8859-1"
Content-Transfer-Encoding: 7bit
X-Priority: 3
X-MSMail-Priority: Normal
X-Mailer: Microsoft Outlook Express 6.00.2800.1409
X-MimeOLE: Produced By Microsoft MimeOLE V6.00.2800.1409
Subject: Re: [bogofilter] Improved Calculations 
X-Bogosity: Ham, tests=bogofilter, spamicity=0.000000, version=0.17.5.cvs
Status:   

From: "Tom Allison" <tallison at tacocat.net>
> > The default ESFs will be 1.0, which will give the exact same results
> > bogofilter now gets.  Bogotune's task is increased as the list of
> > tunable parameters increases from 4 to 6, i.e. from robs, robx, min_dev,
> > and spam_cutoff to also include sp_esf and ns_esf.
...
> How does spam_cutoff/ham_cutoff and sp_esf/ns_esf inter-relate to each
> other?

>From what I read, Gary thinks that more redundancy in spam tokens tends to
push the optimal cutoff values downward.  Therefore, his solution with ESFs
should tend to bring optimal cutoff values into a more intuitive range.
However, unless you determine your cutoffs using bogotune or reset them
closer to the defaults, then if you change the ESFs to a value other than
1.0, your existing cutoffs will likely be on the hammy side, and you'll get
false positives if you don't change them.  Correct me if I'm wrong.

Tom
>From  Wed 5 May 2004 18:25:26 2004
Return-Path: tallison at tacocat.net
Return-Path: <tallison at tacocat.net>
X-Original-To: relson at osagesoftware.com
Delivered-To: relson at osagesoftware.com
Received: from ms-smtp-03-eri0.ohiordc.rr.com (ms-smtp-03-smtplb.ohiordc.rr.com [65.24.5.137])
	by mail.osagesoftware.com (Postfix) with ESMTP id AC1D42FEA6
	for <relson at osagesoftware.com>; Wed,  5 May 2004 18:25:26 -0400 (EDT)
Received: from janus.tacocat.net (cpe-069-133-095-206.twmi.rr.com [69.133.95.206])
	by ms-smtp-03-eri0.ohiordc.rr.com (8.12.10/8.12.7) with ESMTP id i45MPL2w027750;
	Wed, 5 May 2004 18:25:21 -0400 (EDT)
Received: from localhost (localhost [127.0.0.1])
	by janus.tacocat.net (Postfix) with ESMTP
	id 273B321315D; Wed,  5 May 2004 18:25:21 -0400 (EDT)
Received: from janus.tacocat.net ([127.0.0.1])
 by localhost (janus [127.0.0.1]) (amavisd-new, port 10024) with ESMTP
 id 05029-03; Wed,  5 May 2004 18:25:20 -0400 (EDT)
Received: from tacocat.net (unknown [192.168.1.10])
	by janus.tacocat.net (Postfix) with ESMTP
	id 937E721311C; Wed,  5 May 2004 18:25:20 -0400 (EDT)
Message-ID: <409969D0.1020509 at tacocat.net>
Date: Wed, 05 May 2004 18:25:20 -0400
From: Tom Allison <tallison at tacocat.net>
User-Agent: Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.4.2) Gecko/20040220
X-Accept-Language: en-us, en
MIME-Version: 1.0
To: David Relson <relson at osagesoftware.com>
Cc: bogofilter <bogofilter at aotto.com>
Subject: Re: [bogofilter] Improved Calculations
References: <20040504182252.4b87349a at osage.osagesoftware.com>	<1083757382.14620.846.camel at linuxpc> <20040505075432.399f3ad9 at osage.osagesoftware.com>
In-Reply-To: <20040505075432.399f3ad9 at osage.osagesoftware.com>
X-Enigmail-Version: 0.76.8.0
X-Enigmail-Supports: pgp-inline, pgp-mime
Content-Type: text/plain; charset=us-ascii; format=flowed
Content-Transfer-Encoding: 7bit
X-Virus-Scanned: Symantec AntiVirus Scan Engine
X-Virus-Scanned: by amavisd-new at tacocat.net
X-Bogosity: Ham, tests=bogofilter, spamicity=0.000000, version=0.17.5.cvs
Status:   

David Relson wrote:
> On 05 May 2004 07:43:02 -0400
> Tom Anderson wrote:
> 
> 
>>On Tue, 2004-05-04 at 18:22, David Relson wrote:
>>
>>>Later this week, bogofilter-0.17.6 will be released with ESF
>>>support. Greg modified bogotune to scan ESF values and used the
>>>modified version in his test.  His mods will be in the release.
>>
>>Will using ESFs invalidate our current cutoffs?
>>
>>Tom
> 
> 
> Hi Tom,
> 
> A good question.  The short answer is "No".
> 
> The default ESFs will be 1.0, which will give the exact same results
> bogofilter now gets.  Bogotune's task is increased as the list of
> tunable parameters increases from 4 to 6, i.e. from robs, robx, min_dev,
> and spam_cutoff to also include sp_esf and ns_esf.
> 
> David
> 

I see this getting really ugly really fast.
How does spam_cutoff/ham_cutoff and sp_esf/ns_esf inter-relate to each 
other?

I got through part of the article this morning but haven't had a chance 
to complete it.  I'm not very good at statistics.  At least, not that good.
>From  Thu 6 May 2004 21:50:30 2004
Return-Path: bogofilter-return-74-relson=osagesoftware.com at aotto.com
Return-Path: <bogofilter-return-74-relson=osagesoftware.com at aotto.com>
X-Original-To: relson at osagesoftware.com
Delivered-To: relson at osagesoftware.com
Received: from lists.refdesk.com (ns1.drudgereport.com [216.40.241.219])
	by mail.osagesoftware.com (Postfix) with SMTP id 0C40F2FEA9
	for <relson at osagesoftware.com>; Thu,  6 May 2004 21:50:30 -0400 (EDT)
Received: (qmail 4477 invoked by alias); 7 May 2004 01:49:29 -0000
Mailing-List: contact bogofilter-help at aotto.com; run by ezmlm
Precedence: bulk
List-Id: <bogofilter at aotto.com>
List-Post: <mailto:bogofilter at aotto.com>
List-Help: <mailto:bogofilter-help at aotto.com>
List-Unsubscribe: <mailto:bogofilter-unsubscribe at aotto.com>
List-Subscribe: <mailto:bogofilter-subscribe at aotto.com>
Delivered-To: mailing list bogofilter at aotto.com
X-Ezauth: fncahbomgkjalejacjom
Received: (qmail 4467 invoked from network); 7 May 2004 01:49:29 -0000
Sender: m at mo.optusnet.com.au
To: Tom Allison <tallison at tacocat.net>
Cc: bogofilter <bogofilter at aotto.com>
References: <409A0A55.9020805 at tacocat.net>
From: michael at optusnet.com.au
Date: 07 May 2004 11:50:20 +1000
In-Reply-To: <409A0A55.9020805 at tacocat.net>
Message-ID: <m1brl1dmzn.fsf at mo.optusnet.com.au>
Lines: 47
User-Agent: Gnus/5.09 (Gnus v5.9.0) Emacs/21.3
MIME-Version: 1.0
Content-Type: text/plain; charset=us-ascii
Subject: Re: [bogofilter] ESF and redundancy
X-Bogosity: Ham, tests=bogofilter, spamicity=0.000000, version=0.17.5.cvs
Status:   

Tom Allison <tallison at tacocat.net> writes:
> I'm not sure I follow all of this, but it seems that the article on
> ESF made the following assumptions:
> 
> That the spamminess of an email with one incident of the word, "foo"
> tends to score the same as two incidents of the word, "foo".
> So he goes through this sqrt()/2 argument to show that you can reduce
> the "effective size" of the email message and thereby correct for spam.
> 
> It occurred to me that since he's taking this from a model of protein
> selection he's assuming a degree of assumption in what the item under
> inspection is.
> 
> But under email, we can decisively remove all of the duplication for
> each token that occurs within an email with ease and
> certainty. Therefore, I'm not certain that the statistical estimation
> of the email's size using ESF is going to be any more accurate or
> simpler than explicitly removing all duplicate appearances of a token
> from spam scoring.

The 'foo foo' example given in the article is a little misleading.
What he's actually trying to say is something like..

        The bayes algorithm assumes the word probabilities
        are independent. We know this is wrong.

        Say 'stone wall' is a common phrase.

        If the word 'stone' appears in an email, and the word
        'wall' does, then we should discount the combined
        probabilities because they are _NOT_ independent, they
        commonly occur together.

        We can generalize and talk about the over-all level
        of discount for english text, knowing that "stone wall"
        is much more likely to occur than "stone jumped" and
        similarly for many other word sequences.

        Question: Is the discount for normal emails the same
        as the level of discount for spam?

The paper basically shows that it's not, and that we need
to correct for the difference.

Does that make any more sense?

Michael.
>From  Fri 7 May 2004 09:16:46 2004
Return-Path: bogofilter-return-81-relson=osagesoftware.com at aotto.com
Return-Path: <bogofilter-return-81-relson=osagesoftware.com at aotto.com>
X-Original-To: relson at osagesoftware.com
Delivered-To: relson at osagesoftware.com
Received: from lists.refdesk.com (ns1.drudgereport.com [216.40.241.219])
	by mail.osagesoftware.com (Postfix) with SMTP id D4CCE2FEA6
	for <relson at osagesoftware.com>; Fri,  7 May 2004 09:16:46 -0400 (EDT)
Received: (qmail 28718 invoked by alias); 7 May 2004 13:15:39 -0000
Mailing-List: contact bogofilter-help at aotto.com; run by ezmlm
Precedence: bulk
List-Id: <bogofilter at aotto.com>
List-Post: <mailto:bogofilter at aotto.com>
List-Help: <mailto:bogofilter-help at aotto.com>
List-Unsubscribe: <mailto:bogofilter-unsubscribe at aotto.com>
List-Subscribe: <mailto:bogofilter-subscribe at aotto.com>
Delivered-To: mailing list bogofilter at aotto.com
X-Ezauth: ldahpecofbdalgcjgdgi
Received: (qmail 28706 invoked from network); 7 May 2004 13:15:39 -0000
Message-ID: <0fd401c43435$5d797b10$6ecfcfcf at Betson110>
Reply-To: "Tom Anderson" <tanderso at oac-design.com>
From: "Tom Anderson" <tanderso at oac-design.com>
To: "bogofilter" <bogofilter at aotto.com>
References: <409A0A55.9020805 at tacocat.net>  <m1brl1dmzn.fsf at mo.optusnet.com.au> <1083929903.14629.962.camel at linuxpc>  <409B856C.1050807 at tacocat.net> 
Date: Fri, 7 May 2004 09:14:57 -0400
MIME-Version: 1.0
Content-Type: text/plain;
	charset="iso-8859-1"
Content-Transfer-Encoding: 7bit
X-Priority: 3
X-MSMail-Priority: Normal
X-Mailer: Microsoft Outlook Express 6.00.2800.1409
X-MimeOLE: Produced By Microsoft MimeOLE V6.00.2800.1409
Subject: Re: [bogofilter] ESF and redundancy 
X-Bogosity: Ham, tests=bogofilter, spamicity=0.000000, version=0.17.5.cvs
Status:   

From: "Tom Allison" <tallison at tacocat.net>
> I thought there was some article on slashdot last year on this.
> I was not impressed that Markovian was significantly better for the
> costs involved.  Anything new?

Read the "plateau" article posted to this list a few days ago.
http://crm114.sourceforge.net/Plateau_Paper.html

Moreover, my own description and justification for my suggestion preceded
the comment you quoted from my last email.  What's new is that it addresses
this issue of redundancy and correlation that we're currently "solving" with
ESF, which may very well be the wrong or non-ideal solution.

Tom
>From  Thu 6 May 2004 05:50:20 2004
Return-Path: bogofilter-return-60-relson=osagesoftware.com at aotto.com
Return-Path: <bogofilter-return-60-relson=osagesoftware.com at aotto.com>
X-Original-To: relson at osagesoftware.com
Delivered-To: relson at osagesoftware.com
Received: from lists.refdesk.com (ns1.drudgereport.com [216.40.241.219])
	by mail.osagesoftware.com (Postfix) with SMTP id 7F5E72FEA6
	for <relson at osagesoftware.com>; Thu,  6 May 2004 05:50:20 -0400 (EDT)
Received: (qmail 7079 invoked by alias); 6 May 2004 09:49:25 -0000
Mailing-List: contact bogofilter-help at aotto.com; run by ezmlm
Precedence: bulk
List-Id: <bogofilter at aotto.com>
List-Post: <mailto:bogofilter at aotto.com>
List-Help: <mailto:bogofilter-help at aotto.com>
List-Unsubscribe: <mailto:bogofilter-unsubscribe at aotto.com>
List-Subscribe: <mailto:bogofilter-subscribe at aotto.com>
Delivered-To: mailing list bogofilter at aotto.com
X-Ezauth: dadmegfnopfdmobbbdfo
Received: (qmail 7069 invoked from network); 6 May 2004 09:49:25 -0000
Message-ID: <409A0A55.9020805 at tacocat.net>
Date: Thu, 06 May 2004 05:50:13 -0400
From: Tom Allison <tallison at tacocat.net>
User-Agent: Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.4.2) Gecko/20040220
X-Accept-Language: en-us, en
MIME-Version: 1.0
To: bogofilter <bogofilter at aotto.com>
X-Enigmail-Version: 0.76.8.0
X-Enigmail-Supports: pgp-inline, pgp-mime
Content-Type: text/plain; charset=us-ascii; format=flowed
Content-Transfer-Encoding: 7bit
X-Virus-Scanned: Symantec AntiVirus Scan Engine
X-Virus-Scanned: by amavisd-new at tacocat.net
Subject: [bogofilter] ESF and redundancy
X-Bogosity: Ham, tests=bogofilter, spamicity=0.000000, version=0.17.5.cvs
Status:   

I'm not sure I follow all of this, but it seems that the article on ESF 
made the following assumptions:

That the spamminess of an email with one incident of the word, "foo" 
tends to score the same as two incidents of the word, "foo".
So he goes through this sqrt()/2 argument to show that you can reduce 
the "effective size" of the email message and thereby correct for spam.

It occurred to me that since he's taking this from a model of protein 
selection he's assuming a degree of assumption in what the item under 
inspection is.

But under email, we can decisively remove all of the duplication for 
each token that occurs within an email with ease and certainty. 
Therefore, I'm not certain that the statistical estimation of the email's 
size using ESF is going to be any more accurate or simpler than 
explicitly removing all duplicate appearances of a token from spam scoring.

That, or I really don't understand the article that well and didn't get 
enough sleep last night.



More information about the Bogofilter mailing list