bogotrain
Bob Vincent
bogofilter at bobvincent.org
Wed Aug 4 19:04:59 CEST 2004
On Wed, Aug 04, 2004 at 12:44:30PM -0400, David Relson wrote:
> On Wed, 4 Aug 2004 12:27:19 -0400
> Bob Vincent wrote:
> > I don't understand why the suggested spam_cutoff is lower than the
> > suggested ham_cutoff. Can anyone explain?
<snip!>
>
> How large are your message samples (spam and non-spam)?
About 2,400 each.
> Have you checked for incorrectly classified messages?
Every time I read mail, I run my script (attached; names changed to
protect the guilty) which re-trains the database. If any message has
to be trained twice, it asks for confirmation.
So yes, I've checked pretty thoroughly, and yes, I've gotten some
pretty hammish-looking spams come through the reiserfs mailing list.
A couple in particular trigger the confirmation check nearly every
time.
> Attached is a script I wrote a couple of days ago for checking
> classifications. It uses the current wordlist, scores a set of ham (or
> spam) messages, discards ham scoring 0.0000000 and spam scoring
> 1.000000, and sorts the scores. With modification for your
> directories/mboxes/etc, it should help you find any misclassifications.
Thanks. Will double-check. Bogofilter seems to have peaked at about
99.5% accuracy. I'd like to exceed that, as 0.5% of 4000+ messages
still means that I see roughly 20 spams a day.
-------------- next part --------------
#!/usr/bin/perl
# Wrapper around a mail reader: fetches mail, runs the mail program,
# then retrains the bogofilter word list from the ham and spam folders.
use Term::ReadKey;

# --- Site configuration -------------------------------------------------
$home = '/home/myusername';
$bin = "$home/bin";
$realmailprog = '/usr/bin/mutt';
$downloader = 'offlineimap';
$bogofilter = "$bin/bogofilter -c $home/.bogofilter/config";
$bogoutil = "$bin/bogoutil";
$bogodir = "$home/.bogofilter";
$spamfolder="$home/Mail/spam";
$goodfolder="$home/Mail/ham";
$dbfile = "$bogodir/wordlist.qdbm";
$ftphost = 'myisphost';
$ftpuser = 'myusername';
$ftppass = 'mypassword';
$ftpdir = '.bogofilter';

# --- Progress spinner: each frame backspaces over the previous one -------
$spindex = 0;
@spinner = (chr(8).'|',chr(8).'/',chr(8).'-',chr(8).'\\');
$spinstop = chr(8).' ';

# Remember the message count recorded in the word list before this run.
# The second number on the .MSG_COUNT line is kept (presumably the
# non-spam count -- confirm against bogoutil's output format).
# Fall back to 0 when the line cannot be parsed, e.g. no word list yet,
# instead of picking up a stale capture.
$oldstats = `$bogoutil -w $bogodir .MSG_COUNT`;
if ($oldstats =~ /\.MSG_COUNT\s+([0-9]+)\s+([0-9]+)/)
{
    $oldgood = $2;
}
else
{
    $oldgood = 0;
}

print("Getting new messages...");
system($downloader);
print("Done. Press Enter key to invoke $realmailprog\n");
$dummy=<STDIN>;
# Pass our own command-line arguments straight through to the mail program.
system($realmailprog,@ARGV);
print("Uploading changes...");
system($downloader);
print("Done.\n");
# Training sets, keyed by message file name; the value counts how many
# times the message has been trained in this run.  The counters hold the
# false positives, false negatives and unsure results.
my (%tspam,%tgood,$run);
my ($pos,$neg,$uns) = (0,0,0);
print('Loading and categorizing training set... ');
# A good message labelled 'Spam' is a false positive; a spam message
# labelled 'Good' is a false negative; '????' is counted as unsure.
loadmaildir(\%tgood,$goodfolder,'Spam',\$pos,'????',\$uns);
loadmaildir(\%tspam,$spamfolder,'Good',\$neg,'????',\$uns);
print(" Done.\n");
$numgood = scalar keys %tgood;
$numspam = scalar keys %tspam;
$nummiss = $pos + $neg + $uns;
# Assuming we deleted excess spam on the last run so that #spam=#good,
# we can assume that any spam or ham in excess of that number is new.
$numnew = ($numspam - $oldgood) + ($numgood - $oldgood);
printf("%d Good, %d Spam",$numgood,$numspam);
foreach my $miss ([$pos,'False Positive'],[$neg,'False Negative'],[$uns,'Unsure'])
{
    my ($count,$label) = @$miss;
    next unless $count > 0;
    # Only pluralize when there is more than one.
    printf(", %d %s%s",$count,$label,$count == 1 ? '' : 's');
}
# Again, I'm assuming that we ran this program last time,
# so the number of misses on the original dataset should
# be zero. Any misses are solely due to the new messages.
if ($numnew > 0)
{
    printf(".\n%d new Messages, %d Misclassified, %.2f%% Accuracy.\n",$numnew,$nummiss,($numnew-$nummiss) * 100.0 / $numnew);
}
else
{
    # No new messages: an accuracy figure would divide by zero.
    printf(".\n%d new Messages, %d Misclassified.\n",$numnew,$nummiss);
}
# Throw the old word list away and rebuild it from the good folder: the
# list of files found there is piped into bogofilter.
print("Resetting database and preloading known-good messages... ");
unlink($dbfile);
system("find '$goodfolder' -type f | $bogofilter -n -b");
# Mark every good message as having been trained once already.
$tgood{$_} = 1 for keys %tgood;
print("Done.\n");
# Train-on-error loop: classify every message in random order, train the
# ones bogofilter gets wrong, and repeat until a whole pass produces no
# misclassifications.
use List::Util qw(shuffle);
my @filenames=();
do
{
    ($pos,$neg,$uns) = (0,0,0);
    @filenames = (keys %tgood, keys %tspam);
    # shuffle() gives an unbiased random order; sorting with a random
    # comparator does not, because the comparator is inconsistent.
    foreach my $filename ( shuffle(@filenames) )
    {
        next unless -f $filename;
        print($spinner[$spindex=($spindex+1)%4]);
        system("$bogofilter -I '$filename'");
        if ($? == -1)
        {
            print($spinstop);
            die("$bogofilter failed to execute: $!\n");
        }
        elsif ($? & 127)
        {
            print($spinstop);
            printf("\n$bogofilter died with signal %d\n",$? & 127);
        }
        else
        {
            my $status = ($? >> 8);
            if ($status == 3)
            {
                # Exit status 3 is treated as an error on this file.
                if (! -f $filename)
                {
                    print($spinstop);
                    printf("File disappeared: %s\n",$filename);
                }
                else
                {
                    print($spinstop);
                    printf("\nI/O or other error on file: %s\n",$filename);
                }
                # Drop the message from both sets.  delete() removes the
                # key; undef() would leave it behind (and create it in the
                # set it never belonged to), inflating later key counts.
                delete($tgood{$filename});
                delete($tspam{$filename});
            }
            elsif (($status != 1) && defined($tgood{$filename})) # misclassified good
            {
                # Status 0 on a good message counts as a false positive,
                # anything else as unsure.
                train(\%tgood,($status==0) ? \$pos : \$uns,$filename,'n');
            }
            elsif (($status != 0) && defined($tspam{$filename})) # misclassified spam
            {
                print($spinstop);
                # Status 1 on a spam message counts as a false negative,
                # anything else as unsure.
                train(\%tspam,($status==1) ? \$neg : \$uns,$filename,'s');
            }
        }
    }
} until (($pos+$neg+$uns) == 0);
prune(\%tspam,scalar keys %tgood,'spam');
print('Synchronizing Deletions with IMAP server...');
system($downloader);
print("Done.\n");
print "Compacting database ...";
system("$bogoutil -d $dbfile | $bogoutil -l $bogodir/compact.qdbm");
rename("$bogodir/compact.qdbm","$dbfile");
print('Uploading fully-trained database...');
system("ftp-upload -h '$ftphost' --user '$ftpuser' --password '$ftppass' --tmp-samedir --dir '$ftpdir' $dbfile");
print("Done.\n");
# loadmaildir(\%set, $folder, $poslabel, \$poscount, $unslabel, \$unscount)
#
# Runs "$bogofilter -v -B" on $folder and reads its output line by line.
# Each line of the form "<path under $folder> x-label: <label> " whose
# path is an existing file is recorded in %set with a training count of 0.
# $$poscount is incremented when the label equals $poslabel, $$unscount
# when it equals $unslabel.  Lines naming a file that does not exist are
# reported; lines in any other format are ignored.
# Dies if the bogofilter pipe cannot be opened, because continuing with an
# empty training set would lead the caller to rebuild the database from
# nothing.
sub loadmaildir($$$$$$)
{
    my $hashref  = shift;
    my $folder   = shift;
    my $poslabel = shift;
    my $posref   = shift;
    my $unslabel = shift;
    my $unsref   = shift;
    open(my $results,'-|',"$bogofilter -v -B '$folder'")
        or die("Could not run $bogofilter on $folder: $!\n");
    while (my $line = <$results>)
    {
        print($spinner[$spindex=($spindex+1)%4]);
        # \Q...\E: match the folder name literally, even if it contains
        # characters that are special in a regular expression.
        next unless $line =~ m|^(\Q$folder\E[^ ]+) x-label: ([^ ]+) |;
        my ($filename,$result) = ($1,$2);
        if (-f $filename)
        {
            $$hashref{$filename} = 0;
            if ($poslabel eq $result)
            {
                $$posref++;
            }
            elsif ($unslabel eq $result)
            {
                $$unsref++;
            }
        }
        else
        {
            print($spinstop);
            printf("unexpected result from bogofilter:\n%s\n",$line);
        }
    }
    print($spinstop);
    # The close status is deliberately ignored.
    close($results);
}
# train(\%set, \$counter, $filename, $tchar)
#
# Registers one misclassified message with bogofilter.
#   \%set      training set the message belongs to (file name => number
#              of times it has been trained in this run)
#   \$counter  misclassification counter to increment
#   $filename  message file
#   $tchar     bogofilter training option letter: 's' or 'n'
#
# The first time a message is seen it is simply trained.  If it has to be
# trained again, its main headers and first lines are shown and the user
# is asked what to do (the default after 60 seconds is to train as-is):
#   train as-is  - keep a second copy in the same folder, so the message
#                  counts twice from now on, and train it
#   reclassify   - undo this run's training and move the message to the
#                  other folder and set
#   delete       - undo this run's training and remove the file
#   view         - page through the message, then ask again
# Does nothing if $filename is not a plain file.  Dies if bogofilter
# cannot be executed.
sub train($$$$)
{
    my $hashref  = shift;
    my $countref = shift;
    my $filename = shift;
    my $tchar    = shift;
    my $runs;
    return unless -f $filename;
    $$countref++;
    print(chr(8).$tchar.' ');
    if ( ($runs = ++$$hashref{$filename}) > 1)
    {
        while (1)
        {
            # Show the interesting headers plus the start of the body.
            my $head  = 1;
            my $lines = 0;
            my $limit = 20;
            print("\n");
            if (open(my $testfile,'<',$filename))
            {
                while (my $line = <$testfile>)
                {
                    chomp($line);
                    # $head stays true until the first empty line.
                    if (($head &&= length($line)))
                    {
                        next unless ($line =~ /^(Envelope|To|From|Subject|Date)/);
                    }
                    print "$line\n";
                    last if ((++$lines) > $limit);
                }
                close($testfile);
            }
            else
            {
                printf("Could not open %s: %s\n",$filename,$!);
            }
            my ($s1,$s2,$s3,$s4) =
                ($tchar eq 's') ?
                ('S','pam','G','ood') :
                ('G','ood','S','pam');
            printf("Train as (%s)%s, Reclassify as (%s)%s, (D)elete, or (V)iew? ",
                   $s1,$s2,$s3,$s4);
            # Poll the keyboard once a second; keep the default answer if
            # nothing is pressed within a minute.
            my $ans = $s1;
            my $stopwatch = time() + 60;
            ReadMode('cbreak');
            while(time()<$stopwatch)
            {
                sleep(1);
                next unless defined (my $char = ReadKey(-1));
                $ans = uc($char);
                last;
            }
            ReadMode('normal');
            print("$ans\n ");
            if ($ans eq $s1)
            {
                # Keep a second copy of the message so that it carries
                # more weight and we don't get asked this question every
                # single time the program runs.
                my $folder = $tchar eq 's' ? $spamfolder : $goodfolder;
                chomp(my $copy="$folder/new/".`safecat '$folder/tmp' '$folder/new' < '$filename'`);
                if (-f $copy)
                {
                    if ($tchar eq 's')
                    {
                        $tspam{$copy} = 1;
                    }
                    else
                    {
                        $tgood{$copy} = 1;
                    }
                    # The training below is done on the new copy.
                    $filename = $copy;
                }
                else
                {
                    # Leave $filename alone so that the original message
                    # is still the one that gets trained below.
                    printf("Could not copy message to %s\n",$copy);
                }
                last;
            }
            elsif ($ans eq 'V')
            {
                system("less '$filename'");
                next;
            }
            elsif (($ans eq $s3) or ($ans eq 'D'))
            {
                # Upper-case option letter: take back the training done on
                # this message earlier in the run.
                $tchar = uc($tchar);
                while (--$runs)
                {
                    system("$bogofilter -$tchar -I '$filename'");
                    if ($? == -1)
                    {
                        die("$bogofilter failed to execute: $!\n");
                    }
                    elsif ($? & 127)
                    {
                        printf("$bogofilter died with signal %d\n",$? & 127);
                        return;
                    }
                }
                delete($$hashref{$filename});
                if ($ans eq $s3)
                {
                    # Move the message into the other folder and set.
                    my ($destfolder,$srcset,$destset) =
                        ($tchar eq 'S') ?
                        ($goodfolder,\%tspam,\%tgood) :
                        ($spamfolder,\%tgood,\%tspam);
                    chomp(my $newname="$destfolder/new/".`safecat '$destfolder/tmp' '$destfolder/new' < '$filename'`);
                    if (-f $newname)
                    {
                        printf("Moved %s to %s\n",$filename,$newname);
                        unlink($filename);
                        delete($$srcset{$filename});
                        $filename = $newname;
                        $$destset{$filename} = 1;
                        last;
                    }
                    else
                    {
                        printf("Could not move message to %s\n",$newname);
                        return;
                    }
                }
                else
                {
                    unlink($filename);
                    delete($tspam{$filename});
                    delete($tgood{$filename});
                    return;
                }
            }
            printf("'%s' is not a valid response.\n",$ans);
        }
    }
    # NOTE(review): after a reclassification $tchar is upper case here, so
    # this call repeats the "take back" option on the moved file rather
    # than training it under its new class -- confirm this is intended.
    system("$bogofilter -$tchar -I '$filename'");
    if ($? == -1)
    {
        die("$bogofilter failed to execute: $!\n");
    }
    elsif ($? & 127)
    {
        printf("$bogofilter died with signal %d\n",$? & 127);
    }
}
# prune(\%set, $maxsave, $msgkind)
#
# Deletes the oldest message files in %set that were never needed for
# training (training count 0) until at most $maxsave entries would
# remain, removing no more than 3000 files per call.  $msgkind is only
# used in the progress message.  Prints how many files it was unable to
# remove when there are not enough untrained messages.
sub prune($$$)
{
    my $hashref = shift;
    my $maxsave = shift;
    my $msgkind = shift;
    # Look each file's age up once, then sort oldest first.  A file that
    # cannot be examined is given age 0 and therefore sorts last.
    my %age = map { $_ => ((-M $_) || 0) } keys %$hashref;
    my @filenames = sort { $age{$b} <=> $age{$a} } keys %age;
    my $todelete = scalar(@filenames) - $maxsave;
    $todelete = 3000 if $todelete > 3000;
    return unless $todelete > 0;
    printf("\nDeleting oldest %d %s messages not used for training... ",
           $todelete, $msgkind);
    foreach my $filename (@filenames)
    {
        print($spinner[$spindex=($spindex+1)%4]);
        my $trained = $$hashref{$filename};
        # Skip entries that were cleared (undefined value) as well as
        # messages that were used for training.
        next unless defined($trained) && $trained == 0;
        # Only count files that were really removed.
        next unless unlink($filename);
        $todelete--;
        if ($todelete < 1)
        {
            print($spinstop);
            print(" Done.\n");
            return;
        }
    }
    print($spinstop);
    # $todelete has been counted down, so it is the shortfall.
    printf(" %d not deleted.\n",$todelete);
}
More information about the Bogofilter
mailing list