bogotrain

Bob Vincent bogofilter at bobvincent.org
Wed Aug 4 19:04:59 CEST 2004


On Wed, Aug 04, 2004 at 12:44:30PM -0400, David Relson wrote:
> On Wed, 4 Aug 2004 12:27:19 -0400
> Bob Vincent wrote:
> > I don't understand why the suggested spam_cutoff is lower than the
> > suggested ham_cutoff.  Can anyone explain?
<snip!>
> 
> How large are your message samples (spam and non-spam)?

About 2,400 each.

> Have you checked for incorrectly classified messages?

Every time I read mail, I run my script (attached; names changed to
protect the guilty) which re-trains the database.  If any message has
to be trained twice, it asks for confirmation.

So yes, I've checked pretty thoroughly, and yes, I've had some
pretty hammish-looking spams come through the reiserfs mailing list.
A couple in particular trigger the confirmation check nearly every
time.

> Attached is a script I wrote a couple of days ago for checking
> classifications.  It uses the current wordlist, scores a set of ham (or
> spam) messages, discards ham scoring 0.0000000 and spam scoring
> 1.000000, and sorts the scores.  With modification for your
> directories/mboxes/etc, it should help you find any misclassifications.

Thanks.  Will double-check.  Bogofilter seems to have peaked at about
99.5% accuracy. I'd like to exceed that, as 0.5% of 4000+ messages
still means that I see roughly 20 spams a day.
-------------- next part --------------
#!/usr/bin/perl
# bogotrain, part 1: site configuration, then one interactive mail session
# (download mail, run the mail reader, upload the resulting changes).
use Term::ReadKey;

# Unbuffer STDOUT so that progress text and the spinner are displayed
# before the (possibly slow) external commands run, not after them.
$| = 1;

# --- Site configuration (edit for your own account) -------------------
$home = '/home/myusername';
$bin = "$home/bin";
$realmailprog = '/usr/bin/mutt';
$downloader = 'offlineimap';
$bogofilter = "$bin/bogofilter -c $home/.bogofilter/config";
$bogoutil = "$bin/bogoutil";
$bogodir = "$home/.bogofilter";
$spamfolder="$home/Mail/spam";
$goodfolder="$home/Mail/ham";
$dbfile = "$bogodir/wordlist.qdbm";
$ftphost = 'myisphost';
$ftpuser = 'myusername';
$ftppass = 'mypassword';
$ftpdir =  '.bogofilter';

# --- Progress spinner state -------------------------------------------
# Each frame starts with a backspace so it overwrites the previous one.
$spindex = 0;
@spinner = (chr(8).'|',chr(8).'/',chr(8).'-',chr(8).'\\');
$spinstop = chr(8).' ';

# Remember how many good messages the word list held after the previous
# run; the statistics printed later use it to estimate how much mail is
# new.  The second number on the .MSG_COUNT line is taken as the good
# count (presumably "spam good" order -- confirm against bogoutil).
$oldstats = `$bogoutil -w $bogodir .MSG_COUNT`;
if ($oldstats =~ /\.MSG_COUNT\s+([0-9]+)\s+([0-9]+)/)
  {
    $oldgood = $2;
  }
else
  {
    # No usable word list yet (e.g. first run): treat everything as new.
    $oldgood = 0;
  }

print("Getting new messages...");
system($downloader) == 0
  or warn("\n$downloader did not finish cleanly (status $?)\n");
print("Done. Press Enter key to invoke $realmailprog\n");
$dummy=<STDIN>;
# Pass our own command-line arguments through to the mail reader.
system($realmailprog,@ARGV);
print("Uploading changes...");
system($downloader) == 0
  or warn("\n$downloader did not finish cleanly (status $?)\n");
print("Done.\n");
# bogotrain, part 2: classify every message in both training folders with
# the current word list and report how well the filter did.
#
# %tgood / %tspam map message file name => number of times the message
# has been registered with bogofilter (0 right after loading).
# $pos / $neg / $uns count false positives, false negatives and unsures.
my (%tspam,%tgood,$pos,$neg,$uns,$run);
($pos,$neg,$uns) = (0,0,0);
print('Loading and categorizing training set... ');
# A good message labelled 'Spam' is a false positive, a spam message
# labelled 'Good' is a false negative; '????' is counted as unsure.
loadmaildir(\%tgood,$goodfolder,'Spam',\$pos,'????',\$uns);
loadmaildir(\%tspam,$spamfolder,'Good',\$neg,'????',\$uns);
print(" Done.\n");
$numgood = scalar keys %tgood;
$numspam = scalar keys %tspam;
$nummiss = $pos + $neg + $uns;
# Assuming we deleted excess spam on the last run so that #spam=#good,
# we can assume that any spam or ham in excess of that number is new.
$numnew = ($numspam - $oldgood) + ($numgood - $oldgood);
printf("%d Good, %d Spam",$numgood,$numspam);
if ($pos>0)
  {
    printf(", %d False Positive%s",$pos,$pos==1 ? '' : 's');
  }
if ($neg>0)
  {
    printf(", %d False Negative%s",$neg,$neg==1 ? '' : 's');
  }
if ($uns>0)
  {
    printf(", %d Unsure%s",$uns,$uns==1 ? '' : 's');
  }
# Again, I'm assuming that we ran this program last time,
# so the number of misses on the original dataset should
# be zero.  Any misses are solely due to the new messages.
if ($numnew > 0)
  {
    printf(".\n%d new Messages, %d Misclassified, %.2f%% Accuracy.\n",
           $numnew,$nummiss,($numnew-$nummiss) * 100.0 / $numnew);
  }
else
  {
    # Nothing new (or the folders shrank): an accuracy figure would
    # divide by zero or be meaningless, so leave it out.
    printf(".\n%d new Messages, %d Misclassified.\n",$numnew,$nummiss);
  }
# bogotrain, part 3: rebuild the word list from scratch, train on errors
# until every message is classified correctly, trim surplus spam, then
# compact the database and upload it.
use List::Util qw(shuffle);

print("Resetting database and preloading known-good messages... ");
unlink $dbfile;
system("find '$goodfolder' -type f | $bogofilter -n -b");
# Every good message has now been registered exactly once.
foreach my $goodfile (keys %tgood)
  {
    $tgood{$goodfile} = 1;
  }
print("Done.\n");
my @filenames=();
do
  {
    ($pos,$neg,$uns) = (0,0,0);
    @filenames = (keys %tgood, keys %tspam);
    # Visit the messages in a fresh random order on every pass.
    foreach my $filename (shuffle(@filenames))
      {
        next unless -f $filename;
        print($spinner[$spindex=($spindex+1)%4]);
        system("$bogofilter -I '$filename'");
        if ($? == -1)
          {
            print($spinstop);
            die("$bogofilter failed to execute: $!\n");
          }
        elsif ($? & 127)
          {
            print($spinstop);
            printf("\n%s died with signal %d\n",$bogofilter,$? & 127);
          }
        else
          {
            # Exit status as used below: 0 = classified spam,
            # 1 = classified good, 3 = error, anything else = unsure.
            my $status = ($? >> 8);
            if ($status == 3)
              {
                print($spinstop);
                if (! -f $filename)
                  {
                    printf("File disappeared: %s\n",$filename);
                  }
                else
                  {
                    printf("\nI/O or other error on file: %s\n",$filename);
                  }
                # Drop the message entirely.  delete (not undef) so no
                # empty key is left behind to inflate the key counts or
                # to be mistaken for an untrained message when pruning.
                delete($tgood{$filename});
                delete($tspam{$filename});
              }
            elsif (($status != 1) && defined($tgood{$filename})) # misclassified good
              {
                train(\%tgood,($status==0) ? \$pos : \$uns,$filename,'n');
              }
            elsif (($status != 0) && defined($tspam{$filename})) # misclassified spam
              {
                train(\%tspam,($status==1) ? \$neg : \$uns,$filename,'s');
              }
          }
      }
  } until (($pos+$neg+$uns) == 0);   # repeat until a pass has no misses
print($spinstop);
# Keep at most as many spam messages as there are good ones.
prune(\%tspam,scalar keys %tgood,'spam');
print('Synchronizing Deletions with IMAP server...');
system($downloader);
print("Done.\n");
print "Compacting database ...";
# Dump and reload into a new file; only replace the live database when
# the reload succeeded and actually produced something.
if (system("$bogoutil -d $dbfile | $bogoutil -l $bogodir/compact.qdbm") == 0
    && -s "$bogodir/compact.qdbm")
  {
    rename("$bogodir/compact.qdbm","$dbfile")
      or warn("\nCould not replace $dbfile with compacted copy: $!\n");
    print("Done.\n");
  }
else
  {
    warn("\nCompaction failed (status $?); keeping $dbfile as it is.\n");
  }
print('Uploading fully-trained database...');
# NOTE(review): the FTP password is visible on the command line to other
# local users while ftp-upload runs.
system("ftp-upload -h '$ftphost' --user '$ftpuser' --password '$ftppass' --tmp-samedir --dir '$ftpdir' $dbfile");
print("Done.\n");

# loadmaildir(\%hash, $folder, $poslabel, \$poscount, $unslabel, \$unscount)
#
# Runs bogofilter in verbose bulk mode over $folder and records every
# reported message file that exists in %hash with a training count of 0.
#   $poslabel / \$poscount : label that means "misclassified" for this
#                            folder, and the counter to bump when seen.
#   $unslabel / \$unscount : label that means "unsure", and its counter.
# Dies if the bogofilter pipe cannot be started.
#
# The former ($$$$$$) prototype was dropped: every call precedes this
# definition, so it was never applied.
sub loadmaildir
  {
    my ($hashref, $folder, $poslabel, $posref, $unslabel, $unsref) = @_;
    open(my $results, '-|', "$bogofilter -v -B '$folder'")
      or die("Could not run $bogofilter on $folder: $!\n");
    while (my $line = <$results>)
      {
        print($spinner[$spindex=($spindex+1)%4]);
        # Expected line shape: "<file under $folder> x-label: <label> ...".
        # \Q..\E keeps regex metacharacters in the folder path literal.
        next unless $line =~ m|^(\Q$folder\E[^ ]+) x-label: ([^ ]+) |;
        my ($filename, $result) = ($1, $2);
        if (-f $filename)
          {
            $$hashref{$filename} = 0;
            if ($poslabel eq $result)
              {
                $$posref++;
              }
            elsif ($unslabel eq $result)
              {
                $$unsref++;
              }
          }
        else
          {
            # bogofilter named something that is not a regular file.
            print($spinstop);
            printf("unexpected result from bogofilter:\n%s\n",$line);
          }
      }
    print($spinstop);
    close($results);
    return;
  }

# train(\%hash, \$counter, $filename, $tchar)
#
# Registers a misclassified message with bogofilter.
#   \%hash    : %tgood or %tspam, whichever currently tracks $filename;
#               its value is the number of times the message was trained.
#   \$counter : miss counter to increment for this pass.
#   $tchar    : 's' to register as spam, 'n' to register as good.
#
# The first time a message is trained it is simply registered.  When it
# has already been trained before, the user is asked (60 second timeout,
# default "train again") whether to train it again, reclassify it into
# the other folder, delete it, or view it first.
# Dies if bogofilter cannot be executed.
#
# The former ($$$$) prototype was dropped: every call precedes this
# definition, so it was never applied.
sub train
  {
    my ($hashref, $countref, $filename, $tchar) = @_;
    return unless -f $filename;
    $$countref++;
    print(chr(8).$tchar.' ');
    my $runs = ++$$hashref{$filename};
    if ($runs > 1)
      {
        my $is_spam = ($tchar eq 's');
        my ($s1,$s2,$s3,$s4) =
          $is_spam ? ('S','pam','G','ood') : ('G','ood','S','pam');
        while (1)
          {
            _train_preview($filename, 20);
            printf("Train as (%s)%s, Reclassify as (%s)%s, (D)elete, or (V)iew?  ",
                   $s1,$s2,$s3,$s4);
            # Poll the keyboard once a second; fall back to "train as
            # the current class" when nothing is pressed within a minute.
            my $ans = $s1;
            my $deadline = time() + 60;
            ReadMode('cbreak');
            while (time() < $deadline)
              {
                sleep(1);
                my $char = ReadKey(-1);
                next unless defined($char);
                $ans = uc($char);
                last;
              }
            ReadMode('normal');
            print("$ans\n ");
            if ($ans eq $s1)
              {
                # Add a second copy of the message to its own folder and
                # train on that copy, so the message carries more weight
                # and we don't get asked this question every single time
                # the proggie runs...
                my $folder = $is_spam ? $spamfolder : $goodfolder;
                my $copy = _train_safecat($filename, $folder);
                if (-f $copy)
                  {
                    ($is_spam ? $tspam{$copy} : $tgood{$copy}) = 1;
                    $filename = $copy;
                  }
                else
                  {
                    # Keep $filename: the original is trained instead.
                    printf("Could not copy message to %s\n",$copy);
                  }
                last;
              }
            elsif ($ans eq 'V')
              {
                system("less '$filename'");
                next;
              }
            elsif (($ans eq $s3) or ($ans eq 'D'))
              {
                # Undo every earlier registration of this message; the
                # upper-case flag is passed to bogofilter for that.
                my $untrain = uc($tchar);
                while (--$runs)
                  {
                    return unless _train_bogofilter($untrain, $filename);
                  }
                delete($$hashref{$filename});
                if ($ans eq 'D')
                  {
                    unlink($filename);
                    delete($tspam{$filename});
                    delete($tgood{$filename});
                    return;
                  }
                # Reclassify: move the message into the other folder.
                my $dest = $is_spam ? $goodfolder : $spamfolder;
                my $newname = _train_safecat($filename, $dest);
                if (! -f $newname)
                  {
                    printf("Could not move message to %s\n",$newname);
                    return;
                  }
                printf("Moved %s to %s\n",$filename,$newname);
                unlink($filename);
                if ($is_spam)
                  {
                    delete($tspam{$filename});
                    $tgood{$newname} = 1;
                  }
                else
                  {
                    delete($tgood{$filename});
                    $tspam{$newname} = 1;
                  }
                $filename = $newname;
                # The final registration below must be in the NEW class.
                $tchar = $is_spam ? 'n' : 's';
                last;
              }
            printf("'%s' is not a valid response.\n",$ans);
          }
      }
    _train_bogofilter($tchar, $filename);
    return;
  }

# Print the interesting headers of $filename, then body lines, stopping
# once more than $limit lines have been shown.
sub _train_preview
  {
    my ($filename, $limit) = @_;
    print("\n");
    my $fh;
    unless (open($fh,'<',$filename))
      {
        printf("Could not open %s: %s\n",$filename,$!);
        return;
      }
    my $in_header = 1;
    my $shown = 0;
    while (my $line = <$fh>)
      {
        chomp($line);
        # The first empty line ends the header; until then only a few
        # selected header lines are shown.
        if (($in_header &&= length($line)))
          {
            next unless ($line =~ /^(Envelope|To|From|Subject|Date)/);
          }
        print "$line\n";
        last if ((++$shown) > $limit);
      }
    close($fh);
    return;
  }

# Copy $source into Maildir $folder with safecat and return the path of
# the new file under $folder/new (safecat's output is taken as the file
# name); the caller checks with -f whether the copy really exists.
sub _train_safecat
  {
    my ($source, $folder) = @_;
    my $name = `safecat '$folder/tmp' '$folder/new' < '$source'`;
    $name = '' unless defined($name);
    chomp($name);
    return "$folder/new/$name";
  }

# Run "bogofilter -$flag -I $filename".  Dies if bogofilter cannot be
# executed; returns 0 if it was killed by a signal, 1 otherwise.
sub _train_bogofilter
  {
    my ($flag, $filename) = @_;
    system("$bogofilter -$flag -I '$filename'");
    if ($? == -1)
      {
        die("$bogofilter failed to execute: $!\n");
      }
    if ($? & 127)
      {
        printf("%s died with signal %d\n",$bogofilter,$? & 127);
        return 0;
      }
    return 1;
  }

# prune(\%hash, $maxsave, $msgkind)
#
# Deletes the oldest messages tracked in %hash that were never needed
# for training (value 0) until at most $maxsave tracked messages are
# left, removing no more than 3000 files per call.  $msgkind is only
# used in the progress message.  Entries whose value is undefined are
# ignored: they mark messages that were already dropped, and must not
# be unlinked or counted.
#
# The former ($$$) prototype was dropped: the only call precedes this
# definition, so it was never applied.
sub prune
  {
    my ($hashref, $maxsave, $msgkind) = @_;
    my @candidates = grep { defined($$hashref{$_}) } keys %$hashref;
    # Look up each file's age once rather than on every comparison; a
    # file that has vanished sorts as the newest.
    my %age;
    foreach my $file (@candidates)
      {
        my $days = -M $file;
        $age{$file} = defined($days) ? $days : 0;
      }
    my @filenames = sort { $age{$b} <=> $age{$a} } @candidates;   # oldest first
    my $todelete = scalar(@filenames) - $maxsave;
    if ($todelete > 3000)
      {
        $todelete = 3000;
      }
    return unless $todelete > 0;
    printf("\nDeleting oldest %d %s messages not used for training... ",
           $todelete, $msgkind);
    foreach my $filename (@filenames)
      {
        print($spinner[$spindex=($spindex+1)%4]);
        next unless $$hashref{$filename} == 0;
        unlink($filename);
        delete($$hashref{$filename});
        $todelete--;
        if ($todelete < 1)
          {
            print($spinstop);
            print(" Done.\n");
            return;
          }
      }
    # Ran out of untrained messages before reaching the target.
    print($spinstop);
    printf(" %d not deleted.\n",$todelete);
    return;
  }


More information about the Bogofilter mailing list