Handling an extremely high spam load...

Bob Vincent bogofilter at bobvincent.org
Tue Jul 20 16:06:53 CEST 2004


Okay, attached is the final version of my training script; the
original version I posted earlier had problems.

This works for me; it may not work for others.  I'm sharing it because
Tom asked me to, indirectly.

My revised script does this:

  * Download any new mail

  * Run my mail client so I can manually classify any new "unsures"

  * Restart training from an empty database.

  * Randomize spam and non-spam messages on each training run.

  * Keep track of which spam messages get used for training.

  * If any message needs to be trained more than six times,
    assume that it was misclassified:

      -  Print the filename
      -  Reverse its training
      -  Remove it from the message hash.
         (This also prevents it from being deleted.)

  * After the database is fully trained (zero errors), delete
    the oldest spam messages not used for training until the
    numbers of spam and non-spam messages are equal.

  * Upload the resultant training set and database.


I've aliased my email client (mutt) to this script, so it runs every
time I check my email.

Current accuracy is around 99% and getting better.

Incidentally, my training configuration uses cutoff values of 0.01 and
0.99 while my real configuration is using 0.10 and 0.75.

In another month or so I'll have my 2,000 ham messages so I can run
bogotune.

Usernames and passwords stored in the script have been changed for
privacy reasons, obviously.

-- 
Robert August Vincent, II
(pronounced "Bob" or "Bob-Vee")
The Web is like Usenet, but
the elephants are untrained.
-------------- next part --------------
#!/usr/bin/perl
#
# Fetch new mail, run the mail client, then rebuild the bogofilter training
# database from scratch by repeatedly training on misclassified messages
# until a full pass produces no errors.  Afterwards prune unused spam and
# upload the resulting database.
#
# The configuration variables below are deliberately package globals: the
# subroutines further down read $bogotrain, $bogofilter and $maxruns.

# --- Configuration --------------------------------------------------------
$home = '/home/bobvin';
$bin = "$home/bin";
$realmailprog = '/usr/bin/mutt';
$downloader = 'offlineimap';
$maxruns = 6;       # trainings allowed per message before it is given up on
$bogotrain  = "$bin/bogofilter -c $home/.bogofilter/training";
$bogofilter = "$bin/bogofilter -c $home/.bogofilter/config";
$bogodir = "$home/.bogofilter";
$spamfolder="$home/Mail/spam";
$goodfolder="$home/Mail/ham";
$dbfile = "$bogodir/wordlist.qdbm";
$ftphost = 'my-hosted-domain.com';
$ftpuser = 'myusername';
$ftppass = 'mypassword';
$ftpdir =  '.bogofilter';

# --- Progress spinner, advanced once a second by SIGALRM ------------------
$spindex = 0;
@spinner = (chr(8).'|',chr(8).'/',chr(8).'-',chr(8).'\\');
$SIG{ALRM} = sub { print($spinner[$spindex=($spindex+1)%4]); alarm(1); };

# --- Fetch mail and let the user classify it ------------------------------
print("Getting new messages...");
system($downloader);
print("Done.\n");
# Pass the script's own arguments through to the real mail client.
system($realmailprog, @ARGV);

# --- Classify the current training set with the live configuration --------
my (%tspam,%tgood);
my ($pos,$neg,$uns,$run) = (0,0,0,0);
print(chr(8).'Loading and categorizing training set... ');
alarm(1);
loadmaildir(\%tgood,$goodfolder,'Spam',\$pos,'????',\$uns);
loadmaildir(\%tspam,$spamfolder,'Good',\$neg,'????',\$uns);
alarm(0);
print(chr(8)." Done.\n");
$numgood = scalar keys %tgood;
$numspam = scalar keys %tspam;
$nummsgs = $numgood + $numspam;
$nummiss = $pos + $neg + $uns;
# With no messages at all the accuracy is undefined; report 0 rather than
# dying on a division by zero.
$accuracy = $nummsgs ? ($nummsgs-$nummiss) * 100.0 / $nummsgs : 0;
printf("%d Good, %d False Positive\n%d Spam, %d False Negative, %d Unsure\n%d Msgs, %d Misclassified, %.2f%% Initial Accuracy\n",$numgood,$pos,$numspam,$neg,$uns,$nummsgs,$nummiss,$accuracy);

# --- Start over with a database seeded by a single good message -----------
print("Resetting database... ");
unlink $dbfile;
my ($seed) = keys %tgood;
if (defined($seed))
  {
    system("$bogotrain -n -I '$seed'");
  }
else
  {
    print("(no good messages to seed with) ");
  }
print("Done.\n");

# --- Train on errors until a whole pass is classified correctly -----------
my @filenames=();
do
  {
    ($pos,$neg,$uns) = (0,0,0);
    printf("\n%d:  ",++$run);
    alarm(1);
    loadfilenames(\@filenames,\%tgood,\%tspam);
    foreach my $filename ( @filenames )
      {
        # Portable redirection: '&>' is a bash extension that a plain POSIX
        # sh parses as "run in background", which would lose the exit code.
        system("$bogotrain -I '$filename' >/dev/null 2>&1");
        if ($? == -1)
          {
            die("$bogotrain failed to execute: $!\n");
          }
        elsif ($? & 127)
          {
            printf("\n$bogotrain died with signal %d\n",$? & 127);
          }
        else
          {
            # Exit status as used here: 0 = spam, 1 = good, 3 = error;
            # anything else is counted as "unsure".
            my $status = ($? >> 8);
            if ($status == 3)
              {
                printf("\nUnreadable file: %s\n",$filename);
              }
            elsif (($status != 1) && defined($tgood{$filename})) # misclassified good
              {
                train(\%tgood,($status==0) ? \$pos : \$uns,$filename,'n');
              }
            elsif (($status != 0) && defined($tspam{$filename})) # misclassified spam
              {
                train(\%tspam,($status==1) ? \$neg : \$uns,$filename,'s');
              }
          }
      }
    alarm(0);
    print(chr(8)." Done.");
  } until (($pos+$neg+$uns) == 0);

# --- Balance the corpus, then publish -------------------------------------
prune(\%tspam,scalar keys %tgood,'spam');
print('Syncronizing Deletions with IMAP server...');
system($downloader);
print("Done.\n");
print('Uploading fully-trained database...');
# NOTE(review): the FTP password is passed on the command line and is
# therefore visible in the process list while the upload runs.
system("ftp-upload -h '$ftphost' --user '$ftpuser' --password '$ftppass' --tmp-samedir --dir '$ftpdir' $dbfile");
print("Done.\n");

# loadmaildir(\%messages, $folder, $poslabel, \$poscount, $unslabel, \$unscount)
#
# Runs "$bogofilter -v -B '$folder'" and parses its output.  Every line of
# the form "<file under $folder> x-label: <label> ..." adds <file> to
# %messages with a training count of 0.  When <label> equals $poslabel the
# counter behind \$poscount is incremented; when it equals $unslabel the
# counter behind \$unscount is incremented.  Lines that do not match are
# ignored.
#
# Dies if the classifier process cannot be started.
sub loadmaildir($$$$$$)
  {
    my ($hashref, $folder, $poslabel, $posref, $unslabel, $unsref) = @_;

    # Lexical handle plus an explicit check: a failed open must not be
    # mistaken for an empty folder.
    open(my $results, '-|', "$bogofilter -v -B '$folder'")
      or die("$bogofilter failed to execute: $!\n");

    while (my $line = <$results>)
      {
        # \Q..\E makes the folder path match literally even when it contains
        # regex metacharacters such as '+' or '.'.
        next unless ($line =~ m|^(\Q$folder\E[^ ]+) x-label: ([^ ]+) |);
        my ($filename, $result) = ($1, $2);

        $$hashref{$filename} = 0;
        if ($poslabel eq $result)
          {
            $$posref++;
          }
        elsif ($unslabel eq $result)
          {
            $$unsref++;
          }
      }

    # The child's exit status is not inspected here; presumably it encodes a
    # classification rather than success/failure -- TODO confirm.
    close($results);
  }


# loadfilenames(\@filenames, \%good, \%spam)
#
# Replaces the contents of @filenames with the keys of %good and %spam
# combined, in random order.
sub loadfilenames($$$)
  {
    my ($filenames, $goodref, $spamref) = @_;

    # A comparator such as { rand() <=> rand() } gives sort inconsistent
    # answers, which perl documents as undefined behaviour and which does
    # not yield a uniform shuffle.  List::Util is a core module.
    require List::Util;
    @$filenames = List::Util::shuffle(keys %$goodref, keys %$spamref);
  }

    
# train(\%messages, \$counter, $filename, $tchar)
#
# Registers one more training round for $filename.  Increments the counter
# behind \$counter and the per-file count in %messages, then:
#
#   * normally, runs "$bogotrain -$tchar -I '$filename'" once
#     ($tchar is 'n' or 's' at the call sites);
#   * if the per-file count now exceeds $maxruns, the message is treated as
#     misclassified: every earlier training is reversed by running the
#     command with the upper-cased flag once per previous round, and the
#     file is removed from %messages.
#
# Finally prints the flag that was used as a progress marker.
# Dies if the training command cannot be executed.
sub train($$$$)
  {
    my ($hashref, $countref, $filename, $tchar) = @_;

    $$countref++;
    my $runs     = ++$$hashref{$filename};
    my $exceeded = ($runs > $maxruns);
    my $times    = 1;

    if ($exceeded)
      {
        print("\nFile exceeded training threshold; untraining $filename\n");
        $tchar = uc($tchar);
        $times = $runs - 1;     # undo each round that was actually trained
      }

    for (1 .. $times)
      {
        system("$bogotrain -$tchar -I '$filename'");
        if ($? == -1)
          {
            die("$bogotrain failed to execute: $!\n");
          }
        elsif ($? & 127)
          {
            printf("%s died with signal %d\n",$bogotrain,$? & 127);
          }
      }

    if ($exceeded)
      {
        # unlink($filename);
        # Remove the key outright.  Merely undef-ing the value would leave an
        # entry that compares == 0, i.e. one that looks like "never used for
        # training" and is therefore eligible for deletion.
        delete $$hashref{$filename};
      }

    print(chr(8).$tchar.' ');
  }

# prune(\%messages, $maxsave, $msgkind)
#
# If %messages holds more than $maxsave files, deletes the oldest files
# whose training count is exactly 0 until only $maxsave entries' worth
# remain or no more candidates are left.  $msgkind is only used in the
# progress message.  Files with a non-zero count, or with no defined count
# at all, are never deleted.
sub prune($$$)
  {
    my ($hashref, $maxsave, $msgkind) = @_;

    # -M is the file age in days, so descending order puts the oldest first.
    my @filenames = sort { (-M $b) <=> (-M $a) } keys %$hashref;
    my $todelete  = scalar(@filenames) - $maxsave;
    return if ($todelete <= 0);

    printf("\nDeleting oldest %d %s messages not used for training... ",
           $todelete, $msgkind);
    alarm(1);
    foreach my $filename (@filenames)
      {
        my $uses = $$hashref{$filename};
        # An undefined count is not the same as "never used": undef == 0 is
        # true in perl, so it has to be excluded explicitly.
        next unless (defined($uses) && $uses == 0);
        # Only count files that were really removed.
        next unless unlink($filename);
        last if (--$todelete < 1);
      }
    alarm(0);

    if ($todelete < 1)
      {
        print(chr(8)." Done.\n");
      }
    else
      {
        printf(chr(8)." %d not deleted.\n",$todelete);
      }
    return;
  }


More information about the Bogofilter mailing list