Training script for extremely high spam loads

Bob Vincent bobvin at pillars.net
Fri Jul 23 06:28:57 CEST 2004


Enclosed is the third (and hopefully final) revision of my training script.

As before, I run this every time I read mail, but the strategy has changed once more.

Pseudocode:

  Download new mail
  Run mail program;
    (I manually classify "unsures" by dropping them in my ham or spam folder.)
  Delete bogofilter database and reload with hams.
  Randomly mix together all training messages (hams and spams).
  For each message:
     Check it with bogofilter
     If the result doesn't match the folder it came from:
       If we've already trained it once, ask the user for confirmation.
         If the user says keep training, 
           Make a COPY of the message in the same folder.
           Set the current message to the copy.
         If the user says reclassify,
           Untrain the message.
           MOVE the message into the opposite folder.
           Reverse the training direction.
       Train the message with bogofilter
  Repeat until all messages are properly classified with no re-training.
  If we've got more spams than hams (Nearly always the case)
    Delete the oldest spams not used for training until #spams = #hams




 

         
-------------- next part --------------
#!/usr/bin/perl
#
# Wrapper around the mail reader that retrains bogofilter after every
# mail session:
#   1. fetch new mail, then run the real mail program;
#   2. classify every message in the ham and spam folders and report the
#      initial accuracy;
#   3. rebuild the word list from the hams, then repeatedly check all
#      messages in random order, training each misclassified one, until a
#      full pass needs no training;
#   4. prune surplus untrained spams, resync with the server and upload
#      the new database.
#
# The subs below (loadmaildir, train, prune) read the configuration
# globals and the %tgood/%tspam hashes declared here.
use List::Util qw(shuffle);

# Progress text and the spinner are printed without trailing newlines, so
# turn off output buffering or they would not show up when intended.
$| = 1;

$home = '/home/myusername';
$bin = "$home/bin";
$realmailprog = '/usr/bin/mutt';
$downloader = 'offlineimap';
$bogotrain  = "$bin/bogofilter -c $home/.bogofilter/training";
$bogofilter = "$bin/bogofilter -c $home/.bogofilter/config";
$bogodir = "$home/.bogofilter";
$spamfolder="$home/Mail/spam";
$goodfolder="$home/Mail/ham";
$dbfile = "$bogodir/wordlist.qdbm";
$ftphost = 'myhost.name';
$ftpuser = 'myusername';
$ftppass = 'mypassword';
$ftpdir =  '.bogofilter';

# Spinner: every second the ALRM handler backspaces over the previous
# character, prints the next one and re-arms itself.
$spindex = 0;
@spinner = (chr(8).'|',chr(8).'/',chr(8).'-',chr(8).'\\');
$SIG{ALRM} = sub { print($spinner[$spindex=($spindex+1)%4]); alarm(1); };

print("Getting new messages...");
system($downloader);
print("Done.\n");

# Run the real mail reader with whatever arguments this wrapper was given.
system($realmailprog,@ARGV);

# filename => number of times the message has been trained in this run.
my (%tspam,%tgood);
# Misclassification counters: false positives, false negatives, unsures.
my ($pos,$neg,$uns) = (0,0,0);

print('Loading and categorizing training set... ');
loadmaildir(\%tgood,$goodfolder,'Spam',\$pos,'????',\$uns);
loadmaildir(\%tspam,$spamfolder,'Good',\$neg,'????',\$uns);
print(" Done.\n");

$numgood = scalar keys %tgood;
$numspam = scalar keys %tspam;
$nummsgs = $numgood + $numspam;
$nummiss = $pos + $neg + $uns;

# Stop before the database is deleted if there is nothing to rebuild it
# from (this also avoids dividing by zero in the accuracy figure).
if ($nummsgs == 0)
  {
    die("No training messages found in $goodfolder or $spamfolder; database left untouched.\n");
  }

printf("%d Good, %d Spam",$numgood,$numspam);
if ($pos>0)
  {
    printf(", %d False Positive%s",$pos,$pos==1 ? '' : 's');
  }
if ($neg>0)
  {
    printf(", %d False Negative%s",$neg,$neg==1 ? '' : 's');
  }
if ($uns>0)
  {
    printf(", %d Unsure%s",$uns,$uns==1 ? '' : 's');
  }
printf(".\n%d Messages, %d Misclassified, %.2f%% Initial Accuracy.\n",$nummsgs,$nummiss,($nummsgs-$nummiss) * 100.0 / $nummsgs);

print("Resetting database and preloading known-good messages... ");
unlink $dbfile;
system("find '$goodfolder' -type f | $bogotrain -n -b");
# Every ham has now been trained once.
foreach my $goodfile (keys %tgood)
  {
    $tgood{$goodfile} = 1;
  }
print("Done.\n");

print(' ');alarm(1);
do
  {
    ($pos,$neg,$uns) = (0,0,0);
    # shuffle() gives a proper random order; sorting with a random
    # comparator does not.
    foreach my $filename ( shuffle(keys %tgood, keys %tspam) )
      {
        system("$bogotrain -I '$filename'");
        if ($? == -1)
          {
            die("$bogotrain failed to execute: $!\n");
          }
        elsif ($? & 127)
          {
            printf("\n$bogotrain died with signal %d\n",$? & 127);
          }
        else
          {
            # Exit status as used below: 0 is treated as "spam", 1 as
            # "good", 3 as an error, anything else as "unsure".
            $status = ($? >> 8);
            if ($status == 3)
              {
                printf("\nI/O or other error on file: %s\n",$filename);
              }
            elsif (($status != 1) && defined($tgood{$filename})) # misclassified good
              {
                train(\%tgood,($status==0) ? \$pos : \$uns,$filename,'n');
              }
            elsif (($status != 0) && defined($tspam{$filename})) # misclassified spam
              {
                train(\%tspam,($status==1) ? \$neg : \$uns,$filename,'s');
              }
          }
      }
  } until (($pos+$neg+$uns) == 0);
alarm(0);

# Keep at most as many spams as hams.
prune(\%tspam,scalar keys %tgood,'spam');

print('Synchronizing Deletions with IMAP server...');
system($downloader);
print("Done.\n");
print('Uploading fully-trained database...');
system("ftp-upload -h '$ftphost' --user '$ftpuser' --password '$ftppass' --tmp-samedir --dir '$ftpdir' $dbfile");
print("Done.\n");

# loadmaildir(\%files, $folder, $poslabel, \$poscount, $unslabel, \$unscount)
#
# Runs "$bogofilter -v -B '$folder'" and reads its output line by line.
# Each line of the form "<path under $folder> x-label: <label> ..." adds
# <path> to %files with a training count of 0.  When <label> equals
# $poslabel, $poscount is incremented; when it equals $unslabel, $unscount
# is incremented.  Lines that do not match are ignored.
#
# Dies if the classifier pipe cannot be opened.
sub loadmaildir($$$$$$)
  {
    my ($hashref,$folder,$poslabel,$posref,$unslabel,$unsref) = @_;

    open(my $results,'-|',"$bogofilter -v -B '$folder'")
      or die("Cannot run $bogofilter on $folder: $!\n");
    while (my $line = <$results>)
      {
        # \Q..\E makes the folder name match literally, even if it
        # contains regex metacharacters, and keeps "$folder[" from being
        # read as an array subscript.
        next unless ($line =~ m|^(\Q$folder\E[^ ]+) x-label: ([^ ]+) |);
        my ($filename,$result) = ($1,$2);
        $$hashref{$filename} = 0;
        if ($poslabel eq $result)
          {
            $$posref++;
          }
        elsif ($unslabel eq $result)
          {
            $$unsref++;
          }
      }
    close($results);
  }

# train(\%files, \$counter, $filename, $tchar)
#
# Registers one misclassification of $filename and trains it.
#   \%files   hash the message belongs to (filename => times trained)
#   \$counter misclassification counter to increment
#   $tchar    lower-case bogofilter registration flag: 's' for a message
#             from the spam folder, 'n' for one from the good folder
#
# The first time a message is seen it is simply trained with
# "$bogotrain -$tchar".  If it has already been trained in this run, the
# spinner is paused and the user is shown the main headers and the start
# of the body, then asked to choose:
#   Train as <same kind>  - copy the message into the same folder with
#                           safecat and train the copy instead;
#   Reclassify            - undo the earlier trainings (upper-case flag)
#                           and move the message to the other folder;
#   Delete                - undo the earlier trainings and delete it;
#   View                  - page the whole message with less, ask again.
# End of input on STDIN is treated as "train once more without copying".
#
# Dies if bogofilter cannot be executed.
sub train($$$$)
  {
    my ($hashref,$countref,$filename,$tchar) = @_;

    # Run bogotrain with one registration flag on one file.
    my $bogo = sub
      {
        my ($flag,$file) = @_;
        system("$bogotrain -$flag -I '$file'");
        if ($? == -1)
          {
            die("$bogotrain failed to execute: $!\n");
          }
        elsif ($? & 127)
          {
            printf("$bogotrain died with signal %d\n",$? & 127);
          }
      };

    $$countref++;
    my $runs = ++$$hashref{$filename};
    if ($runs > 1)
      {
        alarm(0);                       # pause the spinner while prompting
        my ($s1,$s2,$s3,$s4) =
          ($tchar eq 's') ?
            ('S','pam','G','ood') :
              ('G','ood','S','pam');
        while (1)
          {
            my $head = 1;               # still inside the header block?
            my $lines = 0;
            my $limit = 20;
            print("\n");
            if (open(my $msg,'<',$filename))
              {
                while (my $line = <$msg>)
                  {
                    chomp($line);
                    # The first empty line ends the headers; $head then
                    # stays false for the rest of the message.
                    if (($head &&= length($line)))
                      {
                        next unless ($line =~ /^(Envelope|To|From|Subject|Date)/);
                      }
                    print "$line\n";
                    last if ((++$lines) > $limit);
                  }
                close($msg);
              }
            else
              {
                print("Cannot open $filename: $!\n");
              }
            printf("Train as (%s)%s/Reclassify as (%s)%s/(D)elete/(V)iew?  ",
                   $s1,$s2,$s3,$s4);
            my $reply = <STDIN>;
            if (!defined($reply))
              {
                # No more input: stop asking instead of looping forever.
                print("\n");
                last;
              }
            my $ans = uc(substr($reply,0,1));
            if ($ans eq $s1)
              {
                # Train a copy of the message rather than the original, so
                # the question is not asked again for the same file on
                # every run.
                my $folder = $tchar eq 's' ? $spamfolder : $goodfolder;
                my $copy = `safecat '$folder/tmp' '$folder/new' < '$filename'`;
                my $failed = $?;
                chomp($copy);
                if ($failed == 0 && length($copy))
                  {
                    # The original is not trained this round; the copy is.
                    $$hashref{$filename}--;
                    $filename = "$folder/new/$copy";
                    if ($tchar eq 's')
                      {
                        $tspam{$filename} = 1;
                      }
                    else
                      {
                        $tgood{$filename} = 1;
                      }
                  }
                else
                  {
                    print("safecat failed; training the original message instead.\n");
                  }
                last;
              }
            elsif ($ans eq 'V')
              {
                system('less',$filename);
                next;
              }
            elsif (($ans eq $s3) or ($ans eq 'D'))
              {
                # Undo every earlier training of this message ($runs also
                # counts the current, not yet performed, one).
                my $untrain = uc($tchar);
                for (1 .. $runs-1)
                  {
                    $bogo->($untrain,$filename);
                  }
                # Remove the entry entirely so the file is no longer
                # iterated or counted under its old folder.
                delete($$hashref{$filename});
                if ($ans eq $s3)
                  {
                    my $dest = ($tchar eq 's') ? $goodfolder : $spamfolder;
                    # Keep the message in the same Maildir subdirectory
                    # (cur or new) of the other folder; default to new.
                    my ($subdir,$base) = ($filename =~ m{([^/]+)/([^/]+)\z});
                    $base = $filename unless defined($base);
                    $subdir = 'new' unless (defined($subdir) && $subdir =~ /^(?:cur|new)\z/);
                    my $moved = "$dest/$subdir/$base";
                    if (system('mv','--',$filename,$moved) == 0)
                      {
                        # Register the message under its new path, untrained.
                        if ($tchar eq 's')
                          {
                            $tgood{$moved} = 0;
                          }
                        else
                          {
                            $tspam{$moved} = 0;
                          }
                      }
                    else
                      {
                        print("Could not move $filename to $moved\n");
                      }
                  }
                else
                  {
                    unlink($filename);
                  }
                print(' ');alarm(1);
                return;
              }
            printf("'%s' is not a valid response.\n",$ans);
          }
        print(' ');alarm(1);            # resume the spinner
      }
    $bogo->($tchar,$filename);
  }

# prune(\%files, $maxsave, $msgkind)
#
# If %files holds more than $maxsave entries, deletes the surplus from
# disk, oldest first (by modification age, -M), skipping every message
# whose training count is not 0.  Entries whose file was deleted are also
# removed from %files.  $msgkind is only used in the progress message.
# Prints how many messages could not be deleted when there are not enough
# untrained ones.
sub prune($$$)
  {
    my ($hashref,$maxsave,$msgkind) = @_;

    my $todelete = scalar(keys %$hashref) - $maxsave;
    return if ($todelete <= 0);

    printf("\nDeleting oldest %d %s messages not used for training... ",
           $todelete, $msgkind);

    # Look up each file's age once instead of on every sort comparison;
    # files that cannot be examined count as age 0.
    my %age;
    foreach my $name (keys %$hashref)
      {
        my $days = -M $name;
        $age{$name} = defined($days) ? $days : 0;
      }

    foreach my $name (sort { $age{$b} <=> $age{$a} } keys %age)
      {
        # Only messages that were never trained are candidates.
        next unless (defined($$hashref{$name}) && $$hashref{$name} == 0);
        # Count a message as deleted only if it really was removed.
        next unless (unlink($name));
        delete($$hashref{$name});
        $todelete--;
        if ($todelete < 1)
          {
            print(" Done.\n");
            return;
          }
      }
    printf(" %d not deleted.\n",$todelete);
  }


More information about the Bogofilter mailing list