New "register if needed" feature?

Randall Nortman bogofilterlist at wonderclown.com
Mon Aug 1 03:36:06 CEST 2005


I've created a *preliminary* patch implementing the feature I
proposed.  The patch against 0.95.2 is attached; I haven't tried
applying it to the CVS code yet.  The code is minimally tested, and I
wasn't very careful about matching the existing indentation style,
which I can fix.  As David Relson said, most of the logic works in
much the same way the thresh_update logic works.  I added two options:

  --train-on-error                  Train only if message is misclassified
  --output-only-misclassified       Output only misclassified messages

Any other opinions on option names are welcome.  I hate naming
things.

The --train-on-error option enables the train-on-error feature.  It
must be combined with either -n or -s (without one of them it has no
effect and only a warning is printed) and cannot be combined with -N,
-S, or -u.  It works in single-message mode as well as with -M, -B, or
-b.  If -v is specified, a line is printed to stderr (actually,
dbgout) saying how many messages were misclassified.

The --output-only-misclassified option exists so that scripts can
determine which messages in the set were misclassified (and
subsequently registered in the wordlist).  If combined with -p on an
mbox, the output will be an mbox with only the misclassified messages
(I think; this is untested).  If combined with -v on a maildir or in
bulk mode, you'll get the filenames of only the misclassified
messages, plus the X-Bogosity information (just like -v normally
does).

While thinking of how I'm going to integrate this new feature into my
training scripts, another new feature occurred to me: I'd like to add
a REG_MIXED mode, which would be combined with bulk mode (-b or -B),
to allow registration of both ham and spam in a single run.  Instead
of just providing filenames of objects to be classified, you would
provide "spam:<filename>" or "ham:<filename>".  Then, each time
bogofilter moves on to the next object, it would switch into REG_SPAM
or REG_GOOD mode as appropriate.  This could be combined with
train-on-error or used independently.  This would primarily be useful
for initial training, so that the training script can alternate
between ham and spam messages.  From what I can tell, you need to
register some of each before bogofilter will start classifying
anything, right?  So if train-on-error is going to be useful for
initial training, you need to be able to switch between ham and spam.
(This is how bogominitrain.pl works.)  I intend to use this on
maildirs, and give bogofilter one filename at a time on stdin using
-b, picking files alternately from the ham and spam maildirs.

I think this is rather easy to implement by having the _next_mailstore
functions set the right bit (REG_SPAM or REG_GOOD) in the global
run_type variable each time a mailstore is opened (after parsing out
the "spam:" or "ham:" prefix).  You could also allow it to work in
mbox mode (-M) rather than bulk mode if the messages had a header to
tell bogofilter whether to register them as ham or spam, but that
might be a bit trickier to implement.

But before I do any more on this, I figured I'd share my work so far
in case anybody thinks I'm taking the wrong approach.  Ultimately, I'd
like to see my patch integrated into the mainstream code, if others
consider it useful.  Comments welcome.

Randall Nortman
-------------- next part --------------
diff -ru bogofilter-0.95.2-dist/src/bogoconfig.c bogofilter-0.95.2/src/bogoconfig.c
--- bogofilter-0.95.2-dist/src/bogoconfig.c	2005-06-29 07:12:09.000000000 -0400
+++ bogofilter-0.95.2/src/bogoconfig.c	2005-07-31 19:58:55.000000000 -0400
@@ -142,6 +142,8 @@
     { "timestamp",			R, 0, O_TIMESTAMP },
     { "unsure-subject-tag",		R, 0, O_UNSURE_SUBJECT_TAG },
     { "wordlist",			R, 0, O_WORDLIST },
+    { "train-on-error",			N, 0, O_TRAIN_ON_ERROR },
+    { "output-only-misclassified",	N, 0, O_OUTPUT_ONLY_MISCLASSIFIED },
     /* end of list */
     { NULL,				0, 0, 0 }
 };
@@ -253,7 +255,8 @@
 {
 /*  flags '-s', '-n', '-S', and '-N' are mutually exclusive with
     flags '-p', '-u', '-e', and '-R'. */
-    run_classify = (run_type & (RUN_NORMAL | RUN_UPDATE)) != 0;
+    run_classify =
+	(run_type & (RUN_NORMAL | RUN_UPDATE | RUN_TRAINERROR)) != 0;
     run_register = (run_type & (REG_SPAM | REG_GOOD | UNREG_SPAM | UNREG_GOOD)) != 0;
 
     if (*outfname && !passthrough)
@@ -263,7 +266,7 @@
 		      outfname);
     }
     
-    if (run_register && (run_classify || Rtable))
+    if (run_register && ((run_type & RUN_UPDATE) || Rtable))
     {
 	(void)fprintf(stderr,
 		      "Error:  Option '-u' may not be used with options '-s', '-n', '-S', or '-N'.\n"
@@ -271,6 +274,11 @@
 	return EX_ERROR;
     }
 
+    if ((run_type & RUN_TRAINERROR) && !run_register)
+    {
+	(void)fprintf(stderr,
+		      "Warning: Option --train-on-error has no effect without -s or -n\n");
+    }
     return EX_OK;
 }
 
@@ -350,6 +358,8 @@
     "  --unsure-subject-tag              like spam-subject-tag\n",
     "  --user-config-file                configuration file\n",
     "  --wordlist                        specify wordlist parameters\n",
+    "  --train-on-error                  Train only if message is misclassified\n",
+    "  --output-only-misclassified       Output only misclassified messages\n",
     "\n",
     "bogofilter is a tool for classifying email as spam or non-spam.\n",
     "\n",
@@ -537,7 +547,7 @@
 	break;
 
     case 'N':
-	run_type = check_run_type(UNREG_GOOD, REG_GOOD | UNREG_SPAM);
+	run_type = check_run_type(UNREG_GOOD, REG_GOOD | UNREG_SPAM | RUN_TRAINERROR);
 	break;
 
     case 'O':
@@ -567,11 +577,11 @@
 	break;
 
     case 'S':
-	run_type = check_run_type(UNREG_SPAM, REG_SPAM | UNREG_GOOD);
+	run_type = check_run_type(UNREG_SPAM, REG_SPAM | UNREG_GOOD | RUN_TRAINERROR);
 	break;
 
     case 'u':
-	run_type |= RUN_UPDATE;
+	run_type = check_run_type(RUN_UPDATE, RUN_TRAINERROR);
 	break;
 	
     case 'U':
@@ -708,6 +718,12 @@
     case O_UNSURE_SUBJECT_TAG:		unsure_subject_tag = get_string(name, val);		break;
     case O_UNICODE:			encoding = get_bool(name, val) ? E_UNICODE : E_RAW;	break;
     case O_WORDLIST:			configure_wordlist(val);				break;
+    case O_TRAIN_ON_ERROR:
+	run_type = check_run_type(RUN_TRAINERROR, RUN_UPDATE | UNREG_SPAM | UNREG_GOOD);
+	break;
+    case O_OUTPUT_ONLY_MISCLASSIFIED:
+	output_only_misclassified = true;
+	break;
 
     case O_DB_TRANSACTION:		eTransaction = get_txn(name, val);			break;
 
diff -ru bogofilter-0.95.2-dist/src/bogofilter.c bogofilter-0.95.2/src/bogofilter.c
--- bogofilter-0.95.2-dist/src/bogofilter.c	2005-05-25 19:32:19.000000000 -0400
+++ bogofilter-0.95.2/src/bogofilter.c	2005-07-31 20:20:30.000000000 -0400
@@ -71,10 +71,12 @@
     uint msgcount = 0;
     rc_t status = RC_OK;
     bool register_opt = (run_type & (REG_SPAM | UNREG_SPAM | REG_GOOD | UNREG_GOOD)) != 0;
-    bool register_bef = register_opt && passthrough;
-    bool register_aft = ((register_opt && !passthrough) || (run_type & RUN_UPDATE)) != 0;
+    bool register_bef = register_opt && passthrough && !(run_type & RUN_TRAINERROR);
+    bool register_aft = ((register_opt && !passthrough && !(run_type & RUN_TRAINERROR)) || (run_type & RUN_UPDATE)) != 0;
     bool write_msg    = passthrough || Rtable;
-    bool classify_msg = write_msg || ((run_type & (RUN_NORMAL | RUN_UPDATE))) != 0;
+    bool classify_msg = write_msg || ((run_type & (RUN_NORMAL | RUN_UPDATE | RUN_TRAINERROR))) != 0;
+    uint num_misclassified = 0;
+    bool last_msg_misclassified;
 
     wordhash_t *words;
 
@@ -90,6 +92,7 @@
     while ((*reader_more)()) {
 	double spamicity;
 	wordhash_t *w = wordhash_new();
+	last_msg_misclassified = false;
 
 	rstats_init();
 	passthrough_setup();
@@ -119,14 +122,33 @@
 		if (status == RC_HAM && spamicity >= thresh_update)
 		    register_words(REG_GOOD, w, msgcount);
 	    }
+            else if (run_type & RUN_TRAINERROR)
+            {
+                if (((run_type & REG_SPAM) && (status != RC_SPAM))
+                    || ((run_type & REG_GOOD) && (status != RC_HAM)))
+                {
+                    /* We miscategorized, so first register the message
+                       appropriately */
+                    register_words(run_type & (REG_SPAM|REG_GOOD), w, 1);
+                    ++num_misclassified;
+                    last_msg_misclassified = true;
+                    /* Now re-classify to get the updated score and status */
+                    lookup_words(w);
+                    spamicity = msg_compute_spamicity(w, NULL);
+                    status = msg_status();
+                }
+            }
 
-	    if (verbose && !passthrough && !quiet) {
-		const char *filename = (*reader_filename)();
-		if (filename)
-		    fprintf(fpo, "%s ", filename); 
-	    }
+	    if (!output_only_misclassified || last_msg_misclassified)
+	    {
+		if (verbose && !passthrough && !quiet) {
+		    const char *filename = (*reader_filename)();
+                    if (filename)
+                      fprintf(fpo, "%s ", filename); 
+                }
 
-	    write_message(status);		/* passthrough */
+                write_message(status);		/* passthrough */
+            }
 	    if (logflag && !register_opt) {
 		write_log_message(status);
 		msgcount = 0;
@@ -162,6 +184,12 @@
     if (DEBUG_MEMORY(1))
 	MEMDISPLAY;
 
+    if ((run_type & RUN_TRAINERROR) && verbose) {
+      (void)fprintf(dbgout,
+                    "# %d messages were misclassified and subsequently registered as %s.\n",
+                    num_misclassified, (run_type & REG_SPAM) ? "spam" : "ham");
+    }
+
     return status;
 }
 
diff -ru bogofilter-0.95.2-dist/src/common.h bogofilter-0.95.2/src/common.h
--- bogofilter-0.95.2-dist/src/common.h	2005-06-18 09:37:00.000000000 -0400
+++ bogofilter-0.95.2/src/common.h	2005-07-31 11:55:48.000000000 -0400
@@ -104,7 +104,8 @@
     REG_SPAM   = BIT(2),
     REG_GOOD   = BIT(3),
     UNREG_SPAM = BIT(4),
-    UNREG_GOOD = BIT(5)
+    UNREG_GOOD = BIT(5),
+    RUN_TRAINERROR = BIT(6)
 } run_t;
 extern run_t run_type;
 extern bool  run_classify;
diff -ru bogofilter-0.95.2-dist/src/globals.c bogofilter-0.95.2/src/globals.c
--- bogofilter-0.95.2-dist/src/globals.c	2005-06-28 07:18:52.000000000 -0400
+++ bogofilter-0.95.2/src/globals.c	2005-07-31 19:48:54.000000000 -0400
@@ -36,6 +36,7 @@
 bool	terse;				/* '-t' */
 int	bogotest;			/* '-X', env("BOGOTEST") */
 int	verbose;			/* '-v' */
+bool	output_only_misclassified;	/* '--output-only-misclassified' */
 
 /* config file options */
 double	min_dev;
diff -ru bogofilter-0.95.2-dist/src/globals.h bogofilter-0.95.2/src/globals.h
--- bogofilter-0.95.2-dist/src/globals.h	2005-06-07 19:06:20.000000000 -0400
+++ bogofilter-0.95.2/src/globals.h	2005-07-31 19:49:13.000000000 -0400
@@ -37,6 +37,8 @@
 extern	int	bogotest;		/* '-X', env("BOGOTEST") */
 extern	int	verbose;		/* '-v' */
 extern	bool	replace_nonascii_characters;	/* '-n' */
+extern	bool	output_only_misclassified;	/* '--output-only-misclassified' */
+
 
 /* config file options */
 extern	double	min_dev;
diff -ru bogofilter-0.95.2-dist/src/longoptions.h bogofilter-0.95.2/src/longoptions.h
--- bogofilter-0.95.2-dist/src/longoptions.h	2005-06-20 19:30:56.000000000 -0400
+++ bogofilter-0.95.2/src/longoptions.h	2005-07-31 19:50:51.000000000 -0400
@@ -65,7 +65,9 @@
     O_UNICODE,
     O_UNSURE_SUBJECT_TAG,
     O_USER_CONFIG_FILE,
-    O_WORDLIST
+    O_WORDLIST,
+    O_TRAIN_ON_ERROR,
+    O_OUTPUT_ONLY_MISCLASSIFIED
 } longopts_t;
 
 #ifndef	DISABLE_UNICODE


More information about the bogofilter-dev mailing list