[PATCH] -U option as the opposite of -u

Chris Wilkes cwilkes-bf at ladro.com
Tue Jan 28 19:25:17 CET 2003


Hi,

  Many thanks to David Relson who told me about this list and helped me
through some of the code.
  I use bogofilter's -u switch to automatically register mail whenever
it comes in.  Once in a while an email is mislabeled and I would like to
undo that registration.  I would like BF to be smart enough to figure out
whether it should apply the -S or the -N switch based on the message's
current classification.
  So I put together a patch against BF 0.10.1.2 that adds a -U switch.  In a
nutshell I've added:
	run_type = RUN_UPDATE_REVERSED
  to the possible values of run_type.  The registration part of bogofilter.c
has also changed a bit: a new 'todo' variable, of the same type as run_type,
is set to the action that should be passed to the register_words()
function.
  I modified the bogofilter.xml file to update the man pages, but I
couldn't run make on it because I couldn't figure out how to install the
getopt long-parameters package that xmlto requires on my FreeBSD box.  If
anyone has a pointer, please let me know.
  Here's what happens when you run with -U, where $s is an email that
was marked as good but should have been spam:
   $ bogofilter -v < $s
   X-Bogosity: No, tests=bogofilter, spamicity=0.491674, version=0.10.1.2
   $ bogofilter -U < $s
   $ bogofilter -v < $s
   X-Bogosity: Yes, tests=bogofilter, spamicity=0.660724, version=0.10.1.2
  The files I've changed:
config.c           # add "todo" and the -U switch to getopt
main.c             # add case of RUN_UPDATE_REVERSED to switch
common.h           # made "todo" of type run_t -- could change this
bogofilter.c       # updated the register_words() call
doc/bogofilter.xml # add -U to the man page
robinson.c         # have to put run_type = RUN_UPDATE_REVERSED in if()

Chris


--- config.c	Fri Jan 24 05:48:43 2003
+++ config.c.new	Tue Jan 28 10:16:41 2003
@@ -69,6 +69,7 @@
 bool	stats_in_header = true;
 
 run_t run_type = RUN_NORMAL; 
+run_t todo;
 
 const char *logtag = NULL;
 
@@ -362,8 +363,8 @@
 {
     bool registration, classification;
 
-/*  flags '-s', '-n', '-S', or '-N', are mutually exclusive of flags '-p', '-u', '-e', and '-R'. */
-    classification = (run_type == RUN_NORMAL) ||(run_type == RUN_UPDATE) || passthrough || nonspam_exits_zero || (Rtable != 0);
+/*  flags '-s', '-n', '-S', or '-N', are mutually exclusive of flags '-p', '-u', '-U', '-e', and '-R'. */
+    classification = (run_type == RUN_NORMAL) ||(run_type == RUN_UPDATE) || passthrough || nonspam_exits_zero || (Rtable != 0) || (run_type == RUN_UPDATE_REVERSED);
     registration   = (run_type == REG_SPAM) || (run_type == REG_GOOD) || (run_type == REG_GOOD_TO_SPAM) || (run_type == REG_SPAM_TO_GOOD);
 
     if (*outfname && !passthrough)
@@ -379,7 +380,7 @@
 		      "Error:  Invalid combination of options.\n"
 		      "\n"
 		      "    Options '-s', '-n', '-S', and '-N' are used when registering words.\n"
-		      "    Options '-p', '-u', '-e', and '-R' are used when classifying messages.\n"
+		      "    Options '-p', '-u', '-U', '-e', and '-R' are used when classifying messages.\n"
 		      "    The two sets of options may not be used together.\n"
 		      "    \n"
 #ifdef	GRAHAM_AND_ROBINSON
@@ -422,6 +423,7 @@
 		  "\t-n\t- register message as non-spam.\n"
 		  "\t-o val\t- set user defined spamicity cutoff.\n"
 		  "\t-u\t- classify message as spam or non-spam and register accordingly.\n"
+		  "\t-U\t- reverse classification (i.e. was registered as spam but is good) of message\n"
 		  "\t-S\t- move message's words from non-spam list to spam list.\n"
 		  "\t-N\t- move message's words from spam list to spam non-list.\n"
 		  "\t-R\t- print an R data frame.\n"
@@ -486,7 +488,7 @@
 
     fpin = stdin;
 
-    while ((option = getopt(argc, argv, "23d:eFhl::o:snSNvVpuc:CgrRx:fqtI:O:y:k:" G R F)) != EOF)
+    while ((option = getopt(argc, argv, "23d:eFhl::o:snSNvVpuUc:CgrRx:fqtI:O:y:k:" G R F)) != EOF)
     {
 	switch(option)
 	{
@@ -560,6 +562,10 @@
 
 	case 'u':
 	    run_type = RUN_UPDATE;
+	    break;
+
+	case 'U':
+	    run_type = RUN_UPDATE_REVERSED;
 	    break;
 
 	case 'k':
--- main.c	Sun Jan 19 07:02:00 2003
+++ main.c.new	Tue Jan 28 10:16:41 2003
@@ -98,6 +98,7 @@
     switch(run_type) {
 	case RUN_NORMAL:
 	case RUN_UPDATE:
+	case RUN_UPDATE_REVERSED:
 	    {
 		double spamicity;
 		rc_t   status = bogofilter(&spamicity);
@@ -188,6 +189,9 @@
 	    syslog(LOG_INFO, "%s\n", msg_bogofilter);
 	    break;
 	case RUN_UPDATE:
+	    syslog(LOG_INFO, "%s, %s\n", msg_bogofilter, msg_register);
+	    break;
+	case RUN_UPDATE_REVERSED: /* what is the msg_register? */
 	    syslog(LOG_INFO, "%s, %s\n", msg_bogofilter, msg_register);
 	    break;
 	default:
--- common.h	Tue Jan 21 19:04:19 2003
+++ common.h.new	Tue Jan 28 10:16:41 2003
@@ -57,10 +57,12 @@
 typedef enum run_e {
     RUN_NORMAL='r',
     RUN_UPDATE='u',
+    RUN_UPDATE_REVERSED='U',
     REG_SPAM='s', REG_SPAM_TO_GOOD='N', 
-    REG_GOOD='n', REG_GOOD_TO_SPAM='S'
+    REG_GOOD='n', REG_GOOD_TO_SPAM='S',
 } run_t;
 extern run_t run_type;
+extern run_t todo;
 
 typedef struct {
     double mant;
--- bogofilter.c	Sun Jan 19 07:01:50 2003
+++ bogofilter.c.new	Tue Jan 28 10:16:41 2003
@@ -58,6 +58,7 @@
     wordhash_t  *wordhash;
     long	wordcount, msgcount = 0;
     bool	cont;
+    /* todo is like run_type */
 
     set_list_active_status(true);
 
@@ -76,13 +77,22 @@
     if (xss != NULL)
         *xss = spamicity;
 
-    if (run_type == RUN_UPDATE)		/* Note: don't register if RC_UNSURE */
-    {
-	if (status == RC_SPAM)
-	    register_words(REG_SPAM, wordhash, msgcount, wordcount);
-	if (status == RC_HAM)
-	    register_words(REG_GOOD, wordhash, msgcount, wordcount);
-    }
+	/* updated to include the _REVERSED method */
+	if (run_type == RUN_UPDATE)		/* Note: don't register if RC_UNSURE */
+	{
+		if (status == RC_SPAM) todo=REG_SPAM;
+		if (status == RC_HAM)  todo=REG_GOOD;
+	}
+	if (run_type == RUN_UPDATE_REVERSED)	/* opposite of above */
+	{
+		/* if it was SPAM it should be re-classified as GOOD */
+		if (status == RC_SPAM) todo=REG_SPAM_TO_GOOD;
+		/* if it was GOOD it should be re-classified as SPAM */
+		if (status == RC_HAM)  todo=REG_GOOD_TO_SPAM;
+	}
+	/* now that we're through the above choices, actually do something */
+	if (todo)
+	    register_words(todo, wordhash, msgcount, wordcount);
 
     wordhash_free(wordhash);
 
--- doc/bogofilter.xml	Wed Jan 22 12:22:20 2003
+++ doc/bogofilter.xml.new	Tue Jan 28 10:16:41 2003
@@ -31,6 +31,7 @@
   <arg choice='opt'>-3</arg>
   <arg choice='opt'>-R</arg>
   <arg choice='opt'>-u</arg>
+  <arg choice='opt'>-U</arg>
   <arg choice='opt'>-v</arg>
   <arg choice='opt'>-V</arg>
   <arg choice='opt'>-x <replaceable>flags</replaceable></arg>
@@ -137,6 +138,14 @@
 on the spamlist and a non-spam message on the goodlist.  If using the
 Robinson-Fisher method and the classification is "unsure", the message will
 not be registered.</para>
+
+<para>The <option>-U</option> option tells
+<application>bogofilter</application> to register the message's text 
+AS THE OPPOSITE of what it currently is classified as.  This is so that if
+a message was classified as spam and it is really non-spam you can just
+send it through bogofilter again with the -U option to unregister the
+words in the spamlist and register them in the goodlist.  It's essentially
+a -S or -N after knowing the spamicity of the mail.</para>
 
 <para>The <option>-2</option> option tells
 <application>bogofilter</application> to binary classify the message as either
--- robinson.c	Tue Jan 21 20:41:10 2003
+++ robinson.c.new	Tue Jan 28 10:16:41 2003
@@ -293,7 +293,7 @@
     ** If we're registering tokens, we needn't get .MSG_COUNT
     */
 
-    if (run_type == RUN_NORMAL || run_type == RUN_UPDATE) {
+    if (run_type == RUN_NORMAL || run_type == RUN_UPDATE || run_type == RUN_UPDATE_REVERSED) {
 	scalefactor = compute_scale();
 	if (fabs(robs) < EPS)
 	    robs = ROBS;




More information about the bogofilter-dev mailing list