singletons

Sun Dec 28 01:16:29 CET 2003

David Relson <relson at osagesoftware.com> writes:
[..]
> Good ideas :-)

Indeed. :)
 
> I've got a "hint" mechanism that's not yet released.  Counts are kept of
> "interesting" occurrences (of which "singleton" could be one).  At the
> end of parsing a message, the counts are converted to tokens like
> "hint:singleton:x", where x is in a sequence like
> 1,2,5,10,20,50,100,200,500, etc.  If a message had 7 tokens, then the
> "...:1", "...:2", and "...:5" tokens would be generated.  At final
> scoring the hints are treated just like other tokens.  More work is
> needed to see what's useful.

My current version has a patch similar to this, but I used
a formula like 

#define MLOG(a) ((int) (5.0*log( 1.0 + (a) ) ))    // 'a' is the raw count.

instead. This provided better granularity at the small end
of the range. (In my testing, the significance
of '2' is very different from that of '3'.)

> Token-pairs are pretty easy.  A flag in the get_token() routine and
> remembering the previous token will allow the routine to alternate
> between returning single tokens and tokens pairs.

Something like the patch below, maybe? (There is no option
flag in this patch, though.)

(PS: I hand-edited this patch; it may not apply cleanly, but
you get the idea...)

diff -u -r bogofilter-0.15.3/src/token.c bogofilter-dev/src/token.c

--- bogofilter-0.15.3/src/token.c	Wed Sep 10 11:04:15 2003
+++ bogofilter-dev/src/token.c	Sun Sep 14 08:13:07 2003
@@ -37,6 +37,8 @@
 static word_t *w_from = NULL;	/* From: */
 static word_t *w_rtrn = NULL;	/* Return-Path: */
 static word_t *w_subj = NULL;	/* Subject: */
+static word_t *w_recv = NULL;	/* Received: */
+static word_t *w_header = NULL;	/* NOTE(review): was labelled "Received:" like w_recv; not used in the hunks shown -- confirm purpose */
 
 /* Global Variables */
 
@@ -49,8 +51,81 @@
 
 /* Function Definitions */
 
+#define STACK_SIZE 20	/* capacity of the pending-token stack below */
+static word_t * stack_word[STACK_SIZE];	/* token-pair words queued by get_token() */
+static token_t stack_class[STACK_SIZE];	/* class returned together with each queued word */
+static int sp = 0;	/* number of entries currently queued */
+
+static word_t * lastword = NULL;	/* copy of the previous raw token, or NULL if none */
+static token_t lastclass = -1;	/* class of lastword; only written, never read, in the code shown */
+
+static word_t *spaceword = NULL;	/* "*" separator word, created on first use in get_token() */
+/* Forward declaration: get_token() calls get_raw_token(), which is defined after it. */
+token_t get_raw_token(void);
+
 token_t get_token(void)
 {
+    token_t c, class;
+    word_t * w;
+
+
+    if (sp > 0) {
+        --sp;
+        if (yylval)
+                word_free(yylval);
+        yylval = stack_word[sp];
+        return stack_class[sp];
+    }
+
+again:
+    class = get_raw_token();
+
+    if (!class && lastword) { /* EOF */
+        word_free(lastword);
+        lastword = NULL;
+        return class;
+    }
+
+    if (!class)
+        return class;
+
+#if 0
+
+
+
+
+
+
+
+
+
+
+
+#endif
+
+    if (!spaceword)  spaceword = word_new("*", 1); /* spacer. */
+
+    if (lastword) {
+        w = word_concat(lastword, spaceword);
+        stack_word[sp] = word_concat(w, yylval);
+        stack_class[sp] = class;
+        sp++;
+
+        word_free(w);
+        word_free(lastword);
+        lastword = NULL;
+    }
+
+    lastword = word_dup(yylval);
+    lastclass = class;
+
+    return class;
+}
+
+
+
+token_t get_raw_token(void)
+{
     token_t cls = NONE;
     unsigned char *cp;
     bool done = false;
@@ -59,13 +134,17 @@
     if ( block_on_subnets && save_class == IPADDR )
     {
 	byte *t = xmemrchr(ipsave->text, '.', ipsave->leng);
-	if (t == NULL)
+	if (t == NULL) {
 	    save_class = NONE;
+	    word_free(ipsave);
+	}
 	else
 	{
 	    *t = '\0';	
 	    ipsave->leng = t - ipsave->text;
-	    yylval = ipsave;
+	    if (yylval)
+		word_free(yylval);
+	    yylval = word_dup(ipsave);
 	    return save_class;
 	}
     }
@@ -78,12 +157,12 @@
 	yylval->leng = *lexer->yyleng;
 	yylval->text = (unsigned char *)(*lexer->yytext);
 
-	if (DEBUG_TEXT(2)) { 
+	if (DEBUG_TEXT(2)) {
 	    word_puts(yylval, 0, dbgout);
 	    fputc('\n', dbgout);
 	}
-	    
-	if (cls == NONE)
+	
+	if (cls == NONE) /* End of message */
 	    break;
 
 	switch (cls) {
@@ -96,7 +175,6 @@
 	    else	/* "spc:invalid_end_of_header" */
 		yylval = word_dup(nonblank_line);
 	    break;
-
 	case BOUNDARY:	/* don't return boundary tokens to the user */
 	    continue;
 
@@ -139,23 +217,23 @@
 		 * Trick collected by ESR in real time during John
 		 * Graham-Cummings's talk at Paul Graham's spam conference
 		 * in January 2003...  Some spammers know that people are
-		 * doing recognition on spamhaus IP addresses.  They use 
-		 * the fact that HTML clients normally interpret IP addresses 
-		 * by doing a simple accumulate-and-shift algorithm; they 
+		 * doing recognition on spamhaus IP addresses.  They use
+		 * the fact that HTML clients normally interpret IP addresses
+		 * by doing a simple accumulate-and-shift algorithm; they
 		 * add large random multiples of 256 to the quads to
-		 * mask their origin.  Nuke the high bits to unmask the 
+		 * mask their origin.  Nuke the high bits to unmask the
 		 * address.
 		 */
 		if (sscanf((const char *)yylval->text, "%d.%d.%d.%d", &q1, &q2, &q3, &q4) == 4)
 		    /* safe because result string guaranteed to be shorter */
-		    sprintf((char *)yylval->text, "%d.%d.%d.%d", 
-			    q1 & 0xff, q2 & 0xff, q3 & 0xff, q4 & 0xff);		    
+		    sprintf((char *)yylval->text, "%d.%d.%d.%d",
+			    q1 & 0xff, q2 & 0xff, q3 & 0xff, q4 & 0xff);		
 		yylval->leng = strlen((const char *)yylval->text);
 		ipsave = word_new(NULL, plen + yylval->leng);
 		memcpy(ipsave->text, prefix, plen);
 		memcpy(ipsave->text+plen, yylval->text, yylval->leng+1);
 		word_free(yylval);
-		yylval = ipsave;
+		yylval = word_dup(ipsave);
 		save_class = IPADDR;
 		return (cls);
 	    }
@@ -171,11 +249,11 @@
 	    break;
 	}
 
-	if (DEBUG_TEXT(1)) { 
+	if (DEBUG_TEXT(1)) {
 	    word_puts(yylval, 0, dbgout);
 	    fputc('\n', dbgout);
 	}
-	    
+	
 	/* eat all long words */
 	if (yylval->leng <= MAXTOKENLEN)
 	    done = true;
@@ -220,6 +298,8 @@
 	w_from = word_new((const byte *) "from:", 0);	/* From: */
 	w_rtrn = word_new((const byte *) "rtrn:", 0);	/* Return-Path: */
 	w_subj = word_new((const byte *) "subj:", 0);	/* Subject: */
+
+
     }
 
     return;
@@ -263,4 +354,14 @@
     if (nonblank_line)
 	word_free(nonblank_line);
     nonblank_line = NULL;
+
+    while (sp > 0) {
+	--sp;
+	if (stack_word[sp])
+	    word_free(stack_word[sp]);
+    }
+    if (lastword)
+	word_free(lastword);
+    lastword = NULL;
+
 }