singletons
michael at optusnet.com.au
michael at optusnet.com.au
Sun Dec 28 01:16:29 CET 2003
David Relson <relson at osagesoftware.com> writes:
[..]
> Good ideas :-)
Indeed. :)
> I've got a "hint" mechanism that's not yet released. Counts are kept of
> "interesting" occurrences (of which "singleton" could be one). At the
> end of parsing a message, the counts are converted to tokens like
> "hint:singleton:x", where x is in a sequence like
> 1,2,5,10,20,50,100,200,500, etc. If a message had 7 tokens, then the
> "...:1", "...:2", and "...:5" tokens would be generated. At final
> scoring the hints are treated just like other tokens. More work is
> needed to see what's useful.
My current version has a patch similar to this, but I used
a formula like
#define MLOG(a) ((int) (5.0*log( 1.0 + (a) ) )) // 'a' is the raw count.
instead. This provided better granularity at the small end
of the range. (In my testing, the significance
of '2' is very different from that of '3'.)
> Token-pairs are pretty easy. A flag in the get_token() routine and
> remembering the previous token will allow the routine to alternate
> between returning single tokens and tokens pairs.
Something like the patch below, maybe? (There is no option flag in
this patch, though.)
(PS: I hand-edited this patch; it may not apply cleanly, but
you get the idea...)
diff -u -r bogofilter-0.15.3/src/token.c bogofilter-dev/src/token.c
--- bogofilter-0.15.3/src/token.c Wed Sep 10 11:04:15 2003
+++ bogofilter-dev/src/token.c Sun Sep 14 08:13:07 2003
@@ -37,6 +37,8 @@
static word_t *w_from = NULL; /* From: */
static word_t *w_rtrn = NULL; /* Return-Path: */
static word_t *w_subj = NULL; /* Subject: */
+static word_t *w_recv = NULL; /* Received: */
+static word_t *w_header = NULL; /* Received: */
/* Global Variables */
@@ -49,8 +51,81 @@
/* Function Definitions */
+#define STACK_SIZE 20
+static word_t * stack_word[STACK_SIZE];
+static token_t stack_class[STACK_SIZE];
+static int sp = 0;
+
+static word_t * lastword = NULL;
+static token_t lastclass = -1;
+
+static word_t *spaceword = NULL;
+/* Function Definitions */
+token_t get_raw_token(void);
+
token_t get_token(void)
{
+ token_t c, class;
+ word_t * w;
+
+
+ if (sp > 0) {
+ --sp;
+ if (yylval)
+ word_free(yylval);
+ yylval = stack_word[sp];
+ return stack_class[sp];
+ }
+
+again:
+ class = get_raw_token();
+
+ if (!class && lastword) { /* EOF */
+ word_free(lastword);
+ lastword = NULL;
+ return class;
+ }
+
+ if (!class)
+ return class;
+
+#if 0
+
+
+
+
+
+
+
+
+
+
+
+#endif
+
+ if (!spaceword) spaceword = word_new("*", 1); /* spacer. */
+
+ if (lastword) {
+ w = word_concat(lastword, spaceword);
+ stack_word[sp] = word_concat(w, yylval);
+ stack_class[sp] = class;
+ sp++;
+
+ word_free(w);
+ word_free(lastword);
+ lastword = NULL;
+ }
+
+ lastword = word_dup(yylval);
+ lastclass = class;
+
+ return class;
+}
+
+
+
+token_t get_raw_token(void)
+{
token_t cls = NONE;
unsigned char *cp;
bool done = false;
@@ -59,13 +134,17 @@
if ( block_on_subnets && save_class == IPADDR )
{
byte *t = xmemrchr(ipsave->text, '.', ipsave->leng);
- if (t == NULL)
+ if (t == NULL) {
save_class = NONE;
+ word_free(ipsave);
+ }
else
{
*t = '\0';
ipsave->leng = t - ipsave->text;
- yylval = ipsave;
+ if (yylval)
+ word_free(yylval);
+ yylval = word_dup(ipsave);
return save_class;
}
}
@@ -78,12 +157,12 @@
yylval->leng = *lexer->yyleng;
yylval->text = (unsigned char *)(*lexer->yytext);
- if (DEBUG_TEXT(2)) {
+ if (DEBUG_TEXT(2)) {
word_puts(yylval, 0, dbgout);
fputc('\n', dbgout);
}
-
- if (cls == NONE)
+
+ if (cls == NONE) /* End of message */
break;
switch (cls) {
@@ -96,7 +175,6 @@
else /* "spc:invalid_end_of_header" */
yylval = word_dup(nonblank_line);
break;
-
case BOUNDARY: /* don't return boundary tokens to the user */
continue;
@@ -139,23 +217,23 @@
* Trick collected by ESR in real time during John
* Graham-Cummings's talk at Paul Graham's spam conference
* in January 2003... Some spammers know that people are
- * doing recognition on spamhaus IP addresses. They use
- * the fact that HTML clients normally interpret IP addresses
- * by doing a simple accumulate-and-shift algorithm; they
+ * doing recognition on spamhaus IP addresses. They use
+ * the fact that HTML clients normally interpret IP addresses
+ * by doing a simple accumulate-and-shift algorithm; they
* add large random multiples of 256 to the quads to
- * mask their origin. Nuke the high bits to unmask the
+ * mask their origin. Nuke the high bits to unmask the
* address.
*/
if (sscanf((const char *)yylval->text, "%d.%d.%d.%d", &q1, &q2, &q3, &q4) == 4)
/* safe because result string guaranteed to be shorter */
- sprintf((char *)yylval->text, "%d.%d.%d.%d",
- q1 & 0xff, q2 & 0xff, q3 & 0xff, q4 & 0xff);
+ sprintf((char *)yylval->text, "%d.%d.%d.%d",
+ q1 & 0xff, q2 & 0xff, q3 & 0xff, q4 & 0xff);
yylval->leng = strlen((const char *)yylval->text);
ipsave = word_new(NULL, plen + yylval->leng);
memcpy(ipsave->text, prefix, plen);
memcpy(ipsave->text+plen, yylval->text, yylval->leng+1);
word_free(yylval);
- yylval = ipsave;
+ yylval = word_dup(ipsave);
save_class = IPADDR;
return (cls);
}
@@ -171,11 +249,11 @@
break;
}
- if (DEBUG_TEXT(1)) {
+ if (DEBUG_TEXT(1)) {
word_puts(yylval, 0, dbgout);
fputc('\n', dbgout);
}
-
+
/* eat all long words */
if (yylval->leng <= MAXTOKENLEN)
done = true;
@@ -220,6 +298,8 @@
w_from = word_new((const byte *) "from:", 0); /* From: */
w_rtrn = word_new((const byte *) "rtrn:", 0); /* Return-Path: */
w_subj = word_new((const byte *) "subj:", 0); /* Subject: */
+
+
}
return;
@@ -263,4 +354,14 @@
if (nonblank_line)
word_free(nonblank_line);
nonblank_line = NULL;
+
+ while (sp > 0) {
+ --sp;
+ if (stack_word[sp])
+ word_free(stack_word[sp]);
+ }
+ if (lastword)
+ word_free(lastword);
+ lastword = NULL;
+
}
More information about the Bogofilter
mailing list