filter evasion

Fri Nov 7 19:26:05 CET 2003

On Fri, 7 Nov 2003 12:57:17 -0500 (EST)
Stefan Mashkevich <mash at mashke.org> wrote:

> On Fri, 7 Nov 2003, David Relson wrote:
> 
> > > Now what do we do about the font=white words?
> > 
> > John,
> > 
> > I don't have a good answer for that, at present.  Since bogofilter
> > is scoring the innards of <font> tags, it has _some_ info on the
> > ruse.
> 
> An all but blind shot -- but, given that we seem to be likely to
> encounter more witty experiments with tags in the future, could it
> make sense to treat them (and possibly attributes) specially? Say,
> <font color=white> would yield something like
> 
> tag:font
> tag:font:white
> 
> The latter should expose the criminal intent clearly enough, without
> resorting to rendering the message with a graphical engine and OCR'ing
> it back :-)
> 
>                                                        Stefan

Bogofilter currently scores the innards of <a>, <img>, and <font> tags.
Attached is a patch that will add a:, img:, and font: prefixes to those
tokens.  Let me know how well it works!
-------------- next part --------------
Index: lexer_v3.l
===================================================================
RCS file: /cvsroot/bogofilter/bogofilter/src/lexer_v3.l,v
retrieving revision 1.109
diff -u -r1.109 lexer_v3.l

--- lexer_v3.l	5 Nov 2003 13:34:02 -0000	1.109
+++ lexer_v3.l	7 Nov 2003 18:23:31 -0000
@@ -281,7 +280,7 @@
 
 <HTML>"<!--"					{ BEGIN SCOMMENT; }
 <HTML>"<!"					{ BEGIN (strict_check ? HTOKEN : LCOMMENT ); }
-<HTML>"<"(a|img|font){WHITESPACE}		{ BEGIN HTOKEN; }
+<HTML>"<"(a|img|font){WHITESPACE}		{ set_tag(yytext); BEGIN HTOKEN; }
 <HTML>"<"					{ BEGIN HDISCARD; }	/* unknown tag */
 
 <HTOKEN>{TOKEN}					{ if (tokenize_html_tags)     return TOKEN; }
Index: token.c
===================================================================
RCS file: /cvsroot/bogofilter/bogofilter/src/token.c,v
retrieving revision 1.69
diff -u -r1.69 token.c
--- token.c	31 Oct 2003 18:28:05 -0000	1.69
+++ token.c	7 Nov 2003 18:23:31 -0000
@@ -39,6 +39,9 @@
 static word_t *w_subj = NULL;	/* Subject:     */
 static word_t *w_recv = NULL;	/* Received:    */
 static word_t *w_head = NULL;	/* Header:      */
+static word_t *w_anchor=NULL;	/* <a...>       */
+static word_t *w_font = NULL;	/* <font...>    */
+static word_t *w_img  = NULL;	/* <img...>     */
 
 /* Global Variables */
 
@@ -262,6 +265,7 @@
     if (w_to == NULL) {
 	w_to   = word_new((const byte *) "to:",   0);	/* To:          */
 	w_from = word_new((const byte *) "from:", 0);	/* From:        */
+	w_font = word_new((const byte *) "font:", 0);	/* <font ...>   */
 	w_rtrn = word_new((const byte *) "rtrn:", 0);	/* Return-Path: */
 	w_subj = word_new((const byte *) "subj:", 0);	/* Subject:     */
 	w_recv = word_new((const byte *) "rcvd:", 0);	/* Received:    */
@@ -291,12 +295,21 @@
     case 't':
 	token_prefix = w_to;		/* To: */
 	break;
+    case 'a':
+	token_prefix = w_anchor;	/* <a...> */
+	break;
     case 'f':
-	token_prefix = w_from;		/* From: */
+	if (tolower(text[2]) == 'r')
+	    token_prefix = w_from;	/* From: */
+	else
+	    token_prefix = w_font;	/* <font...> */
 	break;
     case 'h':
 	token_prefix = w_head;		/* Header: */
 	break;
+    case 'i':
+	token_prefix = w_img;		/* <img...> */
+	break;
     case 'r':			
 	if (tolower(text[2]) == 't')
 	    token_prefix = w_rtrn;	/* Return-Path: */
@@ -306,6 +319,7 @@
     case 's':
 	token_prefix = w_subj;		/* Subject: */
 	break;
+
     default:
 	fprintf(stderr, "%s:%d  invalid tag - '%s'\n", 
 		__FILE__, __LINE__, 
@@ -328,4 +342,7 @@
     WFREE(w_subj);
     WFREE(w_recv);
     WFREE(w_head);
+    WFREE(w_img);
+    WFREE(w_font);
+    WFREE(w_anchor);
 }