Including html-tag contents may be unnecessary
David Relson
relson at osagesoftware.com
Mon May 12 01:10:36 CEST 2003
Greg,
What's the URL of the Paul Graham quote on parsing?
Looking at the list of suggestions, I see them as dividing into 3 types:
1 - case folding
2 - changed token definitions (tagged header fields, money, exclamation point)
3 - html changing (process a, font, and img tags)
The attached patch implements #1 and #2 as options "-Hf" (for folding) and
"-Hh" (for header).
When I created the "-H?" switches, they were all oriented toward parsing
html, hence the "H". Viewed as a group of parsing switches, the
case_folding and header_tagging options belong with them. Also, it becomes
apparent that a better name is "P" to designate parsing options. Unless
there's vehement objection, I will change from "H" to "P" for the next
revision.
Yet to do: processing of Paul Graham's 3 html tags ...
Peace,
David
-------------- next part --------------
Index: bogoconfig.c
===================================================================
RCS file: /cvsroot/bogofilter/bogofilter/src/bogoconfig.c,v
retrieving revision 1.54
diff -u -r1.54 bogoconfig.c
--- bogoconfig.c 11 May 2003 15:44:28 -0000 1.54
+++ bogoconfig.c 11 May 2003 23:04:03 -0000
@@ -156,6 +156,7 @@
{ "tag_header_lines", CP_BOOLEAN, { (void *) &tag_header_lines } },
{ "strict_check", CP_BOOLEAN, { (void *) &strict_check } },
+ { "fold_case", CP_BOOLEAN, { (void *) &fold_case } },
{ "tokenize_html_tags", CP_BOOLEAN, { (void *) &tokenize_html_tags } },
{ "tokenize_html_script", CP_BOOLEAN, { (void *) &tokenize_html_script } }, /* Not yet in use */
{ "tokenize_html_comments", CP_BOOLEAN, { (void *) &tokenize_html_comments } },/* Not yet in use */
@@ -524,14 +525,21 @@
{
switch (*s)
{
- case 't': tokenize_html_tags ^= true;
+ case 't': tokenize_html_tags ^= true; /* -Ht */
break;
- case 's': tokenize_html_script ^= true; /* Not yet in use */
+ case 's': tokenize_html_script ^= true; /* -Hs - not yet in use */
break;
- case 'C': strict_check ^= true;
+ case 'C': strict_check ^= true; /* -HC */
/*@fallthrough@*/
- case 'c': tokenize_html_comments ^= true; /* Not yet in use */
+ case 'c': tokenize_html_comments ^= true; /* -Hc - not yet in use */
break;
+ case 'h': tag_header_lines ^= true; /* -Hh */
+ break;
+ case 'f': fold_case ^= true; /* -Hf */
+ break;
+ default:
+ fprintf(stderr, "Unknown parsing option -H%c.\n", *s);
+ exit(2);
}
}
break;
Index: bogolexer.c
===================================================================
RCS file: /cvsroot/bogofilter/bogofilter/src/bogolexer.c,v
retrieving revision 1.21
diff -u -r1.21 bogolexer.c
--- bogolexer.c 2 May 2003 23:03:05 -0000 1.21
+++ bogolexer.c 11 May 2003 23:04:03 -0000
@@ -136,14 +136,21 @@
{
switch (*s)
{
- case 't': tokenize_html_tags ^= true;
+ case 't': tokenize_html_tags ^= true; /* -Ht */
break;
- case 's': tokenize_html_script ^= true; /* Not yet in use */
+ case 's': tokenize_html_script ^= true; /* -Hs - not yet in use */
break;
- case 'C': strict_check ^= true;
+ case 'C': strict_check ^= true; /* -HC */
/*@fallthrough@*/
- case 'c': tokenize_html_comments ^= true; /* Not yet in use */
+ case 'c': tokenize_html_comments ^= true; /* -Hc - not yet in use */
break;
+ case 'h': tag_header_lines ^= true; /* -Hh */
+ break;
+ case 'f': fold_case ^= true; /* -Hf */
+ break;
+ default:
+ fprintf(stderr, "Unknown parsing option -H%c.\n", *s);
+ exit(2);
}
}
break;
Index: globals.c
===================================================================
RCS file: /cvsroot/bogofilter/bogofilter/src/globals.c,v
retrieving revision 1.16
diff -u -r1.16 globals.c
--- globals.c 2 May 2003 23:03:04 -0000 1.16
+++ globals.c 11 May 2003 23:04:04 -0000
@@ -42,16 +42,17 @@
double min_dev;
double spam_cutoff;
double thresh_stats;
-bool tag_header_lines = false; /* true */
const char *update_dir;
/*@observer@*/
const char *stats_prefix;
/* for lexer_v3.l */
-bool tokenize_html_tags = false;
-bool tokenize_html_script = false; /* Not yet in use */
-bool tokenize_html_comments = false; /* Not yet in use */
+bool fold_case = true; /* -Hf */
+bool tag_header_lines = false; /* -Hh */
+bool tokenize_html_tags = false; /* -Ht */
+bool tokenize_html_script = false; /* -Hs - not yet in use */
+bool tokenize_html_comments = false; /* -Hc - Not yet in use */
/* dual definition options */
char *directory; /* '-d' */
Index: globals.h
===================================================================
RCS file: /cvsroot/bogofilter/bogofilter/src/globals.h,v
retrieving revision 1.18
diff -u -r1.18 globals.h
--- globals.h 2 May 2003 23:03:04 -0000 1.18
+++ globals.h 11 May 2003 23:04:04 -0000
@@ -43,12 +43,13 @@
extern int abort_on_error;
extern bool stats_in_header;
-extern bool tag_header_lines;
/* for lexer_v3.l */
-extern bool tokenize_html_tags;
-extern bool tokenize_html_script;
-extern bool tokenize_html_comments;
+extern bool fold_case; /* -Hf */
+extern bool tag_header_lines; /* -Hh */
+extern bool tokenize_html_tags; /* -Ht */
+extern bool tokenize_html_script; /* -Hs */
+extern bool tokenize_html_comments; /* -Hc */
extern int db_cachesize;
Index: lexer_v3.l
===================================================================
RCS file: /cvsroot/bogofilter/bogofilter/src/lexer_v3.l,v
retrieving revision 1.13
diff -u -r1.13 lexer_v3.l
--- lexer_v3.l 10 May 2003 14:50:23 -0000 1.13
+++ lexer_v3.l 11 May 2003 23:04:04 -0000
@@ -13,11 +13,8 @@
* Our lexical analysis is different from Paul Graham's rules:
*
* We throw away headers that are readily identifiable as dates.
- * We throw away text lines that look like BASE64 encoding.
* We throw away all digit strings that don't look like IP address parts.
* We throw away lines beginning with <tab>id<space> -- mailer UDs.
- * We throw away various bits of MIME cruft (things that look like
- * declarations and boundaries for multipart declarations).
* We throw away *all* tokens of length 1 or 2.
*
* These are optimizations to keep the token lists from bloating.
@@ -33,8 +30,6 @@
* same site is as well, so the hostname part should be an adequate
* statistical trigger.
*
- * The list of HTML keywords and attributes to be ignored is from the 4.0 DTD.
- *
* LEXED_TOKENS, which are found in "msg-count" files need a special pattern
* because they can be:
* 1 - normal bogofilter tokens
@@ -42,6 +37,24 @@
* 3 - mime boundaries
*/
+/* Paul Graham's latest ideas (5/11/03):
+
+ 1. Case is preserved.
+
+ 2. Exclamation points are constituent characters.
+
+ 3. Periods and commas are constituents if they occur between two
+ digits. This lets me get ip addresses and prices intact.
+
+ 4. A price range like $20-25 yields two tokens, $20 and $25.
+
+ 5. Tokens that occur within the To, From, Subject, and Return-Path
+lines, or within urls, get marked accordingly. E.g. `foo' in the
+Subject line becomes `Subject*foo'. (The asterisk could be any
+character you don't allow as a constituent.)
+
+*/
+
#include <ctype.h>
#include <stdio.h>
#include <stdlib.h>
@@ -105,13 +118,18 @@
TOKENMID_CA_075 [^][:blank:]<>;=():&%$#@!+|/\\{}^\"\?\*,[:cntrl:][]+
TOKENMID_NJS [^\][:blank:]<>;=():&%$#@!+|/\\{}^\"\?\*,[:cntrl:]\[]+
TOKENMID_NJS_DR [^[:blank:]<>;=():&%$#@!+|/\\{}^\"\?\*,[:cntrl:]\[\]]+
+TOKENMID_XXX_PG [^[:blank:]<>;=():&%$#@+|/\\{}^\"\?\*,[:cntrl:]\[\]]+
TOKENFRONT [^[:blank:][:cntrl:][:digit:][:punct:]]
TOKENMID [^[:blank:]<>;=():&%$#@!+|/\\{}^\"\?\*,[:cntrl:]\[\]]+
BOGOLEX_TOKEN [^[:blank:]<>; &%$ @! |/\\{}^\" \*,[:cntrl:]\[\]]+
TOKENBACK [^[:blank:][:punct:][:cntrl:]]
-TOKEN {TOKENFRONT}{TOKENMID_NJS_DR}{TOKENBACK}
+DOLLARS [0-9]+
+CENTS [0-9]+
+DOLLARS_AND_CENTS {DOLLARS}\.{CENTS}
+
+TOKEN {TOKENFRONT}{TOKENMID_XXX_PG}{TOKENBACK}
%s TEXT HTML BOGO_LEX
%s HTOKEN SCOMMENT LCOMMENT HSCRIPT
@@ -136,7 +154,10 @@
<INITIAL>^Content-Transfer-Encoding:{MTYPE} { mime_encoding(yy_text()); yyless(strlen("Content-Transfer-Encoding:")); return(TOKEN); }
<INITIAL>^Content-Type:{MTYPE};? { mime_type(yy_text()); yyless(strlen("Content-Type:")); return TOKEN; }
<INITIAL>^Content-Disposition:{MTYPE} { mime_disposition(yy_text()); yyless(strlen("Content-Disposition:")); return TOKEN; }
-<INITIAL>^Subject/:\.* { set_tag("subj:"); return TOKEN; }
+<INITIAL>^To/:\.* { set_tag("to:"); }
+<INITIAL>^From/:\.* { set_tag("from:"); }
+<INITIAL>^Return-Path/:\.* { set_tag("rtrn:"); }
+<INITIAL>^Subject/:\.* { set_tag("subj:"); }
<INITIAL>^Date:.*|Delivery-Date:.* ;
<INITIAL>^Message-ID:.* ;
<INITIAL>^\tid\ {ID} ;
@@ -174,18 +195,18 @@
<HTML>"<" { BEGIN HTOKEN; }
<HTOKEN>">" { BEGIN HTML; } /* end of tag; return to normal html processing */
-<HTOKEN>{TOKEN} { if (tokenize_html_tags) return TOKEN; }
+<HTOKEN>{TOKEN}/"/" { if (tokenize_html_tags) return TOKEN; }
<SCOMMENT,LCOMMENT>{TOKEN} { if (tokenize_html_comments) return TOKEN; }
<HSCRIPT>{TOKEN} { if (tokenize_html_script) return TOKEN; }
{IPADDR} { return IPADDR;}
{TOKEN} { return TOKEN;}
-
-. ;
+${DOLLARS_AND_CENTS} { return TOKEN;}
+${DOLLARS} { return TOKEN;}
+. /* ignore character */ ;
\n { got_newline();
lineno += 1;
}
-
%%
void lexer_v3_init(FILE *fp)
Index: token.c
===================================================================
RCS file: /cvsroot/bogofilter/bogofilter/src/token.c,v
retrieving revision 1.16
diff -u -r1.16 token.c
--- token.c 10 May 2003 14:50:24 -0000 1.16
+++ token.c 11 May 2003 23:04:04 -0000
@@ -69,6 +69,11 @@
yylval->leng = lexer_v3_leng;
yylval->text = (byte *)lexer_v3_text;
+ if (DEBUG_TEXT(1)) {
+ word_puts(yylval, 0, dbgout);
+ fputc('\n', dbgout);
+ }
+
if (class <= 0)
break;
@@ -164,8 +169,9 @@
}
/* Need separate loop so lexer can see "From", "Date", etc */
- for (cp = yylval->text; cp < yylval->text+yylval->leng; cp += 1)
- *cp = casefold_table[*cp];
+ if (fold_case)
+ for (cp = yylval->text; cp < yylval->text+yylval->leng; cp += 1)
+ *cp = casefold_table[*cp];
return(class);
}
More information about the Bogofilter
mailing list