Including html-tag contents may be unnecessary
David Relson
relson at osagesoftware.com
Mon May 12 01:10:36 CEST 2003
Greg,
What's the URL of the Paul Graham quote on parsing?
Looking at the list of suggestions, I see them as dividing into 3 types:
1 - case folding
2 - changed token definitions (tagged header fields, money, exclamation point)
3 - html changing (process a, font, and img tags)
The attached patch implements #1 and #2 as options "-Hf" (for folding) and
"-Hh" (for header).
When I created the "-H?" switches, they were all oriented toward parsing
html, hence the "H". Viewed as a group of parsing switches, the
case_folding and header_tagging options belong with them. Also, it becomes
apparent that a better name is "P" to designate parsing options. Unless
there's vehement objection, I will change from "H" to "P" for the next
revision.
Yet to do: processing of Paul Graham's 3 html tags ...
Peace,
David
-------------- next part --------------
Index: bogoconfig.c
===================================================================
RCS file: /cvsroot/bogofilter/bogofilter/src/bogoconfig.c,v
retrieving revision 1.54
diff -u -r1.54 bogoconfig.c
--- bogoconfig.c 11 May 2003 15:44:28 -0000 1.54
+++ bogoconfig.c 11 May 2003 23:04:03 -0000
@@ -156,6 +156,7 @@
{ "tag_header_lines", CP_BOOLEAN, { (void *) &tag_header_lines } },
{ "strict_check", CP_BOOLEAN, { (void *) &strict_check } },
+ { "fold_case", CP_BOOLEAN, { (void *) &fold_case } },
{ "tokenize_html_tags", CP_BOOLEAN, { (void *) &tokenize_html_tags } },
{ "tokenize_html_script", CP_BOOLEAN, { (void *) &tokenize_html_script } }, /* Not yet in use */
{ "tokenize_html_comments", CP_BOOLEAN, { (void *) &tokenize_html_comments } },/* Not yet in use */
@@ -524,14 +525,21 @@
{
switch (*s)
{
- case 't': tokenize_html_tags ^= true;
+ case 't': tokenize_html_tags ^= true; /* -Ht */
break;
- case 's': tokenize_html_script ^= true; /* Not yet in use */
+ case 's': tokenize_html_script ^= true; /* -Hs - not yet in use */
break;
- case 'C': strict_check ^= true;
+ case 'C': strict_check ^= true; /* -HC */
/*@fallthrough@*/
- case 'c': tokenize_html_comments ^= true; /* Not yet in use */
+ case 'c': tokenize_html_comments ^= true; /* -Hc - not yet in use */
break;
+ case 'h': tag_header_lines ^= true; /* -Hh */
+ break;
+ case 'f': fold_case ^= true; /* -Hf */
+ break;
+ default:
+ fprintf(stderr, "Unknown parsing option -H%c.\n", *s);
+ exit(2);
}
}
break;
Index: bogolexer.c
===================================================================
RCS file: /cvsroot/bogofilter/bogofilter/src/bogolexer.c,v
retrieving revision 1.21
diff -u -r1.21 bogolexer.c
--- bogolexer.c 2 May 2003 23:03:05 -0000 1.21
+++ bogolexer.c 11 May 2003 23:04:03 -0000
@@ -136,14 +136,21 @@
{
switch (*s)
{
- case 't': tokenize_html_tags ^= true;
+ case 't': tokenize_html_tags ^= true; /* -Ht */
break;
- case 's': tokenize_html_script ^= true; /* Not yet in use */
+ case 's': tokenize_html_script ^= true; /* -Hs - not yet in use */
break;
- case 'C': strict_check ^= true;
+ case 'C': strict_check ^= true; /* -HC */
/*@fallthrough@*/
- case 'c': tokenize_html_comments ^= true; /* Not yet in use */
+ case 'c': tokenize_html_comments ^= true; /* -Hc - not yet in use */
break;
+ case 'h': tag_header_lines ^= true; /* -Hh */
+ break;
+ case 'f': fold_case ^= true; /* -Hf */
+ break;
+ default:
+ fprintf(stderr, "Unknown parsing option -H%c.\n", *s);
+ exit(2);
}
}
break;
Index: globals.c
===================================================================
RCS file: /cvsroot/bogofilter/bogofilter/src/globals.c,v
retrieving revision 1.16
diff -u -r1.16 globals.c
--- globals.c 2 May 2003 23:03:04 -0000 1.16
+++ globals.c 11 May 2003 23:04:04 -0000
@@ -42,16 +42,17 @@
double min_dev;
double spam_cutoff;
double thresh_stats;
-bool tag_header_lines = false; /* true */
const char *update_dir;
/*@observer@*/
const char *stats_prefix;
/* for lexer_v3.l */
-bool tokenize_html_tags = false;
-bool tokenize_html_script = false; /* Not yet in use */
-bool tokenize_html_comments = false; /* Not yet in use */
+bool fold_case = true; /* -Hf */
+bool tag_header_lines = false; /* -Hh */
+bool tokenize_html_tags = false; /* -Ht */
+bool tokenize_html_script = false; /* -Hs - not yet in use */
+bool tokenize_html_comments = false; /* -Hc - Not yet in use */
/* dual definition options */
char *directory; /* '-d' */
Index: globals.h
===================================================================
RCS file: /cvsroot/bogofilter/bogofilter/src/globals.h,v
retrieving revision 1.18
diff -u -r1.18 globals.h
--- globals.h 2 May 2003 23:03:04 -0000 1.18
+++ globals.h 11 May 2003 23:04:04 -0000
@@ -43,12 +43,13 @@
extern int abort_on_error;
extern bool stats_in_header;
-extern bool tag_header_lines;
/* for lexer_v3.l */
-extern bool tokenize_html_tags;
-extern bool tokenize_html_script;
-extern bool tokenize_html_comments;
+extern bool fold_case; /* -Hf */
+extern bool tag_header_lines; /* -Hh */
+extern bool tokenize_html_tags; /* -Ht */
+extern bool tokenize_html_script; /* -Hs */
+extern bool tokenize_html_comments; /* -Hc */
extern int db_cachesize;
Index: lexer_v3.l
===================================================================
RCS file: /cvsroot/bogofilter/bogofilter/src/lexer_v3.l,v
retrieving revision 1.13
diff -u -r1.13 lexer_v3.l
--- lexer_v3.l 10 May 2003 14:50:23 -0000 1.13
+++ lexer_v3.l 11 May 2003 23:04:04 -0000
@@ -13,11 +13,8 @@
* Our lexical analysis is different from Paul Graham's rules:
*
* We throw away headers that are readily identifiable as dates.
- * We throw away text lines that look like BASE64 encoding.
* We throw away all digit strings that don't look like IP address parts.
* We throw away lines beginning with <tab>id<space> -- mailer UDs.
- * We throw away various bits of MIME cruft (things that look like
- * declarations and boundaries for multipart declarations).
* We throw away *all* tokens of length 1 or 2.
*
* These are optimizations to keep the token lists from bloating.
@@ -33,8 +30,6 @@
* same site is as well, so the hostname part should be an adequate
* statistical trigger.
*
- * The list of HTML keywords and attributes to be ignored is from the 4.0 DTD.
- *
* LEXED_TOKENS, which are found in "msg-count" files need a special pattern
* because they can be:
* 1 - normal bogofilter tokens
@@ -42,6 +37,24 @@
* 3 - mime boundaries
*/
+/* Paul Graham's latest ideas (5/11/03):
+
+ 1. Case is preserved.
+
+ 2. Exclamation points are constituent characters.
+
+ 3. Periods and commas are constituents if they occur between two
+ digits. This lets me get ip addresses and prices intact.
+
+ 4. A price range like $20-25 yields two tokens, $20 and $25.
+
+ 5. Tokens that occur within the To, From, Subject, and Return-Path
+lines, or within urls, get marked accordingly. E.g. `foo' in the
+Subject line becomes `Subject*foo'. (The asterisk could be any
+character you don't allow as a constituent.)
+
+*/
+
#include <ctype.h>
#include <stdio.h>
#include <stdlib.h>
@@ -105,13 +118,18 @@
TOKENMID_CA_075 [^][:blank:]<>;=():&%$#@!+|/\\{}^\"\?\*,[:cntrl:][]+
TOKENMID_NJS [^\][:blank:]<>;=():&%$#@!+|/\\{}^\"\?\*,[:cntrl:]\[]+
TOKENMID_NJS_DR [^[:blank:]<>;=():&%$#@!+|/\\{}^\"\?\*,[:cntrl:]\[\]]+
+TOKENMID_XXX_PG [^[:blank:]<>;=():&%$#@+|/\\{}^\"\?\*,[:cntrl:]\[\]]+
TOKENFRONT [^[:blank:][:cntrl:][:digit:][:punct:]]
TOKENMID [^[:blank:]<>;=():&%$#@!+|/\\{}^\"\?\*,[:cntrl:]\[\]]+
BOGOLEX_TOKEN [^[:blank:]<>; &%$ @! |/\\{}^\" \*,[:cntrl:]\[\]]+
TOKENBACK [^[:blank:][:punct:][:cntrl:]]
-TOKEN {TOKENFRONT}{TOKENMID_NJS_DR}{TOKENBACK}
+DOLLARS [0-9]+
+CENTS [0-9]+
+DOLLARS_AND_CENTS {DOLLARS}\.{CENTS}
+
+TOKEN {TOKENFRONT}{TOKENMID_XXX_PG}{TOKENBACK}
%s TEXT HTML BOGO_LEX
%s HTOKEN SCOMMENT LCOMMENT HSCRIPT
@@ -136,7 +154,10 @@
<INITIAL>^Content-Transfer-Encoding:{MTYPE} { mime_encoding(yy_text()); yyless(strlen("Content-Transfer-Encoding:")); return(TOKEN); }
<INITIAL>^Content-Type:{MTYPE};? { mime_type(yy_text()); yyless(strlen("Content-Type:")); return TOKEN; }
<INITIAL>^Content-Disposition:{MTYPE} { mime_disposition(yy_text()); yyless(strlen("Content-Disposition:")); return TOKEN; }
-<INITIAL>^Subject/:\.* { set_tag("subj:"); return TOKEN; }
+<INITIAL>^To/:\.* { set_tag("to:"); }
+<INITIAL>^From/:\.* { set_tag("from:"); }
+<INITIAL>^Return-Path/:\.* { set_tag("rtrn:"); }
+<INITIAL>^Subject/:\.* { set_tag("subj:"); }
<INITIAL>^Date:.*|Delivery-Date:.* ;
<INITIAL>^Message-ID:.* ;
<INITIAL>^\tid\ {ID} ;
@@ -174,18 +195,18 @@
<HTML>"<" { BEGIN HTOKEN; }
<HTOKEN>">" { BEGIN HTML; } /* end of tag; return to normal html processing */
-<HTOKEN>{TOKEN} { if (tokenize_html_tags) return TOKEN; }
+<HTOKEN>{TOKEN}/"/" { if (tokenize_html_tags) return TOKEN; }
<SCOMMENT,LCOMMENT>{TOKEN} { if (tokenize_html_comments) return TOKEN; }
<HSCRIPT>{TOKEN} { if (tokenize_html_script) return TOKEN; }
{IPADDR} { return IPADDR;}
{TOKEN} { return TOKEN;}
-
-. ;
+${DOLLARS_AND_CENTS} { return TOKEN;}
+${DOLLARS} { return TOKEN;}
+. /* ignore character */ ;
\n { got_newline();
lineno += 1;
}
-
%%
void lexer_v3_init(FILE *fp)
Index: token.c
===================================================================
RCS file: /cvsroot/bogofilter/bogofilter/src/token.c,v
retrieving revision 1.16
diff -u -r1.16 token.c
--- token.c 10 May 2003 14:50:24 -0000 1.16
+++ token.c 11 May 2003 23:04:04 -0000
@@ -69,6 +69,11 @@
yylval->leng = lexer_v3_leng;
yylval->text = (byte *)lexer_v3_text;
+ if (DEBUG_TEXT(1)) {
+ word_puts(yylval, 0, dbgout);
+ fputc('\n', dbgout);
+ }
+
if (class <= 0)
break;
@@ -164,8 +169,9 @@
}
/* Need separate loop so lexer can see "From", "Date", etc */
- for (cp = yylval->text; cp < yylval->text+yylval->leng; cp += 1)
- *cp = casefold_table[*cp];
+ if (fold_case)
+ for (cp = yylval->text; cp < yylval->text+yylval->leng; cp += 1)
+ *cp = casefold_table[*cp];
return(class);
}
More information about the Bogofilter
mailing list