lexer size [was: Much simplified lexer]
Boris 'pi' Piwinger
3.14 at logic.univie.ac.at
Thu Nov 13 17:15:33 CET 2003
David Relson wrote:
> Right. This change:
>
> - <INITIAL>(ESMTP|SMTP)+/[ \t\n]+id\ {ID}
> + <INITIAL>(ESMTP|SMTP)+
>
> has the following size effects:
> 41547 8 60 41615 a28f lexer_v3.o
> 43405 8 65640 109053 1a9fd lexer_v3.o
Really strange, how little changes can have so huge effects.
>> I don't understand what this is good for. In the original
>> expression the / seems to be wrong, maybe the space behind
>> "id" should also be any kind of whitespace. But why
>> completely remove it?
>
> Actually Matthias made this change. The old pattern allows a multiline
> ESMTP|SMTP line, the new line does not. In his test, the simpler
> pattern works fine.
Well, then.
> Also, the "/" is a lexer operator that causes it to return the matching
> text before the slash. The text after the slash will be reparsed
> (later).
OK, so in effect we return SMTP or ESMTP.
>> Anyhow, wouldn't the following be nicer:
>> <INITIAL>(E?SMTP)+
>
> Looks reasonable.
>
>> And why the +? I only see it in the form "with ESMTP id
>> PAA16337" etc., no repeated SMTP or ESMTP.
I still don't understand the +.
>> So I would have assumed that version:
>> <INITIAL>E?SMTP{WHITESPACE}+{WHITESPACE}id{ID}
>
> Appears to be unnecessary. Without it, "make check" still succeeds so
> the effect of the change is minor.
I did my personal test.
<INITIAL>E?SMTP
in my version of lexer does not change any score. And it
gets the size down.
The same in your version of lexer passes "make check".
pi
PS: Attached is my new version of lexer.
-------------- next part --------------
/* $Id: lexer_v3.l,v 1.2 2003/11/13 23:43:39 relson Exp $ */
%{
/*
* NAME
* lexer_v3.l -- bogofilter's lexical analyzer for message headers
*
* 01/01/2003 - split out of lexer.l
*
*/
/*
* Our lexical analysis is different from Paul Graham's rules:
*
* We throw away headers that are readily identifiable as dates.
* We throw away all digit strings that don't look like IP address parts.
* We thow away lines beginning with <tab>id<space> -- mailer UDs.
* We throw away *all* tokens of length 1 or 2.
*
* These are optimizations to keep the token lists from bloating.
* The big win is recognizing machine-generated unique IDs that
* we'll never see again and shouldn't
*
* We don't treat dot between two alphanumerics as a separator,
* because we want to keep domain names and IP addresses together as
* recognizable units.
*
* Having done the above, there isn't much need to recognize URLs.
* If a URL is a spam indicator, very likely any other URL from the
* same site is as well, so the hostname part should be an adequate
* statistical trigger.
*
* LEXED_TOKENS, which are found in "msg-count" files need a special pattern
* because they can be:
* 1 - normal bogofilter tokens
* 2 - url:xxx and subj: tokens
* 3 - mime boundaries
*/
/* 12 May 2003
* Added Paul Graham's latest ideas on parsing.
* (From http://www.paulgraham.com/better.html)
*
* 1. Case is preserved.
*
* 3. Periods and commas are constituents if they occur between two
* digits. This lets me get ip addresses and prices intact.
*
* 5. Tokens that occur within the To, From, Subject, and Return-Path
* lines, or within urls, get marked accordingly.
* For example. "foo" in the Subject line becomes "subj:foo".
*/
/* DR 08/29/2003:
**
** With flex-2.5.31 and '%option never-interactive noreject', file
** msg.dr.0118.base64 (in tests/bogofilter/inputs/split.d) parses
** incorrectly because line 24 isn't base64 decoded.
*/
#include "common.h"
#include <ctype.h>
#include <stdlib.h>
#include "buff.h"
#include "charset.h"
#include "lexer.h"
#include "mime.h" /* for mime_*() */
#include "msgcounts.h"
#include "textblock.h"
#include "token.h"
#include "xmalloc.h"
#define YY_DECL token_t yylex(void)
YY_DECL; /* declare function */
#define YY_INPUT(buf,result,max_size) result = yyinput((byte *)buf, max_size)
#define YY_EXIT_FAILURE EX_ERROR
#undef stderr
#define stderr dbgout /* for debug & -D options */
static word_t yyt;
static int lineno;
/* Function Prototypes */
static word_t *yy_text(void);
static void html_char(void);
static void html_reorder(void);
static void url_char(void);
static void skip_to(char chr);
char yy_get_state(void);
void yy_set_state_initial(void);
/* Function Definitions */
static word_t *yy_text(void)
{
yyt.text = (byte *)yytext;
yyt.leng = yyleng;
return &yyt;
}
%}
%option warn
%option nodebug debug
%option align caseless 8bit
%option never-interactive
%option noreject noyywrap
%option prefix="lexer_v3_"
UINT8 ([01]?[0-9]?[0-9]|2([0-4][0-9]|5[0-5]))
IPADDR {UINT8}\.{UINT8}\.{UINT8}\.{UINT8}
BCHARSNOSPC [[:alnum:]'()+_,-./:=?#]
BCHARS [[:alnum:]'()+_,-./:=?# ]
MIME_BOUNDARY {BCHARS}*{BCHARSNOSPC}
ID <?[[:alnum:]-]*>?
CHARSET [[:alnum:]-]+
MTYPE [ \t]*[[:alnum:]/-]*
NUM [[:digit:]]+
NUM_NUM \ {NUM}\ {NUM}
MSG_COUNT ^\".MSG_COUNT\"
TOKENBORDER [^[:blank:][:cntrl:]<>;&%@|/\\{}^"*,[\]?=():#+._!'`~-]
TOKENMID [^[:blank:][:cntrl:]<>;&%@|/\\{}^"*,[\]?=():#+]
BOGOLEX_TOKEN [^[:blank:][:cntrl:]<>;&%@|/\\{}^"*,[\]]+
TOKEN {TOKENBORDER}({TOKENMID}*{TOKENBORDER})?
/* RFC2047.2
encoded-word = "=?" charset "?" encoding "?" encoded-text "?="
charset = token ; see section 3
encoding = token ; see section 4
token = 1*<Any CHAR except SPACE, CTLs, and especials>
especials = "(" / ")" / "<" / ">" / "@" / "," / ";" / ":" / "
<"> / "/" / "[" / "]" / "?" / "." / "="
encoded-text = 1*<Any printable ASCII character other than "?"
or SPACE>
; (but see "Use of encoded-words in message
; headers", section 5)
*/
/* 09/01/03
Using "[^?]" in the pattern and validating the charset in 'C'
reduces executable size by approx 290k.
new: ENCODED_WORD =\?{CHARSET}\?[bq]\?[^?]*\?=
old: ENCODED_WORD =\?{CHARSET}\?(b\?{BASE64}|q\?{QP})\?=
BASE64 [[:alnum:]/+=]+
QP [!->@-~]+
*/
WHITESPACE [ \t\n]
NOTWHITESPACE [^ \t\n]
ENCODED_WORD =\?{CHARSET}\?[bq]\?[^?]*\?=
ENCODED_TOKEN ({TOKENBORDER}{TOKENMID}*)?({ENCODED_WORD}{WHITESPACE}+)*{ENCODED_WORD}
HTML_ENCODING "&#"x?[[:xdigit:]]+";"
URL_ENCODING "%"[[:xdigit:]][[:xdigit:]]
HTMLTOKEN "<"[^>]*">"
/*
* Generally, there are some html tags that cause an "eyebreak" and some
* that do not. For example, the "P" tag or the "BR" tag cause a break,
* and can be interpreted in place, while, the B (bold) tag does not.
* No close tags seem to cause a break.
* Comments do not. This is an attempt to make an exhaustive list of
* tags that cause an "eyebreak". When the exit tag also causes a break,
* we include the /?. I believe this to be a complete list of tags that
* can cause a formatting break.
*/
HBREAK p|br|li|h[1-6]|hr|title|table|center|dd|dt|iframe|img|input|select|td|textarea|th|\/?(div|blockquote|pre|dir|dl|fieldset|legend|form|menu|ol|ul)
BREAKHTML "<"({HBREAK}({WHITESPACE}[^>]*|""))">"
VERP {TOKEN}-{NUM}-{TOKEN}={TOKEN}@{TOKEN}
%s TEXT HTML BOGO_LEX
%s HTOKEN HDISCARD SCOMMENT LCOMMENT HSCRIPT
%%
<INITIAL,BOGO_LEX>{MSG_COUNT}{NUM_NUM} { if (lineno == 0) {
BEGIN BOGO_LEX;
set_msg_counts(strchr(yytext, ' ') + 1);
}
return MSG_COUNT_LINE;
}
<BOGO_LEX>^\"{BOGOLEX_TOKEN}\"{NUM_NUM} { return BOGO_LEX_LINE; }
<BOGO_LEX>\n { lineno += 1; }
<INITIAL>{ENCODED_TOKEN} { word_t *w = yy_text();
size_t size = text_decode(w);
while (size-- > 0)
unput(w->text[size]);
}
<INITIAL>^(To|From|Return-Path|Subject): { set_tag(yytext); }
<INITIAL>^Received: { set_tag(yytext); return TOKEN; }
<INITIAL>^Content-(Transfer-Encoding|Type|Disposition):{MTYPE} { mime_content(yy_text()); skip_to(':'); return TOKEN; }
<INITIAL>^MIME-Version:.* { mime_version(yy_text()); return HEADKEY; }
<INITIAL>^(Delivery-)?Date:.* { return HEADKEY; }
<INITIAL>^(Resent-)?Message-ID:.* { return HEADKEY; }
<INITIAL>^(In-Reply-To|References):.* { return HEADKEY; }
<INITIAL>boundary=[ ]*\"?{MIME_BOUNDARY}\"? { mime_boundary_set(yy_text()); }
<INITIAL>charset=\"?{CHARSET}\"? { got_charset(yytext); skip_to('='); return TOKEN; }
<INITIAL>(file)?name=\"? /* ignore */
<INITIAL>E?SMTP { if (header_line_markup) { return TOKEN; } }
<INITIAL>[:blank:]*id\ {ID} /* ignore */
<INITIAL>\n[ \t] { lineno += 1; }
<INITIAL>\n\n { enum mimetype type = get_content_type();
have_body = true;
msg_header = false;
clr_tag();
switch (type) {
case MIME_TEXT_HTML: BEGIN HTML; break;
case MIME_MESSAGE: yy_set_state_initial(); break;
default: BEGIN TEXT;
}
return EOH;
}
<INITIAL>\n { set_tag("Header"); lineno += 1; }
<INITIAL><<EOF>> { return NONE; }
<INITIAL>{VERP} { skip_to('='); return VERP; }
^--{MIME_BOUNDARY}(--)?$ { if (got_mime_boundary(yy_text())) {
yy_set_state_initial();
return BOUNDARY;
} else {
yyless(2);
}
}
/* This has to match just as much or more than the below rules, so as to be the
controlling rule. */
<HTML>{TOKEN}({HTMLTOKEN}*{BREAKHTML}+{HTMLTOKEN}*.?|({HTMLTOKEN})+{WHITESPACE}) {
char *chr = memchr(yytext, '<', yyleng); /* find start of html tag */
size_t len = chr - yytext;
yyless(len);
return TOKEN;
}
<HTML>{TOKEN}({HTMLTOKEN})+/{NOTWHITESPACE} { html_reorder(); }
<HTML>"<!--" { BEGIN SCOMMENT; }
<HTML>"<!" { BEGIN (strict_check ? HTOKEN : LCOMMENT ); }
<HTML>"<"(a|img|font){WHITESPACE} { BEGIN HTOKEN; }
<HTML>"<" { BEGIN HDISCARD; } /* unknown tag */
<HTOKEN>{TOKEN} { if (tokenize_html_tags) return TOKEN; }
<HSCRIPT>{TOKEN} { if (tokenize_html_script) return TOKEN; }
<HDISCARD,LCOMMENT,SCOMMENT>{TOKEN} { /* discard innards of html tokens and comments */ }
<HTOKEN,HDISCARD,LCOMMENT>">" { BEGIN HTML; } /* end of tag, loose comment; return to normal html processing */
<SCOMMENT>"-->" { BEGIN HTML; } /* end of strict comment; return to normal html processing */
{IPADDR} { return IPADDR;}
{TOKEN} { return TOKEN;}
<HTML>{TOKEN}?{HTML_ENCODING} { html_char(); } /* process escaped chars, eg 'e' is 'a' */
<HTOKEN>{TOKEN}?{URL_ENCODING}+ { url_char(); } /* process escaped chars, eg '%61' is 'a' */
. /* ignore character */
\n { lineno += 1; clr_tag(); }
<<EOF>> { return NONE; }
%%
void lexer_v3_init(FILE *fp)
{
lineno = 0;
have_body = false;
yy_set_state_initial();
yyrestart(fp);
}
static void skip_to(char chr)
{
size_t len = strchr(yytext, chr) - yytext;
yyless(len);
}
static void html_reorder(void)
{
char *chr = memchr(yytext, '<', yyleng); /* find start of html tag */
size_t len = chr - yytext;
char *tmp;
char *yycopy = xmalloc(yyleng + 1); /* +1 for NUL byte below */
memcpy(yycopy, yytext+len, yyleng-len); /* copy tag to start of buffer */
memcpy(yycopy+yyleng-len, yytext, len); /* copy leading text to end of buffer */
yycopy[yyleng] = '\0'; /* for debugging */
for(tmp = yycopy+yyleng-1 ; tmp >= yycopy; tmp--)
yyunput(*tmp, yytext);
xfree(yycopy);
}
static int xtoi(char *in, size_t len)
{
int val = 0;
while (isxdigit((byte) *in) && (len-- > 0)) {
char c = *in++;
val <<= 4;
val += isdigit((unsigned char)c)
? (c - '0')
: (tolower((unsigned char)c) - 'a' + 10);
}
return val;
}
static void html_char(void)
{
char *txt = strstr(yytext, "&#"); /* find decodable char */
size_t len = txt - yytext;
int val;
char *yycopy = NULL;
if (len != 0) {
yycopy = xmalloc(yyleng + 1); /* +1 for NUL byte below */
memcpy(yycopy, yytext, yyleng); /* copy tag to start of buffer */
yycopy[yyleng] = '\0'; /* for debugging */
}
txt += 2;
val = isdigit((byte) *txt) ? atoi(txt) : xtoi(txt+1, 4);
if ((val < 256) && isprint(val)) { /* use it if printable */
yyunput(val, yytext);
yyleng = len; /* adjust len to pre-char count */
}
else {
if (yycopy)
yycopy[yyleng-1] = ' '; /* prevents parsing loop */
}
if (yycopy != NULL) {
while (yyleng-- > 0)
yyunput(yycopy[yyleng], yytext);
xfree(yycopy);
}
}
static void url_char(void)
{
char *src, *dst;
src = dst = yytext;
while (src < yytext + yyleng) {
char c = *src++;
if (c == '%') {
c = xtoi(src, 2);
src += 2;
}
*dst++ = c;
}
while (dst > yytext) {
yyunput(*--dst, yytext);
}
}
char yy_get_state()
{
switch (YYSTATE) {
case INITIAL: return 'i';
case TEXT: return 't';
case HTML:
case HTOKEN: return 'h';
case SCOMMENT: return 's';
case LCOMMENT: return 'l';
default: return 'o';
}
}
void yy_set_state_initial(void)
{
BEGIN INITIAL;
msg_header = true;
set_tag("Header");
if (DEBUG_LEXER(1))
fprintf(dbgout, "BEGIN INITIAL\n");
#ifdef FLEX_DEBUG
yy_flex_debug = BOGOTEST('L');
#endif
}
/*
* The following sets edit modes for GNU EMACS
* Local Variables:
* mode:c
* indent-tabs-mode:t
* End:
*/
More information about the bogofilter
mailing list