diff -u -r --exclude-from=diff.excl 111/src/lexer_v3.l cvs/src/lexer_v3.l
--- 111/src/lexer_v3.l	2006-07-03 23:47:37.000000000 -0400
+++ cvs/src/lexer_v3.l	2006-11-26 11:40:21.000000000 -0500
@@ -1,4 +1,4 @@
-/* $Id: lexer_v3.l,v 1.167 2006/07/04 03:47:37 relson Exp $ */
+/* $Id: lexer_v3.l,v 1.170 2006/11/26 16:38:07 relson Exp $ */
 
 %{
 /*
@@ -15,7 +15,6 @@
  * We throw away headers that are readily identifiable as dates.
  * We throw away all digit strings that don't look like IP address parts.
  * We thow away lines beginning with id -- mailer UDs.
- * We throw away *all* tokens of length 1 or 2.
  *
  * These are optimizations to keep the token lists from bloating.
  * The big win is recognizing machine-generated unique IDs that
@@ -137,7 +136,7 @@
 BCHARS		[[:alnum:]()+_,-./:=?#\' ]
 MIME_BOUNDARY	{BCHARS}*{BCHARSNOSPC}
 
-ID		?
+ID		?
 CHARSET		[[:alnum:]-]+
 VERPID		[[:alnum:]#-]+[[:digit:]]+[[:alnum:]#-]+
 MTYPE		[[:blank:]]*[[:alnum:]/-]*
@@ -147,16 +146,11 @@
 MSG_COUNT	^\".MSG_COUNT\"
 
 TOKENFRONT	[^[:blank:][:cntrl:][:digit:][:punct:]]
-TOKENMID	[^[:blank:][:cntrl:]<>;=():&%$#@+|/\\{}^\"?*,\[\]]+
+TOKENMID	[^[:blank:][:cntrl:]<>;=():&%$#@+|/\\{}^\"?*,\[\]]*
 BOGOLEX_TOKEN	[^[:blank:][:cntrl:]<>; &% @ |/\\{}^\" *,\[\]]+
 TOKENBACK	[^[:blank:][:cntrl:]<>;=():&%$#@+|/\\{}^\"?*,\[\]._~\'\`\-]
 
-TOKEN		{TOKENFRONT}{TOKENMID}{TOKENBACK}
-SHORT_TOKEN	{TOKENFRONT}{TOKENBACK}?
-
-T1		[[:alpha:]]
-T12		[[:alpha:]][[:alnum:]]?
-TOKEN_12	({TOKEN}|{T12}|{T1})
+TOKEN		{TOKENFRONT}({TOKENMID}{TOKENBACK})?
 /*
  RFC2047.2
  encoded-word = "=?" charset "?" encoding "?" encoded-text "?="
@@ -252,7 +246,7 @@
 charset=\"?{CHARSET}\"?		{ got_charset(yytext); skip_to('='); return TOKEN; }
 (file)?name=\"?			/* ignore */
 
-\n?[[:blank:]]id\ {ID}			{ return QUEUE_ID; }
+\n?[[:blank:]]id{WHITESPACE}+{ID}	{ return QUEUE_ID; }
 
 \n[[:blank:]]			{ lineno += 1; }
 \n\n				{ enum mimetype type = get_content_type();
@@ -295,14 +289,14 @@
 			  return TOKEN; }
 
-{TOKEN_12}({HTMLTOKEN})+/{NOTWHITESPACE}	{ html_reorder(); }
+{TOKEN}({HTMLTOKEN})+/{NOTWHITESPACE}		{ html_reorder(); }
 
 "
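
Net effect of the token-pattern change: with {TOKENMID} quantified by '*' instead of
'+' and the ({TOKENMID}{TOKENBACK}) group made optional, a lone {TOKENFRONT} character
is now a complete {TOKEN}, so the SHORT_TOKEN/T1/T12/TOKEN_12 definitions and the
"throw away *all* tokens of length 1 or 2" behavior become unnecessary.  Below is a
minimal stand-alone sketch (demo only, not part of the patch) that approximates the
new pattern with a POSIX extended regex.  The bracket expressions are simplified --
the excluded '[' and ']' are dropped for readability -- and the demo anchors whole
strings, whereas flex does longest-prefix matching.

    /* Approximation of the new TOKEN pattern using POSIX extended
     * regexes; demo only, not taken from lexer_v3.l. */
    #include <regex.h>
    #include <stdio.h>

    int main(void)
    {
        /* {TOKENFRONT}({TOKENMID}{TOKENBACK})? -- the group is optional,
         * so one TOKENFRONT character alone now matches. */
        const char *token_re =
            "^[^[:blank:][:cntrl:][:digit:][:punct:]]"                 /* TOKENFRONT */
            "([^[:blank:][:cntrl:]<>;=():&%$#@+|/\\{}^\"?*,]*"         /* TOKENMID   */
            "[^[:blank:][:cntrl:]<>;=():&%$#@+|/\\{}^\"?*,._~'`-])?$"; /* TOKENBACK  */
        const char *samples[] = { "a", "ab", "a1", "x.", "foo" };
        regex_t re;
        size_t i;

        if (regcomp(&re, token_re, REG_EXTENDED | REG_NOSUB) != 0)
            return 1;
        for (i = 0; i < sizeof samples / sizeof samples[0]; i++)
            printf("%-4s %s\n", samples[i],
                   regexec(&re, samples[i], 0, NULL, 0) == 0 ? "TOKEN" : "no match");
        regfree(&re);
        return 0;
    }

The old {TOKEN} required front, mid, and back parts, i.e. at least three characters;
under the new definition "a" and "ab" above match, while "x." still fails because '.'
is excluded from {TOKENBACK}.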