several bugs/glitches/typos/questions
David Relson
relson at osagesoftware.com
Sat Mar 8 01:41:24 CET 2003
Bill,
Give the attached patch a try. As it steps through the wordlist, it checks
for tokens with non-ascii characters. When one is found, it is deleted and
a new one (with those characters replaced by '?') is added. If two tokens
map to the same replacement token, their counts will be combined. The patch
also includes a simple test script, tests/bogoutil/t.nonascii.replace.
As always, I request that you let me know whether or not the patch fixes
_your_ problem.
Cheers,
David
At 06:01 PM 3/7/03, W M Brelsford wrote:
>A couple problems seem to remain in 0.11.1.2:
>
>On Thu Mar 06 2003 at 03:00 PM -0500, David Relson wrote:
> > At 01:48 PM 3/6/03, W M Brelsford wrote:
> >
> > >A few things I've noticed lately (using 0.11.1.1):
> > >
> > >1. bogoutil -m doesn't work, e.g. "bogoutil -c3 -m file.db" does
> > > not change file.db.
>
>Works now with -c3, but "bogoutil -n -m file.db" still does nothing.
>
> > >2. bogoutil -n doesn't combine tokens, e.g. "bogoutil -n -d file.db"
> > > yields multiple identical lines like "c??067???? 1 20030222".
> > > (Would they be combined with "-m"?)
>
>Still doesn't combine tokens.
>
>--
>Bill Brelsford
>k2di2 at att.net
-------------- next part --------------
diff -u -r -b -N --exclude-from=diff.excl 01112/src/maint.c cvs/src/maint.c
--- 01112/src/maint.c 2003-03-06 19:35:33.000000000 -0500
+++ cvs/src/maint.c 2003-03-07 19:10:34.000000000 -0500
@@ -1,4 +1,4 @@
-/* $Id: maint.c,v 1.9 2003/03/07 00:35:33 relson Exp $ */
+/* $Id: maint.c,v 1.10 2003/03/08 00:10:34 relson Exp $ */
/*****************************************************************************
@@ -23,6 +23,7 @@
#include "error.h"
#include "maint.h"
#include "xmalloc.h"
+#include "xstrdup.h"
YYYYMMDD today; /* date as YYYYMMDD */
uint32_t thresh_count = 0;
@@ -98,12 +99,18 @@
}
}
-void do_replace_nonascii_characters(register byte *str, register size_t len) {
+bool do_replace_nonascii_characters(register byte *str, register size_t len)
+{
+ bool change = false;
assert(str != NULL);
while(len--) {
- if (*str & 0x80) *str = '?';
+ if (*str & 0x80) {
+ *str = '?';
+ change = true;
+ }
str++;
}
+ return change;
}
void maintain_wordlists(void)
@@ -137,8 +144,7 @@
static int maintain_hook(word_t *key, word_t *data,
void *userdata /*@unused@*/)
{
- static word_t *x = NULL;
- static uint32_t x_size = MAXTOKENLEN+1;
+ word_t w;
dbv_t val;
if (data->leng > sizeof(val)) {
@@ -146,30 +152,36 @@
exit(2);
}
- memcpy(&val, data->text, data->leng);
+ w.leng = key->leng;
+ w.text = key->text;
- if (replace_nonascii_characters)
- do_replace_nonascii_characters(key->text, key->leng);
+ memcpy(&val, data->text, data->leng);
if (!keep_count(val.count) || !keep_date(val.date) || !keep_size(key->leng)) {
- if (x == NULL || key->leng + 1 > x_size) {
- word_free(x);
- x_size = max(x_size, key->leng + 1);
- x = word_new(NULL, x_size);
- }
-
- x->leng = key->leng;
- memcpy(x->text, key->text, key->leng);
- x->text[key->leng] = '\0';
-
- db_delete(userdata, x);
if (DEBUG_DATABASE(0)) {
fputs("deleting ", dbgout);
- word_puts(x, 0, dbgout);
+ word_puts(&w, 0, dbgout);
fputc('\n', dbgout);
}
}
+ else {
+ if (replace_nonascii_characters)
+ {
+ byte *tmp = xstrdup(key->text);
+ unsigned long count = val.count;
+ if (do_replace_nonascii_characters(tmp, key->leng))
+ {
+ db_delete(userdata, key);
+ w.text = tmp;
+ w.leng = key->leng;
+ count += db_getvalue(userdata, &w);
+ set_date(val.date);
+ db_setvalue(userdata, &w, count);
+ }
+ xfree(tmp);
+ }
+ }
return 0;
}
diff -u -r -b -N --exclude-from=diff.excl 01112/src/maint.h cvs/src/maint.h
--- 01112/src/maint.h 2003-02-03 11:55:14.000000000 -0500
+++ cvs/src/maint.h 2003-03-07 19:10:34.000000000 -0500
@@ -1,4 +1,4 @@
-/* $Id: maint.h,v 1.1 2003/02/03 16:55:14 relson Exp $ */
+/* $Id: maint.h,v 1.2 2003/03/08 00:10:34 relson Exp $ */
#ifndef MAINT_H
#define MAINT_H
@@ -26,7 +26,7 @@
bool keep_date(YYYYMMDD dat);
bool keep_count(uint32_t cnt);
bool keep_size(size_t siz);
-void do_replace_nonascii_characters(byte *, size_t);
+bool do_replace_nonascii_characters(byte *, size_t);
void set_today(void);
void set_date(YYYYMMDD date);
diff -urN 01112/src/tests/bogoutil/Makefile.am cvs/src/tests/bogoutil/Makefile.am
--- 01112/src/tests/bogoutil/Makefile.am 2003-02-03 12:01:22.000000000 -0500
+++ cvs/src/tests/bogoutil/Makefile.am 2003-03-07 19:11:12.000000000 -0500
@@ -1,6 +1,6 @@
-# $Id: Makefile.am,v 1.1 2003/02/03 17:01:22 relson Exp $
+# $Id: Makefile.am,v 1.2 2003/03/08 00:11:12 relson Exp $
-TESTSCRIPTS = driver.sh t.dump.load
+TESTSCRIPTS = driver.sh t.dump.load t.nonascii.replace
TESTS=$(TESTSCRIPTS)
TESTS_ENVIRONMENT = RUN_FROM_MAKE=1 srcdir=$(srcdir) $(SHELL) $(VERBOSE)
diff -urN 01112/src/tests/bogoutil/t.nonascii.replace cvs/src/tests/bogoutil/t.nonascii.replace
--- 01112/src/tests/bogoutil/t.nonascii.replace 1969-12-31 19:00:00.000000000 -0500
+++ cvs/src/tests/bogoutil/t.nonascii.replace 2003-03-07 19:07:54.000000000 -0500
@@ -0,0 +1,47 @@
+#! /bin/sh
+
+: ${srcdir=.}
+relpath="`pwd`/../.."
+. ${srcdir}/../t.frame
+
+BOGOUTIL="$VAL ../../bogoutil"
+
+# create a test wordlist with tokens containing non-ascii characters,
+# replace the non-ascii characters with question marks,
+# check the number of tokens in the wordlist
+#
+# test below
+# remember to use ${srcdir}
+echo 41 A4 BA B5 B5 20 31 0A \
+ 41 C1 BA B8 B5 20 32 0A \
+ 41 BA C1 B8 B5 20 33 0A \
+ 42 A4 BA B8 B5 B5 20 31 0A \
+ 42 C1 BA B8 B5 B5 20 32 0A \
+ 42 BA C1 B8 B5 B5 20 33 0A \
+ 42 C1 BA B5 B8 B5 20 34 0A \
+| ../dehex >${TMPDIR}/input
+
+WORDLIST="${TMPDIR}/spamlist.db"
+
+rm -f ${WORDLIST}
+
+$BOGOUTIL -l ${WORDLIST} < ${TMPDIR}/input
+$BOGOUTIL -d ${WORDLIST} > ${TMPDIR}/output.1
+$BOGOUTIL -n -m ${WORDLIST}
+$BOGOUTIL -d ${WORDLIST} > ${TMPDIR}/output.2
+
+LEN1=`wc -l ${TMPDIR}/output.1 | awk '{print $1}'`
+LEN2=`wc -l ${TMPDIR}/output.2 | awk '{print $1}'`
+
+TOK1=`head -1 ${TMPDIR}/output.2 | awk '{print $2 }'`
+TOK2=`tail -1 ${TMPDIR}/output.2 | awk '{print $2 }'`
+
+RESULT=`printf "%d.%d.%d.%d" $LEN1 $LEN2 $TOK1 $TOK2`
+
+WANT="7.2.6.10"
+
+if [ "$RESULT" != "$WANT" ] ; then
+ echo want: $WANT, have: $RESULT
+fi
+
+test $RESULT = "$WANT"
More information about the Bogofilter
mailing list