several bugs/glitches/typos/questions

David Relson relson at osagesoftware.com
Sat Mar 8 01:41:24 CET 2003


Bill,

Give the attached patch a try.  As it steps through the wordlist, it checks 
for tokens with non-ascii characters.  When one is found, it is deleted and 
a new one (with changed characters) is added.  If two tokens map to the 
same replacement token, their counts will be combined.  The patch also 
includes a simple test script, tests/bogoutil/t.nonascii.replace.

As always, I request that you let me know whether or not the patch fixes 
_your_ problem.

Cheers,

David

At 06:01 PM 3/7/03, W M Brelsford wrote:

>A couple problems seem to remain in 0.11.1.2:
>
>On Thu Mar 06 2003 at 03:00 PM -0500, David Relson wrote:
> > At 01:48 PM 3/6/03, W M Brelsford wrote:
> >
> > >A few things I've noticed lately (using 0.11.1.1):
> > >
> > >1. bogoutil -m doesn't work, e.g. "bogoutil -c3 -m file.db" does
> > >        not change file.db.
>
>Works now with -c3, but "bogoutil -n -m file.db" still does nothing.
>
> > >2. bogoutil -n doesn't combine tokens, e.g. "bogoutil -n -d file.db"
> > >        yields multiple identical lines like "c??067???? 1 20030222".
> > >        (Would they be combined with "-m"?)
>
>Still doesn't combine tokens.
>
>--
>Bill Brelsford
>k2di2 at att.net
-------------- next part --------------
diff -u -r -b -N --exclude-from=diff.excl 01112/src/maint.c cvs/src/maint.c
--- 01112/src/maint.c	2003-03-06 19:35:33.000000000 -0500
+++ cvs/src/maint.c	2003-03-07 19:10:34.000000000 -0500
@@ -1,4 +1,4 @@
-/* $Id: maint.c,v 1.9 2003/03/07 00:35:33 relson Exp $ */
+/* $Id: maint.c,v 1.10 2003/03/08 00:10:34 relson Exp $ */
 
 /*****************************************************************************
 
@@ -23,6 +23,7 @@
 #include "error.h"
 #include "maint.h"
 #include "xmalloc.h"
+#include "xstrdup.h"
 
 YYYYMMDD today;			/* date as YYYYMMDD */
 uint32_t thresh_count = 0;
@@ -98,12 +99,18 @@
     }
 }
 
-void do_replace_nonascii_characters(register byte *str, register size_t len) {
+bool do_replace_nonascii_characters(register byte *str, register size_t len)
+{
+    bool change = false;
     assert(str != NULL);
     while(len--) {
-	if (*str & 0x80) *str = '?';
+	if (*str & 0x80) {
+	    *str = '?';
+	    change = true;
+	}
 	str++;
     }
+    return change;
 }
 
 void maintain_wordlists(void)
@@ -137,8 +144,7 @@
 static int maintain_hook(word_t *key, word_t *data,
 			 void *userdata /*@unused@*/)
 {
-    static word_t *x = NULL;
-    static uint32_t x_size = MAXTOKENLEN+1;
+    word_t w;
     dbv_t val;
 
     if (data->leng > sizeof(val)) {
@@ -146,30 +152,36 @@
 	exit(2);
     }
 
-    memcpy(&val, data->text, data->leng);
+    w.leng = key->leng;
+    w.text = key->text;
 
-    if (replace_nonascii_characters)
-	do_replace_nonascii_characters(key->text, key->leng);
+    memcpy(&val, data->text, data->leng);
 
     if (!keep_count(val.count) || !keep_date(val.date) || !keep_size(key->leng)) {
-	if (x == NULL || key->leng + 1 > x_size) {
-	    word_free(x);
-	    x_size = max(x_size, key->leng + 1);
-	    x = word_new(NULL, x_size);
-	}
-
-	x->leng = key->leng;
-	memcpy(x->text, key->text, key->leng);
-	x->text[key->leng] = '\0';
-
-	db_delete(userdata, x);
 
 	if (DEBUG_DATABASE(0)) {
 	    fputs("deleting ", dbgout);
-	    word_puts(x, 0, dbgout);
+	    word_puts(&w, 0, dbgout);
 	    fputc('\n', dbgout);
 	}
     }
+    else {
+	if (replace_nonascii_characters)
+	{
+	    byte *tmp = xstrdup(key->text);
+	    unsigned long count = val.count;
+	    if (do_replace_nonascii_characters(tmp, key->leng))
+	    {
+		db_delete(userdata, key);
+		w.text = tmp;
+		w.leng = key->leng;
+		count += db_getvalue(userdata, &w);
+		set_date(val.date);
+		db_setvalue(userdata, &w, count);
+	    }
+	    xfree(tmp);
+	}
+    }
     return 0;
 }
 
diff -u -r -b -N --exclude-from=diff.excl 01112/src/maint.h cvs/src/maint.h
--- 01112/src/maint.h	2003-02-03 11:55:14.000000000 -0500
+++ cvs/src/maint.h	2003-03-07 19:10:34.000000000 -0500
@@ -1,4 +1,4 @@
-/* $Id: maint.h,v 1.1 2003/02/03 16:55:14 relson Exp $ */
+/* $Id: maint.h,v 1.2 2003/03/08 00:10:34 relson Exp $ */
 
 #ifndef MAINT_H
 #define MAINT_H
@@ -26,7 +26,7 @@
 bool keep_date(YYYYMMDD dat);
 bool keep_count(uint32_t cnt);
 bool keep_size(size_t siz);
-void do_replace_nonascii_characters(byte *, size_t);
+bool do_replace_nonascii_characters(byte *, size_t);
 
 void set_today(void);
 void set_date(YYYYMMDD date);
diff -urN 01112/src/tests/bogoutil/Makefile.am cvs/src/tests/bogoutil/Makefile.am
--- 01112/src/tests/bogoutil/Makefile.am	2003-02-03 12:01:22.000000000 -0500
+++ cvs/src/tests/bogoutil/Makefile.am	2003-03-07 19:11:12.000000000 -0500
@@ -1,6 +1,6 @@
-# $Id: Makefile.am,v 1.1 2003/02/03 17:01:22 relson Exp $
+# $Id: Makefile.am,v 1.2 2003/03/08 00:11:12 relson Exp $
 
-TESTSCRIPTS = driver.sh t.dump.load
+TESTSCRIPTS = driver.sh t.dump.load t.nonascii.replace
 TESTS=$(TESTSCRIPTS)
 
 TESTS_ENVIRONMENT = RUN_FROM_MAKE=1 srcdir=$(srcdir) $(SHELL) $(VERBOSE)
diff -urN 01112/src/tests/bogoutil/t.nonascii.replace cvs/src/tests/bogoutil/t.nonascii.replace
--- 01112/src/tests/bogoutil/t.nonascii.replace	1969-12-31 19:00:00.000000000 -0500
+++ cvs/src/tests/bogoutil/t.nonascii.replace	2003-03-07 19:07:54.000000000 -0500
@@ -0,0 +1,47 @@
+#! /bin/sh
+
+: ${srcdir=.}
+relpath="`pwd`/../.."
+. ${srcdir}/../t.frame
+
+BOGOUTIL="$VAL ../../bogoutil"
+
+# create a test wordlist with tokens containing non-ascii characters,
+# replace the non-ascii characters with question marks,
+# check the number of tokens in the wordlist
+#
+# test below
+# remember to use ${srcdir}
+echo  	41 A4 BA B5 B5 20 31 0A \
+	41 C1 BA B8 B5 20 32 0A \
+	41 BA C1 B8 B5 20 33 0A \
+  	42 A4 BA B8 B5 B5 20 31 0A \
+	42 C1 BA B8 B5 B5 20 32 0A \
+	42 BA C1 B8 B5 B5 20 33 0A \
+	42 C1 BA B5 B8 B5 20 34 0A \
+| ../dehex >${TMPDIR}/input
+
+WORDLIST="${TMPDIR}/spamlist.db"
+
+rm -f ${WORDLIST}
+
+$BOGOUTIL -l ${WORDLIST} < ${TMPDIR}/input
+$BOGOUTIL -d ${WORDLIST} > ${TMPDIR}/output.1
+$BOGOUTIL -n -m ${WORDLIST}
+$BOGOUTIL -d ${WORDLIST} > ${TMPDIR}/output.2
+
+LEN1=`wc -l ${TMPDIR}/output.1 | awk '{print $1}'`
+LEN2=`wc -l ${TMPDIR}/output.2 | awk '{print $1}'`
+
+TOK1=`head -1 ${TMPDIR}/output.2 | awk '{print $2 }'`
+TOK2=`tail -1 ${TMPDIR}/output.2 | awk '{print $2 }'`
+
+RESULT=`printf "%d.%d.%d.%d" $LEN1 $LEN2 $TOK1 $TOK2`
+
+WANT="7.2.6.10"
+
+if [ "$RESULT" != "$WANT" ] ; then
+    echo want: $WANT, have: $RESULT
+fi
+
+test $RESULT = "$WANT"



More information about the Bogofilter mailing list