register restructuring preview

Matthias Andree matthias.andree at gmx.de
Mon Nov 25 20:24:01 CET 2002


This is the preview patch I used to measure the 60% reduction in the time
needed to register large mailboxes.

 .cvsignore     |  1 
 Makefile.am    | 44 +++++++++++++-------------
 bogofilter.c   | 14 +++++++-
 bogowordfreq.c | 40 ++++++++++++++++++++++++
 collect.c      | 65 +++++++++++++++++++++++++++++++++++++++
 collect.h      | 16 +++++++++
 globals.h      |  2 +
 register.c     | 95 +++++++++++++++------------------------------------------
 register.h     |  1 
 9 files changed, 184 insertions(+), 94 deletions(-)

Index: .cvsignore
===================================================================
RCS file: /cvsroot/bogofilter/bogofilter/.cvsignore,v
retrieving revision 1.20
diff -u -r1.20 .cvsignore
--- .cvsignore	16 Nov 2002 15:40:07 -0000	1.20
+++ .cvsignore	25 Nov 2002 19:22:15 -0000
@@ -35,3 +35,4 @@
 version.h
 wordhash
 directories.h
+bogowordfreq
Index: Makefile.am
===================================================================
RCS file: /cvsroot/bogofilter/bogofilter/Makefile.am,v
retrieving revision 1.78
diff -u -r1.78 Makefile.am
--- Makefile.am	24 Nov 2002 18:28:22 -0000	1.78
+++ Makefile.am	25 Nov 2002 19:22:15 -0000
@@ -25,7 +25,7 @@
 # what to build
 bin_PROGRAMS = bogofilter bogoutil bogolexer
 bin_SCRIPTS = bogoupgrade
-check_PROGRAMS = debugtest configtest wordhash find_home.test
+check_PROGRAMS = debugtest configtest wordhash find_home.test bogowordfreq
 man_MANS = bogofilter.1 bogoutil.1 bogoupgrade.1 bogolexer.1
 sysconf_DATA = bogofilter.cf.example
 
@@ -33,39 +33,42 @@
 LDADD += libbogofilter.a
 
 # what to build that from
-libbogofilter_a_SOURCES=xmalloc.h xcalloc.c xmem_error.c xrealloc.c \
-	xmalloc.c xstrdup.h xstrdup.c globals.h debug.h \
-	debug.c find_home.h find_home.c find_home_user.c \
-	find_home_tildeexpand.c
-
-MYCOMMON=  bogoconfig.h system.h version.h
+libbogofilter_a_SOURCES= \
+	xmalloc.h xcalloc.c xmem_error.c xrealloc.c \
+	    xmalloc.c xstrdup.h xstrdup.c \
+	debug.h debug.c \
+	find_home.h find_home.c find_home_user.c \
+	    find_home_tildeexpand.c \
+	collect.h collect.c \
+	datastore.h datastore_db.h datastore_db.c \
+	lexer.h lexer.l \
+	register.h register.c \
+	wordhash.h wordhash.c wordlists.h wordlists.c \
+	bogoconfig.h globals.h system.h version.h
 
 BUILT_SOURCES = version.h directories.h
 
 CLEANFILES= directories.h
 
-bogofilter_SOURCES = bogofilter.c bogofilter.h main.c lexer.l lexer.h \
-		     datastore.h datastore_db.h datastore_db.c \
-		     config.c register.c register.h \
+bogofilter_SOURCES = bogofilter.c bogofilter.h main.c \
+		     config.c \
 		     $(GRAHAM_SRC) $(ROBINSON_SRC) $(FISHER_SRC) \
 		     rstats.h rstats.c \
-                     wordhash.h wordhash.c wordlists.h wordlists.c \
-		     $(MYCOMMON) common.h method.h
+		     common.h method.h
+
+bogowordfreq_SOURCES = bogowordfreq.c
 
-bogolexer_SOURCES = bogolexer.c lexer.l lexer.h $(MYCOMMON)
+bogolexer_SOURCES = bogolexer.c
 
 debugtest_SOURCES = debug.c debug.h debug.main.c
 
-bogoutil_SOURCES = bogoutil.c \
-		   datastore.h datastore_db.h datastore_db.c wordlists.h wordlists.c \
-		   $(MYCOMMON)
+bogoutil_SOURCES = bogoutil.c
 
-configtest_SOURCES = configtest.c config.c wordlists.c \
-		     datastore_db.c $(MYCOMMON)
+configtest_SOURCES = configtest.c config.c
 
-wordhash_SOURCES = wordhash.c wordhash.h wordhash.main.c $(MYCOMMON)
+wordhash_SOURCES = wordhash.main.c
 
-find_home_test_SOURCES = find_home.test.c $(MYCOMMON)
+find_home_test_SOURCES = find_home.test.c
 
 # what to distribute
 extradistdirs = doc contrib
@@ -79,7 +82,6 @@
 	     README.cvs README.freebsd README.hp-ux \
 	     README.dcdflib README.Robinson \
 	     $(extradistdirs)
-#
 #
 VERSION_FROM=main.c bogofilter.c bogoutil.c lexer.l
 version.h:	version.sh $(VERSION_FROM)
Index: bogofilter.c
===================================================================
RCS file: /cvsroot/bogofilter/bogofilter/bogofilter.c,v
retrieving revision 1.88
diff -u -r1.88 bogofilter.c
--- bogofilter.c	21 Nov 2002 15:42:52 -0000	1.88
+++ bogofilter.c	25 Nov 2002 19:22:15 -0000
@@ -37,6 +37,7 @@
 #include "bogofilter.h"
 #include "method.h"
 #include "datastore.h"
+#include "collect.h"
 #include "register.h"
 
 void initialize_constants()
@@ -55,7 +56,8 @@
     rc_t	status;
     double 	spamicity;
     wordhash_t  *wordhash;
-    int		wordcount, msgcount;
+    long	wordcount, msgcount = 0;
+    bool	cont;
 
     good_list.active = spam_list.active = true;
 
@@ -67,7 +69,15 @@
     method->initialize();
 
     /* tokenize input text and save words in a wordhash. */
-    wordhash = collect_words(&msgcount, &wordcount);
+    do {
+	collect_words(&wordhash, &wordcount, &cont);
+	++msgcount;
+    } while(cont);
+
+    if (msgcount > 1) {
+	fprintf(stderr, "%s: must get only one message to calculate spamicity!\n", progname);
+	exit(2);
+    }
 
     spamicity = method->compute_spamicity(wordhash, NULL);
 
Index: bogowordfreq.c
===================================================================
RCS file: bogowordfreq.c
diff -N bogowordfreq.c
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ bogowordfreq.c	25 Nov 2002 19:22:15 -0000
@@ -0,0 +1,40 @@
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+
+#include "wordhash.h"
+
+#include "common.h"
+#include "system.h"
+#include "collect.h"
+
+const char *spam_header_name = "X-Bogosity:"; /* unused */
+int passthrough = 0; /* unused */
+
+int max_repeats = 1;
+
+static void print_wordlist (wordhash_t *h)
+{
+    hashnode_t *n;
+
+    for(n=wordhash_first(h);n;n = wordhash_next(h)) {
+	printf("%ld %s\n", ((wordprop_t *)(n->buf))->freq, n->key);
+    }
+}
+
+int main(int argc, char **argv) {
+    wordhash_t *h;
+    long count;
+    bool b;
+
+    if (argc >= 2) max_repeats=atoi(argv[1]);
+    
+    do {
+	collect_words(&h, &count, &b);
+	printf("%ld tokens:\n", count);
+	print_wordlist(h);
+	printf("\n");
+	wordhash_free(h);
+    } while(b);
+    return 0;
+}
Index: collect.c
===================================================================
RCS file: collect.c
diff -N collect.c
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ collect.c	25 Nov 2002 19:22:15 -0000
@@ -0,0 +1,65 @@
+#include <stdio.h>
+#include <stdlib.h>
+
+#include <config.h>
+
+#include "common.h"
+#include "bogofilter.h"
+#include "register.h"
+#include "collect.h"
+#include "wordhash.h"
+
+
+#include "lexer.h"
+
+void wordprop_init(void *vwordprop){
+	wordprop_t *wordprop = vwordprop;
+
+	wordprop->freq = 0;
+}
+
+static bool from_seen = false;
+
+void collect_reset(void)
+{
+    from_seen = false;
+}
+
+void collect_words(/*@out@*/ wordhash_t **wh,
+       /*@out@*/ /*@null@*/ long *word_count, /*@out@*/ bool *cont)
+    /* tokenize input text and save words in wordhash_t hash table 
+     * Sets word_count to the appropriate values
+     * if the pointer is non-NULL.
+     * wh and cont must not be NULL
+     * cont is set if further data is available.
+     */
+{
+    long w_count = 0;
+
+    wordprop_t *w;
+    wordhash_t *h = wordhash_init();
+
+    for (;;){
+	token_t token_type = get_token();
+
+	if (token_type != FROM && token_type != 0){
+	    w = wordhash_insert(h, yylval, sizeof(wordprop_t), &wordprop_init);
+	    if (w->freq < max_repeats) w->freq++;
+	    w_count++;
+	} else {
+	    if (token_type == FROM && from_seen == false) {
+		from_seen = true;
+		continue;
+	    }
+
+	    /* Want to process EOF, *then* drop out */
+	    *cont = (token_type != 0);
+	    break;
+	}
+    }
+
+    if (word_count)
+	*word_count = w_count;
+
+    *wh = h;
+}
Index: collect.h
===================================================================
RCS file: collect.h
diff -N collect.h
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ collect.h	25 Nov 2002 19:22:15 -0000
@@ -0,0 +1,16 @@
+#ifndef COLLECT_H
+#define COLLECT_H
+
+/* Represents the secondary data for a word key */
+typedef struct {
+  int freq;
+} wordprop_t;
+
+extern void wordprop_init(void *vwordprop);
+
+extern void collect_words(/*@out@*/ wordhash_t **wh,
+       /*@out@*/ /*@null@*/ long *word_count, /*@out@*/ bool *cont);
+
+extern void collect_reset(void);
+
+#endif
Index: globals.h
===================================================================
RCS file: /cvsroot/bogofilter/bogofilter/globals.h,v
retrieving revision 1.17
diff -u -r1.17 globals.h
--- globals.h	22 Nov 2002 12:14:41 -0000	1.17
+++ globals.h	25 Nov 2002 19:22:15 -0000
@@ -8,6 +8,8 @@
 #include <float.h> /* has DBL_EPSILON */
 #define EPS		(100.0 * DBL_EPSILON) /* equality cutoff */
 
+#include "system.h" /* has bool */
+
 extern int nonspam_exits_zero;	/* '-e' */
 extern bool fisher;		/* '-f' */
 extern bool force;		/* '-F' */
Index: register.c
===================================================================
RCS file: /cvsroot/bogofilter/bogofilter/register.c,v
retrieving revision 1.5
diff -u -r1.5 register.c
--- register.c	25 Nov 2002 18:00:53 -0000	1.5
+++ register.c	25 Nov 2002 19:22:15 -0000
@@ -8,8 +8,8 @@
 
 #include "bogofilter.h"
 #include "datastore.h"
-#include "lexer.h"
 #include "register.h"
+#include "collect.h"
 #include "wordhash.h"
 
 #define PLURAL(count) ((count == 1) ? "" : "s")
@@ -18,70 +18,6 @@
 
 extern char msg_register[];
 
-/* Represents the secondary data for a word key */
-typedef struct {
-  int freq;
-  int msg_freq;
-} wordprop_t;
-
-static void wordprop_init(void *vwordprop){
-	wordprop_t *wordprop = vwordprop;
-
-	wordprop->freq = 0;
-	wordprop->msg_freq = 0;
-}
-
-void *collect_words(/*@out@*/ int *message_count,
-		    /*@out@*/ int *word_count)
-    /* tokenize input text and save words in wordhash_t hash table 
-     * returns: the wordhash_t hash table.
-     * Sets messageg_count and word_count to the appropriate values
-     * if their pointers are non-NULL.  */
-{
-  int w_count = 0;
-  int msg_count = 0;
- 
-  wordprop_t *w;
-  hashnode_t *n;
-  wordhash_t *h = wordhash_init();
-     
-  for (;;){
-    token_t token_type = get_token();
-  
-    if (token_type != FROM && token_type != 0){
-      w = wordhash_insert(h, yylval, sizeof(wordprop_t), &wordprop_init);
-      if (w->msg_freq < max_repeats) w->msg_freq++;
-      w_count++;
-    }
-    else {
-      /* End of message. Update message counts. */
-      if (token_type == FROM || (token_type == 0 && msg_count == 0))
-        msg_count++;
-  
-      /* Incremenent word frequencies, capping each message's
-       * contribution at MAX_REPEATS in order to be able to cap
-       * frequencies. */
-      for(n = wordhash_first(h); n != NULL; n = wordhash_next(h)){
-        w = n->buf;
-        w->freq += w->msg_freq;
-        w->msg_freq = 0;
-      }
-  
-      /* Want to process EOF, *then* drop out */
-      if (token_type == 0)
-        break;
-    }
-  }
- 
-  if (word_count)
-    *word_count = w_count;
-
-  if (message_count)
-    *message_count = msg_count;
- 
-  return(h);
-}
-
 
 void register_words(run_t _run_type, wordhash_t *h,
 		    int msgcount, int wordcount)
@@ -180,13 +116,32 @@
   db_lock_release_list(word_lists);
 }
 
+static void add_hash(wordhash_t *dest, wordhash_t *src) {
+    wordprop_t *d;
+    hashnode_t *s;
+
+    for (s = wordhash_first(src); s; s = wordhash_next(src)) {
+	d = wordhash_insert(dest, s->key, sizeof(wordprop_t), &wordprop_init);
+	d -> freq += ((wordprop_t *)(s -> buf)) ->freq;
+    }
+}
+
 void register_messages(run_t _run_type)
 {
-  wordhash_t *h;
-  int	wordcount, msgcount;
+  wordhash_t *h, *words = wordhash_init();
+  long	wordcount, msgcount = 0;
+  bool cont;
+
   initialize_constants();
-  h = collect_words(&msgcount, &wordcount);
-  register_words(_run_type, h, msgcount, wordcount);
-  wordhash_free(h);
+
+  do {
+      collect_words(&h, &wordcount, &cont);
+      add_hash(words, h);
+      wordhash_free(h);
+      msgcount++;
+  } while(cont);
+
+  register_words(_run_type, words, msgcount, wordcount);
+  wordhash_free(words);
 }
 
Index: register.h
===================================================================
RCS file: /cvsroot/bogofilter/bogofilter/register.h,v
retrieving revision 1.1
diff -u -r1.1 register.h
--- register.h	14 Nov 2002 22:43:32 -0000	1.1
+++ register.h	25 Nov 2002 19:22:15 -0000
@@ -6,7 +6,6 @@
 
 #include <wordhash.h>
 
-extern void *collect_words(int *message_count, int *word_count);
 extern void register_messages(run_t _run_type);
 extern void register_words(run_t _run_type, wordhash_t *h,
 			   int msgcount, int wordcount);

-- 
Matthias Andree



More information about the bogofilter-dev mailing list