register restructuring preview
Matthias Andree
matthias.andree at gmx.de
Mon Nov 25 20:24:01 CET 2002
This is the preview patch I used to arrive at the figure of a 60% time
reduction when registering large mailboxes.
.cvsignore | 1
Makefile.am | 44 +++++++++++++-------------
bogofilter.c | 14 +++++++-
bogowordfreq.c | 40 ++++++++++++++++++++++++
collect.c | 65 +++++++++++++++++++++++++++++++++++++++
collect.h | 16 +++++++++
globals.h | 2 +
register.c | 95 +++++++++++++++------------------------------------------
register.h | 1
9 files changed, 184 insertions(+), 94 deletions(-)
Index: .cvsignore
===================================================================
RCS file: /cvsroot/bogofilter/bogofilter/.cvsignore,v
retrieving revision 1.20
diff -u -r1.20 .cvsignore
--- .cvsignore 16 Nov 2002 15:40:07 -0000 1.20
+++ .cvsignore 25 Nov 2002 19:22:15 -0000
@@ -35,3 +35,4 @@
version.h
wordhash
directories.h
+bogowordfreq
Index: Makefile.am
===================================================================
RCS file: /cvsroot/bogofilter/bogofilter/Makefile.am,v
retrieving revision 1.78
diff -u -r1.78 Makefile.am
--- Makefile.am 24 Nov 2002 18:28:22 -0000 1.78
+++ Makefile.am 25 Nov 2002 19:22:15 -0000
@@ -25,7 +25,7 @@
# what to build
bin_PROGRAMS = bogofilter bogoutil bogolexer
bin_SCRIPTS = bogoupgrade
-check_PROGRAMS = debugtest configtest wordhash find_home.test
+check_PROGRAMS = debugtest configtest wordhash find_home.test bogowordfreq
man_MANS = bogofilter.1 bogoutil.1 bogoupgrade.1 bogolexer.1
sysconf_DATA = bogofilter.cf.example
@@ -33,39 +33,42 @@
LDADD += libbogofilter.a
# what to build that from
-libbogofilter_a_SOURCES=xmalloc.h xcalloc.c xmem_error.c xrealloc.c \
- xmalloc.c xstrdup.h xstrdup.c globals.h debug.h \
- debug.c find_home.h find_home.c find_home_user.c \
- find_home_tildeexpand.c
-
-MYCOMMON= bogoconfig.h system.h version.h
+libbogofilter_a_SOURCES= \
+ xmalloc.h xcalloc.c xmem_error.c xrealloc.c \
+ xmalloc.c xstrdup.h xstrdup.c \
+ debug.h debug.c \
+ find_home.h find_home.c find_home_user.c \
+ find_home_tildeexpand.c \
+ collect.h collect.c \
+ datastore.h datastore_db.h datastore_db.c \
+ lexer.h lexer.l \
+ register.h register.c \
+ wordhash.h wordhash.c wordlists.h wordlists.c \
+ bogoconfig.h globals.h system.h version.h
BUILT_SOURCES = version.h directories.h
CLEANFILES= directories.h
-bogofilter_SOURCES = bogofilter.c bogofilter.h main.c lexer.l lexer.h \
- datastore.h datastore_db.h datastore_db.c \
- config.c register.c register.h \
+bogofilter_SOURCES = bogofilter.c bogofilter.h main.c \
+ config.c \
$(GRAHAM_SRC) $(ROBINSON_SRC) $(FISHER_SRC) \
rstats.h rstats.c \
- wordhash.h wordhash.c wordlists.h wordlists.c \
- $(MYCOMMON) common.h method.h
+ common.h method.h
+
+bogowordfreq_SOURCES = bogowordfreq.c
-bogolexer_SOURCES = bogolexer.c lexer.l lexer.h $(MYCOMMON)
+bogolexer_SOURCES = bogolexer.c
debugtest_SOURCES = debug.c debug.h debug.main.c
-bogoutil_SOURCES = bogoutil.c \
- datastore.h datastore_db.h datastore_db.c wordlists.h wordlists.c \
- $(MYCOMMON)
+bogoutil_SOURCES = bogoutil.c
-configtest_SOURCES = configtest.c config.c wordlists.c \
- datastore_db.c $(MYCOMMON)
+configtest_SOURCES = configtest.c config.c
-wordhash_SOURCES = wordhash.c wordhash.h wordhash.main.c $(MYCOMMON)
+wordhash_SOURCES = wordhash.main.c
-find_home_test_SOURCES = find_home.test.c $(MYCOMMON)
+find_home_test_SOURCES = find_home.test.c
# what to distribute
extradistdirs = doc contrib
@@ -79,7 +82,6 @@
README.cvs README.freebsd README.hp-ux \
README.dcdflib README.Robinson \
$(extradistdirs)
-#
#
VERSION_FROM=main.c bogofilter.c bogoutil.c lexer.l
version.h: version.sh $(VERSION_FROM)
Index: bogofilter.c
===================================================================
RCS file: /cvsroot/bogofilter/bogofilter/bogofilter.c,v
retrieving revision 1.88
diff -u -r1.88 bogofilter.c
--- bogofilter.c 21 Nov 2002 15:42:52 -0000 1.88
+++ bogofilter.c 25 Nov 2002 19:22:15 -0000
@@ -37,6 +37,7 @@
#include "bogofilter.h"
#include "method.h"
#include "datastore.h"
+#include "collect.h"
#include "register.h"
void initialize_constants()
@@ -55,7 +56,8 @@
rc_t status;
double spamicity;
wordhash_t *wordhash;
- int wordcount, msgcount;
+ long wordcount, msgcount = 0;
+ bool cont;
good_list.active = spam_list.active = true;
@@ -67,7 +69,15 @@
method->initialize();
/* tokenize input text and save words in a wordhash. */
- wordhash = collect_words(&msgcount, &wordcount);
+ do {
+ collect_words(&wordhash, &wordcount, &cont);
+ ++msgcount;
+ } while(cont);
+
+ if (msgcount > 1) {
+ fprintf(stderr, "%s: must get only one message to calculate spamicity!\n", progname);
+ exit(2);
+ }
spamicity = method->compute_spamicity(wordhash, NULL);
Index: bogowordfreq.c
===================================================================
RCS file: bogowordfreq.c
diff -N bogowordfreq.c
--- /dev/null 1 Jan 1970 00:00:00 -0000
+++ bogowordfreq.c 25 Nov 2002 19:22:15 -0000
@@ -0,0 +1,40 @@
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+
+#include "wordhash.h"
+
+#include "common.h"
+#include "system.h"
+#include "collect.h"
+
+const char *spam_header_name = "X-Bogosity:"; /* unused */
+int passthrough = 0; /* unused */
+
+int max_repeats = 1; /* upper bound for a word's frequency within one message; main() may override it from argv[1] */
+
+/* Print "<freq> <word>" for every entry of h; freq is an int, hence %d. */
+static void print_wordlist (wordhash_t *h)
+{
+    hashnode_t *n;
+    for (n = wordhash_first(h); n != NULL; n = wordhash_next(h)) {
+        printf("%d %s\n", ((wordprop_t *)(n->buf))->freq, n->key);
+    }
+}
+
+int main(int argc, char **argv) { /* print the token count and word frequencies of each message collect_words() returns */
+ wordhash_t *h;
+ long count;
+ bool b; /* set by collect_words(): true while another message follows */
+
+ if (argc >= 2) max_repeats=atoi(argv[1]);
+ /* NOTE(review): atoi() does no validation; a result of 0 (e.g. non-numeric argv[1]) means collect_words() never increments any frequency. */
+ do {
+ collect_words(&h, &count, &b);
+ printf("%ld tokens:\n", count);
+ print_wordlist(h);
+ printf("\n");
+ wordhash_free(h); /* collect_words() creates a new hash on every call */
+ } while(b);
+ return 0;
+}
Index: collect.c
===================================================================
RCS file: collect.c
diff -N collect.c
--- /dev/null 1 Jan 1970 00:00:00 -0000
+++ collect.c 25 Nov 2002 19:22:15 -0000
@@ -0,0 +1,65 @@
+#include <stdio.h>
+#include <stdlib.h>
+
+#include <config.h>
+
+#include "common.h"
+#include "bogofilter.h"
+#include "register.h"
+#include "collect.h"
+#include "wordhash.h"
+
+
+#include "lexer.h"
+
+void wordprop_init(void *vwordprop){ /* handed to wordhash_insert() as the initializer for per-word data */
+ wordprop_t *wordprop = vwordprop;
+
+ wordprop->freq = 0; /* a word starts out with a frequency of zero */
+}
+
+static bool from_seen = false; /* true once collect_words() has consumed its first FROM token */
+
+void collect_reset(void) /* clear from_seen so the next FROM token is again skipped rather than ending a message */
+{
+ from_seen = false;
+}
+
+void collect_words(/*@out@*/ wordhash_t **wh,
+ /*@out@*/ /*@null@*/ long *word_count, /*@out@*/ bool *cont)
+ /* Tokenize input up to the end of the current message and store the
+ * words in a newly created wordhash_t, returned through wh.
+ * word_count, if non-NULL, receives the number of tokens read.
+ * wh and cont must not be NULL.
+ * cont is set to true if the message ended at a FROM token, false at token type 0.
+ */
+{
+ long w_count = 0;
+
+ wordprop_t *w;
+ wordhash_t *h = wordhash_init();
+
+ for (;;){
+ token_t token_type = get_token();
+
+ if (token_type != FROM && token_type != 0){
+ w = wordhash_insert(h, yylval, sizeof(wordprop_t), &wordprop_init);
+ if (w->freq < max_repeats) w->freq++; /* frequency is capped at max_repeats */
+ w_count++; /* counts every token, including repeats beyond the cap */
+ } else {
+ if (token_type == FROM && from_seen == false) { /* very first FROM: remember it and keep reading */
+ from_seen = true;
+ continue;
+ }
+
+ /* Message boundary: a FROM token means more data follows, token type 0 means there is none */
+ *cont = (token_type != 0);
+ break;
+ }
+ }
+
+ if (word_count)
+ *word_count = w_count;
+
+ *wh = h;
+}
Index: collect.h
===================================================================
RCS file: collect.h
diff -N collect.h
--- /dev/null 1 Jan 1970 00:00:00 -0000
+++ collect.h 25 Nov 2002 19:22:15 -0000
@@ -0,0 +1,16 @@
+#ifndef COLLECT_H
+#define COLLECT_H
+/* NOTE(review): uses wordhash_t and bool but includes nothing itself; includers must declare both first (wordhash.h, system.h). */
+/* Represents the secondary data for a word key */
+typedef struct {
+ int freq; /* occurrence count; collect_words() caps it at max_repeats */
+} wordprop_t;
+
+extern void wordprop_init(void *vwordprop); /* sets freq to 0 */
+
+extern void collect_words(/*@out@*/ wordhash_t **wh,
+ /*@out@*/ /*@null@*/ long *word_count, /*@out@*/ bool *cont); /* word_count may be NULL; wh and cont may not */
+
+extern void collect_reset(void); /* clears the "FROM token already seen" state */
+
+#endif
Index: globals.h
===================================================================
RCS file: /cvsroot/bogofilter/bogofilter/globals.h,v
retrieving revision 1.17
diff -u -r1.17 globals.h
--- globals.h 22 Nov 2002 12:14:41 -0000 1.17
+++ globals.h 25 Nov 2002 19:22:15 -0000
@@ -8,6 +8,8 @@
#include <float.h> /* has DBL_EPSILON */
#define EPS (100.0 * DBL_EPSILON) /* equality cutoff */
+#include "system.h" /* has bool */
+
extern int nonspam_exits_zero; /* '-e' */
extern bool fisher; /* '-f' */
extern bool force; /* '-F' */
Index: register.c
===================================================================
RCS file: /cvsroot/bogofilter/bogofilter/register.c,v
retrieving revision 1.5
diff -u -r1.5 register.c
--- register.c 25 Nov 2002 18:00:53 -0000 1.5
+++ register.c 25 Nov 2002 19:22:15 -0000
@@ -8,8 +8,8 @@
#include "bogofilter.h"
#include "datastore.h"
-#include "lexer.h"
#include "register.h"
+#include "collect.h"
#include "wordhash.h"
#define PLURAL(count) ((count == 1) ? "" : "s")
@@ -18,70 +18,6 @@
extern char msg_register[];
-/* Represents the secondary data for a word key */
-typedef struct {
- int freq;
- int msg_freq;
-} wordprop_t;
-
-static void wordprop_init(void *vwordprop){
- wordprop_t *wordprop = vwordprop;
-
- wordprop->freq = 0;
- wordprop->msg_freq = 0;
-}
-
-void *collect_words(/*@out@*/ int *message_count,
- /*@out@*/ int *word_count)
- /* tokenize input text and save words in wordhash_t hash table
- * returns: the wordhash_t hash table.
- * Sets messageg_count and word_count to the appropriate values
- * if their pointers are non-NULL. */
-{
- int w_count = 0;
- int msg_count = 0;
-
- wordprop_t *w;
- hashnode_t *n;
- wordhash_t *h = wordhash_init();
-
- for (;;){
- token_t token_type = get_token();
-
- if (token_type != FROM && token_type != 0){
- w = wordhash_insert(h, yylval, sizeof(wordprop_t), &wordprop_init);
- if (w->msg_freq < max_repeats) w->msg_freq++;
- w_count++;
- }
- else {
- /* End of message. Update message counts. */
- if (token_type == FROM || (token_type == 0 && msg_count == 0))
- msg_count++;
-
- /* Incremenent word frequencies, capping each message's
- * contribution at MAX_REPEATS in order to be able to cap
- * frequencies. */
- for(n = wordhash_first(h); n != NULL; n = wordhash_next(h)){
- w = n->buf;
- w->freq += w->msg_freq;
- w->msg_freq = 0;
- }
-
- /* Want to process EOF, *then* drop out */
- if (token_type == 0)
- break;
- }
- }
-
- if (word_count)
- *word_count = w_count;
-
- if (message_count)
- *message_count = msg_count;
-
- return(h);
-}
-
void register_words(run_t _run_type, wordhash_t *h,
int msgcount, int wordcount)
@@ -180,13 +116,32 @@
db_lock_release_list(word_lists);
}
+static void add_hash(wordhash_t *dest, wordhash_t *src) { /* add the frequency of every word in src to the same word in dest */
+ wordprop_t *d;
+ hashnode_t *s;
+
+ for (s = wordhash_first(src); s; s = wordhash_next(src)) {
+ d = wordhash_insert(dest, s->key, sizeof(wordprop_t), &wordprop_init); /* presumably returns the existing entry, or a new one set up by wordprop_init -- confirm in wordhash.c */
+ d -> freq += ((wordprop_t *)(s -> buf)) ->freq;
+ }
+}
+
void register_messages(run_t _run_type)
{
- wordhash_t *h;
- int wordcount, msgcount;
+  wordhash_t *h, *words = wordhash_init(); /* words accumulates all messages */
+  long wordcount, total_words = 0, msgcount = 0;
+  bool cont;
+
initialize_constants();
- h = collect_words(&msgcount, &wordcount);
- register_words(_run_type, h, msgcount, wordcount);
- wordhash_free(h);
+  /* Collect message by message so the max_repeats cap applies per message. */
+  do {
+    collect_words(&h, &wordcount, &cont);
+    add_hash(words, h);
+    wordhash_free(h);
+    total_words += wordcount; /* wordcount only covers the message just read */
+    msgcount++;
+  } while(cont);
+  register_words(_run_type, words, msgcount, total_words);
+  wordhash_free(words);
}
Index: register.h
===================================================================
RCS file: /cvsroot/bogofilter/bogofilter/register.h,v
retrieving revision 1.1
diff -u -r1.1 register.h
--- register.h 14 Nov 2002 22:43:32 -0000 1.1
+++ register.h 25 Nov 2002 19:22:15 -0000
@@ -6,7 +6,6 @@
#include <wordhash.h>
-extern void *collect_words(int *message_count, int *word_count);
extern void register_messages(run_t _run_type);
extern void register_words(run_t _run_type, wordhash_t *h,
int msgcount, int wordcount);
--
Matthias Andree
More information about the bogofilter-dev
mailing list