prototype code to split mbox files

Matthias Andree matthias.andree at gmx.de
Mon Aug 18 21:11:21 CEST 2003


Hi,

I have prototype code to split mbox files properly and quickly. It may
evolve into one of the reader modules later.

It currently assumes that its input can be mmap()ed, so it will not work
from pipes and streams and it may not work across NFS (but we map the
whole file, so it will work on good NFS implementations).

It's more a proof of concept than something that can be merged as is,
but anyway.

To compile, save it as read_mbox.c into bogofilter's src/ directory and
type: gcc -O -o read_mbox -I. read_mbox.c

When run, it expects file names of mbox files and will currently just
print the individual mails, preceded by the size in bytes in curly
braces on a line by themselves.

With the "mbox-test" file that Boris posted (bogofilter -T prints four
mails), it detects two mails, so it appears to work...

Here's the code, as shar.

# This is a shell archive.  Save it in a file, remove anything before
# this line, and then unpack it by entering "sh file".  Note, it may
# create directories; files and directories will be owned by you and
# have default permissions.
#
# This archive contains:
#
#	read_mbox.c
#
echo x - read_mbox.c
sed 's/^X//' >read_mbox.c << 'END-of-read_mbox.c'
X#include "config.h"
X#include "system.h"
X#include <sys/mman.h>
X#include <string.h>
X#include <stdlib.h>
X
X/*
X * Callback type used by read_mbox_mmap(): it is called once per message
X * with a pointer to the first byte of the message, the message length
X * in bytes, and the caller-supplied userdata pointer passed through
X * unchanged. The message is not NUL-terminated. read_mbox_mmap() does
X * not examine the return value.
X */
Xtypedef int (*read_hook)(const char *, size_t len, void *userdata);
X
Xstatic char *mem_find(const char *haystack, size_t stacksize, const char *needle, size_t needlesize) {
X    if (needlesize > stacksize) return NULL;
X    stacksize -= needlesize;
X    do {
X	if (memcmp(haystack, needle, needlesize) == (void *)0) return haystack;
X	haystack++;
X	stacksize--;
X    } while (stacksize);
X    return NULL;
X}
X
Xint read_mbox_mmap(int fd, read_hook hook, void *userdata) {
X    struct stat st;
X    char *addr;
X    char *ptr, *nxt;
X    char *cl /* content-lenght */, *he /* header's end */;
X
X    if (fstat(fd, &st)) return -1;
X    addr = mmap(NULL, st.st_size, PROT_READ, MAP_PRIVATE, fd, (off_t)0);
X    if (addr == NULL) return -1;
X    for(ptr = addr; ptr < addr + st.st_size ; ptr = nxt) {
X	if (memcmp(ptr, "From ", 5)) goto barf;
X	/* If Content-Length: header is found, use it to accelerate
X	 * things. */
X	he = mem_find(ptr, st.st_size - (ptr - addr), "\n\n", 2);
X	cl = mem_find(ptr, st.st_size - (ptr - addr), "\nContent-Length:", 16);
X	if (cl && he && cl < he) {
X	    nxt = he + atol(cl + 16) + 1;
X	}
X	nxt = mem_find(ptr, st.st_size - (ptr - addr), "\nFrom ", 6);
X
X	if (!nxt) {
X	    hook(ptr, addr + st.st_size - ptr, userdata);
X	    break;
X	} else {
X	    hook(ptr, nxt - ptr, userdata);
X	    nxt ++;
X	}
X    }
X    return munmap(addr, st.st_size);
Xbarf:
X    munmap(addr, st.st_size);
X    return -1;
X}
X
X#include <stdio.h>
X#include <fcntl.h>
X#include <unistd.h>
X
Xint rh(const char *x, size_t len, void *userdata) {
X    printf("{%lu}\n", (unsigned long)len);
X    fwrite(x, 1, len, stdout);
X    (void)userdata;
X    return 0;
X}
X
Xint main(int argc, char **argv) {
X    int i, rc = 0;
X    for (i = 1; i < argc; i++) {
X	int fd = open(argv[i], O_RDONLY);
X	if (fd < 0) {
X	    perror(argv[i]);
X	    rc = 1;
X	    continue;
X	}
X	read_mbox_mmap(fd, rh, NULL);
X	close(fd);
X    }
X    exit(rc);
X}
END-of-read_mbox.c
exit



-- 
Matthias Andree




More information about the bogofilter-dev mailing list