[OT] Re: return level?

Matej Cepl matej at ceplovi.cz
Tue Feb 18 17:23:03 CET 2003


David Relson wrote:
> So far, actual details of how he's running bogofilter are lacking...

My python script is attached (be aware, that there are
substantial changes -- new config rile -- which are not tested
much yet). On line 488 it prints 256.

Matej

-- 
Matej Cepl,
Finger: 89EF 4BC6 288A BF43 1BAB  25C3 E09F EF25 D964 84AC
138 Highland Ave. #10, Somerville, Ma 02143, (617) 623-1488
 
The ratio of literacy to illiteracy is a constant, but nowadays
the illiterates can read.
    -- Alberto Moravia

-------------- next part --------------
#!/usr/bin/env python

# This Python program scans an IMAP Inbox and runs every
# entry against SpamAssassin.  For any entries that match,
# the message is copied to another folder, and the original
# marked or deleted.

# This software is written and maintained by Roger Binns
# <rogerb at rogerbinns.com> It is distributed under the <a
# href="http://www.opensource.org/licenses/artistic-license.php">Artistic
# License</a>.

# $Id: isbg.py,v 1.8 2003/02/18 04:50:08 matej Exp $

version="0.96-20Jan03"

import imaplib
import sys
import re
import os
import popen2
import getpass
import getopt
import string
import socket
import md5
import ConfigParser
import stat

# You can specify your imap password using a command line option (--imappassword).
# This however is a really bad idea since any user on the system can run
# ps and see the command line arguments.  If you really must do it non-interactively
# then set the password here.

# name of /dev/null
if os.name=="nt":
    devnull="nul"
else:
    devnull="/dev/null"

config = {'imapuser': getpass.getuser(),
'imaphost':'localhost',
'imapport': 0,  # autodetect - 143 for standard connection, 993 for imaps
'usessl': 0,
'imappassword': None,
'imapinbox': 'INBOX',
'spaminbox': 'INBOX.spam',
'thresholdsize': 120000, # messages larger than this aren't considered
'pastuidsfile': None,
'passwordfilename': None,  # where the password is stored if requested
'savepw': 0,               # save the password
# satest is what command is used test if the message is spam
'satest': "spamassassin --exit-code >"+devnull,
# sasave is the one that dumps out a munged message including report
'sasave': "spamassassin",
# what we use to set flags on the original spam in imapbox
'spamflagscmd': "+FLAGS.SILENT",
# and the flags we set them to (none by default)
'spamflags': "(",
# include the spamassassin report in the message placed in spaminbox
'increport': 1,
# expunge before quiting causing all messages marked for deletion
# to actually be deleted
'expunge': 0,
# print imap tracing info
'verbose': 0,
# print stats at end
'stats': 1,
# IMAP implementation detail
# Courier IMAP ignores uid fetches where more than a certain number are listed
# so we break them down into smaller groups of this size
'uidfetchbatchsize': 25,
# password saving stuff.  A vague level of obfuscation
'passwordhashlen': 256, # should be a multiple of 16
'passwordhash': None,
'spamreturnvalue': 0} # bogofilter returns different errorlevel

#read rc file
# or we can just ignore any *rc file which is not 0600
rcfilename=os.path.expanduser("~"+os.sep+".isbgrc")
if not(os.path.exists(rcfilename)):
   print "File not exists."
else:
   rcmode=oct(stat.S_IMODE(os.stat(rcfilename)[stat.ST_MODE]))
   if (rcmode == '0600'):
      rcconfigfile = ConfigParser.ConfigParser()
      rcconfigfile.read(rcfilename)
      rcconfig = {}
      for sel in rcconfigfile.options('isbg'):
         value=rcconfigfile.get('isbg',sel)
         rcconfig[sel] = value
      config.update(rcconfig)
      for sel in ['imapport', 'usessl', 'thresholdsize', 'savepw',
      'increport', 'expunge', 'verbose', 'stats', 'uidfetchbatchsize',
      'passwordhashlen', 'spamreturnvalue']:
         config[sel] = int(config[sel])

# Usage message - note that not all options are documented
def usage():
    sslmsg=""
    if hasattr(socket, "ssl"):
        sslmsg="""
  --ssl                 Make an SSL connection to the IMAP server"""
    sys.stderr.write("""isbg: IMAP Spam begone %s
All options are optional

  --imaphost hostname   IMAP server name [%s]%s
  --imapuser username   Who you login as [%s]
  --imapinbox mbox      Name of your inbox folder [%s]
  --spaminbox mbox      Name of your spam folder [%s]
  --maxsize numbytes    Messages larger than this will be ignored as they are
                        unlikely to be spam [%d]
  --noreport            Don't include the SpamAssassin report in the message
                        copied to your spam folder
  --flag                The spams will be flagged in your inbox
  --delete              The spams will be marked for deletion from your inbox
  --expunge             Cause marked for deletion messages to also be deleted
                        (only useful if --delete is specified)
  --verbose             Show IMAP stuff happening
  --spamc               Use spamc instead of standalone SpamAssassin binary
  --savepw              Store the password to be used in future runs
  --nostats             Don't print stats
(Your inbox will remain untouched unless you specify --flag or --delete)
  
See http://www.rogerbinns.com/isbg for more details\n""" % (version, config['imaphost'],
sslmsg, config['imapuser'], config['imapinbox'], config['spaminbox'],
config['thresholdsize']))
    sys.exit(1)

def errorexit(msg):
    sys.stderr.write(msg)
    sys.stderr.write("\nUse --help to see valid options and arguments\n")
    sys.exit(3)

def addspamflag(flag):
    global spamflags
    if len(spamflags)>1: spamflags=spamflags+" "
    spamflags=spamflags+flag

def hexof(x):
    res=""
    for i in x: res=res+("%02x" % ord(i))
    return res

def hexdigit(c):
    if c>='0' and c<='9':
        return ord(c)-ord('0')
    if c>='a' and c<='f':
        return 10+ord(c)-ord('a')
    if c>='A' and c<='F':
        return 10+ord(c)-ord('a')
    raise ValueError(`c`+"is not a valid hexadecimal digit")

def dehexof(x):
    res=""
    while(len(x)):
        res=res+chr( 16*hexdigit(x[0])+ hexdigit(x[1]))
        x=x[2:]
    return res

## argument processing
longopts=[ "imaphost=", "imapuser=", "imapinbox=", "spaminbox=",
       "maxsize=", "noreport", "flag", "delete", "expunge", "verbose",
       "trackfile=", "spamc", "ssl", "savepw", "nostats",
       # options not mentioned in usage
       "imappassword=", "satest=", "sasave=", "spamflagscmd=", "spamflags=",
       "help", "version", "imapport=", "passwordfilename="
       ]

try:
    opts, pargs=getopt.getopt(sys.argv[1:], None, longopts)
except Exception,e:
    errorexit("option processing failed - "+str(e))

if len(pargs):
    errorexit("unrecognised option(s) - "+`pargs`)

for p in opts:
    if p[0]=="--maxsize":
        try:
            config['thresholdsize']=int(p[1])
        except:
            errorexit("Unrecognized size - "+p[1])
        if config['thresholdsize']<1:
            errorexit("Size "+`config['thresholdsize']`+" is too small")
    elif p[0]=="--imapport":
        config['imapport']=int(p[1])
    elif p[0]=="--noreport":
        config['increport']=0
    elif p[0]=="--flag":
        addspamflag("\\Flagged")
    elif p[0]=="--delete":
        addspamflag("\\Deleted")
    elif p[0]=="--spamc":
        config['satest']="spamc -c >"+devnull
        config['sasave']="spamc"
    elif p[0]=="--expunge":
        config['expunge']=1
    elif p[0]=="--verbose":
        config['verbose']=1
    elif p[0]=="--ssl":
        config['usessl']=1
    elif p[0]=="--savepw":
        config['savepw']=1
    elif p[0]=="--nostats":
        config['stats']=0
    elif p[0]=="--help":
        usage()
    elif p[0]=="--version":
        print version
        sys.exit(0)
    elif p[0]=="--trackfile":
        config['pastuidsfile']=p[1]
    else:
        locals()[p[0][2:]]=p[1]

print config
# fixup any arguments

if config['spamflags'][-1]!=')':
    config['spamflags']=config['spamflags']+')'

if config['imapport']==0:
    if config['usessl']: config['imapport']=993
    else:      config['imapport']=143

if config['pastuidsfile'] is None:
    config['pastuidsfile']=os.path.expanduser("~"+os.sep+".isbg-track")
    m=md5.new()
    m.update(config['imaphost'])
    m.update(config['imapuser'])
    m.update(`config['imapport']`)
    res=hexof(m.digest())
    config['pastuidsfile']=config['pastuidsfile']+res

# Password stuff
def getpw(data,hash):
    res=""
    for i in range(0,config['passwordhashlen']):
        c=ord(data[i]) ^ ord(hash[i])
        if c==0:
            break
        res=res+chr(c)
    return res
        
def setpw(pw, hash):
    if len(pw)>config['passwordhashlen']:
        raise ValueError("password of length %d is too long to store (max accepted is %d)" % (len(pw), config['passwordhashlen']))
    res=list(hash)
    for i in range(0, len(pw)):
        res[i]=chr( ord(res[i]) ^ ord(pw[i]) )
    return string.join(res, '')

if config['passwordfilename'] is None:
    m=md5.new()
    m.update(config['imaphost'])
    m.update(config['imapuser'])
    m.update(`config['imapport']`)
    config['passwordfilename']=os.path.expanduser("~"+os.sep+".isbg-"+hexof(m.digest()))

if config['passwordhash'] is None:
    # We make hash that the password is xor'ed against
    m=md5.new()
    m.update(config['imaphost'])
    m.update(m.digest())
    m.update(config['imapuser'])
    m.update(m.digest())
    m.update(`config['imapport']`)
    m.update(m.digest())
    config['passwordhash']=m.digest()
    while len(config['passwordhash'])<config['passwordhashlen']:
        m.update(config['passwordhash'])
        config['passwordhash']=config['passwordhash']+m.digest()

if config['verbose']:
    print "Trackfile is", config['pastuidsfile']
    print "SpamFlags are", config['spamflags']
    print "Password file is", config['passwordfilename']

# Figure out the password
if config['imappassword'] is None:
    if not config['savepw'] and os.path.exists(config['passwordfilename']):
        try:
            config['imappassword']=getpw(dehexof(open(config['passwordfilename'], "rb").read()), config['passwordhash'])
            if config['verbose']: print "Successfully read password file"
        except:
            pass
        
    # do we have to prompt?
    if config['imappassword'] is None:
        config['imappassword']=getpass.getpass("IMAP password for %s@%s: " % (config['imapuser'], config['imaphost']))

    # Should we save it?
    if config['savepw']:
        f=open(config['passwordfilename'], "wb+")
        try:
            os.chmod(config['passwordfilename'], 0600)
        except:
            pass
        f.write(hexof(setpw(config['imappassword'], config['passwordhash'])))
        f.close()

# pastuids keeps track of which uids we have already seen, so
# that we don't analyze them multiple times.  We store its
# contents between sessions by saving into a file as Python
# code (makes loading it here real easy since we just source
# the file)
pastuids=[]
try:
    execfile(config['pastuidsfile'])
except:
    pass
# remember what pastuids looked like so that we can compare at the end
origpastuids=pastuids[:]

# This function gets the list of uids corresponding
# to a message range
gure=re.compile(r"[0-9]+ \(UID ([0-9]+)\)")
def getuids(imap, low, high):
    range=`low`+":"+`high`
    res=imap.fetch(range, "UID")
    assertok(res, 'fetch', range, 'UID')
    res2=[]
    for i in res[1]:
        mo=gure.match(i)
        if mo is None:
            if verbose: print "getuids Eh?", i
        else:
            res2.append(mo.group(1))
    return res2

# This function gets the size of each message in the provided
# list
gsre=re.compile(r"[0-9]+ \(UID ([0-9]+) RFC822.SIZE ([0-9]+)\)")
def getsizes(imap, msgs):
    res2=[]

    # Python really needs do - while
    while 1:
        if len(msgs)==0: break
        if len(msgs)>config['uidfetchbatchsize']:
            msgmore=msgs[config['uidfetchbatchsize']:]
            msgs=msgs[:config['uidfetchbatchsize']]
        else:
            msgmore=[]
        msgs=string.join(msgs, ',')
        res=imap.uid("FETCH", msgs, "(UID RFC822.SIZE)")
        assertok(res, "uid fetch", msgs, "(UID RFC822.SIZE)")
        for i in res[1]:
            mo=gsre.match(i)
            if mo is None:
                if config['verbose']: print "getsize Eh?", i
            else:
                res2.append((mo.group(2), mo.group(1)))
        msgs=msgmore
    return res2

# This function makes sure that each lines ends in <CR><LF>
# SpamAssassin strips out the <CR> normally
crnlre=re.compile("([^\r])\n", re.DOTALL)
def crnlify(text):
    # we have to do it twice to work right since the re includes
    # the char preceding \n
    return re.sub(crnlre, "\\1\r\n", re.sub(crnlre, "\\1\r\n", text))

# This function checks that the return code is OK
# It also prints out what happened (which would end
# up /dev/null'ed in non-verbose mode)
def assertok(res,*args):
    if res[0]!="OK":
        sys.stderr.write("\n%s returned %s - aborting\n" % (`args`,  res ))
        sys.exit(2)
    if config['verbose']:
        print `args`, "=", res

# This class implements imap over SSL.
class IMAP4S(imaplib.IMAP4):
    def __init__(self, host='', port=993): imaplib.IMAP4.__init__(self, host, port)

    def open(self, host, port):
        self.baresock=socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        self.baresock.connect( (self.host, self.port) )
        self.ssl=socket.ssl( self.baresock )

    def read(self, size):
        res=""
        while len(res)<size:
            res=res+self.ssl.read(size-len(res))
        return res

    def readline(self):
        # We can only do one character of lookahead, so this is done character by character
        res=""
        last=0
        while last!="\n":
            last=self.ssl.read(1)
            res=res+last
        return res

    def send(self, data):
        while len(data):
            l=self.ssl.write(data)
            if l==len(data): break
            data=data[l:]

    def shutdown(self):
        del self.ssl
        self.baresock.close()
    
    def socket(self):
        """Do not send or receive any data on the returned socket otherwise you
        will break the ssl connection.  Only set socket options and that sort
        of thing"""
        return self.baresock

# Main code starts here
if config['usessl']:
    imap=IMAP4S(config['imaphost'], config['imapport'])
else:
    imap=imaplib.IMAP4(config['imaphost'], config['imapport'])

# Authenticate (only simple supported)
res=imap.login(config['imapuser'], config['imappassword'])
assertok(res, "login",config['imapuser'], 'xxxxxxxx')

# check spaminbox exists by examining it
res=imap.select(config['spaminbox'], 1)
assertok(res, 'select', config['spaminbox'], 1)

# select inbox
res=imap.select(config['imapinbox'], 1)
assertok(res, 'select', config['imapinbox'], 1)

# it returns number of messages in response
low=1
high=int(res[1][0])

# get the corresponding UIDs
alluids=getuids(imap,low,high)

uids=[]
for i in alluids:
    if i not in pastuids:
        uids.append(i)

# for the uids we haven't seen before, get their sizes
# The code originally got both the UIDs and size at the
# same time.  This however took significantly longer as
# I assume it stat()ed and perhaps even opened every message,
# even the ones we had seen before
sizeduids=getsizes(imap, uids)
uids=[]
for i in sizeduids:
    if int(i[0])>config['thresholdsize']:
        pastuids.append(i[1])
        if verbose:
            print i[1], "is", i[0], "bytes so it is being skipped"
    else:
        uids.append(i[1])

# Keep track of new spam uids
spamlist=[]

# Main loop that iterates over each new uid we haven't seen before
for u in uids:
    # Double check
    if u in pastuids: continue
    if config['verbose']: print u
    # Retrieve the entire message
    res=imap.uid("FETCH", u, "(RFC822)")
    if res[0]!="OK":
        assertok(res, 'uid fetch', u, '(RFC822)')
    try:
        body=res[1][0][1]
    except:
        if config['verbose']:
            print "Confused - rfc822 fetch gave "+`res`
            print "The message was probably deleted while we are running"
        pastuids.append(u)

    # Feed it to SpamAssassin in test mode
    p=os.popen(config['satest'], 'w')
    p.write(body)
    code=p.close()

    errstr="Return code is "+str(code)+" spamreturnvalue is "
    errstr+=str(config['spamreturnvalue'])+"."
    print errstr

    if code == config['spamreturnvalue']:
        # Message is below threshold
        pastuids.append(u)
    else:
        # Message is spam
        if config['verbose']: print u, "is spam"
        spamlist.append(u)

        # do we want to include the spam report
        if config['increport']:
            # filter it through sa
            out,inp=popen2.popen2(config['sasave'])
            inp.write(body)
            inp.close()
            body=out.read()
            out.close()
            body=crnlify(body)
            res=imap.append(config['spaminbox'], None, None, body)
            assertok(res, 'append', config['spaminbox'], "{body}")
        else:
            # just copy it as is
            res=imap.uid("COPY", u, config['spaminbox'])
            assertok(res, "uid copy", u, config['spaminbox'])

# If we found any spams, now go and mark the original messages
if len(spamlist):
    res=imap.select(config['imapinbox'])
    assertok(res, 'select', config['imapinbox'])
    for u in spamlist:
        res=imap.uid("STORE", u, config['spamflagscmd'], config['spamflags'])
        assertok(res, "uid store", u, config['spamflagscmd'], config['spamflags'])
        pastuids.append(u)

# only useful if we marked messages Deleted
if config['expunge']:
    imap.expunge()

# sign off
imap.logout()
del imap

# Now tidy up lists of uids
newpastuids=[]
for i in pastuids:
    if i in alluids and i not in newpastuids:
        newpastuids.append(i)

# only write out pastuids if it has changed
if newpastuids!=origpastuids:
    f=open(config['pastuidsfile'], "w+")
    try:
        os.chmod(config['pastuidsfile'], 0600)
    except:
        pass
    f.write("pastuids=")
    f.write(`newpastuids`)
    f.write("\n")
    f.close()


if config['stats']:
    print "%d spams found in %d messages" % (len(spamlist), len(uids))


More information about the Bogofilter mailing list