#!/usr/bin/env python
#
# sa-learn-maildir: feed contents of maildir to sa-learn, with
# batching and already-seen caching
#
# Dan R. K. Ports
#
# Copyright (c) 2007-2008 Dan R. K. Ports
#
# Permission is hereby granted, free of charge, to any person
# obtaining a copy of this software and associated documentation
# files (the "Software"), to deal in the Software without
# restriction, including without limitation the rights to use,
# copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the
# Software is furnished to do so, subject to the following
# conditions:
#
# The above copyright notice and this permission notice shall be
# included in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
# HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
# WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
# OTHER DEALINGS IN THE SOFTWARE.
#
# $Revision$ $Date$

import optparse
import os
import subprocess
import sys

try:
    import cPickle as pickle  # Python 2: C-accelerated pickle module
except ImportError:
    import pickle  # Python 3: cPickle was folded into pickle

DEF_CACHE = "~/.rslearncache"
SALEARNCOUNT = 100

# Shared state used by learnDir()/learnMaildir() when the caller does not
# pass explicit collaborators.  Populated by main().
options = None
alreadyLearned = None
salearn = None


class AlreadyLearnedCache:
    """Simple cache that checks whether a filename has already been seen
    before.  Loads and saves itself via pickling.

    Attributes:
        path: user-expanded location of the pickle file.
        learnedSet: set of filenames that have been seen so far.
    """

    def __init__(self, path):
        """Load the cache stored at `path` ("~" is expanded).

        Loading is best-effort: a missing, unreadable or corrupt file, or
        a pickle that does not contain a set, yields an empty cache.
        """
        self.path = os.path.expanduser(path)
        # NOTE(review): unpickling can execute arbitrary code, so the cache
        # file must only ever be one written by this program.
        try:
            with open(self.path, "rb") as f:
                loaded = pickle.load(f)
        except Exception:
            # Deliberately broad (unpickling garbage can raise almost
            # anything), but unlike a bare "except:" this does not swallow
            # KeyboardInterrupt or SystemExit.
            loaded = None
        if isinstance(loaded, set):
            self.learnedSet = loaded
        else:
            self.learnedSet = set()

    def save(self):
        """Write the set of seen filenames back to `self.path`."""
        # Binary mode is required by pickle on Python 3; protocol 2 is the
        # newest format that both Python 2 and Python 3 can read.
        with open(self.path, "wb") as f:
            pickle.dump(self.learnedSet, f, 2)

    def check(self, filename, insert=True):
        """Return True if `filename` has been seen before, else False.

        When the name is new and `insert` is true, it is recorded so that
        the next check for the same name returns True.
        """
        if filename in self.learnedSet:
            return True
        if insert:
            self.learnedSet.add(filename)
        return False


class SALearnBatcher:
    """Interface to sa-learn that calls it with a specific number of
    arguments, e.g. 100 -- this deals with the fact that passing too
    many arguments causes the length of the arg string to be too long.

    Attributes:
        path: program to execute.
        args: list of fixed arguments placed before the queued file paths.
        queue: file paths waiting to be passed to the program.
        batchSize: number of queued paths that triggers a flush.
    """

    def __init__(self, path, args, batchSize):
        self.path = path
        self.args = args
        self.queue = []
        self.batchSize = batchSize

    def flushQueue(self):
        """Run the program on every queued path, then empty the queue.

        Returns the program's exit status, or None if the queue was empty
        and nothing was run.  A non-zero status is reported on stderr
        instead of being silently ignored.
        """
        if not self.queue:
            return None
        call = [self.path] + list(self.args) + self.queue
        status = subprocess.call(call)
        self.queue = []
        if status != 0:
            sys.stderr.write("warning: %s exited with status %d\n"
                             % (self.path, status))
        return status

    def learn(self, path):
        """Queue `path`, flushing once the batch size has been reached."""
        self.queue.append(path)
        # ">=" rather than "==" so that a queue that has somehow grown
        # past the batch size is still flushed.
        if len(self.queue) >= self.batchSize:
            self.flushQueue()


def learnDir(dir, cache=None, learner=None, force=None):
    """Queue every not-yet-seen file found under `dir` (recursively).

    Args:
        dir: directory to walk.
        cache: AlreadyLearnedCache to consult and update; defaults to the
            module-level `alreadyLearned`.
        learner: SALearnBatcher to queue files on; defaults to the
            module-level `salearn`.
        force: if true, queue files even when the cache has seen them;
            defaults to the --force command-line option.
    """
    if cache is None:
        cache = alreadyLearned
    if learner is None:
        learner = salearn
    if force is None:
        force = bool(options is not None and options.force)
    for dirpath, dirnames, filenames in os.walk(dir):
        for filename in filenames:
            # The cache is keyed on the bare filename, not the full path.
            # check() is always called, so names are recorded even when
            # force is set.
            seen = cache.check(filename)
            if force or not seen:
                learner.learn(os.path.join(dirpath, filename))


def learnMaildir(maildir, cache=None, learner=None, force=None):
    """Run learnDir() on the cur, new and tmp subdirectories of `maildir`.

    The optional arguments are passed through to learnDir() unchanged.
    """
    # NOTE(review): "tmp" is scanned like the other two subdirectories;
    # presumably it can hold partially delivered messages -- confirm that
    # learning from it is intended.
    for sub in ("cur", "new", "tmp"):
        learnDir(os.path.join(maildir, sub), cache, learner, force)


def _buildParser():
    """Build the command-line option parser."""
    parser = optparse.OptionParser(usage="%prog [options] [maildirs]",
                                   description="Feed a maildir to sa-learn, "
                                   "using a cache of filenames that "
                                   "have previously been learned.")
    actions = optparse.OptionGroup(parser, "Actions",
                                   "Action to take with each message. "
                                   "Must choose exactly one.")
    actions.add_option("-S", "--spam", action="store_true")
    actions.add_option("-H", "--ham", action="store_true")
    actions.add_option("-F", "--forget", action="store_true")
    parser.add_option_group(actions)
    parser.add_option("-f", "--force", action="store_true",
                      help="pass messages to sa-learn even if previously seen")
    parser.add_option("-n", "--no-update", action="store_true",
                      help="don't update the DB of previously-seen messages")
    parser.add_option("-c", "--cache", action="store", metavar="FILE",
                      default=DEF_CACHE,
                      help=("location of cache file, default %s" % DEF_CACHE))
    return parser


def main(argv=None):
    """Parse the command line and learn every maildir named on it.

    Args:
        argv: argument list without the program name; defaults to
            sys.argv[1:].
    """
    global options, alreadyLearned, salearn

    parser = _buildParser()
    (options, args) = parser.parse_args(argv)

    salearnArgs = []
    if options.spam:
        salearnArgs.append("learn_spam")
    if options.ham:
        salearnArgs.append("learn_ham")
    if options.forget:
        # NOTE(review): "--forget" is an sa-learn style flag, but the
        # program invoked below is rspamc -- confirm rspamc accepts it.
        salearnArgs.append("--forget")
    if len(salearnArgs) != 1:
        parser.error("must specify exactly one of --spam, --ham, or --forget")

    alreadyLearned = AlreadyLearnedCache(options.cache)
    salearn = SALearnBatcher("rspamc", salearnArgs, SALEARNCOUNT)

    for maildir in args:
        learnMaildir(maildir)
    salearn.flushQueue()  # pass along whatever is left in the last batch

    if not options.no_update:
        alreadyLearned.save()


if __name__ == "__main__":
    main()