#!/usr/bin/env python
#
# sa-learn-maildir: feed contents of maildir to sa-learn, with
# batching and already-seen caching
#
# Dan R. K. Ports <drkp@mit.edu>
#
# Copyright (c) 2007-2008 Dan R. K. Ports
#
# Permission is hereby granted, free of charge, to any person
# obtaining a copy of this software and associated documentation
# files (the "Software"), to deal in the Software without
# restriction, including without limitation the rights to use,
# copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the
# Software is furnished to do so, subject to the following
# conditions:
#
# The above copyright notice and this permission notice shall be
# included in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
# HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
# WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
# OTHER DEALINGS IN THE SOFTWARE.
#
# $Revision$ $Date$

import os, subprocess, sys, cPickle, optparse

# Default location of the already-learned cache file. It is the default
# value of the --cache option; "~" is expanded when the cache is opened.
DEF_CACHE="~/.rslearncache"

# Number of queued message paths handed to each invocation of the
# learner command (passed to SALearnBatcher as its batch size).
SALEARNCOUNT=100

class AlreadyLearnedCache:
    """Persistent set of filenames that have already been learned.

    The set is loaded from a pickle file when the object is created and
    written back by save(). A missing, unreadable or malformed cache
    file is treated as an empty cache rather than as an error.
    """

    def __init__(self, path):
        """Load the cache stored at `path` (a leading "~" is expanded).

        Falls back to an empty set if the file cannot be read or does
        not contain a pickled set.
        """
        self.path = os.path.expanduser(path)
        self.learnedSet = set()
        # NOTE: unpickling can execute code described by the file, so
        # the cache must only ever be a file this script wrote itself.
        try:
            # Pickles are binary data; open() replaces the deprecated
            # file() builtin. try/finally (rather than `with`) keeps
            # this working on older Python 2 interpreters.
            f = open(self.path, "rb")
            try:
                x = cPickle.load(f)
            finally:
                f.close()
        except Exception:
            # Deliberate best effort: any failure to read the cache
            # just means starting from scratch. Unlike a bare except,
            # this does not swallow KeyboardInterrupt or SystemExit.
            return
        if isinstance(x, set):
            self.learnedSet = x

    def save(self):
        """Write the current set of filenames back to the cache file."""
        f = open(self.path, "wb")
        try:
            cPickle.dump(self.learnedSet, f)
        finally:
            # Close explicitly so the data is flushed to disk now
            # instead of whenever the file object is collected.
            f.close()

    def check(self, filename, insert=True):
        """Return True if `filename` is already in the cache.

        When it is not, False is returned and, if `insert` is true, the
        filename is added so that later checks will find it.
        """
        if filename in self.learnedSet:
            return True
        if insert:
            self.learnedSet.add(filename)
        return False

class SALearnBatcher:
    """Runs an external learner command on files in fixed-size batches.

    Paths are queued by learn() and handed to the command `batchSize`
    at a time, so that the argument list never grows too long to be
    passed on a single command line. Call flushQueue() once at the end
    to process a final, partial batch.
    """

    def __init__(self, path, args, batchSize):
        """Set up an empty queue.

        path      -- the executable to run
        args      -- list of arguments placed before the queued paths
        batchSize -- number of queued paths that triggers a flush
        """
        self.path = path
        self.args = args
        self.queue = []
        self.batchSize = batchSize

    def flushQueue(self):
        """Run the command on all queued paths and empty the queue.

        Returns the command's exit status, or None if the queue was
        empty and nothing was run. A non-zero status is also reported
        on stderr so that a failing learner does not go unnoticed.
        """
        if not self.queue:
            return None
        call = [self.path] + self.args + self.queue
        status = subprocess.call(call)
        self.queue = []
        if status != 0:
            sys.stderr.write("warning: %s exited with status %d\n"
                             % (self.path, status))
        return status

    def learn(self, path):
        """Queue `path`, flushing once a full batch has accumulated."""
        self.queue.append(path)
        # >= rather than ==, so a queue that is already over the limit
        # (or a batch size of zero or less) still triggers a flush.
        if len(self.queue) >= self.batchSize:
            self.flushQueue()

def learnDir(dir):
    """Queue every file found under `dir` (recursively) for learning.

    A file whose name is already in the cache is skipped unless the
    force option is set. The cache is consulted for every file, so
    names that were not yet known are recorded as a side effect.
    """
    for root, _subdirs, names in os.walk(dir):
        for name in names:
            # The cache is keyed on the bare filename, not the path.
            seen = alreadyLearned.check(name)
            if seen and not options.force:
                continue
            salearn.learn(os.path.join(root, name))

def learnMaildir(maildir):
    """Learn the messages in the cur, new and tmp subdirectories of
    `maildir`, in that order."""
    # NOTE(review): tmp conventionally holds messages that are still
    # being delivered -- confirm that learning from it is intended.
    for subdir in ("cur", "new", "tmp"):
        learnDir(os.path.join(maildir, subdir))

            
# ---- Command-line interface ------------------------------------------
parser = optparse.OptionParser(
    usage="%prog [options] [maildirs]",
    description="Feed a maildir to sa-learn, "
                "using a cache of filenames that "
                "have previously been learned.")

# The three actions are plain boolean flags; that exactly one of them
# was given is verified after parsing.
actions = optparse.OptionGroup(
    parser, "Actions",
    "Action to take with each message. "
    "Must choose exactly one.")
for shortFlag, longFlag in (("-S", "--spam"),
                            ("-H", "--ham"),
                            ("-F", "--forget")):
    actions.add_option(shortFlag, longFlag, action="store_true")
parser.add_option_group(actions)

parser.add_option(
    "-f", "--force", action="store_true",
    help="pass messages to sa-learn even if previously seen")
parser.add_option(
    "-n", "--no-update", action="store_true",
    help="don't update the DB of previously-seen messages")
parser.add_option(
    "-c", "--cache", action="store", metavar="FILE", default=DEF_CACHE,
    help=("location of cache file, default %s" % DEF_CACHE))

(options, args) = parser.parse_args()

# ---- Driver ----------------------------------------------------------
# Map each selected action flag to the argument given to the learner.
salearnArgs = [learnerArg
               for chosen, learnerArg in ((options.spam, "learn_spam"),
                                          (options.ham, "learn_ham"),
                                          (options.forget, "--forget"))
               if chosen]

if len(salearnArgs) != 1:
    parser.error("must specify exactly one of --spam, --ham, or --forget")

alreadyLearned = AlreadyLearnedCache(options.cache)
salearn = SALearnBatcher("rspamc", salearnArgs, SALEARNCOUNT)

for maildir in args:
    learnMaildir(maildir)

# Hand over whatever is left in a final, partial batch.
salearn.flushQueue()
if not options.no_update:
    alreadyLearned.save()