import sys, os
import random
import heapq
from utils import *

from twisted.spread import pb

DEBUG = False
STOPWORDSFILE = "english.stop"

# Module variables for removing symbols and stopwords for keyword
# generation. These are set immediately below.
xchars = []
rchars = []
stopwords = []

for c in range(256):
    if ord("A") <= c <= ord("Z") or \
       ord("a") <= c <= ord("z") or \
       c >= 0x80:
        xchars.append(chr(c))
    else:
        xchars.append(" ")
xchars = "".join(xchars)
rchars = "'`"
stopwords = set([x.strip().replace("'","")
                 for x in file(STOPWORDSFILE)])


class Metadatum(pb.Copyable, pb.RemoteCopy):
    """Defines a metadata entry: a mapping from key to value. Keys are
    always strings, values can be any type. If isKeyword is true, the
    metadata entry is a 'keyword' metadatum: it can be indexed using
    KSS, and so the value must be a string. Immutable."""
    
    @typechecked(object, str, object, bool)
    def __init__(self, key, value, isKeyword):
        """Construct a metadatum mapping key to value."""
        self.key = key
        self.value = value
        self.keyword = isKeyword
        if isKeyword and not issubclass(type(value), str):
            raise TypecheckError("Keyword metadata value must be a string.")

    def __str__(self):
        if self.isKeyword():
            kwd = " [kwd]"
        else:
            kwd = ""
        return "%s%s = %s" % (self.getKey(), kwd, self.getValue())
        
    def getKey(self):
        """Return the metadatum's key."""
        return self.key

    def getValue(self):
        """Return the metadatum's value."""
        return self.value

    def isKeyword(self):
        """Returns true if the metadatum is a keyword metadatum."""
        return self.keyword

    def getKeywords(self):
        """Return the set of keywords contained in the keyword
        metadatum, as a set of (key, keyword) pairs. If this is not a
        keyword metadatum, a TypeError is raised."""
        if not self.isKeyword():
            raise TypeError("Can't get keywords of a non-keyword metadatum.")

        kwset = set(self.value.translate(xchars, rchars).lower().split())
        kwset -= stopwords
        return set([(self.key, x) for x in kwset])

    def __hash__(self):
        return genericHash([self.key, self.value, self.keyword])

    def __cmp__(self, other):
        if not isinstance(other, Metadatum):
            return -1
        elif self.key != other.key:
            return cmp(self.key, other.key)
        elif self.value != other.value:
            return cmp(self.value, other.value)
        else:
            return cmp(self.keyword, other.keyword)
        
pb.setUnjellyableForClass(Metadatum, Metadatum)

class Metadata(pb.Copyable, pb.RemoteCopy):
    """Defines a metadata block: the set of all metadata associated
    with a file. Note that currently only one value is supported for
    each metadata key."""
    def __init__(self):
        """Create a new empty metadata block."""
        self.metadata = {}

    def __str__(self):
        r = "Metadata:\n"
        for x in self.metadata.values():
            r += str(x) + "\n"
        return r

    @typechecked(object, str)
    def getMetadatum(self, key):
        """Return the metadatum with the specified key, or None if
        none exists."""
        try:
            return self.metadata[key]
        except KeyError:
            return None

    @typechecked(object, Metadatum)
    def setMetadatum(self, metadatum):
        """Add the specified metadatum to the metadata block. If
        another metadatum already exists with the same key, it is
        removed."""
        self.metadata[metadatum.getKey()] = metadatum

    def getKeywords(self):
        """Return all keywords contained in keyword metadata, as a
        set of tuples of (key, keyword)"""
        s = set()
        for x in self.metadata.values():
            if x.isKeyword():
                s.update(x.getKeywords())
        return s
        
    @typechecked(object, int)
    def getKSSSets(self, K):
        """Generate a list of all subsets of keyword metadata up to K
        keywords. This is the Keyword-Set Search algorithm described
        in Gnawali's thesis and the Arpeggio paper."""
        # XXX Consider caching this
        # Begin by generating up the keyword metadata
        kmd = list(self.getKeywords())

#        print "K =", K
#        print "len(kmd) =", len(kmd)
        if K > len(kmd):
            K = len(kmd)                # Full power set is smaller than K
            
        for k in range(1,K+1):          # Generate sets of size k (1 to K)
#            print "k =", k
            ind = range(k)              # Indices of each element in the set 
            yield frozenset([kmd[x] for x in ind])
            done = False
            while not done:
                # Find the next possible set of unique indices
                for i in reversed(range(k)):
                    ind[i] += 1
                    if ind[i] == len(kmd):
                        ind[i] = 0
                        if i == 0:
                            done = True
                    else:
                        break

                if done:
                    break
                
                if len(set(ind)) != k:
                    continue            # Indices not unique, try again

                # Make sure the indices are in monotonically
                # increasing order (if not, we get redundancy).
                if ind != sorted(ind):
                    continue
                
#                print ind
                yield frozenset([kmd[x] for x in ind])

    def getID(self):
        """Return an identifier for the metadata block. This is
        currently just its hash value, but is intended for exchanging
        between nodes."""
        return hash(self)

    def __eq__(self, other):
        if not isinstance(other, Metadata):
            return False
        if len(self.metadata) != len(other.metadata):
            return False
        for x in self.metadata.values():
            if x not in other.metadata.values():
                return False
        return True

    def __hash__(self):
        x = genericHash(sorted(self.metadata.values()))
        return x
                
pb.setUnjellyableForClass(Metadata, Metadata)


class MetadataRecord(pb.Copyable, pb.RemoteCopy):
    """Combines a Metadata block with its expiration time and a list
    of the index names to list it in. This representation is useful in
    replication."""
    # XXX Is there a better name for this thing?

    def __init__(self, metadata, expire, indexes):
        self.metadata = metadata
        self.expire = expire
        self.indexes = indexes          # Index names
        
pb.setUnjellyableForClass(MetadataRecord, MetadataRecord)


class Query(pb.Copyable, pb.RemoteCopy):
    """Representation of a query criterion. This consists of a
    required set of keyword parameters (necessary to identify the
    appropriate index), and perhaps other restrictions that can be
    implemented via index-side filtering."""
    
    def __init__(self):
        self.keywords = []

    def __str__(self):
        return "Query: keywords = " + str(self.keywords)

    @typechecked(object, str, str)
    def addKeyword(self, key, keyword):
        """Add a new keyword to the query."""
        self.keywords.append((key,keyword))

    @typechecked(object, int)
    def getIndexKeywordSet(self, K):
        """Select an index to send the query to, given the KSS
        parameter K. This is a random subset of the keywords in the
        query of size K (or less, if there aren't enough keywords in
        the query)."""
        if len(self.keywords) == 0:
            raise Exception("Query has no keywords")

        keywords = self.getKeywords()
        if len(keywords) == 0:
            raise Exception("Query appears to have no non-stopword keywords")

        if len(keywords) <= K:
            return set(keywords)
        else:
            return set(random.sample(keywords, K))

    @typechecked(object, Metadata)
    def matches(self, metadata):
        """Check whether a particular metadata block matches this
        query."""
        if not set(self.getKeywords()).issubset(metadata.getKeywords()):
            return False
        return True

    def getKeywords(self):
        """Return a copy of the set of keywords for this query."""
        kwset = set()
        for x in self.keywords:
            for y in x[1].translate(xchars, rchars).lower().split():
                if y not in stopwords:
                    kwset.add((x[0],y))
        return kwset
    
pb.setUnjellyableForClass(Query, Query)


class Index:
    """An Index holds Metadata entries and allows Querys to be
    performed on them. Each index corresponds to a certain keyword
    subset, and holds Metadata that contains this keyword subset. The
    Index also includes expiration functionality."""

    @typechecked(object, set)
    def __init__(self, name):
        """Create a new index with the given keyword set."""
        self.name = name
        self.entries = set()
        self.expireQueue = ExpirationQueue()
        
    def __str__(self):
        return "[Index %s]" % self.name
    
    @typechecked(object, Metadata, float)
    def add(self, md, expire):
        """Add a new metadata block to the index."""
        if md in self.entries:
            print "Index: ignoring add; metadata block already in index"
            self.expireQueue.refresh(md, expire)
            return                      # Ignore if already in index
        else:
            self.entries.add(md)
            self.expireQueue.add(md, expire)

    @typechecked(object, Metadata)
    def remove(self, md):
        """Remove a metadata block from the index. Raises a KeyError
        exception if the metadata block is not in the index."""
        try:
            self.entries.remove(md)
        except KeyError:
            raise KeyError("Metadata block not in index.")
    
    @typechecked(object, Query)
    def search(self, query):
        """Perform a query search, returning the set of entries that
        match the query."""
        return list([x for x in self.entries if query.matches(x)])

    def numEntries(self):
        """Return the number of entries in this index."""
        return len(self.entries)

    def __len__(self):
        """Return the number of entries in this index."""
        return self.numEntries()

    @typechecked(object, float)
    def expire(self, expireTime):
        """Remove all entries that are due to expire before
        expireTime. Returns the list of entires that were removed.

        N.B.: Deprecated now, as expiration is now handled by
        the IndexCollection."""
        removed = []
        while ((len(self.expireQueue) > 0) and
               self.expireQueue.min()[0] <= expireTime):
            print "Expiring index entry", self.expireQueue.min()
            ent = self.expireQueue.min()[1]
            self.entries.remove(ent)
            self.expireQueue.remove(ent)
            removed.append(ent)
        return removed

class IndexCollection:
    """An IndexCollection manages multiple Indexes."""

    def __init__(self):
        self.indexes = {}               # Maps name -> Index
        self.blocks = {}                # Maps id -> MetadataRecord
        self.expireQueue = ExpirationQueue()
        
    @typechecked(object, str, Metadata, float)
    def addToIndex(self, indexName, metadata, expireTime):
        """Add an entry to the appropriate index, creating the index
        if it does not already exist."""

        if DEBUG:
            print "Adding to index", indexName, expireTime
        if indexName not in self.indexes:
            self.indexes[indexName] = Index(indexName)
        self.indexes[indexName].add(metadata, expireTime)
        if metadata.getID() in self.blocks:
            rec = self.blocks[metadata.getID()]
            self.refreshExpire(metadata.getID(), expireTime)
            if indexName not in rec.indexes:
                rec.indexes.append(indexName)
        else:
            rec = MetadataRecord(metadata, expireTime, [indexName])
            self.blocks[metadata.getID()] = rec
            self.expireQueue.add(rec, expireTime)

    @typechecked(object, int, float)
    def refreshExpire(self, blockID, expireTime):
        """Increase the expiry time for the block with ID blockID to
        expireTime, if this is greater than the current expiry
        time."""
        if blockID not in self.blocks:
            raise KeyError("Tried to refresh expiry on a nonexistent block")

        rec = self.blocks[blockID]
        rec.expire = max(expireTime, rec.expire)
        self.expireQueue.refresh(rec, rec.expire)    
       
    @typechecked(object, str, Query)
    def search(self, indexName, query):
        """Perform a search by passing the query to the appropriate
        index."""        
        if indexName not in self.indexes:
            print "Attempting to search index", indexName, \
                  "which doesn't exist"
            return []
        else:
            return self.indexes[indexName].search(query)

    @typechecked(object, int)
    def getByID(self, blockID):
        """Return the MetadataRecord for the Metadata block with with
        ID = blockID, or None if no such block exists in the index."""
        try:
            return self.blocks[blockID]
        except KeyError:
            return None

    @typechecked(object, float)
    def expire(self, expireTime):
        """Expires all entries in all indexes that are due to expire
        before expireTime. Removes indexes if they've become
        empty. Returns the expired MetadataRecords."""

        removed = []
#         if len(self.expireQueue) == 0:
#             print "Expiring, t =", expireTime, "  queue empty"
#         else:
#             print "Expiring, t =", expireTime, "  min =", \
#                   self.expireQueue.min()[0]
            
        while ((len(self.expireQueue) > 0) and
               self.expireQueue.min()[0] <= expireTime):
            print "Expiring index entry", self.expireQueue.min()
            ent = self.expireQueue.min()[1]
            # Remove from blocks list
            del self.blocks[ent.metadata.getID()]
            # Remove from indexes
            for index in ent.indexes:
                self.indexes[index].remove(ent.metadata)
                if len(self.indexes[index]) == 0:
                    del self.indexes[index]
            self.expireQueue.remove(ent)
            removed.append(ent)
        return removed


    def numIndexes(self):
        """Return the number of indexes in this collection."""
        return len(self.indexes)

    def numEntries(self):
        """Return the total number of entries in every index in this
        collection."""
        return sum(x.numEntries() for x in self.indexes.values())

    def showStats(self):
        """Print the number of entries and indexes."""
        print "Index collection statistics:"
        print " Number of indexes:", self.numIndexes()
        print " Number of entries:", self.numEntries()

    def dumpData(self):
        """Print the contents of the index collection."""
        print "DEBUG: dumping index collection contents"
        print "Indexes:"
        for index in self.indexes.itervalues():
            print " %s: %s" % (index.name, " ".join([str(hash(m))
                                                     for m
                                                     in index.entries]))
    
@typechecked(set)
def indexNameFromSet(s):
    """Convert a Set of tuples of (key, keyword) to an index name as a
    string."""
    n = ""
    # XXX This is just a temporary representation. It should be made
    # robust to all sorts of things that could show up in a key or
    # keyword.

    l = list(s)
    l.sort()
    for x in l:
        n += x[0] + ":" + x[1] + "&"

    return n
