# Import Psyco if available
try:
    import psyco
    psyco.full()
except ImportError:
    pass

from runscraper import *
from utils import *
import Gnuplot, Gnuplot.funcutils

# Largest keyword count tracked by the distribution histograms below; each
# histogram is allocated with KEYWORDSMAX + 1 buckets (indices 0..KEYWORDSMAX).
KEYWORDSMAX = 50
# Log read line by line by getGnutellaQueryKeywordDistribution().
QUERYLOG = "../gnutella-indexer/queries.txt"
# Log read by getGnutellaFileKeywordDistribution(); only the text before the
# first tab of each line is used.
GNUTELLARESULTLOG = "../gnutella-indexer/results.txt"
# Store holding the Torrent items whose "torrentname" column is analysed.
# NOTE(review): Store and Torrent presumably come from the star imports
# above -- confirm.
torrentStore = Store("torrents.axiom")

#
# Utility functions
#
def nCr(x, y):
    """Return the binomial coefficient "x choose y" as an exact integer.

    x -- size of the set (non-negative integer).
    y -- number of elements chosen (integer).

    Returns 0 when y is negative or larger than x, because no such
    selection exists.
    """
    if y < 0 or x < y:
        return 0
    # C(x, y) == C(x, x - y); iterate over the smaller of the two.
    y = min(y, x - y)
    result = 1
    for i in range(1, y + 1):
        # After step i the running value equals C(x - y + i, i), so the
        # floor division is always exact and the result stays an integer.
        result = result * (x - y + i) // i
    return result

def fact(x):
    """Return x! (the factorial of x).

    x -- a non-negative whole number.

    Raises ValueError for negative or fractional x.  The product is built
    iteratively, so large arguments do not run into the recursion limit.
    """
    if x < 0 or x != int(x):
        raise ValueError(
            "fact() needs a non-negative whole number, got %r" % (x,))
    result = 1
    while x > 0:
        result *= x
        x -= 1
    return result

def I(m, K):
    """Count the keyword subsets of an m-keyword item up to size K.

    m -- number of keywords.
    K -- largest subset size counted; 0 is special and means "no limit".

    For K == 0 the result is 2**m - 1, i.e. every non-empty subset.
    Otherwise it is nCr(m, 1) + nCr(m, 2) + ... + nCr(m, K).
    """
    if K != 0:
        return sum(nCr(m, size) for size in range(1, K + 1))
    return 2**m - 1

def avg(x):
    """Return the arithmetic mean of the sized collection x as a float.

    An empty collection raises ZeroDivisionError.
    """
    total = float(sum(x))
    return total / len(x)


#
# Gnuplotters
#

# Single gnuplot session shared by the export helpers (latex, png) and the
# plot functions (plotKeywordDistribution, plotISvK) in this module.
# NOTE(review): debug=1 presumably makes Gnuplot.py echo every command it
# sends -- confirm against the Gnuplot.py documentation.
g = Gnuplot.Gnuplot(debug=1)

def latex(f, size):
    """Replot the current figure through gnuplot's epslatex terminal.

    f    -- target file name; every ".tex" in it is replaced by ".eps" to
            form the name handed to gnuplot as its output.
    size -- scale factor, used for both arguments of 'set size'.

    The active terminal is pushed before the export and popped afterwards,
    and the output setting is cleared again at the end.  The 'set format'
    and 'set size' commands are not undone here.
    """
    setup = ('set terminal push',
             'set terminal epslatex "default" 10',
             'set format xy "$%g$"')
    for command in setup:
        g(command)
    g.set_string('output', f.replace(".tex", ".eps"))
    g('set size %f, %f' % (size, size))
    g.refresh()
    g('set terminal pop')
    g.set_string('output')
    # NOTE: a former post-processing step that used ed to rewrite the path
    # inside the generated file so it pointed into figures/ is disabled.

def png(f):
    """Replot the current figure into the PNG file named f.

    The active terminal is pushed before the export and popped afterwards,
    and the output setting is cleared again at the end.
    """
    for command in ('set terminal push', 'set terminal png'):
        g(command)
    g.set_string('output', f)
    g.refresh()
    g('set terminal pop')
    g.set_string('output')

def latexandpng(f, size):
    """Export the current plot twice, using f as the base file name.

    latex() is called with f + ".tex" and the given size, then png() is
    called with f + ".png".
    """
    texname = f + ".tex"
    latex(texname, size)
    pngname = f + ".png"
    png(pngname)
#
# Stopwords stuff
#

STOPWORDSFILE = "english.stop"

# Module variables for removing symbols and stopwords for keyword
# generation:
#   xchars    -- 256-entry translation table that keeps ASCII letters and
#                every character code >= 0x80, and turns everything else
#                (digits, punctuation, whitespace, control codes) into a
#                space.
#   rchars    -- characters that are deleted outright instead of being
#                turned into a space.
#   stopwords -- set of the lines of STOPWORDSFILE, stripped and with
#                apostrophes removed.
xchars = []
for c in range(256):
    if ord("A") <= c <= ord("Z") or \
       ord("a") <= c <= ord("z") or \
       c >= 0x80:
        xchars.append(chr(c))
    else:
        xchars.append(" ")
xchars = "".join(xchars)
rchars = "'`"

# Apostrophes are stripped from the stopwords because keywords() deletes
# them (via rchars) from the text it tokenises; the two must agree for the
# set difference to match.  try/finally rather than `with` keeps this
# working on older Python 2 interpreters while still closing the file.
_stopwordsfile = open(STOPWORDSFILE)
try:
    stopwords = set([x.strip().replace("'", "")
                     for x in _stopwordsfile])
finally:
    _stopwordsfile.close()


def keywords(s, removeStopwords=True):
    """Return the set of lower-cased keywords found in the string s.

    s               -- text to tokenise.
    removeStopwords -- when true (the default) members of the module-level
                       stopwords set are left out of the result.

    The characters in rchars are deleted, the rest is mapped through the
    xchars table, and the result is lower-cased and split on whitespace.
    NOTE(review): the two-argument translate() call is the Python 2 byte
    string form -- confirm callers never pass unicode objects.
    """
    cleaned = s.translate(xchars, rchars)
    words = set(cleaned.lower().split())
    if not removeStopwords:
        return words
    return words - stopwords


def printIndexMinMaxAvg():
    """Print min/max/average of I(keyword count, K) over all torrent names.

    First prints the number of torrent names, then one line per K in
    0..4 of the form "K -- [min, max, avg]".  Keyword counts are taken
    with stopwords removed (the keywords() default).
    """
    names = torrentStore.query(Torrent).getColumn("torrentname")
    counts = [len(keywords(name)) for name in names]

    print(len(counts))

    for K in range(5):
        sizes = [I(count, K) for count in counts]
        print("%s -- %s" % (K, [min(sizes), max(sizes), avg(sizes)]))

def getTorrentKeywordDistribution(removeStopwords):
    """Build a histogram of keyword counts over all torrent names.

    removeStopwords -- passed on to keywords(); when true, stopwords are
                       not counted.

    Returns a list of KEYWORDSMAX + 1 integers whose entry i is the number
    of torrent names that yielded exactly i keywords.  Names with more
    than KEYWORDSMAX keywords are left out of the histogram, matching
    getGnutellaFileKeywordDistribution().

    NOTE: a second definition with the same name appears further down in
    this module; that later one is what callers end up bound to.
    """
    keywordsdist = [0] * (KEYWORDSMAX + 1)

    for name in torrentStore.query(Torrent).getColumn("torrentname"):
        count = len(keywords(name, removeStopwords))
        # Skip over-long names instead of failing with IndexError.
        if count <= KEYWORDSMAX:
            keywordsdist[count] += 1

    return keywordsdist

def getTorrentKeywordPopularity():
    """Count in how many torrent names each keyword occurs.

    Stopwords are kept (keywords() is called with removeStopwords=False).
    Because keywords() returns a set, a word repeated within one name is
    counted once for that name.

    Returns a dict mapping keyword -> number of names containing it.
    """
    popularity = {}
    for name in torrentStore.query(Torrent).getColumn("torrentname"):
        for word in keywords(name, False):
            popularity[word] = popularity.get(word, 0) + 1
    return popularity

def getTorrentKeywordDistribution(removeStopwords):
    """Build a histogram of keyword counts over all torrent names.

    removeStopwords -- passed on to keywords(); when true, stopwords are
                       not counted.

    Returns a list of KEYWORDSMAX + 1 integers whose entry i is the number
    of torrent names that yielded exactly i keywords.  Names with more
    than KEYWORDSMAX keywords are left out of the histogram, matching
    getGnutellaFileKeywordDistribution().

    NOTE: this rebinds a name that is already defined earlier in this
    module; being the later definition, it is the one callers get.
    """
    keywordsdist = [0] * (KEYWORDSMAX + 1)

    for name in torrentStore.query(Torrent).getColumn("torrentname"):
        count = len(keywords(name, removeStopwords))
        # Skip over-long names instead of failing with IndexError.
        if count <= KEYWORDSMAX:
            keywordsdist[count] += 1

    return keywordsdist

def getGnutellaQueryKeywordDistribution(removeStopwords):
    """Build a histogram of keyword counts over the lines of QUERYLOG.

    removeStopwords -- passed on to keywords(); when true, stopwords are
                       not counted.

    Returns a list of KEYWORDSMAX + 1 integers whose entry i is the number
    of lines that yielded exactly i keywords.  Lines with more than
    KEYWORDSMAX keywords are left out of the histogram, matching
    getGnutellaFileKeywordDistribution().
    """
    keywordsdist = [0] * (KEYWORDSMAX + 1)

    f = open(QUERYLOG, "r")
    try:
        # Iterate the file lazily rather than loading it with readlines().
        for line in f:
            count = len(keywords(line, removeStopwords))
            # Skip over-long lines instead of failing with IndexError.
            if count <= KEYWORDSMAX:
                keywordsdist[count] += 1
    finally:
        # try/finally rather than `with` so older Python 2 interpreters
        # still work; either way the log is closed on every exit path.
        f.close()

    return keywordsdist

def getGnutellaFileKeywordDistribution(removeStopwords):
    """Build a histogram of keyword counts over GNUTELLARESULTLOG.

    removeStopwords -- passed on to keywords(); when true, stopwords are
                       not counted.

    Only the text before the first tab of each line is tokenised.

    Returns a list of KEYWORDSMAX + 1 integers whose entry i is the number
    of lines that yielded exactly i keywords.  Lines with more than
    KEYWORDSMAX keywords are left out of the histogram.
    """
    keywordsdist = [0] * (KEYWORDSMAX + 1)

    f = open(GNUTELLARESULTLOG, "r")
    try:
        for line in f:
            count = len(keywords(line.split("\t")[0], removeStopwords))
            # Explicit bound check; the count is never negative, so this
            # skips exactly the entries that would raise IndexError.
            if count <= KEYWORDSMAX:
                keywordsdist[count] += 1
    finally:
        # try/finally rather than `with` so older Python 2 interpreters
        # still work; either way the log is closed on every exit path.
        f.close()

    return keywordsdist

def printTopKeywords(kws):
    """Print the keywords of kws ordered by descending count.

    kws -- dict mapping keyword -> count, e.g. the result of
           getTorrentKeywordPopularity().

    Each line shows the keyword, its count and a trailing "*" when the
    keyword is in the module-level stopwords set.
    """
    ranked = sorted(kws.items(), key=lambda item: item[1], reverse=True)
    for word, count in ranked:
        marker = ""
        if word in stopwords:
            marker = "*"
        print("%30s %6d %s" % (word, count, marker))

def plotKeywordDistribution(*dists):
    """Plot normalised keyword-count histograms on the shared gnuplot session.

    dists -- any number of (histogram, title) pairs; histogram is a
             sequence whose entry i counts the items with i keywords.

    Each histogram is scaled so that its entries sum to 1 and is plotted
    as one series labelled with its title.  The mean keyword count of each
    series is printed as a side effect.  A non-empty histogram whose
    entries sum to zero raises ZeroDivisionError.
    """
    datae = []
    for dist, disttitle in dists:
        # Sum once per histogram instead of once per entry.
        total = float(sum(dist))
        scaleddist = [x / total for x in dist]
        datae.append(Gnuplot.Data(scaleddist, title=disttitle))
        distmean = sum(x * i for i, x in enumerate(scaleddist))
        print("%s   mean = %s" % (disttitle, distmean))
    g('set data style linespoints')
    g('set nologscale')
    g('set key left Left reverse')
    g('set ylabel "Fraction of files" 1.5, 0')
    g('set xlabel "Number of keywords" 0, .5')
    g('set xrange [1:20]')
    g.plot(*datae)

def plotISvK(dists, maxK):
    """Plot index size against K, relative to the size at K = 1.

    dists -- sequence of (histogram, title) pairs; histogram entry k is the
             number of items with k keywords.
    maxK  -- index sizes are computed for K = 0 .. maxK - 1.  Must be at
             least 2, because the K = 1 entry is the normalisation
             baseline (IndexError otherwise).

    For every K the index size is the sum over k of histogram[k] * I(k, K).
    Each series is divided by its K = 1 value and plotted with a
    logarithmic y axis.  A zero baseline raises ZeroDivisionError.
    """
    datae = []
    for dist, distTitle in dists:
        indexsize = [sum(n * I(k, K) for k, n in enumerate(dist))
                     for K in range(maxK)]
        # Divide as floats: the sizes are integers, and integer division
        # would truncate every ratio to a whole number.
        baseline = float(indexsize[1])
        datae.append(Gnuplot.Data([x / baseline for x in indexsize],
                                  title=distTitle))
    g('set data style linespoints')
    g('set logscale y 10')
    g('set key left Left reverse')
    g('set ylabel "Index size increase\\\\\\\\vs $K=1$" 1.5, 0')
    g('set xlabel "$K$" 0, .5')
    g('set xrange [1:15]')
    g.plot(*datae)


# Hard-coded keyword-count histograms for FreeDB, 51 entries each, in the
# same layout the get*KeywordDistribution() functions return: entry i is
# the number of items with i keywords.  freeDBKWDist is plotted under the
# label "FreeDB" and freeDBKWDistSW under "FreeDB (exc. stopwords)".
# NOTE(review): presumably precomputed offline from a FreeDB dump -- the
# code that produced these numbers is not in this file.
freeDBKWDist = [4611, 24216, 203040, 846441, 1522208, 2400236,
                3227650, 3703407, 3733666, 3359390, 2769268, 2123046,
                1532918, 1052366, 693181, 443693, 277357, 167547,
                99245, 57591, 34114, 20244, 12242, 7845, 5062, 3310,
                2350, 1667, 1116, 748, 540, 398, 320, 201, 140, 85,
                61, 50, 28, 27, 15, 7, 12, 6, 1, 3, 3, 1 , 1, 0, 1]
freeDBKWDistSW = [15012, 112901, 639438, 2039520, 3627477, 
                  4822337, 4862968, 4000040, 2943488, 2001663,
                  1294067, 799768, 485333, 287606, 167497, 95304,
                  54621, 31697, 18330, 11204, 7146, 4619, 3118, 2088,
                  1385, 953, 748, 413, 294, 216, 159, 101, 60, 29, 14,
                  19, 20, 11, 4, 1, 4, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0]

# First figure: keyword-count distributions with stopwords removed.
plotKeywordDistribution(
    (getTorrentKeywordDistribution(True), "Torrents (exc. stopwords)"),
    (freeDBKWDistSW, "FreeDB (exc. stopwords)"),
    (getGnutellaFileKeywordDistribution(True), "Gnutella (exc. stopwords)"),
    # Optional extra series:
    # (getGnutellaQueryKeywordDistribution(True),
    #  "Gnutella queries (exc. stopwords)"),
    )
latexandpng("keyword-dist-stopwords", 0.675)

# Second figure: the same three sources with stopwords kept.
# NOTE(review): the "no-stopwords" in the output name presumably means "no
# stopword filtering" -- the series below are built with
# removeStopwords=False.
plotKeywordDistribution(
    (getTorrentKeywordDistribution(False), "Torrents"),
    (freeDBKWDist, "FreeDB"),
    # Built with removeStopwords=False, so the legend must not claim that
    # stopwords were excluded; labelled like the two series above.
    (getGnutellaFileKeywordDistribution(False), "Gnutella"),
    # Optional extra series:
    # (getGnutellaQueryKeywordDistribution(False), "Gnutella queries"),
    )
latexandpng("keyword-dist-no-stopwords", 0.675)

# plotKeywordDistribution(
#     (getGnutellaFileKeywordDistribution(False), "Gnutella queries"),
#     (getGnutellaFileKeywordDistribution(True), "Gnutella queries (exc. stopwords)"),
#     )
# latexandpng("gnutella-query-dist", 0.675)

# plotISvK([
#         (getTorrentKeywordDistribution(True), "Torrents"),
#         (freeDBKWDistSW, "FreeDB"),
#         (getGnutellaFileKeywordDistribution(True), "Gnutella")
#           ],
#           20)
# latexandpng("index-size-vs-k-stopwords", 0.675)
                             
# plotISvK([(getTorrentKeywordDistribution(False), "Torrents"),
#           (freeDBKWDist, "FreeDB"),
#           (getGnutellaFileKeywordDistribution(False), "Gnutella")],
#           20)
# latexandpng("index-size-vs-k-no-stopwords", 0.675)
# raw_input()

