# The FreeDB archive below can't even be _put_ in a Subversion-controlled
# directory, or svn fails miserably.  The archive may also be
# bz2-compressed (as this one is), but then it'll run about 10% slower.
FREEDB = "/u/drkp/freedb-complete-20061101.tar.bz2"
# Shell command that extracts every archive member to stdout (tar -O), so
# the whole database can be read as one stream.
FREEDB_COMMAND = "tar -xOf " + FREEDB

# Index-entry totals are accumulated for every K from 0 through KMAX.
KMAX = 50
# The keyword-count histograms have one slot per count from 0 through
# KEYWORDSMAX.
KEYWORDSMAX = 50

# Import Psyco if available.  It is optional: when the module is missing
# the ImportError is ignored and the script simply runs without it.
try:
    import psyco
    psyco.full()
except ImportError:
    pass

from funcs import *
import tarfile
import os
import time

# Interpreters without a builtin set type raise NameError here; in that
# case bind sets.Set to the name "set" and warn that the fallback is in use.
try:
    set()
except NameError:
    import warnings
    warnings.warn("Using sets module")
    from sets import Set as set

# Running totals.  They are updated while the archive is scanned and
# reported by showstats().

# Entries and songs seen so far.
discs = 0
songs = 0

# indexentries[K] is the index-entry total for that K (0 through KMAX).
indexentries = [0] * (KMAX + 1)

# Total keywords counted over all songs (after stopword removal).
keywords = 0

# Sum of the parsed disc lengths, and how many length lines failed to parse.
seconds = 0
secondparseerrors = 0

# Histograms of keywords per song: keywordsdist is tallied before stopwords
# are removed, keywordsdistsw afterwards.
keywordsdist = [0] * (KEYWORDSMAX + 1)
keywordsdistsw = [0] * (KEYWORDSMAX + 1)

# Reference point for the "Computation time" statistic.
starttime = time.time()

def _per_song(total):
    """Return total divided by the number of songs, or 0.0 if there are none."""
    if not songs:
        return 0.0
    return float(total) / songs


def showstats():
    """Print the statistics gathered so far to standard output.

    Reads the module-level counters (discs, songs, keywords, keywordsdist,
    keywordsdistsw, indexentries, seconds, secondparseerrors, starttime).

    The per-song averages are reported as 0.0 while no song has been
    counted yet, instead of raising ZeroDivisionError; this matters when
    the input stream is empty or the extraction command failed.

    Every print call passes a single pre-formatted string, so the output is
    the same whether print is a statement (Python 2) or a function.
    """
    print("")
    print("Number of discs: %s" % (discs,))
    print("Number of songs: %s" % (songs,))
    print("Keywords per songs: %s" % (_per_song(keywords),))
    print("Keywords distribution: %s" % (keywordsdist,))
    # NOTE: keywordsdistsw is tallied by insert() *after* stopwords have been
    # removed from the keyword set; the label is kept as-is.
    print("Keywords distribution (w/ stopwords): %s" % (keywordsdistsw,))
    print("Number of index entries: %s" % (indexentries,))
    print("Entries per song: %s" % ([_per_song(x) for x in indexentries],))
    print("Total song time (seconds): %s" % (seconds,))
    print("Parse errors for song time: %s" % (secondparseerrors,))
    print("Computation time (seconds): %s" % (time.time() - starttime,))
def findxchars():
    """Build the module-level tables used while scanning the archive.

    Despite its name this sets four globals:

      xchars    -- 256-character translation table that keeps ASCII letters
                   and every character code >= 0x80, and maps everything
                   else to a space.
      rchars    -- characters deleted outright: apostrophe and backquote.
      stopwords -- set of the words listed in the file "english.stop" (one
                   per line, read from the current directory), with
                   apostrophes removed.
      Ilookup   -- Ilookup[K][k] == I(k, K) for K in 0..KMAX and
                   k in 0..KEYWORDSMAX.

    Raises IOError if "english.stop" cannot be opened.
    """
    global xchars, rchars, stopwords, Ilookup

    table = []
    for c in range(256):
        if ord("A") <= c <= ord("Z") or \
           ord("a") <= c <= ord("z") or \
           c >= 0x80:
            table.append(chr(c))
        else:
            table.append(" ")
    xchars = "".join(table)
    rchars = "'`"

    # Apostrophes are stripped from the stopwords because rchars deletes
    # them from the song text as well, so e.g. "don't" is seen as "dont".
    stopfile = open("english.stop")
    try:
        stopwords = set([line.strip().replace("'", "") for line in stopfile])
    finally:
        # try/finally rather than "with" to stay usable on old interpreters.
        stopfile.close()

    # One column per possible keyword count.  The histograms have
    # KEYWORDSMAX+1 slots, so the table needs the same width; the previous
    # hard-coded range(50) left out the column for exactly 50 keywords.
    Ilookup = [[I(k, K) for k in range(KEYWORDSMAX + 1)]
               for K in range(KMAX + 1)]
# Must run before insert() is defined below: insert's default argument
# captures the Ilookup table that this call creates.
findxchars()

def insert(kwset, Ilookup = Ilookup):
    """Fold one song's keyword set into the running statistics.

    kwset   -- set of keywords for the song.  Stopwords are removed from it
               in place, so the caller's set is modified.
    Ilookup -- table indexed as Ilookup[K][number of keywords].  It defaults
               to the module-level table, captured when this def executes.

    Updates keywordsdist, keywordsdistsw, songs, indexentries and keywords.
    Raises IndexError if the keyword count has no slot in the histograms or
    in the lookup table.
    """
    global songs, keywords

    # Tally the raw size first, while the stopwords are still present.
    keywordsdist[len(kwset)] += 1

    kwset -= stopwords
    kept = len(kwset)
    keywordsdistsw[kept] += 1

    songs = songs + 1
    # Add this song's contribution for every K from 0 through KMAX.
    for K in range(KMAX + 1):
        indexentries[K] = indexentries[K] + Ilookup[K][kept]
    keywords = keywords + kept

def parsefreedb(f, datadict=None):
    """Parse the KEY=value lines of a freedb entry into a dictionary.

    Not actually used, but kept around for posterity.

    f        -- file-like object providing readlines().
    datadict -- optional dictionary to fill; it is emptied first.  When
                omitted a new dictionary is created for this call (the old
                shared default was never reachable by the caller).

    Lines whose first character is "#" are skipped.  Blank lines and lines
    without "=" are skipped as well; they used to abort parsing with a
    ValueError.  Surrounding whitespace is stripped from each line before it
    is split at the first "=".  A key that occurs more than once keeps only
    its last value.

    Returns the filled dictionary.
    """
    if datadict is None:
        datadict = {}
    datadict.clear()
    for line in f.readlines():
        if line.startswith("#"):
            continue
        line = line.strip()
        if "=" not in line:
            continue
        key, value = line.split("=", 1)
        datadict[key] = value
    return datadict

def strtokw(s, kwset=set(), removeStopwords=False):
    """Replace the contents of kwset with the keywords found in s.

    s               -- text to split into keywords.
    kwset           -- set that receives the result; it is emptied and
                       refilled in place.  Callers are expected to pass
                       their own set, since nothing is returned.
    removeStopwords -- when true, members of the module-level stopwords set
                       are removed from kwset afterwards.

    The text is run through the xchars translation table with the rchars
    characters deleted, lower-cased, and split on whitespace.
    """
    # Empty the caller's set before anything else; it is reused, not rebound.
    kwset.clear()
    cleaned = s.translate(xchars, rchars)
    for word in cleaned.lower().split():
        kwset.add(word)
    if not removeStopwords:
        return
    kwset -= stopwords

def insertentries(f, dtitle=None, kwset=None):
    """Scan a stream of concatenated freedb entries and record every song.

    f      -- file-like object providing readline(); read until it returns
              an empty string.
    dtitle -- optional scratch set holding the current disc's title
              keywords.  A fresh set is used when omitted, so title keywords
              can no longer carry over from an earlier call.
    kwset  -- optional scratch set holding the current song's keywords.

    Updates the module-level counters discs, seconds and secondparseerrors,
    calls insert() once per track-title line, and calls showstats() before
    each disc header that follows a positive multiple of 1000 discs.
    """
    global discs, seconds, secondparseerrors
    if dtitle is None:
        dtitle = set()
    if kwset is None:
        kwset = set()

    for line in iter(f.readline, ""):
        if len(line) < 2:
            continue

        if line[0] == '#' and len(line) > 5:
            if line[2:6] == "xmcd":
                # "# xmcd" header: a new disc entry starts here.
                if discs > 0 and (discs % 1000) == 0:
                    showstats()
                discs += 1
                dtitle.clear()
            elif line[2] == 'D':
                # Presumably the "# Disc length: N seconds" comment; the
                # fourth whitespace-separated field is taken as the length.
                try:
                    seconds += int(line.split()[3])
                except (ValueError, IndexError):
                    # Only genuine parse failures are counted.  The former
                    # bare "except:" also swallowed KeyboardInterrupt.
                    secondparseerrors += 1
            continue

        # Only lines whose second character is 'T' are of interest here.
        if line[1] != 'T':
            continue

        if line[0] == 'D':
            # Disc title: the value follows the 7-character "DTITLE=".
            # NOTE(review): a repeated DTITLE line replaces, rather than
            # extends, the previous title keywords -- confirm whether the
            # format allows continuation lines.
            strtokw(line[7:-1], dtitle)
        elif line[0] == 'T':
            if len(line) < 8:
                # Truncated track line; line[7] below would raise IndexError.
                continue
            # Track title: "=" sits at index 7 for a one-digit track number,
            # otherwise the value is taken to start one character later.
            if line[7] == '=':
                strtokw(line[8:-1], kwset)
            else:
                strtokw(line[9:-1], kwset)
            # A song is indexed under its own and its disc's title keywords.
            kwset.update(dtitle)
            insert(kwset)
            
def main():
    """Stream the FreeDB archive through insertentries() and print the totals.

    The extraction command is run through os.popen().  The pipe is always
    closed, and a warning is written to standard error when the command
    reports a non-zero exit status, because the statistics are then based
    on incomplete (possibly empty) input.
    """
    f = os.popen(FREEDB_COMMAND)
    try:
        insertentries(f)
    finally:
        # close() waits for the command and returns None on success.
        status = f.close()

    if status is not None:
        import sys
        sys.stderr.write("warning: %r exited with status %s; "
                         "statistics may be incomplete\n"
                         % (FREEDB_COMMAND, status))

    showstats()

# Run the scan only when executed as a script, not when imported.
if __name__ == "__main__":
    main()

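# Output recorded from a past run (apparently of an earlier version of this
# script, which reported a single index-entry total rather than one per K):
# Number of discs: 1000000
# Number of songs: 13333859
# Number of index entries: 881075985
# Entries per song: 66.0780937462
# Keywords per songs: 6.27440675651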
