# Path of the freedb archive that main() opens and scans.
# This file can't even be _put_ in a subversion-controlled directory
# or svn fails miserably.  This can also be a bz2, but it'll run about
# 10% slower
FREEDB = "/usr/home/dan/tmp/freedb/freedb-complete-20050104.tar"

# Passed as the second argument to I() when the Ilookup table is built.
# NOTE(review): presumably the maximum number of keywords combined into one
# index entry -- confirm against the definition of I().
K = 3

from funcs import *
import tarfile

# Probe for the built-in set type; if the name does not exist, warn and
# bind the name "set" to sets.Set so the rest of the module can use it.
try:
    set()
except NameError:
    import warnings
    warnings.warn("Using sets module")
    from sets import Set as set

# Running totals, updated by insertentry() (discs) and insert() (the
# other three) and reported by showstats().  All start at zero.
discs = songs = indexentries = keywords = 0

def showstats():
    """Print the running totals and the per-song averages to stdout.

    Reads the module-level counters ``discs``, ``songs``, ``indexentries``
    and ``keywords``.  When no song has been counted yet the two averages
    are reported as ``n/a`` instead of raising ZeroDivisionError.
    """
    # A single parenthesised argument prints the same text whether "print"
    # is a statement or a function.
    print("Number of discs: %s" % (discs,))
    print("Number of songs: %s" % (songs,))
    print("Number of index entries: %s" % (indexentries,))
    if songs:
        print("Entries per song: %s" % (float(indexentries) / songs,))
        print("Keywords per songs: %s" % (float(keywords) / songs,))
    else:
        # Nothing to average over (e.g. an archive without any track lines).
        print("Entries per song: n/a")
        print("Keywords per songs: n/a")

def findxchars():
    """Build the module-level tables used while tokenising and counting.

    Sets four globals:

    * ``xchars``    -- 256-character translation table that keeps ASCII
                       letters and every character >= 0x80 and maps all
                       other characters to a space.
    * ``rchars``    -- characters deleted outright during translation
                       (apostrophe and backquote).
    * ``stopwords`` -- set of the lines of the file ``english.stop`` in the
                       current directory, stripped of surrounding
                       whitespace and with apostrophes removed.
    * ``Ilookup``   -- list of ``I(k, K)`` for k in 0..49.
    """
    global xchars, rchars, stopwords, Ilookup

    table = []
    for c in range(256):
        if ord("A") <= c <= ord("Z") or \
           ord("a") <= c <= ord("z") or \
           c >= 0x80:
            table.append(chr(c))
        else:
            table.append(" ")
    xchars = "".join(table)
    rchars = "'`"

    # Close the stop-word file explicitly instead of leaving it to the
    # garbage collector.
    stopfile = open("english.stop")
    try:
        # Apostrophes are removed here because rchars removes them from the
        # translated text as well, so both sides use the same spelling.
        stopwords = set([word.strip().replace("'", "") for word in stopfile])
    finally:
        stopfile.close()

    Ilookup = [I(k, K) for k in range(50)]
# Build the tables at import time.  This has to happen before insert() is
# defined, because insert() takes Ilookup as a default argument and default
# values are evaluated when the def statement runs.
findxchars()

def insert(kwset, Ilookup = Ilookup):
    """Account for one song whose keyword set is ``kwset``.

    Increments the global ``songs`` counter by one, ``keywords`` by
    ``len(kwset)`` and ``indexentries`` by the number of index entries for
    a keyword set of that size.

    ``Ilookup`` is a table indexed by keyword count; the default is the
    module-level table of precomputed ``I(k, K)`` values, bound as a default
    argument so the lookup is a local-variable access.  Sizes beyond the
    end of the table are computed directly with ``I(x, K)`` instead of
    raising IndexError.
    """
    global songs, indexentries, keywords
    x = len(kwset)
    songs += 1
    if x < len(Ilookup):
        indexentries += Ilookup[x]
    else:
        # Larger than the precomputed table: compute the value the table
        # would have held.
        indexentries += I(x, K)
    keywords += x


def parsefreedb(f, datadict=None):
    """Parse ``KEY=value`` lines from the file-like object ``f``.

    Lines whose first character is ``#`` and lines that are empty after
    stripping whitespace are skipped.  Every other line is stripped and
    split at the first ``=``; a later occurrence of a key overwrites an
    earlier one.

    If ``datadict`` is given it is cleared and filled in place, so one dict
    can be reused across calls; otherwise a new dict is created.  The
    filled dict is returned in both cases.

    Raises ValueError if a non-blank, non-comment line contains no ``=``.
    """
    # Not actually used, but kept around for posterity
    if datadict is None:
        datadict = {}
    else:
        datadict.clear()
    for line in f.readlines():
        if line[:1] == "#":
            continue
        entry = line.strip()
        if not entry:
            # Blank line: nothing to split.
            continue
        key, value = entry.split("=", 1)
        datadict[key] = value
    return datadict

def strtokw(s, kwset=None):
    """Convert the string ``s`` into a set of keywords.

    ``s`` is run through the ``xchars`` translation table with the
    ``rchars`` characters deleted, lower-cased and split on whitespace;
    words found in ``stopwords`` are then removed.

    If ``kwset`` is given it is cleared and filled in place, so callers can
    reuse one set object across calls; otherwise a new set is created.  The
    filled set is returned in both cases.
    """
    if kwset is None:
        kwset = set()
    else:
        kwset.clear()
    kwset.update(s.translate(xchars, rchars).lower().split())
    kwset -= stopwords
    return kwset

def insertentry(f, dtitle=set(), kwset=set()):
    """Count every track title found in one disc record.

    ``f`` is a file-like object holding one disc record.  Only lines whose
    second character is ``T`` are examined: a line starting with ``D``
    sets the disc-title keywords, and each line starting with ``T`` is
    tokenised, merged with the disc-title keywords and passed to insert().
    The global ``discs`` counter is incremented once per call.

    ``dtitle`` and ``kwset`` are scratch sets; the defaults are shared
    between calls on purpose so no new sets are allocated per record.
    """
    global discs
    # Forget the previous record's disc title; otherwise a record with no
    # disc-title line (or with track lines before it) would inherit it.
    dtitle.clear()
    for line in f.readlines():
        # The length check guards the line[1] access against blank lines.
        if len(line) < 2 or line[0] == '#' or line[1] != 'T':
            continue
        tag = line[0]
        if tag != 'D' and tag != 'T':
            continue
        # Take everything after the first "=" rather than a fixed offset, so
        # track numbers of any width work and a final line without a
        # trailing newline keeps its last character.
        fields = line.rstrip("\r\n").split("=", 1)
        if len(fields) != 2:
            continue
        if tag == 'D':
            strtokw(fields[1], dtitle)
        else:
            strtokw(fields[1], kwset)
            kwset.update(dtitle)
            insert(kwset)
    discs += 1

def main():
    """Scan every regular file in the FREEDB archive and print statistics.

    Each regular member is passed to insertentry().  Statistics are printed,
    followed by a blank line, whenever the disc count is a multiple of
    1000, and once more after the whole archive has been read.
    """
    tf = tarfile.open(FREEDB, "r")
    # try/finally so the archive is closed even if a record fails to parse.
    try:
        for ti in tf:
            if not ti.isfile():
                continue
            insertentry(tf.extractfile(ti))

            if discs % 1000 == 0:
                showstats()
                # Blank separator line between progress reports.
                print("")
    finally:
        tf.close()
    showstats()

# Start the scan only when this file is run as a script, not on import.
if __name__ == "__main__":
    main()

# Number of discs: 1000000
# Number of songs: 13333859
# Number of index entries: 881075985
# Entries per song: 66.0780937462
# Keywords per songs: 6.27440675651
