# DBW
#
# Darjeeling, Bergamot and Walnuts
#
# 19.4.10
#
# god_writes_good_code.py

# Copyright (c) 2010 DBW.
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; version 2 of the License.
 
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.

# pygmentize -f html -o god_writes_good_code.html god_writes_good_code.py
# More info @ http://burstcoding.blogspot.com/2010/04/rendering-code-via-pygments-in-your.html

import os
import subprocess
import sys

import nltk
import numpy
from nltk.corpus import PlaintextCorpusReader as PtCr


def cache_url(url, gunzip=True):
    """Fetch *url* into the current directory unless it is already cached.

    The local name is the last path component of the URL.  When *gunzip*
    is true and that name ends in ``.gz``, the download is decompressed
    and the name without the ``.gz`` suffix is returned instead.

    url    - address of the file to fetch (downloaded with ``wget``)
    gunzip - {True,False} - pass ``.gz`` downloads through gunzip or not

    Returns the name of the local (possibly decompressed) file.
    Raises OSError if ``wget`` or ``gunzip`` exits with a non-zero status.
    """

    filename = os.path.split(url)[-1]
    ext = os.path.splitext(filename)[-1]

    # will we be unzipping?
    wants_unzip = gunzip and ext == ".gz"
    unzipped = filename[:-3] if wants_unzip else None

    # Already decompressed on an earlier run: nothing to fetch or unzip.
    if wants_unzip and os.path.exists(unzipped):
        return unzipped

    # Download first -- this must happen for .gz files too, otherwise
    # gunzip below is handed a file that was never fetched.
    if not os.path.exists(filename):
        # Argument list (no shell) so that URL characters such as '&' or
        # ';' are never interpreted by a shell; '--' ends option parsing.
        err = subprocess.call(["wget", "--", url])
        if err != 0:
            raise OSError("wget error on url: %s" % url)

    if not wants_unzip:
        return filename

    err = subprocess.call(["gunzip", "-f", "--", filename])
    if err != 0:
        raise OSError("gunzip error on file: %s" % filename)

    return unzipped


def draw_from_cfdist(cfdist):
    """Draw one key at random, with probability proportional to its count.

    cfdist - mapping of key -> non-negative integer count (for example
             the frequency distribution stored under one condition of a
             ConditionalFreqDist)

    Returns one of the keys of *cfdist*.
    Raises ValueError if *cfdist* is empty or all of its counts are zero.
    """

    # Materialise keys and counts together so they stay aligned and can be
    # indexed (dict views cannot be indexed on Python 3).
    pairs = list(cfdist.items())
    if not pairs:
        raise ValueError("cannot draw from an empty distribution")

    keys = [key for key, _ in pairs]
    counts = numpy.array([count for _, count in pairs])

    # cumulative[i] is the total count of keys[0..i]
    cumulative = numpy.add.accumulate(counts)
    total = cumulative[-1]
    if total <= 0:
        raise ValueError("cannot draw from a distribution with no counts")

    # Draw r uniformly from [0, total).  side='right' picks the first index
    # with cumulative[idx] > r, so key i is chosen for exactly counts[i]
    # values of r.  (Drawing from [0, total] with the default left-sided
    # search gave the first key one extra unit of weight.)
    rndi = numpy.random.randint(total)
    idx = cumulative.searchsorted(rndi, side='right')
    return keys[idx]
    
def generate_model(cfdist, word, num=15):
    """Write a random chain of up to *num* words to stdout.

    Starting from *word*, each following word is drawn from the
    distribution stored under the previous word in *cfdist*.  Words are
    separated by single spaces; no trailing newline is written.

    cfdist - mapping of word -> distribution of the words that follow it
    word   - the word the chain starts with
    num    - maximum number of words to write

    The chain stops early if the current word has no recorded successors.
    """

    for position in range(num):
        if position:
            sys.stdout.write(" ")
        sys.stdout.write("%s" % (word,))

        # Nothing more will be written, so skip the pointless final draw.
        if position == num - 1:
            break

        successors = cfdist[word]
        if not successors:
            # Dead end: this word was never seen with a successor.
            break
        word = draw_from_cfdist(successors)


if __name__ == "__main__":

    # King James bible and New Hacker's Dictionary.
    sources = {
        'kjv': 'http://www.ccim.org/~bible/kjv.rawtxt.gz',
        'nhd': 'http://catb.org/jargon/oldversions/jarg2912.txt',
    }

    # Cache each url locally and remember which file belongs to which
    # source name.
    source_files = {name: cache_url(url) for name, url in sources.items()}

    # Make a corpus out of local cache
    local_corpus = PtCr('.', list(source_files.values()))

    # With no fileids given, words() covers every file in the corpus, so
    # there is no need to concatenate the per-file word lists by hand.
    words = local_corpus.words()

    # Single-argument print(...) behaves the same on Python 2 and 3.
    print("Generating bigrams.")

    bigrams = nltk.bigrams(words)

    print("Generating cfd.")

    cfd = nltk.ConditionalFreqDist(bigrams)

    # Emit 20 random chains seeded with 'God', separated by blank lines.
    for _ in range(20):
        generate_model(cfd, 'God')
        print("\n")

# No comments:
#
# Post a Comment