# Copyright (c) 2010 DBW.
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; version 2 of the License.
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
# pygmentize -f html -o god_writes_good_code.html god_writes_good_code.py
# More info @ http://burstcoding.blogspot.com/2010/04/rendering-code-via-pygments-in-your.html
import gzip
import os
import shutil
import subprocess
import sys
import zlib

import nltk
import numpy
from nltk.corpus import PlaintextCorpusReader as PtCr
def cache_url(url, gunzip=True):
    """Return the name of a local copy of url, fetching it if not already local.

    The local file is named after the last path component of url and lives
    in the current working directory.

    url    - address of the file to cache.
    gunzip - {True,False} - when True and url ends in ".gz", decompress the
             download and return the name without the ".gz" suffix.

    Raises OSError if the download or the decompression fails.
    """
    filename = os.path.split(url)[-1]
    ext = os.path.splitext(url)[-1]

    # will we be unzipping?
    unpack = gunzip and ext == ".gz"
    target = filename[:-3] if unpack else filename

    # Cache hit: the file the caller wants is already on disk.
    if os.path.exists(target):
        return target

    # Download unless the raw file is already here. This has to happen for
    # ".gz" urls as well, otherwise there is no archive to decompress.
    if not os.path.exists(filename):
        # Argument list rather than a shell string: urls routinely contain
        # "&" and "?", which a shell would interpret.
        err = subprocess.call(["wget", url])
        if err != 0:
            raise OSError("wget error on url: %s" % url)

    if unpack:
        try:
            with gzip.open(filename, "rb") as packed:
                with open(target, "wb") as unpacked:
                    shutil.copyfileobj(packed, unpacked)
        except (IOError, OSError, EOFError, zlib.error):
            # Do not leave a truncated file behind: the next call would
            # mistake it for a valid cache entry.
            if os.path.exists(target):
                os.remove(target)
            raise OSError("gunzip error on file: %s" % filename)
        # Same end state as the gunzip tool: the archive is replaced by
        # its decompressed contents.
        os.remove(filename)

    return target
def draw_from_cfdist(cfdist):
    """Draw one key of cfdist at random, weighted by its count.

    cfdist - mapping from outcome to a non-negative integer count.

    Returns a key; each key is chosen with probability count / total, so a
    key with count 0 is never returned.
    Raises ValueError if cfdist is empty or its counts sum to zero.
    """
    # Take keys and counts from a single items() pass so they stay paired.
    pairs = list(cfdist.items())
    if not pairs:
        raise ValueError("cannot draw from an empty distribution")
    keys, counts = zip(*pairs)

    cumulative = numpy.cumsum(counts)
    total = cumulative[-1]
    if total <= 0:
        raise ValueError("cannot draw from a distribution with no counts")

    # Pick a ticket in [0, total) and find the first key whose cumulative
    # count exceeds it. Key i then owns exactly counts[i] tickets; drawing
    # from [0, total] with a left-sided search gave the first key one
    # ticket too many.
    ticket = numpy.random.randint(total)
    idx = cumulative.searchsorted(ticket, side="right")
    return keys[idx]
def generate_model(cfdist, word, num=15):
    """Write up to num words of random text to stdout, starting from word.

    cfdist - mapping from a word to the count distribution of the words
             that follow it (in this file, a ConditionalFreqDist built
             from bigrams).
    word   - the first word to emit.
    num    - maximum number of words to emit.

    Each word is followed by a single space and no newline is written.
    Generation stops early when the current word has no recorded
    followers, since there is nothing to draw the next word from.
    """
    for _ in range(num):
        sys.stdout.write("%s " % word)
        followers = cfdist[word]
        if not followers:
            # Dead end, e.g. the final token of the corpus.
            break
        word = draw_from_cfdist(followers)
if __name__ == "__main__":
    # King James bible and New Hacker's Dictionary.
    sources = {'kjv': 'http://www.ccim.org/~bible/kjv.rawtxt.gz',
               'nhd': 'http://catb.org/jargon/oldversions/jarg2912.txt'}

    # Cache the urls and record which local file backs which source.
    # Pairing name and url in one pass avoids zipping keys() and values()
    # taken from separate calls.
    source_files = dict((name, cache_url(url))
                        for name, url in sources.items())
    filenames = list(source_files.values())

    # Make a corpus out of the local cache. A real list is passed because
    # a dict view is not a sequence on Python 3.
    local_corpus = PtCr('.', filenames)

    # One token stream over every cached file, in filenames order.
    words = local_corpus.words(filenames)

    print("Generating bigrams.")
    bigrams = nltk.bigrams(words)

    print("Generating cfd.")
    cfd = nltk.ConditionalFreqDist(bigrams)

    # Twenty random phrases, each seeded with the same word.
    for _ in range(20):
        generate_model(cfd, 'God')
        print("\n")
# Darjeeling, Bergamot and Walnuts
# 19.4.10
# god_writes_good_code.py
# Subscribe to:
# Post Comments (Atom)
# No comments:
# Post a Comment