#!/usr/bin/env python
"""Letter-level Markov word generator.

Trains on the words of a text file, recording which letter follows each
single letter (order 1) and each pair of letters (order 2), then builds
random words by sampling from those tables.  Run as a script, it trains on
``pandp.txt`` and prints twenty generated words.
"""
import sys
from collections import defaultdict
from random import randint, choice

try:
    # Python 2 names (kept so existing behaviour is unchanged there).
    from string import letters, lowercase
except ImportError:
    # Python 3 dropped the locale-dependent aliases.
    from string import ascii_letters as letters, ascii_lowercase as lowercase

# Set membership instead of a substring scan for every character read.
_LETTERS = frozenset(letters)


class WordGenerator(object):
    """Order-1/order-2 Markov model over the letters of training words."""

    def __init__(self):
        self._start_data = []
        # letter -> successors seen after it (one entry per occurrence, so
        # a uniform random pick is automatically frequency-weighted).
        self._o1_data = defaultdict(list)
        # two-letter string -> successors seen after that pair.
        self._o2_data = defaultdict(list)

    def train(self, filename):
        """Add every word found in the text file *filename* to the model.

        Words are whitespace-separated tokens with all non-letter characters
        removed, lower-cased; tokens with fewer than two letters are skipped.
        """
        # ``with`` closes the file even if processing raises.
        with open(filename) as handle:
            for line in handle:
                # split() already discards the newline and surrounding
                # whitespace.  Slicing off the last character would eat a
                # real letter when the final line has no trailing newline.
                for token in line.split():
                    word = ''.join([c for c in token if c in _LETTERS])
                    if len(word) < 2:
                        continue
                    self._process_word(word.lower())

    def finish_training(self):
        """Compact each successor list into an immutable string.

        ``choice`` samples a string exactly as it samples the list it was
        built from, so generated words are unaffected.  Calling this more
        than once is harmless.  Do not call ``train`` afterwards: the
        compacted entries can no longer be appended to.
        """
        for table in (self._o1_data, self._o2_data):
            # Snapshot the keys; the values are replaced while looping.
            for key in list(table):
                table[key] = ''.join(table[key])

    def gen_word(self, length):
        """Return a random word of up to *length* letters.

        The first two letters come from the order-1 table.  Each later
        letter is drawn from the order-2 table keyed on the previous two
        letters, falling back to the order-1 table keyed on the previous
        letter.  The result always has at least two letters, even when
        *length* is smaller, and is shorter than *length* when the model has
        no recorded successor for the current ending.

        Raises:
            IndexError: if the model holds no training data.
        """
        # Only start from letters that actually have a successor.
        starts = [c for c, nxt in self._o1_data.items() if nxt]
        if not starts:
            raise IndexError(
                'cannot generate a word: the model has no training data')

        word = [choice(starts)]
        word.append(choice(self._o1_data[word[-1]]))
        while len(word) < length:
            # .get() avoids inserting empty entries into the defaultdicts.
            # The pair key must use the LAST letter (word[-1]), not the
            # second letter of the word.
            followers = (self._o2_data.get(word[-2] + word[-1])
                         or self._o1_data.get(word[-1]))
            if not followers:
                # Dead end: nothing was ever seen after this ending.
                break
            word.append(choice(followers))
        return ''.join(word)

    def _process_word(self, word):
        """Record every letter transition of *word* in both tables."""
        for i in range(len(word) - 1):
            following = word[i + 1]
            if i >= 1:
                self._o2_data[word[i - 1] + word[i]].append(following)
            self._o1_data[word[i]].append(following)


def main():
    """Train on ``pandp.txt`` and print twenty random words."""
    wordgen = WordGenerator()
    # Single-argument print(...) behaves identically on Python 2 and 3.
    print('Training...')
    wordgen.train('pandp.txt')
    print('Done!')
    print('Some words:')
    for _ in range(20):
        print(wordgen.gen_word(randint(3, 10)))


if __name__ == '__main__':
    main()