forked from sd17fall/TextMining
-
Notifications
You must be signed in to change notification settings - Fork 0
/
sentence_generator.py
128 lines (114 loc) · 4.18 KB
/
sentence_generator.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
#!/usr/bin/python
import re
import random
import sys
# These mappings can get fairly large -- they're stored globally to
# save copying time.
# tempMapping: (tuple of words) -> {dict: word -> number of times the word
# appears following the tuple}. Counts are stored as floats.
# Example entry:
#    ('eyes', 'turned') => {'to': 2.0, 'from': 1.0}
# Filled by addItemToTempMapping() and used by buildMapping() to construct
# the normalized mapping below.
tempMapping = {}
# mapping: (tuple of words) -> {dict: word -> *normalized* number of times
# the word appears following the tuple}, i.e. each count divided by the
# total count for that tuple.
# Example entry:
#    ('eyes', 'turned') => {'to': 0.66666666, 'from': 0.33333333}
mapping = {}
# Words that can start sentences. This is a list, not a set: buildMapping()
# appends to it without de-duplicating, and genSentence() draws from it with
# random.choice(), so a word appended several times is picked more often.
starts = []
# We want to be able to compare words independent of their capitalization.
def fixCaps(word):
    """Normalize the capitalization of a single token.

    Rules, applied in order:
      * all-caps words are lowered ("FOO" -> "foo"), except the pronoun "I";
      * words starting with a capital keep only that capital
        ("LaTeX" -> "Latex");
      * everything else is lowered ("wOOt" -> "woot").

    An empty string is returned unchanged (indexing word[0] would otherwise
    raise IndexError).
    """
    if not word:
        return word
    if word.isupper() and word != "I":
        return word.lower()
    if word[0].isupper():
        return word.lower().capitalize()
    return word.lower()
# Dict keys have to be hashable. Lists are not, tuples are, so every word
# history is converted to a tuple before it is used as a key.
def toHashKey(lst):
    """Return the items of `lst`, in order, as a tuple."""
    key = tuple(lst)
    return key
def wordlist(words):
    """Split the text `words` into a list of tokens.

    A token is either a run of word characters and apostrophes, or one of
    the punctuation marks . , ! ? ;  -- everything else is dropped. Each
    token is passed through fixCaps() before being returned.
    """
    tokens = re.findall(r"[\w']+|[.,!?;]", words)
    return [fixCaps(token) for token in tokens]
def addItemToTempMapping(history, word):
    """Count `word` as a follower of `history` and of every suffix of it.

    Given history = ["the", "rain", "in"] and word = "Spain", the count of
    "Spain" is incremented under the keys ("the", "rain", "in"),
    ("rain", "in") and ("in",) of the module-level tempMapping dict.
    An empty history records nothing.
    """
    suffix = history
    while suffix:
        # Create the follower table for this key on first sight.
        followers = tempMapping.setdefault(toHashKey(suffix), {})
        followers[word] = followers.get(word, 0.0) + 1.0
        # Drop the oldest word and record the shorter history as well.
        suffix = suffix[1:]
def buildMapping(wordlist, markovLength):
    """Build the transition tables from a list of tokens.

    For every token except the last, the (up to) `markovLength` tokens ending
    at that position are recorded in tempMapping as a history of the token
    that follows. Afterwards every follower table in tempMapping is
    normalized (counts divided by their total) and stored in mapping.

    The first token, and every non-punctuation token that directly follows a
    ".", is appended to starts.

    wordlist     -- list of tokens, e.g. as returned by wordlist()
    markovLength -- maximum number of tokens in a history; values below 1
                    are treated as 1

    An empty wordlist is a no-op.
    """
    if not wordlist:
        # Nothing to learn from, and wordlist[0] below would raise IndexError.
        return
    # A history must hold at least one token, otherwise history[-1] fails.
    window = max(1, markovLength)
    starts.append(wordlist[0])
    # Start at index 0 so the transition first word -> second word is
    # recorded too; without it a start word that occurs only at the very
    # beginning of the text has no entry in mapping.
    for i in range(len(wordlist) - 1):
        # At most `window` tokens ending at position i.
        history = wordlist[max(0, i - window + 1) : i + 1]
        follow = wordlist[i + 1]
        # if the last elt was a period, add the next word to the start list
        if history[-1] == "." and follow not in ".,!?;":
            starts.append(follow)
        addItemToTempMapping(history, follow)
    # Normalize the values in tempMapping, put them into mapping
    for first, followset in tempMapping.items():
        total = sum(followset.values())
        mapping[first] = {k: v / total for k, v in followset.items()}
def next(prevList):
    """Return a randomly chosen next word given the previous ones.

    prevList is shortened from the front, in place, until it matches a key
    of mapping. The next word is then drawn from that key's follower table,
    each word being chosen with its stored probability.

    Raises IndexError if no suffix of prevList is a known history.

    NOTE: inside this module the name shadows the builtin next().
    """
    threshold = random.random()
    # Shorten prevList until it's in mapping
    while toHashKey(prevList) not in mapping:
        if not prevList:
            raise IndexError("no known continuation for the given history")
        prevList.pop(0)
    # Walk the cumulative distribution and stop at the first word whose
    # cumulative probability reaches the threshold.
    cumulative = 0.0
    chosen = ""
    for chosen, probability in mapping[toHashKey(prevList)].items():
        cumulative += probability
        if cumulative >= threshold:
            break
    # If rounding left the total just below the threshold, the loop ends
    # without a break and `chosen` is the last word, not an empty string.
    return chosen
def genSentence(markovLength):
    """Generate one sentence and return it as a string.

    A starting word is drawn from starts, then words are requested from
    next() until a "." is produced. At most `markovLength` previous words
    are kept as history. Words are separated by a single space; the
    punctuation marks . , ! ? ; are attached to the preceding word.
    """
    word = random.choice(starts)
    pieces = [word.capitalize()]
    history = [word]
    # Substring test, exactly as before: "." ends the sentence (and so
    # would an empty string).
    while word not in ".":
        word = next(history)
        history.append(word)
        # Keep only the most recent markovLength words.
        if len(history) > markovLength:
            history.pop(0)
        if word not in ".,!?;":
            pieces.append(" ")
        pieces.append(word)
    return "".join(pieces)
def main(words,markov):
    """Build the mapping from the text `words` and print one sentence.

    words  -- the source text as a single string
    markov -- the history length passed to buildMapping() and genSentence()
    """
    tokens = wordlist(words)
    buildMapping(tokens, markov)
    sentence = genSentence(markov)
    print(sentence)
if __name__ == "__main__":
    # main() requires the source text and the chain length; calling it with
    # no arguments always raised TypeError. Take both from the command line:
    #     sentence_generator.py text_file [chain_length]
    if len(sys.argv) < 2:
        sys.stderr.write("Usage: " + sys.argv[0] + " text_file [chain_length=1]\n")
        sys.exit(1)
    with open(sys.argv[1]) as source:
        text = source.read()
    chainLength = int(sys.argv[2]) if len(sys.argv) > 2 else 1
    main(text, chainLength)