-
Notifications
You must be signed in to change notification settings - Fork 0
/
tinywords.py
156 lines (138 loc) · 5.38 KB
/
tinywords.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
from contextlib import contextmanager
import csv
import io
import math
from pathlib import Path
import requests
import shutil
from zipfile import ZipFile
def _readCostFileHeader(reader):
    """Consume the three header rows of a cost file from *reader*.

    Returns ``(weights, costHeader)``, where *weights* maps each name in the
    first row to the float parsed from the second row and *costHeader* is the
    third row. Returns ``None`` when rows are missing or cannot be parsed.
    """
    try:
        weightHeader = next(reader)
        weightRow = next(reader)
        loadedWeights = dict(zip(weightHeader, map(float, weightRow)))
        costHeader = next(reader)
    except (StopIteration, ValueError, csv.Error):
        # A bare StopIteration must not escape into the calling generator,
        # where it would be turned into a RuntimeError.
        return None
    return loadedWeights, costHeader


def _computeSequenceCosts(weights):
    """Return ``{casedSequence: cost}`` computed from the corpus counts.

    Every weight except ``'short'`` is divided by the corpus-wide total of
    its count column. The cost of a sequence is then
    ``-log(sum(normalisedWeight * count)) + log(weights['short']) * len(sequence)``.
    Keys are passed through ``getCase`` before being stored.
    """
    weightNames, weightVector = zip(
        *(
            (name, weight) for name, weight in weights.items()
            if name != 'short'
        )
    )
    sequenceCountVectors = getSequenceCountVectors(weightNames)
    # Column totals, accumulated in iteration order.
    totalCountVector = [0 for _ in weightVector]
    for countVector in sequenceCountVectors.values():
        for i, count in enumerate(countVector):
            totalCountVector[i] += count
    normWeightVector = [
        weight / totalCount for weight, totalCount
        in zip(weightVector, totalCountVector)
    ]
    charCost = math.log(weights['short'])
    return {
        getCase(sequence): -math.log(
            sum(
                weight * count for weight, count
                in zip(normWeightVector, countVector)
            )
        ) + charCost * len(sequence)
        for sequence, countVector in sequenceCountVectors.items()
    }


def getSequenceCosts(
    weights={'part': 1, 'start': 0.1, 'end': 0.05, 'full': 1, 'short': 50},
    path=Path("sequenceCosts.csv")
):
    """Yield ``(sequence, cost)`` pairs, using *path* as an on-disk cache.

    The cache is a CSV file laid out as:

    1. a row of weight names,
    2. a row of the matching weight values,
    3. the row ``sequence,cost``,
    4. one row per sequence, sorted by ascending cost.

    If *path* exists and its header rows match *weights*, the rows are
    streamed straight from it. Otherwise (missing file, different weights,
    or an unreadable header) the costs are recomputed, the file is
    rewritten, and the rows are streamed from the fresh file.

    weights -- mapping of weight name to value; ``'short'`` is the per
               character factor, every other name is forwarded to
               ``getSequenceCountVectors`` as a count column. The mapping is
               only read, never modified.
    path    -- ``pathlib.Path`` of the cache file.

    Raises ``AssertionError`` if the header of the file that is about to be
    streamed does not match *weights*.
    """
    expectedHeader = (weights, ['sequence', 'cost'])

    canUseFile = False
    if path.exists():
        with path.open(encoding='utf-8') as f:
            canUseFile = _readCostFileHeader(csv.reader(f)) == expectedHeader

    if not canUseFile:
        sequenceCosts = _computeSequenceCosts(weights)
        # Write to a sibling file and rename it into place, so an interrupted
        # run cannot leave a truncated file whose header still looks valid.
        partialPath = path.with_name(path.name + '.part')
        with partialPath.open('w', encoding='utf-8', newline='') as f:
            writer = csv.writer(f, lineterminator='\n')
            # zip(*items) transposes the pairs into a names row and a
            # values row.
            writer.writerows(zip(*weights.items()))
            writer.writerow(['sequence', 'cost'])
            writer.writerows(
                sorted(sequenceCosts.items(), key=lambda item: item[1])
            )
        partialPath.replace(path)

    with path.open(encoding='utf-8') as f:
        reader = csv.reader(f)
        if _readCostFileHeader(reader) != expectedHeader:
            # Explicit raise (rather than an assert statement) so the check
            # is not stripped under -O; the exception type is unchanged.
            raise AssertionError(
                'cost file {} does not match the requested weights'.format(
                    path
                )
            )
        for sequence, cost in reader:
            yield sequence, float(cost)
def getCase(word):
    """Return *word* in the letter case chosen for it.

    - Upper-cased when any character after the first is ``l`` or ``j``.
    - Otherwise title-cased when the first character is one of ``l``, ``j``,
      ``f``, ``t`` or ``r`` (the empty string also takes this branch and
      comes back unchanged).
    - Otherwise returned as is.

    NOTE(review): presumably these letters are singled out for legibility
    reasons; the motivation is not visible here, so confirm.
    """
    head, tail = word[:1], word[1:]
    if 'l' in tail or 'j' in tail:
        return word.upper()
    if head in 'ljftr':
        return word.title()
    return word
def getSequenceCountVectors(
    countNames,
    alphabet=set('abcdefghijklmnopqrstuvwxyz'),
    lengths=[1, 2, 3]
):
    """Return ``{sequence: countVector}`` for letter and digit sequences.

    countNames -- sequence of column names; must contain ``'part'``,
                  ``'start'``, ``'end'`` and ``'full'``. Each count vector
                  has one slot per name, in this order.
    alphabet   -- set of characters a substring may consist of.
    lengths    -- substring lengths to count.

    For every ``(word, count)`` from ``getWordCounts`` and every substring
    of a listed length made only of *alphabet* characters, *count* is added
    to:

    - the ``'part'`` slot, always;
    - the ``'start'`` slot, when the substring begins the word;
    - the ``'end'`` slot, when it ends the word;
    - the ``'full'`` slot, when it is the whole word.

    Decimal numbers of each listed length then receive synthetic vectors
    blended from the vectors of two fixed letter sequences.
    """
    partSlot = countNames.index('part')
    startSlot = countNames.index('start')
    endSlot = countNames.index('end')
    fullSlot = countNames.index('full')

    vectors = {}
    for word, count in getWordCounts():
        wordLength = len(word)
        for length in lengths:
            for begin in range(wordLength - length + 1):
                finish = begin + length
                piece = word[begin:finish]
                if not alphabet.issuperset(piece):
                    continue
                vector = vectors.get(piece)
                if vector is None:
                    vector = vectors[piece] = [0] * len(countNames)
                vector[partSlot] += count
                if begin == 0:
                    vector[startSlot] += count
                if finish == wordLength:
                    vector[endSlot] += count
                if length == wordLength:
                    vector[fullSlot] += count

    # Anchor sequences, indexed by length - 1. The blend starts at the first
    # list's vector and decays towards the second list's vector.
    # NOTE(review): presumably picked as a common and a rare sequence of
    # each length; confirm.
    startAnchors = ['v', 'of', 'hip']
    stopAnchors = ['v', 'mk', 'pwn']
    for length in lengths:
        startVector = vectors[startAnchors[length - 1]]
        stopVector = vectors[stopAnchors[length - 1]]
        # Numbers with exactly `length` digits and no leading zero, so "0"
        # itself is never generated.
        lowest, limit = 10 ** (length - 1), 10 ** length
        for number in range(lowest, limit):
            # phase is 1 at the lowest number and decays exponentially as
            # the number grows.
            phase = math.exp(-10 * (number - lowest) / (limit - lowest))
            blended = []
            for startCount, stopCount in zip(startVector, stopVector):
                blended.append(phase * startCount + (1 - phase) * stopCount)
            vectors[str(number)] = blended
    return vectors
def getWordCounts():
    """Yield ``(word, count)`` pairs from the word-count archive.

    The archive and member paths come from ``getWordCountPaths``. The member
    is read as tab-separated UTF-8 text, with undecodable bytes replaced.
    Rows with exactly one field are skipped. Every other row must have three
    fields: the word, an integer count, and a third field that is ignored.
    """
    archivePath, innerPath = getWordCountPaths()
    with ZipFile(archivePath) as archive, \
            archive.open(str(innerPath)) as binaryFile:
        textFile = io.TextIOWrapper(binaryFile, 'utf-8', errors='replace')
        for row in csv.reader(textFile, delimiter='\t'):
            if len(row) == 1:
                # skip "Total words" line
                continue
            word, count, _ = row
            yield word, int(count)
def getWordCountPaths(
    url='http://www.anc.org/SecondRelease/data/ANC-token-count.zip',
    archivePath=Path('ANC-token-count.zip'),
    innerPath=Path('ANC-token-count.txt')
):
    """Return ``(archivePath, innerPath)``, downloading the archive if needed.

    url         -- location the archive is fetched from when it is missing.
    archivePath -- ``pathlib.Path`` where the zip archive is stored locally.
    innerPath   -- path of the member inside the archive; returned unchanged.

    When *archivePath* already exists nothing is downloaded. Otherwise the
    body of *url* is streamed to a ``.part`` sibling file, which is renamed
    to *archivePath* only after the download finishes. A failed or
    interrupted download therefore never leaves a truncated archive that
    later calls would mistake for a complete one.

    Raises whatever ``requests`` raises for connection problems or a
    non-success HTTP status (via ``raise_for_status``).
    """
    if not archivePath.exists():
        partialPath = archivePath.with_name(archivePath.name + '.part')
        completed = False
        try:
            # https://stackoverflow.com/a/39217788
            # NOTE(review): response.raw is copied as received; this assumes
            # the server applies no Content-Encoding to the zip, so confirm.
            with requests.get(url, stream=True) as response:
                response.raise_for_status()
                with partialPath.open('wb') as f:
                    shutil.copyfileobj(response.raw, f)
            partialPath.replace(archivePath)
            completed = True
        finally:
            # Remove the partial file on any failure, including interrupts.
            if not completed and partialPath.exists():
                partialPath.unlink()
    return archivePath, innerPath
if __name__ == '__main__':
    import itertools

    # Print the first 200 (sequence, cost) pairs, one per line.
    preview = itertools.islice(getSequenceCosts(), 200)
    print('\n'.join(str(pair) for pair in preview))