"""
word.py
contains Word object definition and general function pertaining to word objects
"""
from nltk.corpus import cmudict
import nltk
#returns the plain token strings for a list of Word objects
def tokens(words):
    return map(lambda i: i.word, words)
#class for containing a word and all of its attributes
class Word:
    def __init__(self, word, pronunciation, isPunct, line, index):
        self.word = word
        self.line = line #which number line
        self.sentence = -1 #which number sentence
        self.pos = "" #part of speech
        self.pronunciation = pronunciation
        self.isFunctionWord = False
        self.isPunct = isPunct
        self.isEOLN = False
        self.index = index
#returns 2-D list of sentences, with EOLN tokens filtered out
def getSentences(words):
    #use the highest sentence number assigned to a non-EOLN word, so the final sentence is included even when the text does not end with a newline
    last_sentence = max(map(lambda j: j.sentence, filter(lambda j: not j.isEOLN, words)))
    return map(lambda i: filter(lambda j: j.sentence == i and (not j.isEOLN), words), range(0, last_sentence + 1))
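#e.g. for a two-sentence poem, getSentences returns two lists of Word objects, one per sentence, with the "\n" tokens removed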
#This slows things down a bit, but pretty necessary for a poetry analyzer...
def getPronunciation(word, d):
    if word in d:
        return d[word]
    else:
        return []
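#e.g. getPronunciation("hello", d) returns one phoneme list per pronunciation variant in cmudict, such as [['HH', 'AH0', 'L', 'OW1'], ['HH', 'EH0', 'L', 'OW1']]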
#crude estimate of whether a token is punctuation
def isPunct(word):
    return all(map(lambda i: not i.isalnum(), word))
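#e.g. isPunct("--") is True and isPunct("can't") is False; note the empty string also counts as punctuation, since all() of an empty sequence is True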
#I include the following types of words as high-content (as opposed to determiners like "the" or pronouns, which often should not be included in an analysis):
#adverbs, adjectives, all types of verbs except for existential verbs, all types of nouns except for pronouns, and interjections
#I also skip any word shorter than four letters, to decrease false positives from an imperfect part-of-speech tagger
def isHighContent(word):
    return (word.pos in ["ADJ", "NUM", "NP", "ADV", "VD", "VG", "VN", "N", "P", "V", "UH"] and len(word.word) > 3)
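#e.g. a Word with pos "ADJ" and word "luminous" is high-content, while "the" (pos "DET") and the three-letter adjective "odd" are not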
#includes who/what type words, conjunctions, pronouns, determiners (a, the, etc.), and modal auxiliaries (can, must, should, etc.)
def isFunctionWord(pos_tag):
    function_tags = ["WH", "CNJ", "TO", "PRO", "DET", "MOD", "P"]
    return (pos_tag in function_tags)
#returns whether a Word object is a verb
def isVerb(word):
    return (word.pos in ["VD", "VG", "VN", "V"])
#returns whether a Word object is an adjective
def isAdjective(word):
    return (word.pos in ["ADJ", "NUM"])
#returns whether a Word object is a noun
def isNoun(word):
    return (word.pos in ["N", "NP", "PRO"])
#returns tokens that are not punctuation or endline tokens
def getRealWords(words):
    return filter(lambda i: not (i.isPunct or i.isEOLN), words)
#tokenizes a line of text into individual words and punctuation tokens
def tokenize(lines):
    punc = "-/~,.;!|\n"
    words = []
    cur_word = ""
    for c in lines:
        if (c == ' '):
            if (len(cur_word) > 0): #add the current word to words, start a new word
                words.append(cur_word)
                cur_word = ""
        elif (c in punc):
            if (len(cur_word) > 0): #again, add the current word and start a new one
                words.append(cur_word)
                cur_word = ""
            words.append(str(c)) #and add the punctuation as its own token
        else:
            cur_word += c
    if (len(cur_word) > 0):
        words.append(cur_word)
    return words
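#illustrative example (not a test): tokenize("in Xanadu, did\n") returns ["in", "Xanadu", ",", "did", "\n"]; punctuation and newlines become their own tokens, so line boundaries survive tokenization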
#Builds a list of Word objects out of raw, unformatted lines from a text file
def getWordList(lines):
    word_objects = []
    d = cmudict.dict()
    def getPOS(words):
        #I use simplified tags so that it is easier to parse the grammar for my simile checker
        pos = [(word, nltk.tag.simplify.simplify_wsj_tag(tag)) for (word, tag) in nltk.pos_tag(words)]
        return pos
    tokenized = map(lambda i: tokenize(i), lines) #tokenizes each line into individual words/punctuation
    raw_words = []
    #create a 1-D list from the list of lines
    for i in range(0, len(tokenized)):
        raw_words += tokenized[i]
    #create a list of Word objects (although right now you don't have all of the data about the words)
    line = 0
    for i in range(0, len(raw_words)):
        raw_word = raw_words[i]
        w = Word(index=i, word=raw_word, pronunciation=getPronunciation(raw_word.lower(), d), line=line, isPunct=isPunct(raw_word))
        word_objects.append(w)
        if (raw_word == "\n"):
            line += 1
    #Now: get sentence-level data
    #First, split sentences so that every element in the list is either a sentence or ["\n"]
    start_word = 0
    sentences = []
    for i in range(0, len(raw_words)):
        current = raw_words[i]
        if (current == "."):
            sentences.append(raw_words[start_word:i + 1])
            start_word = i + 1
        elif (current == "\n"):
            if (start_word != i):
                sentences.append(raw_words[start_word:i])
            sentences.append([current])
            start_word = i + 1
    if start_word < len(raw_words):
        sentences.append(raw_words[start_word:])
    pos_sentences = []
    #Get the part-of-speech data for each sentence
    for sent in sentences:
        if sent[0] == "\n":
            pos_sentences += [("\n", "EOLN")]
        else:
            pos_sentences += getPOS(sent)
    sentence = 0
    for i in range(0, len(word_objects)):
        pos = pos_sentences[i][1]
        if word_objects[i].isPunct:
            word_objects[i].pos = "PUNC"
        else:
            word_objects[i].pos = pos
        word_objects[i].isFunctionWord = isFunctionWord(pos)
        word_objects[i].sentence = sentence
        word_objects[i].isEOLN = (pos == "EOLN")
        if (pos == '.'):
            sentence += 1
    return word_objects
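
#Minimal usage sketch (the filename "poem.txt" is illustrative only, not part of this module's interface):
if __name__ == "__main__":
    with open("poem.txt") as f:
        words = getWordList(f.readlines())
    #print each sentence on its own line, skipping punctuation and EOLN tokens
    for sent in getSentences(words):
        print " ".join(tokens(getRealWords(sent)))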