#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed Aug 2 15:05:59 2017
@author: hashischins
"""
import nltk
# nltk.download()   # the bare GUI downloader froze on this machine (see the note below), so the programmatic workaround is used instead
from nltk.text import Text
import matplotlib   # needed for the plotting calls further down
""" I couldn't manage to install a lot of the packages directly from NLTK.
The GUI Installer would halt all downloads and freeze when a package like 'sample_grammars' didn't
give explicit permissions. To overcome this, we use the code block below to trick the server into thinking
the various packages giving trouble are already installed. (Dirty Code is from Alvas on StackOverflow)
I'm not sure what your problems your computer is facing, but tricking those 4 packages were already
installed seemed to do the trick.
"""
#The Ol' Trickeroo
dler = nltk.downloader.Downloader()
dler._update_index()
dler._status_cache['sample_grammars'] = 'installed'
dler._status_cache['spanish_grammars'] = 'installed'
dler._status_cache['basque_grammars'] = 'installed'
dler._status_cache['large_grammars'] = 'installed'
dler.download('all')
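# A hedged aside, not part of the original workaround: instead of downloading 'all',
# you could fetch only the data packages this script actually touches, e.g.:
# nltk.download('gutenberg'); nltk.download('webtext'); nltk.download('nps_chat')
# nltk.download('brown'); nltk.download('reuters'); nltk.download('inaugural')
# nltk.download('punkt')   # tokenizer models used by word_tokenize / sent_tokenize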
#Tokenizing
from nltk import word_tokenize, sent_tokenize
fileids = nltk.corpus.gutenberg.fileids()
print(fileids)
hamlet = nltk.corpus.gutenberg.words('shakespeare-hamlet.txt')
len(hamlet)
# To use concordance(), the word list has to be wrapped in nltk.Text first
hamlet = nltk.Text(nltk.corpus.gutenberg.words('shakespeare-hamlet.txt'))
#Test Cases
hamlet.concordance('anger')
hamlet.concordance('sorrow')
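# A brief aside (sketch only): nltk.Text offers related exploration methods as well;
# dispersion_plot needs a working matplotlib backend.
hamlet.similar('anger')                      # words that appear in similar contexts
hamlet.dispersion_plot(['anger', 'sorrow'])  # positions of the words across the text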
#Alternative way of getting the same fileids list
from nltk.corpus import gutenberg
gutenberg.fileids()
#A short program to display other information about the text
for fileid in gutenberg.fileids():
    num_chars = len(gutenberg.raw(fileid))
    num_words = len(gutenberg.words(fileid))
    num_sents = len(gutenberg.sents(fileid))
    num_vocab = len(set(w.lower() for w in gutenberg.words(fileid)))
    print(int(num_chars/num_words), int(num_words/num_sents), int(num_words/num_vocab), fileid)
""" 3 different parameters have emerged from this program
1) Average Word Length
2) Average Sentence Length
3) Number of times each vocabulary item appears in the text on average (Or you could call it Lexical Diversity Score)
"""
# sents(): divide long-form text into sentences, each represented as a list of words
macbeth_sentences = gutenberg.sents('shakespeare-macbeth.txt')
macbeth_sentences
macbeth_sentences[1037]
#[u'Good', u'night', u',', u'and', u'better', u'health', u'Attend', u'his', u'Maiesty']
longest_len = max([len(s) for s in macbeth_sentences])
[s for s in macbeth_sentences if len(s) == longest_len]
gutenberg.raw('shakespeare-hamlet.txt') # raw() returns the entire text of a file as a single string
len(gutenberg.raw('shakespeare-hamlet.txt')) # Output: 162881 - length of the text in characters
from nltk.corpus import webtext
for fileid in webtext.fileids():
    print(fileid, webtext.raw(fileid)[:65], '...')
from nltk.corpus import nps_chat
chatroom = nps_chat.posts('10-19-20s_706posts.xml')
chatroom[123]
chatroom[420]
chatroom[69]
chatroom[500]
#brown corpus
#good for stylistics
from nltk.corpus import brown
brown.categories()
brown.words(categories='news')
brown.words(fileids=['cg22'])
brown.sents(categories=['news', 'editorial','reviews'])
#News Practice
from nltk.corpus import brown
news_text = brown.words(categories='news')
fdist = nltk.FreqDist([w.lower() for w in news_text])
modals = ['can', 'could', 'may', 'might', 'must', 'will']
for m in modals:
    print(m + ':', fdist[m])
#Reviews Practice
from nltk.corpus import brown
brown.words(categories='reviews')
reviews_text = brown.words(categories='reviews')
fdist = nltk.FreqDist([w.lower() for w in reviews_text])
descriptors = ['happy', 'sad', 'depressed', 'overjoyed', 'high']
for d in descriptors:
    print(d + ':', fdist[d], end=' ')
# NLTK Conditional Frequencies
cfd = nltk.ConditionalFreqDist(
    (genre, word)
    for genre in brown.categories()
    for word in brown.words(categories=genre))
genres = ['news', 'religion', 'hobbies', 'science_fiction', 'romance', 'humor']
modals = ['can', 'could', 'may', 'might', 'must', 'will']
cfd.tabulate(conditions=genres, samples=modals)
# output:
#                    can could   may might  must  will
# news                93    86    66    38    50   389
# religion            82    59    78    12    54    71
# hobbies            268    58   131    22    83   264
# science_fiction     16    49     4    12     8    16
# romance             74   193    11    51    45    43
# humor               16    30     8     8     9    13
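# The same comparison can also be drawn as a line chart (a sketch; as far as I know
# ConditionalFreqDist.plot() accepts the same conditions/samples keywords as tabulate(),
# and it needs matplotlib):
cfd.plot(conditions=genres, samples=modals)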
#Reuters corpus
## documents are classified into 90 topics and two sets: training and test
## splitting for training and testing is covered in Chapter 6
from nltk.corpus import reuters
reuters.fileids()
## fileids are formatted e.g. test/14826, training/988
reuters.categories()
# out: u'acq', u'alum', u'barley', u'bop', u'carcass', u'castor-oil', u'cocoa', u'coconut', u'coconut-oil', u'coffee', u'copper', ...
# !! important: in this corpus the categories overlap - a single document can belong to several topics
reuters.categories('training/9865')
reuters.categories(['training/9865', 'training/9880'])
reuters.fileids('barley')
reuters.fileids(['barley', 'corn'])
# pulling out some words from one document; the first few words, in uppercase, are the title
reuters.words('training/9865')[:14]
reuters.words(['training/9865', 'training/9880'])
reuters.words(categories='barley')
reuters.words(categories=['barley', 'corn'])
#Inaugural Address Corpus
from nltk.corpus import inaugural
inaugural.fileids()
# the first 4 characters of each fileid give the year of the address
[fileid[:4] for fileid in inaugural.fileids()]
cfd = nltk.ConditionalFreqDist(
    (target, fileid[:4])
    for fileid in inaugural.fileids()
    for w in inaugural.words(fileid)
    for target in ['america', 'citizen']
    if w.lower().startswith(target))
# each word is lowercased, then counted under whichever target ('america' or 'citizen') it starts with
cfd.plot()
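# A non-graphical alternative (illustrative only; assumes these particular years are
# present in the installed inaugural corpus):
cfd.tabulate(samples=['1789', '1861', '1901', '1945', '2009'])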
# ANNOTATED TEXT CORPORA
# Loading your own corpus
# see Pathology project: need to add pathology report text to txt files
# using just a folder on my computer
from nltk.corpus import PlaintextCorpusReader
corpus_root = '/usr/share/dict'
wordlists = PlaintextCorpusReader(corpus_root, '.*')
wordlists.fileids()
# ['README', 'connectives', 'propernames', 'web2', 'web2a', 'words']
wordlists.words('connectives')
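# A minimal sketch for the Pathology project mentioned above, assuming the reports have
# already been exported as plain-text files; the folder path below is hypothetical:
# pathology_root = '/path/to/pathology_reports'
# reports = PlaintextCorpusReader(pathology_root, r'.*\.txt')
# reports.fileids()
# reports.words(reports.fileids()[0])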
#Counting words by genre
from nltk.corpus import brown
cfd = nltk.ConditionalFreqDist(
    (genre, word)
    for genre in brown.categories()
    for word in brown.words(categories=genre))
genre_word = [(genre, word)                               # producing (genre, word) pairs
              for genre in ['news', 'romance']            # for each genre...
              for word in brown.words(categories=genre)]  # ...loop over every word in the genre
len(genre_word)
genre_word[:4]
# pairs at the beginning of the list genre_word will be of the form ('news', word)
genre_word[-4:]
# pairs at the end of genre_word will be of the form ('romance', word)
cfd = nltk.ConditionalFreqDist(genre_word)
cfd # we can type the name of the variable to inspect it
cfd.conditions()
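# Each condition indexes an ordinary FreqDist, so individual counts can be read off directly:
cfd['news']                     # the FreqDist of all words in the 'news' genre
cfd['romance'].most_common(20)  # the 20 most frequent (word, count) pairs in 'romance'
cfd['romance']['could']         # how often 'could' occurs in the 'romance' genre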