-
Notifications
You must be signed in to change notification settings - Fork 0
/
analyze.py
35 lines (30 loc) · 1.4 KB
/
analyze.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
import sys
import nltk
import operator
from nltk.corpus import stopwords
import re
import os
# Python 2 only: the interpreter's startup code removes
# sys.setdefaultencoding, and reloading the sys module makes it callable
# again. The reload must therefore come before the call below.
reload(sys)
# Switch the process-wide default encoding used for implicit str <-> unicode
# conversions from ASCII to UTF-8.
# NOTE(review): presumably added to avoid UnicodeDecodeError on non-ASCII
# corpus text -- confirm it is still needed before porting to Python 3,
# where neither name exists.
sys.setdefaultencoding('utf-8')
### USING NLTK, ANALYZING FREQUENCY OF WORDS IN MY MESSAGES
# Scans every regular file in CORPUS_DIR, counts the words found on every
# second line (starting with the second line of each file), and prints the
# words that occur more than MIN_COUNT times, least frequent first.

# Directory holding the message transcripts, relative to the working directory.
CORPUS_DIR = "corpus/"
# Only words counted more often than this are printed.
MIN_COUNT = 75
# A "word" is a run of 3-15 lowercase ASCII letters delimited by word
# boundaries. Compiled once because it is applied to every message line.
WORD_PATTERN = re.compile(r'\b[a-z]{3,15}\b')

frequency = {}
files = os.listdir(CORPUS_DIR)

# stopwords - don't look for these words for frequency analysis
# NOTE: this assignment rebinds the name `stopwords` imported from
# nltk.corpus, which is why the NLTK list is reached through the fully
# qualified nltk.corpus.stopwords here.
stopwords = set(nltk.corpus.stopwords.words('english')) | {
    u'hehe', u'hmm', u'lol', u'lols', u'ill', u'haha', u'doesnt', u'ooo',
    u'anywhoo', u'kinda', u'might', u'didnt', u'also', u'cause', u'thats',
    u'wanna', u'hes', u'well', u'still', u'ohh', u'lemme', u'since', u'lets',
    u'hows', u'okie', u'havent', u'cant', u'ive', u'dont',
}

for f in files:
    path = os.path.join(CORPUS_DIR, f)
    # os.listdir also returns sub-directories; open() would fail on those.
    if not os.path.isfile(path):
        continue
    # The context manager closes the file even if reading raises.
    with open(path, "r") as handle:
        messages = handle.readlines()
    # analyze messages sent by me: every second line, starting at index 1
    for message in messages[1::2]:
        # lowercase first so the [a-z] pattern also matches capitalised words
        for word in WORD_PATTERN.findall(message.lower()):
            if word not in stopwords:
                frequency[word] = frequency.get(word, 0) + 1

# print the dictionary sorted from lowest to highest frequency; words with the
# same count are ordered alphabetically (sort key is the (count, word) pair)
for key, value in sorted(frequency.items(), key=operator.itemgetter(1, 0)):
    if value > MIN_COUNT:
        print("%s: %s" % (key, value))