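# process.py: reads a Facebook conversation export (JSON) and produces
#   - wordDump.txt            raw message text for word-cloud tools
#   - userList                per-user word and message totals
#   - timeLine.csv / dateDict words per calendar day
#   - hourDict                messages per hour of the day
#   - histoString             word:frequency pairs built with NLTK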
import json
import sys
import datetime
from pprint import pprint

import nltk
# filled in by wordHistogram()
histoString = ''
# filled in by timeLine()
dateDict = {}
# filled in by hourHistogram()
hourDict = {}
# raw string so backslashes in the Windows path (e.g. \f) are not treated as escapes
with open(r'C:\Users\David\Documents\GitHub\fbData\chelConvo') as jsonData:
    d = json.load(jsonData)
# userList structure:
# {
#     user_name: {
#         'words': <total word count>,
#         'messages': <total message count>
#     },
#     ...
# }
userList = {}
def wordCount():
    # dump the raw message text (one message per line) for word-cloud tools,
    # and tally per-user word and message counts into userList
    with open('wordDump.txt', 'w') as f:
        for m in d['messages']:
            try:
                f.write(m['text'] + '\n')
            except UnicodeEncodeError:
                # skip messages that can't be encoded for the dump file
                pass
            # count whitespace-separated words in the message
            numWords = len(m['text'].split(" "))
            # initialize the user's entry on first sight, otherwise accumulate
            if m['user'] not in userList:
                userList[m['user']] = {'words': numWords, 'messages': 1}
            else:
                userList[m['user']]['words'] += numWords
                userList[m['user']]['messages'] += 1
def timeLine():
    # group consecutive messages by calendar day and write a words-per-day CSV;
    # the same counts are also recorded in dateDict
    fl = open('timeLine.csv', 'w')
    msgs = iter(d['messages'])
    date = None
    dayWords = 0
    try:
        m = next(msgs)
        while True:
            dayWords = 0
            date = datetime.datetime.strptime(m['date'], '%Y-%m-%dT%H:%M:%S.%fZ')
            temp = date
            # accumulate words while messages stay on the same day
            while temp.date() == date.date():
                dayWords += len(m['text'].split(" "))
                m = next(msgs)
                temp = datetime.datetime.strptime(m['date'], '%Y-%m-%dT%H:%M:%S.%fZ')
            dateDict[str(date.date())] = dayWords
            fl.write(str(date.date()) + ',' + str(dayWords) + '\n')
    except StopIteration:
        # no more messages: record the partially counted final day
        if date is not None:
            dateDict[str(date.date())] = dayWords
            fl.write(str(date.date()) + ',' + str(dayWords) + '\n')
    fl.close()
# TODO: timestamps are GMT; shift by -5 hours for local time
def hourHistogram():
    # count how many messages fall in each hour of the day
    for m in d['messages']:
        date = datetime.datetime.strptime(m['date'], '%Y-%m-%dT%H:%M:%S.%fZ')
        if date.hour not in hourDict:
            hourDict[date.hour] = 1
        else:
            hourDict[date.hour] += 1
def wordHistogram():
    # build a word:frequency listing from the dumped message text
    global histoString
    with open('wordDump.txt', 'r') as f2:
        txt = f2.read()
    tokens = nltk.word_tokenize(txt)  # tokenize text
    clean_tokens = []
    for word in tokens:
        word = word.lower()
        if word.isalpha():  # drop all non-words
            clean_tokens.append(word)
    # make a frequency distribution of the words
    fd = nltk.FreqDist(clean_tokens)
    for token in fd:
        histoString += token + ':' + str(fd[token]) + '\n'
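
# Hypothetical driver: a minimal sketch of how these functions might be run
# together; it is not part of the original file, which only defines them.
# wordHistogram() reads wordDump.txt, so wordCount() must run first.
if __name__ == '__main__':
    wordCount()       # per-user totals + wordDump.txt
    timeLine()        # timeLine.csv + dateDict
    hourHistogram()   # hourDict
    wordHistogram()   # histoString
    pprint(userList)
    pprint(hourDict)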