Completed Twitter Quote Bot #15

Open · wants to merge 3 commits into master
28 changes: 28 additions & 0 deletions GetTweets.py
@@ -0,0 +1,28 @@
from Pickling import PickleBuddy, unPickleBuddy

def cleanTweet(unsorted):
    """Strip retweet prefixes and trailing hashtags from a batch of tweets."""
    textList = []
    cleanList = []
    for tweet in unsorted:
        textList.append(tweet.text)
    for text in textList:
        if text[0] == 'R':  # crude check for a retweet ("RT @user: ...")
            cleaned = text.split(':')[1]  # keep the text after the "RT @user:" prefix

Review comment: I'm assuming this has something to do with the specifics of how the Twitter package/API formats things, but this could use a little documentation, I think. (A sketch of the format in question appears at the end of this file's diff.)

            if '#' in cleaned:
                cleaned = cleaned.split("#")[0]  # drop everything from the first hashtag on
            if cleaned != '':
                cleanList.append(cleaned)
    return cleanList


def getTweets():
    """Fetch (or load cached) tweets and return their cleaned-up text."""
    PickleBuddy()
    unfilteredTweet = unPickleBuddy()
    niceTweet = cleanTweet(unfilteredTweet)
    return niceTweet


# print(getTweets())

Review comment: Generally we want to remove commented-out code and multiple trailing newlines.
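
On the documentation point above: a minimal illustration of what cleanTweet is undoing, assuming the python-twitter library returns Status objects whose .text for a retweet reads "RT @user: quote text #hashtags" (the sample string below is made up):

    text = "RT @someuser: Dream big and dare to fail #inspirationalquotes"
    if text[0] == 'R':                       # crude retweet check
        cleaned = text.split(':')[1]         # -> " Dream big and dare to fail #inspirationalquotes"
        if '#' in cleaned:
            cleaned = cleaned.split('#')[0]  # -> " Dream big and dare to fail "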

34 changes: 34 additions & 0 deletions Pickling.py
@@ -0,0 +1,34 @@
filePath = "C:/Users/sokuno/TextMining"
import os
import pickle
import twitter
from TwitterKeys import *


def PickleBuddy():
    """Fetch tweets matching the search term and cache them to disk, once."""
    files = os.listdir(filePath)  # make a list of the files that already exist
    if "cachedData.pickle" not in files:
        api = twitter.Api(consumer_key=CONSUMER_KEY,

Review comment: Proper usage, props for that. Though, it might be a little clearer to signal that these come from TwitterKeys, e.g. TwitterKeys.CONSUMER_KEY. That's just a style thing though, personal preference and such. (That variant is sketched at the end of this file's diff.)

                          consumer_secret=CONSUMER_SECRET,
                          access_token_key=ACCESS_TOKEN_KEY,
                          access_token_secret=ACCESS_TOKEN_SECRET)
        fullTweet = api.GetSearch(term='#inspirationalquotes', count=100,
                                  result_type='recent')
        pickles = open(filePath + '/cachedData.pickle', 'wb')
        pickle.dump(fullTweet, pickles)
        pickles.close()

def unPickleBuddy():
    """Load and return the cached tweets."""
    unpickles = open(filePath + '/cachedData.pickle', 'rb')
    untest = pickle.load(unpickles)
    unpickles.close()
    return untest


PickleBuddy()
unPickleBuddy()
# try:
#     cache = open('cachedData.txt')
# except:
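
On the TwitterKeys point above: a sketch of the module-qualified variant, assuming TwitterKeys defines the same four constants used here:

    import twitter
    import TwitterKeys

    api = twitter.Api(consumer_key=TwitterKeys.CONSUMER_KEY,
                      consumer_secret=TwitterKeys.CONSUMER_SECRET,
                      access_token_key=TwitterKeys.ACCESS_TOKEN_KEY,
                      access_token_secret=TwitterKeys.ACCESS_TOKEN_SECRET)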
10 changes: 10 additions & 0 deletions README.md
@@ -1,3 +1,13 @@
# TextMining

This is the base repo for the text mining and analysis project for Software Design at Olin College.

First, install the `twitter` package (this appears to be the python-twitter library, e.g. `pip install python-twitter`); the rest of the modules the project uses ship with Python:

- re
- random
- sys
- twitter
- os
- pickle

You will also need a `TwitterKeys.py` with your Twitter API credentials; it is not checked into the repo (a sketch follows below).

Then run the bot with the TwitterQuoteBot.py file.
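
A minimal `TwitterKeys.py` might look like this (placeholder values only):

    # TwitterKeys.py -- your own Twitter app credentials; keep this file private
    CONSUMER_KEY = "your-consumer-key"
    CONSUMER_SECRET = "your-consumer-secret"
    ACCESS_TOKEN_KEY = "your-access-token-key"
    ACCESS_TOKEN_SECRET = "your-access-token-secret"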
Binary file added TextMining_reflection.pdf
8 changes: 8 additions & 0 deletions TwitterQuoteBot.py
@@ -0,0 +1,8 @@

from GetTweets import getTweets
from sentence_generator import main

# Glue the cleaned tweets into one blob of text for the sentence generator,
# stripping newlines and the literal substrings 'T.' and 'D.'.
words = "".join(getTweets())
words = words.replace('\r', '').replace('\n', '').replace('T.', '').replace('D.', '')
main(words, 1)  # build the Markov mapping and print one generated sentence
128 changes: 128 additions & 0 deletions sentence_generator.py
@@ -0,0 +1,128 @@
#!/usr/bin/python

import re
import random
import sys

# These mappings can get fairly large -- they're stored globally to
# save copying time.

# (tuple of words) -> {dict: word -> number of times the word appears following the tuple}
# Example entry:
# ('eyes', 'turned') => {'to': 2.0, 'from': 1.0}
# Used briefly while first constructing the normalized mapping
tempMapping = {}

# (tuple of words) -> {dict: word -> *normalized* number of times the word appears following the tuple}
# Example entry:
# ('eyes', 'turned') => {'to': 0.66666666, 'from': 0.33333333}
mapping = {}

# Contains the set of words that can start sentences
starts = []

# We want to be able to compare words independent of their capitalization.
def fixCaps(word):
    # Ex: "FOO" -> "foo"
    if word.isupper() and word != "I":
        word = word.lower()
    # Ex: "LaTeX" -> "Latex"
    elif word[0].isupper():
        word = word.lower().capitalize()
    # Ex: "wOOt" -> "woot"
    else:
        word = word.lower()
    return word

# Tuples can be hashed; lists can't. We need hashable values for dict keys.
# This looks like a hack (and it is, a little) but in practice it doesn't
# affect processing time too negatively.
def toHashKey(lst):
    return tuple(lst)

# Returns the given text, split into a list of words and
# (some) punctuation.
def wordlist(words):
    # f = open(filename, 'r')

Review comment: Often it's better to open and close files as needed, avoiding this kind of commenting/uncommenting while developing. (A sketch follows this function.)

    wordlist = [fixCaps(w) for w in re.findall(r"[\w']+|[.,!?;]", words)]
    # f.close()
    return wordlist
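
# Sketch of the reviewer's suggestion above (this file-based variant is
# hypothetical, not part of the PR): a `with` block opens and closes the file
# in one place, so nothing needs commenting out while developing.
def wordlistFromFile(filename):
    with open(filename, 'r') as f:
        return [fixCaps(w) for w in re.findall(r"[\w']+|[.,!?;]", f.read())]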

# Adds "word" to the "tempMapping" dict under "history".
# tempMapping (and mapping) both map each history tuple to the words that can
# follow it.
# Given history = ["the", "rain", "in"] and word = "Spain", we add "Spain" to
# the entries for ["the", "rain", "in"], ["rain", "in"], and ["in"].
def addItemToTempMapping(history, word):
    global tempMapping
    while len(history) > 0:
        first = toHashKey(history)
        if first in tempMapping:
            if word in tempMapping[first]:
                tempMapping[first][word] += 1.0
            else:
                tempMapping[first][word] = 1.0
        else:
            tempMapping[first] = {}
            tempMapping[first][word] = 1.0
        history = history[1:]

# Building and normalizing the mapping.
def buildMapping(wordlist, markovLength):
    global tempMapping
    starts.append(wordlist[0])
    for i in range(1, len(wordlist) - 1):
        if i <= markovLength:
            history = wordlist[: i + 1]
        else:
            history = wordlist[i - markovLength + 1 : i + 1]
        follow = wordlist[i + 1]
        # If the last element was a period, add the next word to the start list.
        if history[-1] == "." and follow not in ".,!?;":
            starts.append(follow)
        addItemToTempMapping(history, follow)
    # Normalize the counts in tempMapping into probabilities, stored in mapping.
    for first, followset in tempMapping.items():
        total = sum(followset.values())
        mapping[first] = dict([(k, v / total) for k, v in followset.items()])

# Returns the next word in the sentence (chosen randomly),
# given the previous ones.
def next(prevList):
    total = 0.0
    retval = ""
    index = random.random()
    # Shorten prevList until it's a key in mapping.
    while toHashKey(prevList) not in mapping:
        prevList.pop(0)
    # Walk the follow distribution until the cumulative weight passes index.
    for k, v in mapping[toHashKey(prevList)].items():
        total += v
        if total >= index and retval == "":
            retval = k
    return retval

def genSentence(markovLength):
    # Start with a random "starting word".
    curr = random.choice(starts)
    sent = curr.capitalize()
    prevList = [curr]
    # Keep adding words until we hit a period.
    while curr not in ".":
        curr = next(prevList)
        prevList.append(curr)
        # If prevList has gotten too long, trim it.
        if len(prevList) > markovLength:
            prevList.pop(0)
        if curr not in ".,!?;":
            sent += " "  # add spaces between words (but not before punctuation)
        sent += curr
    return sent

def main(words, markov):
    buildMapping(wordlist(words), markov)
    print(genSentence(markov))

if __name__ == "__main__":
    # When run directly, read the source text from a file named on the
    # command line, e.g.: python sentence_generator.py corpus.txt
    with open(sys.argv[1], 'r') as f:
        main(f.read(), 1)
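
For a quick sanity check of the generator on its own, a minimal run might look like this (the sample text is made up):

    from sentence_generator import buildMapping, wordlist, genSentence

    text = "the rain in Spain stays mainly in the plain. the rain falls."
    buildMapping(wordlist(text), 1)
    print(genSentence(1))  # e.g. "The rain falls."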