Completed Twitter Quote Bot #15
base: master
Changes from 1 commit
GetTweets.py
@@ -0,0 +1,28 @@
import twitter
from TwitterKeys import *
from Pickling import PickleBuddy, unPickleBuddy


def cleanTweet(unsorted):
    # Pull the quote text out of raw retweet statuses.
    textList = []
    cleanList = []
    for tweet in unsorted:
        textList.append(tweet.text)
    for text in textList:
        # Retweets look like "RT @user: quote ...": split on ':' and take the
        # second chunk, then drop anything from the first hashtag onward.
        if text[0] == 'R':
            cleaned = text.split(':')[1]
            if '#' in cleaned:
                cleaned = cleaned.split("#")[0]
            if cleaned != '':
                cleanList.append(cleaned)
    return cleanList


def getTweets():
    # Cache the search results, then load and clean them.
    PickleBuddy()
    unfilteredTweet = unPickleBuddy()
    niceTweet = cleanTweet(unfilteredTweet)
    return niceTweet


# print (getTweets())
Review comment: Generally we want to remove commented code and multiple trailing newlines.
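As a sketch of the first point, the debug call could live behind a main guard instead of a comment (assuming this file is meant to be runnable directly; nothing else changes):

if __name__ == "__main__":
    # Runs only when the file is executed directly, not when it is imported.
    print(getTweets())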
Pickling.py
@@ -0,0 +1,34 @@
filePath = "C:/Users/sokuno/TextMining"
import os
import pickle
import twitter
from TwitterKeys import *


def PickleBuddy():
    files = os.listdir(filePath)  # make a list of all the files that already exist
    if "cachedData.pickle" not in files:
        api = twitter.Api(consumer_key=CONSUMER_KEY,
                          consumer_secret=CONSUMER_SECRET,
                          access_token_key=ACCESS_TOKEN_KEY,
                          access_token_secret=ACCESS_TOKEN_SECRET)
Review comment: Proper usage, props for that. Though, it might be a little clearer to make explicit that these come from TwitterKeys - e.g. TwitterKeys.CONSUMER_KEY. That's just a style thing though, personal preference and such.
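A sketch of that suggestion, with the same key names just namespaced (not applied in this diff):

import TwitterKeys

api = twitter.Api(consumer_key=TwitterKeys.CONSUMER_KEY,
                  consumer_secret=TwitterKeys.CONSUMER_SECRET,
                  access_token_key=TwitterKeys.ACCESS_TOKEN_KEY,
                  access_token_secret=TwitterKeys.ACCESS_TOKEN_SECRET)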
        fullTweet = api.GetSearch(term='#inspirationalquotes', raw_query=None, geocode=None, since_id=None, max_id=None, until=None, since=None, count=100, lang=None, locale=None, result_type='recent', include_entities=None)
        pickles = open('cachedData.pickle', 'wb')
        pickle.dump(fullTweet, pickles)
        pickles.close()

def unPickleBuddy():
    unpickles = open(filePath+'/cachedData.pickle', 'rb')
    untest = pickle.load(unpickles)
    return untest


PickleBuddy()
unPickleBuddy()
#try:
#cache = open('cachedData.txt')
#except:
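As an aside on the caching pattern above: os.path.exists can ask for the one cache file directly instead of listing the whole directory, and with-blocks guarantee the handles are closed. A minimal sketch (load_or_fetch and its fetch argument are hypothetical names, not part of this diff):

import os
import pickle

def load_or_fetch(cachePath, fetch):
    # fetch is any zero-argument callable that performs the Twitter search.
    if os.path.exists(cachePath):
        with open(cachePath, 'rb') as f:
            return pickle.load(f)
    data = fetch()
    with open(cachePath, 'wb') as f:
        pickle.dump(data, f)
    return data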
README.md
@@ -1,3 +1,13 @@
# TextMining

This is the base repo for the text mining and analysis project for Software Design at Olin College.

First, install/import the following:
- re
- random
- sys
- os
- pickle

Then run the TwitterQuoteBot.py file.
TwitterQuoteBot.py
@@ -0,0 +1,8 @@
import nltk
import re
from GetTweets import getTweets
from sentence_generator import buildMapping, genSentence, main

# Join the cleaned tweets into one string, strip newline characters, and drop
# the stray "T." / "D." fragments before handing the text to the generator.
words = "".join(getTweets())
words = words.replace('\r', '').replace('\n', '').replace('T.','').replace('D.','')
main(words, 1)
sentence_generator.py
@@ -0,0 +1,128 @@
#!/usr/bin/python

import re
import random
import sys

# These mappings can get fairly large -- they're stored globally to
# save copying time.

# (tuple of words) -> {dict: word -> number of times the word appears following the tuple}
# Example entry:
#    ('eyes', 'turned') => {'to': 2.0, 'from': 1.0}
# Used briefly while first constructing the normalized mapping
tempMapping = {}

# (tuple of words) -> {dict: word -> *normalized* number of times the word appears following the tuple}
# Example entry:
#    ('eyes', 'turned') => {'to': 0.66666666, 'from': 0.33333333}
mapping = {}

# Contains the set of words that can start sentences
starts = []

# We want to be able to compare words independent of their capitalization.
def fixCaps(word):
    # Ex: "FOO" -> "foo"
    if word.isupper() and word != "I":
        word = word.lower()
    # Ex: "LaTeX" => "Latex"
    elif word[0].isupper():
        word = word.lower().capitalize()
    # Ex: "wOOt" -> "woot"
    else:
        word = word.lower()
    return word


# Tuples can be hashed; lists can't. We need hashable values for dict keys.
# This looks like a hack (and it is, a little) but in practice it doesn't
# affect processing time too negatively.
def toHashKey(lst):
    return tuple(lst)

# Returns the contents of the file, split into a list of words and
# (some) punctuation.
def wordlist(words):
    # f = open(filename, 'r')
    wordlist = [fixCaps(w) for w in re.findall(r"[\w']+|[.,!?;]", words)]
    # f.close()
    return wordlist
Review comment: Often it's better to open and close files as-needed, avoiding this kind of commenting/uncommenting while developing.
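For instance, a hypothetical helper (not part of this diff) that opens and closes the file exactly where it is needed:

def wordlist_from_file(filename):
    with open(filename, 'r') as f:
        return wordlist(f.read())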

# Self-explanatory -- adds "word" to the "tempMapping" dict under "history".
# tempMapping (and mapping) both match each word to a list of possible next
# words.
# Given history = ["the", "rain", "in"] and word = "Spain", we add "Spain" to
# the entries for ["the", "rain", "in"], ["rain", "in"], and ["in"].
def addItemToTempMapping(history, word):
    global tempMapping
    while len(history) > 0:
        first = toHashKey(history)
        if first in tempMapping:
            if word in tempMapping[first]:
                tempMapping[first][word] += 1.0
            else:
                tempMapping[first][word] = 1.0
        else:
            tempMapping[first] = {}
            tempMapping[first][word] = 1.0
        history = history[1:]

# Building and normalizing the mapping.
def buildMapping(wordlist, markovLength):
    global tempMapping
    starts.append(wordlist[0])
    for i in range(1, len(wordlist) - 1):
        if i <= markovLength:
            history = wordlist[: i + 1]
        else:
            history = wordlist[i - markovLength + 1 : i + 1]
        follow = wordlist[i + 1]
        # if the last elt was a period, add the next word to the start list
        if history[-1] == "." and follow not in ".,!?;":
            starts.append(follow)
        addItemToTempMapping(history, follow)
    # Normalize the values in tempMapping, put them into mapping
    for first, followset in tempMapping.items():
        total = sum(followset.values())
        # Normalizing here:
        mapping[first] = dict([(k, v / total) for k, v in followset.items()])

# Returns the next word in the sentence (chosen randomly),
# given the previous ones.
def next(prevList):
    sum = 0.0
    retval = ""
    index = random.random()
    # Shorten prevList until it's in mapping
    while toHashKey(prevList) not in mapping:
        prevList.pop(0)
    # Get a random word from the mapping, given prevList
    for k, v in mapping[toHashKey(prevList)].items():
        sum += v
        if sum >= index and retval == "":
            retval = k
    return retval
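
# Worked example (illustrative, not from the diff): if mapping[('in',)] is
# {'Spain': 0.5, 'Maine': 0.5} and random.random() returns 0.7, the running sum
# is 0.5 after 'Spain' (not yet >= 0.7) and 1.0 after 'Maine', so next(['in'])
# returns 'Maine'; each candidate is drawn with its normalized probability.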

def genSentence(markovLength):
    # Start with a random "starting word"
    curr = random.choice(starts)
    sent = curr.capitalize()
    prevList = [curr]
    # Keep adding words until we hit a period
    while (curr not in "."):
        curr = next(prevList)
        prevList.append(curr)
        # if the prevList has gotten too long, trim it
        if len(prevList) > markovLength:
            prevList.pop(0)
        if (curr not in ".,!?;"):
            sent += " "  # Add spaces between words (but not punctuation)
        sent += curr
    return sent


def main(words, markov):
    buildMapping(wordlist(words), markov)
    print(genSentence(markov))


if __name__ == "__main__":
    # main() needs a source text and a chain length; calling it with no
    # arguments, as the original did, would raise a TypeError.
    main(sys.argv[1], int(sys.argv[2]))
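A quick usage sketch of this module as a library, mirroring the call in TwitterQuoteBot.py (the sample text is invented, and output varies from run to run):

from sentence_generator import buildMapping, wordlist, genSentence

text = "the rain in Spain falls. the rain in Maine freezes."
buildMapping(wordlist(text), 1)  # markovLength of 1, as in TwitterQuoteBot.py
print(genSentence(1))            # e.g. "The rain in Maine freezes."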
Review comment: I'm assuming this has something to do with the specifics of how the Twitter package/api formats things, but this could use a little documentation, I think.
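For what it's worth, a sketch of the documentation that comment asks for, applied to the cleanup line in TwitterQuoteBot.py (my reading of the "T." / "D." fragments is an assumption, not something the diff confirms):

import re

def tidy(joined):
    # Hypothetical helper, equivalent to the chained .replace() calls: strip
    # carriage returns and newlines, plus the stray "T." / "D." fragments
    # (assumed to be leftovers of truncated retweet text).
    return re.sub(r"\r|\n|T\.|D\.", "", joined)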