Text Mining MiniProject 3 #10

Open · wants to merge 2 commits into master
Binary file added .DS_Store
Binary file not shown.
Binary file added Figure_1.png
Binary file added Reflection.pdf
Binary file not shown.
131 changes: 131 additions & 0 deletions comparing_authorby_frquency.py
@@ -0,0 +1,131 @@
"""
Author: Shreya Rangarajan
Date: 10/12/17
Description: Compare word frequencies of two books by Jane Austen
and two books by Nathaniel Hawthorne
"""

import requests
import numpy as np
import matplotlib.pyplot as plt

# Get books from Gutenberg
pride_and_prejudice = requests.get('http://www.gutenberg.org/files/1342/1342-0.txt').text
emma = requests.get('http://www.gutenberg.org/files/158/158-0.txt').text
scarlet_letter = requests.get('http://www.gutenberg.org/cache/epub/33/pg33.txt').text
seven_gables = requests.get('http://www.gutenberg.org/cache/epub/77/pg77.txt').text
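
# A minimal caching sketch (an addition, not part of the original file): the
# module-level requests.get calls above re-download all four books on every
# run, including doctest runs. A helper like the one below could replace them;
# fetch_text and the 'gutenberg_cache' directory are assumed names.
def fetch_text(url, cache_dir='gutenberg_cache'):
    """Download url once and reuse the locally cached copy on later runs."""
    import os
    os.makedirs(cache_dir, exist_ok=True)
    path = os.path.join(cache_dir, url.rstrip('/').split('/')[-1])
    if not os.path.exists(path):
        with open(path, 'w', encoding='utf-8') as f:
            f.write(requests.get(url).text)
    with open(path, encoding='utf-8') as f:
        return f.read()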

def most_frequent_words(n, text):
""" Computes the most frequent words in a text as well as the number of
words in the entire text

    n: an integer indicating the number of most frequent words to return
    text: a text requested from the Gutenberg database
    returns: a list of (word, count) tuples for the n most frequent words,
    and the total number of words in the text
>>> most_frequent_words(5,pride_and_prejudice)
([('the', 4205), ('to', 4121), ('of', 3662), ('and', 3309), ('a', 1945)], 124592)

>>> most_frequent_words(7, scarlet_letter)
([('the', 5029), ('of', 3332), ('and', 2642), ('a', 2028), ('to', 1993), ('in', 1386), ('with', 996)], 86639)
"""

    # Count occurrences of every whitespace-separated token in the text
    frequency = {}
    split_text = text.split()
    for word in split_text:
        frequency[word] = frequency.get(word, 0) + 1
    # Sort the (word, count) pairs by count, ascending
    sort_top_words = sorted(frequency.items(), key=lambda x: x[1])

Yay lambda functions 👍

sort_top_words.reverse()
    return sort_top_words[:n], len(split_text)
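
# Alternative sketch (an addition, not part of the original file):
# collections.Counter produces the same (word, count) pairs and replaces the
# manual counting and sorting above; most_frequent_words_counter is an
# assumed name.
def most_frequent_words_counter(n, text):
    """Equivalent of most_frequent_words using collections.Counter."""
    from collections import Counter
    split_text = text.split()
    return Counter(split_text).most_common(n), len(split_text)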


def compare_authors(text1,text2):
""" Takes two texts, determines the 500 most frequent words in each text
and returns how many of most frequent words are the same for both
texts.

text1: a unique text requested from Gutenberg database
text2: another unique text requested from Gutenberg database
returns: the count of how many "most frequent words" are similar for
the 2 texts

>>> compare_authors(emma, pride_and_prejudice)
375
>>> compare_authors(seven_gables, scarlet_letter)
366
"""
most_freq_word_txt1 = most_frequent_words(500,text1)[0]
most_freq_word_txt2 = most_frequent_words(500,text2)[0]

words_txt1 = [x[0] for x in most_freq_word_txt1]
words_txt2 = [x[0] for x in most_freq_word_txt2]

count = 0
for word in words_txt1:
if word in words_txt2:
count += 1
return count
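
# Alternative sketch (an addition, not part of the original file): the same
# overlap count can be computed with a set intersection instead of the nested
# membership loop; compare_authors_set is an assumed name.
def compare_authors_set(text1, text2):
    """Equivalent of compare_authors using set intersection."""
    words1 = {word for word, count in most_frequent_words(500, text1)[0]}
    words2 = {word for word, count in most_frequent_words(500, text2)[0]}
    return len(words1 & words2)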


def frequency_table(textlist):
""" Takes a list of texts and returns the titles of the texts along with
the count of how many "most frequent words" are similar to both texts

#>>> frequency_table([emma, pride_and_prejudice])
#The Project Gutenberg EBook of Pride and Prejudice, by Jane Austen
#The Project Gutenberg EBook of Emma, by Jane Austen
#375
    Disclaimer: the doctest's "expected" and "got" output look identical, but
    the test does not pass, so it is left commented out.
"""
    # Work on a copy so the caller's list is not mutated, and iterate over a
    # second copy so removing items does not skip any text.
    running_text_list = list(textlist)
    for text in list(textlist):
        running_text_list.remove(text)
        for compare_text in running_text_list:
            compare1 = compare_authors(compare_text, text)
            # The first line of a Gutenberg file is its title; strip the BOM.
            print(compare_text.split('\n', 1)[0].replace('\ufeff', ''))
            print(text.split('\n', 1)[0].replace('\ufeff', ''))
            print(compare1)
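
# Alternative sketch (an addition, not part of the original file):
# itertools.combinations yields each unordered pair of texts exactly once,
# which prints the same pairwise report without mutating any list;
# frequency_table_pairs is an assumed name.
def frequency_table_pairs(textlist):
    """Equivalent of frequency_table using itertools.combinations."""
    from itertools import combinations
    for text_a, text_b in combinations(textlist, 2):
        print(text_a.split('\n', 1)[0].replace('\ufeff', ''))
        print(text_b.split('\n', 1)[0].replace('\ufeff', ''))
        print(compare_authors(text_a, text_b))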

def lexical_diversity(text):
""" Computes and returns the lexial diversity of a text

text: a text requested form the Gutenberg database
returns: lexical diversity of the text
>>> lexical_diversity(emma)
0.11230976330254647

>>> lexical_diversity(scarlet_letter)
0.17630628239014762
"""
    split_text = text.split()
    # Lexical diversity = number of distinct words / total number of words
    total_num_unique_words = len(set(split_text))
    lex_div = total_num_unique_words / len(split_text)
    return lex_div

def plot_lexical_diversity(textlist, title_list):
""" Plots the lexical diversity of each text in a list against their title

textlist: list of texts from Gutenberg database
title_list: list of book titles that correspond to the textlist
    returns: nothing; displays a plot with book titles on the x axis and
    lexical diversity on the y axis
"""
lex_div_list = []
for text in textlist:
lex_div_list.append(lexical_diversity(text))

    # One x position per book, labeled with its title
    x = np.arange(len(textlist))
    plt.xticks(x, title_list)
    plt.plot(x, lex_div_list)
    plt.title('Lexical Diversity of 4 Different Books')
    plt.xlabel('Book Name')
    plt.ylabel('Lexical Diversity')
plt.show()


if __name__ == "__main__":
import doctest
doctest.testmod()
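
One possible way to exercise the whole pipeline (a sketch added here, not part of the submitted file): extend the doctest-only main block so it also prints the pairwise overlap table and plots lexical diversity for the four downloaded books. The title strings below are the standard titles of the four Gutenberg texts fetched at the top of the file.

if __name__ == "__main__":
    import doctest
    doctest.testmod()
    books = [pride_and_prejudice, emma, scarlet_letter, seven_gables]
    titles = ['Pride and Prejudice', 'Emma', 'The Scarlet Letter',
              'The House of the Seven Gables']
    frequency_table(list(books))
    plot_lexical_diversity(books, titles)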