summarizer.py
import math
from collections import Counter
from heapq import nlargest
from string import punctuation

import nltk
import requests
import spacy
from bs4 import BeautifulSoup
from deep_translator import MyMemoryTranslator
from flask import Flask, jsonify, request
from flask_cors import CORS
from newspaper import Article
from spacy.lang.en.stop_words import STOP_WORDS
from sumy.nlp.tokenizers import Tokenizer
from sumy.parsers.plaintext import PlaintextParser
from sumy.summarizers.lsa import LsaSummarizer
from sumy.utils import get_stop_words
from textblob import TextBlob

nltk.download('punkt')

# Load the spaCy model once at import time rather than on every request.
nlp = spacy.load('en_core_web_md')

app = Flask(__name__)
CORS(app)
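# Assumes the spaCy model has been installed beforehand, e.g.:
#   python -m spacy download en_core_web_md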
lang_dict = {
'English': 'en',
'French': 'fr',
'Spanish': 'es',
'German': 'de',
'Italian': 'it',
'Russian': 'ru',
'Japanese': 'ja',
'Hindi': 'hi'
}
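# Example: a 'targetLang' of 'French' is mapped through lang_dict to 'fr'
# before being handed to the translator.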
# Post-processing: normalize whitespace, capitalize the first letter, and
# make sure the summary ends with terminal punctuation.
def post_process_summary(summary):
    if not summary:
        return summary
    # Replace non-breaking spaces and trim surrounding whitespace.
    summary = summary.replace('\xa0', ' ').strip()
    if not summary:
        return summary
    summary = summary[0].upper() + summary[1:]
    if summary[-1] not in '.!?':
        summary += '.'
    return summary
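# Sketch of the intended behavior on hypothetical inputs:
#   post_process_summary('hello\xa0world')  -> 'Hello world.'
#   post_process_summary('all done!')       -> 'All done!'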
# Summarization using spaCy: score each sentence by the normalized
# frequencies of the content words it contains, then keep the strongest.
def summarize_using_spacy(text, number_of_sentences):
    doc = nlp(text)
    # Token filtering: keep content words, drop stop words and punctuation.
    keywords = []
    pos_tags = {'PROPN', 'ADJ', 'NOUN', 'VERB'}
    for token in doc:
        # Lowercase before the stop-word check so capitalized forms
        # like 'The' are filtered out as well.
        if token.text.lower() in STOP_WORDS or token.text in punctuation:
            continue
        if token.pos_ in pos_tags:
            keywords.append(token.text)
    freq_word = Counter(keywords)
    # Normalization: scale every count by the most frequent keyword's count.
    if freq_word:
        max_freq = freq_word.most_common(1)[0][1]
        for word in freq_word:
            freq_word[word] = freq_word[word] / max_freq
    # Weigh sentences by the summed weights of the keywords they contain.
    sent_strength = {}
    for sent in doc.sents:
        for word in sent:
            if word.text in freq_word:
                sent_strength[sent] = sent_strength.get(sent, 0.0) + freq_word[word.text]
    summarized_sentences = nlargest(number_of_sentences, sent_strength, key=sent_strength.get)
    summary = ' '.join(sent.text for sent in summarized_sentences)
    # The five most frequent keywords double as tags.
    tags = [tag for tag, _ in freq_word.most_common(5)]
    return post_process_summary(summary), tags
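# Worked example of the normalization step: with raw counts
# {'data': 4, 'model': 2, 'training': 1} (hypothetical), the weights become
# {'data': 1.0, 'model': 0.5, 'training': 0.25}, so a sentence containing
# 'data' and 'model' scores 1.5.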
# Summarization using sumy's LSA (latent semantic analysis) summarizer
def summarize_using_sumy(text, number_of_sentences):
LANGUAGE = "english"
parser = PlaintextParser.from_string(text, Tokenizer(LANGUAGE))
summarizer = LsaSummarizer()
summarizer.stop_words = get_stop_words(LANGUAGE)
summarized_sentences = summarizer(parser.document, number_of_sentences)
summary = ' '.join([str(sentence) for sentence in summarized_sentences])
return post_process_summary(summary)
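# Note: LsaSummarizer ranks sentences via a singular value decomposition of
# the term-sentence matrix, so it favors sentences that cover the document's
# dominant topics rather than just its most frequent words.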
# Ensemble: split the sentence budget between the two summarizers according
# to the weights, then concatenate their outputs (sumy first, spaCy second).
def ensemble_summarization(text, count, sumy_weight=0.8, spacy_weight=0.2):
    spacy_sentences = int(math.ceil(count * spacy_weight))
    sumy_sentences = count - spacy_sentences
    # spaCy summarizer (also returns the top-5 tags)
    spacy_summary, top_5_tags = summarize_using_spacy(text, spacy_sentences)
    # Sumy summarizer
    sumy_summary = summarize_using_sumy(text, sumy_sentences)
    combined_summary = sumy_summary + ' ' + spacy_summary
    return post_process_summary(combined_summary), top_5_tags
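# With count=5 and the default weights, spaCy contributes
# ceil(5 * 0.2) = 1 sentence and sumy the remaining 4.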
# Sentiment analysis: average the per-sentence TextBlob polarity and
# subjectivity over the whole text.
def get_sentiment(text):
    doc = nlp(text)
    sentences = [sent.text for sent in doc.sents]
    if not sentences:
        # Guard against division by zero on empty input.
        return get_sentiment_scale(0.0), get_subjectivity_scale(0.0)
    polarity = 0.0
    subjectivity = 0.0
    for sentence in sentences:
        blob = TextBlob(sentence)
        polarity += blob.sentiment.polarity
        subjectivity += blob.sentiment.subjectivity
    num_sentences = len(sentences)
    avg_polarity = polarity / num_sentences
    avg_subjectivity = subjectivity / num_sentences
    return get_sentiment_scale(avg_polarity), get_subjectivity_scale(avg_subjectivity)
def get_sentiment_scale(avg_polarity):
if avg_polarity >= 0.2:
return 1 # Positive
elif avg_polarity >= 0:
return 0 # Neutral
else:
return -1 # Negative
def get_subjectivity_scale(avg_subjectivity):
    if avg_subjectivity < 0.4:
        return 1  # Objective
    elif avg_subjectivity < 0.6:
        return 0  # Neutral
    else:
        return -1  # Subjective
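# Example: avg_polarity = 0.35 and avg_subjectivity = 0.72 (hypothetical)
# map to (1, -1), i.e. a positive but subjective text.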
def generate_citations(website_url):
metadata = get_metadata_information(website_url)
citation_parts = []
if metadata.get("author"):
citation_parts.append(metadata["author"])
if metadata.get("date_published"):
citation_parts.append(f"({metadata['date_published']})")
if metadata.get("title"):
citation_parts.append(metadata["title"])
citation_parts.append(website_url)
citation = ". ".join(citation_parts)
return citation
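# Illustrative output with hypothetical metadata:
#   "Jane Doe. (2023-05-01T00:00:00Z). Example Title. https://example.com/article"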
def get_metadata_information(website_url):
    # A request timeout keeps a slow site from hanging the endpoint.
    response = requests.get(website_url, timeout=10)
    soup = BeautifulSoup(response.content, 'html.parser')
    # Title: prefer the <title> tag, fall back to an <h1 class="title">.
    title_tag = soup.find('title')
    if title_tag:
        title = title_tag.text.strip()
    else:
        title_tag = soup.find('h1', {'class': 'title'})
        title = title_tag.text.strip() if title_tag else ""
    # Author: prefer the meta tag, fall back to common class names.
    author_tag = soup.find('meta', {'name': 'author'})
    if author_tag:
        author = author_tag.get('content') or ""
    else:
        author_tag = soup.find('span', {'class': 'author'}) or soup.find('div', {'class': 'author'})
        author = author_tag.text if author_tag else ""
    # Publication date: prefer the Open Graph meta tag, fall back to <time>.
    # Attribute lookups can return None, so coerce to "" before stripping.
    date_published_tag = soup.find('meta', {'property': 'article:published_time'})
    if date_published_tag:
        date_published = (date_published_tag.get('content') or "").strip()
    else:
        date_published_tag = soup.find('time', {'class': 'published'})
        date_published = (date_published_tag.get('datetime') or "").strip() if date_published_tag else ""
    metadata = {
        "title": title,
        "author": author,
        "website_url": website_url,
        "date_published": date_published
    }
    return metadata
def perform_text_translation(text, source_lang, destination_lang, batch_size=400):
    source_lang_key = lang_dict[source_lang]
    target_lang_key = lang_dict[destination_lang]
    translator = MyMemoryTranslator(source=source_lang_key, target=target_lang_key)
    # Split the text into batches of size batch_size; MyMemory rejects
    # requests above roughly 500 characters, so 400 leaves headroom.
    # (Naive slicing can cut a word in half at a batch boundary.)
    num_batches = math.ceil(len(text) / batch_size)
    batches = [text[i*batch_size:(i+1)*batch_size] for i in range(num_batches)]
    # Translate each batch and concatenate the results.
    translated_batches = []
    for batch in batches:
        translated_batch = translator.translate(batch)
        translated_batches.append(translated_batch)
    translated_text = ''.join(translated_batches)
    return translated_text
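# Usage sketch (hypothetical text; actual output depends on the MyMemory API):
#   perform_text_translation('Good morning', 'English', 'Spanish')
#   -> something like 'Buenos días'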
@app.route('/summarize', methods=['POST'])
def summarize_article():
    data = request.json
    website_url = data['message']
    numSentences = data.get('numSentences', 2)  # Number of sentences requested
    numSentences = max(1, int(numSentences))  # Ensure numSentences is at least 1
    target_lang = data.get('targetLang')
    # First try newspaper's article extractor; it raises its own
    # ArticleException (not requests' exceptions) on download/parse failures,
    # so catch broadly and fall back to plain scraping.
    try:
        article = Article(website_url)
        article.download()
        article.parse()
        text = article.text
    except Exception as e:
        print("FAILURE EXCEPTION HIT: ", e)
        text = ""
    if text == "":
        response = requests.get(website_url, timeout=10)
        soup = BeautifulSoup(response.content, 'html.parser')
        main_content = soup.find('div', {'class': 'main-content'})
        if main_content:
            text = main_content.get_text()
            text = ' '.join(text.split())
        if text == "" and soup.body:
            text = soup.body.get_text(separator=' ')
    if text == "":
        # Returning None would crash Flask; send a proper error response.
        return jsonify({'error': 'Could not extract any text from the URL'}), 400
    citation = generate_citations(website_url)
    summary, tags = ensemble_summarization(text, numSentences)
    polarity, subjectivity = get_sentiment(text)
    # Only translate when a non-English target was requested; MyMemory
    # may reject identical source and target languages.
    if target_lang and target_lang != 'English':
        summary = perform_text_translation(summary, 'English', target_lang)
    response_data = {
        'summary': summary,
        'tags': tags,
        'sentiment': polarity,
        'subjectivity': subjectivity,
        'citation': citation
    }
    return jsonify(response_data)
if __name__ == '__main__':
app.run(host='localhost', port=8000)
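# Example request against a local instance (hypothetical article URL):
#   curl -X POST http://localhost:8000/summarize \
#        -H 'Content-Type: application/json' \
#        -d '{"message": "https://example.com/article",
#             "numSentences": 3, "targetLang": "French"}'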