# Import Libraries
import streamlit as st
import pandas as pd
import plotly.express as px
import re
import string
import os
import sys
import pytz
import json
import nltk
import pyperclip
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim import corpora
from gensim.models import LdaModel
from datetime import datetime
import matplotlib.pyplot as plt
import seaborn as sns
from reportlab.lib.pagesizes import letter
from reportlab.lib.styles import getSampleStyleSheet
from reportlab.platypus import (Table, TableStyle, Paragraph, Image, Spacer, SimpleDocTemplate, PageBreak)
from reportlab.lib import colors
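# word_tokenize and stopwords below need NLTK's 'punkt' and 'stopwords' data;
# this guard (an addition, not in the original) fetches them once if missing
for _resource, _path in [("punkt", "tokenizers/punkt"), ("stopwords", "corpora/stopwords")]:
    try:
        nltk.data.find(_path)
    except LookupError:
        nltk.download(_resource, quiet=True)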
# TO DO
# Add button to clear model cache from front-end to enable pie chart regeneration.
# If an internet connection is present, check for updates, then show a button to update.
# Show/hide terminal button only on local URL, not network URL.
# Update GitHub project structure in README.
# Add button to close app from frontend when terminal is set to hidden.
# Add new languages like French, Spanish, etc. to the app.
# Make mobile version of app.
# Make Linux version of app.
# Make code object-oriented.
# Regenerate pipreqs file.
# Set page configuration including favicon
if getattr(sys, 'frozen', False):
    # Running in a PyInstaller bundle; sys._MEIPASS is the temporary extraction directory
    favicon_path = os.path.join(sys._MEIPASS, 'favicon.ico')
else:
    # Running in a normal Python environment
    favicon_path = 'dist/favicon.ico'
# Configure page
st.set_page_config(
page_title="Chat Analyzer",
page_icon=favicon_path,
layout="wide",
initial_sidebar_state="expanded",
menu_items={
'Report a bug': None,
'About': None
}
)
# Force dark theme using custom CSS
st.markdown("""
<style>
/* Override Streamlit's default theme */
:root {
--primary-color: #FF4B4B;
background-color: #0E1117;
}
.stApp {
background-color: #0E1117;
color: #FAFAFA;
}
.sidebar .sidebar-content {
background-color: #262730;
}
/* Make dataframe fill width */
.stDataFrame {
width: 100% !important;
}
div[data-testid="stDataFrame"] > div {
width: 100% !important;
}
</style>
""", unsafe_allow_html=True)
# Hide Streamlit's default error messages in production (frozen) builds only
if getattr(sys, 'frozen', False):
    st.markdown("""
    <style>
    .stException, .stError {
        display: none !important;
    }
    </style>
    """, unsafe_allow_html=True)
def copy_to_clipboard(text):
"""Copy text to clipboard"""
pyperclip.copy(text)
st.session_state["copy_message"] = f"{text} Copied!"
def display_network_urls():
"""Display network URLs in the Streamlit interface"""
local_ip = os.environ.get('STREAMLIT_LOCAL_IP', 'localhost')
network_ip = os.environ.get('STREAMLIT_NETWORK_IP', 'localhost')
port = os.environ.get('STREAMLIT_SERVER_PORT', '8501')
# Create a container for URLs
url_container = st.container()
with url_container:
st.markdown("#### Network Access URLs")
cols = st.columns(2)
with cols[0]:
local_url = f"http://{local_ip}:{port}"
st.markdown(f"**Local URL:**")
st.code(local_url)
if st.button("Copy Local URL"):
copy_to_clipboard(local_url)
with cols[1]:
if network_ip != 'localhost':
network_url = f"http://{network_ip}:{port}"
st.markdown(f"**Network URL:**")
st.code(network_url)
if st.button("Copy Network URL"):
copy_to_clipboard(network_url)
else:
st.markdown("**Network URL:**")
st.warning("Network access not available")
if "copy_message" in st.session_state:
st.success(st.session_state.pop("copy_message"))
st.markdown("*Other devices on the same network can access the app using the Network URL.*")
def save_terminal_setting(show_terminal):
"""Save terminal visibility setting to a file"""
settings_file = os.path.join(os.path.expanduser('~'), '.chat_analyzer_settings.json')
settings = {"show_terminal": show_terminal}
with open(settings_file, 'w') as f:
json.dump(settings, f)
def load_terminal_setting():
    """Load terminal visibility setting from file"""
    settings_file = os.path.join(os.path.expanduser('~'), '.chat_analyzer_settings.json')
    if os.path.exists(settings_file):
        try:
            with open(settings_file, 'r') as f:
                settings = json.load(f)
            return settings.get("show_terminal", False)
        except (json.JSONDecodeError, OSError):
            # A corrupt or unreadable settings file falls back to the default
            return False
    return False
# Function Definitions
def save_lda_checkpoint(lda_model, dictionary, checkpoint_dir="Model", checkpoint_name="lda_checkpoint"):
"""Saves the LDA model and dictionary as a checkpoint."""
os.makedirs(checkpoint_dir, exist_ok=True)
model_path = os.path.join(checkpoint_dir, f"{checkpoint_name}.model")
dict_path = os.path.join(checkpoint_dir, f"{checkpoint_name}.dict")
lda_model.save(model_path)
dictionary.save(dict_path)
print(f"Checkpoint Saved: {model_path}, {dict_path}")
def load_lda_checkpoint(checkpoint_dir="Model", checkpoint_name="lda_checkpoint"):
"""Loads the LDA model and dictionary from a checkpoint."""
model_path = os.path.join(checkpoint_dir, f"{checkpoint_name}.model")
dict_path = os.path.join(checkpoint_dir, f"{checkpoint_name}.dict")
if not os.path.exists(model_path) or not os.path.exists(dict_path):
raise FileNotFoundError("Checkpoint Files Not Found!")
lda_model = LdaModel.load(model_path)
dictionary = corpora.Dictionary.load(dict_path)
print(f"Checkpoint Loaded: {model_path}, {dict_path}")
return lda_model, dictionary
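# Sketch of a cache-clearing helper for the "clear model cache" TODO above
# (an addition, not in the original): removes the checkpoint files so the next
# generate_summary call retrains the LDA model from scratch
def clear_lda_checkpoint(checkpoint_dir="Model", checkpoint_name="lda_checkpoint"):
    """Deletes the saved LDA checkpoint, including any gensim side files."""
    import glob
    for path in glob.glob(os.path.join(checkpoint_dir, f"{checkpoint_name}*")):
        os.remove(path)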
def process_json_to_txt(file_content):
"""Converts a JSON chat file into text format."""
try:
data = json.loads(file_content.decode("utf-8"))
except json.JSONDecodeError:
st.error("The uploaded JSON file is not valid.")
return None
output_lines = []
for message in data.get("messages", []):
if message.get("type") == "message":
try:
dt = datetime.fromisoformat(message["date"])
date_formatted = dt.strftime('%d/%m/%Y, %I:%M %p').lower()
username = message.get("from", "Unknown")
content = message.get("text", "")
line = f"{date_formatted} - {username}: {content}"
output_lines.append(line)
except (ValueError, KeyError):
continue
return "\n".join(output_lines)
def parse_chat(file_content):
"""Parses chat file content into a DataFrame."""
chat_data = []
data = file_content.decode("utf-8").splitlines()
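    # Example lines the patterns below are written for:
    #   "12/05/2024, 9:41 pm - Alice: See you tomorrow"            (user message)
    #   "12/05/2024, 9:41 pm - Messages are end-to-end encrypted"  (system message)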
    # IGNORECASE tolerates "AM"/"PM" as well as the "am"/"pm" the JSON converter emits
    line_pattern = re.compile(r"(\d{1,2}/\d{1,2}/\d{4}), (\d{1,2}:\d{2}\s(?:am|pm)) - (.+?): (.+)", re.IGNORECASE)
    system_message_pattern = re.compile(r"(\d{1,2}/\d{1,2}/\d{4}), (\d{1,2}:\d{2}\s(?:am|pm)) - (.+)", re.IGNORECASE)
    for line in data:
        # Normalize the narrow no-break space newer WhatsApp exports put before am/pm
        line = line.replace('\u202f', ' ')
        if match := line_pattern.match(line):
chat_data.append([*match.groups()])
elif sys_match := system_message_pattern.match(line):
date, time, message = sys_match.groups()
chat_data.append([date, time, None, message])
elif chat_data:
chat_data[-1][3] += f" {line.strip()}"
return pd.DataFrame(chat_data, columns=['Date', 'Time', 'Sender', 'Message'])
def preprocess_chat(chat_df):
"""Cleans and preprocesses chat messages."""
chat_df = chat_df[chat_df['Sender'].notnull()].copy()
    # Build the stopword set once instead of rebuilding it for every message
    stop_words = set(stopwords.words('english'))
    def clean_message(message):
        tokens = word_tokenize(message.lower())
        tokens = [word for word in tokens if word not in stop_words and word not in string.punctuation]
return ' '.join(tokens)
chat_df['Cleaned_Message'] = chat_df['Message'].apply(clean_message)
return chat_df
def filter_by_criteria(chat_df, start_date=None, end_date=None, start_time=None, end_time=None, keywords=None):
"""Filters the chat DataFrame based on user-specified criteria."""
chat_df['Datetime'] = pd.to_datetime(chat_df['Date'] + ' ' + chat_df['Time'], format='%d/%m/%Y %I:%M %p')
if start_date:
chat_df = chat_df[chat_df['Datetime'] >= pd.to_datetime(start_date)]
    if end_date:
        # Make the end date inclusive: pd.to_datetime(end_date) is midnight of that
        # day and would otherwise drop every message sent later the same day
        chat_df = chat_df[chat_df['Datetime'] < pd.to_datetime(end_date) + pd.Timedelta(days=1)]
    if start_time or end_time:
        chat_df = chat_df[(start_time is None or chat_df['Datetime'].dt.time >= start_time) &
                          (end_time is None or chat_df['Datetime'].dt.time <= end_time)]
    if keywords:
        # An empty keyword box yields [''], which should not filter anything
        terms = [k for k in (k.strip() for k in keywords) if k]
        if terms:
            pattern = '|'.join(map(re.escape, terms))
            chat_df = chat_df[chat_df['Cleaned_Message'].str.contains(pattern, na=False, case=False)]
    return chat_df
def plot_topics_distribution(topics):
"""Plot a pie chart of topics distribution."""
topic_labels = [f"Topic {i+1}" for i, _ in enumerate(topics)]
    topic_words = [', '.join([word.split('*')[1].strip('"') for word in topic.split(' + ')]) for _, topic in topics]
    # Size each slice by the summed weight of the topic's top words; counting the
    # words (as before) always gives equal slices, since every topic prints the
    # same number of words
    topic_sizes = [sum(float(term.split('*')[0]) for term in topic.split(' + ')) for _, topic in topics]
# Create a DataFrame for the pie chart
topics_df = pd.DataFrame({
'Topic': topic_labels,
'Words': topic_words,
'Size': topic_sizes
})
# Plot pie chart
fig = px.pie(
topics_df,
names='Topic',
values='Size',
        title='Topics Distribution',
hover_data=['Words']
)
st.plotly_chart(fig)
def plot_message_trends(chat_df):
"""Plot a line chart of message trends over time."""
chat_df['Date_Only'] = chat_df['Datetime'].dt.date
message_counts = chat_df.groupby('Date_Only').size().reset_index(name='Message Count')
    # Plot line chart of daily message counts
st.line_chart(
message_counts.set_index('Date_Only'),
use_container_width=True,
height=400
)
def generate_pdf_report(keywords_chart_path, trends_chart_path, top_messages):
"""Generates a PDF report containing chat analysis results with keywords chart on first page and messages table on second page."""
os.makedirs("Report", exist_ok=True)
pdf_path = "Report/chat_analysis_report.pdf"
# Use a smaller page size to ensure content fits
doc = SimpleDocTemplate(
pdf_path,
pagesize=letter,
rightMargin=30,
leftMargin=30,
topMargin=30,
bottomMargin=30
)
styles = getSampleStyleSheet()
elements = []
# Add title
title_style = styles['Title']
title_style.spaceAfter = 30
elements.append(Paragraph("Chat Analysis Report", title_style))
# Add keywords chart on the first page
if keywords_chart_path:
elements.append(Paragraph("Top Keywords", styles['Heading2']))
img = Image(keywords_chart_path, width=400, height=300)
elements.append(img)
elements.append(Spacer(1, 20))
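    # Embed the trends chart when the file exists (the original accepted
    # trends_chart_path but never used it; embedding it here is an assumed intent)
    if trends_chart_path and os.path.exists(trends_chart_path):
        elements.append(Paragraph("Message Trends Over Time", styles['Heading2']))
        elements.append(Image(trends_chart_path, width=400, height=300))
        elements.append(Spacer(1, 20))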
# Force a page break
elements.append(PageBreak())
# Add messages table on the second page
elements.append(Paragraph("Message Summary", styles['Heading2']))
elements.append(Spacer(1, 20))
# Create a custom style for table cells
table_style = styles["BodyText"].clone('TableCell', fontSize=8, leading=10)
# Prepare table data with wrapped text
table_data = [["Sender", "Message"]]
for _, row in top_messages.iterrows():
# Limit message length and wrap text
sender = Paragraph(str(row["Sender"])[:50], table_style)
message = Paragraph(str(row["Message"])[:500], table_style) # Limit message length
table_data.append([sender, message])
# Create table with adjusted column widths
available_width = doc.width
col_widths = [available_width * 0.3, available_width * 0.7] # 30% for sender, 70% for message
table = Table(
table_data,
colWidths=col_widths,
repeatRows=1 # Repeat header row on each page
)
# Apply table styles
table.setStyle(TableStyle([
('BACKGROUND', (0, 0), (-1, 0), colors.grey),
('TEXTCOLOR', (0, 0), (-1, 0), colors.whitesmoke),
('ALIGN', (0, 0), (-1, -1), 'LEFT'),
('FONTNAME', (0, 0), (-1, 0), 'Helvetica-Bold'),
('FONTSIZE', (0, 0), (-1, 0), 10),
('BOTTOMPADDING', (0, 0), (-1, 0), 12),
('BACKGROUND', (0, 1), (-1, -1), colors.beige),
('TEXTCOLOR', (0, 1), (-1, -1), colors.black),
('FONTSIZE', (0, 1), (-1, -1), 8),
('GRID', (0, 0), (-1, -1), 1, colors.black),
        # Row heights auto-size when Table's rowHeights argument is omitted
        # ('ROWHEIGHT' is not a TableStyle command)
('VALIGN', (0, 0), (-1, -1), 'TOP'),
('LEFTPADDING', (0, 0), (-1, -1), 6),
('RIGHTPADDING', (0, 0), (-1, -1), 6),
]))
elements.append(table)
# Build the PDF
try:
doc.build(elements)
except Exception as e:
print(f"Error generating PDF: {str(e)}")
return None
return pdf_path
# Generate Summary with LDA Checkpointing
def generate_summary(chat_df, top_n_keywords=10, top_n_messages=5, num_topics=3):
"""
Generates a summary of the chat with LDA checkpointing.
"""
# Extract Keywords
vectorizer = TfidfVectorizer(max_features=top_n_keywords)
tfidf_matrix = vectorizer.fit_transform(chat_df['Cleaned_Message'])
keywords_dict = dict(zip(vectorizer.get_feature_names_out(), tfidf_matrix.sum(axis=0).tolist()[0]))
# Summarize Messages
chat_df['Message_Length'] = chat_df['Message'].str.len()
summary = chat_df.sort_values(by='Message_Length', ascending=False).head(top_n_messages)
# Extract Topics
tokenizer = re.compile(r'\w+')
texts = [tokenizer.findall(msg.lower()) for msg in chat_df['Cleaned_Message']]
dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]
    # Load or Train LDA Model
    try:
        lda_model, loaded_dictionary = load_lda_checkpoint()
        # Retrain when the vocabulary or the requested topic count differs from the checkpoint
        if loaded_dictionary.token2id != dictionary.token2id or lda_model.num_topics != num_topics:
            print("Checkpoint Mismatch; Retraining LDA Model.")
            lda_model = LdaModel(corpus=corpus, num_topics=num_topics, id2word=dictionary, passes=10)
            save_lda_checkpoint(lda_model, dictionary)
except FileNotFoundError:
lda_model = LdaModel(corpus=corpus, num_topics=num_topics, id2word=dictionary, passes=10)
save_lda_checkpoint(lda_model, dictionary)
topics = lda_model.print_topics(num_words=5)
return keywords_dict, summary, topics
# Streamlit App Structure
st.markdown(
"""
<style>
.center-title {
text-align: center;
font-size: 2.5rem;
font-weight: bold;
margin-top: 1rem;
margin-bottom: 1rem;
}
</style>
<div class="center-title">CHAT ANALYZER</div>
""",
unsafe_allow_html=True
)
display_network_urls()
st.sidebar.header("Upload Chat")
uploaded_file = st.sidebar.file_uploader("Upload A Chat File", type=["txt", "json"])
if uploaded_file:
# Process and Parse Chat
if uploaded_file.name.endswith(".json"):
processed_text = process_json_to_txt(uploaded_file.read())
if processed_text:
chat_df = parse_chat(processed_text.encode("utf-8"))
else:
st.stop()
else:
chat_df = parse_chat(uploaded_file.read())
st.write("### Raw Chat Data")
st.dataframe(chat_df, use_container_width=True)
with st.spinner("Processing Chat Data..."):
chat_df = preprocess_chat(chat_df)
st.success("Chat Data Successfully Preprocessed!")
# Date and Time Ranges
chat_df['Datetime'] = pd.to_datetime(chat_df['Date'] + " " + chat_df['Time'], format='%d/%m/%Y %I:%M %p')
earliest_date, latest_date = chat_df['Datetime'].dt.date.min(), chat_df['Datetime'].dt.date.max()
    current_date = datetime.now(pytz.timezone("Asia/Dubai")).date()
# Sidebar Filters and Parameters
st.sidebar.header("Filters")
start_date = st.sidebar.date_input("Start Date", value=earliest_date, min_value=earliest_date, max_value=current_date)
end_date = st.sidebar.date_input("End Date", value=current_date, min_value=earliest_date, max_value=current_date)
earliest_time = chat_df['Datetime'].dt.time.min()
latest_time = chat_df['Datetime'].dt.time.max()
start_time = st.sidebar.time_input("Start Time", value=earliest_time)
end_time = st.sidebar.time_input("End Time", value=latest_time)
keywords = st.sidebar.text_input("Keywords [Comma-Separated]").split(",")
st.sidebar.header("Parameters")
num_topics = st.sidebar.number_input("Number of Topics", min_value=1, max_value=10, value=3)
top_n_keywords = st.sidebar.number_input("Top Keywords", min_value=1, max_value=50, value=10)
top_n_messages = st.sidebar.number_input("Top Messages", min_value=1, max_value=50, value=5)
filtered_chat = filter_by_criteria(chat_df, start_date, end_date, start_time, end_time, keywords)
if st.sidebar.button("Apply Filters"):
keywords, summary, topics = generate_summary(filtered_chat, top_n_keywords, top_n_messages, num_topics)
st.write("#### Top Keywords")
st.bar_chart(pd.DataFrame.from_dict(keywords, orient='index', columns=['Score']))
st.write("#### Topics Distribution")
plot_topics_distribution(topics)
st.write("### Filtered Chat Data", filtered_chat)
if not filtered_chat.empty:
with st.spinner("Generating Summary..."):
st.write("#### Message Trends Over Time")
plot_message_trends(filtered_chat)
st.write("#### Top Messages")
st.table(summary[['Sender', 'Message']])
else:
st.warning("No Data Available For The Selected Filters.")
if st.sidebar.button("Generate Report"):
with st.spinner("Generating PDF Report..."):
os.makedirs("Report", exist_ok=True)
# Save Trends Chart
trends_chart_path = "Report/message_trends_plot.png"
# Ensure Keywords Chart is Available
if 'keywords_dict' not in locals():
keywords_dict, summary, topics = generate_summary(filtered_chat, top_n_keywords, top_n_messages, num_topics)
# Save Keywords Chart
keywords_chart_path = "Report/top_keywords.png"
sns.barplot(x=list(keywords_dict.values()), y=list(keywords_dict.keys()), palette="YlGnBu")
plt.title("Top Keywords")
plt.tight_layout()
plt.savefig(keywords_chart_path)
plt.close()
            # Generate PDF (generate_pdf_report returns None on failure)
            pdf_path = generate_pdf_report(keywords_chart_path, trends_chart_path, summary[['Sender', 'Message']])
            if pdf_path:
                st.success("PDF Report Successfully Generated!")
                # Provide Download Option
                with open(pdf_path, "rb") as pdf_file:
                    st.download_button("Download Report", pdf_file, file_name="Chat Report.pdf", mime="application/pdf")
            else:
                st.error("PDF Report Generation Failed. Check The Terminal For Details.")
# Add terminal visibility control with persistence
show_terminal = st.sidebar.checkbox("Show Terminal", value=load_terminal_setting())
if show_terminal != load_terminal_setting():
save_terminal_setting(show_terminal)
if getattr(sys, 'frozen', False):
st.rerun()