# cli.py - generated from tulane-cmps6730/sample-project
# -*- coding: utf-8 -*-
import glob
import os
import pickle
import sys

import click
import numpy as np
import pandas as pd
import spacy
import tiktoken
from dotenv import load_dotenv
from langchain.text_splitter import RecursiveCharacterTextSplitter
from openai import OpenAI
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report
from sklearn.metrics.pairwise import cosine_similarity


@click.group()
def main(args=None):
    """Console script for nlp."""
    return 0


@main.command('web')
@click.option('-p', '--port', required=False, default=5000, show_default=True, help='port of web server')
def web(port):
    """
    Launch the flask web app.
    """
    from .app import app
    app.run(host='0.0.0.0', debug=True, port=port)
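
# Illustrative invocation (assuming the package's console script is installed
# under the name `nlp`, as the group docstring suggests; adjust if the
# entry-point name differs):
#   nlp web --port 5000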


# Setup functions -----------------------------------------------------------------------------------------------

# Module-level configuration: tokenizer, environment variables, and OpenAI client.
encoding = tiktoken.encoding_for_model("gpt-3.5-turbo")
load_dotenv()
openai_api_key = os.getenv('OPENAI_API_KEY')
client = OpenAI(api_key=openai_api_key)
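
# The client above expects an OPENAI_API_KEY entry in the environment or in a
# .env file picked up by load_dotenv(), e.g. (illustrative placeholder only):
#   OPENAI_API_KEY=<your key>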


def preprocess_text(text):
    """Lemmatize a document, keeping likely course/program titles as single tokens."""
    nlp = spacy.load("en_core_web_sm")
    doc = nlp(text)  # Parse the document using spaCy
    clean_tokens = []
    # First pass: handle named entities.
    for ent in doc.ents:
        # Heuristic rule: an ORG/PRODUCT entity mentioning "course" or "program"
        # is likely a class or program title.
        if ent.label_ in ['ORG', 'PRODUCT'] and ('course' in ent.text.lower() or 'program' in ent.text.lower()):
            # Treat the whole entity as a single token.
            clean_tokens.append(ent.text)
        else:
            # Process the entity's tokens normally.
            tokens = [token.lemma_ for token in ent if not token.is_punct and not token.is_stop]
            clean_tokens.extend(tokens)
    # Second pass: lemmatize the tokens that fall outside any entity span.
    last_ent_end = 0
    for ent in doc.ents:
        clean_tokens.extend([token.lemma_ for token in doc[last_ent_end:ent.start] if not token.is_punct and not token.is_stop])
        last_ent_end = ent.end
    clean_tokens.extend([token.lemma_ for token in doc[last_ent_end:] if not token.is_punct and not token.is_stop])
    return ' '.join(clean_tokens)
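
# Illustrative example (hypothetical input): preprocess_text("The Neuroscience
# program requires PSYC 1000.") lemmatizes ordinary tokens and, if spaCy tags a
# span such as "the Neuroscience program" as ORG/PRODUCT, keeps it as one token.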


def chunk(chunk_size, documents):
    """Split each document into overlapping character chunks."""
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=100,
        length_function=len,
        is_separator_regex=False
    )
    all_docs = []
    for document in documents:
        chunks = text_splitter.create_documents([document])
        all_docs.extend(piece.page_content for piece in chunks)
    return all_docs
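
# Illustrative usage (the chunk size is a tuning choice, not fixed by this module):
#   passages = chunk(1000, raw_documents)  # ~1000-character pieces with 100-character overlap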


def load_and_preprocess_documents(directory):
    documents = []
    for filename in os.listdir(directory):
        filepath = os.path.join(directory, filename)
        if os.path.isfile(filepath):
            with open(filepath, 'r', encoding='utf-8') as file:
                text = file.read()
                processed_text = preprocess_text(text)
                documents.append(processed_text)
    return documents


def load_documents(filepath):
    with open(filepath, 'rb') as file:
        return pickle.load(file)
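
# The pickle is expected to hold a list of preprocessed document strings, as
# produced by load_and_preprocess_documents() above.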


def retrieve(query, vectorizer, tfidf_matrix, data, max_tokens=16000):
    if not data:
        return []
    try:
        query_tf = vectorizer.transform([query])
        similarities = cosine_similarity(query_tf, tfidf_matrix).flatten()
        query_keywords = set(query.lower().split())
        matches = []
        for i, document in enumerate(data):
            # Boost documents whose title (text before the first comma) shares keywords with the query.
            title = document.split(',')[0].lower()
            title_keywords = set(title.split())
            common_keywords = query_keywords.intersection(title_keywords)
            keyword_count = len(common_keywords)
            combined_score = similarities[i] + (keyword_count * 0.5)  # Adjust the weight as needed
            # Tokenize the document to count its tokens.
            doc_token_count = len(encoding.encode(document))
            matches.append((document, combined_score, doc_token_count))
        matches.sort(key=lambda x: x[1], reverse=True)
        selected_documents = []
        current_token_count = 0
        iterator = 0
        for doc, combined_score, tokens in matches:
            if current_token_count + tokens > max_tokens:
                print("Tokens stopped at:", current_token_count)
                print(f'Relevant documents found: {iterator}')
                break  # Stop adding once the next document would exceed the token limit.
            iterator += 1
            selected_documents.append((doc, combined_score))
            current_token_count += tokens
        return selected_documents
    except Exception as e:
        print(f"An error occurred: {e}")
        return []
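
# Ranking combines TF-IDF cosine similarity with a simple keyword bonus
# (0.5 per query word also found in the document's title); the 0.5 weight is a
# heuristic carried over from the code above, not a tuned value.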


def answer_question(question, documents, vectorizer, tfidf_matrix, model, max_tokens=300, stop_sequence=None):
    retrieved_texts = retrieve(question, vectorizer, tfidf_matrix, documents)
    context = " ".join([text for text, _ in retrieved_texts])
    if context:
        try:
            response = client.chat.completions.create(
                model=model,
                messages=[
                    {"role": "system", "content": "You are an academic advisor. Answer the question based on the context below"},
                    {"role": "user", "content": f"Context: {context}\n\nQuestion: {question}"}
                ],
                temperature=0,
                max_tokens=max_tokens,
                stop=stop_sequence,
            )
            # Get the response content
            response_content = response.choices[0].message.content.strip()
            html_response = '<p>' + '</p><p>'.join(response_content.split('\n')) + '</p>'
            return html_response
        except Exception as e:
            return str(e)
    else:
        return "No relevant context found for the question."


# -------------------------------------------------------------------------------------------


@main.command('hello')
def hello():
    print("hello")


@main.command('chat')
def chat():
    """Interactive chat using the document retrieval system."""
    print("Chat Active")
    script_dir = os.path.dirname(__file__)
    # Construct the path to the "pre_processed" folder.
    pre_processed_folder = os.path.join(script_dir, 'app', 'pre_processed')
    # Set the path for the processed documents file.
    processed_documents_path = os.path.join(pre_processed_folder, 'processed_documents.pkl')
    # Load the processed documents.
    documents = load_documents(processed_documents_path)
    print("Data Loaded")
    vectorizer = TfidfVectorizer(max_features=10000, min_df=2, stop_words="english")
    tfidf_matrix = vectorizer.fit_transform(documents)
    while True:
        try:
            message = input("Ask any questions about majors and minors at Tulane:\n")
            if message.lower() == 'exit':
                print("Exiting chat...")
                break
            answer = answer_question(message, documents, vectorizer, tfidf_matrix, "gpt-3.5-turbo")
            print("\nAnswer:", answer)
        except Exception as e:
            print(f"An error occurred: {e}")
            print("Sorry, I didn't understand that. Please try again.")


if __name__ == "__main__":
    sys.exit(main())