Skip to content

Commit

Permalink
huge restructure and cleanup
Browse files Browse the repository at this point in the history
  • Loading branch information
NielsHouben committed Dec 18, 2023
1 parent 1e2d957 commit 42d7bef
Show file tree
Hide file tree
Showing 24 changed files with 587 additions and 191 deletions.
5 changes: 4 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
@@ -1 +1,4 @@
__pycache__/
__pycache__/

src/credentials.json
src/token.json
Binary file added chroma_storage/chroma.sqlite3
Binary file not shown.
7 changes: 6 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -9,5 +9,10 @@ pypandoc_binary
nltk
langchain
flask
sentence-transformers
flask-cors
sentence-transformers
google-auth
google-auth-oauthlib
google-api-python-client
tqdm
markdown
Binary file modified src/chroma_storage/chroma.sqlite3
Binary file not shown.
Binary file not shown.
Binary file not shown.
4 changes: 3 additions & 1 deletion src/documents/newDoc.txt
Original file line number Diff line number Diff line change
@@ -1,2 +1,4 @@
Niels Houben älskar äppelpaj, äpplen, och äppeljuice.
Niels vill alltid äta äpplen!
Niels vill alltid äta äpplen!

Han gillar även ostkaka
64 changes: 2 additions & 62 deletions src/main.py
Original file line number Diff line number Diff line change
@@ -1,63 +1,3 @@
""" Main module to start the application """
import sys
import json
from prompt_request import prompt_request
# from model_wrapper import prompt_model
# from query_wrapper import preform_query
from test_prompts.test_prompts import test_prompts, standard_prompts

from rich.console import Console
from rich.markdown import Markdown
console = Console()



def run_standard_prompts():
    """Run every prompt in `standard_prompts` and dump the answers to a JSON file.

    NOTE(review): `preform_query` and `prompt_model` are imported commented-out
    at the top of this file, so calling this raises NameError as-is — restore
    those imports before use.
    """
    output = []
    for prompt in standard_prompts:
        result = preform_query(prompt)
        answer = prompt_model(str(result), prompt)
        output.append({
            "usr_prompt": prompt,
            "answer": answer
        })
    # Explicit utf-8 + ensure_ascii=False keep Swedish characters readable on disk.
    with open("test_results/test_output.json", "w+", encoding="utf-8") as f:
        json.dump(output, f, indent=4, ensure_ascii=False)


def question_loop():
    """Read questions from stdin until 'e' is entered; render each answer as Markdown."""
    while True:
        usr_prompt = input("Ställ din fråga: ")
        if usr_prompt == "e":
            break
        # The model server replies with a JSON body carrying the text under 'answer'.
        answer = prompt_request(usr_prompt).json()['answer']
        console.print(Markdown(answer))


def main() -> None:
    """Entry point: run the interactive question loop."""
    question_loop()



#could create user interface in json file that can be edited...

if __name__ == '__main__':
    # Propagate main()'s return value as the process exit status.
    raise SystemExit(main())
""" Runs servers for model, mail, file-watcher """

# TODO, FIX THE THING
11 changes: 11 additions & 0 deletions src/model_server/test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
import requests
import json

# Smoke test for the model server's /prompt endpoint.
# Requires server_model to be running locally on port 5000.
url = 'http://127.0.0.1:5000/prompt'
data = {'prompt': 'Hej, vad är budgeten?'}

# `json=` makes requests serialise the payload and set the
# Content-Type: application/json header itself; timeout avoids hanging forever.
response = requests.post(url, json=data, timeout=30)

print(response.status_code)
print(response.json())
63 changes: 63 additions & 0 deletions src/prompt_loop.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
""" Main module to start the application """
import sys
import json
from utils.api.prompt_request import prompt_request
# from model_wrapper import prompt_model
# from query import preform_query
from test_prompts.test_prompts import test_prompts, standard_prompts

from rich.console import Console
from rich.markdown import Markdown
console = Console()



# def run_standard_prompts():
# output = []
# for prompt in standard_prompts:
# query_string = prompt
# result = preform_query(query_string)
# answer = prompt_model(str(result), prompt)
# output.append({
# "usr_prompt": prompt,
# "answer": answer
# })
# with open("test_results/test_output.json", "w+") as f:
# json.dump(output, f, indent=4, ensure_ascii=False)


def question_loop():
    """Read questions from stdin until 'e' is entered; render each answer as Markdown."""
    while True:
        usr_prompt = input("Ställ din fråga: ")
        if usr_prompt == "e":
            break
        # prompt_request already decodes the server's JSON body into a dict.
        answer = prompt_request(usr_prompt)['answer']
        console.print(Markdown(answer))


def main() -> None:
    """Entry point: run the interactive question loop."""
    question_loop()



#could create user interface in json file that can be edited...

if __name__ == '__main__':
    # Propagate main()'s return value as the process exit status.
    raise SystemExit(main())

27 changes: 27 additions & 0 deletions src/server_mail.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
from time import sleep

from utils.mail.mail_utils import get_service, respond_to_mails

# If modifying these SCOPES, delete the file token.json.
SCOPES = ["https://www.googleapis.com/auth/gmail.modify"]
# NOTE(review): misspelling of "ADDRESS" — kept as-is to avoid breaking references.
SENDER_ADRESS = "[email protected]"
# "me" is the Gmail API alias for the authenticated user.
USER_ID = "me"


def main():
    """Poll the Gmail inbox every 5 seconds for 30 minutes or until KeyboardInterrupt.

    TODO: replace polling with Google Pub/Sub push notifications when new mail arrives.
    """
    service = get_service(SCOPES)

    print("Started mailbox watcher")
    # 360 iterations x 5 s sleep ≈ 30 minutes of polling.
    for _ in range(360):
        print("Checking...")
        respond_to_mails(service, SENDER_ADRESS, USER_ID)
        sleep(5)


if __name__ == "__main__":  # run only when executed as a script, not on import
    main()
2 changes: 1 addition & 1 deletion src/server.py → src/server_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
from flask_cors import CORS

from model_wrapper import prompt_model
from query_wrapper import preform_query
from utils.query.query import preform_query
app = Flask(__name__)
CORS(app)

Expand Down
28 changes: 28 additions & 0 deletions src/server_watcher.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
from utils.watcher.update_db import check_db, cleanse_documents, init_db
from utils.watcher.watcher_utils import start_watcher

# Menu shown on every loop iteration of the watcher CLI.
choicetext = """
start - Start the watcher
reset - Remove all documents from DB (not directory) and then re-add them
check - Print all documents that are currently in the DB
quit - Quit the CLI
> """

if __name__ == "__main__":
    DIRECTORY = "./documents/"  # Change to path of network folder
    while True:
        # Normalise once instead of calling .lower() per comparison.
        choice = input(choicetext).lower()

        if choice == "start":
            print("Press ctrl+c to stop watcher")
            start_watcher(DIRECTORY)
        elif choice == "reset":
            cleanse_documents()
            init_db(DIRECTORY)
            print("Reset done")
        elif choice == "check":
            check_db()
        elif choice == "quit":
            print("closing CLI")
            break
        else:
            # Previously unknown commands were silently ignored.
            print("Unknown command")
Empty file removed src/test.ipynb
Empty file.
2 changes: 1 addition & 1 deletion src/query_wrapper.py → src/test_query.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from query_utils.query import preform_query
from utils.query.query import preform_query


if __name__ == "__main__":
Expand Down
57 changes: 57 additions & 0 deletions src/test_similarity_checks.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
import chromadb
# client = chromadb.PersistentClient(path=persist_directory)
import torch
from langchain.document_loaders import TextLoader
from langchain.vectorstores import Chroma
from transformers import AutoTokenizer, AutoModel

def preprocess_text(text, chunk_size=2000):
    """Split *text* into consecutive substrings of at most *chunk_size* characters."""
    pieces = []
    start = 0
    while start < len(text):
        pieces.append(text[start:start + chunk_size])
        start += chunk_size
    return pieces

def perform_query(query: str):
    """Embed *query* with a Hugging Face model and fetch similar documents from Chroma.

    Returns a list of {"filepath": ..., "text": ...} dicts, up to 4 per query chunk.
    """
    collection_name = "documents_collection"
    persist_directory = "chroma_storage"

    client = chromadb.PersistentClient(path=persist_directory)
    collection = client.get_collection(name=collection_name)

    print("Querying...\n")

    # NOTE(review): loading the tokenizer/model on every call is slow — hoist to
    # module level if this function is called repeatedly.
    model_name = "bert-base-uncased"  # Replace with your desired Hugging Face model
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModel.from_pretrained(model_name)

    results = []
    for chunk in preprocess_text(query):
        tokens = tokenizer(chunk, return_tensors="pt")
        with torch.no_grad():
            # Mean-pool the last hidden state into one vector per chunk.
            embedding = model(**tokens).last_hidden_state.mean(dim=1).squeeze().tolist()

        # Bug fix: the vector belongs in `query_embeddings`; `query_texts` expects
        # raw text that Chroma embeds itself, so passing stringified floats there
        # produced meaningless similarity results.
        # NOTE(review): the collection must have been built with embeddings of the
        # same dimensionality as this model (BERT-base = 768) — verify against the
        # ingestion pipeline.
        result_chunk = collection.query(
            query_embeddings=[embedding], n_results=4, include=["documents", "metadatas"]
        )

        filepaths = [meta["filepath"] for meta in result_chunk["metadatas"][0]]
        documents = result_chunk["documents"][0]
        results.extend(
            {"filepath": fp, "text": doc} for fp, doc in zip(filepaths, documents)
        )

    return results

# Example usage: run this module directly to issue a sample query.
# Guarded so importing the module no longer triggers a model download/query.
if __name__ == "__main__":
    query = "Your query text goes here."
    results = perform_query(query)
    print(results)
6 changes: 4 additions & 2 deletions src/prompt_request.py → src/utils/api/prompt_request.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,12 @@
import json  # retained for compatibility; no longer needed after switching to requests' json=
import requests


def prompt_request(question: str) -> dict:
    """POST *question* to the local model server and return the decoded JSON reply.

    Args:
        question: The user prompt to send.

    Returns:
        The response body parsed as a dict (server_model returns an 'answer' key).
    """
    url = 'http://127.0.0.1:5000/prompt'
    # `json=` makes requests serialise the payload and set the
    # Content-Type: application/json header itself. The timeout (previously a
    # commented-out TODO) prevents the caller from hanging forever if the
    # model server is down.
    response = requests.post(url, json={'prompt': question}, timeout=30)
    return response.json()

Loading

0 comments on commit 42d7bef

Please sign in to comment.