Skip to content

Commit

Permalink
huge restructure and cleanup
Browse files Browse the repository at this point in the history
  • Loading branch information
NielsHouben committed Dec 18, 2023
1 parent 1e2d957 commit 42d7bef
Show file tree
Hide file tree
Showing 24 changed files with 587 additions and 191 deletions.
5 changes: 4 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
@@ -1 +1,4 @@
__pycache__/
__pycache__/

src/credentials.json
src/token.json
Binary file added chroma_storage/chroma.sqlite3
Binary file not shown.
7 changes: 6 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -9,5 +9,10 @@ pypandoc_binary
nltk
langchain
flask
sentence-transformers
flask-cors
sentence-transformers
google-auth
google-auth-oauthlib
google-api-python-client
tqdm
markdown
Binary file modified src/chroma_storage/chroma.sqlite3
Binary file not shown.
Binary file not shown.
Binary file not shown.
4 changes: 3 additions & 1 deletion src/documents/newDoc.txt
Original file line number Diff line number Diff line change
@@ -1,2 +1,4 @@
Niels Houben älskar äppelpaj, äpplen, och äppeljuice.
Niels vill alltid äta äpplen!
Niels vill alltid äta äpplen!

Han gillar även ostkaka
64 changes: 2 additions & 62 deletions src/main.py
Original file line number Diff line number Diff line change
@@ -1,63 +1,3 @@
""" Main module to start the application """
import sys
import json
from prompt_request import prompt_request
# from model_wrapper import prompt_model
# from query_wrapper import preform_query
from test_prompts.test_prompts import test_prompts, standard_prompts

from rich.console import Console
from rich.markdown import Markdown
console = Console()



def run_standard_prompts():
    """Run every prompt in `standard_prompts` and dump the answers to a JSON file.

    NOTE(review): `preform_query` and `prompt_model` are imported commented-out
    at the top of this file, so calling this raises NameError as-is — restore
    those imports before use.
    """
    output = []
    for prompt in standard_prompts:
        result = preform_query(prompt)
        answer = prompt_model(str(result), prompt)
        output.append({
            "usr_prompt": prompt,
            "answer": answer
        })
    # Explicit utf-8 + ensure_ascii=False keep Swedish characters readable on disk.
    with open("test_results/test_output.json", "w+", encoding="utf-8") as f:
        json.dump(output, f, indent=4, ensure_ascii=False)


def question_loop():
    """Read questions from stdin until 'e' is entered; render each answer as Markdown."""
    while True:
        usr_prompt = input("Ställ din fråga: ")
        if usr_prompt == "e":
            break
        # The model server replies with a JSON body carrying the text under 'answer'.
        answer = prompt_request(usr_prompt).json()['answer']
        console.print(Markdown(answer))


def main() -> None:
    """Entry point: run the interactive question loop."""
    question_loop()



#could create user interface in json file that can be edited...

if __name__ == '__main__':
    # Propagate main()'s return value as the process exit status.
    raise SystemExit(main())
""" Runs servers for model, mail, file-watcher """

# TODO, FIX THE THING
11 changes: 11 additions & 0 deletions src/model_server/test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
import requests
import json

# Smoke test for the model server's /prompt endpoint.
# Requires server_model to be running locally on port 5000.
url = 'http://127.0.0.1:5000/prompt'
data = {'prompt': 'Hej, vad är budgeten?'}

# `json=` makes requests serialise the payload and set the
# Content-Type: application/json header itself; timeout avoids hanging forever.
response = requests.post(url, json=data, timeout=30)

print(response.status_code)
print(response.json())
63 changes: 63 additions & 0 deletions src/prompt_loop.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
""" Main module to start the application """
import sys
import json
from utils.api.prompt_request import prompt_request
# from model_wrapper import prompt_model
# from query import preform_query
from test_prompts.test_prompts import test_prompts, standard_prompts

from rich.console import Console
from rich.markdown import Markdown
console = Console()



# def run_standard_prompts():
# output = []
# for prompt in standard_prompts:
# query_string = prompt
# result = preform_query(query_string)
# answer = prompt_model(str(result), prompt)
# output.append({
# "usr_prompt": prompt,
# "answer": answer
# })
# with open("test_results/test_output.json", "w+") as f:
# json.dump(output, f, indent=4, ensure_ascii=False)


def question_loop():
    """Read questions from stdin until 'e' is entered; render each answer as Markdown."""
    while True:
        usr_prompt = input("Ställ din fråga: ")
        if usr_prompt == "e":
            break
        # prompt_request already decodes the server's JSON body into a dict.
        answer = prompt_request(usr_prompt)['answer']
        console.print(Markdown(answer))


def main() -> None:
    """Entry point: run the interactive question loop."""
    question_loop()



#could create user interface in json file that can be edited...

if __name__ == '__main__':
    # Propagate main()'s return value as the process exit status.
    raise SystemExit(main())

27 changes: 27 additions & 0 deletions src/server_mail.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
from time import sleep

from utils.mail.mail_utils import get_service, respond_to_mails

# If modifying these SCOPES, delete the file token.json.
SCOPES = ["https://www.googleapis.com/auth/gmail.modify"]
# NOTE(review): misspelling of "ADDRESS" — kept as-is to avoid breaking references.
SENDER_ADRESS = "[email protected]"
# "me" is the Gmail API alias for the authenticated user.
USER_ID = "me"


def main():
    """Poll the Gmail inbox every 5 seconds for 30 minutes or until KeyboardInterrupt.

    TODO: replace polling with Google Pub/Sub push notifications when new mail arrives.
    """
    service = get_service(SCOPES)

    print("Started mailbox watcher")
    # 360 iterations x 5 s sleep ≈ 30 minutes of polling.
    for _ in range(360):
        print("Checking...")
        respond_to_mails(service, SENDER_ADRESS, USER_ID)
        sleep(5)


if __name__ == "__main__":  # run only when executed as a script, not on import
    main()
2 changes: 1 addition & 1 deletion src/server.py → src/server_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
from flask_cors import CORS

from model_wrapper import prompt_model
from query_wrapper import preform_query
from utils.query.query import preform_query
app = Flask(__name__)
CORS(app)

Expand Down
28 changes: 28 additions & 0 deletions src/server_watcher.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
from utils.watcher.update_db import check_db, cleanse_documents, init_db
from utils.watcher.watcher_utils import start_watcher

# Menu shown on every loop iteration of the watcher CLI.
choicetext = """
start - Start the watcher
reset - Remove all documents from DB (not directory) and then re-add them
check - Print all documents that are currently in the DB
quit - Quit the CLI
> """

if __name__ == "__main__":
    DIRECTORY = "./documents/"  # Change to path of network folder
    while True:
        # Normalise once instead of calling .lower() per comparison.
        choice = input(choicetext).lower()

        if choice == "start":
            print("Press ctrl+c to stop watcher")
            start_watcher(DIRECTORY)
        elif choice == "reset":
            cleanse_documents()
            init_db(DIRECTORY)
            print("Reset done")
        elif choice == "check":
            check_db()
        elif choice == "quit":
            print("closing CLI")
            break
        else:
            # Previously unknown commands were silently ignored.
            print("Unknown command")
Empty file removed src/test.ipynb
Empty file.
2 changes: 1 addition & 1 deletion src/query_wrapper.py → src/test_query.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from query_utils.query import preform_query
from utils.query.query import preform_query


if __name__ == "__main__":
Expand Down
57 changes: 57 additions & 0 deletions src/test_similarity_checks.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
import chromadb
# client = chromadb.PersistentClient(path=persist_directory)
import torch
from langchain.document_loaders import TextLoader
from langchain.vectorstores import Chroma
from transformers import AutoTokenizer, AutoModel

def preprocess_text(text, chunk_size=2000):
    """Split *text* into consecutive substrings of at most *chunk_size* characters."""
    pieces = []
    start = 0
    while start < len(text):
        pieces.append(text[start:start + chunk_size])
        start += chunk_size
    return pieces

def perform_query(query: str):
    """Embed *query* with a Hugging Face model and fetch similar documents from Chroma.

    Returns a list of {"filepath": ..., "text": ...} dicts, up to 4 per query chunk.
    """
    collection_name = "documents_collection"
    persist_directory = "chroma_storage"

    client = chromadb.PersistentClient(path=persist_directory)
    collection = client.get_collection(name=collection_name)

    print("Querying...\n")

    # NOTE(review): loading the tokenizer/model on every call is slow — hoist to
    # module level if this function is called repeatedly.
    model_name = "bert-base-uncased"  # Replace with your desired Hugging Face model
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModel.from_pretrained(model_name)

    results = []
    for chunk in preprocess_text(query):
        tokens = tokenizer(chunk, return_tensors="pt")
        with torch.no_grad():
            # Mean-pool the last hidden state into one vector per chunk.
            embedding = model(**tokens).last_hidden_state.mean(dim=1).squeeze().tolist()

        # Bug fix: the vector belongs in `query_embeddings`; `query_texts` expects
        # raw text that Chroma embeds itself, so passing stringified floats there
        # produced meaningless similarity results.
        # NOTE(review): the collection must have been built with embeddings of the
        # same dimensionality as this model (BERT-base = 768) — verify against the
        # ingestion pipeline.
        result_chunk = collection.query(
            query_embeddings=[embedding], n_results=4, include=["documents", "metadatas"]
        )

        filepaths = [meta["filepath"] for meta in result_chunk["metadatas"][0]]
        documents = result_chunk["documents"][0]
        results.extend(
            {"filepath": fp, "text": doc} for fp, doc in zip(filepaths, documents)
        )

    return results

# Example usage: run this module directly to issue a sample query.
# Guarded so importing the module no longer triggers a model download/query.
if __name__ == "__main__":
    query = "Your query text goes here."
    results = perform_query(query)
    print(results)
6 changes: 4 additions & 2 deletions src/prompt_request.py → src/utils/api/prompt_request.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,12 @@
import json  # retained for compatibility; no longer needed after switching to requests' json=
import requests


def prompt_request(question: str) -> dict:
    """POST *question* to the local model server and return the decoded JSON reply.

    Args:
        question: The user prompt to send.

    Returns:
        The response body parsed as a dict (server_model returns an 'answer' key).
    """
    url = 'http://127.0.0.1:5000/prompt'
    # `json=` makes requests serialise the payload and set the
    # Content-Type: application/json header itself. The timeout (previously a
    # commented-out TODO) prevents the caller from hanging forever if the
    # model server is down.
    response = requests.post(url, json={'prompt': question}, timeout=30)
    return response.json()

Loading

0 comments on commit 42d7bef

Please sign in to comment.