Commit 6fd95b4
updated model for embeddings
ayyubibrahimi committed Nov 8, 2023
1 parent 991dffc commit 6fd95b4
Showing 6 changed files with 33 additions and 33 deletions.
2 changes: 1 addition & 1 deletion packages/backend/src/cache/faiss_index_general.dvc
@@ -1,5 +1,5 @@
 outs:
-- md5: 71ba40a724f16e67f72d45805b5782c1.dir
+- md5: 1da4b1d88045a2adbc87e5d11c0a6af8.dir
   size: 85685475
   nfiles: 2
   hash: md5
2 changes: 1 addition & 1 deletion packages/backend/src/cache/faiss_index_in_depth.dvc
@@ -1,5 +1,5 @@
 outs:
-- md5: 9ebc1797364f0107c0e944eb9f1c5dbc.dir
+- md5: 038f50d4fd4c158b138d626f0fe19c2e.dir
   size: 85685475
   nfiles: 2
   hash: md5
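
Note: these .dvc files are DVC pointer files. Each records only the md5 directory checksum, total size, and file count of the FAISS index directory it tracks; the index data itself lives in the DVC cache/remote, not in git. Updating the checksum here is what signals that the indexes were regenerated with the new embeddings.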
24 changes: 11 additions & 13 deletions packages/backend/src/preprocessor.py
@@ -1,7 +1,6 @@
 import logging
 import os
 from langchain.document_loaders import (
-    Docx2txtLoader,
     JSONLoader,
 )
 from langchain.text_splitter import RecursiveCharacterTextSplitter
@@ -13,13 +12,12 @@
 from pathlib import Path
 import shutil


 logger = logging.getLogger(__name__)
 dir = Path(__file__).parent.absolute()


 def create_embeddings():
-    # llm = ChatOpenAI()
     llm = ChatOpenAI(model="gpt-4-1106-preview")

     base_embeddings = OpenAIEmbeddings()

@@ -38,18 +36,18 @@ def create_embeddings():
         input_variables=["user_query"], template=in_depth_prompt_template
     )

-    # llm_chain_general = LLMChain(llm=llm, prompt=general_prompt)
-    # llm_chain_in_depth = LLMChain(llm=llm, prompt=in_depth_prompt)
+    llm_chain_general = LLMChain(llm=llm, prompt=general_prompt)
+    llm_chain_in_depth = LLMChain(llm=llm, prompt=in_depth_prompt)

-    # general_embeddings = HypotheticalDocumentEmbedder(
-    #     llm_chain=llm_chain_general,
-    #     base_embeddings=base_embeddings,
-    # )
-    # in_depth_embeddings = HypotheticalDocumentEmbedder(
-    #     llm_chain=llm_chain_in_depth, base_embeddings=base_embeddings
-    # )
+    general_embeddings = HypotheticalDocumentEmbedder(
+        llm_chain=llm_chain_general,
+        base_embeddings=base_embeddings,
+    )
+    in_depth_embeddings = HypotheticalDocumentEmbedder(
+        llm_chain=llm_chain_in_depth, base_embeddings=base_embeddings
+    )

-    return base_embeddings, base_embeddings
+    return general_embeddings, in_depth_embeddings


 def metadata_func_minutes_and_agendas(record: dict, metadata: dict) -> dict:
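
For context on what this re-enables: HypotheticalDocumentEmbedder implements HyDE, where the LLM first drafts a hypothetical answer to the query and the base embedding model then embeds that draft, so the query vector lands nearer to real transcript passages than the raw query would. A minimal usage sketch, assuming the LangChain APIs imported in this file; the index path and query string below are illustrative, not taken from this commit:

from langchain.vectorstores import FAISS

# Build the HyDE embedders exactly as the updated function returns them.
general_embeddings, in_depth_embeddings = create_embeddings()

# embed_query() runs the LLM chain to draft a hypothetical answer, then
# embeds that draft with the base OpenAIEmbeddings.
query = "What did the council discuss about short-term rentals?"  # illustrative
vector = general_embeddings.embed_query(query)

# Search a prebuilt FAISS index with the HyDE vector (path is an assumption).
db = FAISS.load_local("src/cache/faiss_index_general", general_embeddings)
docs = db.similarity_search_by_vector(vector, k=5)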
(fourth changed file; path not shown in this view)
@@ -1,5 +1,5 @@
 outs:
-- md5: 71ba40a724f16e67f72d45805b5782c1.dir
+- md5: 1da4b1d88045a2adbc87e5d11c0a6af8.dir
   size: 85685475
   nfiles: 2
   hash: md5
(fifth changed file; path not shown in this view)
@@ -1,5 +1,5 @@
 outs:
-- md5: 9ebc1797364f0107c0e944eb9f1c5dbc.dir
+- md5: 038f50d4fd4c158b138d626f0fe19c2e.dir
   size: 85685475
   nfiles: 2
   hash: md5
34 changes: 18 additions & 16 deletions packages/googlecloud/functions/getanswer/helper.py
@@ -37,34 +37,36 @@ def get_dbs():


 def create_embeddings():
+    llm = ChatOpenAI(model="gpt-4-1106-preview")
+
+    base_embeddings = OpenAIEmbeddings()
+
     general_prompt_template = """
-    As an AI assistant tasked with generating brief general summaries, your role is to provide succinct, balanced information from the transcripts of New Orleans City Council meetings in response to the question "{question}". The response should not exceed one paragraph in length. If the available information from the transcripts is insufficient to accurately summarize the issue, please respond with 'Insufficient information available.' If the question extends beyond the scope of information contained in the transcripts, state 'I don't know.'
+    As an AI assistant, your role is to provide concise, balanced summaries from the transcripts of New Orleans City Council meetings in response to the user's query "{user_query}". Your response should not exceed one paragraph in length. If the available information from the transcripts is insufficient to accurately summarize the issue, respond with 'Insufficient information available.' If the user's query extends beyond the scope of information contained in the transcripts, state 'I don't know.'
     Answer:"""

     in_depth_prompt_template = """
-    As an AI assistant tasked with providing in-depth dialogical summaries, your role is to provide comprehensive information from the transcripts of New Orleans City Council meetings. Your response should mimic the structure of a real conversation, often involving more than two exchanges between the parties. The dialogue should recreate the actual exchanges that occurred between city council members and external stakeholders in response to the question "{question}". For specific queries related to any votes that took place, your response should include detailed information. This should cover the ordinance number, who moved and seconded the motion, how each council member voted, and the final outcome of the vote. For each statement, response, and voting action, provide a summary, followed by a direct quote from the meeting transcript to ensure the context and substance of the discussion is preserved. If a question is about the voting results on a particular initiative, include in your response how each council member voted, if they were present, and if there were any abstentions or recusals. Always refer back to the original transcript to ensure accuracy. If the available information from the transcripts is insufficient to accurately answer the question or recreate the dialogue, please respond with 'Insufficient information available.' If the question extends beyond the scope of information contained in the transcripts, state 'I don't know.'
+    As an AI assistant, use the New Orleans City Council transcript data that you were trained on to provide an in-depth and balanced response to the following query: "{user_query}"
     Answer:"""

     general_prompt = PromptTemplate(
-        input_variables=["question"], template=general_prompt_template
+        input_variables=["user_query"], template=general_prompt_template
     )
     in_depth_prompt = PromptTemplate(
-        input_variables=["question"], template=in_depth_prompt_template
+        input_variables=["user_query"], template=in_depth_prompt_template
     )

-    # llm_chain_general = LLMChain(llm=llm, prompt=general_prompt)
-    # llm_chain_in_depth = LLMChain(llm=llm, prompt=in_depth_prompt)
-
-    base_embeddings = OpenAIEmbeddings()
-
-    # general_embeddings = HypotheticalDocumentEmbedder(
-    #     llm_chain=llm_chain_general, base_embeddings=base_embeddings
-    # )
-    # in_depth_embeddings = HypotheticalDocumentEmbedder(
-    #     llm_chain=llm_chain_in_depth, base_embeddings=base_embeddings
-    # )
+    llm_chain_general = LLMChain(llm=llm, prompt=general_prompt)
+    llm_chain_in_depth = LLMChain(llm=llm, prompt=in_depth_prompt)

-    return base_embeddings, base_embeddings
+    general_embeddings = HypotheticalDocumentEmbedder(
+        llm_chain=llm_chain_general,
+        base_embeddings=base_embeddings,
+    )
+    in_depth_embeddings = HypotheticalDocumentEmbedder(
+        llm_chain=llm_chain_in_depth, base_embeddings=base_embeddings
+    )
+    return general_embeddings, in_depth_embeddings


 def sort_retrived_documents(doc_list):
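
This is the same switch from plain base embeddings to HyDE embedders as in preprocessor.py, with the prompts rewritten around a user_query input variable. A simplified illustration of the flow this wires up (for orientation only; not LangChain's exact implementation):

def hyde_embed_query(llm_chain, base_embeddings, user_query):
    # 1. LLMChain fills {user_query} into the prompt template and the
    #    gpt-4-1106-preview model drafts a hypothetical answer.
    hypothetical_doc = llm_chain.run(user_query=user_query)
    # 2. The base embedding model embeds the drafted answer; that vector
    #    is what gets compared against the FAISS indexes.
    return base_embeddings.embed_query(hypothetical_doc)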
