From 6fd95b4cf1911383fa9f7206f540b515a56271c2 Mon Sep 17 00:00:00 2001
From: Ayyub I
Date: Tue, 7 Nov 2023 21:15:56 -0600
Subject: [PATCH] updated model for embeddings

---
 .../backend/src/cache/faiss_index_general.dvc |  2 +-
 .../src/cache/faiss_index_in_depth.dvc        |  2 +-
 packages/backend/src/preprocessor.py          | 24 ++++++------
 .../getanswer/cache/faiss_index_general.dvc   |  2 +-
 .../getanswer/cache/faiss_index_in_depth.dvc  |  2 +-
 .../googlecloud/functions/getanswer/helper.py | 34 ++++++++++---------
 6 files changed, 33 insertions(+), 33 deletions(-)

diff --git a/packages/backend/src/cache/faiss_index_general.dvc b/packages/backend/src/cache/faiss_index_general.dvc
index 42d875fd..b3b63437 100644
--- a/packages/backend/src/cache/faiss_index_general.dvc
+++ b/packages/backend/src/cache/faiss_index_general.dvc
@@ -1,5 +1,5 @@
 outs:
-- md5: 71ba40a724f16e67f72d45805b5782c1.dir
+- md5: 1da4b1d88045a2adbc87e5d11c0a6af8.dir
   size: 85685475
   nfiles: 2
   hash: md5
diff --git a/packages/backend/src/cache/faiss_index_in_depth.dvc b/packages/backend/src/cache/faiss_index_in_depth.dvc
index 3359e9bc..96aed5ad 100644
--- a/packages/backend/src/cache/faiss_index_in_depth.dvc
+++ b/packages/backend/src/cache/faiss_index_in_depth.dvc
@@ -1,5 +1,5 @@
 outs:
-- md5: 9ebc1797364f0107c0e944eb9f1c5dbc.dir
+- md5: 038f50d4fd4c158b138d626f0fe19c2e.dir
   size: 85685475
   nfiles: 2
   hash: md5
diff --git a/packages/backend/src/preprocessor.py b/packages/backend/src/preprocessor.py
index 50ea743d..aba3550b 100644
--- a/packages/backend/src/preprocessor.py
+++ b/packages/backend/src/preprocessor.py
@@ -1,7 +1,6 @@
 import logging
 import os
 from langchain.document_loaders import (
-    Docx2txtLoader,
     JSONLoader,
 )
 from langchain.text_splitter import RecursiveCharacterTextSplitter
@@ -13,13 +12,12 @@
 from pathlib import Path
 import shutil
 
-
 logger = logging.getLogger(__name__)
 dir = Path(__file__).parent.absolute()
 
 
 def create_embeddings():
-    # llm = ChatOpenAI()
+    llm = ChatOpenAI(model="gpt-4-1106-preview")
 
     base_embeddings = OpenAIEmbeddings()
 
@@ -38,18 +36,18 @@ def create_embeddings():
         input_variables=["user_query"], template=in_depth_prompt_template
     )
 
-    # llm_chain_general = LLMChain(llm=llm, prompt=general_prompt)
-    # llm_chain_in_depth = LLMChain(llm=llm, prompt=in_depth_prompt)
+    llm_chain_general = LLMChain(llm=llm, prompt=general_prompt)
+    llm_chain_in_depth = LLMChain(llm=llm, prompt=in_depth_prompt)
 
-    # general_embeddings = HypotheticalDocumentEmbedder(
-    #     llm_chain=llm_chain_general,
-    #     base_embeddings=base_embeddings,
-    # )
-    # in_depth_embeddings = HypotheticalDocumentEmbedder(
-    #     llm_chain=llm_chain_in_depth, base_embeddings=base_embeddings
-    # )
+    general_embeddings = HypotheticalDocumentEmbedder(
+        llm_chain=llm_chain_general,
+        base_embeddings=base_embeddings,
+    )
+    in_depth_embeddings = HypotheticalDocumentEmbedder(
+        llm_chain=llm_chain_in_depth, base_embeddings=base_embeddings
+    )
 
-    return base_embeddings, base_embeddings
+    return general_embeddings, in_depth_embeddings
 
 
 def metadata_func_minutes_and_agendas(record: dict, metadata: dict) -> dict:
diff --git a/packages/googlecloud/functions/getanswer/cache/faiss_index_general.dvc b/packages/googlecloud/functions/getanswer/cache/faiss_index_general.dvc
index 42d875fd..b3b63437 100644
--- a/packages/googlecloud/functions/getanswer/cache/faiss_index_general.dvc
+++ b/packages/googlecloud/functions/getanswer/cache/faiss_index_general.dvc
@@ -1,5 +1,5 @@
 outs:
-- md5: 71ba40a724f16e67f72d45805b5782c1.dir
+- md5: 1da4b1d88045a2adbc87e5d11c0a6af8.dir
   size: 85685475
   nfiles: 2
   hash: md5
diff --git a/packages/googlecloud/functions/getanswer/cache/faiss_index_in_depth.dvc b/packages/googlecloud/functions/getanswer/cache/faiss_index_in_depth.dvc
index 3359e9bc..96aed5ad 100644
--- a/packages/googlecloud/functions/getanswer/cache/faiss_index_in_depth.dvc
+++ b/packages/googlecloud/functions/getanswer/cache/faiss_index_in_depth.dvc
@@ -1,5 +1,5 @@
 outs:
-- md5: 9ebc1797364f0107c0e944eb9f1c5dbc.dir
+- md5: 038f50d4fd4c158b138d626f0fe19c2e.dir
   size: 85685475
   nfiles: 2
   hash: md5
diff --git a/packages/googlecloud/functions/getanswer/helper.py b/packages/googlecloud/functions/getanswer/helper.py
index 08c34df5..55571659 100644
--- a/packages/googlecloud/functions/getanswer/helper.py
+++ b/packages/googlecloud/functions/getanswer/helper.py
@@ -37,34 +37,36 @@ def get_dbs():
 
 
 
 def create_embeddings():
+    llm = ChatOpenAI(model="gpt-4-1106-preview")
+
+    base_embeddings = OpenAIEmbeddings()
+
     general_prompt_template = """
-    As an AI assistant tasked with generating brief general summaries, your role is to provide succinct, balanced information from the transcripts of New Orleans City Council meetings in response to the question "{question}". The response should not exceed one paragraph in length. If the available information from the transcripts is insufficient to accurately summarize the issue, please respond with 'Insufficient information available.' If the question extends beyond the scope of information contained in the transcripts, state 'I don't know.'
+    As an AI assistant, your role is to provide concise, balanced summaries from the transcripts of New Orleans City Council meetings in response to the user's query "{user_query}". Your response should not exceed one paragraph in length. If the available information from the transcripts is insufficient to accurately summarize the issue, respond with 'Insufficient information available.' If the user's query extends beyond the scope of information contained in the transcripts, state 'I don't know.'
     Answer:"""
 
     in_depth_prompt_template = """
-    As an AI assistant tasked with providing in-depth dialogical summaries, your role is to provide comprehensive information from the transcripts of New Orleans City Council meetings. Your response should mimic the structure of a real conversation, often involving more than two exchanges between the parties. The dialogue should recreate the actual exchanges that occurred between city council members and external stakeholders in response to the question "{question}". For specific queries related to any votes that took place, your response should include detailed information. This should cover the ordinance number, who moved and seconded the motion, how each council member voted, and the final outcome of the vote. For each statement, response, and voting action, provide a summary, followed by a direct quote from the meeting transcript to ensure the context and substance of the discussion is preserved. If a question is about the voting results on a particular initiative, include in your response how each council member voted, if they were present, and if there were any abstentions or recusals. Always refer back to the original transcript to ensure accuracy. If the available information from the transcripts is insufficient to accurately answer the question or recreate the dialogue, please respond with 'Insufficient information available.' If the question extends beyond the scope of information contained in the transcripts, state 'I don't know.'
+    As an AI assistant, use the New Orleans City Council transcript data that you were trained on to provide an in-depth and balanced response to the following query: "{user_query}"
     Answer:"""
 
     general_prompt = PromptTemplate(
-        input_variables=["question"], template=general_prompt_template
+        input_variables=["user_query"], template=general_prompt_template
     )
     in_depth_prompt = PromptTemplate(
-        input_variables=["question"], template=in_depth_prompt_template
+        input_variables=["user_query"], template=in_depth_prompt_template
     )
-    # llm_chain_general = LLMChain(llm=llm, prompt=general_prompt)
-    # llm_chain_in_depth = LLMChain(llm=llm, prompt=in_depth_prompt)
-
-    base_embeddings = OpenAIEmbeddings()
-
-    # general_embeddings = HypotheticalDocumentEmbedder(
-    #     llm_chain=llm_chain_general, base_embeddings=base_embeddings
-    # )
-    # in_depth_embeddings = HypotheticalDocumentEmbedder(
-    #     llm_chain=llm_chain_in_depth, base_embeddings=base_embeddings
-    # )
+    llm_chain_general = LLMChain(llm=llm, prompt=general_prompt)
+    llm_chain_in_depth = LLMChain(llm=llm, prompt=in_depth_prompt)
 
-    return base_embeddings, base_embeddings
+    general_embeddings = HypotheticalDocumentEmbedder(
+        llm_chain=llm_chain_general,
+        base_embeddings=base_embeddings,
+    )
+    in_depth_embeddings = HypotheticalDocumentEmbedder(
+        llm_chain=llm_chain_in_depth, base_embeddings=base_embeddings
+    )
 
+    return general_embeddings, in_depth_embeddings
 
 def sort_retrived_documents(doc_list):
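---
With this patch, both create_embeddings() implementations return real HyDE
(HypotheticalDocumentEmbedder) instances instead of the plain base embeddings
that were being returned twice. A minimal sketch of what that changes at query
time, assuming the same langchain APIs these modules already import and an
OPENAI_API_KEY in the environment; the prompt text and query below are
illustrative, not taken from the repo:

    from langchain.chains import LLMChain, HypotheticalDocumentEmbedder
    from langchain.chat_models import ChatOpenAI
    from langchain.embeddings import OpenAIEmbeddings
    from langchain.prompts import PromptTemplate

    llm = ChatOpenAI(model="gpt-4-1106-preview")
    base_embeddings = OpenAIEmbeddings()

    prompt = PromptTemplate(
        input_variables=["user_query"],
        template='Answer this query about New Orleans City Council meetings: "{user_query}"',
    )

    # HyDE: the LLM first drafts a hypothetical answer document, then the
    # base model embeds that draft rather than the raw query, which tends
    # to land nearer to actual transcript passages in vector space.
    hyde = HypotheticalDocumentEmbedder(
        llm_chain=LLMChain(llm=llm, prompt=prompt),
        base_embeddings=base_embeddings,
    )

    vector = hyde.embed_query("What did the council decide on short-term rentals?")
    print(len(vector))  # dimensionality of the base OpenAI embedding

Because the FAISS indexes were built with embeddings that must match the ones
used at query time, the four .dvc hash bumps above correspond to re-built
indexes; that is why the cache changes ship in the same commit.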