From af184f311ff18a772b36dd6969cf23fa9187864c Mon Sep 17 00:00:00 2001 From: Ayyub Ibrahim Date: Sat, 4 May 2024 13:13:55 -0500 Subject: [PATCH] introduced 1. semantic chunking 2. transformers and compressors for post processing of retrieved documents --- .../src/cache/faiss_index_in_depth_cj.dvc | 4 +- .../src/cache/faiss_index_in_depth_fc.dvc | 4 +- .../src/cache/faiss_index_in_depth_news.dvc | 4 +- .../src/cache/faiss_index_in_depth_pc.dvc | 4 +- .../src/cache/faiss_index_in_depth_pdf.dvc | 4 +- packages/backend/src/preprocessor.py | 116 ++++++++++-------- .../cache/faiss_index_in_depth_cj.dvc | 4 +- .../cache/faiss_index_in_depth_fc.dvc | 4 +- .../cache/faiss_index_in_depth_news.dvc | 4 +- .../cache/faiss_index_in_depth_pc.dvc | 4 +- .../cache/faiss_index_in_depth_pdf.dvc | 4 +- .../functions/getanswer/inquirer.py | 107 +++++++++------- 12 files changed, 147 insertions(+), 116 deletions(-) diff --git a/packages/backend/src/cache/faiss_index_in_depth_cj.dvc b/packages/backend/src/cache/faiss_index_in_depth_cj.dvc index a8319f0..c0a175c 100644 --- a/packages/backend/src/cache/faiss_index_in_depth_cj.dvc +++ b/packages/backend/src/cache/faiss_index_in_depth_cj.dvc @@ -1,6 +1,6 @@ outs: -- md5: c6beb8080d1d4184ffc674e87f44d325.dir - size: 23116757 +- md5: 0b60967184b16042af19d7a5e668d976.dir + size: 23052882 nfiles: 2 hash: md5 path: faiss_index_in_depth_cj diff --git a/packages/backend/src/cache/faiss_index_in_depth_fc.dvc b/packages/backend/src/cache/faiss_index_in_depth_fc.dvc index 1983be3..dee0b81 100644 --- a/packages/backend/src/cache/faiss_index_in_depth_fc.dvc +++ b/packages/backend/src/cache/faiss_index_in_depth_fc.dvc @@ -1,6 +1,6 @@ outs: -- md5: 5429df81ded37a05e2b635a7a17f9c85.dir - size: 104396613 +- md5: 1fb1758a0371e1646b7097027a19bf96.dir + size: 104365424 nfiles: 2 hash: md5 path: faiss_index_in_depth_fc diff --git a/packages/backend/src/cache/faiss_index_in_depth_news.dvc b/packages/backend/src/cache/faiss_index_in_depth_news.dvc index 7ed4b24..4c4661d 100644 --- a/packages/backend/src/cache/faiss_index_in_depth_news.dvc +++ b/packages/backend/src/cache/faiss_index_in_depth_news.dvc @@ -1,6 +1,6 @@ outs: -- md5: 3d2ad2d2958f151049b9aed5553e2631.dir - size: 526418 +- md5: 7cd24ae248c08b35b247efc0333de089.dir + size: 436809 nfiles: 2 hash: md5 path: faiss_index_in_depth_news diff --git a/packages/backend/src/cache/faiss_index_in_depth_pc.dvc b/packages/backend/src/cache/faiss_index_in_depth_pc.dvc index b806097..a305675 100644 --- a/packages/backend/src/cache/faiss_index_in_depth_pc.dvc +++ b/packages/backend/src/cache/faiss_index_in_depth_pc.dvc @@ -1,6 +1,6 @@ outs: -- md5: d128edb351bf5f50f183908a7a980121.dir - size: 2097501 +- md5: 6a63f62685b7e2767b810e38d939fed2.dir + size: 3185640 nfiles: 2 hash: md5 path: faiss_index_in_depth_pc diff --git a/packages/backend/src/cache/faiss_index_in_depth_pdf.dvc b/packages/backend/src/cache/faiss_index_in_depth_pdf.dvc index dbda62c..57cd14c 100644 --- a/packages/backend/src/cache/faiss_index_in_depth_pdf.dvc +++ b/packages/backend/src/cache/faiss_index_in_depth_pdf.dvc @@ -1,6 +1,6 @@ outs: -- md5: e6ab47badccf9be48d8f2bc73c446cb0.dir - size: 53418928 +- md5: c8a155bd868ccfd72bb6d2a07464357d.dir + size: 86366971 nfiles: 2 hash: md5 path: faiss_index_in_depth_pdf diff --git a/packages/backend/src/preprocessor.py b/packages/backend/src/preprocessor.py index b2b276e..6e35141 100644 --- a/packages/backend/src/preprocessor.py +++ b/packages/backend/src/preprocessor.py @@ -1,15 +1,17 @@ import logging import os -from 
langchain_community.document_loaders import JSONLoader +from langchain_community.document_loaders.json_loader import JSONLoader from langchain.text_splitter import RecursiveCharacterTextSplitter from langchain_openai import OpenAIEmbeddings -from langchain.chains import LLMChain, HypotheticalDocumentEmbedder from langchain.prompts import PromptTemplate -from langchain_community.vectorstores import FAISS +from langchain_community.vectorstores.faiss import FAISS from langchain_openai import OpenAI from pathlib import Path import shutil +from langchain_experimental.text_splitter import SemanticChunker +from langchain.docstore.document import Document + logger = logging.getLogger(__name__) dir = Path(__file__).parent.absolute() @@ -35,19 +37,19 @@ def create_embeddings(): input_variables=["user_query"], template=in_depth_prompt_template ) - llm_chain_general = LLMChain(llm=llm, prompt=general_prompt) - llm_chain_in_depth = LLMChain(llm=llm, prompt=in_depth_prompt) + # llm_chain_general = LLMChain(llm=llm, prompt=general_prompt) + # llm_chain_in_depth = LLMChain(llm=llm, prompt=in_depth_prompt) - general_embeddings = HypotheticalDocumentEmbedder( - llm_chain=llm_chain_general, - base_embeddings=base_embeddings, - ) + # general_embeddings = HypotheticalDocumentEmbedder( + # llm_chain=llm_chain_general, + # base_embeddings=base_embeddings, + # ) - in_depth_embeddings = HypotheticalDocumentEmbedder( - llm_chain=llm_chain_in_depth, base_embeddings=base_embeddings - ) + # in_depth_embeddings = HypotheticalDocumentEmbedder( + # llm_chain=llm_chain_in_depth, base_embeddings=base_embeddings + # ) - return general_embeddings, in_depth_embeddings + return base_embeddings, base_embeddings def metadata_func_minutes_and_agendas(record: dict, metadata: dict) -> dict: @@ -72,11 +74,15 @@ def create_db_from_minutes_and_agendas(doc_directory): ) data = loader.load() - text_splitter = RecursiveCharacterTextSplitter( - chunk_size=2000, chunk_overlap=100 - ) - docs = text_splitter.split_documents(data) - all_docs.extend(docs) + text_splitter = SemanticChunker(OpenAIEmbeddings()) + for doc in data: + chunks = text_splitter.split_text(doc.page_content) + for chunk in chunks: + new_doc = Document(page_content=chunk, metadata=doc.metadata) + print( + f"Content: {new_doc.page_content}\nMetadata: {new_doc.metadata}\n" + ) + all_docs.append(new_doc) logger.info("Finished database from minutes...") return all_docs @@ -102,11 +108,15 @@ def create_db_from_news_transcripts(news_json_directory): ) data = loader.load() - text_splitter = RecursiveCharacterTextSplitter( - chunk_size=2000, chunk_overlap=100 - ) - docs = text_splitter.split_documents(data) - all_docs.extend(docs) + text_splitter = SemanticChunker(OpenAIEmbeddings()) + for doc in data: + chunks = text_splitter.split_text(doc.page_content) + for chunk in chunks: + new_doc = Document(page_content=chunk, metadata=doc.metadata) + print( + f"Content: {new_doc.page_content}\nMetadata: {new_doc.metadata}\n" + ) + all_docs.append(new_doc) logger.info("Finished database from news transcripts...") return all_docs @@ -135,19 +145,15 @@ def create_db_from_cj_transcripts(cj_json_directory): ) data = loader.load() - text_splitter = RecursiveCharacterTextSplitter( - chunk_size=2000, chunk_overlap=100 - ) - docs = text_splitter.split_documents(data) - - for doc in docs: - publish_date = doc.metadata.get("publish_date") - if publish_date: - doc.page_content += f" -- publish_date: {publish_date}" - else: - logger.warning(f"No publish date found for document: {doc}") - - 
all_docs.extend(docs) + text_splitter = SemanticChunker(OpenAIEmbeddings()) + for doc in data: + chunks = text_splitter.split_text(doc.page_content) + for chunk in chunks: + new_doc = Document(page_content=chunk, metadata=doc.metadata) + print( + f"Content: {new_doc.page_content}\nMetadata: {new_doc.metadata}\n" + ) + all_docs.append(new_doc) logger.info("Finished database from CJ transcripts...") return all_docs @@ -168,17 +174,15 @@ def create_db_from_fc_transcripts(fc_json_directory): ) data = loader.load() - text_splitter = RecursiveCharacterTextSplitter( - chunk_size=2000, chunk_overlap=100 - ) - docs = text_splitter.split_documents(data) - # Append the publish date to the end of page_content - for doc in docs: - publish_date = doc.metadata.get("publish_date") - if publish_date: - doc.page_content += f" -- publish_date: {publish_date}" - - all_docs.extend(docs) + text_splitter = SemanticChunker(OpenAIEmbeddings()) + for doc in data: + chunks = text_splitter.split_text(doc.page_content) + for chunk in chunks: + new_doc = Document(page_content=chunk, metadata=doc.metadata) + print( + f"Content: {new_doc.page_content}\nMetadata: {new_doc.metadata}\n" + ) + all_docs.append(new_doc) logger.info("Finished database from news transcripts...") return all_docs @@ -198,11 +202,15 @@ def create_db_from_public_comments(pc_json_directory): ) data = loader.load() - text_splitter = RecursiveCharacterTextSplitter( - chunk_size=2000, chunk_overlap=100 - ) - docs = text_splitter.split_documents(data) - all_docs.extend(docs) + text_splitter = SemanticChunker(OpenAIEmbeddings()) + for doc in data: + chunks = text_splitter.split_text(doc.page_content) + for chunk in chunks: + new_doc = Document(page_content=chunk, metadata=doc.metadata) + print( + f"Content: {new_doc.page_content}\nMetadata: {new_doc.metadata}\n" + ) + all_docs.append(new_doc) logger.info("Finished database from Public Comments...") return all_docs @@ -239,7 +247,9 @@ def create_save_and_copy_faiss(docs, embeddings, doc_type): f"googlecloud/functions/getanswer/cache/faiss_index_in_depth_{doc_type}" ) shutil.copytree(local_save_dir, cloud_dir, dirs_exist_ok=True) - logger.info(f"FAISS index for {doc_type} copied to Google Cloud directory: {cloud_dir}") + logger.info( + f"FAISS index for {doc_type} copied to Google Cloud directory: {cloud_dir}" + ) return db diff --git a/packages/googlecloud/functions/getanswer/cache/faiss_index_in_depth_cj.dvc b/packages/googlecloud/functions/getanswer/cache/faiss_index_in_depth_cj.dvc index a8319f0..c0a175c 100644 --- a/packages/googlecloud/functions/getanswer/cache/faiss_index_in_depth_cj.dvc +++ b/packages/googlecloud/functions/getanswer/cache/faiss_index_in_depth_cj.dvc @@ -1,6 +1,6 @@ outs: -- md5: c6beb8080d1d4184ffc674e87f44d325.dir - size: 23116757 +- md5: 0b60967184b16042af19d7a5e668d976.dir + size: 23052882 nfiles: 2 hash: md5 path: faiss_index_in_depth_cj diff --git a/packages/googlecloud/functions/getanswer/cache/faiss_index_in_depth_fc.dvc b/packages/googlecloud/functions/getanswer/cache/faiss_index_in_depth_fc.dvc index 1983be3..dee0b81 100644 --- a/packages/googlecloud/functions/getanswer/cache/faiss_index_in_depth_fc.dvc +++ b/packages/googlecloud/functions/getanswer/cache/faiss_index_in_depth_fc.dvc @@ -1,6 +1,6 @@ outs: -- md5: 5429df81ded37a05e2b635a7a17f9c85.dir - size: 104396613 +- md5: 1fb1758a0371e1646b7097027a19bf96.dir + size: 104365424 nfiles: 2 hash: md5 path: faiss_index_in_depth_fc diff --git a/packages/googlecloud/functions/getanswer/cache/faiss_index_in_depth_news.dvc 
b/packages/googlecloud/functions/getanswer/cache/faiss_index_in_depth_news.dvc index 7ed4b24..4c4661d 100644 --- a/packages/googlecloud/functions/getanswer/cache/faiss_index_in_depth_news.dvc +++ b/packages/googlecloud/functions/getanswer/cache/faiss_index_in_depth_news.dvc @@ -1,6 +1,6 @@ outs: -- md5: 3d2ad2d2958f151049b9aed5553e2631.dir - size: 526418 +- md5: 7cd24ae248c08b35b247efc0333de089.dir + size: 436809 nfiles: 2 hash: md5 path: faiss_index_in_depth_news diff --git a/packages/googlecloud/functions/getanswer/cache/faiss_index_in_depth_pc.dvc b/packages/googlecloud/functions/getanswer/cache/faiss_index_in_depth_pc.dvc index b806097..a305675 100644 --- a/packages/googlecloud/functions/getanswer/cache/faiss_index_in_depth_pc.dvc +++ b/packages/googlecloud/functions/getanswer/cache/faiss_index_in_depth_pc.dvc @@ -1,6 +1,6 @@ outs: -- md5: d128edb351bf5f50f183908a7a980121.dir - size: 2097501 +- md5: 6a63f62685b7e2767b810e38d939fed2.dir + size: 3185640 nfiles: 2 hash: md5 path: faiss_index_in_depth_pc diff --git a/packages/googlecloud/functions/getanswer/cache/faiss_index_in_depth_pdf.dvc b/packages/googlecloud/functions/getanswer/cache/faiss_index_in_depth_pdf.dvc index dbda62c..57cd14c 100644 --- a/packages/googlecloud/functions/getanswer/cache/faiss_index_in_depth_pdf.dvc +++ b/packages/googlecloud/functions/getanswer/cache/faiss_index_in_depth_pdf.dvc @@ -1,6 +1,6 @@ outs: -- md5: e6ab47badccf9be48d8f2bc73c446cb0.dir - size: 53418928 +- md5: c8a155bd868ccfd72bb6d2a07464357d.dir + size: 86366971 nfiles: 2 hash: md5 path: faiss_index_in_depth_pdf diff --git a/packages/googlecloud/functions/getanswer/inquirer.py b/packages/googlecloud/functions/getanswer/inquirer.py index 1103277..1b51257 100644 --- a/packages/googlecloud/functions/getanswer/inquirer.py +++ b/packages/googlecloud/functions/getanswer/inquirer.py @@ -221,19 +221,32 @@ def transform_query_for_date(query): ) -def process_and_concat_documents(retrieved_docs): - """ - Process and combine documents from multiple sources. +# def process_and_concat_documents(retrieved_docs): +# """ +# Process and combine documents from multiple sources. - :param retrieved_docs: Dictionary with keys as source names and values as lists of (Document, score) tuples. - :return: Tuple of combined string of all processed documents and list of original Document objects. - """ +# :param retrieved_docs: Dictionary with keys as source names and values as lists of (Document, score) tuples. +# :return: Tuple of combined string of all processed documents and list of original Document objects. 
+# """ +# combined_docs_content = [] +# original_documents = [] + +# for source, docs in retrieved_docs.items(): +# sorted_docs = sort_retrieved_documents(docs) +# for doc, score in sorted_docs: +# combined_docs_content.append(doc.page_content) +# original_documents.append(doc) + +# combined_content = "\n\n".join(combined_docs_content) +# return combined_content, original_documents + + +def process_and_concat_documents(retrieved_docs): combined_docs_content = [] original_documents = [] for source, docs in retrieved_docs.items(): - sorted_docs = sort_retrieved_documents(docs) - for doc, score in sorted_docs: + for doc in docs: combined_docs_content.append(doc.page_content) original_documents.append(doc) @@ -243,67 +256,75 @@ def process_and_concat_documents(retrieved_docs): def get_indepth_response_from_query(df, db_fc, db_cj, db_pdf, db_pc, db_news, query, k): logger.info("Performing in-depth summary query...") - llm = ChatOpenAI(model_name="gpt-4-turbo") - retrievers = [db_fc, db_cj, db_pdf, db_pc, db_news] - retriever_names = ["fc"] + embeddings = OpenAIEmbeddings() + + # Initialize compressors and transformers + splitter = CharacterTextSplitter(chunk_size=100, chunk_overlap=0, separator=". ") + redundant_filter = EmbeddingsRedundantFilter(embeddings=embeddings) + relevant_filter = EmbeddingsFilter(embeddings=embeddings, similarity_threshold=0.7) + + # Create a compressor pipeline + pipeline_compressor = DocumentCompressorPipeline(transformers=[redundant_filter]) + + # Wrap base retrievers with the compressor pipeline + compressed_retrievers = [ + ContextualCompressionRetriever( + base_compressor=pipeline_compressor, base_retriever=db.as_retriever() + ) + for db in [db_fc, db_cj, db_pdf, db_pc, db_news] + ] + retriever_names = [ + "fc", + "cj", + "pdf", + ] + # Initialize parallel retrieval with compressed retrievers retrieval_chains = { - name: RunnableLambda(lambda q, db=db: db.similarity_search_with_score(q, k=10)) - for name, db in zip(retriever_names, retrievers) + name: RunnableLambda(lambda q, db=db: db.get_relevant_documents(q, k=25)) + for name, db in zip(retriever_names, compressed_retrievers) } retrievals = RunnableParallel(retrieval_chains) - retrieved_docs = retrievals.invoke(query) + + compressed_docs = retrievals.invoke(query) combined_docs_content, original_documents = process_and_concat_documents( - retrieved_docs + compressed_docs ) template = """ ### Task - Focus exclusively on answering the specific question: '{question}'. - - ### Relevance Guidelines - Directly relevant information must explicitly pertain to the question. - Information that is indirectly relevant should only be used to clarify the context necessary for understanding the direct answer. - Omit any information that is irrelevant or tangential to the question. - + Focus exclusively on answering the specific question: '{question}' using only the information provided in the documents below. + ### Summary Guidelines - 1. Extract the key points, decisions, and actions discussed during the city council meetings relevant to {question}; - 2. Highlight any immediate shortcomings, mistakes, or negative actions by the city council relevant to {question}; - 3. Elaborate on the implications and broader societal or community impacts of the identified issues relevant to {question}; - 4. Investigate any underlying biases or assumptions present in the city council's discourse or actions relevant to {question}. - If not relevant to the question, answer the question without expanding on these points. 
- - ### Bias Guidelines: - Be mindful of biases in the document corpus. These documents were produced by city council, therefore, you must be aware of the inherent biases toward its behavior. - + 1. Extract the key points, decisions, and actions from the provided documents that are relevant to the question. + 2. Identify any immediate shortcomings, mistakes, or negative actions mentioned in the documents that are relevant to the question. + 3. Discuss the implications and broader societal or community impacts of the identified issues as stated in the documents, if they are relevant to the question. + 4. Highlight any underlying biases or assumptions present in the documents' content that are relevant to the question. + ### Formatting Instructions - Deliver the response in unformatted paragraph form. - Avoid any lists or bullet points. - Do not mention document analysis methods or publication dates. - - If your response includes technical terms provide a brief definition for those terms at the end of your response. Ensure each definition is on a new line, formatted as follows: + 1. Provide the response in concise, unformatted paragraphs. + 2. Avoid lists, bullet points, or mentioning document analysis methods or publication dates. + 3. If your response includes technical terms, provide a brief definition for those terms at the end of your response, with each definition on a new line, formatted as follows: Definitions: - - Word: Definition - Word: Definition - Word: Definition - + Term 1: Definition + Term 2: Definition + Term 3: Definition + ### Documents to Analyze {docs} """ prompt_response = ChatPromptTemplate.from_template(template) response_chain = prompt_response | llm | StrOutputParser() - responses_llm = response_chain.invoke( {"question": query, "docs": combined_docs_content} ) - print(responses_llm) + print(responses_llm) return process_streamed_responses_llm(responses_llm, original_documents)
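Notes on the chunking change (preprocessor.py): the patch replaces RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=100) with SemanticChunker and repeats the same chunk-and-rewrap loop in each create_db_from_* function, wrapping every chunk back into a Document so the source metadata is preserved. A minimal sketch of that shared step, factored into a single helper (the helper name is illustrative, not part of the patch; it assumes only the dependencies the patch already imports):

```python
# Illustrative helper (not part of the patch): the semantic chunking step that
# each create_db_from_* function repeats, factored out once.
from langchain.docstore.document import Document
from langchain_experimental.text_splitter import SemanticChunker
from langchain_openai import OpenAIEmbeddings


def semantically_chunk_documents(data, embeddings=None):
    """Split each loaded document on semantic boundaries, carrying its
    metadata over to every resulting chunk."""
    text_splitter = SemanticChunker(embeddings or OpenAIEmbeddings())
    chunked_docs = []
    for doc in data:
        # split_text returns plain strings; wrap each one back into a Document
        # so metadata such as publish_date and title survives chunking.
        for chunk in text_splitter.split_text(doc.page_content):
            chunked_docs.append(Document(page_content=chunk, metadata=doc.metadata))
    return chunked_docs
```

With a helper like this, each loader block reduces to all_docs.extend(semantically_chunk_documents(loader.load())) instead of repeating the nested loop five times.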
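Notes on the retrieval post-processing (inquirer.py): get_indepth_response_from_query now wraps each FAISS store in a ContextualCompressionRetriever driven by a DocumentCompressorPipeline. The hunk initializes a CharacterTextSplitter and an EmbeddingsFilter alongside the EmbeddingsRedundantFilter but only adds the redundant filter to the pipeline; the sketch below wires all three in the usual splitter, redundant-filter, relevance-filter order, which is an assumption about the intended pipeline rather than what the patch currently does (the build_compressed_retriever name is likewise illustrative):

```python
# Illustrative sketch (not part of the patch): one compressed retriever per
# FAISS store, with all three initialized transformers wired into the
# pipeline. Including the splitter and relevance filter is an assumption
# about the intended setup.
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import (
    DocumentCompressorPipeline,
    EmbeddingsFilter,
)
from langchain.text_splitter import CharacterTextSplitter
from langchain_community.document_transformers import EmbeddingsRedundantFilter
from langchain_openai import OpenAIEmbeddings


def build_compressed_retriever(db, k=25):
    """Wrap a FAISS vector store so retrieved chunks are re-split,
    de-duplicated, and relevance-filtered before reaching the LLM."""
    embeddings = OpenAIEmbeddings()
    pipeline_compressor = DocumentCompressorPipeline(
        transformers=[
            # Break retrieved chunks into sentence-sized pieces first.
            CharacterTextSplitter(chunk_size=100, chunk_overlap=0, separator=". "),
            # Drop near-duplicate pieces by embedding similarity.
            EmbeddingsRedundantFilter(embeddings=embeddings),
            # Keep only pieces sufficiently similar to the query.
            EmbeddingsFilter(embeddings=embeddings, similarity_threshold=0.7),
        ]
    )
    return ContextualCompressionRetriever(
        base_compressor=pipeline_compressor,
        base_retriever=db.as_retriever(search_kwargs={"k": k}),
    )
```

Setting k through as_retriever(search_kwargs={"k": 25}) pins the fetch size on the base retriever itself; whether the k=25 keyword passed to get_relevant_documents in the patch is honored or silently dropped depends on the installed LangChain version, so the explicit search_kwargs form is the safer choice.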
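Notes on the parallel fan-out (inquirer.py): the compressed retrievers are queried through RunnableParallel over a dict of RunnableLambda wrappers. In the hunk, retriever_names lists only "fc", "cj", and "pdf" while five compressed retrievers are built (fc, cj, pdf, pc, news), and zip stops at the shorter sequence, so the pc and news stores are not queried; the sketch below lists all five names, which is an assumption about the intended behaviour (the retrieve_in_parallel helper name is illustrative):

```python
# Illustrative sketch (not part of the patch): parallel retrieval over all
# five compressed stores. Listing "pc" and "news" alongside the three names
# in the hunk is an assumption about the intended behaviour.
from langchain_core.runnables import RunnableLambda, RunnableParallel


def retrieve_in_parallel(compressed_retrievers, query):
    """Query every compressed retriever and return {source_name: [Document, ...]}."""
    retriever_names = ["fc", "cj", "pdf", "pc", "news"]
    retrieval_chains = {
        # Bind db as a default argument so each lambda captures its own retriever.
        name: RunnableLambda(lambda q, db=db: db.get_relevant_documents(q))
        for name, db in zip(retriever_names, compressed_retrievers)
    }
    return RunnableParallel(retrieval_chains).invoke(query)
```

process_and_concat_documents then flattens the resulting dict into a single context string; unlike the commented-out version it replaces, it no longer sorts by score, which matches the new return type, since the compressed retrievers yield plain Document objects rather than (Document, score) tuples.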