From af184f311ff18a772b36dd6969cf23fa9187864c Mon Sep 17 00:00:00 2001 From: Ayyub Ibrahim Date: Sat, 4 May 2024 13:13:55 -0500 Subject: [PATCH] introduced 1. semantic chunking 2. transformers and compressors for post processing of retrieved documents --- .../src/cache/faiss_index_in_depth_cj.dvc | 4 +- .../src/cache/faiss_index_in_depth_fc.dvc | 4 +- .../src/cache/faiss_index_in_depth_news.dvc | 4 +- .../src/cache/faiss_index_in_depth_pc.dvc | 4 +- .../src/cache/faiss_index_in_depth_pdf.dvc | 4 +- packages/backend/src/preprocessor.py | 116 ++++++++++-------- .../cache/faiss_index_in_depth_cj.dvc | 4 +- .../cache/faiss_index_in_depth_fc.dvc | 4 +- .../cache/faiss_index_in_depth_news.dvc | 4 +- .../cache/faiss_index_in_depth_pc.dvc | 4 +- .../cache/faiss_index_in_depth_pdf.dvc | 4 +- .../functions/getanswer/inquirer.py | 107 +++++++++------- 12 files changed, 147 insertions(+), 116 deletions(-) diff --git a/packages/backend/src/cache/faiss_index_in_depth_cj.dvc b/packages/backend/src/cache/faiss_index_in_depth_cj.dvc index a8319f0..c0a175c 100644 --- a/packages/backend/src/cache/faiss_index_in_depth_cj.dvc +++ b/packages/backend/src/cache/faiss_index_in_depth_cj.dvc @@ -1,6 +1,6 @@ outs: -- md5: c6beb8080d1d4184ffc674e87f44d325.dir - size: 23116757 +- md5: 0b60967184b16042af19d7a5e668d976.dir + size: 23052882 nfiles: 2 hash: md5 path: faiss_index_in_depth_cj diff --git a/packages/backend/src/cache/faiss_index_in_depth_fc.dvc b/packages/backend/src/cache/faiss_index_in_depth_fc.dvc index 1983be3..dee0b81 100644 --- a/packages/backend/src/cache/faiss_index_in_depth_fc.dvc +++ b/packages/backend/src/cache/faiss_index_in_depth_fc.dvc @@ -1,6 +1,6 @@ outs: -- md5: 5429df81ded37a05e2b635a7a17f9c85.dir - size: 104396613 +- md5: 1fb1758a0371e1646b7097027a19bf96.dir + size: 104365424 nfiles: 2 hash: md5 path: faiss_index_in_depth_fc diff --git a/packages/backend/src/cache/faiss_index_in_depth_news.dvc b/packages/backend/src/cache/faiss_index_in_depth_news.dvc index 7ed4b24..4c4661d 100644 --- a/packages/backend/src/cache/faiss_index_in_depth_news.dvc +++ b/packages/backend/src/cache/faiss_index_in_depth_news.dvc @@ -1,6 +1,6 @@ outs: -- md5: 3d2ad2d2958f151049b9aed5553e2631.dir - size: 526418 +- md5: 7cd24ae248c08b35b247efc0333de089.dir + size: 436809 nfiles: 2 hash: md5 path: faiss_index_in_depth_news diff --git a/packages/backend/src/cache/faiss_index_in_depth_pc.dvc b/packages/backend/src/cache/faiss_index_in_depth_pc.dvc index b806097..a305675 100644 --- a/packages/backend/src/cache/faiss_index_in_depth_pc.dvc +++ b/packages/backend/src/cache/faiss_index_in_depth_pc.dvc @@ -1,6 +1,6 @@ outs: -- md5: d128edb351bf5f50f183908a7a980121.dir - size: 2097501 +- md5: 6a63f62685b7e2767b810e38d939fed2.dir + size: 3185640 nfiles: 2 hash: md5 path: faiss_index_in_depth_pc diff --git a/packages/backend/src/cache/faiss_index_in_depth_pdf.dvc b/packages/backend/src/cache/faiss_index_in_depth_pdf.dvc index dbda62c..57cd14c 100644 --- a/packages/backend/src/cache/faiss_index_in_depth_pdf.dvc +++ b/packages/backend/src/cache/faiss_index_in_depth_pdf.dvc @@ -1,6 +1,6 @@ outs: -- md5: e6ab47badccf9be48d8f2bc73c446cb0.dir - size: 53418928 +- md5: c8a155bd868ccfd72bb6d2a07464357d.dir + size: 86366971 nfiles: 2 hash: md5 path: faiss_index_in_depth_pdf diff --git a/packages/backend/src/preprocessor.py b/packages/backend/src/preprocessor.py index b2b276e..6e35141 100644 --- a/packages/backend/src/preprocessor.py +++ b/packages/backend/src/preprocessor.py @@ -1,15 +1,17 @@ import logging import os -from 
langchain_community.document_loaders import JSONLoader +from langchain_community.document_loaders.json_loader import JSONLoader from langchain.text_splitter import RecursiveCharacterTextSplitter from langchain_openai import OpenAIEmbeddings -from langchain.chains import LLMChain, HypotheticalDocumentEmbedder from langchain.prompts import PromptTemplate -from langchain_community.vectorstores import FAISS +from langchain_community.vectorstores.faiss import FAISS from langchain_openai import OpenAI from pathlib import Path import shutil +from langchain_experimental.text_splitter import SemanticChunker +from langchain.docstore.document import Document + logger = logging.getLogger(__name__) dir = Path(__file__).parent.absolute() @@ -35,19 +37,19 @@ def create_embeddings(): input_variables=["user_query"], template=in_depth_prompt_template ) - llm_chain_general = LLMChain(llm=llm, prompt=general_prompt) - llm_chain_in_depth = LLMChain(llm=llm, prompt=in_depth_prompt) + # llm_chain_general = LLMChain(llm=llm, prompt=general_prompt) + # llm_chain_in_depth = LLMChain(llm=llm, prompt=in_depth_prompt) - general_embeddings = HypotheticalDocumentEmbedder( - llm_chain=llm_chain_general, - base_embeddings=base_embeddings, - ) + # general_embeddings = HypotheticalDocumentEmbedder( + # llm_chain=llm_chain_general, + # base_embeddings=base_embeddings, + # ) - in_depth_embeddings = HypotheticalDocumentEmbedder( - llm_chain=llm_chain_in_depth, base_embeddings=base_embeddings - ) + # in_depth_embeddings = HypotheticalDocumentEmbedder( + # llm_chain=llm_chain_in_depth, base_embeddings=base_embeddings + # ) - return general_embeddings, in_depth_embeddings + return base_embeddings, base_embeddings def metadata_func_minutes_and_agendas(record: dict, metadata: dict) -> dict: @@ -72,11 +74,15 @@ def create_db_from_minutes_and_agendas(doc_directory): ) data = loader.load() - text_splitter = RecursiveCharacterTextSplitter( - chunk_size=2000, chunk_overlap=100 - ) - docs = text_splitter.split_documents(data) - all_docs.extend(docs) + text_splitter = SemanticChunker(OpenAIEmbeddings()) + for doc in data: + chunks = text_splitter.split_text(doc.page_content) + for chunk in chunks: + new_doc = Document(page_content=chunk, metadata=doc.metadata) + print( + f"Content: {new_doc.page_content}\nMetadata: {new_doc.metadata}\n" + ) + all_docs.append(new_doc) logger.info("Finished database from minutes...") return all_docs @@ -102,11 +108,15 @@ def create_db_from_news_transcripts(news_json_directory): ) data = loader.load() - text_splitter = RecursiveCharacterTextSplitter( - chunk_size=2000, chunk_overlap=100 - ) - docs = text_splitter.split_documents(data) - all_docs.extend(docs) + text_splitter = SemanticChunker(OpenAIEmbeddings()) + for doc in data: + chunks = text_splitter.split_text(doc.page_content) + for chunk in chunks: + new_doc = Document(page_content=chunk, metadata=doc.metadata) + print( + f"Content: {new_doc.page_content}\nMetadata: {new_doc.metadata}\n" + ) + all_docs.append(new_doc) logger.info("Finished database from news transcripts...") return all_docs @@ -135,19 +145,15 @@ def create_db_from_cj_transcripts(cj_json_directory): ) data = loader.load() - text_splitter = RecursiveCharacterTextSplitter( - chunk_size=2000, chunk_overlap=100 - ) - docs = text_splitter.split_documents(data) - - for doc in docs: - publish_date = doc.metadata.get("publish_date") - if publish_date: - doc.page_content += f" -- publish_date: {publish_date}" - else: - logger.warning(f"No publish date found for document: {doc}") - - 
all_docs.extend(docs) + text_splitter = SemanticChunker(OpenAIEmbeddings()) + for doc in data: + chunks = text_splitter.split_text(doc.page_content) + for chunk in chunks: + new_doc = Document(page_content=chunk, metadata=doc.metadata) + print( + f"Content: {new_doc.page_content}\nMetadata: {new_doc.metadata}\n" + ) + all_docs.append(new_doc) logger.info("Finished database from CJ transcripts...") return all_docs @@ -168,17 +174,15 @@ def create_db_from_fc_transcripts(fc_json_directory): ) data = loader.load() - text_splitter = RecursiveCharacterTextSplitter( - chunk_size=2000, chunk_overlap=100 - ) - docs = text_splitter.split_documents(data) - # Append the publish date to the end of page_content - for doc in docs: - publish_date = doc.metadata.get("publish_date") - if publish_date: - doc.page_content += f" -- publish_date: {publish_date}" - - all_docs.extend(docs) + text_splitter = SemanticChunker(OpenAIEmbeddings()) + for doc in data: + chunks = text_splitter.split_text(doc.page_content) + for chunk in chunks: + new_doc = Document(page_content=chunk, metadata=doc.metadata) + print( + f"Content: {new_doc.page_content}\nMetadata: {new_doc.metadata}\n" + ) + all_docs.append(new_doc) logger.info("Finished database from news transcripts...") return all_docs @@ -198,11 +202,15 @@ def create_db_from_public_comments(pc_json_directory): ) data = loader.load() - text_splitter = RecursiveCharacterTextSplitter( - chunk_size=2000, chunk_overlap=100 - ) - docs = text_splitter.split_documents(data) - all_docs.extend(docs) + text_splitter = SemanticChunker(OpenAIEmbeddings()) + for doc in data: + chunks = text_splitter.split_text(doc.page_content) + for chunk in chunks: + new_doc = Document(page_content=chunk, metadata=doc.metadata) + print( + f"Content: {new_doc.page_content}\nMetadata: {new_doc.metadata}\n" + ) + all_docs.append(new_doc) logger.info("Finished database from Public Comments...") return all_docs @@ -239,7 +247,9 @@ def create_save_and_copy_faiss(docs, embeddings, doc_type): f"googlecloud/functions/getanswer/cache/faiss_index_in_depth_{doc_type}" ) shutil.copytree(local_save_dir, cloud_dir, dirs_exist_ok=True) - logger.info(f"FAISS index for {doc_type} copied to Google Cloud directory: {cloud_dir}") + logger.info( + f"FAISS index for {doc_type} copied to Google Cloud directory: {cloud_dir}" + ) return db diff --git a/packages/googlecloud/functions/getanswer/cache/faiss_index_in_depth_cj.dvc b/packages/googlecloud/functions/getanswer/cache/faiss_index_in_depth_cj.dvc index a8319f0..c0a175c 100644 --- a/packages/googlecloud/functions/getanswer/cache/faiss_index_in_depth_cj.dvc +++ b/packages/googlecloud/functions/getanswer/cache/faiss_index_in_depth_cj.dvc @@ -1,6 +1,6 @@ outs: -- md5: c6beb8080d1d4184ffc674e87f44d325.dir - size: 23116757 +- md5: 0b60967184b16042af19d7a5e668d976.dir + size: 23052882 nfiles: 2 hash: md5 path: faiss_index_in_depth_cj diff --git a/packages/googlecloud/functions/getanswer/cache/faiss_index_in_depth_fc.dvc b/packages/googlecloud/functions/getanswer/cache/faiss_index_in_depth_fc.dvc index 1983be3..dee0b81 100644 --- a/packages/googlecloud/functions/getanswer/cache/faiss_index_in_depth_fc.dvc +++ b/packages/googlecloud/functions/getanswer/cache/faiss_index_in_depth_fc.dvc @@ -1,6 +1,6 @@ outs: -- md5: 5429df81ded37a05e2b635a7a17f9c85.dir - size: 104396613 +- md5: 1fb1758a0371e1646b7097027a19bf96.dir + size: 104365424 nfiles: 2 hash: md5 path: faiss_index_in_depth_fc diff --git a/packages/googlecloud/functions/getanswer/cache/faiss_index_in_depth_news.dvc 
b/packages/googlecloud/functions/getanswer/cache/faiss_index_in_depth_news.dvc index 7ed4b24..4c4661d 100644 --- a/packages/googlecloud/functions/getanswer/cache/faiss_index_in_depth_news.dvc +++ b/packages/googlecloud/functions/getanswer/cache/faiss_index_in_depth_news.dvc @@ -1,6 +1,6 @@ outs: -- md5: 3d2ad2d2958f151049b9aed5553e2631.dir - size: 526418 +- md5: 7cd24ae248c08b35b247efc0333de089.dir + size: 436809 nfiles: 2 hash: md5 path: faiss_index_in_depth_news diff --git a/packages/googlecloud/functions/getanswer/cache/faiss_index_in_depth_pc.dvc b/packages/googlecloud/functions/getanswer/cache/faiss_index_in_depth_pc.dvc index b806097..a305675 100644 --- a/packages/googlecloud/functions/getanswer/cache/faiss_index_in_depth_pc.dvc +++ b/packages/googlecloud/functions/getanswer/cache/faiss_index_in_depth_pc.dvc @@ -1,6 +1,6 @@ outs: -- md5: d128edb351bf5f50f183908a7a980121.dir - size: 2097501 +- md5: 6a63f62685b7e2767b810e38d939fed2.dir + size: 3185640 nfiles: 2 hash: md5 path: faiss_index_in_depth_pc diff --git a/packages/googlecloud/functions/getanswer/cache/faiss_index_in_depth_pdf.dvc b/packages/googlecloud/functions/getanswer/cache/faiss_index_in_depth_pdf.dvc index dbda62c..57cd14c 100644 --- a/packages/googlecloud/functions/getanswer/cache/faiss_index_in_depth_pdf.dvc +++ b/packages/googlecloud/functions/getanswer/cache/faiss_index_in_depth_pdf.dvc @@ -1,6 +1,6 @@ outs: -- md5: e6ab47badccf9be48d8f2bc73c446cb0.dir - size: 53418928 +- md5: c8a155bd868ccfd72bb6d2a07464357d.dir + size: 86366971 nfiles: 2 hash: md5 path: faiss_index_in_depth_pdf diff --git a/packages/googlecloud/functions/getanswer/inquirer.py b/packages/googlecloud/functions/getanswer/inquirer.py index 1103277..1b51257 100644 --- a/packages/googlecloud/functions/getanswer/inquirer.py +++ b/packages/googlecloud/functions/getanswer/inquirer.py @@ -221,19 +221,32 @@ def transform_query_for_date(query): ) -def process_and_concat_documents(retrieved_docs): - """ - Process and combine documents from multiple sources. +# def process_and_concat_documents(retrieved_docs): +# """ +# Process and combine documents from multiple sources. - :param retrieved_docs: Dictionary with keys as source names and values as lists of (Document, score) tuples. - :return: Tuple of combined string of all processed documents and list of original Document objects. - """ +# :param retrieved_docs: Dictionary with keys as source names and values as lists of (Document, score) tuples. +# :return: Tuple of combined string of all processed documents and list of original Document objects. 
+# """ +# combined_docs_content = [] +# original_documents = [] + +# for source, docs in retrieved_docs.items(): +# sorted_docs = sort_retrieved_documents(docs) +# for doc, score in sorted_docs: +# combined_docs_content.append(doc.page_content) +# original_documents.append(doc) + +# combined_content = "\n\n".join(combined_docs_content) +# return combined_content, original_documents + + +def process_and_concat_documents(retrieved_docs): combined_docs_content = [] original_documents = [] for source, docs in retrieved_docs.items(): - sorted_docs = sort_retrieved_documents(docs) - for doc, score in sorted_docs: + for doc in docs: combined_docs_content.append(doc.page_content) original_documents.append(doc) @@ -243,67 +256,75 @@ def process_and_concat_documents(retrieved_docs): def get_indepth_response_from_query(df, db_fc, db_cj, db_pdf, db_pc, db_news, query, k): logger.info("Performing in-depth summary query...") - llm = ChatOpenAI(model_name="gpt-4-turbo") - retrievers = [db_fc, db_cj, db_pdf, db_pc, db_news] - retriever_names = ["fc"] + embeddings = OpenAIEmbeddings() + + # Initialize compressors and transformers + splitter = CharacterTextSplitter(chunk_size=100, chunk_overlap=0, separator=". ") + redundant_filter = EmbeddingsRedundantFilter(embeddings=embeddings) + relevant_filter = EmbeddingsFilter(embeddings=embeddings, similarity_threshold=0.7) + + # Create a compressor pipeline + pipeline_compressor = DocumentCompressorPipeline(transformers=[redundant_filter]) + + # Wrap base retrievers with the compressor pipeline + compressed_retrievers = [ + ContextualCompressionRetriever( + base_compressor=pipeline_compressor, base_retriever=db.as_retriever() + ) + for db in [db_fc, db_cj, db_pdf, db_pc, db_news] + ] + retriever_names = [ + "fc", + "cj", + "pdf", + ] + # Initialize parallel retrieval with compressed retrievers retrieval_chains = { - name: RunnableLambda(lambda q, db=db: db.similarity_search_with_score(q, k=10)) - for name, db in zip(retriever_names, retrievers) + name: RunnableLambda(lambda q, db=db: db.get_relevant_documents(q, k=25)) + for name, db in zip(retriever_names, compressed_retrievers) } retrievals = RunnableParallel(retrieval_chains) - retrieved_docs = retrievals.invoke(query) + + compressed_docs = retrievals.invoke(query) combined_docs_content, original_documents = process_and_concat_documents( - retrieved_docs + compressed_docs ) template = """ ### Task - Focus exclusively on answering the specific question: '{question}'. - - ### Relevance Guidelines - Directly relevant information must explicitly pertain to the question. - Information that is indirectly relevant should only be used to clarify the context necessary for understanding the direct answer. - Omit any information that is irrelevant or tangential to the question. - + Focus exclusively on answering the specific question: '{question}' using only the information provided in the documents below. + ### Summary Guidelines - 1. Extract the key points, decisions, and actions discussed during the city council meetings relevant to {question}; - 2. Highlight any immediate shortcomings, mistakes, or negative actions by the city council relevant to {question}; - 3. Elaborate on the implications and broader societal or community impacts of the identified issues relevant to {question}; - 4. Investigate any underlying biases or assumptions present in the city council's discourse or actions relevant to {question}. - If not relevant to the question, answer the question without expanding on these points. 
- - ### Bias Guidelines: - Be mindful of biases in the document corpus. These documents were produced by city council, therefore, you must be aware of the inherent biases toward its behavior. - + 1. Extract the key points, decisions, and actions from the provided documents that are relevant to the question. + 2. Identify any immediate shortcomings, mistakes, or negative actions mentioned in the documents that are relevant to the question. + 3. Discuss the implications and broader societal or community impacts of the identified issues as stated in the documents, if they are relevant to the question. + 4. Highlight any underlying biases or assumptions present in the documents' content that are relevant to the question. + ### Formatting Instructions - Deliver the response in unformatted paragraph form. - Avoid any lists or bullet points. - Do not mention document analysis methods or publication dates. - - If your response includes technical terms provide a brief definition for those terms at the end of your response. Ensure each definition is on a new line, formatted as follows: + 1. Provide the response in concise, unformatted paragraphs. + 2. Avoid lists, bullet points, or mentioning document analysis methods or publication dates. + 3. If your response includes technical terms, provide a brief definition for those terms at the end of your response, with each definition on a new line, formatted as follows: Definitions: - - Word: Definition - Word: Definition - Word: Definition - + Term 1: Definition + Term 2: Definition + Term 3: Definition + ### Documents to Analyze {docs} """ prompt_response = ChatPromptTemplate.from_template(template) response_chain = prompt_response | llm | StrOutputParser() - responses_llm = response_chain.invoke( {"question": query, "docs": combined_docs_content} ) - print(responses_llm) + print(responses_llm) return process_streamed_responses_llm(responses_llm, original_documents)
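Notes on the chunking change (preprocessor.py): the patch replaces RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=100) with SemanticChunker and repeats the same chunk-and-rewrap loop in each create_db_from_* function, wrapping every chunk back into a Document so the source metadata is preserved. A minimal sketch of that shared step, factored into a single helper (the helper name is illustrative, not part of the patch; it assumes only the dependencies the patch already imports):

```python
# Illustrative helper (not part of the patch): the semantic chunking step that
# each create_db_from_* function repeats, factored out once.
from langchain.docstore.document import Document
from langchain_experimental.text_splitter import SemanticChunker
from langchain_openai import OpenAIEmbeddings


def semantically_chunk_documents(data, embeddings=None):
    """Split each loaded document on semantic boundaries, carrying its
    metadata over to every resulting chunk."""
    text_splitter = SemanticChunker(embeddings or OpenAIEmbeddings())
    chunked_docs = []
    for doc in data:
        # split_text returns plain strings; wrap each one back into a Document
        # so metadata such as publish_date and title survives chunking.
        for chunk in text_splitter.split_text(doc.page_content):
            chunked_docs.append(Document(page_content=chunk, metadata=doc.metadata))
    return chunked_docs
```

With a helper like this, each loader block reduces to all_docs.extend(semantically_chunk_documents(loader.load())) instead of repeating the nested loop five times.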
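Notes on the retrieval post-processing (inquirer.py): get_indepth_response_from_query now wraps each FAISS store in a ContextualCompressionRetriever driven by a DocumentCompressorPipeline. The hunk initializes a CharacterTextSplitter and an EmbeddingsFilter alongside the EmbeddingsRedundantFilter but only adds the redundant filter to the pipeline; the sketch below wires all three in the usual splitter, redundant-filter, relevance-filter order, which is an assumption about the intended pipeline rather than what the patch currently does (the build_compressed_retriever name is likewise illustrative):

```python
# Illustrative sketch (not part of the patch): one compressed retriever per
# FAISS store, with all three initialized transformers wired into the
# pipeline. Including the splitter and relevance filter is an assumption
# about the intended setup.
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import (
    DocumentCompressorPipeline,
    EmbeddingsFilter,
)
from langchain.text_splitter import CharacterTextSplitter
from langchain_community.document_transformers import EmbeddingsRedundantFilter
from langchain_openai import OpenAIEmbeddings


def build_compressed_retriever(db, k=25):
    """Wrap a FAISS vector store so retrieved chunks are re-split,
    de-duplicated, and relevance-filtered before reaching the LLM."""
    embeddings = OpenAIEmbeddings()
    pipeline_compressor = DocumentCompressorPipeline(
        transformers=[
            # Break retrieved chunks into sentence-sized pieces first.
            CharacterTextSplitter(chunk_size=100, chunk_overlap=0, separator=". "),
            # Drop near-duplicate pieces by embedding similarity.
            EmbeddingsRedundantFilter(embeddings=embeddings),
            # Keep only pieces sufficiently similar to the query.
            EmbeddingsFilter(embeddings=embeddings, similarity_threshold=0.7),
        ]
    )
    return ContextualCompressionRetriever(
        base_compressor=pipeline_compressor,
        base_retriever=db.as_retriever(search_kwargs={"k": k}),
    )
```

Setting k through as_retriever(search_kwargs={"k": 25}) pins the fetch size on the base retriever itself; whether the k=25 keyword passed to get_relevant_documents in the patch is honored or silently dropped depends on the installed LangChain version, so the explicit search_kwargs form is the safer choice.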
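Notes on the parallel fan-out (inquirer.py): the compressed retrievers are queried through RunnableParallel over a dict of RunnableLambda wrappers. In the hunk, retriever_names lists only "fc", "cj", and "pdf" while five compressed retrievers are built (fc, cj, pdf, pc, news), and zip stops at the shorter sequence, so the pc and news stores are not queried; the sketch below lists all five names, which is an assumption about the intended behaviour (the retrieve_in_parallel helper name is illustrative):

```python
# Illustrative sketch (not part of the patch): parallel retrieval over all
# five compressed stores. Listing "pc" and "news" alongside the three names
# in the hunk is an assumption about the intended behaviour.
from langchain_core.runnables import RunnableLambda, RunnableParallel


def retrieve_in_parallel(compressed_retrievers, query):
    """Query every compressed retriever and return {source_name: [Document, ...]}."""
    retriever_names = ["fc", "cj", "pdf", "pc", "news"]
    retrieval_chains = {
        # Bind db as a default argument so each lambda captures its own retriever.
        name: RunnableLambda(lambda q, db=db: db.get_relevant_documents(q))
        for name, db in zip(retriever_names, compressed_retrievers)
    }
    return RunnableParallel(retrieval_chains).invoke(query)
```

process_and_concat_documents then flattens the resulting dict into a single context string; unlike the commented-out version it replaces, it no longer sorts by score, which matches the new return type, since the compressed retrievers yield plain Document objects rather than (Document, score) tuples.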