introduced (1) semantic chunking and (2) transformers and compressors for post-processing of retrieved documents
ayyubibrahimi committed May 4, 2024
1 parent 2a7eaf9 commit af184f3
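
The compressor and transformer wiring for post-processing retrieved documents is part of this commit but is not expanded in the hunks shown below. As a rough illustration of that kind of pipeline, the sketch that follows chains LangChain's ContextualCompressionRetriever over one of the cached FAISS indexes: a redundancy filter drops near-duplicate chunks and an embeddings filter keeps only chunks close enough to the query. The index path, k, similarity threshold, and query are illustrative assumptions, not values taken from this commit.

from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import (
    DocumentCompressorPipeline,
    EmbeddingsFilter,
)
from langchain_community.document_transformers import EmbeddingsRedundantFilter
from langchain_community.vectorstores.faiss import FAISS
from langchain_openai import OpenAIEmbeddings

embeddings = OpenAIEmbeddings()

# Load one of the cached indexes; the flag is needed on newer langchain-community releases.
db = FAISS.load_local(
    "cache/faiss_index_in_depth_fc",
    embeddings,
    allow_dangerous_deserialization=True,
)

# Post-processing applied to the retrieved documents, in order:
# drop near-duplicate chunks, then keep only chunks similar enough to the query.
pipeline = DocumentCompressorPipeline(
    transformers=[
        EmbeddingsRedundantFilter(embeddings=embeddings),
        EmbeddingsFilter(embeddings=embeddings, similarity_threshold=0.75),
    ]
)

retriever = ContextualCompressionRetriever(
    base_compressor=pipeline,
    base_retriever=db.as_retriever(search_kwargs={"k": 20}),
)

docs = retriever.get_relevant_documents("example user query")

Filtering after retrieval keeps the FAISS search broad (high k) while trimming what is actually passed to the LLM.
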
Showing 12 changed files with 147 additions and 116 deletions.
4 changes: 2 additions & 2 deletions packages/backend/src/cache/faiss_index_in_depth_cj.dvc
@@ -1,6 +1,6 @@
 outs:
-- md5: c6beb8080d1d4184ffc674e87f44d325.dir
-  size: 23116757
+- md5: 0b60967184b16042af19d7a5e668d976.dir
+  size: 23052882
   nfiles: 2
   hash: md5
   path: faiss_index_in_depth_cj
4 changes: 2 additions & 2 deletions packages/backend/src/cache/faiss_index_in_depth_fc.dvc
@@ -1,6 +1,6 @@
 outs:
-- md5: 5429df81ded37a05e2b635a7a17f9c85.dir
-  size: 104396613
+- md5: 1fb1758a0371e1646b7097027a19bf96.dir
+  size: 104365424
   nfiles: 2
   hash: md5
   path: faiss_index_in_depth_fc
4 changes: 2 additions & 2 deletions packages/backend/src/cache/faiss_index_in_depth_news.dvc
@@ -1,6 +1,6 @@
 outs:
-- md5: 3d2ad2d2958f151049b9aed5553e2631.dir
-  size: 526418
+- md5: 7cd24ae248c08b35b247efc0333de089.dir
+  size: 436809
   nfiles: 2
   hash: md5
   path: faiss_index_in_depth_news
4 changes: 2 additions & 2 deletions packages/backend/src/cache/faiss_index_in_depth_pc.dvc
@@ -1,6 +1,6 @@
 outs:
-- md5: d128edb351bf5f50f183908a7a980121.dir
-  size: 2097501
+- md5: 6a63f62685b7e2767b810e38d939fed2.dir
+  size: 3185640
   nfiles: 2
   hash: md5
   path: faiss_index_in_depth_pc
4 changes: 2 additions & 2 deletions packages/backend/src/cache/faiss_index_in_depth_pdf.dvc
@@ -1,6 +1,6 @@
 outs:
-- md5: e6ab47badccf9be48d8f2bc73c446cb0.dir
-  size: 53418928
+- md5: c8a155bd868ccfd72bb6d2a07464357d.dir
+  size: 86366971
   nfiles: 2
   hash: md5
   path: faiss_index_in_depth_pdf
116 changes: 63 additions & 53 deletions packages/backend/src/preprocessor.py
@@ -1,15 +1,17 @@
import logging
import os
from langchain_community.document_loaders import JSONLoader
from langchain_community.document_loaders.json_loader import JSONLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings

from langchain.chains import LLMChain, HypotheticalDocumentEmbedder
from langchain.prompts import PromptTemplate
from langchain_community.vectorstores import FAISS
from langchain_community.vectorstores.faiss import FAISS
from langchain_openai import OpenAI
from pathlib import Path
import shutil
from langchain_experimental.text_splitter import SemanticChunker
from langchain.docstore.document import Document


logger = logging.getLogger(__name__)
dir = Path(__file__).parent.absolute()
@@ -35,19 +37,19 @@ def create_embeddings():
input_variables=["user_query"], template=in_depth_prompt_template
)

llm_chain_general = LLMChain(llm=llm, prompt=general_prompt)
llm_chain_in_depth = LLMChain(llm=llm, prompt=in_depth_prompt)
# llm_chain_general = LLMChain(llm=llm, prompt=general_prompt)
# llm_chain_in_depth = LLMChain(llm=llm, prompt=in_depth_prompt)

general_embeddings = HypotheticalDocumentEmbedder(
llm_chain=llm_chain_general,
base_embeddings=base_embeddings,
)
# general_embeddings = HypotheticalDocumentEmbedder(
# llm_chain=llm_chain_general,
# base_embeddings=base_embeddings,
# )

in_depth_embeddings = HypotheticalDocumentEmbedder(
llm_chain=llm_chain_in_depth, base_embeddings=base_embeddings
)
# in_depth_embeddings = HypotheticalDocumentEmbedder(
# llm_chain=llm_chain_in_depth, base_embeddings=base_embeddings
# )

return general_embeddings, in_depth_embeddings
return base_embeddings, base_embeddings


def metadata_func_minutes_and_agendas(record: dict, metadata: dict) -> dict:
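
HypotheticalDocumentEmbedder (HyDE), referenced above, asks the LLM to draft a hypothetical answer to the query and embeds that draft instead of the raw query text. Below is a minimal sketch of the equivalent from_llm constructor; the built-in "web_search" prompt key stands in for this file's custom general/in-depth prompt templates and is an assumption, not what this code uses.

from langchain.chains import HypotheticalDocumentEmbedder
from langchain_openai import OpenAI, OpenAIEmbeddings

# "web_search" is one of HyDE's built-in prompt keys, used here only as a stand-in.
hyde_embeddings = HypotheticalDocumentEmbedder.from_llm(
    llm=OpenAI(),
    base_embeddings=OpenAIEmbeddings(),
    prompt_key="web_search",
)

vector = hyde_embeddings.embed_query("example user query")
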
@@ -72,11 +74,15 @@ def create_db_from_minutes_and_agendas(doc_directory):
)

data = loader.load()
text_splitter = RecursiveCharacterTextSplitter(
chunk_size=2000, chunk_overlap=100
)
docs = text_splitter.split_documents(data)
all_docs.extend(docs)
text_splitter = SemanticChunker(OpenAIEmbeddings())
for doc in data:
chunks = text_splitter.split_text(doc.page_content)
for chunk in chunks:
new_doc = Document(page_content=chunk, metadata=doc.metadata)
print(
f"Content: {new_doc.page_content}\nMetadata: {new_doc.metadata}\n"
)
all_docs.append(new_doc)
logger.info("Finished database from minutes...")
return all_docs
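
The same chunking pattern recurs in each loader in this file: SemanticChunker.split_text returns plain strings, breaking the text where the embedding distance between adjacent sentences crosses a percentile threshold, so every chunk is re-wrapped in a Document that carries the parent record's metadata forward. A minimal standalone sketch of that pattern follows; the chunk_documents helper name is illustrative and not part of this commit.

from langchain_experimental.text_splitter import SemanticChunker
from langchain_openai import OpenAIEmbeddings
from langchain.docstore.document import Document

# Default behavior shown explicitly: split where the embedding distance between
# adjacent sentences is an outlier (95th percentile by default).
splitter = SemanticChunker(
    OpenAIEmbeddings(),
    breakpoint_threshold_type="percentile",
)

def chunk_documents(data):
    """Split each loaded record semantically, keeping its metadata on every chunk."""
    chunked = []
    for doc in data:
        for chunk in splitter.split_text(doc.page_content):
            chunked.append(Document(page_content=chunk, metadata=doc.metadata))
    return chunked

If the installed version exposes it, splitter.create_documents([doc.page_content], metadatas=[doc.metadata]) performs the same re-wrapping in one call.
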

@@ -102,11 +108,15 @@ def create_db_from_news_transcripts(news_json_directory):
)

data = loader.load()
text_splitter = RecursiveCharacterTextSplitter(
chunk_size=2000, chunk_overlap=100
)
docs = text_splitter.split_documents(data)
all_docs.extend(docs)
text_splitter = SemanticChunker(OpenAIEmbeddings())
for doc in data:
chunks = text_splitter.split_text(doc.page_content)
for chunk in chunks:
new_doc = Document(page_content=chunk, metadata=doc.metadata)
print(
f"Content: {new_doc.page_content}\nMetadata: {new_doc.metadata}\n"
)
all_docs.append(new_doc)
logger.info("Finished database from news transcripts...")
return all_docs

@@ -135,19 +145,15 @@ def create_db_from_cj_transcripts(cj_json_directory):
)

data = loader.load()
text_splitter = RecursiveCharacterTextSplitter(
chunk_size=2000, chunk_overlap=100
)
docs = text_splitter.split_documents(data)

for doc in docs:
publish_date = doc.metadata.get("publish_date")
if publish_date:
doc.page_content += f" -- publish_date: {publish_date}"
else:
logger.warning(f"No publish date found for document: {doc}")

all_docs.extend(docs)
text_splitter = SemanticChunker(OpenAIEmbeddings())
for doc in data:
chunks = text_splitter.split_text(doc.page_content)
for chunk in chunks:
new_doc = Document(page_content=chunk, metadata=doc.metadata)
print(
f"Content: {new_doc.page_content}\nMetadata: {new_doc.metadata}\n"
)
all_docs.append(new_doc)

logger.info("Finished database from CJ transcripts...")
return all_docs
@@ -168,17 +174,15 @@ def create_db_from_fc_transcripts(fc_json_directory):
)

data = loader.load()
text_splitter = RecursiveCharacterTextSplitter(
chunk_size=2000, chunk_overlap=100
)
docs = text_splitter.split_documents(data)
# Append the publish date to the end of page_content
for doc in docs:
publish_date = doc.metadata.get("publish_date")
if publish_date:
doc.page_content += f" -- publish_date: {publish_date}"

all_docs.extend(docs)
text_splitter = SemanticChunker(OpenAIEmbeddings())
for doc in data:
chunks = text_splitter.split_text(doc.page_content)
for chunk in chunks:
new_doc = Document(page_content=chunk, metadata=doc.metadata)
print(
f"Content: {new_doc.page_content}\nMetadata: {new_doc.metadata}\n"
)
all_docs.append(new_doc)
logger.info("Finished database from news transcripts...")
return all_docs

@@ -198,11 +202,15 @@ def create_db_from_public_comments(pc_json_directory):
)

data = loader.load()
text_splitter = RecursiveCharacterTextSplitter(
chunk_size=2000, chunk_overlap=100
)
docs = text_splitter.split_documents(data)
all_docs.extend(docs)
text_splitter = SemanticChunker(OpenAIEmbeddings())
for doc in data:
chunks = text_splitter.split_text(doc.page_content)
for chunk in chunks:
new_doc = Document(page_content=chunk, metadata=doc.metadata)
print(
f"Content: {new_doc.page_content}\nMetadata: {new_doc.metadata}\n"
)
all_docs.append(new_doc)
logger.info("Finished database from Public Comments...")
return all_docs

@@ -239,7 +247,9 @@ def create_save_and_copy_faiss(docs, embeddings, doc_type):
f"googlecloud/functions/getanswer/cache/faiss_index_in_depth_{doc_type}"
)
shutil.copytree(local_save_dir, cloud_dir, dirs_exist_ok=True)
logger.info(f"FAISS index for {doc_type} copied to Google Cloud directory: {cloud_dir}")
logger.info(
f"FAISS index for {doc_type} copied to Google Cloud directory: {cloud_dir}"
)

return db
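
Only the tail of create_save_and_copy_faiss appears above. For context, here is a sketch of how chunked documents typically become the cached FAISS indexes tracked by the .dvc files in this commit; the build_faiss_cache name and the local cache path are assumptions, and only the googlecloud/... destination is confirmed by the diff.

import shutil
from pathlib import Path

from langchain_community.vectorstores.faiss import FAISS

def build_faiss_cache(docs, embeddings, doc_type):
    # Embed and index the chunked documents.
    db = FAISS.from_documents(docs, embeddings)

    # Save next to this module, then mirror into the serving cache.
    local_save_dir = Path(__file__).parent / "cache" / f"faiss_index_in_depth_{doc_type}"
    db.save_local(str(local_save_dir))

    cloud_dir = f"googlecloud/functions/getanswer/cache/faiss_index_in_depth_{doc_type}"
    shutil.copytree(local_save_dir, cloud_dir, dirs_exist_ok=True)
    return db
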
