Merge pull request #254 from eye-on-surveillance/AI/chunking
AI/chunking
ayyubibrahimi authored May 4, 2024
2 parents 2a7eaf9 + af184f3 commit 9445c85
Showing 12 changed files with 147 additions and 116 deletions.
4 changes: 2 additions & 2 deletions packages/backend/src/cache/faiss_index_in_depth_cj.dvc
@@ -1,6 +1,6 @@
 outs:
-- md5: c6beb8080d1d4184ffc674e87f44d325.dir
-  size: 23116757
+- md5: 0b60967184b16042af19d7a5e668d976.dir
+  size: 23052882
   nfiles: 2
   hash: md5
   path: faiss_index_in_depth_cj
4 changes: 2 additions & 2 deletions packages/backend/src/cache/faiss_index_in_depth_fc.dvc
@@ -1,6 +1,6 @@
 outs:
-- md5: 5429df81ded37a05e2b635a7a17f9c85.dir
-  size: 104396613
+- md5: 1fb1758a0371e1646b7097027a19bf96.dir
+  size: 104365424
   nfiles: 2
   hash: md5
   path: faiss_index_in_depth_fc
4 changes: 2 additions & 2 deletions packages/backend/src/cache/faiss_index_in_depth_news.dvc
@@ -1,6 +1,6 @@
 outs:
-- md5: 3d2ad2d2958f151049b9aed5553e2631.dir
-  size: 526418
+- md5: 7cd24ae248c08b35b247efc0333de089.dir
+  size: 436809
   nfiles: 2
   hash: md5
   path: faiss_index_in_depth_news
4 changes: 2 additions & 2 deletions packages/backend/src/cache/faiss_index_in_depth_pc.dvc
@@ -1,6 +1,6 @@
 outs:
-- md5: d128edb351bf5f50f183908a7a980121.dir
-  size: 2097501
+- md5: 6a63f62685b7e2767b810e38d939fed2.dir
+  size: 3185640
   nfiles: 2
   hash: md5
   path: faiss_index_in_depth_pc
4 changes: 2 additions & 2 deletions packages/backend/src/cache/faiss_index_in_depth_pdf.dvc
@@ -1,6 +1,6 @@
 outs:
-- md5: e6ab47badccf9be48d8f2bc73c446cb0.dir
-  size: 53418928
+- md5: c8a155bd868ccfd72bb6d2a07464357d.dir
+  size: 86366971
   nfiles: 2
   hash: md5
   path: faiss_index_in_depth_pdf
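
The five pointer files above only record the new md5 digests and sizes of the rebuilt FAISS caches; the index data itself lives in the project's DVC remote. A minimal sketch of how one of the refreshed caches could be fetched and opened after this commit (the query string is illustrative, and allow_dangerous_deserialization is only needed on newer langchain-community releases):

# Hypothetical usage sketch, not part of this commit.
# First fetch the rebuilt index from the DVC remote:
#   dvc pull packages/backend/src/cache/faiss_index_in_depth_cj.dvc
from langchain_community.vectorstores.faiss import FAISS
from langchain_openai import OpenAIEmbeddings

db = FAISS.load_local(
    "packages/backend/src/cache/faiss_index_in_depth_cj",
    OpenAIEmbeddings(),
    allow_dangerous_deserialization=True,  # required by recent langchain-community versions
)
print(db.similarity_search("police budget", k=4))  # illustrative query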
116 changes: 63 additions & 53 deletions packages/backend/src/preprocessor.py
@@ -1,15 +1,17 @@
 import logging
 import os
-from langchain_community.document_loaders import JSONLoader
+from langchain_community.document_loaders.json_loader import JSONLoader
 from langchain.text_splitter import RecursiveCharacterTextSplitter
 from langchain_openai import OpenAIEmbeddings
 
 from langchain.chains import LLMChain, HypotheticalDocumentEmbedder
 from langchain.prompts import PromptTemplate
-from langchain_community.vectorstores import FAISS
+from langchain_community.vectorstores.faiss import FAISS
 from langchain_openai import OpenAI
 from pathlib import Path
 import shutil
+from langchain_experimental.text_splitter import SemanticChunker
+from langchain.docstore.document import Document
 
 
 logger = logging.getLogger(__name__)
 dir = Path(__file__).parent.absolute()
@@ -35,19 +37,19 @@ def create_embeddings():
         input_variables=["user_query"], template=in_depth_prompt_template
     )
 
-    llm_chain_general = LLMChain(llm=llm, prompt=general_prompt)
-    llm_chain_in_depth = LLMChain(llm=llm, prompt=in_depth_prompt)
+    # llm_chain_general = LLMChain(llm=llm, prompt=general_prompt)
+    # llm_chain_in_depth = LLMChain(llm=llm, prompt=in_depth_prompt)
 
-    general_embeddings = HypotheticalDocumentEmbedder(
-        llm_chain=llm_chain_general,
-        base_embeddings=base_embeddings,
-    )
+    # general_embeddings = HypotheticalDocumentEmbedder(
+    #     llm_chain=llm_chain_general,
+    #     base_embeddings=base_embeddings,
+    # )
 
-    in_depth_embeddings = HypotheticalDocumentEmbedder(
-        llm_chain=llm_chain_in_depth, base_embeddings=base_embeddings
-    )
+    # in_depth_embeddings = HypotheticalDocumentEmbedder(
+    #     llm_chain=llm_chain_in_depth, base_embeddings=base_embeddings
+    # )
 
-    return general_embeddings, in_depth_embeddings
+    return base_embeddings, base_embeddings
 
 
 def metadata_func_minutes_and_agendas(record: dict, metadata: dict) -> dict:
@@ -72,11 +74,15 @@ def create_db_from_minutes_and_agendas(doc_directory):
         )
 
         data = loader.load()
-        text_splitter = RecursiveCharacterTextSplitter(
-            chunk_size=2000, chunk_overlap=100
-        )
-        docs = text_splitter.split_documents(data)
-        all_docs.extend(docs)
+        text_splitter = SemanticChunker(OpenAIEmbeddings())
+        for doc in data:
+            chunks = text_splitter.split_text(doc.page_content)
+            for chunk in chunks:
+                new_doc = Document(page_content=chunk, metadata=doc.metadata)
+                print(
+                    f"Content: {new_doc.page_content}\nMetadata: {new_doc.metadata}\n"
+                )
+                all_docs.append(new_doc)
     logger.info("Finished database from minutes...")
     return all_docs

@@ -102,11 +108,15 @@ def create_db_from_news_transcripts(news_json_directory):
         )
 
         data = loader.load()
-        text_splitter = RecursiveCharacterTextSplitter(
-            chunk_size=2000, chunk_overlap=100
-        )
-        docs = text_splitter.split_documents(data)
-        all_docs.extend(docs)
+        text_splitter = SemanticChunker(OpenAIEmbeddings())
+        for doc in data:
+            chunks = text_splitter.split_text(doc.page_content)
+            for chunk in chunks:
+                new_doc = Document(page_content=chunk, metadata=doc.metadata)
+                print(
+                    f"Content: {new_doc.page_content}\nMetadata: {new_doc.metadata}\n"
+                )
+                all_docs.append(new_doc)
     logger.info("Finished database from news transcripts...")
     return all_docs

@@ -135,19 +145,15 @@ def create_db_from_cj_transcripts(cj_json_directory):
         )
 
         data = loader.load()
-        text_splitter = RecursiveCharacterTextSplitter(
-            chunk_size=2000, chunk_overlap=100
-        )
-        docs = text_splitter.split_documents(data)
-
-        for doc in docs:
-            publish_date = doc.metadata.get("publish_date")
-            if publish_date:
-                doc.page_content += f" -- publish_date: {publish_date}"
-            else:
-                logger.warning(f"No publish date found for document: {doc}")
-
-        all_docs.extend(docs)
+        text_splitter = SemanticChunker(OpenAIEmbeddings())
+        for doc in data:
+            chunks = text_splitter.split_text(doc.page_content)
+            for chunk in chunks:
+                new_doc = Document(page_content=chunk, metadata=doc.metadata)
+                print(
+                    f"Content: {new_doc.page_content}\nMetadata: {new_doc.metadata}\n"
+                )
+                all_docs.append(new_doc)
 
     logger.info("Finished database from CJ transcripts...")
     return all_docs
@@ -168,17 +174,15 @@ def create_db_from_fc_transcripts(fc_json_directory):
         )
 
         data = loader.load()
-        text_splitter = RecursiveCharacterTextSplitter(
-            chunk_size=2000, chunk_overlap=100
-        )
-        docs = text_splitter.split_documents(data)
-        # Append the publish date to the end of page_content
-        for doc in docs:
-            publish_date = doc.metadata.get("publish_date")
-            if publish_date:
-                doc.page_content += f" -- publish_date: {publish_date}"
-
-        all_docs.extend(docs)
+        text_splitter = SemanticChunker(OpenAIEmbeddings())
+        for doc in data:
+            chunks = text_splitter.split_text(doc.page_content)
+            for chunk in chunks:
+                new_doc = Document(page_content=chunk, metadata=doc.metadata)
+                print(
+                    f"Content: {new_doc.page_content}\nMetadata: {new_doc.metadata}\n"
+                )
+                all_docs.append(new_doc)
     logger.info("Finished database from news transcripts...")
     return all_docs

@@ -198,11 +202,15 @@ def create_db_from_public_comments(pc_json_directory):
         )
 
         data = loader.load()
-        text_splitter = RecursiveCharacterTextSplitter(
-            chunk_size=2000, chunk_overlap=100
-        )
-        docs = text_splitter.split_documents(data)
-        all_docs.extend(docs)
+        text_splitter = SemanticChunker(OpenAIEmbeddings())
+        for doc in data:
+            chunks = text_splitter.split_text(doc.page_content)
+            for chunk in chunks:
+                new_doc = Document(page_content=chunk, metadata=doc.metadata)
+                print(
+                    f"Content: {new_doc.page_content}\nMetadata: {new_doc.metadata}\n"
+                )
+                all_docs.append(new_doc)
     logger.info("Finished database from Public Comments...")
     return all_docs

@@ -239,7 +247,9 @@ def create_save_and_copy_faiss(docs, embeddings, doc_type):
         f"googlecloud/functions/getanswer/cache/faiss_index_in_depth_{doc_type}"
     )
     shutil.copytree(local_save_dir, cloud_dir, dirs_exist_ok=True)
-    logger.info(f"FAISS index for {doc_type} copied to Google Cloud directory: {cloud_dir}")
+    logger.info(
+        f"FAISS index for {doc_type} copied to Google Cloud directory: {cloud_dir}"
+    )
 
     return db
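
Taken together, the preprocessor.py changes disable the HyDE-style HypotheticalDocumentEmbedder chains (create_embeddings now returns the plain OpenAIEmbeddings object twice) and switch every create_db_from_* loader from a fixed-size RecursiveCharacterTextSplitter to SemanticChunker; the cj and fc loaders also stop appending publish_date to page_content, so the date now travels only in metadata. A minimal standalone sketch of the new chunking pattern (the sample text and metadata are illustrative, and an OPENAI_API_KEY is assumed):

# Sketch of the chunking pattern adopted above; not the repo's exact code.
from langchain_experimental.text_splitter import SemanticChunker
from langchain_openai import OpenAIEmbeddings
from langchain.docstore.document import Document

text_splitter = SemanticChunker(OpenAIEmbeddings())

source = Document(
    page_content="Example council transcript text spanning several topics ...",
    metadata={"title": "example.json", "publish_date": "2024-05-04"},
)

# SemanticChunker.split_text returns plain strings, so each chunk is
# re-wrapped in a Document that carries the original metadata forward.
chunks = [
    Document(page_content=chunk, metadata=source.metadata)
    for chunk in text_splitter.split_text(source.page_content)
]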

