Fix the data load issue for structured files (opea-project#505)
Signed-off-by: XuhuiRen <[email protected]>
XuhuiRen authored and sharanshirodkar7 committed Sep 3, 2024
1 parent 2f44b30 commit 7877d12
Showing 7 changed files with 62 additions and 14 deletions.
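All five dataprep backends (Milvus, PGVector, Pinecone, Qdrant, Redis) receive the same fix: structured files (.xlsx, .csv, .json, .jsonl) now bypass the text splitter, because their loaders already return a list of per-row strings, while unstructured documents are still split into overlapping chunks. A minimal, self-contained sketch of that dispatch (the names to_chunks and STRUCTURED_TYPES are illustrative, not the project's API):

import os

STRUCTURED_TYPES = {".xlsx", ".csv", ".json", ".jsonl"}

def to_chunks(path, content, split_text):
    """Return embedding-ready chunks for a loaded document.

    Structured loaders already yield a list of row/record strings,
    so they skip the splitter; plain text is chunked as usual.
    """
    _, ext = os.path.splitext(path)
    if ext in STRUCTURED_TYPES:
        return content              # already one string per row/record
    return split_text(content)      # e.g. RecursiveCharacterTextSplitter.split_text

# A CSV row list passes through untouched; prose is chunked.
assert to_chunks("people.csv", ["ada, 36", "grace, 45"], str.split) == ["ada, 36", "grace, 45"]
assert to_chunks("notes.txt", "alpha beta", str.split) == ["alpha", "beta"]
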
12 changes: 10 additions & 2 deletions comps/dataprep/milvus/prepare_doc_milvus.py
@@ -92,12 +92,20 @@ def ingest_data_to_milvus(doc_path: DocPath):
     )

     content = document_loader(path)
-    chunks = text_splitter.split_text(content)
+
+    structured_types = [".xlsx", ".csv", ".json", ".jsonl"]
+    _, ext = os.path.splitext(path)
+
+    if ext in structured_types:
+        chunks = content
+    else:
+        chunks = text_splitter.split_text(content)
+
     if doc_path.process_table and path.endswith(".pdf"):
         table_chunks = get_tables_result(path, doc_path.table_strategy)
         chunks = chunks + table_chunks
     if logflag:
-        logger.info("[ ingest data ] Done preprocessing. Created ", len(chunks), " chunks of the original pdf")
+        logger.info(f"[ ingest data ] Done preprocessing. Created {len(chunks)} chunks of the original file.")

     # Create vectorstore
     if MOSEC_EMBEDDING_ENDPOINT:
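A detail worth keeping in mind when reading the extension check above: os.path.splitext returns the suffix with its leading dot, so every entry in structured_types must carry the dot to match. A quick illustration:

import os

print(os.path.splitext("data/report.jsonl"))  # ('data/report', '.jsonl')
print(os.path.splitext("archive.tar.gz"))     # ('archive.tar', '.gz'); only the last suffix is split off
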
13 changes: 11 additions & 2 deletions comps/dataprep/pgvector/langchain/prepare_doc_pgvector.py
@@ -100,10 +100,19 @@ def ingest_doc_to_pgvector(doc_path: DocPath):
     text_splitter = RecursiveCharacterTextSplitter(
         chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP, add_start_index=True, separators=get_separators()
     )
+
     content = document_loader(doc_path)
-    chunks = text_splitter.split_text(content)
+
+    structured_types = [".xlsx", ".csv", ".json", ".jsonl"]
+    _, ext = os.path.splitext(doc_path)
+
+    if ext in structured_types:
+        chunks = content
+    else:
+        chunks = text_splitter.split_text(content)
+
     if logflag:
-        logger.info("Done preprocessing. Created ", len(chunks), " chunks of the original pdf")
+        logger.info(f"Done preprocessing. Created {len(chunks)} chunks of the original file.")
         logger.info("PG Connection", PG_CONNECTION_STRING)
     metadata = [dict({"doc_name": str(doc_path)})]

12 changes: 10 additions & 2 deletions comps/dataprep/pinecone/prepare_doc_pinecone.py
@@ -105,12 +105,20 @@ def ingest_data_to_pinecone(doc_path: DocPath):
     )

     content = document_loader(path)
-    chunks = text_splitter.split_text(content)
+
+    structured_types = [".xlsx", ".csv", ".json", ".jsonl"]
+    _, ext = os.path.splitext(path)
+
+    if ext in structured_types:
+        chunks = content
+    else:
+        chunks = text_splitter.split_text(content)
+
     if doc_path.process_table and path.endswith(".pdf"):
         table_chunks = get_tables_result(path, doc_path.table_strategy)
         chunks = chunks + table_chunks
     if logflag:
-        logger.info("Done preprocessing. Created ", len(chunks), " chunks of the original pdf")
+        logger.info(f"Done preprocessing. Created {len(chunks)} chunks of the original file.")

     # Create vectorstore
     if tei_embedding_endpoint:
11 changes: 9 additions & 2 deletions comps/dataprep/qdrant/prepare_doc_qdrant.py
@@ -52,12 +52,19 @@ def ingest_data_to_qdrant(doc_path: DocPath):

     content = document_loader(path)

-    chunks = text_splitter.split_text(content)
+    structured_types = [".xlsx", ".csv", ".json", ".jsonl"]
+    _, ext = os.path.splitext(path)
+
+    if ext in structured_types:
+        chunks = content
+    else:
+        chunks = text_splitter.split_text(content)
+
     if doc_path.process_table and path.endswith(".pdf"):
         table_chunks = get_tables_result(path, doc_path.table_strategy)
         chunks = chunks + table_chunks
     if logflag:
-        logger.info("Done preprocessing. Created ", len(chunks), " chunks of the original pdf")
+        logger.info(f"Done preprocessing. Created {len(chunks)} chunks of the original file.")

     # Create vectorstore
     if TEI_EMBEDDING_ENDPOINT:
12 changes: 10 additions & 2 deletions comps/dataprep/redis/langchain/prepare_doc_redis.py
@@ -198,12 +198,20 @@ def ingest_data_to_redis(doc_path: DocPath):
     if logflag:
         logger.info("[ ingest data ] file content loaded")

-    chunks = text_splitter.split_text(content)
+    structured_types = [".xlsx", ".csv", ".json", ".jsonl"]
+    _, ext = os.path.splitext(path)
+
+    if ext in structured_types:
+        chunks = content
+    else:
+        chunks = text_splitter.split_text(content)
+
     ### Special processing for the table content in PDFs
     if doc_path.process_table and path.endswith(".pdf"):
         table_chunks = get_tables_result(path, doc_path.table_strategy)
         chunks = chunks + table_chunks
     if logflag:
-        logger.info(f"[ ingest data ] Done preprocessing. Created {len(chunks)} chunks of the original pdf")
+        logger.info(f"[ ingest data ] Done preprocessing. Created {len(chunks)} chunks of the given file.")

     file_name = doc_path.path.split("/")[-1]
     return ingest_chunks_to_redis(file_name, chunks)
@@ -177,7 +177,12 @@ def data_to_redis(data):
     text_splitter = RecursiveCharacterTextSplitter(
         chunk_size=1500, chunk_overlap=100, add_start_index=True, separators=get_separators(), is_separator_regex=False
     )
-    chunks = text_splitter.split_text(data)
+    if isinstance(data, list):
+        chunks = data
+    elif isinstance(data, str):
+        chunks = text_splitter.split_text(data)
+    else:
+        raise TypeError("The content must be either a list or a string.")

     # Create vectorstore
     if tei_embedding_endpoint:
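Unlike the path-based entry points above, data_to_redis receives already-loaded content rather than a file path, so it cannot inspect an extension and dispatches on the value's type instead. The same logic restated as a standalone sketch (normalize is a hypothetical name, not part of the module):

def normalize(data, split_text):
    """List content (structured rows) passes through; a single string is split."""
    if isinstance(data, list):
        return data
    if isinstance(data, str):
        return split_text(data)
    raise TypeError("The content must be either a list or a string.")

print(normalize(["row one", "row two"], str.split))  # unchanged list
print(normalize("one two three", str.split))         # ['one', 'two', 'three']
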
9 changes: 6 additions & 3 deletions comps/dataprep/utils.py
@@ -276,7 +276,8 @@ def load_json(json_path):
     """Load and process json file."""
     with open(json_path, "r") as file:
         data = json.load(file)
-    return json.dumps(data)
+    content_list = [json.dumps(item) for item in data]
+    return content_list


 def load_yaml(yaml_path):
@@ -289,13 +290,15 @@ def load_xlsx(input_path):
 def load_xlsx(input_path):
     """Load and process xlsx file."""
     df = pd.read_excel(input_path)
-    return df.to_string()
+    content_list = df.apply(lambda row: ", ".join(row.astype(str)), axis=1).tolist()
+    return content_list


 def load_csv(input_path):
     """Load the csv file."""
     df = pd.read_csv(input_path)
-    return df.to_string()
+    content_list = df.apply(lambda row: ", ".join(row.astype(str)), axis=1).tolist()
+    return content_list


 def load_image(image_path):
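The utils.py changes are what produce the list-shaped content the branches above rely on: instead of one flattened df.to_string() blob, every row (or JSON record) becomes its own self-contained string. Note the load_json comprehension assumes a top-level JSON array; a top-level object would iterate over its keys. A small demonstration of the same row serialization with an in-memory DataFrame (assuming pandas, which utils.py already uses as pd):

import json

import pandas as pd

df = pd.DataFrame({"name": ["ada", "grace"], "age": [36, 45]})
rows = df.apply(lambda row: ", ".join(row.astype(str)), axis=1).tolist()
print(rows)  # ['ada, 36', 'grace, 45'], one embeddable string per record

records = json.loads('[{"a": 1}, {"a": 2}]')
print([json.dumps(item) for item in records])  # ['{"a": 1}', '{"a": 2}']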
