From 817489c44a81025207216e5374bd422319dcd13c Mon Sep 17 00:00:00 2001
From: XuhuiRen <44249229+XuhuiRen@users.noreply.github.com>
Date: Wed, 21 Aug 2024 23:22:50 +0800
Subject: [PATCH] Fix the data load issue for structured files (#505)

Signed-off-by: XuhuiRen
Signed-off-by: siddhivelankar23
---
 comps/dataprep/milvus/prepare_doc_milvus.py          | 12 ++++++++++--
 .../pgvector/langchain/prepare_doc_pgvector.py       | 13 +++++++++++--
 comps/dataprep/pinecone/prepare_doc_pinecone.py      | 12 ++++++++++--
 comps/dataprep/qdrant/prepare_doc_qdrant.py          | 11 +++++++++--
 comps/dataprep/redis/langchain/prepare_doc_redis.py  | 12 ++++++++++--
 .../redis/langchain_ray/prepare_doc_redis_on_ray.py  |  7 ++++++-
 comps/dataprep/utils.py                              |  9 ++++++---
 7 files changed, 62 insertions(+), 14 deletions(-)

diff --git a/comps/dataprep/milvus/prepare_doc_milvus.py b/comps/dataprep/milvus/prepare_doc_milvus.py
index 25640dab9..72cbf2424 100644
--- a/comps/dataprep/milvus/prepare_doc_milvus.py
+++ b/comps/dataprep/milvus/prepare_doc_milvus.py
@@ -92,12 +92,20 @@ def ingest_data_to_milvus(doc_path: DocPath):
         )
 
     content = document_loader(path)
-    chunks = text_splitter.split_text(content)
+
+    structured_types = [".xlsx", ".csv", ".json", ".jsonl"]
+    _, ext = os.path.splitext(path)
+
+    if ext in structured_types:
+        chunks = content
+    else:
+        chunks = text_splitter.split_text(content)
+
     if doc_path.process_table and path.endswith(".pdf"):
         table_chunks = get_tables_result(path, doc_path.table_strategy)
         chunks = chunks + table_chunks
     if logflag:
-        logger.info("[ ingest data ] Done preprocessing. Created ", len(chunks), " chunks of the original pdf")
+        logger.info("[ ingest data ] Done preprocessing. Created ", len(chunks), " chunks of the original file.")
 
     # Create vectorstore
     if MOSEC_EMBEDDING_ENDPOINT:
diff --git a/comps/dataprep/pgvector/langchain/prepare_doc_pgvector.py b/comps/dataprep/pgvector/langchain/prepare_doc_pgvector.py
index 7a02c6792..1331f3772 100644
--- a/comps/dataprep/pgvector/langchain/prepare_doc_pgvector.py
+++ b/comps/dataprep/pgvector/langchain/prepare_doc_pgvector.py
@@ -100,10 +100,19 @@ def ingest_doc_to_pgvector(doc_path: DocPath):
     text_splitter = RecursiveCharacterTextSplitter(
         chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP, add_start_index=True, separators=get_separators()
     )
+
     content = document_loader(doc_path)
-    chunks = text_splitter.split_text(content)
+
+    structured_types = [".xlsx", ".csv", ".json", ".jsonl"]
+    _, ext = os.path.splitext(doc_path)
+
+    if ext in structured_types:
+        chunks = content
+    else:
+        chunks = text_splitter.split_text(content)
+
     if logflag:
-        logger.info("Done preprocessing. Created ", len(chunks), " chunks of the original pdf")
+        logger.info("Done preprocessing. Created ", len(chunks), " chunks of the original file.")
         logger.info("PG Connection", PG_CONNECTION_STRING)
     metadata = [dict({"doc_name": str(doc_path)})]
 
diff --git a/comps/dataprep/pinecone/prepare_doc_pinecone.py b/comps/dataprep/pinecone/prepare_doc_pinecone.py
index 73f3e94af..9bb5c35ff 100644
--- a/comps/dataprep/pinecone/prepare_doc_pinecone.py
+++ b/comps/dataprep/pinecone/prepare_doc_pinecone.py
@@ -105,12 +105,20 @@ def ingest_data_to_pinecone(doc_path: DocPath):
         )
 
     content = document_loader(path)
-    chunks = text_splitter.split_text(content)
+
+    structured_types = [".xlsx", ".csv", ".json", ".jsonl"]
+    _, ext = os.path.splitext(path)
+
+    if ext in structured_types:
+        chunks = content
+    else:
+        chunks = text_splitter.split_text(content)
+
     if doc_path.process_table and path.endswith(".pdf"):
         table_chunks = get_tables_result(path, doc_path.table_strategy)
         chunks = chunks + table_chunks
     if logflag:
-        logger.info("Done preprocessing. Created ", len(chunks), " chunks of the original pdf")
+        logger.info("Done preprocessing. Created ", len(chunks), " chunks of the original file.")
 
     # Create vectorstore
     if tei_embedding_endpoint:
diff --git a/comps/dataprep/qdrant/prepare_doc_qdrant.py b/comps/dataprep/qdrant/prepare_doc_qdrant.py
index 8fe0399e2..a97987817 100644
--- a/comps/dataprep/qdrant/prepare_doc_qdrant.py
+++ b/comps/dataprep/qdrant/prepare_doc_qdrant.py
@@ -52,12 +52,19 @@ def ingest_data_to_qdrant(doc_path: DocPath):
 
     content = document_loader(path)
 
-    chunks = text_splitter.split_text(content)
+    structured_types = [".xlsx", ".csv", ".json", ".jsonl"]
+    _, ext = os.path.splitext(path)
+
+    if ext in structured_types:
+        chunks = content
+    else:
+        chunks = text_splitter.split_text(content)
+
     if doc_path.process_table and path.endswith(".pdf"):
         table_chunks = get_tables_result(path, doc_path.table_strategy)
         chunks = chunks + table_chunks
     if logflag:
-        logger.info("Done preprocessing. Created ", len(chunks), " chunks of the original pdf")
+        logger.info("Done preprocessing. Created ", len(chunks), " chunks of the original file.")
 
     # Create vectorstore
     if TEI_EMBEDDING_ENDPOINT:
diff --git a/comps/dataprep/redis/langchain/prepare_doc_redis.py b/comps/dataprep/redis/langchain/prepare_doc_redis.py
index 0948cb858..0082ddcd6 100644
--- a/comps/dataprep/redis/langchain/prepare_doc_redis.py
+++ b/comps/dataprep/redis/langchain/prepare_doc_redis.py
@@ -198,12 +198,20 @@ def ingest_data_to_redis(doc_path: DocPath):
     if logflag:
         logger.info("[ ingest data ] file content loaded")
 
-    chunks = text_splitter.split_text(content)
+    structured_types = [".xlsx", ".csv", ".json", ".jsonl"]
+    _, ext = os.path.splitext(path)
+
+    if ext in structured_types:
+        chunks = content
+    else:
+        chunks = text_splitter.split_text(content)
+
+    ### Specially processing for the table content in PDFs
     if doc_path.process_table and path.endswith(".pdf"):
         table_chunks = get_tables_result(path, doc_path.table_strategy)
         chunks = chunks + table_chunks
     if logflag:
-        logger.info(f"[ ingest data ] Done preprocessing. Created {len(chunks)} chunks of the original pdf")
+        logger.info(f"[ ingest data ] Done preprocessing. Created {len(chunks)} chunks of the given file.")
 
     file_name = doc_path.path.split("/")[-1]
     return ingest_chunks_to_redis(file_name, chunks)
diff --git a/comps/dataprep/redis/langchain_ray/prepare_doc_redis_on_ray.py b/comps/dataprep/redis/langchain_ray/prepare_doc_redis_on_ray.py
index af5095f30..d5ec731ba 100644
--- a/comps/dataprep/redis/langchain_ray/prepare_doc_redis_on_ray.py
+++ b/comps/dataprep/redis/langchain_ray/prepare_doc_redis_on_ray.py
@@ -177,7 +177,12 @@ def data_to_redis(data):
     text_splitter = RecursiveCharacterTextSplitter(
         chunk_size=1500, chunk_overlap=100, add_start_index=True, separators=get_separators(), is_separator_regex=False
     )
-    chunks = text_splitter.split_text(data)
+    if isinstance(data, list):
+        chunks = data
+    elif isinstance(data, str):
+        chunks = text_splitter.split_text(data)
+    else:
+        raise TypeError("The content must be either a list or a string.")
 
     # Create vectorstore
     if tei_embedding_endpoint:
diff --git a/comps/dataprep/utils.py b/comps/dataprep/utils.py
index ae8361539..b300af428 100644
--- a/comps/dataprep/utils.py
+++ b/comps/dataprep/utils.py
@@ -276,7 +276,8 @@ def load_json(json_path):
     """Load and process json file."""
     with open(json_path, "r") as file:
         data = json.load(file)
-    return json.dumps(data)
+    content_list = [json.dumps(item) for item in data]
+    return content_list
 
 
 def load_yaml(yaml_path):
@@ -289,13 +290,15 @@ def load_yaml(yaml_path):
 def load_xlsx(input_path):
     """Load and process xlsx file."""
     df = pd.read_excel(input_path)
-    return df.to_string()
+    content_list = df.apply(lambda row: ", ".join(row.astype(str)), axis=1).tolist()
+    return content_list
 
 
 def load_csv(input_path):
     """Load the csv file."""
     df = pd.read_csv(input_path)
-    return df.to_string()
+    content_list = df.apply(lambda row: ", ".join(row.astype(str)), axis=1).tolist()
+    return content_list
 
 
 def load_image(image_path):