Fix the data load issue for structured files (opea-project#505)
Signed-off-by: XuhuiRen <[email protected]>
XuhuiRen authored and sharanshirodkar7 committed Sep 3, 2024
1 parent 2f44b30 commit 7877d12
Showing 7 changed files with 62 additions and 14 deletions.
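All five dataprep backends (Milvus, PGVector, Pinecone, Qdrant, Redis) receive the same fix: structured files (.xlsx, .csv, .json, .jsonl) now bypass the text splitter, because their loaders already return a list of per-row strings, while unstructured documents are still split into overlapping chunks. A minimal, self-contained sketch of that dispatch (the names to_chunks and STRUCTURED_TYPES are illustrative, not the project's API):

import os

STRUCTURED_TYPES = {".xlsx", ".csv", ".json", ".jsonl"}

def to_chunks(path, content, split_text):
    """Return embedding-ready chunks for a loaded document.

    Structured loaders already yield a list of row/record strings,
    so they skip the splitter; plain text is chunked as usual.
    """
    _, ext = os.path.splitext(path)
    if ext in STRUCTURED_TYPES:
        return content              # already one string per row/record
    return split_text(content)      # e.g. RecursiveCharacterTextSplitter.split_text

# A CSV row list passes through untouched; prose is chunked.
assert to_chunks("people.csv", ["ada, 36", "grace, 45"], str.split) == ["ada, 36", "grace, 45"]
assert to_chunks("notes.txt", "alpha beta", str.split) == ["alpha", "beta"]
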
12 changes: 10 additions & 2 deletions comps/dataprep/milvus/prepare_doc_milvus.py
@@ -92,12 +92,20 @@ def ingest_data_to_milvus(doc_path: DocPath):
     )

     content = document_loader(path)
-    chunks = text_splitter.split_text(content)
+
+    structured_types = [".xlsx", ".csv", ".json", ".jsonl"]
+    _, ext = os.path.splitext(path)
+
+    if ext in structured_types:
+        chunks = content
+    else:
+        chunks = text_splitter.split_text(content)
+
     if doc_path.process_table and path.endswith(".pdf"):
         table_chunks = get_tables_result(path, doc_path.table_strategy)
         chunks = chunks + table_chunks
     if logflag:
-        logger.info("[ ingest data ] Done preprocessing. Created ", len(chunks), " chunks of the original pdf")
+        logger.info(f"[ ingest data ] Done preprocessing. Created {len(chunks)} chunks of the original file.")

     # Create vectorstore
     if MOSEC_EMBEDDING_ENDPOINT:
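A detail worth keeping in mind when reading the extension check above: os.path.splitext returns the suffix with its leading dot, so every entry in structured_types must carry the dot to match. A quick illustration:

import os

print(os.path.splitext("data/report.jsonl"))  # ('data/report', '.jsonl')
print(os.path.splitext("archive.tar.gz"))     # ('archive.tar', '.gz'); only the last suffix is split off
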
13 changes: 11 additions & 2 deletions comps/dataprep/pgvector/langchain/prepare_doc_pgvector.py
@@ -100,10 +100,19 @@ def ingest_doc_to_pgvector(doc_path: DocPath):
     text_splitter = RecursiveCharacterTextSplitter(
         chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP, add_start_index=True, separators=get_separators()
     )
+
     content = document_loader(doc_path)
-    chunks = text_splitter.split_text(content)
+
+    structured_types = [".xlsx", ".csv", ".json", ".jsonl"]
+    _, ext = os.path.splitext(doc_path)
+
+    if ext in structured_types:
+        chunks = content
+    else:
+        chunks = text_splitter.split_text(content)
+
     if logflag:
-        logger.info("Done preprocessing. Created ", len(chunks), " chunks of the original pdf")
+        logger.info(f"Done preprocessing. Created {len(chunks)} chunks of the original file.")
         logger.info("PG Connection", PG_CONNECTION_STRING)
     metadata = [dict({"doc_name": str(doc_path)})]

12 changes: 10 additions & 2 deletions comps/dataprep/pinecone/prepare_doc_pinecone.py
@@ -105,12 +105,20 @@ def ingest_data_to_pinecone(doc_path: DocPath):
     )

     content = document_loader(path)
-    chunks = text_splitter.split_text(content)
+
+    structured_types = [".xlsx", ".csv", ".json", ".jsonl"]
+    _, ext = os.path.splitext(path)
+
+    if ext in structured_types:
+        chunks = content
+    else:
+        chunks = text_splitter.split_text(content)
+
     if doc_path.process_table and path.endswith(".pdf"):
         table_chunks = get_tables_result(path, doc_path.table_strategy)
         chunks = chunks + table_chunks
     if logflag:
-        logger.info("Done preprocessing. Created ", len(chunks), " chunks of the original pdf")
+        logger.info(f"Done preprocessing. Created {len(chunks)} chunks of the original file.")

     # Create vectorstore
     if tei_embedding_endpoint:
11 changes: 9 additions & 2 deletions comps/dataprep/qdrant/prepare_doc_qdrant.py
@@ -52,12 +52,19 @@ def ingest_data_to_qdrant(doc_path: DocPath):

     content = document_loader(path)

-    chunks = text_splitter.split_text(content)
+    structured_types = [".xlsx", ".csv", ".json", ".jsonl"]
+    _, ext = os.path.splitext(path)
+
+    if ext in structured_types:
+        chunks = content
+    else:
+        chunks = text_splitter.split_text(content)
+
     if doc_path.process_table and path.endswith(".pdf"):
         table_chunks = get_tables_result(path, doc_path.table_strategy)
         chunks = chunks + table_chunks
     if logflag:
-        logger.info("Done preprocessing. Created ", len(chunks), " chunks of the original pdf")
+        logger.info(f"Done preprocessing. Created {len(chunks)} chunks of the original file.")

     # Create vectorstore
     if TEI_EMBEDDING_ENDPOINT:
12 changes: 10 additions & 2 deletions comps/dataprep/redis/langchain/prepare_doc_redis.py
@@ -198,12 +198,20 @@ def ingest_data_to_redis(doc_path: DocPath):
     if logflag:
         logger.info("[ ingest data ] file content loaded")

-    chunks = text_splitter.split_text(content)
+    structured_types = [".xlsx", ".csv", ".json", ".jsonl"]
+    _, ext = os.path.splitext(path)
+
+    if ext in structured_types:
+        chunks = content
+    else:
+        chunks = text_splitter.split_text(content)
+
     ### Special processing for the table content in PDFs
     if doc_path.process_table and path.endswith(".pdf"):
         table_chunks = get_tables_result(path, doc_path.table_strategy)
         chunks = chunks + table_chunks
     if logflag:
-        logger.info(f"[ ingest data ] Done preprocessing. Created {len(chunks)} chunks of the original pdf")
+        logger.info(f"[ ingest data ] Done preprocessing. Created {len(chunks)} chunks of the given file.")

     file_name = doc_path.path.split("/")[-1]
     return ingest_chunks_to_redis(file_name, chunks)
@@ -177,7 +177,12 @@ def data_to_redis(data):
     text_splitter = RecursiveCharacterTextSplitter(
         chunk_size=1500, chunk_overlap=100, add_start_index=True, separators=get_separators(), is_separator_regex=False
     )
-    chunks = text_splitter.split_text(data)
+    if isinstance(data, list):
+        chunks = data
+    elif isinstance(data, str):
+        chunks = text_splitter.split_text(data)
+    else:
+        raise TypeError("The content must be either a list or a string.")

     # Create vectorstore
     if tei_embedding_endpoint:
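Unlike the path-based entry points above, data_to_redis receives already-loaded content rather than a file path, so it cannot inspect an extension and dispatches on the value's type instead. The same logic restated as a standalone sketch (normalize is a hypothetical name, not part of the module):

def normalize(data, split_text):
    """List content (structured rows) passes through; a single string is split."""
    if isinstance(data, list):
        return data
    if isinstance(data, str):
        return split_text(data)
    raise TypeError("The content must be either a list or a string.")

print(normalize(["row one", "row two"], str.split))  # unchanged list
print(normalize("one two three", str.split))         # ['one', 'two', 'three']
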
9 changes: 6 additions & 3 deletions comps/dataprep/utils.py
@@ -276,7 +276,8 @@ def load_json(json_path):
     """Load and process json file."""
     with open(json_path, "r") as file:
         data = json.load(file)
-    return json.dumps(data)
+    content_list = [json.dumps(item) for item in data]
+    return content_list


 def load_yaml(yaml_path):
@@ -289,13 +290,15 @@ def load_xlsx(input_path):
 def load_xlsx(input_path):
     """Load and process xlsx file."""
     df = pd.read_excel(input_path)
-    return df.to_string()
+    content_list = df.apply(lambda row: ", ".join(row.astype(str)), axis=1).tolist()
+    return content_list


 def load_csv(input_path):
     """Load the csv file."""
     df = pd.read_csv(input_path)
-    return df.to_string()
+    content_list = df.apply(lambda row: ", ".join(row.astype(str)), axis=1).tolist()
+    return content_list


 def load_image(image_path):
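The utils.py changes are what produce the list-shaped content the branches above rely on: instead of one flattened df.to_string() blob, every row (or JSON record) becomes its own self-contained string. Note the load_json comprehension assumes a top-level JSON array; a top-level object would iterate over its keys. A small demonstration of the same row serialization with an in-memory DataFrame (assuming pandas, which utils.py already uses as pd):

import json

import pandas as pd

df = pd.DataFrame({"name": ["ada", "grace"], "age": [36, 45]})
rows = df.apply(lambda row: ", ".join(row.astype(str)), axis=1).tolist()
print(rows)  # ['ada, 36', 'grace, 45'], one embeddable string per record

records = json.loads('[{"a": 1}, {"a": 2}]')
print([json.dumps(item) for item in records])  # ['{"a": 1}', '{"a": 2}']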
