From 817489c44a81025207216e5374bd422319dcd13c Mon Sep 17 00:00:00 2001
From: XuhuiRen <44249229+XuhuiRen@users.noreply.github.com>
Date: Wed, 21 Aug 2024 23:22:50 +0800
Subject: [PATCH] Fix the data load issue for structured files (#505)

Signed-off-by: XuhuiRen
Signed-off-by: siddhivelankar23
---
 comps/dataprep/milvus/prepare_doc_milvus.py          | 12 ++++++++++--
 .../pgvector/langchain/prepare_doc_pgvector.py       | 13 +++++++++++--
 comps/dataprep/pinecone/prepare_doc_pinecone.py      | 12 ++++++++++--
 comps/dataprep/qdrant/prepare_doc_qdrant.py          | 11 +++++++++--
 comps/dataprep/redis/langchain/prepare_doc_redis.py  | 12 ++++++++++--
 .../redis/langchain_ray/prepare_doc_redis_on_ray.py  |  7 ++++++-
 comps/dataprep/utils.py                              |  9 ++++++---
 7 files changed, 62 insertions(+), 14 deletions(-)

diff --git a/comps/dataprep/milvus/prepare_doc_milvus.py b/comps/dataprep/milvus/prepare_doc_milvus.py
index 25640dab9..72cbf2424 100644
--- a/comps/dataprep/milvus/prepare_doc_milvus.py
+++ b/comps/dataprep/milvus/prepare_doc_milvus.py
@@ -92,12 +92,20 @@ def ingest_data_to_milvus(doc_path: DocPath):
         )
 
     content = document_loader(path)
-    chunks = text_splitter.split_text(content)
+
+    structured_types = [".xlsx", ".csv", ".json", ".jsonl"]
+    _, ext = os.path.splitext(path)
+
+    if ext in structured_types:
+        chunks = content
+    else:
+        chunks = text_splitter.split_text(content)
+
     if doc_path.process_table and path.endswith(".pdf"):
         table_chunks = get_tables_result(path, doc_path.table_strategy)
         chunks = chunks + table_chunks
     if logflag:
-        logger.info("[ ingest data ] Done preprocessing. Created ", len(chunks), " chunks of the original pdf")
+        logger.info("[ ingest data ] Done preprocessing. Created ", len(chunks), " chunks of the original file.")
 
     # Create vectorstore
     if MOSEC_EMBEDDING_ENDPOINT:
diff --git a/comps/dataprep/pgvector/langchain/prepare_doc_pgvector.py b/comps/dataprep/pgvector/langchain/prepare_doc_pgvector.py
index 7a02c6792..1331f3772 100644
--- a/comps/dataprep/pgvector/langchain/prepare_doc_pgvector.py
+++ b/comps/dataprep/pgvector/langchain/prepare_doc_pgvector.py
@@ -100,10 +100,19 @@ def ingest_doc_to_pgvector(doc_path: DocPath):
     text_splitter = RecursiveCharacterTextSplitter(
         chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP, add_start_index=True, separators=get_separators()
     )
+
     content = document_loader(doc_path)
-    chunks = text_splitter.split_text(content)
+
+    structured_types = [".xlsx", ".csv", ".json", ".jsonl"]
+    _, ext = os.path.splitext(doc_path)
+
+    if ext in structured_types:
+        chunks = content
+    else:
+        chunks = text_splitter.split_text(content)
+
     if logflag:
-        logger.info("Done preprocessing. Created ", len(chunks), " chunks of the original pdf")
+        logger.info("Done preprocessing. Created ", len(chunks), " chunks of the original file.")
         logger.info("PG Connection", PG_CONNECTION_STRING)
     metadata = [dict({"doc_name": str(doc_path)})]
 
diff --git a/comps/dataprep/pinecone/prepare_doc_pinecone.py b/comps/dataprep/pinecone/prepare_doc_pinecone.py
index 73f3e94af..9bb5c35ff 100644
--- a/comps/dataprep/pinecone/prepare_doc_pinecone.py
+++ b/comps/dataprep/pinecone/prepare_doc_pinecone.py
@@ -105,12 +105,20 @@ def ingest_data_to_pinecone(doc_path: DocPath):
         )
 
     content = document_loader(path)
-    chunks = text_splitter.split_text(content)
+
+    structured_types = [".xlsx", ".csv", ".json", ".jsonl"]
+    _, ext = os.path.splitext(path)
+
+    if ext in structured_types:
+        chunks = content
+    else:
+        chunks = text_splitter.split_text(content)
+
     if doc_path.process_table and path.endswith(".pdf"):
         table_chunks = get_tables_result(path, doc_path.table_strategy)
         chunks = chunks + table_chunks
     if logflag:
-        logger.info("Done preprocessing. Created ", len(chunks), " chunks of the original pdf")
+        logger.info("Done preprocessing. Created ", len(chunks), " chunks of the original file.")
 
     # Create vectorstore
     if tei_embedding_endpoint:
diff --git a/comps/dataprep/qdrant/prepare_doc_qdrant.py b/comps/dataprep/qdrant/prepare_doc_qdrant.py
index 8fe0399e2..a97987817 100644
--- a/comps/dataprep/qdrant/prepare_doc_qdrant.py
+++ b/comps/dataprep/qdrant/prepare_doc_qdrant.py
@@ -52,12 +52,19 @@ def ingest_data_to_qdrant(doc_path: DocPath):
 
     content = document_loader(path)
 
-    chunks = text_splitter.split_text(content)
+    structured_types = [".xlsx", ".csv", ".json", ".jsonl"]
+    _, ext = os.path.splitext(path)
+
+    if ext in structured_types:
+        chunks = content
+    else:
+        chunks = text_splitter.split_text(content)
+
     if doc_path.process_table and path.endswith(".pdf"):
         table_chunks = get_tables_result(path, doc_path.table_strategy)
         chunks = chunks + table_chunks
     if logflag:
-        logger.info("Done preprocessing. Created ", len(chunks), " chunks of the original pdf")
+        logger.info("Done preprocessing. Created ", len(chunks), " chunks of the original file.")
 
     # Create vectorstore
     if TEI_EMBEDDING_ENDPOINT:
diff --git a/comps/dataprep/redis/langchain/prepare_doc_redis.py b/comps/dataprep/redis/langchain/prepare_doc_redis.py
index 0948cb858..0082ddcd6 100644
--- a/comps/dataprep/redis/langchain/prepare_doc_redis.py
+++ b/comps/dataprep/redis/langchain/prepare_doc_redis.py
@@ -198,12 +198,20 @@ def ingest_data_to_redis(doc_path: DocPath):
     if logflag:
         logger.info("[ ingest data ] file content loaded")
 
-    chunks = text_splitter.split_text(content)
+    structured_types = [".xlsx", ".csv", ".json", ".jsonl"]
+    _, ext = os.path.splitext(path)
+
+    if ext in structured_types:
+        chunks = content
+    else:
+        chunks = text_splitter.split_text(content)
+
+    ### Specially processing for the table content in PDFs
     if doc_path.process_table and path.endswith(".pdf"):
         table_chunks = get_tables_result(path, doc_path.table_strategy)
         chunks = chunks + table_chunks
     if logflag:
-        logger.info(f"[ ingest data ] Done preprocessing. Created {len(chunks)} chunks of the original pdf")
+        logger.info(f"[ ingest data ] Done preprocessing. Created {len(chunks)} chunks of the given file.")
 
     file_name = doc_path.path.split("/")[-1]
     return ingest_chunks_to_redis(file_name, chunks)
diff --git a/comps/dataprep/redis/langchain_ray/prepare_doc_redis_on_ray.py b/comps/dataprep/redis/langchain_ray/prepare_doc_redis_on_ray.py
index af5095f30..d5ec731ba 100644
--- a/comps/dataprep/redis/langchain_ray/prepare_doc_redis_on_ray.py
+++ b/comps/dataprep/redis/langchain_ray/prepare_doc_redis_on_ray.py
@@ -177,7 +177,12 @@ def data_to_redis(data):
     text_splitter = RecursiveCharacterTextSplitter(
         chunk_size=1500, chunk_overlap=100, add_start_index=True, separators=get_separators(), is_separator_regex=False
     )
-    chunks = text_splitter.split_text(data)
+    if isinstance(data, list):
+        chunks = data
+    elif isinstance(data, str):
+        chunks = text_splitter.split_text(data)
+    else:
+        raise TypeError("The content must be either a list or a string.")
 
     # Create vectorstore
     if tei_embedding_endpoint:
diff --git a/comps/dataprep/utils.py b/comps/dataprep/utils.py
index ae8361539..b300af428 100644
--- a/comps/dataprep/utils.py
+++ b/comps/dataprep/utils.py
@@ -276,7 +276,8 @@ def load_json(json_path):
     """Load and process json file."""
     with open(json_path, "r") as file:
         data = json.load(file)
-    return json.dumps(data)
+    content_list = [json.dumps(item) for item in data]
+    return content_list
 
 
 def load_yaml(yaml_path):
@@ -289,13 +290,15 @@ def load_yaml(yaml_path):
 def load_xlsx(input_path):
     """Load and process xlsx file."""
     df = pd.read_excel(input_path)
-    return df.to_string()
+    content_list = df.apply(lambda row: ", ".join(row.astype(str)), axis=1).tolist()
+    return content_list
 
 
 def load_csv(input_path):
     """Load the csv file."""
     df = pd.read_csv(input_path)
-    return df.to_string()
+    content_list = df.apply(lambda row: ", ".join(row.astype(str)), axis=1).tolist()
+    return content_list
 
 
 def load_image(image_path):