diff --git a/comps/dataprep/arango/__init__.py b/comps/dataprep/arango/__init__.py
new file mode 100644
index 000000000..916f3a44b
--- /dev/null
+++ b/comps/dataprep/arango/__init__.py
@@ -0,0 +1,2 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
diff --git a/comps/dataprep/arango/langchain/Dockerfile b/comps/dataprep/arango/langchain/Dockerfile
new file mode 100644
index 000000000..7bd07262a
--- /dev/null
+++ b/comps/dataprep/arango/langchain/Dockerfile
@@ -0,0 +1,38 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

FROM python:3.11-slim

ENV LANG=C.UTF-8

ARG ARCH="cpu"

RUN apt-get update -y && apt-get install -y --no-install-recommends --fix-missing \
    build-essential \
    default-jre \
    libgl1-mesa-glx \
    libjemalloc-dev

RUN useradd -m -s /bin/bash user && \
    mkdir -p /home/user && \
    chown -R user /home/user/

USER user

COPY comps /home/user/comps

RUN pip install --no-cache-dir --upgrade pip setuptools && \
    if [ ${ARCH} = "cpu" ]; then pip install --no-cache-dir torch torchvision --index-url https://download.pytorch.org/whl/cpu; fi && \
    pip install --no-cache-dir -r /home/user/comps/dataprep/arango/langchain/requirements.txt

ENV PYTHONPATH=$PYTHONPATH:/home/user

USER root

RUN mkdir -p /home/user/comps/dataprep/arango/langchain/uploaded_files && chown -R user /home/user/comps/dataprep/arango/langchain/uploaded_files

USER user

WORKDIR /home/user/comps/dataprep/arango/langchain

ENTRYPOINT ["python", "prepare_doc_arango.py"]
diff --git a/comps/dataprep/arango/langchain/README.md b/comps/dataprep/arango/langchain/README.md
new file mode 100644
index 000000000..095a6da73
--- /dev/null
+++ b/comps/dataprep/arango/langchain/README.md
@@ -0,0 +1,111 @@
# Dataprep Microservice with ArangoDB

## 🚀Start Microservice with Python

### Install Requirements

```bash
pip install -r requirements.txt
apt-get install libtesseract-dev -y
apt-get install poppler-utils -y
```

### Start ArangoDB Server

To launch ArangoDB locally, first ensure you have Docker installed. Then you can launch the database with the following Docker command:

```bash
docker run -d --name arangodb -p 8529:8529 -e ARANGO_ROOT_PASSWORD=password arangodb/arangodb:latest
```

### Setup Environment Variables

```bash
export no_proxy=${your_no_proxy}
export http_proxy=${your_http_proxy}
export https_proxy=${your_https_proxy}
export ARANGO_URL=${your_arango_url}
export ARANGO_USERNAME=${your_arango_username}
export ARANGO_PASSWORD=${your_arango_password}
export ARANGO_DB_NAME=${your_db_name}
export PYTHONPATH=${path_to_comps}
```

### Start Document Preparation Microservice for ArangoDB with Python Script

Start the document preparation microservice for ArangoDB with the command below:

```bash
python prepare_doc_arango.py
```

## 🚀Start Microservice with Docker

### Build Docker Image

```bash
cd ../../../../
docker build -t opea/dataprep-arango:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/dataprep/arango/langchain/Dockerfile .
```
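
Before running the container, you can confirm that the ArangoDB server started earlier is reachable. The snippet below is a minimal sketch using `python-arango` (already part of this component's requirements); the host and credentials are assumed to match the `docker run` example above.

```python
from arango import ArangoClient

# Connect with the root credentials set via ARANGO_ROOT_PASSWORD in the docker run example.
client = ArangoClient(hosts="http://localhost:8529")
db = client.db("_system", username="root", password="password", verify=True)
print(f"Connected to ArangoDB {db.version()}")
```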

### Run Docker with CLI

```bash
docker run -d --name="dataprep-arango-server" -p 6007:6007 --ipc=host -e http_proxy=$http_proxy -e https_proxy=$https_proxy opea/dataprep-arango:latest
```

### Setup Environment Variables

```bash
export no_proxy=${your_no_proxy}
export http_proxy=${your_http_proxy}
export https_proxy=${your_https_proxy}
export ARANGO_URL=${your_arango_url}
export ARANGO_USERNAME=${your_arango_username}
export ARANGO_PASSWORD=${your_arango_password}
export ARANGO_DB_NAME=${your_db_name}
```

### Run Docker with Docker Compose

```bash
cd comps/dataprep/arango/langchain
docker compose -f docker-compose-dataprep-arango.yaml up -d
```

## Invoke Microservice

Once the document preparation microservice for ArangoDB is started, you can use the command below to invoke it. The microservice converts each document into a knowledge graph (with optional chunk embeddings) and saves the result to the database.

```bash
curl -X POST \
  -H "Content-Type: multipart/form-data" \
  -F "files=@./file1.txt" \
  http://localhost:6007/v1/dataprep
```

You can specify `chunk_size` and `chunk_overlap` with the following command.

```bash
curl -X POST \
  -H "Content-Type: multipart/form-data" \
  -F "files=@./file1.txt" \
  -F "chunk_size=1500" \
  -F "chunk_overlap=100" \
  http://localhost:6007/v1/dataprep
```

We support table extraction from PDF documents. You can specify `process_table` and `table_strategy` with the following command. `table_strategy` refers to the strategy used to understand tables for table retrieval. As the setting progresses from `fast` to `hq` to `llm`, the focus shifts towards deeper table understanding at the expense of processing speed. The default strategy is `fast`.

Note: If you specify `table_strategy=llm`, you should first start a TGI service (refer to sections 1.2.1 and 1.3.1 in https://github.com/opea-project/GenAIComps/tree/main/comps/llms/README.md) and then `export TGI_LLM_ENDPOINT="http://${your_ip}:8008"`.

To ensure the quality and comprehensiveness of the extracted entities, we recommend using `gpt-4o` as the default model for parsing the document. To enable the OpenAI service, please `export OPENAI_API_KEY=xxxx` before using this service.

```bash
curl -X POST \
  -H "Content-Type: multipart/form-data" \
  -F "files=@./your_file.pdf" \
  -F "process_table=true" \
  -F "table_strategy=hq" \
  http://localhost:6007/v1/dataprep
```
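
For reference, the same request can be issued from Python. This is a minimal sketch using `requests`; the file name and parameter values are illustrative.

```python
import requests

# Hypothetical local file; any supported document type works the same way.
files = {"files": ("file1.txt", open("./file1.txt", "rb"), "text/plain")}
data = {"chunk_size": 1500, "chunk_overlap": 100}

response = requests.post("http://localhost:6007/v1/dataprep", files=files, data=data)
print(response.json())  # {"status": 200, "message": "Data preparation succeeded"}
```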
diff --git a/comps/dataprep/arango/langchain/__init__.py b/comps/dataprep/arango/langchain/__init__.py
new file mode 100644
index 000000000..916f3a44b
--- /dev/null
+++ b/comps/dataprep/arango/langchain/__init__.py
@@ -0,0 +1,2 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
diff --git a/comps/dataprep/arango/langchain/config.py b/comps/dataprep/arango/langchain/config.py
new file mode 100644
index 000000000..45eb39a80
--- /dev/null
+++ b/comps/dataprep/arango/langchain/config.py
@@ -0,0 +1,21 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

import os

# ArangoDB configuration
ARANGO_URL = os.getenv("ARANGO_URL", "http://localhost:8529")
ARANGO_USERNAME = os.getenv("ARANGO_USERNAME", "root")
ARANGO_PASSWORD = os.getenv("ARANGO_PASSWORD", "test")
ARANGO_DB_NAME = os.getenv("ARANGO_DB_NAME", "_system")

# Text Generation Inference configuration
TGI_LLM_ENDPOINT = os.getenv("TGI_LLM_ENDPOINT", "http://localhost:8080")

# Text Embeddings Inference configuration
TEI_EMBEDDING_ENDPOINT = os.getenv("TEI_EMBEDDING_ENDPOINT")
HUGGINGFACEHUB_API_TOKEN = os.getenv("HUGGINGFACEHUB_API_TOKEN")
TEI_EMBED_MODEL = os.getenv("TEI_EMBED_MODEL", "BAAI/bge-base-en-v1.5")

# OpenAI configuration (alternative to TGI & TEI)
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
diff --git a/comps/dataprep/arango/langchain/docker-compose-dataprep-arango.yaml b/comps/dataprep/arango/langchain/docker-compose-dataprep-arango.yaml
new file mode 100644
index 000000000..c766b5c03
--- /dev/null
+++ b/comps/dataprep/arango/langchain/docker-compose-dataprep-arango.yaml
@@ -0,0 +1,51 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

version: "3"
services:
  arango-vector-db:
    image: arangodb/arangodb:latest
    container_name: arango-graph-db
    ports:
      - "8529:8529"
  tgi_gaudi_service:
    image: ghcr.io/huggingface/tgi-gaudi:2.0.5
    container_name: tgi-service
    ports:
      - "8088:80"
    volumes:
      - "./data:/data"
    shm_size: 1g
    environment:
      no_proxy: ${no_proxy}
      http_proxy: ${http_proxy}
      https_proxy: ${https_proxy}
      HF_TOKEN: ${HF_TOKEN}
    command: --model-id ${LLM_MODEL_ID} --auto-truncate --max-input-tokens 1024 --max-total-tokens 2048
  dataprep-arango:
    image: opea/dataprep-arango:latest
    container_name: dataprep-arango-server
    depends_on:
      - arango-vector-db
      - tgi_gaudi_service
    ports:
      - "6007:6007"
    ipc: host
    environment:
      no_proxy: ${no_proxy}
      http_proxy: ${http_proxy}
      https_proxy: ${https_proxy}
      ARANGO_URL: ${ARANGO_URL}
      ARANGO_USERNAME: ${ARANGO_USERNAME}
      ARANGO_PASSWORD: ${ARANGO_PASSWORD}
      ARANGO_DB_NAME: ${ARANGO_DB_NAME}
      TGI_LLM_ENDPOINT: ${TGI_LLM_ENDPOINT}
      TEI_EMBEDDING_ENDPOINT: ${TEI_EMBEDDING_ENDPOINT}
      HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
      TEI_EMBED_MODEL: ${TEI_EMBED_MODEL}
      OPENAI_API_KEY: ${OPENAI_API_KEY}
    restart: unless-stopped

networks:
  default:
    driver: bridge
diff --git a/comps/dataprep/arango/langchain/prepare_doc_arango.py b/comps/dataprep/arango/langchain/prepare_doc_arango.py
new file mode 100644
index 000000000..dfb383e43
--- /dev/null
+++ b/comps/dataprep/arango/langchain/prepare_doc_arango.py
@@ -0,0 +1,275 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

import json
import os
from typing import List, Optional, Union

import openai
from arango import ArangoClient
from config import (
    ARANGO_DB_NAME,
    ARANGO_PASSWORD,
    ARANGO_URL,
    ARANGO_USERNAME,
    HUGGINGFACEHUB_API_TOKEN,
    OPENAI_API_KEY,
    TEI_EMBED_MODEL,
    TEI_EMBEDDING_ENDPOINT,
    TGI_LLM_ENDPOINT,
)
from fastapi import File, Form, HTTPException, UploadFile
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceBgeEmbeddings, HuggingFaceHubEmbeddings
from langchain_community.graphs.arangodb_graph import ArangoGraph
from langchain_community.llms import HuggingFaceEndpoint
from langchain_core.documents import Document
from langchain_core.embeddings import Embeddings
from langchain_experimental.graph_transformers import LLMGraphTransformer
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_text_splitters import HTMLHeaderTextSplitter

from comps import CustomLogger, DocPath, opea_microservices, register_microservice
from comps.dataprep.utils import (
    document_loader,
    encode_filename,
    get_separators,
    get_tables_result,
    parse_html,
    save_content_to_local_disk,
)

logger = CustomLogger("prepare_doc_arango")
logflag = os.getenv("LOGFLAG", False)

upload_folder = "./uploaded_files/"


def ingest_data_to_arango(doc_path: DocPath, embeddings: Embeddings | None = None) -> bool:
    """Ingest a document into ArangoDB as a graph, optionally with chunk embeddings."""
    path = doc_path.path
    if logflag:
        logger.info(f"Parsing document {path}.")

    ############
    # ArangoDB #
    ############

    client = ArangoClient(hosts=ARANGO_URL)
    sys_db = client.db(name="_system", username=ARANGO_USERNAME, password=ARANGO_PASSWORD, verify=True)

    if not sys_db.has_database(ARANGO_DB_NAME):
        sys_db.create_database(ARANGO_DB_NAME)

    db = client.db(name=ARANGO_DB_NAME, username=ARANGO_USERNAME, password=ARANGO_PASSWORD, verify=True)

    graph = ArangoGraph(
        db=db,
        include_examples=False,
        generate_schema_on_init=False,
    )

    #############################
    # Text Generation Inference #
    #############################

    if OPENAI_API_KEY:
        logger.info("OpenAI API Key is set. Verifying its validity...")
        openai.api_key = OPENAI_API_KEY

        try:
            openai.models.list()
            logger.info("OpenAI API Key is valid.")
            llm = ChatOpenAI(temperature=0, model_name="gpt-4o")
        except openai.AuthenticationError:
            # Without a valid key there is no LLM to fall back on, so fail fast
            # instead of hitting a NameError on `llm` further down.
            logger.info("OpenAI API Key is invalid.")
            raise ValueError("OPENAI_API_KEY is set but invalid.")
        except Exception as e:
            logger.info(f"An error occurred while verifying the API Key: {e}")
            raise

    elif TGI_LLM_ENDPOINT:
        llm = HuggingFaceEndpoint(
            endpoint_url=TGI_LLM_ENDPOINT,
            max_new_tokens=512,
            top_k=40,
            top_p=0.9,
            temperature=0.8,
            timeout=600,
        )
    else:
        raise ValueError("No text generation inference endpoint is set.")

    llm_transformer = LLMGraphTransformer(
        llm=llm,
        # prompt=...,  # TODO: Parameterize
        # allowed_nodes=...,  # TODO: Parameterize
        # allowed_relationships=...,  # TODO: Parameterize
    )

    ########################################
    # Text Embeddings Inference (optional) #
    ########################################

    if OPENAI_API_KEY:
        # Use OpenAI embeddings
        embeddings = OpenAIEmbeddings(
            model="text-embedding-3-small",  # TODO: Parameterize
            dimensions=512,  # TODO: Parameterize
        )
    elif TEI_EMBEDDING_ENDPOINT and HUGGINGFACEHUB_API_TOKEN:
        # Use TEI endpoint service
        embeddings = HuggingFaceHubEmbeddings(
            model=TEI_EMBEDDING_ENDPOINT,
            huggingfacehub_api_token=HUGGINGFACEHUB_API_TOKEN,
        )
    elif TEI_EMBED_MODEL:
        # Use local embedding model (TEI_EMBED_MODEL has a default, so this must
        # be an elif chain or it would always override the OpenAI embeddings).
        embeddings = HuggingFaceBgeEmbeddings(model_name=TEI_EMBED_MODEL)
    else:
        embeddings = None

    ############
    # Chunking #
    ############

    if path.endswith(".html"):
        headers_to_split_on = [
            ("h1", "Header 1"),
            ("h2", "Header 2"),
            ("h3", "Header 3"),
        ]
        text_splitter = HTMLHeaderTextSplitter(headers_to_split_on=headers_to_split_on)
    else:
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=doc_path.chunk_size,
            chunk_overlap=doc_path.chunk_overlap,
            add_start_index=True,
            separators=get_separators(),
        )

    content = document_loader(path)

    structured_types = [".xlsx", ".csv", ".json", ".jsonl"]
    _, ext = os.path.splitext(path)

    if ext in structured_types:
        chunks = content
    else:
        chunks = text_splitter.split_text(content)

    if doc_path.process_table and path.endswith(".pdf"):
        table_chunks = get_tables_result(path, doc_path.table_strategy)
        chunks = chunks + table_chunks
    if logflag:
        logger.info(f"Done preprocessing. Created {len(chunks)} chunks of the original file.")
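
    # Each chunk below is passed through the LLM graph transformer, which
    # extracts nodes and relationships into a GraphDocument. When an embedding
    # model is configured, the chunk's source document is stored alongside its
    # embedding in the document metadata.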

    ################################
    # Graph generation & insertion #
    ################################

    generate_chunk_embeddings = embeddings is not None

    for text in chunks:
        document = Document(page_content=text)
        graph_doc = llm_transformer.process_response(document)

        if generate_chunk_embeddings:
            source = graph_doc.source
            source.metadata["embeddings"] = embeddings.embed_documents([source.page_content])[0]

        graph.add_graph_documents(
            graph_documents=[graph_doc],
            include_source=True,  # TODO: Parameterize
            graph_name="NewGraph",  # TODO: Parameterize
            update_graph_definition_if_exists=False,  # TODO: Set as reverse of `use_one_entity_collection`
            batch_size=1000,  # TODO: Parameterize
            use_one_entity_collection=True,  # TODO: Parameterize
            insert_async=False,  # TODO: Parameterize
        )

    if logflag:
        logger.info("The graph is built.")

    return True


@register_microservice(
    name="opea_service@prepare_doc_arango",
    endpoint="/v1/dataprep",
    host="0.0.0.0",
    port=6007,
    input_datatype=DocPath,
    output_datatype=None,
)
async def ingest_documents(
    files: Optional[Union[UploadFile, List[UploadFile]]] = File(None),
    link_list: Optional[str] = Form(None),
    chunk_size: int = Form(1500),
    chunk_overlap: int = Form(100),
    process_table: bool = Form(False),
    table_strategy: str = Form("fast"),
):
    if logflag:
        logger.info(f"files:{files}")
        logger.info(f"link_list:{link_list}")

    if files:
        if not isinstance(files, list):
            files = [files]
        uploaded_files = []
        for file in files:
            encode_file = encode_filename(file.filename)
            save_path = upload_folder + encode_file
            await save_content_to_local_disk(save_path, file)
            ingest_data_to_arango(
                DocPath(
                    path=save_path,
                    chunk_size=chunk_size,
                    chunk_overlap=chunk_overlap,
                    process_table=process_table,
                    table_strategy=table_strategy,
                )
            )
            uploaded_files.append(save_path)
            if logflag:
                logger.info(f"Successfully saved file {save_path}")
        result = {"status": 200, "message": "Data preparation succeeded"}
        if logflag:
            logger.info(result)
        return result

    if link_list:
        link_list = json.loads(link_list)  # Parse JSON string to list
        if not isinstance(link_list, list):
            raise HTTPException(status_code=400, detail="link_list should be a list.")
        for link in link_list:
            encoded_link = encode_filename(link)
            save_path = upload_folder + encoded_link + ".txt"
            content = parse_html([link])[0][0]
            try:
                await save_content_to_local_disk(save_path, content)
                ingest_data_to_arango(
                    DocPath(
                        path=save_path,
                        chunk_size=chunk_size,
                        chunk_overlap=chunk_overlap,
                        process_table=process_table,
                        table_strategy=table_strategy,
                    )
                )
            except Exception:
                raise HTTPException(status_code=500, detail="Failed to ingest data into ArangoDB.")

            if logflag:
                logger.info(f"Successfully saved link {link}")

        result = {"status": 200, "message": "Data preparation succeeded"}
        if logflag:
            logger.info(result)
        return result

    raise HTTPException(status_code=400, detail="Must provide either a file or a string list.")


if __name__ == "__main__":
    opea_microservices["opea_service@prepare_doc_arango"].start()
diff --git a/comps/dataprep/arango/langchain/requirements.txt b/comps/dataprep/arango/langchain/requirements.txt
new file mode 100644
index 000000000..74d4a9f0d
--- /dev/null
+++ b/comps/dataprep/arango/langchain/requirements.txt
@@ -0,0 +1,32 @@
beautifulsoup4
cairosvg
docarray[full]
docx2txt
easyocr
fastapi
huggingface_hub
langchain
git+https://github.com/arangoml/langchain.git@arangodb#subdirectory=libs/community
langchain-experimental
langchain-openai
langchain-text-splitters
langchain_huggingface
markdown
python-arango
cityhash
numpy
openai
opentelemetry-api
opentelemetry-exporter-otlp
opentelemetry-sdk
pandas
Pillow
prometheus-fastapi-instrumentator
pymupdf
pytesseract
python-docx
python-pptx
sentence_transformers
shortuuid
unstructured[all-docs]==0.15.7
uvicorn