Adding VDMS retriever and test - This PR implements the VDMS retriever and its test. PLEASE NOTE: the retriever's input is `EmbedDoc` and its return type is `SearchedDoc` (these types may need to be updated for service integration) #467

Closed
wants to merge 18 commits
1 change: 1 addition & 0 deletions comps/__init__.py
@@ -12,6 +12,7 @@
GeneratedDoc,
LLMParamsDoc,
SearchedDoc,
SearchedMultimodalDoc,
RerankedDoc,
TextDoc,
RAGASParams,
21 changes: 20 additions & 1 deletion comps/cores/proto/docarray.py
@@ -1,7 +1,7 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

from typing import Dict, List, Optional, Union
from typing import Dict, List, Optional, Tuple, Union

import numpy as np
from docarray import BaseDoc, DocList
@@ -20,6 +20,14 @@ class TextDoc(BaseDoc, TopologyInfo):
text: str


class ImageDoc(BaseDoc):
image_path: str


class TextImageDoc(BaseDoc):
doc: Tuple[Union[TextDoc, ImageDoc]]


class Base64ByteStrDoc(BaseDoc):
byte_str: str

@@ -41,6 +49,7 @@ class EmbedDoc(BaseDoc):
fetch_k: int = 20
lambda_mult: float = 0.5
score_threshold: float = 0.2
constraints: Optional[dict] = None


class Audio2TextDoc(AudioDoc):
@@ -67,6 +76,16 @@ class Config:
json_encoders = {np.ndarray: lambda x: x.tolist()}


class SearchedMultimodalDoc(BaseDoc):
retrieved_docs: List[TextImageDoc]
initial_query: str
top_n: int = 1
metadata: Optional[List[Dict]] = None

class Config:
json_encoders = {np.ndarray: lambda x: x.tolist()}


class GeneratedDoc(BaseDoc):
text: str
prompt: str
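As a reference for how the new types fit together, a minimal sketch follows. The `text`/`embedding` fields on `EmbedDoc` are assumed from their use elsewhere in this repo, and the embedding values, image path, and constraint syntax are purely illustrative:

```python
from comps import EmbedDoc, SearchedMultimodalDoc, TextDoc
from comps.cores.proto.docarray import ImageDoc, TextImageDoc

# A retrieval query; `constraints` is the new optional metadata filter.
query = EmbedDoc(
    text="red truck",
    embedding=[0.1, 0.2, 0.3],
    constraints={"source": ["==", "video_a"]},
)

# Each hit wraps either a text chunk or an image path in the new 1-tuple union.
text_hit = TextImageDoc(doc=(TextDoc(text="a red truck parked outside"),))
image_hit = TextImageDoc(doc=(ImageDoc(image_path="/data/frames/frame_0042.jpg"),))

# The multimodal search result bundles the hits with the originating query.
result = SearchedMultimodalDoc(
    retrieved_docs=[text_hit, image_hit],
    initial_query=query.text,
    top_n=2,
    metadata=[{"source": "video_a"}, {"source": "video_a"}],
)
```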
4 changes: 4 additions & 0 deletions comps/dataprep/README.md
@@ -36,3 +36,7 @@ For details, please refer to this [readme](pinecone/README.md)
## Dataprep Microservice with PGVector

For details, please refer to this [readme](pgvector/README.md)

## Dataprep Microservice with VDMS

For details, please refer to this [readme](vdms/README.md)
189 changes: 189 additions & 0 deletions comps/dataprep/vdms/vdms/README.md
@@ -0,0 +1,189 @@
# Dataprep Microservice with VDMS

For the dataprep microservice, we currently provide one framework: `Langchain`.

<!-- We also provide `Langchain_ray`, which uses Ray to parallelize data prep for better multi-file performance (observed 5x-15x speedup when processing 1000 files/links). -->

The folders are organized the same way for each framework, so you can use either one for the dataprep microservice by following the instructions below.

# 🚀1. Start Microservice with Python (Option 1)

## 1.1 Install Requirements

- Option 1: Install the single-process version (for processing 1-10 files)

```bash
apt-get update
apt-get install -y default-jre tesseract-ocr libtesseract-dev poppler-utils
cd langchain
pip install -r requirements.txt
```

<!-- - option 2: Install multi-process version (for >10 files processing)

```bash
cd langchain_ray; pip install -r requirements_ray.txt
``` -->

## 1.2 Start VDMS Server

Please refer to this [readme](../../vectorstores/langchain/vdms/README.md).

## 1.3 Setup Environment Variables

```bash
export http_proxy=${your_http_proxy}
export https_proxy=${your_https_proxy}
export VDMS_HOST=${host_ip}
export VDMS_PORT=55555
export COLLECTION_NAME=${your_collection_name}
export LANGCHAIN_TRACING_V2=true
export LANGCHAIN_PROJECT="opea/gen-ai-comps:dataprep"
export PYTHONPATH=${path_to_comps}
```

## 1.4 Start Document Preparation Microservice for VDMS with Python Script

Start the document preparation microservice for VDMS with the command below.

- Option 1: Start the single-process version (for processing 1-10 files)

```bash
python prepare_doc_vdms.py
```

<!-- - option 2: Start multi-process version (for >10 files processing)

```bash
python prepare_doc_redis_on_ray.py
``` -->

# 🚀2. Start Microservice with Docker (Option 2)

## 2.1 Start VDMS Server

Please refer to this [readme](../../vectorstores/langchain/vdms/README.md).

## 2.2 Setup Environment Variables

```bash
export http_proxy=${your_http_proxy}
export https_proxy=${your_https_proxy}
export VDMS_HOST=${host_ip}
export VDMS_PORT=55555
export TEI_ENDPOINT=${your_tei_endpoint}
export COLLECTION_NAME=${your_collection_name}
export SEARCH_ENGINE="FaissFlat"
export DISTANCE_STRATEGY="L2"
export PYTHONPATH=${path_to_comps}
```

## 2.3 Build Docker Image

- Build the Docker image with Langchain

- Option 1: Build the single-process version (for processing 1-10 files)

```bash
cd ../../../
docker build -t opea/dataprep-vdms:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/dataprep/vdms/langchain/docker/Dockerfile .
```

<!-- - option 2: Start multi-process version (for >10 files processing)

```bash
cd ../../../../
docker build -t opea/dataprep-on-ray-vdms:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/dataprep/vdms/langchain_ray/docker/Dockerfile . -->

## 2.4 Run Docker with CLI

- Option 1: Start the single-process version (for processing 1-10 files)

```bash
docker run -d --name="dataprep-vdms-server" -p 6007:6007 --runtime=runc --ipc=host \
-e http_proxy=$http_proxy -e https_proxy=$https_proxy -e TEI_ENDPOINT=$TEI_ENDPOINT \
-e COLLECTION_NAME=$COLLECTION_NAME -e VDMS_HOST=$VDMS_HOST -e VDMS_PORT=$VDMS_PORT \
opea/dataprep-vdms:latest
```

<!-- - option 2: Start multi-process version (for >10 files processing)

```bash
docker run -d --name="dataprep-vdms-server" -p 6007:6007 --runtime=runc --ipc=host \
-e http_proxy=$http_proxy -e https_proxy=$https_proxy \
-e COLLECTION_NAME=$COLLECTION_NAME -e VDMS_HOST=$VDMS_HOST -e VDMS_PORT=$VDMS_PORT \
-e TIMEOUT_SECONDS=600 opea/dataprep-on-ray-vdms:latest
``` -->

# 🚀3. Check Microservice Status

```bash
docker container logs -f dataprep-vdms-server
```

# 🚀4. Consume Microservice

Once the document preparation microservice for VDMS is started, you can use the command below to invoke the microservice. It converts a document to embeddings and saves them to the database.

Make sure the file path after `files=@` is correct.
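The fuller upload examples below are currently commented out in this README. As a quick reference, here is a minimal sketch using Python `requests`, assuming the service listens on port 6007 (as in section 2.4) and that `./file1.txt` exists:

```python
import requests

# Minimal single-file upload to the dataprep endpoint.
url = "http://localhost:6007/v1/dataprep"
with open("./file1.txt", "rb") as f:
    resp = requests.post(url, files={"files": ("file1.txt", f)})
resp.raise_for_status()
print(resp.text)
```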

<!-- - Single file upload

```bash
curl -X POST \
-H "Content-Type: multipart/form-data" \
-F "files=@./file1.txt" \
http://localhost:6007/v1/dataprep
```

You can specify chunk_size and chunk_overlap with the following command.

```bash
curl -X POST \
-H "Content-Type: multipart/form-data" \
-F "files=@/home/sdp/yuxiang/opea_intent/GenAIComps4/comps/table_extraction/LLAMA2_page6.pdf" \
-F "chunk_size=1500" \
-F "chunk_overlap=100" \
http://localhost:6007/v1/dataprep
```

- Multiple file upload

```bash
curl -X POST \
-H "Content-Type: multipart/form-data" \
-F "files=@./file1.txt" \
-F "files=@./file2.txt" \
-F "files=@./file3.txt" \
http://localhost:6007/v1/dataprep
```

- Links upload (not supported for llama_index now)

```bash
curl -X POST \
-F 'link_list=["https://www.ces.tech/"]' \
http://localhost:6007/v1/dataprep
```

or

```python
import requests
import json

proxies = {"http": ""}
url = "http://localhost:6007/v1/dataprep"
urls = [
"https://towardsdatascience.com/no-gpu-no-party-fine-tune-bert-for-sentiment-analysis-with-vertex-ai-custom-jobs-d8fc410e908b?source=rss----7f60cf5620c9---4"
]
payload = {"link_list": json.dumps(urls)}

try:
resp = requests.post(url=url, data=payload, proxies=proxies)
print(resp.text)
resp.raise_for_status() # Raise an exception for unsuccessful HTTP status codes
print("Request successful!")
except requests.exceptions.RequestException as e:
print("An error occurred:", e)
``` -->
2 changes: 2 additions & 0 deletions comps/dataprep/vdms/vdms/langchain/__init__.py
@@ -0,0 +1,2 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
33 changes: 33 additions & 0 deletions comps/dataprep/vdms/vdms/langchain/config.py
@@ -0,0 +1,33 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

import os


def getEnv(key, default_value=None):
    # Read an environment variable, log the resolved value, and fall back to the default.
    env_value = os.getenv(key, default=default_value)
    print(f"{key}: {env_value}")
    return env_value


# Embedding model
EMBED_MODEL = getEnv("EMBED_MODEL", "BAAI/bge-base-en-v1.5")

# VDMS configuration
VDMS_HOST = getEnv("VDMS_HOST", "localhost")
VDMS_PORT = int(getEnv("VDMS_PORT", 55555))
COLLECTION_NAME = getEnv("COLLECTION_NAME", "rag-vdms")
SEARCH_ENGINE = getEnv("SEARCH_ENGINE", "FaissFlat")
DISTANCE_STRATEGY = getEnv("DISTANCE_STRATEGY", "L2")

# LLM/Embedding endpoints
TGI_LLM_ENDPOINT = getEnv("TGI_LLM_ENDPOINT", "http://localhost:8080")
TGI_LLM_ENDPOINT_NO_RAG = getEnv("TGI_LLM_ENDPOINT_NO_RAG", "http://localhost:8081")
TEI_EMBEDDING_ENDPOINT = getEnv("TEI_ENDPOINT")

# chunk parameters (cast to int so env-provided strings match the integer defaults)
CHUNK_SIZE = int(getEnv("CHUNK_SIZE", 1500))
CHUNK_OVERLAP = int(getEnv("CHUNK_OVERLAP", 100))

current_file_path = os.path.abspath(__file__)
parent_dir = os.path.dirname(current_file_path)
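For context, here is a hedged sketch of how these values would typically be wired into the LangChain VDMS vector store; the `VDMS`/`VDMS_Client` names follow `langchain_community` and should be verified against your installed version:

```python
from langchain_community.embeddings import HuggingFaceBgeEmbeddings
from langchain_community.vectorstores.vdms import VDMS, VDMS_Client

from config import (
    COLLECTION_NAME,
    DISTANCE_STRATEGY,
    EMBED_MODEL,
    SEARCH_ENGINE,
    VDMS_HOST,
    VDMS_PORT,
)

# Connect to the VDMS server configured above and build a vector store on it.
client = VDMS_Client(host=VDMS_HOST, port=VDMS_PORT)
store = VDMS(
    client=client,
    embedding=HuggingFaceBgeEmbeddings(model_name=EMBED_MODEL),
    collection_name=COLLECTION_NAME,
    engine=SEARCH_ENGINE,
    distance_strategy=DISTANCE_STRATEGY,
)
```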
41 changes: 41 additions & 0 deletions comps/dataprep/vdms/vdms/langchain/docker/Dockerfile
@@ -0,0 +1,41 @@

# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

FROM python:3.11-slim

ENV LANG=C.UTF-8

ARG ARCH="cpu"

RUN apt-get update -y && apt-get install -y --no-install-recommends --fix-missing \
build-essential \
libcairo2-dev \
libgl1-mesa-glx \
libjemalloc-dev \
vim

RUN useradd -m -s /bin/bash user && \
mkdir -p /home/user && \
chown -R user /home/user/

USER user

COPY comps /home/user/comps

RUN pip install --no-cache-dir --upgrade pip setuptools && \
if [ ${ARCH} = "cpu" ]; then pip install torch torchvision --index-url https://download.pytorch.org/whl/cpu; fi && \
pip install --no-cache-dir -r /home/user/comps/dataprep/vdms/langchain/requirements.txt

ENV PYTHONPATH=/home/user

USER root

RUN mkdir -p /home/user/comps/dataprep/vdms/langchain/uploaded_files && chown -R user /home/user/comps/dataprep/vdms/langchain/uploaded_files

USER user

WORKDIR /home/user/comps/dataprep/vdms/langchain

ENTRYPOINT ["python", "prepare_doc_vdms.py"]

@@ -0,0 +1,28 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

version: "3"
services:
vdms-vector-db:
image: intellabs/vdms:latest
container_name: vdms-vector-db
ports:
- "55555:55555"
dataprep-vdms:
image: opea/dataprep-vdms:latest
container_name: dataprep-vdms-server
ports:
- "6000:6000"
ipc: host
environment:
no_proxy: ${no_proxy}
http_proxy: ${http_proxy}
https_proxy: ${https_proxy}
VDMS_HOST: ${VDMS_HOST}
VDMS_PORT: ${VDMS_PORT}
COLLECTION_NAME: ${COLLECTION_NAME}
restart: unless-stopped

networks:
default:
driver: bridge