From c214405aba4626162e91a3018aa64b058fa9b26e Mon Sep 17 00:00:00 2001 From: arijit-intel Date: Mon, 12 Aug 2024 14:16:32 -0700 Subject: [PATCH 01/11] adding VDMS retriever and test --- comps/retrievers/langchain/vdms/README.md | 152 ++++++++++++++++++ comps/retrievers/langchain/vdms/__init__.py | 2 + .../langchain/vdms/docker/Dockerfile | 47 ++++++ .../vdms/docker/docker_compose_retriever.yaml | 32 ++++ comps/retrievers/langchain/vdms/ingest.py | 133 +++++++++++++++ .../langchain/vdms/requirements.txt | 16 ++ .../langchain/vdms/retriever_vdms.py | 117 ++++++++++++++ comps/retrievers/langchain/vdms/run.sh | 9 ++ .../retrievers/langchain/vdms/vdms_config.py | 81 ++++++++++ comps/vectorstores/langchain/vdms/README.md | 13 ++ comps/vectorstores/langchain/vdms/__init__.py | 2 + .../langchain/vdms/docker-compose-vdms.yml | 10 ++ tests/test_retrievers_langchain_vdms.sh | 119 ++++++++++++++ 13 files changed, 733 insertions(+) create mode 100644 comps/retrievers/langchain/vdms/README.md create mode 100644 comps/retrievers/langchain/vdms/__init__.py create mode 100644 comps/retrievers/langchain/vdms/docker/Dockerfile create mode 100644 comps/retrievers/langchain/vdms/docker/docker_compose_retriever.yaml create mode 100644 comps/retrievers/langchain/vdms/ingest.py create mode 100644 comps/retrievers/langchain/vdms/requirements.txt create mode 100644 comps/retrievers/langchain/vdms/retriever_vdms.py create mode 100644 comps/retrievers/langchain/vdms/run.sh create mode 100644 comps/retrievers/langchain/vdms/vdms_config.py create mode 100644 comps/vectorstores/langchain/vdms/README.md create mode 100644 comps/vectorstores/langchain/vdms/__init__.py create mode 100644 comps/vectorstores/langchain/vdms/docker-compose-vdms.yml create mode 100755 tests/test_retrievers_langchain_vdms.sh diff --git a/comps/retrievers/langchain/vdms/README.md b/comps/retrievers/langchain/vdms/README.md new file mode 100644 index 000000000..446aef279 --- /dev/null +++ b/comps/retrievers/langchain/vdms/README.md @@ -0,0 +1,152 @@ +# Retriever Microservice + +This retriever microservice is a highly efficient search service designed for handling and retrieving embedding vectors. It operates by receiving an embedding vector as input and conducting a similarity search against vectors stored in a VectorDB database. Users must specify the VectorDB's host, port, and the index/collection name, and the service searches within that index to find documents with the highest similarity to the input vector. + +The service primarily utilizes similarity measures in vector space to rapidly retrieve contextually similar documents. The vector-based retrieval approach is particularly suited for handling large datasets, offering fast and accurate search results that significantly enhance the efficiency and quality of information retrieval. + +Overall, this microservice provides robust backend support for applications requiring efficient similarity searches, playing a vital role in scenarios such as recommendation systems, information retrieval, or any other context where precise measurement of document similarity is crucial. + +# 🚀1. Start Microservice with Python (Option 1) + +To start the retriever microservice, you must first install the required Python packages.
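As an aside before the setup steps, the similarity ranking described above can be illustrated with a tiny, self-contained sketch. This is not part of the service code: NumPy and the 3-dimensional toy corpus are assumptions chosen for readability, while the real service operates on 768-dimensional BGE embeddings stored in VDMS.

```python
import numpy as np

# Toy corpus of four document embeddings (dimension 3 for readability;
# the real service uses 768-dimensional BGE embeddings).
doc_vectors = np.array(
    [
        [0.1, 0.9, 0.0],
        [0.8, 0.1, 0.1],
        [0.2, 0.8, 0.1],
        [0.9, 0.0, 0.2],
    ]
)

query = np.array([0.15, 0.85, 0.05])

# L2 (Euclidean) distance between the query and every stored vector,
# mirroring the DISTANCE_STRATEGY="L2" default used by this retriever.
distances = np.linalg.norm(doc_vectors - query, axis=1)

# The k nearest documents are simply the k smallest distances.
k = 2
top_k = np.argsort(distances)[:k]
print("top-k indices:", top_k, "distances:", distances[top_k])
```

The setup steps follow.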
+ +## 1.1 Install Requirements + +```bash +pip install -r requirements.txt +``` + +## 1.2 Start TEI Service + +```bash +export LANGCHAIN_TRACING_V2=true +export LANGCHAIN_API_KEY=${your_langchain_api_key} +export LANGCHAIN_PROJECT="opea/retriever" +model=BAAI/bge-base-en-v1.5 +revision=refs/pr/4 +volume=$PWD/data +docker run -d -p 6060:80 -v $volume:/data -e http_proxy=$http_proxy -e https_proxy=$https_proxy --pull always ghcr.io/huggingface/text-embeddings-inference:cpu-1.2 --model-id $model --revision $revision +``` + +## 1.3 Verify the TEI Service + +```bash +curl 127.0.0.1:6060/embed \ -X POST \ -d '{"inputs":"What is Deep Learning?"}' \ -H 'Content-Type: application/json' +``` + +## 1.4 Setup VectorDB Service + +You need to set up your own VectorDB service (VDMS in this example) and ingest your knowledge documents into the vector database. + +For VDMS, you can start a docker container using the following command. +Remember to ingest data into it manually. + +```bash +docker run -d --name="vdms-vector-db" -p 55555:55555 intellabs/vdms:latest +``` + +## 1.5 Start Retriever Service + +```bash +export TEI_EMBEDDING_ENDPOINT="http://${your_ip}:6060" +python retriever_vdms.py +``` + +# 🚀2. Start Microservice with Docker (Option 2) + +## 2.1 Setup Environment Variables + +```bash +export RETRIEVE_MODEL_ID="BAAI/bge-base-en-v1.5" +export COLLECTION_NAME=${your_index_name} +export TEI_EMBEDDING_ENDPOINT="http://${your_ip}:6060" +export LANGCHAIN_TRACING_V2=true +export LANGCHAIN_API_KEY=${your_langchain_api_key} +export LANGCHAIN_PROJECT="opea/retrievers" +``` + +## 2.2 Build Docker Image + +```bash +cd ../../ +docker build -t opea/retriever-vdms:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/retrievers/langchain/vdms/docker/Dockerfile . +``` + +To start a docker container, you have two options: + +- A. Run Docker with CLI +- B. Run Docker with Docker Compose + +You can choose one as needed. + +## 2.3 Run Docker with CLI (Option A) + +```bash +docker run -d --name="retriever-vdms-server" -p 7000:7000 --ipc=host -e http_proxy=$http_proxy -e https_proxy=$https_proxy -e COLLECTION_NAME=$COLLECTION_NAME -e TEI_EMBEDDING_ENDPOINT=$TEI_EMBEDDING_ENDPOINT opea/retriever-vdms:latest +``` + +## 2.4 Run Docker with Docker Compose (Option B) + +```bash +cd langchain/vdms/docker +docker compose -f docker_compose_retriever.yaml up -d +``` + +# 🚀3. Consume Retriever Service + +## 3.1 Check Service Status + +```bash +curl http://localhost:7000/v1/health_check \ -X GET \ -H 'Content-Type: application/json' +``` + +## 3.2 Consume Retriever Service + +To consume the Retriever Microservice, you can generate a mock embedding vector of length 768 with Python. + +```bash +your_embedding=$(python -c "import random; embedding = [random.uniform(-1, 1) for _ in range(768)]; print(embedding)") +curl http://${your_ip}:7000/v1/retrieval \ -X POST \ -d "{\"text\":\"What is the revenue of Nike in 2023?\",\"embedding\":${your_embedding}}" \ -H 'Content-Type: application/json' +``` + +You can also set the retriever's search parameters (`search_type`, `k`, and the related thresholds), as shown in the curl examples below.
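For reference, the same requests can also be issued from Python rather than curl. The following is a minimal sketch, not part of the microservice code: it assumes the `requests` package is installed and that the retriever is reachable at `localhost:7000` as configured above, and its payload fields mirror the curl examples below.

```python
import random

import requests

# Mock embedding of length 768, matching the BGE base embedding dimension.
embedding = [random.uniform(-1, 1) for _ in range(768)]

payload = {
    "text": "What is the revenue of Nike in 2023?",
    "embedding": embedding,
    "search_type": "similarity",  # also: similarity_distance_threshold, similarity_score_threshold, mmr
    "k": 4,
}

response = requests.post(
    "http://localhost:7000/v1/retrieval",
    json=payload,
    headers={"Content-Type": "application/json"},
    timeout=30,
)
response.raise_for_status()
print(response.json()["retrieved_docs"])
```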
+ +```bash +your_embedding=$(python -c "import random; embedding = [random.uniform(-1, 1) for _ in range(768)]; print(embedding)") +curl http://localhost:7000/v1/retrieval \ + -X POST \ + -d "{\"text\":\"What is the revenue of Nike in 2023?\",\"embedding\":${your_embedding},\"search_type\":\"similarity\", \"k\":4}" \ + -H 'Content-Type: application/json' +``` + +```bash +your_embedding=$(python -c "import random; embedding = [random.uniform(-1, 1) for _ in range(768)]; print(embedding)") +curl http://localhost:7000/v1/retrieval \ + -X POST \ + -d "{\"text\":\"What is the revenue of Nike in 2023?\",\"embedding\":${your_embedding},\"search_type\":\"similarity_distance_threshold\", \"k\":4, \"distance_threshold\":1.0}" \ + -H 'Content-Type: application/json' +``` + +```bash +your_embedding=$(python -c "import random; embedding = [random.uniform(-1, 1) for _ in range(768)]; print(embedding)") +curl http://localhost:7000/v1/retrieval \ + -X POST \ + -d "{\"text\":\"What is the revenue of Nike in 2023?\",\"embedding\":${your_embedding},\"search_type\":\"similarity_score_threshold\", \"k\":4, \"score_threshold\":0.2}" \ + -H 'Content-Type: application/json' +``` + +```bash +your_embedding=$(python -c "import random; embedding = [random.uniform(-1, 1) for _ in range(768)]; print(embedding)") +curl http://localhost:7000/v1/retrieval \ + -X POST \ + -d "{\"text\":\"What is the revenue of Nike in 2023?\",\"embedding\":${your_embedding},\"search_type\":\"mmr\", \"k\":4, \"fetch_k\":20, \"lambda_mult\":0.5}" \ + -H 'Content-Type: application/json' +``` diff --git a/comps/retrievers/langchain/vdms/__init__.py b/comps/retrievers/langchain/vdms/__init__.py new file mode 100644 index 000000000..916f3a44b --- /dev/null +++ b/comps/retrievers/langchain/vdms/__init__.py @@ -0,0 +1,2 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 diff --git a/comps/retrievers/langchain/vdms/docker/Dockerfile b/comps/retrievers/langchain/vdms/docker/Dockerfile new file mode 100644 index 000000000..d1c4676f2 --- /dev/null +++ b/comps/retrievers/langchain/vdms/docker/Dockerfile @@ -0,0 +1,47 @@ + +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +FROM langchain/langchain:latest + +ARG ARCH="cpu" + +RUN apt-get update -y && apt-get install -y --no-install-recommends --fix-missing \ + libgl1-mesa-glx \ + libjemalloc-dev \ + iputils-ping \ + vim + +RUN useradd -m -s /bin/bash user && \ + mkdir -p /home/user && \ + chown -R user /home/user/ + +COPY comps /home/user/comps + +# RUN chmod +x /home/user/comps/retrievers/langchain/vdms/run.sh + +USER user +RUN pip install --no-cache-dir --upgrade pip && \ + if [ ${ARCH} = "cpu" ]; then pip install torch torchvision --index-url https://download.pytorch.org/whl/cpu; fi && \ + pip install --no-cache-dir -r /home/user/comps/retrievers/langchain/vdms/requirements.txt + +RUN pip install -U langchain +RUN pip install -U langchain-community + +RUN pip install --upgrade huggingface-hub + +ENV PYTHONPATH=$PYTHONPATH:/home/user + +ENV HUGGINGFACEHUB_API_TOKEN=dummy + +ENV no_proxy=localhost,127.0.0.1 + +ENV http_proxy="" +ENV https_proxy="" + +WORKDIR /home/user/comps/retrievers/langchain/vdms + +#ENTRYPOINT ["/home/user/comps/retrievers/langchain/vdms/run.sh"] +#ENTRYPOINT ["/bin/bash"] + +ENTRYPOINT ["python", "retriever_vdms.py"] diff --git a/comps/retrievers/langchain/vdms/docker/docker_compose_retriever.yaml b/comps/retrievers/langchain/vdms/docker/docker_compose_retriever.yaml new file mode 100644 index 000000000..1fa8ddb1e --- 
/dev/null +++ b/comps/retrievers/langchain/vdms/docker/docker_compose_retriever.yaml @@ -0,0 +1,32 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +version: "3.8" + +services: + tei_xeon_service: + image: ghcr.io/huggingface/text-embeddings-inference:cpu-1.2 + container_name: tei-xeon-server + ports: + - "6060:80" + volumes: + - "./data:/data" + shm_size: 1g + command: --model-id ${RETRIEVE_MODEL_ID} + retriever: + image: opea/retriever-vdms:latest + container_name: retriever-vdms-server + ports: + - "7000:7000" + ipc: host + environment: + no_proxy: ${no_proxy} + http_proxy: ${http_proxy} + https_proxy: ${https_proxy} + COLLECTION_NAME: ${COLLECTION_NAME} + LANGCHAIN_API_KEY: ${LANGCHAIN_API_KEY} + restart: unless-stopped + +networks: + default: + driver: bridge diff --git a/comps/retrievers/langchain/vdms/ingest.py b/comps/retrievers/langchain/vdms/ingest.py new file mode 100644 index 000000000..fb0b39fd7 --- /dev/null +++ b/comps/retrievers/langchain/vdms/ingest.py @@ -0,0 +1,133 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +# + +import io +import os +import time + +import numpy as np +from langchain.text_splitter import RecursiveCharacterTextSplitter +from langchain_community.embeddings import HuggingFaceBgeEmbeddings, HuggingFaceEmbeddings, HuggingFaceHubEmbeddings +from langchain_community.vectorstores import VDMS +from langchain_community.vectorstores.vdms import VDMS_Client +from PIL import Image +from vdms_config import ( + COLLECTION_NAME, + DISTANCE_STRATEGY, + EMBED_MODEL, + SEARCH_ENGINE, + VDMS_HOST, + VDMS_PORT, +) #, INDEX_SCHEMA, VDMS_URL + +tei_embedding_endpoint = os.getenv("TEI_EMBEDDING_ENDPOINT") +client = VDMS_Client(VDMS_HOST, VDMS_PORT) + +timeout = 180 # Timeout in seconds +check_interval = 5 # Check every 5 seconds + + +def check_embedding_endpoint(embedder): + try: + test_sentence = "embedder available test." + embedder.embed_documents([test_sentence]) + return True + except Exception as e: + print(f"Error embedder is unavailable: {e}") + return False + + +def pdf_loader(file_path): + try: + import easyocr + import fitz + except ImportError: + raise ImportError( + "`PyMuPDF` or 'easyocr' package is not found, please install it with " + "`pip install pymupdf or pip install easyocr.`" + ) + + doc = fitz.open(file_path) + reader = easyocr.Reader(["en"]) + result = "" + for i in range(doc.page_count): + page = doc.load_page(i) + pagetext = page.get_text().strip() + if pagetext: + result = result + pagetext + if len(doc.get_page_images(i)) > 0: + for img in doc.get_page_images(i): + if img: + pageimg = "" + xref = img[0] + img_data = doc.extract_image(xref) + img_bytes = img_data["image"] + pil_image = Image.open(io.BytesIO(img_bytes)) + img = np.array(pil_image) + img_result = reader.readtext(img, paragraph=True, detail=0) + pageimg = pageimg + ", ".join(img_result).strip() + if pageimg.endswith("!") or pageimg.endswith("?") or pageimg.endswith("."): + pass + else: + pageimg = pageimg + "." 
+ result = result + pageimg + return result + + +def ingest_documents(): + """Ingest PDF to VDMS from the data/ directory that + contains Edgar 10k filings data for Nike.""" + # Load list of pdfs + company_name = "Nike" + data_path = "data/" + doc_path = [os.path.join(data_path, file) for file in os.listdir(data_path)][0] + + print("Parsing 10k filing doc for NIKE", doc_path) + + text_splitter = RecursiveCharacterTextSplitter(chunk_size=1500, chunk_overlap=100, add_start_index=True) + content = pdf_loader(doc_path) + chunks = text_splitter.split_text(content) + + print("Done preprocessing. Created ", len(chunks), " chunks of the original pdf") + # Create vectorstore + if tei_embedding_endpoint: + # create embeddings using TEI endpoint service + embedder = HuggingFaceHubEmbeddings(model=tei_embedding_endpoint) + start_time = time.time() + while time.time() - start_time < timeout: + is_available = check_embedding_endpoint(embedder) + if is_available: + print("Embedder endpoint is available.") + break + else: + print("Embedder endpoint is not available. Checking again in 5 seconds...") + time.sleep(check_interval) + else: + # create embeddings using local embedding model + embedder = HuggingFaceBgeEmbeddings(model_name=EMBED_MODEL) + + # Batch size + batch_size = 32 + num_chunks = len(chunks) + for i in range(0, num_chunks, batch_size): + batch_chunks = chunks[i : i + batch_size] + batch_texts = [f"Company: {company_name}. " + chunk for chunk in batch_chunks] + + _ = VDMS.from_texts( + client=client, + embedding=embedder, + collection_name=COLLECTION_NAME, + distance_strategy=DISTANCE_STRATEGY, + engine=SEARCH_ENGINE, + batch_size=batch_size, + texts=batch_texts, + ) + print(f"Processed batch {i//batch_size + 1}/{(num_chunks-1)//batch_size + 1}") + + +if __name__ == "__main__": + ingest_documents() diff --git a/comps/retrievers/langchain/vdms/requirements.txt b/comps/retrievers/langchain/vdms/requirements.txt new file mode 100644 index 000000000..c15077a90 --- /dev/null +++ b/comps/retrievers/langchain/vdms/requirements.txt @@ -0,0 +1,16 @@ +docarray[full] +easyocr +fastapi +langchain-core +langchain-community +langchain-huggingface +langsmith +uvicorn +opentelemetry-api +opentelemetry-exporter-otlp +opentelemetry-sdk +prometheus-fastapi-instrumentator +pymupdf +vdms +sentence_transformers +shortuuid diff --git a/comps/retrievers/langchain/vdms/retriever_vdms.py b/comps/retrievers/langchain/vdms/retriever_vdms.py new file mode 100644 index 000000000..b666f740c --- /dev/null +++ b/comps/retrievers/langchain/vdms/retriever_vdms.py @@ -0,0 +1,117 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +import os +import time + +from langchain_community.embeddings import HuggingFaceBgeEmbeddings, HuggingFaceHubEmbeddings +from langchain_community.vectorstores.vdms import VDMS, VDMS_Client +from langsmith import traceable +from vdms_config import ( + COLLECTION_NAME, + DISTANCE_STRATEGY, + EMBED_MODEL, + SEARCH_ENGINE, + VDMS_HOST, + VDMS_PORT, +) #, HUGGINGFACEHUB_API_TOKEN, INDEX_SCHEMA, VDMS_URL + +from comps import ( + EmbedDoc, + SearchedDoc, + ServiceType, + TextDoc, + opea_microservices, + register_microservice, + register_statistics, + statistics_dict, +) + +from langchain_huggingface.embeddings import HuggingFaceEndpointEmbeddings + +tei_embedding_endpoint = os.getenv("TEI_EMBEDDING_ENDPOINT") +hf_token=os.getenv("HUGGINGFACEHUB_API_TOKEN") + +#Debugging +all_variables = dir() + +for name in all_variables: + + # Print the item if it doesn't start with '__' + if 
not name.startswith('__'): + myvalue = eval(name) + print(name, "is", type(myvalue), "and = ", myvalue) + +#client = VDMS_Client(VDMS_HOST, VDMS_PORT) + + +#VDMS_HOST="172.17.0.2" +#VDMS_HOST="10.54.80.228" +#print("Host =", VDMS_HOST) +#end debugging + +client = VDMS_Client(VDMS_HOST, VDMS_PORT) + +@register_microservice( + name="opea_service@retriever_vdms", + service_type=ServiceType.RETRIEVER, + endpoint="/v1/retrieval", + host="0.0.0.0", + port=7000, +) +@traceable(run_type="retriever") +@register_statistics(names=["opea_service@retriever_vdms"]) +def retrieve(input: EmbedDoc) -> SearchedDoc: + start = time.time() + if input.search_type == "similarity": + search_res = vector_db.similarity_search_by_vector(embedding=input.embedding, k=input.k) + elif input.search_type == "similarity_distance_threshold": + if input.distance_threshold is None: + raise ValueError("distance_threshold must be provided for " + "similarity_distance_threshold retriever") + search_res = vector_db.similarity_search_by_vector( + embedding=input.embedding, k=input.k, distance_threshold=input.distance_threshold + ) + elif input.search_type == "similarity_score_threshold": + docs_and_similarities = vector_db.similarity_search_with_relevance_scores( + query=input.text, k=input.k, score_threshold=input.score_threshold + ) + search_res = [doc for doc, _ in docs_and_similarities] + elif input.search_type == "mmr": + search_res = vector_db.max_marginal_relevance_search( + query=input.text, k=input.k, fetch_k=input.fetch_k, lambda_mult=input.lambda_mult + ) + searched_docs = [] + for r in search_res: + searched_docs.append(TextDoc(text=r.page_content)) + result = SearchedDoc(retrieved_docs=searched_docs, initial_query=input.text) + statistics_dict["opea_service@retriever_vdms"].append_latency(time.time() - start, None) + return result + + +if __name__ == "__main__": + # Create vectorstore + if tei_embedding_endpoint: + # create embeddings using TEI endpoint service + #print(f"TEI_EMBEDDING_ENDPOINT:{tei_embedding_endpoint}") + #embeddings = HuggingFaceHubEmbeddings(model=tei_embedding_endpoint,huggingfacehub_api_token=hf_token) + #embeddings = HuggingFaceHubEmbeddings(model=tei_embedding_endpoint) + embeddings = HuggingFaceEndpointEmbeddings(model=tei_embedding_endpoint, huggingfacehub_api_token=hf_token) + #embeddings = HuggingFaceEndpointEmbeddings(model=tei_embedding_endpoint) + else: + # create embeddings using local embedding model + embeddings = HuggingFaceBgeEmbeddings(model_name=EMBED_MODEL) + + +#debug + #embeddings = HuggingFaceBgeEmbeddings(model_name=EMBED_MODEL) +#end debug + + vector_db = VDMS( + client=client, + embedding=embeddings, + collection_name=COLLECTION_NAME, + embedding_dimensions=128, + distance_strategy=DISTANCE_STRATEGY, + engine=SEARCH_ENGINE, + ) + opea_microservices["opea_service@retriever_vdms"].start() diff --git a/comps/retrievers/langchain/vdms/run.sh b/comps/retrievers/langchain/vdms/run.sh new file mode 100644 index 000000000..6ce1dd65e --- /dev/null +++ b/comps/retrievers/langchain/vdms/run.sh @@ -0,0 +1,9 @@ +#!/bin/sh + +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +cd /home/user/comps/retrievers/langchain/vdms +python ingest.py + +python retriever_vdms.py diff --git a/comps/retrievers/langchain/vdms/vdms_config.py b/comps/retrievers/langchain/vdms/vdms_config.py new file mode 100644 index 000000000..7d4a6c0c1 --- /dev/null +++ b/comps/retrievers/langchain/vdms/vdms_config.py @@ -0,0 +1,81 @@ +# Copyright (C) 2024 Intel Corporation +# 
SPDX-License-Identifier: Apache-2.0 + +import os + + +def get_boolean_env_var(var_name, default_value=False): + """Retrieve the boolean value of an environment variable. + + Args: + var_name (str): The name of the environment variable to retrieve. + default_value (bool): The default value to return if the variable + is not found. + + Returns: + bool: The value of the environment variable, interpreted as a boolean. + """ + true_values = {"true", "1", "t", "y", "yes"} + false_values = {"false", "0", "f", "n", "no"} + + # Retrieve the environment variable's value + value = os.getenv(var_name, "").lower() + + # Decide the boolean value based on the content of the string + if value in true_values: + return True + elif value in false_values: + return False + else: + return default_value + + +# Whether or not to enable langchain debugging +DEBUG = get_boolean_env_var("DEBUG", False) +# Set DEBUG env var to "true" if you wish to enable LC debugging module +if DEBUG: + import langchain + + langchain.debug = True + + +# Embedding model +EMBED_MODEL = os.getenv("EMBED_MODEL", "BAAI/bge-base-en-v1.5") + + +# VDMS Connection Information +VDMS_HOST = os.getenv("VDMS_HOST", "localhost") +VDMS_PORT = int(os.getenv("VDMS_PORT", 55555)) + + +# def format_vdms_conn_from_env(): +# vdms_url = os.getenv("VDMS_URL", None) +# if vdms_url: +# return vdms_url +# else: +# using_ssl = get_boolean_env_var("VDMS_SSL", False) +# start = "vdmss://" if using_ssl else "vdms://" + +# # if using RBAC +# password = os.getenv("VDMS_PASSWORD", None) +# username = os.getenv("VDMS_USERNAME", "default") +# if password is not None: +# start += f"{username}:{password}@" + +# return start + f"{VDMS_HOST}:{VDMS_PORT}" + + +# VDMS_URL = format_vdms_conn_from_env() + +# Vector Index Configuration +COLLECTION_NAME = os.getenv("COLLECTION_NAME", "rag-vdms") +#HUGGINGFACEHUB_API_TOKEN ="dummy-token" + + + +# current_file_path = os.path.abspath(__file__) +# parent_dir = os.path.dirname(current_file_path) +# VDMS_SCHEMA = os.getenv("VDMS_SCHEMA", "vdms_schema.yml") +# INDEX_SCHEMA = os.path.join(parent_dir, VDMS_SCHEMA) +SEARCH_ENGINE="FaissFlat" +DISTANCE_STRATEGY="L2" diff --git a/comps/vectorstores/langchain/vdms/README.md b/comps/vectorstores/langchain/vdms/README.md new file mode 100644 index 000000000..959bbfd64 --- /dev/null +++ b/comps/vectorstores/langchain/vdms/README.md @@ -0,0 +1,13 @@ +# Start VDMS server + +## 1. Download VDMS image + +```bash +docker pull intellabs/vdms:latest +``` + +## 2. 
Run VDMS service + +```bash +docker run -d -p 55555:55555 intellabs/vdms:latest +``` diff --git a/comps/vectorstores/langchain/vdms/__init__.py b/comps/vectorstores/langchain/vdms/__init__.py new file mode 100644 index 000000000..916f3a44b --- /dev/null +++ b/comps/vectorstores/langchain/vdms/__init__.py @@ -0,0 +1,2 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 diff --git a/comps/vectorstores/langchain/vdms/docker-compose-vdms.yml b/comps/vectorstores/langchain/vdms/docker-compose-vdms.yml new file mode 100644 index 000000000..8e13a0f1b --- /dev/null +++ b/comps/vectorstores/langchain/vdms/docker-compose-vdms.yml @@ -0,0 +1,10 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +version: "3" +services: + vdms-vector-db: + image: intellabs/vdms:latest + container_name: vdms-vector-db + ports: + - "55555:55555" diff --git a/tests/test_retrievers_langchain_vdms.sh b/tests/test_retrievers_langchain_vdms.sh new file mode 100755 index 000000000..1d17e76a9 --- /dev/null +++ b/tests/test_retrievers_langchain_vdms.sh @@ -0,0 +1,119 @@ +#!/bin/bash +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +set -xe + +WORKPATH=$(dirname "$PWD") +LOG_PATH="$WORKPATH/tests" +ip_address=$(hostname -I | awk '{print $1}') +no_proxy=$no_proxy,$ip_address + +function build_docker_images() { + cd $WORKPATH + hf_token="dummy" + docker build --no-cache -t opea/retriever-vdms:comps \ + --build-arg https_proxy=$https_proxy \ + --build-arg http_proxy=$http_proxy \ + --build-arg huggingfacehub_api_token=$hf_token\ + -f comps/retrievers/langchain/vdms/docker/Dockerfile . + +} + +function start_service() { + #unset http_proxy + # vdms + vdms_port=55555 + docker run -d --name test-comps-retriever-vdms-vector-db \ + -p $vdms_port:$vdms_port intellabs/vdms:latest + sleep 10s + + # tei endpoint + tei_endpoint=5008 + model="BAAI/bge-base-en-v1.5" + docker run -d --name="test-comps-retriever-tei-endpoint" \ + -p $tei_endpoint:80 -v ./data:/data \ + --pull always ghcr.io/huggingface/text-embeddings-inference:cpu-1.2 \ + --model-id $model + sleep 30s + + export TEI_EMBEDDING_ENDPOINT="http://${ip_address}:${tei_endpoint}" + + export INDEX_NAME="rag-vdms" + + # vdms retriever + unset http_proxy + docker run -d --name="test-comps-retriever-vdms-server" -p 7000:7000 --ipc=host \ + -e INDEX_NAME=$INDEX_NAME -e VDMS_HOST=$ip_address \ + -e https_proxy=$https_proxy -e http_proxy=$http_proxy \ + -e VDMS_PORT=$vdms_port -e HUGGINGFACEHUB_API_TOKEN=$HUGGINGFACEHUB_API_TOKEN -e TEI_EMBEDDING_ENDPOINT=$TEI_EMBEDDING_ENDPOINT \ + opea/retriever-vdms:comps + sleep 3m +} + +function validate_microservice() { + + + retriever_port=7000 + URL="http://${ip_address}:$retriever_port/v1/retrieval" + test_embedding=$(python -c "import random; embedding = [random.uniform(-1, 1) for _ in range(768)]; print(embedding)") + + + HTTP_STATUS=$(curl -s -o /dev/null -w "%{http_code}" -X POST -d "{\"text\":\"test\",\"embedding\":${test_embedding}}" -H 'Content-Type: application/json' "$URL") + + #echo "HTTP_STATUS = $HTTP_STATUS" + + if [ "$HTTP_STATUS" -eq 200 ]; then + echo "[ retriever ] HTTP status is 200. Checking content..." + local CONTENT=$(curl -s -X POST -d "{\"text\":\"test\",\"embedding\":${test_embedding}}" -H 'Content-Type: application/json' "$URL" | tee ${LOG_PATH}/retriever.log) + + if echo "$CONTENT" | grep -q "retrieved_docs"; then + echo "[ retriever ] Content is as expected." 
+ else + echo "[ retriever ] Content does not match the expected result: $CONTENT" + docker logs test-comps-retriever-vdms-server >> ${LOG_PATH}/retriever.log + exit 1 + fi + else + echo "[ retriever ] HTTP status is not 200. Received status was $HTTP_STATUS" + docker logs test-comps-retriever-vdms-server >> ${LOG_PATH}/retriever.log + exit 1 + fi + + docker logs test-comps-retriever-tei-endpoint >> ${LOG_PATH}/tei.log +} + +function stop_docker() { + cid_retrievers=$(docker ps -aq --filter "name=test-comps-retriever-tei-endpoint*") + if [[ ! -z "$cid_retrievers" ]]; then + docker stop $cid_retrievers && docker rm $cid_retrievers && sleep 1s + fi + + cid_vdms=$(docker ps -aq --filter "name=test-comps-retriever-vdms-server") + if [[ ! -z "$cid_vdms" ]]; then + docker stop $cid_vdms && docker rm $cid_vdms && sleep 1s + fi + + cid_vdmsdb=$(docker ps -aq --filter "name=test-comps-retriever-vdms-vector-db") + if [[ ! -z "$cid_vdmsdb" ]]; then + docker stop $cid_vdmsdb && docker rm $cid_vdmsdb && sleep 1s + fi + + +} + +function main() { + + stop_docker + + build_docker_images + start_service + + validate_microservice + + stop_docker + echo y | docker system prune + +} + +main From 0e3598b4188cc70fd013c40aed359acce3ad7403 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 12 Aug 2024 21:34:02 +0000 Subject: [PATCH 02/11] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- comps/retrievers/langchain/vdms/ingest.py | 4 +- .../langchain/vdms/requirements.txt | 6 +-- .../langchain/vdms/retriever_vdms.py | 51 +++++++++---------- .../retrievers/langchain/vdms/vdms_config.py | 7 ++- tests/test_retrievers_langchain_vdms.sh | 14 ++--- 5 files changed, 40 insertions(+), 42 deletions(-) diff --git a/comps/retrievers/langchain/vdms/ingest.py b/comps/retrievers/langchain/vdms/ingest.py index fb0b39fd7..92861e777 100644 --- a/comps/retrievers/langchain/vdms/ingest.py +++ b/comps/retrievers/langchain/vdms/ingest.py @@ -15,14 +15,14 @@ from langchain_community.vectorstores import VDMS from langchain_community.vectorstores.vdms import VDMS_Client from PIL import Image -from vdms_config import ( +from vdms_config import ( # , INDEX_SCHEMA, VDMS_URL COLLECTION_NAME, DISTANCE_STRATEGY, EMBED_MODEL, SEARCH_ENGINE, VDMS_HOST, VDMS_PORT, -) #, INDEX_SCHEMA, VDMS_URL +) tei_embedding_endpoint = os.getenv("TEI_EMBEDDING_ENDPOINT") client = VDMS_Client(VDMS_HOST, VDMS_PORT) diff --git a/comps/retrievers/langchain/vdms/requirements.txt b/comps/retrievers/langchain/vdms/requirements.txt index c15077a90..792cd1998 100644 --- a/comps/retrievers/langchain/vdms/requirements.txt +++ b/comps/retrievers/langchain/vdms/requirements.txt @@ -1,16 +1,16 @@ docarray[full] easyocr fastapi -langchain-core langchain-community +langchain-core langchain-huggingface langsmith -uvicorn opentelemetry-api opentelemetry-exporter-otlp opentelemetry-sdk prometheus-fastapi-instrumentator pymupdf -vdms sentence_transformers shortuuid +uvicorn +vdms diff --git a/comps/retrievers/langchain/vdms/retriever_vdms.py b/comps/retrievers/langchain/vdms/retriever_vdms.py index b666f740c..a07ec8c47 100644 --- a/comps/retrievers/langchain/vdms/retriever_vdms.py +++ b/comps/retrievers/langchain/vdms/retriever_vdms.py @@ -6,15 +6,16 @@ from langchain_community.embeddings import HuggingFaceBgeEmbeddings, HuggingFaceHubEmbeddings from langchain_community.vectorstores.vdms import VDMS, VDMS_Client +from langchain_huggingface.embeddings import 
HuggingFaceEndpointEmbeddings from langsmith import traceable -from vdms_config import ( +from vdms_config import ( # , HUGGINGFACEHUB_API_TOKEN, INDEX_SCHEMA, VDMS_URL COLLECTION_NAME, DISTANCE_STRATEGY, EMBED_MODEL, SEARCH_ENGINE, VDMS_HOST, VDMS_PORT, -) #, HUGGINGFACEHUB_API_TOKEN, INDEX_SCHEMA, VDMS_URL +) from comps import ( EmbedDoc, @@ -27,31 +28,30 @@ statistics_dict, ) -from langchain_huggingface.embeddings import HuggingFaceEndpointEmbeddings - tei_embedding_endpoint = os.getenv("TEI_EMBEDDING_ENDPOINT") -hf_token=os.getenv("HUGGINGFACEHUB_API_TOKEN") +hf_token = os.getenv("HUGGINGFACEHUB_API_TOKEN") -#Debugging -all_variables = dir() +# Debugging +all_variables = dir() -for name in all_variables: - - # Print the item if it doesn't start with '__' - if not name.startswith('__'): - myvalue = eval(name) - print(name, "is", type(myvalue), "and = ", myvalue) +for name in all_variables: -#client = VDMS_Client(VDMS_HOST, VDMS_PORT) + # Print the item if it doesn't start with '__' + if not name.startswith("__"): + myvalue = eval(name) + print(name, "is", type(myvalue), "and = ", myvalue) +# client = VDMS_Client(VDMS_HOST, VDMS_PORT) -#VDMS_HOST="172.17.0.2" -#VDMS_HOST="10.54.80.228" -#print("Host =", VDMS_HOST) -#end debugging + +# VDMS_HOST="172.17.0.2" +# VDMS_HOST="10.54.80.228" +# print("Host =", VDMS_HOST) +# end debugging client = VDMS_Client(VDMS_HOST, VDMS_PORT) + @register_microservice( name="opea_service@retriever_vdms", service_type=ServiceType.RETRIEVER, @@ -92,19 +92,18 @@ def retrieve(input: EmbedDoc) -> SearchedDoc: # Create vectorstore if tei_embedding_endpoint: # create embeddings using TEI endpoint service - #print(f"TEI_EMBEDDING_ENDPOINT:{tei_embedding_endpoint}") - #embeddings = HuggingFaceHubEmbeddings(model=tei_embedding_endpoint,huggingfacehub_api_token=hf_token) - #embeddings = HuggingFaceHubEmbeddings(model=tei_embedding_endpoint) + # print(f"TEI_EMBEDDING_ENDPOINT:{tei_embedding_endpoint}") + # embeddings = HuggingFaceHubEmbeddings(model=tei_embedding_endpoint,huggingfacehub_api_token=hf_token) + # embeddings = HuggingFaceHubEmbeddings(model=tei_embedding_endpoint) embeddings = HuggingFaceEndpointEmbeddings(model=tei_embedding_endpoint, huggingfacehub_api_token=hf_token) - #embeddings = HuggingFaceEndpointEmbeddings(model=tei_embedding_endpoint) + # embeddings = HuggingFaceEndpointEmbeddings(model=tei_embedding_endpoint) else: # create embeddings using local embedding model embeddings = HuggingFaceBgeEmbeddings(model_name=EMBED_MODEL) - -#debug - #embeddings = HuggingFaceBgeEmbeddings(model_name=EMBED_MODEL) -#end debug + # debug + # embeddings = HuggingFaceBgeEmbeddings(model_name=EMBED_MODEL) + # end debug vector_db = VDMS( client=client, diff --git a/comps/retrievers/langchain/vdms/vdms_config.py b/comps/retrievers/langchain/vdms/vdms_config.py index 7d4a6c0c1..caa1136e8 100644 --- a/comps/retrievers/langchain/vdms/vdms_config.py +++ b/comps/retrievers/langchain/vdms/vdms_config.py @@ -69,13 +69,12 @@ def get_boolean_env_var(var_name, default_value=False): # Vector Index Configuration COLLECTION_NAME = os.getenv("COLLECTION_NAME", "rag-vdms") -#HUGGINGFACEHUB_API_TOKEN ="dummy-token" - +# HUGGINGFACEHUB_API_TOKEN ="dummy-token" # current_file_path = os.path.abspath(__file__) # parent_dir = os.path.dirname(current_file_path) # VDMS_SCHEMA = os.getenv("VDMS_SCHEMA", "vdms_schema.yml") # INDEX_SCHEMA = os.path.join(parent_dir, VDMS_SCHEMA) -SEARCH_ENGINE="FaissFlat" -DISTANCE_STRATEGY="L2" +SEARCH_ENGINE = "FaissFlat" +DISTANCE_STRATEGY = "L2" diff --git 
a/tests/test_retrievers_langchain_vdms.sh b/tests/test_retrievers_langchain_vdms.sh index 1d17e76a9..83d772867 100755 --- a/tests/test_retrievers_langchain_vdms.sh +++ b/tests/test_retrievers_langchain_vdms.sh @@ -16,8 +16,8 @@ function build_docker_images() { --build-arg https_proxy=$https_proxy \ --build-arg http_proxy=$http_proxy \ --build-arg huggingfacehub_api_token=$hf_token\ - -f comps/retrievers/langchain/vdms/docker/Dockerfile . - + -f comps/retrievers/langchain/vdms/docker/Dockerfile . + } function start_service() { @@ -38,7 +38,7 @@ function start_service() { sleep 30s export TEI_EMBEDDING_ENDPOINT="http://${ip_address}:${tei_endpoint}" - + export INDEX_NAME="rag-vdms" # vdms retriever @@ -52,16 +52,16 @@ function start_service() { } function validate_microservice() { - + retriever_port=7000 URL="http://${ip_address}:$retriever_port/v1/retrieval" test_embedding=$(python -c "import random; embedding = [random.uniform(-1, 1) for _ in range(768)]; print(embedding)") - + HTTP_STATUS=$(curl -s -o /dev/null -w "%{http_code}" -X POST -d "{\"text\":\"test\",\"embedding\":${test_embedding}}" -H 'Content-Type: application/json' "$URL") - - #echo "HTTP_STATUS = $HTTP_STATUS" + + #echo "HTTP_STATUS = $HTTP_STATUS" if [ "$HTTP_STATUS" -eq 200 ]; then echo "[ retriever ] HTTP status is 200. Checking content..." From acf63f35a91bc111dee9d04dbaca625e61ab3de5 Mon Sep 17 00:00:00 2001 From: arijit-intel Date: Mon, 12 Aug 2024 21:17:23 -0700 Subject: [PATCH 03/11] add placeholder for constraints --- comps/retrievers/langchain/vdms/retriever_vdms.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/comps/retrievers/langchain/vdms/retriever_vdms.py b/comps/retrievers/langchain/vdms/retriever_vdms.py index b666f740c..16de60709 100644 --- a/comps/retrievers/langchain/vdms/retriever_vdms.py +++ b/comps/retrievers/langchain/vdms/retriever_vdms.py @@ -63,22 +63,25 @@ @register_statistics(names=["opea_service@retriever_vdms"]) def retrieve(input: EmbedDoc) -> SearchedDoc: start = time.time() + constraints = None + #place holder for adding constraints this has to be passed in the EmbedDoc input + #so retriever can filter on them, if this functionality is needed if input.search_type == "similarity": - search_res = vector_db.similarity_search_by_vector(embedding=input.embedding, k=input.k) + search_res = vector_db.similarity_search_by_vector(embedding=input.embedding, k=input.k, filter=constraints) elif input.search_type == "similarity_distance_threshold": if input.distance_threshold is None: raise ValueError("distance_threshold must be provided for " + "similarity_distance_threshold retriever") search_res = vector_db.similarity_search_by_vector( - embedding=input.embedding, k=input.k, distance_threshold=input.distance_threshold + embedding=input.embedding, k=input.k, distance_threshold=input.distance_threshold, filter=constraints ) elif input.search_type == "similarity_score_threshold": docs_and_similarities = vector_db.similarity_search_with_relevance_scores( - query=input.text, k=input.k, score_threshold=input.score_threshold + query=input.text, k=input.k, score_threshold=input.score_threshold, filter=constraints ) search_res = [doc for doc, _ in docs_and_similarities] elif input.search_type == "mmr": search_res = vector_db.max_marginal_relevance_search( - query=input.text, k=input.k, fetch_k=input.fetch_k, lambda_mult=input.lambda_mult + query=input.text, k=input.k, fetch_k=input.fetch_k, lambda_mult=input.lambda_mult, filter=constraints ) searched_docs = [] for r in 
search_res: @@ -110,7 +113,7 @@ def retrieve(input: EmbedDoc) -> SearchedDoc: client=client, embedding=embeddings, collection_name=COLLECTION_NAME, - embedding_dimensions=128, + embedding_dimensions=768, distance_strategy=DISTANCE_STRATEGY, engine=SEARCH_ENGINE, ) From 40ba411920e690fde559631d11e0659286326496 Mon Sep 17 00:00:00 2001 From: s-gobriel Date: Mon, 12 Aug 2024 21:44:51 -0700 Subject: [PATCH 04/11] add placehoder for constraints --- comps/retrievers/langchain/vdms/retriever_vdms.py | 1 + 1 file changed, 1 insertion(+) diff --git a/comps/retrievers/langchain/vdms/retriever_vdms.py b/comps/retrievers/langchain/vdms/retriever_vdms.py index db9db154b..ba9620eec 100644 --- a/comps/retrievers/langchain/vdms/retriever_vdms.py +++ b/comps/retrievers/langchain/vdms/retriever_vdms.py @@ -66,6 +66,7 @@ def retrieve(input: EmbedDoc) -> SearchedDoc: constraints = None #place holder for adding constraints this has to be passed in the EmbedDoc input #so retriever can filter on them, if this functionality is needed + if input.search_type == "similarity": search_res = vector_db.similarity_search_by_vector(embedding=input.embedding, k=input.k, filter=constraints) elif input.search_type == "similarity_distance_threshold": From ec2f1a31debefbc90cc50376e96f8c0a49d351bb Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 13 Aug 2024 04:48:38 +0000 Subject: [PATCH 05/11] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- comps/retrievers/langchain/vdms/retriever_vdms.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/comps/retrievers/langchain/vdms/retriever_vdms.py b/comps/retrievers/langchain/vdms/retriever_vdms.py index ba9620eec..6a98c3700 100644 --- a/comps/retrievers/langchain/vdms/retriever_vdms.py +++ b/comps/retrievers/langchain/vdms/retriever_vdms.py @@ -63,10 +63,10 @@ @register_statistics(names=["opea_service@retriever_vdms"]) def retrieve(input: EmbedDoc) -> SearchedDoc: start = time.time() - constraints = None - #place holder for adding constraints this has to be passed in the EmbedDoc input - #so retriever can filter on them, if this functionality is needed - + constraints = None + # place holder for adding constraints this has to be passed in the EmbedDoc input + # so retriever can filter on them, if this functionality is needed + if input.search_type == "similarity": search_res = vector_db.similarity_search_by_vector(embedding=input.embedding, k=input.k, filter=constraints) elif input.search_type == "similarity_distance_threshold": From 784df7ac998836aaaab42a179c7ab5973caa3c69 Mon Sep 17 00:00:00 2001 From: s-gobriel Date: Tue, 13 Aug 2024 07:54:08 -0700 Subject: [PATCH 06/11] add dataprep service based on VDMS --- comps/dataprep/README.md | 4 + comps/dataprep/vdms/vdms/README.md | 188 ++++++++++++++++++ .../dataprep/vdms/vdms/langchain/__init__.py | 2 + comps/dataprep/vdms/vdms/langchain/config.py | 31 +++ .../vdms/vdms/langchain/docker/Dockerfile | 41 ++++ .../docker/docker-compose-dataprep-vdms.yaml | 28 +++ .../vdms/vdms/langchain/prepare_doc_vdms.py | 87 ++++++++ .../vdms/vdms/langchain/requirements.txt | 30 +++ tests/test_dataprep_vdms.sh | 98 +++++++++ 9 files changed, 509 insertions(+) create mode 100644 comps/dataprep/vdms/vdms/README.md create mode 100644 comps/dataprep/vdms/vdms/langchain/__init__.py create mode 100644 comps/dataprep/vdms/vdms/langchain/config.py create mode 100644 
comps/dataprep/vdms/vdms/langchain/docker/Dockerfile create mode 100644 comps/dataprep/vdms/vdms/langchain/docker/docker-compose-dataprep-vdms.yaml create mode 100644 comps/dataprep/vdms/vdms/langchain/prepare_doc_vdms.py create mode 100644 comps/dataprep/vdms/vdms/langchain/requirements.txt create mode 100755 tests/test_dataprep_vdms.sh diff --git a/comps/dataprep/README.md b/comps/dataprep/README.md index 46e5e3c0f..82b38d4f4 100644 --- a/comps/dataprep/README.md +++ b/comps/dataprep/README.md @@ -36,3 +36,7 @@ For details, please refer to this [readme](pinecone/README.md) # Dataprep Microservice with PGVector For details, please refer to this [readme](pgvector/README.md) + +# Dataprep Microservice with VDMS + +For details, please refer to this [readme](vdms/README.md) diff --git a/comps/dataprep/vdms/vdms/README.md b/comps/dataprep/vdms/vdms/README.md new file mode 100644 index 000000000..cc4a2212c --- /dev/null +++ b/comps/dataprep/vdms/vdms/README.md @@ -0,0 +1,188 @@ +# Dataprep Microservice with VDMS + +For the dataprep microservice, we currently provide one framework: `Langchain`. + + +The folders are organized in the same way as the other dataprep microservices, so you can set up this one with the following constructions. + +# 🚀1. Start Microservice with Python (Option 1) + +## 1.1 Install Requirements + +- option 1: Install single-process version (for 1-10 files processing) + +```bash +apt-get update +apt-get install -y default-jre tesseract-ocr libtesseract-dev poppler-utils +cd langchain +pip install -r requirements.txt +``` + + + +## 1.2 Start VDMS Server + +Please refer to this [readme](../../vectorstores/langchain/vdms/README.md). + +## 1.3 Setup Environment Variables + +```bash +export http_proxy=${your_http_proxy} +export https_proxy=${your_http_proxy} +export VDMS_HOST=${host_ip} +export VDMS_PORT=55555 +export COLLECTION_NAME=${your_collection_name} +export LANGCHAIN_TRACING_V2=true +export LANGCHAIN_PROJECT="opea/gen-ai-comps:dataprep" +export PYTHONPATH=${path_to_comps} +``` + +## 1.4 Start Document Preparation Microservice for VDMS with Python Script + +Start the document preparation microservice for VDMS with the command below. + +- option 1: Start single-process version (for 1-10 files processing) + +```bash +python prepare_doc_vdms.py +``` + + + +# 🚀2. Start Microservice with Docker (Option 2) + +## 2.1 Start VDMS Server + +Please refer to this [readme](../../vectorstores/langchain/vdms/README.md). + +## 2.2 Setup Environment Variables + +```bash +export http_proxy=${your_http_proxy} +export https_proxy=${your_http_proxy} +export VDMS_HOST=${host_ip} +export VDMS_PORT=55555 +export TEI_ENDPOINT=${your_tei_endpoint} +export COLLECTION_NAME=${your_collection_name} +export SEARCH_ENGINE="FaissFlat" +export DISTANCE_STRATEGY="L2" +export PYTHONPATH=${path_to_comps} +``` + +## 2.3 Build Docker Image + +- Build the docker image with Langchain + +- option 1: Build single-process version (for 1-10 files processing) + +```bash +cd ../../../ +docker build -t opea/dataprep-vdms:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/dataprep/vdms/langchain/docker/Dockerfile .
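+# Note: the build context (the trailing ".") must be the repository root, because +# the Dockerfile copies the whole comps/ directory into the image.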
+``` + + + +## 2.4 Run Docker with CLI + +- option 1: Start single-process version (for 1-10 files processing) + +```bash +docker run -d --name="dataprep-vdms-server" -p 6007:6007 --runtime=runc --ipc=host \ +-e http_proxy=$http_proxy -e https_proxy=$https_proxy -e TEI_ENDPOINT=$TEI_ENDPOINT \ +-e COLLECTION_NAME=$COLLECTION_NAME -e VDMS_HOST=$VDMS_HOST -e VDMS_PORT=$VDMS_PORT \ +opea/dataprep-vdms:latest +``` + + + +# 🚀3. Check Microservice Status + +```bash +docker container logs -f dataprep-vdms-server +``` + +# 🚀4. Consume Microservice + +Once the document preparation microservice for VDMS is started, you can use the command below to invoke the microservice, which converts a document into embeddings and saves them to the vector database. + +Make sure the file path after `files=@` is correct. + + diff --git a/comps/dataprep/vdms/vdms/langchain/__init__.py b/comps/dataprep/vdms/vdms/langchain/__init__.py new file mode 100644 index 000000000..916f3a44b --- /dev/null +++ b/comps/dataprep/vdms/vdms/langchain/__init__.py @@ -0,0 +1,2 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 diff --git a/comps/dataprep/vdms/vdms/langchain/config.py b/comps/dataprep/vdms/vdms/langchain/config.py new file mode 100644 index 000000000..13651fb1e --- /dev/null +++ b/comps/dataprep/vdms/vdms/langchain/config.py @@ -0,0 +1,31 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +import os + +def getEnv(key, default_value=None): + env_value = os.getenv(key, default=default_value) + print(f"{key}: {env_value}") + return env_value + +# Embedding model +EMBED_MODEL = getEnv("EMBED_MODEL", "BAAI/bge-base-en-v1.5") + +# VDMS configuration +VDMS_HOST = getEnv("VDMS_HOST", "localhost") +VDMS_PORT = int(getEnv("VDMS_PORT", 55555)) +COLLECTION_NAME = getEnv("COLLECTION_NAME", "rag-vdms") +SEARCH_ENGINE = getEnv("SEARCH_ENGINE", "FaissFlat") +DISTANCE_STRATEGY = getEnv("DISTANCE_STRATEGY", "L2") + +# LLM/Embedding endpoints +TGI_LLM_ENDPOINT = getEnv("TGI_LLM_ENDPOINT", "http://localhost:8080") +TGI_LLM_ENDPOINT_NO_RAG = getEnv("TGI_LLM_ENDPOINT_NO_RAG", "http://localhost:8081") +TEI_EMBEDDING_ENDPOINT = getEnv("TEI_ENDPOINT") + +# chunk parameters +CHUNK_SIZE = getEnv("CHUNK_SIZE", 1500) +CHUNK_OVERLAP = getEnv("CHUNK_OVERLAP", 100) + +current_file_path = os.path.abspath(__file__) +parent_dir = os.path.dirname(current_file_path) diff --git a/comps/dataprep/vdms/vdms/langchain/docker/Dockerfile b/comps/dataprep/vdms/vdms/langchain/docker/Dockerfile new file mode 100644 index 000000000..a149b017f --- /dev/null +++ b/comps/dataprep/vdms/vdms/langchain/docker/Dockerfile @@ -0,0 +1,41 @@ + +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +FROM python:3.11-slim + +ENV LANG=C.UTF-8 + +ARG ARCH="cpu" + +RUN apt-get update -y && apt-get install -y --no-install-recommends --fix-missing \ + build-essential \ + libcairo2-dev \ + libgl1-mesa-glx \ + libjemalloc-dev \ + vim + +RUN useradd -m -s /bin/bash user && \ + mkdir -p /home/user && \ + chown -R user /home/user/ + +USER user + +COPY comps /home/user/comps + +RUN pip install --no-cache-dir --upgrade pip setuptools && \ + if [ ${ARCH} = "cpu" ]; then pip install torch torchvision --index-url https://download.pytorch.org/whl/cpu; fi && \ + pip install --no-cache-dir -r /home/user/comps/dataprep/vdms/langchain/requirements.txt + +ENV PYTHONPATH=/home/user + +USER root + +RUN mkdir -p /home/user/comps/dataprep/vdms/langchain/uploaded_files && chown -R user /home/user/comps/dataprep/vdms/langchain/uploaded_files +
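+# Switch back to the unprivileged user so the service does not run as root; +# uploaded files land in the directory created above.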
+USER user + +WORKDIR /home/user/comps/dataprep/vdms/langchain + +ENTRYPOINT ["python", "prepare_doc_vdms.py"] + diff --git a/comps/dataprep/vdms/vdms/langchain/docker/docker-compose-dataprep-vdms.yaml b/comps/dataprep/vdms/vdms/langchain/docker/docker-compose-dataprep-vdms.yaml new file mode 100644 index 000000000..7619e2e9f --- /dev/null +++ b/comps/dataprep/vdms/vdms/langchain/docker/docker-compose-dataprep-vdms.yaml @@ -0,0 +1,28 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +version: "3" +services: + vdms-vector-db: + image: intellabs/vdms:latest + container_name: vdms-vector-db + ports: + - "55555:55555" + dataprep-vdms: + image: opea/dataprep-vdms:latest + container_name: dataprep-vdms-server + ports: + - "6000:6000" + ipc: host + environment: + no_proxy: ${no_proxy} + http_proxy: ${http_proxy} + https_proxy: ${https_proxy} + VDMS_HOST: ${VDMS_HOST} + VDMS_PORT: ${VDMS_PORT} + COLLECTION_NAME: ${COLLECTION_NAME} + restart: unless-stopped + +networks: + default: + driver: bridge diff --git a/comps/dataprep/vdms/vdms/langchain/prepare_doc_vdms.py b/comps/dataprep/vdms/vdms/langchain/prepare_doc_vdms.py new file mode 100644 index 000000000..637f37bc1 --- /dev/null +++ b/comps/dataprep/vdms/vdms/langchain/prepare_doc_vdms.py @@ -0,0 +1,87 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +import os + +from config import ( + COLLECTION_NAME, + DISTANCE_STRATEGY, + EMBED_MODEL, + SEARCH_ENGINE, + VDMS_HOST, + VDMS_PORT, +) +from langchain.text_splitter import RecursiveCharacterTextSplitter +from langchain_community.embeddings import HuggingFaceBgeEmbeddings, HuggingFaceEmbeddings, HuggingFaceHubEmbeddings +from langchain_community.vectorstores.vdms import VDMS, VDMS_Client +from langchain_text_splitters import HTMLHeaderTextSplitter + +from comps import DocPath, opea_microservices, opea_telemetry, register_microservice +from comps.dataprep.utils import document_loader, get_separators, get_tables_result + +tei_embedding_endpoint = os.getenv("TEI_ENDPOINT") +client = VDMS_Client(VDMS_HOST, int(VDMS_PORT)) + + +@register_microservice( + name="opea_service@prepare_doc_vdms", + endpoint="/v1/dataprep", + host="0.0.0.0", + port=6007, + input_datatype=DocPath, + output_datatype=None, +) +@opea_telemetry +def ingest_documents(doc_path: DocPath): + """Ingest document to VDMS.""" + path = doc_path.path + print(f"Parsing document {doc_path}.") + + if path.endswith(".html"): + headers_to_split_on = [ + ("h1", "Header 1"), + ("h2", "Header 2"), + ("h3", "Header 3"), + ] + text_splitter = HTMLHeaderTextSplitter(headers_to_split_on=headers_to_split_on) + else: + text_splitter = RecursiveCharacterTextSplitter( + chunk_size=doc_path.chunk_size, chunk_overlap=100, add_start_index=True, separators=get_separators() + ) + + content = document_loader(doc_path) + chunks = text_splitter.split_text(content) + if doc_path.process_table and path.endswith(".pdf"): + table_chunks = get_tables_result(path, doc_path.table_strategy) + chunks = chunks + table_chunks + + print("Done preprocessing. 
Created ", len(chunks), " chunks of the original pdf") + + # Create vectorstore + if tei_embedding_endpoint: + # create embeddings using TEI endpoint service + embedder = HuggingFaceHubEmbeddings(model=tei_embedding_endpoint) + else: + # create embeddings using local embedding model + embedder = HuggingFaceBgeEmbeddings(model_name=EMBED_MODEL) + + # Batch size + batch_size = 32 + num_chunks = len(chunks) + for i in range(0, num_chunks, batch_size): + batch_chunks = chunks[i : i + batch_size] + batch_texts = batch_chunks + + _ = VDMS.from_texts( + client=client, + embedding=embedder, + collection_name=COLLECTION_NAME, + distance_strategy=DISTANCE_STRATEGY, + engine=SEARCH_ENGINE, + texts=batch_texts, + ) + print(f"Processed batch {i//batch_size + 1}/{(num_chunks-1)//batch_size + 1}") + + +if __name__ == "__main__": + opea_microservices["opea_service@prepare_doc_vdms"].start() diff --git a/comps/dataprep/vdms/vdms/langchain/requirements.txt b/comps/dataprep/vdms/vdms/langchain/requirements.txt new file mode 100644 index 000000000..fc392007f --- /dev/null +++ b/comps/dataprep/vdms/vdms/langchain/requirements.txt @@ -0,0 +1,30 @@ +beautifulsoup4 +cairosvg +docarray[full] +docx2txt +easyocr +fastapi +huggingface_hub +langchain +langchain-core +langchain-community +langchain-text-splitters +langsmith +markdown +numpy +opentelemetry-api +opentelemetry-exporter-otlp +opentelemetry-sdk +pandas +Pillow +prometheus-fastapi-instrumentator +pymupdf +pyspark +python-bidi==0.4.2 +python-docx +python-pptx +sentence_transformers +shortuuid +unstructured[all-docs]==0.11.5 +uvicorn +vdms diff --git a/tests/test_dataprep_vdms.sh b/tests/test_dataprep_vdms.sh new file mode 100755 index 000000000..2eace0d1f --- /dev/null +++ b/tests/test_dataprep_vdms.sh @@ -0,0 +1,98 @@ +#!/bin/bash +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +set -xe + +WORKPATH=$(dirname "$PWD") +LOG_PATH="$WORKPATH/tests" +ip_address=$(hostname -I | awk '{print $1}') +export VDMS_PORT=55555 + +function build_docker_images() { + cd $WORKPATH + echo $(pwd) + + # pull vdms image + docker pull intellabs/vdms:latest + + # build dataprep image for vdms + docker build -t opea/dataprep-vdms:comps \ + --build-arg https_proxy=$https_proxy \ + --build-arg http_proxy=$http_proxy \ + -f comps/dataprep/vdms/langchain/docker/Dockerfile . 
+} + +function start_service() { + docker run -d --name "test-comps-dataprep-vdms-langchain" -p $VDMS_PORT:55555 --ipc=host \ + -e http_proxy=$http_proxy -e https_proxy=$https_proxy \ + intellabs/vdms:latest + + sleep 10s + + dataprep_service_port=5020 + docker run -d --name="test-comps-dataprep-vdms-langchain-server" -p ${dataprep_service_port}:6007 --ipc=host \ + -e http_proxy=$http_proxy -e https_proxy=$https_proxy -e TEI_ENDPOINT=$TEI_ENDPOINT \ + -e VDMS_HOST=$ip_address -e VDMS_PORT=$VDMS_PORT \ + opea/dataprep-vdms:comps + + sleep 30s +} + +# function validate_microservice() { +# cd $LOG_PATH +# dataprep_service_port=6007 #5020 +# URL="http://$ip_address:$dataprep_service_port/v1/dataprep" +# echo 'The OPEA platform includes: Detailed framework of composable building blocks for state-of-the-art generative AI systems including LLMs, data stores, and prompt engines' > $LOG_PATH/dataprep_file.txt +# curl --noproxy $ip_address --location --request POST \ +# --form 'files=@$LOG_PATH/dataprep_file.txt' $URL +# } + +function validate_microservice() { + cd $LOG_PATH + dataprep_service_port=5020 + URL="http://${ip_address}:$dataprep_service_port/v1/dataprep" + echo "Deep learning is a subset of machine learning that utilizes neural networks with multiple layers to analyze various levels of abstract data representations. It enables computers to identify patterns and make decisions with minimal human intervention by learning from large amounts of data." > $LOG_PATH/dataprep_file.txt + HTTP_STATUS=$(curl -s -o /dev/null -w "%{http_code}" -X POST -F 'files=@./dataprep_file.txt' -H 'Content-Type: multipart/form-data' "$URL") + if [ "$HTTP_STATUS" -eq 200 ]; then + echo "[ dataprep ] HTTP status is 200. Checking content..." + local CONTENT=$(curl -s -X POST -F 'files=@./dataprep_file.txt' -H 'Content-Type: multipart/form-data' "$URL" | tee ${LOG_PATH}/dataprep.log) + + if echo 'Data preparation succeeded' | grep -q "$EXPECTED_RESULT"; then + echo "[ dataprep ] Content is as expected." + else + echo "[ dataprep ] Content does not match the expected result: $CONTENT" + docker logs test-comps-dataprep-vdms-langchain-server >> ${LOG_PATH}/dataprep.log + exit 1 + fi + else + echo "[ dataprep ] HTTP status is not 200. Received status was $HTTP_STATUS" + docker logs test-comps-dataprep-vdms-langchain-server >> ${LOG_PATH}/dataprep.log + exit 1 + fi + rm -rf $LOG_PATH/dataprep_file.txt +} + +function stop_docker() { + cid=$(docker ps -aq --filter "name=test-comps-dataprep-vdms-langchain*") + if [[ ! -z "$cid" ]]; then docker stop $cid && docker rm $cid && sleep 1s; fi + + cid=$(docker ps -aq --filter "name=test-comps-dataprep-vdms-langchain-server*") + if [[ ! 
-z "$cid" ]]; then docker stop $cid && docker rm $cid && sleep 1s; fi +} + +function main() { + + stop_docker + + build_docker_images + start_service + + validate_microservice + + # stop_docker + # echo y | docker system prune + +} + +main From 7ad2cbdc2160d5bdb32f37ad1ea5eb7a7c154496 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 13 Aug 2024 15:18:32 +0000 Subject: [PATCH 07/11] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- comps/dataprep/vdms/vdms/README.md | 1 + comps/dataprep/vdms/vdms/langchain/config.py | 2 ++ .../dataprep/vdms/vdms/langchain/prepare_doc_vdms.py | 11 ++--------- comps/dataprep/vdms/vdms/langchain/requirements.txt | 2 +- 4 files changed, 6 insertions(+), 10 deletions(-) diff --git a/comps/dataprep/vdms/vdms/README.md b/comps/dataprep/vdms/vdms/README.md index cc4a2212c..617761f02 100644 --- a/comps/dataprep/vdms/vdms/README.md +++ b/comps/dataprep/vdms/vdms/README.md @@ -1,6 +1,7 @@ # Dataprep Microservice with VDMS For dataprep microservice, we currently provide one framework: `Langchain`. + We organized the folders in the same way, so you can use either framework for dataprep microservice with the following constructions. diff --git a/comps/dataprep/vdms/vdms/langchain/config.py b/comps/dataprep/vdms/vdms/langchain/config.py index 13651fb1e..e12ba1502 100644 --- a/comps/dataprep/vdms/vdms/langchain/config.py +++ b/comps/dataprep/vdms/vdms/langchain/config.py @@ -3,11 +3,13 @@ import os + def getEnv(key, default_value=None): env_value = os.getenv(key, default=default_value) print(f"{key}: {env_value}") return env_value + # Embedding model EMBED_MODEL = getEnv("EMBED_MODEL", "BAAI/bge-base-en-v1.5") diff --git a/comps/dataprep/vdms/vdms/langchain/prepare_doc_vdms.py b/comps/dataprep/vdms/vdms/langchain/prepare_doc_vdms.py index 637f37bc1..e6f7d0072 100644 --- a/comps/dataprep/vdms/vdms/langchain/prepare_doc_vdms.py +++ b/comps/dataprep/vdms/vdms/langchain/prepare_doc_vdms.py @@ -3,14 +3,7 @@ import os -from config import ( - COLLECTION_NAME, - DISTANCE_STRATEGY, - EMBED_MODEL, - SEARCH_ENGINE, - VDMS_HOST, - VDMS_PORT, -) +from config import COLLECTION_NAME, DISTANCE_STRATEGY, EMBED_MODEL, SEARCH_ENGINE, VDMS_HOST, VDMS_PORT from langchain.text_splitter import RecursiveCharacterTextSplitter from langchain_community.embeddings import HuggingFaceBgeEmbeddings, HuggingFaceEmbeddings, HuggingFaceHubEmbeddings from langchain_community.vectorstores.vdms import VDMS, VDMS_Client @@ -56,7 +49,7 @@ def ingest_documents(doc_path: DocPath): chunks = chunks + table_chunks print("Done preprocessing. 
Created ", len(chunks), " chunks of the original pdf") - + # Create vectorstore if tei_embedding_endpoint: # create embeddings using TEI endpoint service diff --git a/comps/dataprep/vdms/vdms/langchain/requirements.txt b/comps/dataprep/vdms/vdms/langchain/requirements.txt index fc392007f..7b8bc37ed 100644 --- a/comps/dataprep/vdms/vdms/langchain/requirements.txt +++ b/comps/dataprep/vdms/vdms/langchain/requirements.txt @@ -6,8 +6,8 @@ easyocr fastapi huggingface_hub langchain -langchain-core langchain-community +langchain-core langchain-text-splitters langsmith markdown From bbc33391d58fb3798775d28da16988cc29c76cd7 Mon Sep 17 00:00:00 2001 From: s-gobriel Date: Wed, 14 Aug 2024 10:15:00 -0700 Subject: [PATCH 08/11] change return type of microservice --- comps/__init__.py | 1 + comps/cores/proto/docarray.py | 19 ++++++++++++++++++- comps/retrievers/langchain/README.md | 4 ++++ .../langchain/vdms/retriever_vdms.py | 11 +++++++---- tests/test_retrievers_langchain_vdms.sh | 1 + 5 files changed, 31 insertions(+), 5 deletions(-) diff --git a/comps/__init__.py b/comps/__init__.py index cb7ed7a28..7af91348e 100644 --- a/comps/__init__.py +++ b/comps/__init__.py @@ -12,6 +12,7 @@ GeneratedDoc, LLMParamsDoc, SearchedDoc, + SearchedMultimodalDoc, RerankedDoc, TextDoc, RAGASParams, diff --git a/comps/cores/proto/docarray.py b/comps/cores/proto/docarray.py index 9e07d618d..4f04ba6b7 100644 --- a/comps/cores/proto/docarray.py +++ b/comps/cores/proto/docarray.py @@ -1,7 +1,7 @@ # Copyright (C) 2024 Intel Corporation # SPDX-License-Identifier: Apache-2.0 -from typing import Optional +from typing import Dict, List, Optional, Tuple, Union import numpy as np from docarray import BaseDoc, DocList @@ -19,6 +19,13 @@ class TopologyInfo: class TextDoc(BaseDoc, TopologyInfo): text: str +class ImageDoc(BaseDoc): + image_path: str + + +class TextImageDoc(BaseDoc): + doc: Tuple[Union[TextDoc, ImageDoc]] + class Base64ByteStrDoc(BaseDoc): byte_str: str @@ -67,6 +74,16 @@ class Config: json_encoders = {np.ndarray: lambda x: x.tolist()} +class SearchedMultimodalDoc(BaseDoc): + retrieved_docs: List[TextImageDoc] + initial_query: str + top_n: int = 1 + metadata: Optional[List[Dict]] = None + + class Config: + json_encoders = {np.ndarray: lambda x: x.tolist()} + + class GeneratedDoc(BaseDoc): text: str prompt: str diff --git a/comps/retrievers/langchain/README.md b/comps/retrievers/langchain/README.md index 3de5cab21..9d96ba14a 100644 --- a/comps/retrievers/langchain/README.md +++ b/comps/retrievers/langchain/README.md @@ -17,3 +17,7 @@ For details, please refer to this [readme](milvus/README.md) # Retriever Microservice with PGVector For details, please refer to this [readme](pgvector/README.md) + +# Retriever Microservice with VDMS + +For details, please refer to this [readme](vdms/README.md) diff --git a/comps/retrievers/langchain/vdms/retriever_vdms.py b/comps/retrievers/langchain/vdms/retriever_vdms.py index 6a98c3700..3b6650a91 100644 --- a/comps/retrievers/langchain/vdms/retriever_vdms.py +++ b/comps/retrievers/langchain/vdms/retriever_vdms.py @@ -20,6 +20,7 @@ from comps import ( EmbedDoc, SearchedDoc, + SearchedMultimodalDoc, ServiceType, TextDoc, opea_microservices, @@ -61,7 +62,7 @@ ) @traceable(run_type="retriever") @register_statistics(names=["opea_service@retriever_vdms"]) -def retrieve(input: EmbedDoc) -> SearchedDoc: +def retrieve(input: EmbedDoc) -> SearchedMultimodalDoc: start = time.time() constraints = None # place holder for adding constraints this has to be passed in the EmbedDoc input @@ -85,9 
+86,11 @@ def retrieve(input: EmbedDoc) -> SearchedDoc: query=input.text, k=input.k, fetch_k=input.fetch_k, lambda_mult=input.lambda_mult, filter=constraints ) searched_docs = [] + metadata_list = [] for r in search_res: searched_docs.append(TextDoc(text=r.page_content)) - result = SearchedDoc(retrieved_docs=searched_docs, initial_query=input.text) + metadata_list.append(r.metadata) + result = SearchedMultimodalDoc(retrieved_docs=searched_docs, metadata=metadata_list, initial_query=input.text) statistics_dict["opea_service@retriever_vdms"].append_latency(time.time() - start, None) return result @@ -99,7 +102,7 @@ def retrieve(input: EmbedDoc) -> SearchedDoc: # print(f"TEI_EMBEDDING_ENDPOINT:{tei_embedding_endpoint}") # embeddings = HuggingFaceHubEmbeddings(model=tei_embedding_endpoint,huggingfacehub_api_token=hf_token) # embeddings = HuggingFaceHubEmbeddings(model=tei_embedding_endpoint) - embeddings = HuggingFaceEndpointEmbeddings(model=tei_embedding_endpoint, huggingfacehub_api_token=hf_token) + embeddings = HuggingFaceEndpointEmbeddings(model=tei_embedding_endpoint, huggingfacehub_api_token=hf_token) # embeddings = HuggingFaceEndpointEmbeddings(model=tei_embedding_endpoint) else: # create embeddings using local embedding model @@ -113,7 +116,7 @@ def retrieve(input: EmbedDoc) -> SearchedDoc: client=client, embedding=embeddings, collection_name=COLLECTION_NAME, - embedding_dimensions=768, + #embedding_dimensions=768, distance_strategy=DISTANCE_STRATEGY, engine=SEARCH_ENGINE, ) diff --git a/tests/test_retrievers_langchain_vdms.sh b/tests/test_retrievers_langchain_vdms.sh index 83d772867..feb7b1a0c 100755 --- a/tests/test_retrievers_langchain_vdms.sh +++ b/tests/test_retrievers_langchain_vdms.sh @@ -33,6 +33,7 @@ function start_service() { model="BAAI/bge-base-en-v1.5" docker run -d --name="test-comps-retriever-tei-endpoint" \ -p $tei_endpoint:80 -v ./data:/data \ + -e HTTPS_PROXY=$https_proxy -e HTTP_PROXY=$https_proxy \ --pull always ghcr.io/huggingface/text-embeddings-inference:cpu-1.2 \ --model-id $model sleep 30s From e734cac94a38047c87120bb85e18552714f32da2 Mon Sep 17 00:00:00 2001 From: s-gobriel Date: Thu, 15 Aug 2024 23:23:40 -0700 Subject: [PATCH 09/11] add calls to CLIP embedding --- comps/cores/proto/docarray.py | 1 + .../langchain/vdms/docker/Dockerfile | 2 + .../langchain/vdms/retriever_vdms.py | 59 ++++++++++--------- tests/test_retrievers_langchain_vdms.sh | 5 +- 4 files changed, 39 insertions(+), 28 deletions(-) diff --git a/comps/cores/proto/docarray.py b/comps/cores/proto/docarray.py index 4f04ba6b7..a5b1d01ab 100644 --- a/comps/cores/proto/docarray.py +++ b/comps/cores/proto/docarray.py @@ -48,6 +48,7 @@ class EmbedDoc(BaseDoc): fetch_k: int = 20 lambda_mult: float = 0.5 score_threshold: float = 0.2 + constraints: dict = None class Audio2TextDoc(AudioDoc): diff --git a/comps/retrievers/langchain/vdms/docker/Dockerfile b/comps/retrievers/langchain/vdms/docker/Dockerfile index d1c4676f2..de4a3b50c 100644 --- a/comps/retrievers/langchain/vdms/docker/Dockerfile +++ b/comps/retrievers/langchain/vdms/docker/Dockerfile @@ -34,6 +34,8 @@ ENV PYTHONPATH=$PYTHONPATH:/home/user ENV HUGGINGFACEHUB_API_TOKEN=dummy +ENV USECLIP 0 + ENV no_proxy=localhost,127.0.0.1 ENV http_proxy="" diff --git a/comps/retrievers/langchain/vdms/retriever_vdms.py b/comps/retrievers/langchain/vdms/retriever_vdms.py index 3b6650a91..3d6717158 100644 --- a/comps/retrievers/langchain/vdms/retriever_vdms.py +++ b/comps/retrievers/langchain/vdms/retriever_vdms.py @@ -7,6 +7,7 @@ from 
langchain_community.embeddings import HuggingFaceBgeEmbeddings, HuggingFaceHubEmbeddings
 from langchain_community.vectorstores.vdms import VDMS, VDMS_Client
 from langchain_huggingface.embeddings import HuggingFaceEndpointEmbeddings
+
 from langsmith import traceable
 from vdms_config import (  # , HUGGINGFACEHUB_API_TOKEN, INDEX_SCHEMA, VDMS_URL
     COLLECTION_NAME,
@@ -31,24 +32,22 @@
 tei_embedding_endpoint = os.getenv("TEI_EMBEDDING_ENDPOINT")
 hf_token = os.getenv("HUGGINGFACEHUB_API_TOKEN")

+use_clip = int(os.getenv("USECLIP", "0"))  # default to "0" (CLIP disabled) when the env var is unset
+
+if use_clip:
+    import sys
+    sys.path.append('../../../embeddings/langchain_multimodal/')
+    from embeddings_clip import vCLIP

 # Debugging
 all_variables = dir()
 for name in all_variables:
-    # Print the item if it doesn't start with '__'
     if not name.startswith("__"):
         myvalue = eval(name)
         print(name, "is", type(myvalue), "and = ", myvalue)

-# client = VDMS_Client(VDMS_HOST, VDMS_PORT)
-
-
-# VDMS_HOST="172.17.0.2"
-# VDMS_HOST="10.54.80.228"
-# print("Host =", VDMS_HOST)
-# end debugging

 client = VDMS_Client(VDMS_HOST, VDMS_PORT)
@@ -64,26 +63,23 @@
 @register_statistics(names=["opea_service@retriever_vdms"])
 def retrieve(input: EmbedDoc) -> SearchedMultimodalDoc:
     start = time.time()
-    constraints = None
-    # place holder for adding constraints this has to be passed in the EmbedDoc input
-    # so retriever can filter on them, if this functionality is needed

     if input.search_type == "similarity":
-        search_res = vector_db.similarity_search_by_vector(embedding=input.embedding, k=input.k, filter=constraints)
+        search_res = vector_db.similarity_search_by_vector(embedding=input.embedding, k=input.k, filter=input.constraints)
     elif input.search_type == "similarity_distance_threshold":
         if input.distance_threshold is None:
             raise ValueError("distance_threshold must be provided for " + "similarity_distance_threshold retriever")
         search_res = vector_db.similarity_search_by_vector(
-            embedding=input.embedding, k=input.k, distance_threshold=input.distance_threshold, filter=constraints
+            embedding=input.embedding, k=input.k, distance_threshold=input.distance_threshold, filter=input.constraints
         )
     elif input.search_type == "similarity_score_threshold":
         docs_and_similarities = vector_db.similarity_search_with_relevance_scores(
-            query=input.text, k=input.k, score_threshold=input.score_threshold, filter=constraints
+            query=input.text, k=input.k, score_threshold=input.score_threshold, filter=input.constraints
         )
         search_res = [doc for doc, _ in docs_and_similarities]
     elif input.search_type == "mmr":
         search_res = vector_db.max_marginal_relevance_search(
-            query=input.text, k=input.k, fetch_k=input.fetch_k, lambda_mult=input.lambda_mult, filter=constraints
+            query=input.text, k=input.k, fetch_k=input.fetch_k, lambda_mult=input.lambda_mult, filter=input.constraints
         )
     searched_docs = []
     metadata_list = []
@@ -97,22 +93,29 @@ def retrieve(input: EmbedDoc) -> SearchedMultimodalDoc:

 if __name__ == "__main__":
     # Create vectorstore
-    if tei_embedding_endpoint:
-        # create embeddings using TEI endpoint service
-        # print(f"TEI_EMBEDDING_ENDPOINT:{tei_embedding_endpoint}")
-        # embeddings = HuggingFaceHubEmbeddings(model=tei_embedding_endpoint,huggingfacehub_api_token=hf_token)
-        # embeddings = HuggingFaceHubEmbeddings(model=tei_embedding_endpoint)
+
+    if use_clip:
+        embeddings = vCLIP({"model_name": "openai/clip-vit-base-patch32", "num_frm": 4})
+        dimensions=embeddings.get_embedding_lenth()
+    elif tei_embedding_endpoint:
         embeddings = HuggingFaceEndpointEmbeddings(model=tei_embedding_endpoint, 
huggingfacehub_api_token=hf_token) - # embeddings = HuggingFaceEndpointEmbeddings(model=tei_embedding_endpoint) else: - # create embeddings using local embedding model embeddings = HuggingFaceBgeEmbeddings(model_name=EMBED_MODEL) + # create embeddings using local embedding model + + - # debug - # embeddings = HuggingFaceBgeEmbeddings(model_name=EMBED_MODEL) - # end debug - - vector_db = VDMS( + if use_clip: + vector_db = VDMS( + client=client, + embedding=embeddings, + collection_name=COLLECTION_NAME, + embedding_dimensions=dimensions, + distance_strategy=DISTANCE_STRATEGY, + engine=SEARCH_ENGINE, + ) + else: + vector_db = VDMS( client=client, embedding=embeddings, collection_name=COLLECTION_NAME, @@ -120,4 +123,6 @@ def retrieve(input: EmbedDoc) -> SearchedMultimodalDoc: distance_strategy=DISTANCE_STRATEGY, engine=SEARCH_ENGINE, ) + + opea_microservices["opea_service@retriever_vdms"].start() diff --git a/tests/test_retrievers_langchain_vdms.sh b/tests/test_retrievers_langchain_vdms.sh index feb7b1a0c..3c04c7a1e 100755 --- a/tests/test_retrievers_langchain_vdms.sh +++ b/tests/test_retrievers_langchain_vdms.sh @@ -44,10 +44,13 @@ function start_service() { # vdms retriever unset http_proxy + use_clip=0 #set to 1 if openai clip embedding should be used + docker run -d --name="test-comps-retriever-vdms-server" -p 7000:7000 --ipc=host \ -e INDEX_NAME=$INDEX_NAME -e VDMS_HOST=$ip_address \ -e https_proxy=$https_proxy -e http_proxy=$http_proxy \ - -e VDMS_PORT=$vdms_port -e HUGGINGFACEHUB_API_TOKEN=$HUGGINGFACEHUB_API_TOKEN -e TEI_EMBEDDING_ENDPOINT=$TEI_EMBEDDING_ENDPOINT \ + -e VDMS_PORT=$vdms_port -e HUGGINGFACEHUB_API_TOKEN=$HUGGINGFACEHUB_API_TOKEN \ + -e TEI_EMBEDDING_ENDPOINT=$TEI_EMBEDDING_ENDPOINT -e USECLIP=$use_clip \ opea/retriever-vdms:comps sleep 3m } From 00c5f32c26262c3874abedd43af626fee85230b5 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 16 Aug 2024 12:27:42 +0000 Subject: [PATCH 10/11] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- comps/cores/proto/docarray.py | 3 +- .../langchain/vdms/retriever_vdms.py | 47 +++++++++---------- 2 files changed, 25 insertions(+), 25 deletions(-) diff --git a/comps/cores/proto/docarray.py b/comps/cores/proto/docarray.py index bb970790a..a1821cd23 100644 --- a/comps/cores/proto/docarray.py +++ b/comps/cores/proto/docarray.py @@ -19,9 +19,10 @@ class TopologyInfo: class TextDoc(BaseDoc, TopologyInfo): text: str + class ImageDoc(BaseDoc): image_path: str - + class TextImageDoc(BaseDoc): doc: Tuple[Union[TextDoc, ImageDoc]] diff --git a/comps/retrievers/langchain/vdms/retriever_vdms.py b/comps/retrievers/langchain/vdms/retriever_vdms.py index 3d6717158..d0a8b7020 100644 --- a/comps/retrievers/langchain/vdms/retriever_vdms.py +++ b/comps/retrievers/langchain/vdms/retriever_vdms.py @@ -7,7 +7,6 @@ from langchain_community.embeddings import HuggingFaceBgeEmbeddings, HuggingFaceHubEmbeddings from langchain_community.vectorstores.vdms import VDMS, VDMS_Client from langchain_huggingface.embeddings import HuggingFaceEndpointEmbeddings - from langsmith import traceable from vdms_config import ( # , HUGGINGFACEHUB_API_TOKEN, INDEX_SCHEMA, VDMS_URL COLLECTION_NAME, @@ -36,7 +35,8 @@ if use_clip: import sys - sys.path.append('../../../embeddings/langchain_multimodal/') + + sys.path.append("../../../embeddings/langchain_multimodal/") from embeddings_clip import vCLIP # Debugging @@ -65,7 +65,9 @@ def retrieve(input: EmbedDoc) -> 
SearchedMultimodalDoc:
     start = time.time()

     if input.search_type == "similarity":
-        search_res = vector_db.similarity_search_by_vector(embedding=input.embedding, k=input.k, filter=input.constraints)
+        search_res = vector_db.similarity_search_by_vector(
+            embedding=input.embedding, k=input.k, filter=input.constraints
+        )
     elif input.search_type == "similarity_distance_threshold":
         if input.distance_threshold is None:
             raise ValueError("distance_threshold must be provided for " + "similarity_distance_threshold retriever")
@@ -93,36 +95,33 @@ def retrieve(input: EmbedDoc) -> SearchedMultimodalDoc:

 if __name__ == "__main__":
     # Create vectorstore
-
+
     if use_clip:
         embeddings = vCLIP({"model_name": "openai/clip-vit-base-patch32", "num_frm": 4})
-        dimensions=embeddings.get_embedding_lenth()
+        dimensions = embeddings.get_embedding_lenth()
     elif tei_embedding_endpoint:
-        embeddings = HuggingFaceEndpointEmbeddings(model=tei_embedding_endpoint, huggingfacehub_api_token=hf_token) 
+        embeddings = HuggingFaceEndpointEmbeddings(model=tei_embedding_endpoint, huggingfacehub_api_token=hf_token)
     else:
         embeddings = HuggingFaceBgeEmbeddings(model_name=EMBED_MODEL)
-    # create embeddings using local embedding model
-
-
+        # create embeddings using local embedding model

     if use_clip:
         vector_db = VDMS(
-        client=client,
-        embedding=embeddings,
-        collection_name=COLLECTION_NAME,
-        embedding_dimensions=dimensions,
-        distance_strategy=DISTANCE_STRATEGY,
-        engine=SEARCH_ENGINE,
-        )
+            client=client,
+            embedding=embeddings,
+            collection_name=COLLECTION_NAME,
+            embedding_dimensions=dimensions,
+            distance_strategy=DISTANCE_STRATEGY,
+            engine=SEARCH_ENGINE,
+        )
     else:
         vector_db = VDMS(
-        client=client,
-        embedding=embeddings,
-        collection_name=COLLECTION_NAME,
-        #embedding_dimensions=768,
-        distance_strategy=DISTANCE_STRATEGY,
-        engine=SEARCH_ENGINE,
-        )
-
+            client=client,
+            embedding=embeddings,
+            collection_name=COLLECTION_NAME,
+            # embedding_dimensions=768,
+            distance_strategy=DISTANCE_STRATEGY,
+            engine=SEARCH_ENGINE,
+        )

 opea_microservices["opea_service@retriever_vdms"].start()

From cece57e3dae31b9961beac8e5b7d3441513a2fe3 Mon Sep 17 00:00:00 2001
From: s-gobriel
Date: Fri, 16 Aug 2024 09:16:37 -0700
Subject: [PATCH 11/11] fix typo

---
 comps/retrievers/langchain/vdms/retriever_vdms.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/comps/retrievers/langchain/vdms/retriever_vdms.py b/comps/retrievers/langchain/vdms/retriever_vdms.py
index 3d6717158..16ab96dd8 100644
--- a/comps/retrievers/langchain/vdms/retriever_vdms.py
+++ b/comps/retrievers/langchain/vdms/retriever_vdms.py
@@ -96,7 +96,7 @@ def retrieve(input: EmbedDoc) -> SearchedMultimodalDoc:

     if use_clip:
         embeddings = vCLIP({"model_name": "openai/clip-vit-base-patch32", "num_frm": 4})
-        dimensions = embeddings.get_embedding_lenth()
+        dimensions = embeddings.get_embedding_length()
     elif tei_embedding_endpoint:
         embeddings = HuggingFaceEndpointEmbeddings(model=tei_embedding_endpoint, huggingfacehub_api_token=hf_token)
     else:
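
Taken together, patches 08 and 09 above change the retriever's request/response contract: `EmbedDoc` gains an optional `constraints` dict that is forwarded verbatim as the `filter` argument of every VDMS search call, and the service now returns a `SearchedMultimodalDoc` carrying per-document `metadata` alongside the retrieved texts. A minimal client-side sketch of that contract follows. The field names come from the diffs above; the host/port, the sample constraint expression, and the exact per-document response schema are illustrative assumptions (the accepted filter grammar is defined by LangChain's VDMS integration, not by these patches).

```python
# Hypothetical client for the updated retriever contract (patches 08-09).
# Assumptions: service reachable on localhost:7000, a 768-dim embedding model,
# and a VDMS-style constraint expression; adjust all three for a real setup.
import requests

payload = {
    "text": "What is deep learning?",
    "embedding": [0.0] * 768,  # stand-in for a real query embedding vector
    "search_type": "similarity",
    "k": 4,
    # New in patch 09: passed through as the `filter` argument of the VDMS
    # search; the expression below is illustrative only.
    "constraints": {"source": ["==", "dataprep_file.txt"]},
}

resp = requests.post("http://localhost:7000/v1/retrieval", json=payload, timeout=30)
resp.raise_for_status()
result = resp.json()  # serialized SearchedMultimodalDoc

# New in patch 08: metadata is returned in parallel with retrieved_docs.
for doc, meta in zip(result["retrieved_docs"], result.get("metadata") or []):
    print(doc.get("text", "<non-text doc>"), "| metadata:", meta)
```

One caveat worth noting: `SearchedMultimodalDoc.retrieved_docs` is declared as `List[TextImageDoc]`, but `retrieve()` appends plain `TextDoc` instances, so the per-document payload observed on the wire may follow the `TextDoc` shape assumed above rather than the declared `TextImageDoc`.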