diff --git a/.github/workflows/docker/compose/retrievers-compose.yaml b/.github/workflows/docker/compose/retrievers-compose.yaml index dc49b95d5..24f5623e5 100644 --- a/.github/workflows/docker/compose/retrievers-compose.yaml +++ b/.github/workflows/docker/compose/retrievers-compose.yaml @@ -15,10 +15,6 @@ services: build: dockerfile: comps/retrievers/vdms/langchain/Dockerfile image: ${REGISTRY:-opea}/retriever-vdms:${TAG:-latest} - retriever-multimodal-redis: - build: - dockerfile: comps/retrievers/multimodal/redis/langchain/Dockerfile - image: ${REGISTRY:-opea}/retriever-multimodal-redis:${TAG:-latest} retriever-pgvector: build: dockerfile: comps/retrievers/pgvector/langchain/Dockerfile diff --git a/comps/retrievers/README.md b/comps/retrievers/README.md index eeba8860e..a05586d5f 100644 --- a/comps/retrievers/README.md +++ b/comps/retrievers/README.md @@ -29,7 +29,3 @@ For details, please refer to this [readme](qdrant/haystack/README.md) ## Retriever Microservice with VDMS For details, please refer to this [readme](vdms/langchain/README.md) - -## Retriever Microservice with Multimodal - -For details, please refer to this [readme](multimodal/redis/langchain/README.md) diff --git a/comps/retrievers/milvus/langchain/ingest.py b/comps/retrievers/milvus/langchain/ingest.py deleted file mode 100644 index 77b7a7823..000000000 --- a/comps/retrievers/milvus/langchain/ingest.py +++ /dev/null @@ -1,99 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -# Copyright (C) 2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - -# - -import argparse -import io -import os - -import numpy as np -from config import COLLECTION_NAME, EMBED_ENDPOINT, EMBED_MODEL, MILVUS_HOST, MILVUS_PORT -from langchain.text_splitter import RecursiveCharacterTextSplitter -from langchain_community.embeddings import HuggingFaceBgeEmbeddings, HuggingFaceHubEmbeddings -from langchain_milvus.vectorstores import Milvus -from PIL import Image - - -def pdf_loader(file_path): - try: - import easyocr - import fitz - except ImportError: - raise ImportError( - "`PyMuPDF` or 'easyocr' package is not found, please install it with " - "`pip install pymupdf or pip install easyocr.`" - ) - - doc = fitz.open(file_path) - reader = easyocr.Reader(["en"]) - result = "" - for i in range(doc.page_count): - page = doc.load_page(i) - pagetext = page.get_text().strip() - if pagetext: - result = result + pagetext - if len(doc.get_page_images(i)) > 0: - for img in doc.get_page_images(i): - if img: - pageimg = "" - xref = img[0] - img_data = doc.extract_image(xref) - img_bytes = img_data["image"] - pil_image = Image.open(io.BytesIO(img_bytes)) - img = np.array(pil_image) - img_result = reader.readtext(img, paragraph=True, detail=0) - pageimg = pageimg + ", ".join(img_result).strip() - if pageimg.endswith("!") or pageimg.endswith("?") or pageimg.endswith("."): - pass - else: - pageimg = pageimg + "." - result = result + pageimg - return result - - -def ingest_documents(folder_path, tag): - """Ingest PDF to Milvus from a directory.""" - # Load list of pdfs - doc_path = [os.path.join(folder_path, file) for file in os.listdir(folder_path)][0] - - print("Parsing...", doc_path) - - text_splitter = RecursiveCharacterTextSplitter(chunk_size=1500, chunk_overlap=100, add_start_index=True) - content = pdf_loader(doc_path) - chunks = text_splitter.split_text(content) - - print("Done preprocessing. 
Created ", len(chunks), " chunks of the original pdf") - # Create vectorstore - if EMBED_ENDPOINT: - # create embeddings using TEI endpoint service - embedder = HuggingFaceHubEmbeddings(model=EMBED_ENDPOINT) - else: - # create embeddings using local embedding model - embedder = HuggingFaceBgeEmbeddings(model_name=EMBED_MODEL) - - # Batch size - batch_size = 32 - num_chunks = len(chunks) - for i in range(0, num_chunks, batch_size): - batch_chunks = chunks[i : i + batch_size] - batch_texts = [f"Tag: {tag}. " + chunk for chunk in batch_chunks] - - _ = Milvus.from_texts( - texts=batch_texts, - embedding=embedder, - collection_name=COLLECTION_NAME, - connection_args={"host": MILVUS_HOST, "port": MILVUS_PORT}, - ) - print(f"Processed batch {i//batch_size + 1}/{(num_chunks-1)//batch_size + 1}") - - -if __name__ == "__main__": - parser = argparse.ArgumentParser(description="Ingest documents from a specified folder with a tag") - parser.add_argument("folder_path", type=str, help="Path to the folder containing documents") - parser.add_argument("--tag", type=str, default="", help="Tag to be used as an identifier") - - args = parser.parse_args() - ingest_documents(args.folder_path, args.tag) diff --git a/comps/retrievers/multimodal/redis/langchain/Dockerfile b/comps/retrievers/multimodal/redis/langchain/Dockerfile deleted file mode 100644 index 5c339069f..000000000 --- a/comps/retrievers/multimodal/redis/langchain/Dockerfile +++ /dev/null @@ -1,28 +0,0 @@ -# Copyright (C) 2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - -FROM python:3.11-slim - -ARG ARCH="cpu" - -RUN apt-get update -y && apt-get install -y --no-install-recommends --fix-missing \ - libgl1-mesa-glx \ - libjemalloc-dev - -RUN useradd -m -s /bin/bash user && \ - mkdir -p /home/user && \ - chown -R user /home/user/ - -COPY comps /home/user/comps - -USER user - -RUN pip install --no-cache-dir --upgrade pip setuptools && \ - if [ ${ARCH} = "cpu" ]; then pip install --no-cache-dir torch torchvision --index-url https://download.pytorch.org/whl/cpu; fi && \ - pip install --no-cache-dir -r /home/user/comps/retrievers/multimodal/redis/langchain/requirements.txt - -ENV PYTHONPATH=$PYTHONPATH:/home/user - -WORKDIR /home/user/comps/retrievers/multimodal/redis/langchain - -ENTRYPOINT ["python", "retriever_redis.py"] diff --git a/comps/retrievers/multimodal/redis/langchain/README.md b/comps/retrievers/multimodal/redis/langchain/README.md deleted file mode 100644 index c67dc9e77..000000000 --- a/comps/retrievers/multimodal/redis/langchain/README.md +++ /dev/null @@ -1,123 +0,0 @@ -# Retriever Microservice - -This retriever microservice is a highly efficient search service designed for handling and retrieving embedding vectors from multimodal data. It operates by receiving an embedding vector as input and conducting a similarity search against vectors stored in a VectorDB database. Users must specify the VectorDB's URL and the index name, and the service searches within that index to find documents with the highest similarity to the input vector. - -The service primarily utilizes similarity measures in vector space to rapidly retrieve contentually similar documents. The vector-based retrieval approach is particularly suited for handling large datasets, offering fast and accurate search results that significantly enhance the efficiency and quality of information retrieval. 
- -Overall, this microservice provides robust backend support for applications requiring efficient similarity searches, playing a vital role in scenarios such as recommendation systems, information retrieval, or any other context where precise measurement of document similarity is crucial. - -## 🚀1. Start Microservice with Python (Option 1) - -To start the retriever microservice, you must first install the required python packages. - -### 1.1 Install Requirements - -```bash -pip install -r requirements.txt -``` - -### 1.2 Setup VectorDB Service - -You need to setup your own VectorDB service (Redis in this example), and ingest your knowledge documents into the vector database. - -As for Redis, you could start a docker container using the following commands. -Remember to ingest data into it manually. - -```bash -docker run -d --name="redis-vector-db" -p 6379:6379 -p 8001:8001 redis/redis-stack:7.2.0-v9 -``` - -### 1.3 Ingest images or video - -Upload a video or images using the dataprep microservice, instructions can be found [here](https://github.com/opea-project/GenAIComps/blob/main/comps/dataprep/multimodal/redis/langchain/README.md). - -### 1.4 Start Retriever Service - -```bash -python retriever_redis.py -``` - -## 🚀2. Start Microservice with Docker (Option 2) - -### 2.1 Setup Environment Variables - -```bash -export your_ip=$(hostname -I | awk '{print $1}') -export REDIS_URL="redis://${your_ip}:6379" -export INDEX_NAME=${your_index_name} -``` - -### 2.2 Build Docker Image - -```bash -cd ../../../../../ -docker build -t opea/retriever-multimodal-redis:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/retrievers/multimodal/redis/langchain/Dockerfile . -``` - -To start a docker container, you have two options: - -- A. Run Docker with CLI -- B. Run Docker with Docker Compose - -You can choose one as needed. - -### 2.3 Run Docker with CLI (Option A) - -```bash -docker run -d --name="retriever-multimodal-redis" -p 7000:7000 --ipc=host -e http_proxy=$http_proxy -e https_proxy=$https_proxy -e REDIS_URL=$REDIS_URL -e INDEX_NAME=$INDEX_NAME opea/retriever-multimodal-redis:latest -``` - -### 2.4 Run Docker with Docker Compose (Option B) - -```bash -cd docker -docker compose -f docker_compose_retriever.yaml up -d -``` - -## 🚀3. Consume Retriever Service - -### 3.1 Consume Embedding Service - -To consume the Retriever Microservice, you can generate a mock embedding vector of length 512 with Python. - -```bash -export your_embedding=$(python -c "import random; embedding = [random.uniform(-1, 1) for _ in range(512)]; print(embedding)") -curl http://${your_ip}:7000/v1/multimodal_retrieval \ - -X POST \ - -d "{\"text\":\"What is the revenue of Nike in 2023?\",\"embedding\":${your_embedding}}" \ - -H 'Content-Type: application/json' -``` - -You can set the parameters for the retriever. 
- -```bash -export your_embedding=$(python -c "import random; embedding = [random.uniform(-1, 1) for _ in range(512)]; print(embedding)") -curl http://localhost:7000/v1/multimodal_retrieval \ - -X POST \ - -d "{\"text\":\"What is the revenue of Nike in 2023?\",\"embedding\":${your_embedding},\"search_type\":\"similarity\", \"k\":4}" \ - -H 'Content-Type: application/json' -``` - -```bash -export your_embedding=$(python -c "import random; embedding = [random.uniform(-1, 1) for _ in range(512)]; print(embedding)") -curl http://localhost:7000/v1/multimodal_retrieval \ - -X POST \ - -d "{\"text\":\"What is the revenue of Nike in 2023?\",\"embedding\":${your_embedding},\"search_type\":\"similarity_distance_threshold\", \"k\":4, \"distance_threshold\":1.0}" \ - -H 'Content-Type: application/json' -``` - -```bash -export your_embedding=$(python -c "import random; embedding = [random.uniform(-1, 1) for _ in range(512)]; print(embedding)") -curl http://localhost:7000/v1/multimodal_retrieval \ - -X POST \ - -d "{\"text\":\"What is the revenue of Nike in 2023?\",\"embedding\":${your_embedding},\"search_type\":\"similarity_score_threshold\", \"k\":4, \"score_threshold\":0.2}" \ - -H 'Content-Type: application/json' -``` - -```bash -export your_embedding=$(python -c "import random; embedding = [random.uniform(-1, 1) for _ in range(512)]; print(embedding)") -curl http://localhost:7000/v1/multimodal_retrieval \ - -X POST \ - -d "{\"text\":\"What is the revenue of Nike in 2023?\",\"embedding\":${your_embedding},\"search_type\":\"mmr\", \"k\":4, \"fetch_k\":20, \"lambda_mult\":0.5}" \ - -H 'Content-Type: application/json' -``` diff --git a/comps/retrievers/multimodal/redis/langchain/__init__.py b/comps/retrievers/multimodal/redis/langchain/__init__.py deleted file mode 100644 index 916f3a44b..000000000 --- a/comps/retrievers/multimodal/redis/langchain/__init__.py +++ /dev/null @@ -1,2 +0,0 @@ -# Copyright (C) 2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 diff --git a/comps/retrievers/multimodal/redis/langchain/docker_compose_retriever.yaml b/comps/retrievers/multimodal/redis/langchain/docker_compose_retriever.yaml deleted file mode 100644 index 6c4f76ad0..000000000 --- a/comps/retrievers/multimodal/redis/langchain/docker_compose_retriever.yaml +++ /dev/null @@ -1,23 +0,0 @@ -# Copyright (C) 2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - -version: "1.0" - -services: - retriever-multimodal-redis: - image: opea/retriever-multimodal-redis:latest - container_name: retriever-multimodal-redis - ports: - - "7000:7000" - ipc: host - environment: - no_proxy: ${no_proxy} - http_proxy: ${http_proxy} - https_proxy: ${https_proxy} - REDIS_URL: ${REDIS_URL} - INDEX_NAME: ${INDEX_NAME} - restart: unless-stopped - -networks: - default: - driver: bridge diff --git a/comps/retrievers/multimodal/redis/langchain/multimodal_config.py b/comps/retrievers/multimodal/redis/langchain/multimodal_config.py deleted file mode 100644 index 79ac5698f..000000000 --- a/comps/retrievers/multimodal/redis/langchain/multimodal_config.py +++ /dev/null @@ -1,83 +0,0 @@ -# Copyright (C) 2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - -import os - -current_file_path = os.path.abspath(__file__) -parent_dir = os.path.dirname(current_file_path) - - -def get_boolean_env_var(var_name, default_value=False): - """Retrieve the boolean value of an environment variable. - - Args: - var_name (str): The name of the environment variable to retrieve. 
- default_value (bool): The default value to return if the variable - is not found. - Returns: - bool: The value of the environment variable, interpreted as a boolean. - """ - true_values = {"true", "1", "t", "y", "yes"} - false_values = {"false", "0", "f", "n", "no"} - - # Retrieve the environment variable's value - value = os.getenv(var_name, "").lower() - - # Decide the boolean value based on the content of the string - if value in true_values: - return True - elif value in false_values: - return False - else: - return default_value - - -# Check for openai API key -# if "OPENAI_API_KEY" not in os.environ: -# raise Exception("Must provide an OPENAI_API_KEY as an env var.") - - -# Whether or not to enable langchain debugging -DEBUG = get_boolean_env_var("DEBUG", False) -# Set DEBUG env var to "true" if you wish to enable LC debugging module -if DEBUG: - import langchain - - langchain.debug = True - - -# Embedding model -EMBED_MODEL = os.getenv("EMBED_MODEL", "BridgeTower/bridgetower-large-itm-mlm-itc") - -# Redis Connection Information -REDIS_HOST = os.getenv("REDIS_HOST", "localhost") -REDIS_PORT = int(os.getenv("REDIS_PORT", 6379)) - - -def format_redis_conn_from_env(): - redis_url = os.getenv("REDIS_URL", None) - if redis_url: - return redis_url - else: - using_ssl = get_boolean_env_var("REDIS_SSL", False) - start = "rediss://" if using_ssl else "redis://" - - # if using RBAC - password = os.getenv("REDIS_PASSWORD", None) - username = os.getenv("REDIS_USERNAME", "default") - if password is not None: - start += f"{username}:{password}@" - - return start + f"{REDIS_HOST}:{REDIS_PORT}" - - -REDIS_URL = format_redis_conn_from_env() - -# Vector Index Configuration -INDEX_NAME = os.getenv("INDEX_NAME", "test-index") - -current_file_path = os.path.abspath(__file__) -parent_dir = os.path.dirname(current_file_path) -REDIS_SCHEMA = os.getenv("REDIS_SCHEMA", "schema.yml") -schema_path = os.path.join(parent_dir, REDIS_SCHEMA) -INDEX_SCHEMA = schema_path diff --git a/comps/retrievers/multimodal/redis/langchain/requirements.txt b/comps/retrievers/multimodal/redis/langchain/requirements.txt deleted file mode 100644 index b7cef2866..000000000 --- a/comps/retrievers/multimodal/redis/langchain/requirements.txt +++ /dev/null @@ -1,11 +0,0 @@ -docarray[full] -fastapi -langchain_community -opentelemetry-api -opentelemetry-exporter-otlp -opentelemetry-sdk -prometheus-fastapi-instrumentator -redis -shortuuid -transformers -uvicorn diff --git a/comps/retrievers/multimodal/redis/langchain/retriever_redis.py b/comps/retrievers/multimodal/redis/langchain/retriever_redis.py deleted file mode 100644 index a01b3e20c..000000000 --- a/comps/retrievers/multimodal/redis/langchain/retriever_redis.py +++ /dev/null @@ -1,93 +0,0 @@ -# Copyright (C) 2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - -import time -from typing import Union - -from langchain_community.vectorstores import Redis -from multimodal_config import INDEX_NAME, INDEX_SCHEMA, REDIS_URL - -from comps import ( - EmbedMultimodalDoc, - SearchedMultimodalDoc, - ServiceType, - TextDoc, - opea_microservices, - register_microservice, - register_statistics, - statistics_dict, -) -from comps.cores.proto.api_protocol import ( - ChatCompletionRequest, - RetrievalRequest, - RetrievalResponse, - RetrievalResponseData, -) -from comps.embeddings.multimodal.bridgetower import BridgeTowerEmbedding - - -@register_microservice( - name="opea_service@multimodal_retriever_redis", - service_type=ServiceType.RETRIEVER, - endpoint="/v1/multimodal_retrieval", - 
host="0.0.0.0", - port=7000, -) -@register_statistics(names=["opea_service@multimodal_retriever_redis"]) -async def retrieve( - input: Union[EmbedMultimodalDoc, RetrievalRequest, ChatCompletionRequest] -) -> Union[SearchedMultimodalDoc, RetrievalResponse, ChatCompletionRequest]: - - start = time.time() - # check if the Redis index has data - if vector_db.client.keys() == []: - search_res = [] - else: - # if the Redis index has data, perform the search - if input.search_type == "similarity": - search_res = await vector_db.asimilarity_search_by_vector(embedding=input.embedding, k=input.k) - elif input.search_type == "similarity_distance_threshold": - if input.distance_threshold is None: - raise ValueError("distance_threshold must be provided for " + "similarity_distance_threshold retriever") - search_res = await vector_db.asimilarity_search_by_vector( - embedding=input.embedding, k=input.k, distance_threshold=input.distance_threshold - ) - elif input.search_type == "similarity_score_threshold": - docs_and_similarities = await vector_db.asimilarity_search_with_relevance_scores( - query=input.text, k=input.k, score_threshold=input.score_threshold - ) - search_res = [doc for doc, _ in docs_and_similarities] - elif input.search_type == "mmr": - search_res = await vector_db.amax_marginal_relevance_search( - query=input.text, k=input.k, fetch_k=input.fetch_k, lambda_mult=input.lambda_mult - ) - else: - raise ValueError(f"{input.search_type} not valid") - - # return different response format - retrieved_docs = [] - if isinstance(input, EmbedMultimodalDoc): - metadata_list = [] - for r in search_res: - metadata_list.append(r.metadata) - retrieved_docs.append(TextDoc(text=r.page_content)) - result = SearchedMultimodalDoc(retrieved_docs=retrieved_docs, initial_query=input.text, metadata=metadata_list) - else: - for r in search_res: - retrieved_docs.append(RetrievalResponseData(text=r.page_content, metadata=r.metadata)) - if isinstance(input, RetrievalRequest): - result = RetrievalResponse(retrieved_docs=retrieved_docs) - elif isinstance(input, ChatCompletionRequest): - input.retrieved_docs = retrieved_docs - input.documents = [doc.text for doc in retrieved_docs] - result = input - - statistics_dict["opea_service@multimodal_retriever_redis"].append_latency(time.time() - start, None) - return result - - -if __name__ == "__main__": - - embeddings = BridgeTowerEmbedding() - vector_db = Redis(embedding=embeddings, index_name=INDEX_NAME, index_schema=INDEX_SCHEMA, redis_url=REDIS_URL) - opea_microservices["opea_service@multimodal_retriever_redis"].start() diff --git a/comps/retrievers/qdrant/haystack/ingest.py b/comps/retrievers/qdrant/haystack/ingest.py deleted file mode 100644 index d14dfbbb0..000000000 --- a/comps/retrievers/qdrant/haystack/ingest.py +++ /dev/null @@ -1,110 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -# Copyright (C) 2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - -# - -import argparse -import io -import os -import uuid - -import numpy as np -from haystack.components.embedders import HuggingFaceTEIDocumentEmbedder, SentenceTransformersDocumentEmbedder -from haystack.dataclasses.document import Document -from haystack_integrations.document_stores.qdrant import QdrantDocumentStore -from langchain.text_splitter import RecursiveCharacterTextSplitter -from PIL import Image -from qdrant_config import EMBED_DIMENSION, EMBED_ENDPOINT, EMBED_MODEL, INDEX_NAME, QDRANT_HOST, QDRANT_PORT - - -def pdf_loader(file_path): - try: - import easyocr - import fitz - except 
ImportError: - raise ImportError( - "`PyMuPDF` or 'easyocr' package is not found, please install it with " - "`pip install pymupdf or pip install easyocr.`" - ) - - doc = fitz.open(file_path) - reader = easyocr.Reader(["en"]) - result = "" - for i in range(doc.page_count): - page = doc.load_page(i) - pagetext = page.get_text().strip() - if pagetext: - result = result + pagetext - if len(doc.get_page_images(i)) > 0: - for img in doc.get_page_images(i): - if img: - pageimg = "" - xref = img[0] - img_data = doc.extract_image(xref) - img_bytes = img_data["image"] - pil_image = Image.open(io.BytesIO(img_bytes)) - img = np.array(pil_image) - img_result = reader.readtext(img, paragraph=True, detail=0) - pageimg = pageimg + ", ".join(img_result).strip() - if pageimg.endswith("!") or pageimg.endswith("?") or pageimg.endswith("."): - pass - else: - pageimg = pageimg + "." - result = result + pageimg - return result - - -def ingest_documents(folder_path, tag): - """Ingest PDF to Qdrant from the a given path.""" - # Load list of pdfs - doc_path = [os.path.join(folder_path, file) for file in os.listdir(folder_path)][0] - - print("Parsing...", doc_path) - - text_splitter = RecursiveCharacterTextSplitter(chunk_size=1500, chunk_overlap=100, add_start_index=True) - content = pdf_loader(doc_path) - chunks = text_splitter.split_text(content) - - print("Done preprocessing. Created ", len(chunks), " chunks of the original pdf") - # Create vectorstore - if EMBED_ENDPOINT: - # create embeddings using TEI endpoint service - embedder = HuggingFaceTEIDocumentEmbedder(url=EMBED_ENDPOINT) - else: - # create embeddings using local embedding model - embedder = SentenceTransformersDocumentEmbedder(model=EMBED_MODEL) - embedder.warm_up() - - # Initialize Qdrant store - qdrant_store = QdrantDocumentStore( - host=QDRANT_HOST, - port=QDRANT_PORT, - embedding_dim=EMBED_DIMENSION, - index=INDEX_NAME, - embedding_field="embedding", - similarity="cosine", - recreate_index=True, - ) - - # Batch size - batch_size = 32 - num_chunks = len(chunks) - for i in range(0, num_chunks, batch_size): - batch_chunks = chunks[i : i + batch_size] - batch_texts = [f"Tag: {tag}. " + chunk for chunk in batch_chunks] - documents = [Document(id=str(uuid.uuid4()), content=content) for content in batch_texts] - documents_with_embeddings = embedder.run(documents)["documents"] - qdrant_store.write_documents(documents_with_embeddings) - - print(f"Processed batch {i//batch_size + 1}/{(num_chunks-1)//batch_size + 1}") - - -if __name__ == "__main__": - parser = argparse.ArgumentParser(description="Ingest documents from a specified folder with a tag") - parser.add_argument("folder_path", type=str, help="Path to the folder containing documents") - parser.add_argument("--tag", type=str, default="", help="Tag to be used as an identifier") - - args = parser.parse_args() - ingest_documents(args.folder_path, args.tag) diff --git a/comps/retrievers/redis/data/nke-10k-2023.pdf b/comps/retrievers/redis/data/nke-10k-2023.pdf deleted file mode 100644 index 6ade8863e..000000000 Binary files a/comps/retrievers/redis/data/nke-10k-2023.pdf and /dev/null differ diff --git a/comps/retrievers/redis/langchain/README.md b/comps/retrievers/redis/langchain/README.md index db3161192..2172bcbc1 100644 --- a/comps/retrievers/redis/langchain/README.md +++ b/comps/retrievers/redis/langchain/README.md @@ -57,12 +57,24 @@ python retriever_redis.py ### 2.1 Setup Environment Variables +Two versions of retriever are supported for redis: text retriever and multimodal retriever. 
+Users need to set up different environment variables for each type of retriever, as shown below.
+
 ```bash
+# for text retriever
+export your_ip=$(hostname -I | awk '{print $1}')
 export RETRIEVE_MODEL_ID="BAAI/bge-base-en-v1.5"
 export REDIS_URL="redis://${your_ip}:6379"
 export INDEX_NAME=${your_index_name}
 export TEI_EMBEDDING_ENDPOINT="http://${your_ip}:6060"
 export HUGGINGFACEHUB_API_TOKEN=${your_hf_token}
+
+# for multimodal retriever
+export your_ip=$(hostname -I | awk '{print $1}')
+export RETRIEVE_MODEL_ID="BAAI/bge-base-en-v1.5"
+export REDIS_URL="redis://${your_ip}:6379"
+export INDEX_NAME=${your_index_name}
+export BRIDGE_TOWER_EMBEDDING=true
 ```

 ### 2.2 Build Docker Image
@@ -82,7 +94,10 @@ You can choose one as needed.

 ### 2.3 Run Docker with CLI (Option A)

 ```bash
+# Start a text retriever server
 docker run -d --name="retriever-redis-server" -p 7000:7000 --ipc=host -e http_proxy=$http_proxy -e https_proxy=$https_proxy -e REDIS_URL=$REDIS_URL -e INDEX_NAME=$INDEX_NAME -e TEI_EMBEDDING_ENDPOINT=$TEI_EMBEDDING_ENDPOINT -e HUGGINGFACEHUB_API_TOKEN=$HUGGINGFACEHUB_API_TOKEN opea/retriever-redis:latest
+# Start a multimodal retriever server
+docker run -d --name="retriever-multimodal-redis-server" -p 7000:7000 --ipc=host -e http_proxy=$http_proxy -e https_proxy=$https_proxy -e REDIS_URL=$REDIS_URL -e INDEX_NAME=$INDEX_NAME -e BRIDGE_TOWER_EMBEDDING=${BRIDGE_TOWER_EMBEDDING} opea/retriever-redis:latest
 ```

 ### 2.4 Run Docker with Docker Compose (Option B)
@@ -103,10 +118,20 @@ curl http://localhost:7000/v1/health_check \

 ### 3.2 Consume Embedding Service

-To consume the Retriever Microservice, you can generate a mock embedding vector of length 768 with Python.
+To consume the Retriever Microservice, you can generate a mock embedding vector with Python.
+
+The text and multimodal retrievers expect embedding vectors of different lengths (768 and 512, respectively), so generate the mock vector that matches the retriever you started, then validate with `curl`.

 ```bash
+# for text retriever
 export your_embedding=$(python -c "import random; embedding = [random.uniform(-1, 1) for _ in range(768)]; print(embedding)")
+# for multimodal retriever
+export your_embedding=$(python -c "import random; embedding = [random.uniform(-1, 1) for _ in range(512)]; print(embedding)")
+```
+
+A basic retrieval request with the default search settings:
+
+```bash
 curl http://${your_ip}:7000/v1/retrieval \
   -X POST \
   -d "{\"text\":\"What is the revenue of Nike in 2023?\",\"embedding\":${your_embedding}}" \
   -H 'Content-Type: application/json'
 ```

 You can set the parameters for the retriever.

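Each of the following calls picks a `search_type` plus its companion knobs. As a compact map of the request body they send (a sketch mirroring the parameter handling in `retriever_redis.py`; values are the examples' own):

```python
import random

# The JSON body the curl examples below send, with every supported knob in
# one place; the service dispatches on search_type.
embedding = [random.uniform(-1, 1) for _ in range(768)]  # 512 for the multimodal index
payload = {
    "text": "What is the revenue of Nike in 2023?",
    "embedding": embedding,
    "search_type": "mmr",  # or "similarity", "similarity_distance_threshold", "similarity_score_threshold"
    "k": 4,                # number of documents to return
    "fetch_k": 20,         # mmr only: candidate pool before diversification
    "lambda_mult": 0.5,    # mmr only: relevance/diversity trade-off
    # "distance_threshold": 1.0,  # similarity_distance_threshold only
    # "score_threshold": 0.2,     # similarity_score_threshold only
}
```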
```bash -export your_embedding=$(python -c "import random; embedding = [random.uniform(-1, 1) for _ in range(768)]; print(embedding)") curl http://localhost:7000/v1/retrieval \ -X POST \ -d "{\"text\":\"What is the revenue of Nike in 2023?\",\"embedding\":${your_embedding},\"search_type\":\"similarity\", \"k\":4}" \ @@ -124,7 +148,6 @@ curl http://localhost:7000/v1/retrieval \ ``` ```bash -export your_embedding=$(python -c "import random; embedding = [random.uniform(-1, 1) for _ in range(768)]; print(embedding)") curl http://localhost:7000/v1/retrieval \ -X POST \ -d "{\"text\":\"What is the revenue of Nike in 2023?\",\"embedding\":${your_embedding},\"search_type\":\"similarity_distance_threshold\", \"k\":4, \"distance_threshold\":1.0}" \ @@ -132,7 +155,6 @@ curl http://localhost:7000/v1/retrieval \ ``` ```bash -export your_embedding=$(python -c "import random; embedding = [random.uniform(-1, 1) for _ in range(768)]; print(embedding)") curl http://localhost:7000/v1/retrieval \ -X POST \ -d "{\"text\":\"What is the revenue of Nike in 2023?\",\"embedding\":${your_embedding},\"search_type\":\"similarity_score_threshold\", \"k\":4, \"score_threshold\":0.2}" \ @@ -140,7 +162,6 @@ curl http://localhost:7000/v1/retrieval \ ``` ```bash -export your_embedding=$(python -c "import random; embedding = [random.uniform(-1, 1) for _ in range(768)]; print(embedding)") curl http://localhost:7000/v1/retrieval \ -X POST \ -d "{\"text\":\"What is the revenue of Nike in 2023?\",\"embedding\":${your_embedding},\"search_type\":\"mmr\", \"k\":4, \"fetch_k\":20, \"lambda_mult\":0.5}" \ diff --git a/comps/retrievers/redis/langchain/docker_compose_retriever.yaml b/comps/retrievers/redis/langchain/docker_compose_retriever.yaml index 90c4f6a7f..5ce13af87 100644 --- a/comps/retrievers/redis/langchain/docker_compose_retriever.yaml +++ b/comps/retrievers/redis/langchain/docker_compose_retriever.yaml @@ -27,6 +27,7 @@ services: INDEX_NAME: ${INDEX_NAME} TEI_EMBEDDING_ENDPOINT: ${TEI_EMBEDDING_ENDPOINT} HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN} + BRIDGE_TOWER_EMBEDDING: ${BRIDGE_TOWER_EMBEDDING} restart: unless-stopped networks: diff --git a/comps/retrievers/redis/langchain/ingest.py b/comps/retrievers/redis/langchain/ingest.py deleted file mode 100644 index c064c4158..000000000 --- a/comps/retrievers/redis/langchain/ingest.py +++ /dev/null @@ -1,121 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -# Copyright (C) 2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - -# - -import io -import os -import time - -import numpy as np -from langchain.text_splitter import RecursiveCharacterTextSplitter -from langchain_community.embeddings import HuggingFaceBgeEmbeddings, HuggingFaceHubEmbeddings -from langchain_community.vectorstores import Redis -from PIL import Image -from redis_config import EMBED_MODEL, INDEX_NAME, REDIS_URL - -tei_embedding_endpoint = os.getenv("TEI_EMBEDDING_ENDPOINT") - -timeout = 180 # Timeout in seconds -check_interval = 5 # Check every 5 seconds - - -def check_embedding_endpoint(embedder): - try: - test_sentence = "embedder available test." 
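        # Probe the endpoint with a single short document: a successful call
        # means the TEI embedding service is up, while any exception is
        # treated as "not ready yet" by the polling loop in ingest_documents().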
- embedder.embed_documents([test_sentence]) - return True - except Exception as e: - print(f"Error embedder is unavailable: {e}") - return False - - -def pdf_loader(file_path): - try: - import easyocr - import fitz - except ImportError: - raise ImportError( - "`PyMuPDF` or 'easyocr' package is not found, please install it with " - "`pip install pymupdf or pip install easyocr.`" - ) - - doc = fitz.open(file_path) - reader = easyocr.Reader(["en"]) - result = "" - for i in range(doc.page_count): - page = doc.load_page(i) - pagetext = page.get_text().strip() - if pagetext: - result = result + pagetext - if len(doc.get_page_images(i)) > 0: - for img in doc.get_page_images(i): - if img: - pageimg = "" - xref = img[0] - img_data = doc.extract_image(xref) - img_bytes = img_data["image"] - pil_image = Image.open(io.BytesIO(img_bytes)) - img = np.array(pil_image) - img_result = reader.readtext(img, paragraph=True, detail=0) - pageimg = pageimg + ", ".join(img_result).strip() - if pageimg.endswith("!") or pageimg.endswith("?") or pageimg.endswith("."): - pass - else: - pageimg = pageimg + "." - result = result + pageimg - return result - - -def ingest_documents(): - """Ingest PDF to Redis from the data/ directory that - contains Edgar 10k filings data for Nike.""" - # Load list of pdfs - company_name = "Nike" - data_path = "data/" - doc_path = [os.path.join(data_path, file) for file in os.listdir(data_path)][0] - - print("Parsing 10k filing doc for NIKE", doc_path) - - text_splitter = RecursiveCharacterTextSplitter(chunk_size=1500, chunk_overlap=100, add_start_index=True) - content = pdf_loader(doc_path) - chunks = text_splitter.split_text(content) - - print("Done preprocessing. Created ", len(chunks), " chunks of the original pdf") - # Create vectorstore - if tei_embedding_endpoint: - # create embeddings using TEI endpoint service - embedder = HuggingFaceHubEmbeddings(model=tei_embedding_endpoint) - start_time = time.time() - while time.time() - start_time < timeout: - is_available = check_embedding_endpoint(embedder) - if is_available: - print("Embedder endpoint is available.") - break - else: - print("Embedder endpoint is not available. Checking again in 5 seconds...") - time.sleep(check_interval) - else: - # create embeddings using local embedding model - embedder = HuggingFaceBgeEmbeddings(model_name=EMBED_MODEL) - - # Batch size - batch_size = 32 - num_chunks = len(chunks) - for i in range(0, num_chunks, batch_size): - batch_chunks = chunks[i : i + batch_size] - batch_texts = [f"Company: {company_name}. 
" + chunk for chunk in batch_chunks] - - _ = Redis.from_texts( - texts=batch_texts, - embedding=embedder, - index_name=INDEX_NAME, - redis_url=REDIS_URL, - ) - print(f"Processed batch {i//batch_size + 1}/{(num_chunks-1)//batch_size + 1}") - - -if __name__ == "__main__": - ingest_documents() diff --git a/comps/retrievers/redis/langchain/redis_config.py b/comps/retrievers/redis/langchain/redis_config.py index 35233d8ff..ade017e7b 100644 --- a/comps/retrievers/redis/langchain/redis_config.py +++ b/comps/retrievers/redis/langchain/redis_config.py @@ -73,3 +73,6 @@ def format_redis_conn_from_env(): current_file_path = os.path.abspath(__file__) parent_dir = os.path.dirname(current_file_path) +REDIS_SCHEMA = os.getenv("REDIS_SCHEMA", "redis_schema_multi.yml") +schema_path = os.path.join(parent_dir, REDIS_SCHEMA) +INDEX_SCHEMA = schema_path diff --git a/comps/retrievers/multimodal/redis/langchain/schema.yml b/comps/retrievers/redis/langchain/redis_schema_multi.yml similarity index 100% rename from comps/retrievers/multimodal/redis/langchain/schema.yml rename to comps/retrievers/redis/langchain/redis_schema_multi.yml diff --git a/comps/retrievers/redis/langchain/requirements.txt b/comps/retrievers/redis/langchain/requirements.txt index f29317dd9..96a303bd7 100644 --- a/comps/retrievers/redis/langchain/requirements.txt +++ b/comps/retrievers/redis/langchain/requirements.txt @@ -11,4 +11,5 @@ pymupdf redis sentence_transformers shortuuid +transformers uvicorn diff --git a/comps/retrievers/redis/langchain/retriever_redis.py b/comps/retrievers/redis/langchain/retriever_redis.py index ada07d236..da531adb4 100644 --- a/comps/retrievers/redis/langchain/retriever_redis.py +++ b/comps/retrievers/redis/langchain/retriever_redis.py @@ -8,12 +8,14 @@ from langchain_community.embeddings import HuggingFaceBgeEmbeddings from langchain_community.vectorstores import Redis from langchain_huggingface import HuggingFaceEndpointEmbeddings -from redis_config import EMBED_MODEL, INDEX_NAME, REDIS_URL +from redis_config import EMBED_MODEL, INDEX_NAME, INDEX_SCHEMA, REDIS_URL from comps import ( CustomLogger, EmbedDoc, + EmbedMultimodalDoc, SearchedDoc, + SearchedMultimodalDoc, ServiceType, TextDoc, opea_microservices, @@ -28,11 +30,13 @@ RetrievalResponse, RetrievalResponseData, ) +from comps.embeddings.multimodal.bridgetower import BridgeTowerEmbedding logger = CustomLogger("retriever_redis") logflag = os.getenv("LOGFLAG", False) tei_embedding_endpoint = os.getenv("TEI_EMBEDDING_ENDPOINT") +bridge_tower_embedding = os.getenv("BRIDGE_TOWER_EMBEDDING") @register_microservice( @@ -44,8 +48,8 @@ ) @register_statistics(names=["opea_service@retriever_redis"]) async def retrieve( - input: Union[EmbedDoc, RetrievalRequest, ChatCompletionRequest] -) -> Union[SearchedDoc, RetrievalResponse, ChatCompletionRequest]: + input: Union[EmbedDoc, EmbedMultimodalDoc, RetrievalRequest, ChatCompletionRequest] +) -> Union[SearchedDoc, SearchedMultimodalDoc, RetrievalResponse, ChatCompletionRequest]: if logflag: logger.info(input) start = time.time() @@ -53,21 +57,16 @@ async def retrieve( if vector_db.client.keys() == []: search_res = [] else: - if isinstance(input, EmbedDoc): - query = input.text + if isinstance(input, EmbedDoc) or isinstance(input, EmbedMultimodalDoc): embedding_data_input = input.embedding else: # for RetrievalRequest, ChatCompletionRequest - query = input.input if isinstance(input.embedding, EmbeddingResponse): embeddings = input.embedding.data embedding_data_input = [] for emb in embeddings: # each emb is 
EmbeddingResponseData - # print("Embedding data: ", emb.embedding) - # print("Embedding data length: ",len(emb.embedding)) embedding_data_input.append(emb.embedding) - # print("All Embedding data length: ",len(embedding_data_input)) else: embedding_data_input = input.embedding @@ -98,6 +97,12 @@ async def retrieve( for r in search_res: retrieved_docs.append(TextDoc(text=r.page_content)) result = SearchedDoc(retrieved_docs=retrieved_docs, initial_query=input.text) + elif isinstance(input, EmbedMultimodalDoc): + metadata_list = [] + for r in search_res: + metadata_list.append(r.metadata) + retrieved_docs.append(TextDoc(text=r.page_content)) + result = SearchedMultimodalDoc(retrieved_docs=retrieved_docs, initial_query=input.text, metadata=metadata_list) else: for r in search_res: retrieved_docs.append(RetrievalResponseData(text=r.page_content, metadata=r.metadata)) @@ -119,9 +124,14 @@ async def retrieve( if tei_embedding_endpoint: # create embeddings using TEI endpoint service embeddings = HuggingFaceEndpointEmbeddings(model=tei_embedding_endpoint) + vector_db = Redis(embedding=embeddings, index_name=INDEX_NAME, redis_url=REDIS_URL) + elif bridge_tower_embedding: + # create embeddings using BridgeTower service + embeddings = BridgeTowerEmbedding() + vector_db = Redis(embedding=embeddings, index_name=INDEX_NAME, index_schema=INDEX_SCHEMA, redis_url=REDIS_URL) else: # create embeddings using local embedding model embeddings = HuggingFaceBgeEmbeddings(model_name=EMBED_MODEL) + vector_db = Redis(embedding=embeddings, index_name=INDEX_NAME, redis_url=REDIS_URL) - vector_db = Redis(embedding=embeddings, index_name=INDEX_NAME, redis_url=REDIS_URL) opea_microservices["opea_service@retriever_redis"].start() diff --git a/tests/retrievers/test_retrievers_multimodal_redis_langchain.sh b/tests/retrievers/test_retrievers_multimodal_redis_langchain.sh deleted file mode 100644 index 873516ddc..000000000 --- a/tests/retrievers/test_retrievers_multimodal_redis_langchain.sh +++ /dev/null @@ -1,84 +0,0 @@ -#!/bin/bash -# Copyright (C) 2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - -set -x - -WORKPATH=$(dirname "$PWD") -LOG_PATH="$WORKPATH/tests" -ip_address=$(hostname -I | awk '{print $1}') - -function build_docker_images() { - cd $WORKPATH - docker build --no-cache -t opea/retriever-multimodal-redis:comps --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/retrievers/multimodal/redis/langchain/Dockerfile . - if [ $? 
-ne 0 ]; then - echo "opea/retriever-multimodal-redis built fail" - exit 1 - else - echo "opea/retriever-multimodal-redis built successful" - fi -} - -function start_service() { - # redis - docker run -d --name test-comps-multimodal-retriever-redis-vector-db -p 5689:6379 -p 5011:8001 -e HTTPS_PROXY=$https_proxy -e HTTP_PROXY=$https_proxy redis/redis-stack:7.2.0-v9 - sleep 10s - - # redis retriever - export REDIS_URL="redis://${ip_address}:5689" - export INDEX_NAME="rag-redis" - retriever_port=5434 - unset http_proxy - docker run -d --name="test-comps-retriever-multimodal-redis" -p ${retriever_port}:7000 --ipc=host -e http_proxy=$http_proxy -e https_proxy=$https_proxy -e REDIS_URL=$REDIS_URL -e INDEX_NAME=$INDEX_NAME opea/retriever-multimodal-redis:comps - - sleep 5m -} - -function validate_microservice() { - retriever_port=5434 - export PATH="${HOME}/miniforge3/bin:$PATH" - source activate - URL="http://${ip_address}:$retriever_port/v1/multimodal_retrieval" - test_embedding=$(python -c "import random; embedding = [random.uniform(-1, 1) for _ in range(512)]; print(embedding)") - - HTTP_STATUS=$(curl -s -o /dev/null -w "%{http_code}" -X POST -d "{\"text\":\"test\",\"embedding\":${test_embedding}}" -H 'Content-Type: application/json' "$URL") - if [ "$HTTP_STATUS" -eq 200 ]; then - echo "[ retriever ] HTTP status is 200. Checking content..." - local CONTENT=$(curl -s -X POST -d "{\"text\":\"test\",\"embedding\":${test_embedding}}" -H 'Content-Type: application/json' "$URL" | tee ${LOG_PATH}/retriever.log) - - if echo "$CONTENT" | grep -q "retrieved_docs"; then - echo "[ retriever ] Content is as expected." - else - echo "[ retriever ] Content does not match the expected result: $CONTENT" - docker logs test-comps-retriever-multimodal-redis >> ${LOG_PATH}/retriever.log - exit 1 - fi - else - echo "[ retriever ] HTTP status is not 200. Received status was $HTTP_STATUS" - docker logs test-comps-retriever-multimodal-redis >> ${LOG_PATH}/retriever.log - exit 1 - fi -} - -function stop_docker() { - cid_retrievers=$(docker ps -aq --filter "name=test-comps-*") - if [[ ! 
-z "$cid_retrievers" ]]; then - docker stop $cid_retrievers && docker rm $cid_retrievers && sleep 1s - fi -} - -function main() { - - stop_docker - - build_docker_images - start_service - - validate_microservice - - stop_docker - echo y | docker system prune - -} - -main diff --git a/tests/retrievers/test_retrievers_redis_langchain.sh b/tests/retrievers/test_retrievers_redis_langchain.sh index dd34a2a0f..685d20ba4 100644 --- a/tests/retrievers/test_retrievers_redis_langchain.sh +++ b/tests/retrievers/test_retrievers_redis_langchain.sh @@ -42,14 +42,29 @@ function start_service() { sleep 3m } +function start_multimodal_service() { + # redis + docker run -d --name test-comps-retriever-redis-vector-db -p 5689:6379 -p 5011:8001 -e HTTPS_PROXY=$https_proxy -e HTTP_PROXY=$https_proxy redis/redis-stack:7.2.0-v9 + sleep 10s + + # redis retriever + export REDIS_URL="redis://${ip_address}:5689" + export INDEX_NAME="rag-redis" + retriever_port=5435 + unset http_proxy + docker run -d --name="test-comps-retriever-redis-server" -p ${retriever_port}:7000 --ipc=host -e http_proxy=$http_proxy -e https_proxy=$https_proxy -e REDIS_URL=$REDIS_URL -e INDEX_NAME=$INDEX_NAME -e BRIDGE_TOWER_EMBEDDING=true opea/retriever-redis:comps + + sleep 2m +} + function validate_microservice() { + local test_embedding="$1" + retriever_port=5435 export PATH="${HOME}/miniforge3/bin:$PATH" source activate URL="http://${ip_address}:$retriever_port/v1/retrieval" - test_embedding=$(python -c "import random; embedding = [random.uniform(-1, 1) for _ in range(768)]; print(embedding)") - HTTP_STATUS=$(curl -s -o /dev/null -w "%{http_code}" -X POST -d "{\"text\":\"test\",\"embedding\":${test_embedding}}" -H 'Content-Type: application/json' "$URL") if [ "$HTTP_STATUS" -eq 200 ]; then echo "[ retriever ] HTTP status is 200. Checking content..." @@ -60,13 +75,11 @@ function validate_microservice() { else echo "[ retriever ] Content does not match the expected result: $CONTENT" docker logs test-comps-retriever-redis-server >> ${LOG_PATH}/retriever.log - docker logs test-comps-retriever-redis-tei-endpoint >> ${LOG_PATH}/tei.log exit 1 fi else echo "[ retriever ] HTTP status is not 200. Received status was $HTTP_STATUS" docker logs test-comps-retriever-redis-server >> ${LOG_PATH}/retriever.log - docker logs test-comps-retriever-redis-tei-endpoint >> ${LOG_PATH}/tei.log exit 1 fi } @@ -81,12 +94,20 @@ function stop_docker() { function main() { stop_docker - build_docker_images + + # test text retriever start_service + test_embedding=$(python -c "import random; embedding = [random.uniform(-1, 1) for _ in range(768)]; print(embedding)") + validate_microservice "$test_embedding" + stop_docker - validate_microservice + # test multimodal retriever + start_multimodal_service + test_embedding_multi=$(python -c "import random; embedding = [random.uniform(-1, 1) for _ in range(512)]; print(embedding)") + validate_microservice "$test_embedding_multi" + # clean env stop_docker echo y | docker system prune