diff --git a/.github/workflows/docker/compose/retrievers-compose.yaml b/.github/workflows/docker/compose/retrievers-compose.yaml
index 7b89ce9bf..bfc8a29a5 100644
--- a/.github/workflows/docker/compose/retrievers-compose.yaml
+++ b/.github/workflows/docker/compose/retrievers-compose.yaml
@@ -47,3 +47,7 @@ services:
     build:
       dockerfile: comps/retrievers/neo4j/llama_index/Dockerfile
     image: ${REGISTRY:-opea}/retriever-neo4j-llamaindex:${TAG:-latest}
+  retriever-arango:
+    build:
+      dockerfile: comps/retrievers/arango/langchain/Dockerfile
+    image: ${REGISTRY:-opea}/retriever-arango:${TAG:-latest}
diff --git a/comps/retrievers/arango/langchain/Dockerfile b/comps/retrievers/arango/langchain/Dockerfile
new file mode 100644
index 000000000..043bf7460
--- /dev/null
+++ b/comps/retrievers/arango/langchain/Dockerfile
@@ -0,0 +1,35 @@
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+FROM python:3.11-slim
+
+ENV LANG=C.UTF-8
+
+ENV HUGGINGFACEHUB_API_TOKEN=dummy
+
+ARG ARCH="cpu"
+
+RUN apt-get update -y && apt-get install -y --no-install-recommends --fix-missing \
+    build-essential \
+    libgl1-mesa-glx \
+    libjemalloc-dev
+
+RUN useradd -m -s /bin/bash user && \
+    mkdir -p /home/user && \
+    chown -R user /home/user/
+
+USER user
+
+COPY comps /home/user/comps
+
+# Install the CPU-only torch wheel when ARCH=cpu, then the requirements of this component.
+RUN pip install --no-cache-dir --upgrade pip setuptools && \
+    if [ "${ARCH}" = "cpu" ]; then pip install --no-cache-dir torch --index-url https://download.pytorch.org/whl/cpu; fi && \
+    pip install --no-cache-dir -r /home/user/comps/retrievers/arango/langchain/requirements.txt
+
+ENV PYTHONPATH=$PYTHONPATH:/home/user
+
+# Must match the directory this component is added under (comps/retrievers/arango/langchain).
+WORKDIR /home/user/comps/retrievers/arango/langchain
+
+ENTRYPOINT ["python", "retriever_arangodb.py"]
diff --git a/comps/retrievers/arango/langchain/README.md b/comps/retrievers/arango/langchain/README.md
new file mode 100644
index 000000000..6e0c7a907
--- /dev/null
+++ b/comps/retrievers/arango/langchain/README.md
@@ -0,0 +1,111 @@
+# 
Retriever Microservice with ArangoDB (work-in-progress)
+
+## 🚀1. Start Microservice with Python
+
+### Install Requirements
+
+```bash
+pip install -r requirements.txt
+```
+
+### Start ArangoDB Server
+
+To launch ArangoDB locally, first ensure you have Docker installed. Then launch the database with the following command.
+
+**TODO: Switch to official image when ready**
+
+```bash
+docker run \
+  -p 8529:8529 \
+  -e ARANGO_ROOT_PASSWORD=test \
+  jbajic/arangodb-arm:vector-index-preview
+```
+
+### Setup Environment Variables
+
+```bash
+export no_proxy=${your_no_proxy}
+export http_proxy=${your_http_proxy} https_proxy=${your_http_proxy}
+export ARANGO_URL=${your_arangodb_url}
+export ARANGO_USERNAME=${your_arangodb_username}
+export ARANGO_PASSWORD=${your_arangodb_password}
+export ARANGO_DB_NAME=${your_arangodb_database}
+export ARANGO_EMBED_DIMENSION=${your_embedding_dimension}
+```
+
+### Start Retriever Service
+
+```bash
+python retriever_arangodb.py
+```
+
+## 🚀2. Start Microservice with Docker
+
+### Build Docker Image
+
+```bash
+cd ../../../../
+docker build -t opea/retriever-arango:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/retrievers/arango/langchain/Dockerfile .
+```
+
+### Run Docker with CLI
+
+```bash
+docker run -d --name="retriever-arango-server" -p 7000:7000 --ipc=host -e http_proxy=$http_proxy -e https_proxy=$https_proxy -e ARANGO_URL=${your_arangodb_url} -e ARANGO_EMBED_DIMENSION=${your_embedding_dimension} opea/retriever-arango:latest
+```
+
+## 🚀3. Consume Retriever Service
+
+### 3.1 Check Service Status
+
+```bash
+curl http://${your_ip}:7000/v1/health_check \
+  -X GET \
+  -H 'Content-Type: application/json'
+```
+
+### 3.2 Consume Retriever Service
+
+To consume the Retriever Microservice, you can generate a mock embedding vector of length 768 with Python.
+
+```bash
+export your_embedding=$(python -c "import random; embedding = [random.uniform(-1, 1) for _ in range(768)]; print(embedding)")
+curl http://${your_ip}:7000/v1/retrieval \
+  -X POST \
+  -d "{\"text\":\"What is the revenue of Nike in 2023?\",\"embedding\":${your_embedding}}" \
+  -H 'Content-Type: application/json'
+```
+
+You can also set the search parameters of the retriever, as in the following examples.
+
+```bash
+export your_embedding=$(python -c "import random; embedding = [random.uniform(-1, 1) for _ in range(768)]; print(embedding)")
+curl http://localhost:7000/v1/retrieval \
+  -X POST \
+  -d "{\"text\":\"What is the revenue of Nike in 2023?\",\"embedding\":${your_embedding},\"search_type\":\"similarity\", \"k\":4}" \
+  -H 'Content-Type: application/json'
+```
+
+```bash
+export your_embedding=$(python -c "import random; embedding = [random.uniform(-1, 1) for _ in range(768)]; print(embedding)")
+curl http://localhost:7000/v1/retrieval \
+  -X POST \
+  -d "{\"text\":\"What is the revenue of Nike in 2023?\",\"embedding\":${your_embedding},\"search_type\":\"similarity_distance_threshold\", \"k\":4, \"distance_threshold\":1.0}" \
+  -H 'Content-Type: application/json'
+```
+
+```bash
+export your_embedding=$(python -c "import random; embedding = [random.uniform(-1, 1) for _ in range(768)]; print(embedding)")
+curl http://localhost:7000/v1/retrieval \
+  -X POST \
+  -d "{\"text\":\"What is the revenue of Nike in 2023?\",\"embedding\":${your_embedding},\"search_type\":\"similarity_score_threshold\", \"k\":4, \"score_threshold\":0.2}" \
+  -H 'Content-Type: application/json'
+```
+
+```bash
+export your_embedding=$(python -c "import random; embedding = [random.uniform(-1, 1) for _ in range(768)]; print(embedding)")
+curl http://localhost:7000/v1/retrieval \
+  -X POST \
+  -d "{\"text\":\"What is the revenue of Nike in 2023?\",\"embedding\":${your_embedding},\"search_type\":\"mmr\", \"k\":4, \"fetch_k\":20, \"lambda_mult\":0.5}" \
+  -H 'Content-Type: application/json'
+```
diff --git 
a/comps/retrievers/arango/langchain/__init__.py b/comps/retrievers/arango/langchain/__init__.py
new file mode 100644
index 000000000..916f3a44b
--- /dev/null
+++ b/comps/retrievers/arango/langchain/__init__.py
@@ -0,0 +1,2 @@
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
diff --git a/comps/retrievers/arango/langchain/config.py b/comps/retrievers/arango/langchain/config.py
new file mode 100644
index 000000000..8d8e089fe
--- /dev/null
+++ b/comps/retrievers/arango/langchain/config.py
@@ -0,0 +1,29 @@
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+import os
+
+# ArangoDB configuration
+ARANGO_URL = os.getenv("ARANGO_URL", "http://localhost:8529")
+ARANGO_USERNAME = os.getenv("ARANGO_USERNAME", "root")
+ARANGO_PASSWORD = os.getenv("ARANGO_PASSWORD", "test")
+ARANGO_DB_NAME = os.getenv("ARANGO_DB_NAME", "_system")
+ARANGO_COLLECTION_NAME = os.getenv("ARANGO_COLLECTION_NAME", "SOURCE")
+ARANGO_DISTANCE_STRATEGY = os.getenv("ARANGO_DISTANCE_STRATEGY", "COSINE")
+ARANGO_TEXT_FIELD = os.getenv("ARANGO_TEXT_FIELD", "text")
+# NOTE: the constant name is misspelled ("EMBBEDDING") but kept, because retriever_arangodb.py imports it.
+ARANGO_EMBBEDDING_FIELD = os.getenv("ARANGO_EMBEDDING_FIELD", "metadata.embedding")
+# os.getenv returns strings; convert numeric settings so they always have an int type (or None when unset).
+_embed_dimension = os.getenv("ARANGO_EMBED_DIMENSION")
+ARANGO_EMBED_DIMENSION = int(_embed_dimension) if _embed_dimension else None
+ARANGO_NUM_CENTROIDS = int(os.getenv("ARANGO_NUM_CENTROIDS", "1"))
+ARANGO_GRAPH_NAME = os.getenv("ARANGO_GRAPH_NAME")
+
+# Embedding configuration
+EMBED_MODEL = os.getenv("EMBED_MODEL", "BAAI/bge-base-en-v1.5")
+EMBED_ENDPOINT = os.getenv("TEI_EMBEDDING_ENDPOINT", "")
+HUGGINGFACEHUB_API_TOKEN = os.getenv("HUGGINGFACEHUB_API_TOKEN")
+
+# OpenAI configuration (alternative to TEI & local model)
+OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
+OPENAI_EMBED_MODEL = os.getenv("OPENAI_EMBED_MODEL", "text-embedding-3-small")
diff --git a/comps/retrievers/arango/langchain/requirements.txt b/comps/retrievers/arango/langchain/requirements.txt
new file mode 100644
index 000000000..f1d40dbbe
--- /dev/null
+++ b/comps/retrievers/arango/langchain/requirements.txt
@@ -0,0 +1,22 @@
+docarray[full]
+fastapi
+frontend
+huggingface_hub
+langchain
+git+https://github.com/arangoml/langchain.git@arangodb#subdirectory=libs/community
+langchain_openai
+python-arango
+numpy
+opentelemetry-api
+opentelemetry-exporter-otlp
+opentelemetry-sdk
+pandas
+Pillow
+prometheus-fastapi-instrumentator
+pydantic
+pymupdf
+python-docx
+sentence_transformers
+shortuuid
+tiktoken
+uvicorn
diff --git a/comps/retrievers/arango/langchain/retriever_arangodb.py b/comps/retrievers/arango/langchain/retriever_arangodb.py
new file mode 100644
index 000000000..17df9e6d7
--- /dev/null
+++ b/comps/retrievers/arango/langchain/retriever_arangodb.py
@@ -0,0 +1,190 @@
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+import os
+import time
+from typing import Union
+
+from arango import ArangoClient
+from config import (
+    ARANGO_COLLECTION_NAME,
+    ARANGO_DB_NAME,
+    ARANGO_DISTANCE_STRATEGY,
+    ARANGO_EMBBEDDING_FIELD,
+    ARANGO_NUM_CENTROIDS,
+    ARANGO_PASSWORD,
+    ARANGO_GRAPH_NAME,
+    ARANGO_TEXT_FIELD,
+    ARANGO_URL,
+    ARANGO_USERNAME,
+    ARANGO_EMBED_DIMENSION,
+    EMBED_ENDPOINT,
+    EMBED_MODEL,
+    HUGGINGFACEHUB_API_TOKEN,
+    OPENAI_API_KEY,
+    OPENAI_EMBED_MODEL,
+)
+from langchain_community.embeddings import HuggingFaceBgeEmbeddings, HuggingFaceHubEmbeddings
+from langchain_community.vectorstores.arangodb_vector import ArangoVector
+from langchain_openai import OpenAIEmbeddings
+
+from comps import (
+    CustomLogger,
+    EmbedDoc,
+    SearchedDoc,
+    ServiceType,
+    TextDoc,
+    opea_microservices,
+    register_microservice,
+    register_statistics,
+    statistics_dict,
+)
+from comps.cores.proto.api_protocol import (
+    ChatCompletionRequest,
+    RetrievalRequest,
+    RetrievalResponse,
+    RetrievalResponseData,
+)
+
+logger = CustomLogger("retriever_arangodb")
+logflag = os.getenv("LOGFLAG", False)
+
+
+@register_microservice(
+    name="opea_service@retriever_arangodb",
+    service_type=ServiceType.RETRIEVER,
+    endpoint="/v1/retrieval",
+    host="0.0.0.0",
+    port=7000,
+)
+@register_statistics(names=["opea_service@retriever_arangodb"])
+async def retrieve(
+    input: Union[EmbedDoc, RetrievalRequest, ChatCompletionRequest]
+) -> Union[SearchedDoc, RetrievalResponse, ChatCompletionRequest]:
+    """Retrieve documents from the ArangoDB vector store for the given request.
+
+    Dispatches on ``input.search_type`` ("similarity", "similarity_distance_threshold",
+    "similarity_score_threshold" or "mmr") and wraps the hits in the response type
+    that corresponds to the type of ``input``.
+
+    Raises:
+        ValueError: if the search type is unknown, or a required embedding or
+            distance threshold is missing.
+    """
+    if logflag:
+        logger.info(input)
+    start = time.time()
+
+    # Create the vector index on demand, but only once the collection holds documents.
+    index = vector_db.retrieve_vector_index()
+    if index is None and db.collection(vector_db.collection_name).count() > 0:
+        vector_db.create_vector_index()
+
+    query = input.text if isinstance(input, EmbedDoc) else input.input
+    # Only a plain list is forwarded as the query vector; any other value becomes None.
+    embedding = input.embedding if isinstance(input.embedding, list) else None
+    k = input.k
+
+    if input.search_type == "similarity":
+        if not embedding:
+            raise ValueError("Embedding must be provided for similarity retriever")
+
+        search_res = await vector_db.asimilarity_search_by_vector(query=query, embedding=embedding, k=k)
+    elif input.search_type == "similarity_distance_threshold":
+        if input.distance_threshold is None:
+            raise ValueError("distance_threshold must be provided for similarity_distance_threshold retriever")
+        if not embedding:
+            raise ValueError("Embedding must not be None for similarity_distance_threshold retriever")
+
+        search_res = await vector_db.asimilarity_search_by_vector(
+            query=query,
+            embedding=embedding,
+            k=k,
+            distance_threshold=input.distance_threshold,
+        )
+    elif input.search_type == "similarity_score_threshold":
+        docs_and_similarities = await vector_db.asimilarity_search_with_relevance_scores(
+            query=query, embedding=embedding, k=k, score_threshold=input.score_threshold
+        )
+        search_res = [doc for doc, _ in docs_and_similarities]
+    elif input.search_type == "mmr":
+        search_res = await vector_db.amax_marginal_relevance_search(
+            query=query, embedding=embedding, k=k, fetch_k=input.fetch_k, lambda_mult=input.lambda_mult
+        )
+    else:
+        raise ValueError(f"Search Type '{input.search_type}' not valid")
+
+    # return different response format
+    retrieved_docs = []
+    if isinstance(input, EmbedDoc):
+        for r in search_res:
+            retrieved_docs.append(TextDoc(text=r.page_content, id=r.id))
+        result = SearchedDoc(retrieved_docs=retrieved_docs, initial_query=input.text)
+    else:
+        for r in search_res:
+            retrieved_docs.append(RetrievalResponseData(text=r.page_content, id=r.id, metadata=r.metadata))
+        if isinstance(input, RetrievalRequest):
+            result = RetrievalResponse(retrieved_docs=retrieved_docs)
+        elif isinstance(input, ChatCompletionRequest):
+            input.retrieved_docs = retrieved_docs
+            input.documents = [doc.text for doc in retrieved_docs]
+            result = input
+
+    # if ARANGO_GRAPH_NAME:
+    #     # TODO: Sample neighborhood from the graph?
+    #     retrieved_docs_keys = [doc.id for doc in retrieved_docs]
+
+    #     query = """
+    #         FOR doc IN @@collection
+    #             FILTER doc._key IN @keys
+
+    #             FOR v, e IN 1..1 ANY doc GRAPH @graph
+    #                 RETURN ?
+    #     """
+
+    #     pass
+
+    statistics_dict["opea_service@retriever_arangodb"].append_latency(time.time() - start, None)
+
+    if logflag:
+        logger.info(result)
+
+    return result
+
+
+if __name__ == "__main__":
+
+    if not ARANGO_EMBED_DIMENSION:
+        raise ValueError("ARANGO_EMBED_DIMENSION must be specified in advance.")
+
+    if OPENAI_API_KEY and OPENAI_EMBED_MODEL:
+        # Use OpenAI embeddings
+        embeddings = OpenAIEmbeddings(model=OPENAI_EMBED_MODEL, dimensions=ARANGO_EMBED_DIMENSION)
+
+    elif EMBED_ENDPOINT and HUGGINGFACEHUB_API_TOKEN:
+        # create embeddings using TEI endpoint service
+        embeddings = HuggingFaceHubEmbeddings(model=EMBED_ENDPOINT, huggingfacehub_api_token=HUGGINGFACEHUB_API_TOKEN)
+    else:
+        # create embeddings using local embedding model
+        embeddings = HuggingFaceBgeEmbeddings(model_name=EMBED_MODEL)
+
+    client = ArangoClient(hosts=ARANGO_URL)
+    sys_db = client.db(name="_system", username=ARANGO_USERNAME, password=ARANGO_PASSWORD, verify=True)
+
+    if not sys_db.has_database(ARANGO_DB_NAME):
+        sys_db.create_database(ARANGO_DB_NAME)
+
+    db = client.db(name=ARANGO_DB_NAME, username=ARANGO_USERNAME, password=ARANGO_PASSWORD, verify=True)
+
+    vector_db = ArangoVector(
+        
embedding=embeddings,
+        embedding_dimension=ARANGO_EMBED_DIMENSION,
+        database=db,
+        collection_name=ARANGO_COLLECTION_NAME,
+        embedding_field=ARANGO_EMBBEDDING_FIELD,
+        text_field=ARANGO_TEXT_FIELD,
+        distance_strategy=ARANGO_DISTANCE_STRATEGY,
+        num_centroids=ARANGO_NUM_CENTROIDS,
+    )
+
+    opea_microservices["opea_service@retriever_arangodb"].start()
diff --git a/tests/retrievers/test_retrievers_arango_langchain.sh b/tests/retrievers/test_retrievers_arango_langchain.sh
new file mode 100644
index 000000000..e6a90786a
--- /dev/null
+++ b/tests/retrievers/test_retrievers_arango_langchain.sh
@@ -0,0 +1,109 @@
+#!/bin/bash
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+set -x
+
+WORKPATH=$(dirname "$PWD")
+LOG_PATH="$WORKPATH/tests"
+ip_address=$(hostname -I | awk '{print $1}')
+
+# Start the ArangoDB test database and build the retriever image under test.
+function build_docker_images() {
+    cd "$WORKPATH" || exit 1
+    # The retriever is started with ARANGO_URL on port 8529 and root/test credentials,
+    # so the database must publish 8529 and set the same root password.
+    # TODO: image taken from the component README; switch to the official image when ready.
+    docker run -d -p 8529:8529 --name test-comps-arango-apoc1 -e ARANGO_ROOT_PASSWORD=test jbajic/arangodb-arm:vector-index-preview
+    sleep 30s
+
+    if ! docker build --no-cache -t opea/retriever-arango:comps --build-arg https_proxy="$https_proxy" --build-arg http_proxy="$http_proxy" -f comps/retrievers/arango/langchain/Dockerfile .; then
+        echo "opea/retriever-arango built fail"
+        exit 1
+    else
+        echo "opea/retriever-arango built successful"
+    fi
+}
+
+# Launch the TEI embedding endpoint and the retriever container wired to the test database.
+function start_service() {
+    # tei endpoint
+    tei_endpoint=5434
+    model="BAAI/bge-base-en-v1.5"
+    docker run -d --name="test-comps-retriever-arango-tei-endpoint" -p $tei_endpoint:80 -v ./data:/data --pull always ghcr.io/huggingface/text-embeddings-inference:cpu-1.5 --model-id $model
+    sleep 30s
+    export TEI_EMBEDDING_ENDPOINT="http://${ip_address}:${tei_endpoint}"
+
+    # Arango retriever
+    export ARANGO_URL="http://${ip_address}:8529"
+    export ARANGO_USERNAME="root"
+    export ARANGO_PASSWORD="test"
+    # Matches the 768-element embedding sent by validate_microservice; the service exits at startup when unset.
+    export ARANGO_EMBED_DIMENSION=768
+    retriever_port=5435
+    # unset http_proxy
+    export no_proxy="localhost,127.0.0.1,${ip_address}"
+    docker run -d --name="test-comps-retriever-arango-server" -p ${retriever_port}:7000 --ipc=host -e http_proxy="$http_proxy" -e https_proxy="$https_proxy" -e no_proxy="$no_proxy" -e TEI_EMBEDDING_ENDPOINT="$TEI_EMBEDDING_ENDPOINT" -e ARANGO_URL="$ARANGO_URL" -e ARANGO_USERNAME="$ARANGO_USERNAME" -e ARANGO_PASSWORD="$ARANGO_PASSWORD" -e ARANGO_EMBED_DIMENSION="$ARANGO_EMBED_DIMENSION" opea/retriever-arango:comps
+
+    sleep 1m
+}
+
+# POST a mock embedding to the retrieval endpoint and check the status code and response body.
+function validate_microservice() {
+    retriever_port=5435
+    export PATH="${HOME}/miniforge3/bin:$PATH"
+    source activate
+    URL="http://${ip_address}:$retriever_port/v1/retrieval"
+
+    test_embedding=$(python -c "import random; embedding = [random.uniform(-1, 1) for _ in range(768)]; print(embedding)")
+
+    HTTP_STATUS=$(curl -s -o /dev/null -w "%{http_code}" -X POST -d "{\"text\":\"test\",\"embedding\":${test_embedding}}" -H 'Content-Type: application/json' "$URL")
+    if [ "$HTTP_STATUS" -eq 200 ]; then
+        echo "[ retriever ] HTTP status is 200. Checking content..."
+        local CONTENT
+        CONTENT=$(curl -s -X POST -d "{\"text\":\"test\",\"embedding\":${test_embedding}}" -H 'Content-Type: application/json' "$URL" | tee "${LOG_PATH}/retriever.log")
+
+        if echo "$CONTENT" | grep -q "retrieved_docs"; then
+            echo "[ retriever ] Content is as expected."
+        else
+            echo "[ retriever ] Content does not match the expected result: $CONTENT"
+            docker logs test-comps-retriever-arango-server >> "${LOG_PATH}/retriever.log"
+            docker logs test-comps-retriever-arango-tei-endpoint >> "${LOG_PATH}/tei.log"
+            exit 1
+        fi
+    else
+        echo "[ retriever ] HTTP status is not 200. Received status was $HTTP_STATUS"
+        docker logs test-comps-retriever-arango-server >> "${LOG_PATH}/retriever.log"
+        docker logs test-comps-retriever-arango-tei-endpoint >> "${LOG_PATH}/tei.log"
+        exit 1
+    fi
+}
+
+# Stop and remove the retriever/TEI containers and the ArangoDB test container, if present.
+function stop_docker() {
+    # Container IDs are left unquoted on purpose: the filter may match several containers.
+    cid_retrievers=$(docker ps -aq --filter "name=test-comps-retriever-arango*")
+    if [[ -n "$cid_retrievers" ]]; then
+        docker stop $cid_retrievers && docker rm $cid_retrievers && sleep 1s
+    fi
+    cid_db=$(docker ps -aq --filter "name=test-comps-arango-apoc1")
+    if [[ -n "$cid_db" ]]; then
+        docker stop $cid_db && docker rm $cid_db && sleep 1s
+    fi
+}
+
+function main() {
+
+    stop_docker
+
+    build_docker_images
+    start_service
+
+    validate_microservice
+
+    stop_docker
+    echo y | docker system prune
+
+}
+
+main