diff --git a/.github/workflows/docker/compose/chathistory-compose.yaml b/.github/workflows/docker/compose/chathistory-compose.yaml index 987447fee..64dc579fc 100644 --- a/.github/workflows/docker/compose/chathistory-compose.yaml +++ b/.github/workflows/docker/compose/chathistory-compose.yaml @@ -7,3 +7,7 @@ services: build: dockerfile: comps/chathistory/mongo/Dockerfile image: ${REGISTRY:-opea}/chathistory-mongo-server:${TAG:-latest} + chathistory-arango-server: + build: + dockerfile: comps/chathistory/arango/Dockerfile + image: ${REGISTRY:-opea}/chathistory-arango-server:${TAG:-latest} diff --git a/.github/workflows/docker/compose/feedback_management-compose.yaml b/.github/workflows/docker/compose/feedback_management-compose.yaml index 0a3cfce66..51f5ae343 100644 --- a/.github/workflows/docker/compose/feedback_management-compose.yaml +++ b/.github/workflows/docker/compose/feedback_management-compose.yaml @@ -3,7 +3,11 @@ # this file should be run in the root of the repo services: - feedbackmanagement: + feedbackmanagement-mongo-server: build: dockerfile: comps/feedback_management/mongo/Dockerfile - image: ${REGISTRY:-opea}/feedbackmanagement:${TAG:-latest} + image: ${REGISTRY:-opea}/feedbackmanagement-mongo-server:${TAG:-latest} + feedbackmanagement-arango-server: + build: + dockerfile: comps/feedback_management/arango/Dockerfile + image: ${REGISTRY:-opea}/feedbackmanagement-arango-server:${TAG:-latest} \ No newline at end of file diff --git a/.github/workflows/docker/compose/prompt_registry-compose.yaml b/.github/workflows/docker/compose/prompt_registry-compose.yaml index 34d8973df..4415a18a9 100644 --- a/.github/workflows/docker/compose/prompt_registry-compose.yaml +++ b/.github/workflows/docker/compose/prompt_registry-compose.yaml @@ -7,3 +7,7 @@ services: build: dockerfile: comps/prompt_registry/mongo/Dockerfile image: ${REGISTRY:-opea}/promptregistry-mongo-server:${TAG:-latest} + promptregistry-arango-server: + build: + dockerfile: comps/prompt_registry/arango/Dockerfile + image: ${REGISTRY:-opea}/promptregistry-arango-server:${TAG:-latest} diff --git a/.gitignore b/.gitignore index 1d1e0a389..3a428754d 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,5 @@ __pycache__ *.egg-info/ .DS_Store +.venv +venv/ diff --git a/ARANGODB_README.md b/ARANGODB_README.md new file mode 100644 index 000000000..76f4b7db3 --- /dev/null +++ b/ARANGODB_README.md @@ -0,0 +1,33 @@ +Instructions + +0. Create a virtual environment: + +```bash +python -m venv .venv + +source .venv/bin/activate +``` + +1. Install the required packages: + +```bash +pip install python-arango +pip install langchain_openai +pip install git+https://github.com/arangoml/langchain.git@arangodb#subdirectory=libs/community +``` + +2. Provision the ArangoDB with Vector Index image: + +```bash +docker create --name arango-vector -p 8529:8529 -e ARANGO_ROOT_PASSWORD=test jbajic/arangodb-arm:vector-index-preview + +docker start arango-vector +``` + +3. Set your `OPENAI_API_KEY` environment variable (contact Anthony for access) + +4. 
Run the test script to confirm LangChain is working:
+
+```bash
+python langchain_test.py
+```
\ No newline at end of file
diff --git a/comps/chathistory/README.md b/comps/chathistory/README.md
index 4f7bcbf71..754fd0bd8 100644
--- a/comps/chathistory/README.md
+++ b/comps/chathistory/README.md
@@ -24,3 +24,7 @@ The Chat History microservice able to support various database backends for stor
 ### Chat History with MongoDB
 
 For more detail, please refer to this [README](./mongo/README.md)
+
+### Chat History with ArangoDB
+
+For more detail, please refer to this [README](./arango/README.md)
diff --git a/comps/chathistory/arango/Dockerfile b/comps/chathistory/arango/Dockerfile
new file mode 100644
index 000000000..f402e5526
--- /dev/null
+++ b/comps/chathistory/arango/Dockerfile
@@ -0,0 +1,30 @@
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+FROM python:3.11-slim
+
+ENV LANG=C.UTF-8
+
+RUN apt-get update -y && apt-get install -y --no-install-recommends --fix-missing \
+    build-essential \
+    libjemalloc-dev \
+    libgl1-mesa-glx
+
+RUN useradd -m -s /bin/bash user && \
+    mkdir -p /home/user && \
+    chown -R user /home/user/
+
+USER user
+
+COPY comps /home/user/comps
+COPY requirements.txt /home/user/
+
+RUN pip install --no-cache-dir --upgrade pip setuptools && \
+    pip install --no-cache-dir -r /home/user/comps/chathistory/arango/requirements.txt && \
+    pip install --no-cache-dir -r /home/user/requirements.txt
+
+ENV PYTHONPATH=/home/user
+
+WORKDIR /home/user/comps/chathistory/arango
+
+ENTRYPOINT ["python", "chat.py"]
diff --git a/comps/chathistory/arango/README.md b/comps/chathistory/arango/README.md
new file mode 100644
index 000000000..428a65255
--- /dev/null
+++ b/comps/chathistory/arango/README.md
@@ -0,0 +1,123 @@
+# 📝 Chat History Microservice with ArangoDB
+
+This README provides setup guides and all the necessary information about the Chat History microservice with an ArangoDB database.
+
+---
+
+## Setup Environment Variables
+
+See `config.py` for default values.
+
+```bash
+export ARANGO_HOST=${ARANGO_HOST}
+export ARANGO_PORT=${ARANGO_PORT}
+export ARANGO_PROTOCOL=${ARANGO_PROTOCOL}
+export ARANGO_USERNAME=${ARANGO_USERNAME}
+export ARANGO_PASSWORD=${ARANGO_PASSWORD}
+export DB_NAME=${DB_NAME}
+export COLLECTION_NAME=${COLLECTION_NAME}
+```
+
+---
+
+## 🚀Start Microservice with Docker
+
+### Build Docker Image
+
+```bash
+cd ../../../../
+docker build -t opea/chathistory-arango-server:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/chathistory/arango/Dockerfile .
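+
+# Note: the build context must be the repository root (hence the `cd ../../../../` above),
+# since the Dockerfile copies `comps/` and `requirements.txt` from it.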
+``` + +### Run Docker with CLI + +- Run ArangoDB image container + + ```bash + docker run -d -p 8529:8529 --name=arango arangodb/arangodb:latest + ``` + +- Run the Chat History microservice + + ```bash + docker run -p 6012:6012 \ + -e http_proxy=$http_proxy \ + -e https_proxy=$https_proxy \ + -e no_proxy=$no_proxy \ + -e ARANGO_HOST=${ARANGO_HOST} \ + -e ARANGO_PORT=${ARANGO_PORT} \ + -e ARANGO_PROTOCOL=${ARANGO_PROTOCOL} \ + -e ARANGO_USERNAME=${ARANGO_USERNAME} \ + -e ARANGO_PASSWORD=${ARANGO_PASSWORD} \ + -e DB_NAME=${DB_NAME} \ + -e COLLECTION_NAME=${COLLECTION_NAME} \ + opea/chathistory-arango-server:latest + ``` + +--- + +## ✅ Invoke Microservice + +The Chat History microservice exposes the following API endpoints: + +- Create new chat conversation + + ```bash + curl -X 'POST' \ + http://${host_ip}:6012/v1/chathistory/create \ + -H 'accept: application/json' \ + -H 'Content-Type: application/json' \ + -d '{ + "data": { + "messages": "test Messages", "user": "test" + } + }' + ``` + +- Get all the Conversations for a user + + ```bash + curl -X 'POST' \ + http://${host_ip}:6012/v1/chathistory/get \ + -H 'accept: application/json' \ + -H 'Content-Type: application/json' \ + -d '{ + "user": "test"}' + ``` + +- Get a specific conversation by id. + + ```bash + curl -X 'POST' \ + http://${host_ip}:6012/v1/chathistory/get \ + -H 'accept: application/json' \ + -H 'Content-Type: application/json' \ + -d '{ + "user": "test", "id":"48918"}' + ``` + +- Update the conversation by id. + + ```bash + curl -X 'POST' \ + http://${host_ip}:6012/v1/chathistory/create \ + -H 'accept: application/json' \ + -H 'Content-Type: application/json' \ + -d '{ + "data": { + "messages": "test Messages Update", "user": "test" + }, + "id":"48918" + }' + ``` + +- Delete a stored conversation. 
+ + ```bash + curl -X 'POST' \ + http://${host_ip}:6012/v1/chathistory/delete \ + -H 'accept: application/json' \ + -H 'Content-Type: application/json' \ + -d '{ + "user": "test", "id":"48918"}' + ``` diff --git a/comps/chathistory/arango/arango_conn.py b/comps/chathistory/arango/arango_conn.py new file mode 100644 index 000000000..d6c4b5977 --- /dev/null +++ b/comps/chathistory/arango/arango_conn.py @@ -0,0 +1,32 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +from arango import ArangoClient as PythonArangoClient +from arango.database import StandardDatabase +from config import ARANGO_HOST, ARANGO_PASSWORD, ARANGO_PORT, ARANGO_PROTOCOL, ARANGO_USERNAME, DB_NAME + + +class ArangoClient: + conn_url = f"{ARANGO_PROTOCOL}://{ARANGO_HOST}:{ARANGO_PORT}/" + + @staticmethod + def get_db_client() -> StandardDatabase: + try: + # Create client + client = PythonArangoClient(hosts=ArangoClient.conn_url) + + # First connect to _system database + sys_db = client.db("_system", username=ARANGO_USERNAME, password=ARANGO_PASSWORD, verify=True) + + # Create target database if it doesn't exist + if not sys_db.has_database(DB_NAME): + sys_db.create_database(DB_NAME) + + # Now connect to the target database + db = client.db(DB_NAME, username=ARANGO_USERNAME, password=ARANGO_PASSWORD, verify=True) + + return db + + except Exception as e: + print(e) + raise e diff --git a/comps/chathistory/arango/arango_store.py b/comps/chathistory/arango/arango_store.py new file mode 100644 index 000000000..8ab6928eb --- /dev/null +++ b/comps/chathistory/arango/arango_store.py @@ -0,0 +1,186 @@ +from typing import Any + +from arango_conn import ArangoClient +from config import COLLECTION_NAME +from pydantic import BaseModel + + +class DocumentStore: + + def __init__( + self, + user: str, + ): + self.user = user + + def initialize_storage(self) -> None: + self.db_client = ArangoClient.get_db_client() + + if not self.db_client.has_collection(COLLECTION_NAME): + self.db_client.create_collection(COLLECTION_NAME) + + self.collection = self.db_client.collection(COLLECTION_NAME) + + def save_document(self, document: BaseModel) -> str: + """Stores a new document into the storage. + + Args: + document: The document to be stored. It should be a Pydantic model. + + Returns: + str: The ID of the inserted document. + + Raises: + Exception: If an error occurs while storing the document. + """ + try: + model_dump = document.model_dump(by_alias=True, mode="json", exclude={"id"}) + + inserted_document = self.collection.insert(model_dump) + + document_id = str(inserted_document["_key"]) + + return document_id + + except Exception as e: + print(e) + raise Exception(e) + + def update_document(self, document_id: str, updated_data: BaseModel, first_query: Any) -> str: + """Updates a document in the collection with the given document_id. + + Args: + document_id (str): The ID of the document to update. + updated_data (object): The updated data to be set in the document. + first_query (object): The first query to be set in the document. + + Returns: + bool: True if the document was successfully updated, False otherwise. + + Raises: + KeyError: If the document with ID is not found. + Exception: If the user does not match with the document user. + Exception: If an error occurs while updating the document data. 
+ """ + document = self.collection.get(document_id) + + if document is None: + raise Exception(f"Unable to find Document {document_id}") + + if document["data"]["user"] != self.user: + raise Exception(f"User {self.user} is not allowed to update Document {document_id}.") + + try: + self.collection.update( + { + "_key": document_id, + "data": updated_data.model_dump(by_alias=True, mode="json"), + "first_query": first_query, + }, + merge=True, + keep_none=True, + ) + + print(f"Updated document: {document_id} !") + + return True + + except Exception as e: + print("Not able to update the data.") + print(e) + raise Exception(e) + + def get_all_documents_of_user(self) -> list[dict]: + """Retrieves all documents of a specific user from the collection. + + Returns: + A list of dictionaries representing the conversation documents. + Raises: + Exception: If there is an error while retrieving the documents. + """ + try: + document_list: list = [] + + # TODO: Clarify if we actually want to omit the `data` field. + # Implemented using MongoDB Feedback Management as a reference. + cursor = """ + FOR doc IN @@collection + FILTER doc.data.user == @user + RETURN UNSET(doc, "data") + """ + + cursor = self.db_client.aql.execute( + cursor, bind_vars={"@collection": self.collection.name, "user": self.user} + ) + + for document in cursor: + document["id"] = str(document["_key"]) + del document["_id"] + del document["_key"] + del document["_rev"] + + document_list.append(document) + + return document_list + + except Exception as e: + print(e) + raise Exception(e) + + def get_user_documents_by_id(self, document_id: str) -> dict | None: + """Retrieves a user document from the collection based on the given document ID. + + Args: + document_id (str): The ID of the document to retrieve. + + Returns: + dict | None: The user document if found, None otherwise. + + Raises: + KeyError: If document with ID is not found. + Exception: If the user does not match with the document user. + """ + response = self.collection.get(document_id) + + if response is None: + raise KeyError(f"Document with ID: {document_id} not found.") + + if response["data"]["user"] != self.user: + raise Exception(f"User mismatch. Document with ID: {document_id} does not belong to user: {self.user}") + + del response["_id"] + del response["_key"] + del response["_rev"] + + return response + + def delete_document(self, document_id: str) -> str: + """Deletes a document from the collection based on the provided document ID. + + Args: + document_id (str): The ID of the document to be deleted. + + Returns: + bool: True if the document is successfully deleted, False otherwise. + + Raises: + KeyError: If the provided document_id is invalid: + Exception: If the user does not match with the document user. + Exception: If any errors occurs during delete process. + """ + response = self.collection.get(document_id) + + if response is None: + raise KeyError(f"Document with ID: {document_id} not found.") + + if response["data"]["user"] != self.user: + raise Exception(f"User mismatch. 
Feedback with ID: {document_id} does not belong to user: {self.user}") + + try: + response = self.collection.delete(document_id) + print(f"Deleted document: {document_id} !") + + return True + except Exception as e: + print(e) + raise Exception("Not able to delete the data.") diff --git a/comps/chathistory/arango/chat.py b/comps/chathistory/arango/chat.py new file mode 100644 index 000000000..ce9c0a16e --- /dev/null +++ b/comps/chathistory/arango/chat.py @@ -0,0 +1,146 @@ +import os +from typing import Optional + +from arango_store import DocumentStore +from fastapi import HTTPException +from pydantic import BaseModel + +from comps import CustomLogger +from comps.cores.mega.micro_service import opea_microservices, register_microservice +from comps.cores.proto.api_protocol import ChatCompletionRequest + +logger = CustomLogger("chathistory_arango") +logflag = os.getenv("LOGFLAG", False) + + +class ChatMessage(BaseModel): + data: ChatCompletionRequest + first_query: Optional[str] = None + id: Optional[str] = None + + +class ChatId(BaseModel): + user: str + id: Optional[str] = None + + +def get_first_string(value): + if isinstance(value, str): + return value + elif isinstance(value, list): + # Assuming we want the first string from the first dictionary + if value and isinstance(value[0], dict): + first_dict = value[0] + if first_dict: + # Get the first value from the dictionary + first_key = next(iter(first_dict)) + return first_dict[first_key] + + +@register_microservice( + name="opea_service@chathistory_arango", + endpoint="/v1/chathistory/create", + host="0.0.0.0", + input_datatype=ChatMessage, + port=6012, +) +async def create_documents(document: ChatMessage): + """Creates or updates a document in the document store. + + Args: + document (ChatMessage): The ChatMessage object containing the data to be stored. + + Returns: + The result of the operation if successful, None otherwise. + """ + if logflag: + logger.info(document) + try: + if document.data.user is None: + raise HTTPException(status_code=500, detail="Please provide the user information") + store = DocumentStore(document.data.user) + store.initialize_storage() + if document.first_query is None: + document.first_query = get_first_string(document.data.messages) + if document.id: + res = store.update_document(document.id, document.data, document.first_query) + else: + res = store.save_document(document) + if logflag: + logger.info(res) + return res + except Exception as e: + logger.info(f"An error occurred: {str(e)}") + raise HTTPException(status_code=500, detail=str(e)) + + +@register_microservice( + name="opea_service@chathistory_arango", + endpoint="/v1/chathistory/get", + host="0.0.0.0", + input_datatype=ChatId, + port=6012, +) +async def get_documents(document: ChatId): + """Retrieves documents from the document store based on the provided ChatId. + + Args: + document (ChatId): The ChatId object containing the user and optional document id. + + Returns: + The retrieved documents if successful, None otherwise. 
+ """ + if logflag: + logger.info(document) + try: + store = DocumentStore(document.user) + store.initialize_storage() + if document.id is None: + res = store.get_all_documents_of_user() + else: + res = store.get_user_documents_by_id(document.id) + if logflag: + logger.info(res) + return res + except Exception as e: + # Handle the exception here + logger.info(f"An error occurred: {str(e)}") + raise HTTPException(status_code=500, detail=str(e)) + + +@register_microservice( + name="opea_service@chathistory_arango", + endpoint="/v1/chathistory/delete", + host="0.0.0.0", + input_datatype=ChatId, + port=6012, +) +async def delete_documents(document: ChatId): + """Deletes a document from the document store based on the provided ChatId. + + Args: + document (ChatId): The ChatId object containing the user and document id. + + Returns: + The result of the deletion if successful, None otherwise. + """ + if logflag: + logger.info(document) + try: + store = DocumentStore(document.user) + store.initialize_storage() + if document.id is None: + raise Exception("Document id is required.") + else: + res = store.delete_document(document.id) + if logflag: + logger.info(res) + return res + except Exception as e: + # Handle the exception here + logger.info(f"An error occurred: {str(e)}") + raise HTTPException(status_code=500, detail=str(e)) + + +if __name__ == "__main__": + opea_microservices["opea_service@chathistory_arango"].start() diff --git a/comps/chathistory/arango/config.py b/comps/chathistory/arango/config.py new file mode 100644 index 000000000..9e66e8f1d --- /dev/null +++ b/comps/chathistory/arango/config.py @@ -0,0 +1,13 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +import os + +# ARANGO configuration +ARANGO_HOST = os.getenv("ARANGO_HOST", "localhost") +ARANGO_PORT = os.getenv("ARANGO_PORT", 8529) +ARANGO_PROTOCOL = os.getenv("ARANGO_PROTOCOL", "http") +ARANGO_USERNAME = os.getenv("ARANGO_USERNAME", "root") +ARANGO_PASSWORD = os.getenv("ARANGO_PASSWORD", "test") +DB_NAME = os.getenv("DB_NAME", "OPEA") +COLLECTION_NAME = os.getenv("COLLECTION_NAME", "ChatHistory") diff --git a/comps/chathistory/arango/docker-compose-chathistory-arango.yaml b/comps/chathistory/arango/docker-compose-chathistory-arango.yaml new file mode 100644 index 000000000..36819c99b --- /dev/null +++ b/comps/chathistory/arango/docker-compose-chathistory-arango.yaml @@ -0,0 +1,38 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +version: "3" +services: + arango: + image: arangodb/arangodb:latest + container_name: arangodb + ports: + - 8529:8529 + environment: + http_proxy: ${http_proxy} + https_proxy: ${https_proxy} + no_proxy: ${no_proxy} + ARANGO_ROOT_PASSWORD: ${ARANGO_ROOT_PASSWORD} + + chathistory-arango: + image: opea/chathistory-arango:latest + container_name: chathistory-arango-server + ports: + - "6012:6012" + ipc: host + environment: + http_proxy: ${http_proxy} + https_proxy: ${https_proxy} + no_proxy: ${no_proxy} + ARANGO_HOST: ${ARANGO_HOST} + ARANGO_PORT: ${ARANGO_PORT} + ARANGO_PROTOCOL: ${ARANGO_PROTOCOL} + ARANGO_USERNAME: ${ARANGO_USERNAME} + ARANGO_PASSWORD: ${ARANGO_PASSWORD} + DB_NAME: ${DB_NAME} + COLLECTION_NAME: ${COLLECTION_NAME} + restart: unless-stopped + +networks: + default: + driver: bridge diff --git a/comps/chathistory/arango/requirements.txt b/comps/chathistory/arango/requirements.txt new file mode 100644 index 000000000..9e5d0de8e --- /dev/null +++ b/comps/chathistory/arango/requirements.txt @@ -0,0 +1 @@ +python-arango \ No 
newline at end of file diff --git a/comps/dataprep/arango/__init__.py b/comps/dataprep/arango/__init__.py new file mode 100644 index 000000000..916f3a44b --- /dev/null +++ b/comps/dataprep/arango/__init__.py @@ -0,0 +1,2 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 diff --git a/comps/dataprep/arango/langchain/Dockerfile b/comps/dataprep/arango/langchain/Dockerfile new file mode 100644 index 000000000..5d8aa7a48 --- /dev/null +++ b/comps/dataprep/arango/langchain/Dockerfile @@ -0,0 +1,39 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +FROM python:3.11-slim + +ENV LANG=C.UTF-8 + +ARG ARCH="cpu" + +RUN apt-get update -y && apt-get install -y --no-install-recommends --fix-missing \ + build-essential \ + default-jre \ + libgl1-mesa-glx \ + libjemalloc-dev \ + git + +RUN useradd -m -s /bin/bash user && \ + mkdir -p /home/user && \ + chown -R user /home/user/ + +USER user + +COPY comps /home/user/comps + +RUN pip install --no-cache-dir --upgrade pip setuptools && \ + if [ ${ARCH} = "cpu" ]; then pip install --no-cache-dir torch torchvision --index-url https://download.pytorch.org/whl/cpu; fi && \ + pip install --no-cache-dir -r /home/user/comps/dataprep/arango/langchain/requirements.txt + +ENV PYTHONPATH=$PYTHONPATH:/home/user + +USER root + +RUN mkdir -p /home/user/comps/dataprep/arango/langchain/uploaded_files && chown -R user /home/user/comps/dataprep/arango/langchain/uploaded_files + +USER user + +WORKDIR /home/user/comps/dataprep/arango/langchain + +ENTRYPOINT ["python", "prepare_doc_arango.py"] \ No newline at end of file diff --git a/comps/dataprep/arango/langchain/README.md b/comps/dataprep/arango/langchain/README.md new file mode 100644 index 000000000..bdd6f71b0 --- /dev/null +++ b/comps/dataprep/arango/langchain/README.md @@ -0,0 +1,130 @@ +# Dataprep Microservice with ArangoDB + +## 🚀Start Microservice with Python + +### Install Requirements + +```bash +pip install -r requirements.txt +apt-get install libtesseract-dev -y +apt-get install poppler-utils -y +``` + +### Start ArangoDB Server + +To launch ArangoDB locally, first ensure you have docker installed. Then, you can launch the database with the following docker command. + +```bash +docker run -d --name arangodb -p 8529:8529 -e ARANGO_ROOT_PASSWORD=password arangodb/arangodb:latest +``` + +### Setup Environment Variables + +```bash +export no_proxy=${your_no_proxy} +export http_proxy=${your_http_proxy} +export https_proxy=${your_http_proxy} +export ARANGO_URL=${your_arango_url} +export ARANGO_USERNAME=${your_arango_username} +export ARANGO_PASSWORD=${your_arango_password} +export ARANGO_DB_NAME=${your_db_name} +export PYTHONPATH=${path_to_comps} +``` + +### Start Document Preparation Microservice for ArangoDB with Python Script + +Start document preparation microservice for ArangoDB with below command. + +```bash +python prepare_doc_arango.py +``` + +## 🚀Start Microservice with Docker + +### Build Docker Image + +```bash +cd ../../../../ +docker build -t opea/dataprep-arango:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/dataprep/arango/langchain/Dockerfile . 
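+
+# Optional: the Dockerfile accepts --build-arg ARCH=<arch>; with the default "cpu" it installs
+# the CPU-only torch/torchvision wheels, and any other value skips that step.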
+```
+
+### Run Docker with CLI
+
+```bash
+docker run -d --name="dataprep-arango-server" -p 6007:6007 --ipc=host -e http_proxy=$http_proxy -e https_proxy=$https_proxy opea/dataprep-arango:latest
+```
+
+### Run Docker with Docker Compose
+
+```bash
+cd comps/dataprep/arango/langchain
+docker compose -f docker-compose-dataprep-arango.yaml up -d
+```
+
+## Invoke Microservice
+
+Once the document preparation microservice for ArangoDB is started, you can use the command below to invoke it; the microservice converts the document into embeddings and saves them to the database.
+
+After processing completes, a graph is created in ArangoDB. The default graph name is `NewGraph`; you can specify a different name with `-F "graph_name=${your_graph_name}"` in the curl command.
+
+By default, the microservice will create embeddings for the documents. You can specify `-F "create_embeddings=false"` to skip the embedding creation.
+
+```bash
+curl -X POST \
+  -H "Content-Type: multipart/form-data" \
+  -F "files=@./file1.txt" \
+  -F "graph_name=${your_graph_name}" \
+  http://localhost:6007/v1/dataprep
+```
+
+You can specify `chunk_size` and `chunk_overlap` with the following command.
+
+```bash
+curl -X POST \
+  -H "Content-Type: multipart/form-data" \
+  -F "files=@./file1.txt" \
+  -F "chunk_size=1500" \
+  -F "chunk_overlap=100" \
+  -F "graph_name=${your_graph_name}" \
+  http://localhost:6007/v1/dataprep
+```
+
+Additional options can be set through environment variables as follows (default values are in the `config.py` file):
+
+OpenAI Configuration:
+- `OPENAI_EMBED_MODEL`: The embedding model to use for the OpenAI service.
+- `OPENAI_EMBED_DIMENSIONS`: The dimensions for the OpenAI service.
+- `OPENAI_CHAT_MODEL`: The chat model to use for the OpenAI service.
+- `OPENAI_CHAT_TEMPERATURE`: The temperature for the OpenAI service.
+
+ArangoDB Configuration:
+- `YOUR_GRAPH_NAME`: The name of the graph to create. Must be passed in the curl command.
+- `USE_ONE_ENTITY_COLLECTION`: If set to True, the microservice will use a single entity collection for all nodes. If set to False, the microservice will use a separate collection for each node type.
+- `INSERT_ASYNC`: If set to True, the microservice will insert the data asynchronously.
+- `ARANGO_BATCH_SIZE`: The batch size for the microservice to insert the data.
+- `INCLUDE_SOURCE`: If set to True, the microservice will include the source document in the data.
+
+Graph Configuration:
+- `ALLOWED_NODES`: The nodes to allow in the graph.
+- `ALLOWED_RELATIONSHIPS`: The relationships to allow in the graph.
+- `NODE_PROPERTIES`: The properties to allow in the nodes.
+- `RELATIONSHIP_PROPERTIES`: The properties to allow in the relationships.
+
+Prompt Configuration:
+- `SYSTEM_PROMPT_PATH`: The path to the system prompt text file. This can be used to specify the system prompt used for the entity extraction and graph generation steps.
+
+We support table extraction from PDF documents. You can specify `process_table` and `table_strategy` with the following command. `table_strategy` refers to the strategy used to understand tables for table retrieval. As the setting progresses from `fast` to `hq` to `llm`, the focus shifts towards deeper table understanding at the expense of processing speed. The default strategy is `fast`.
+
+Note: If you specify `table_strategy=llm`, you should first start a TGI service (see sections 1.2.1 and 1.3.1 in https://github.com/opea-project/GenAIComps/tree/main/comps/llms/README.md) and then `export TGI_LLM_ENDPOINT="http://${your_ip}:8008"`.
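+
+For example, a TGI/TEI-based setup might export the following (a sketch only — the endpoint addresses below are placeholders, and the variable names come from `config.py`):
+
+```bash
+# Illustrative values, not defaults — adjust to your own deployment
+export TGI_LLM_ENDPOINT="http://${your_ip}:8008"
+export TEI_EMBEDDING_ENDPOINT="http://${your_ip}:6006"
+export HUGGINGFACEHUB_API_TOKEN=${your_hf_api_token}
+export TEI_EMBED_MODEL="BAAI/bge-base-en-v1.5"
+```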
+ +For ensure the quality and comprehensiveness of the extracted entities, we recommend to use `gpt-4o` as the default model for parsing the document. To enable the openai service, please `export OPENAI_API_KEY=xxxx` before using this services. + +```bash +curl -X POST \ + -H "Content-Type: multipart/form-data" \ + -F "files=@./your_file.pdf" \ + -F "process_table=true" \ + -F "table_strategy=hq" \ + -F "graph_name=${your_graph_name}" \ + http://localhost:6007/v1/dataprep +``` diff --git a/comps/dataprep/arango/langchain/__init__.py b/comps/dataprep/arango/langchain/__init__.py new file mode 100644 index 000000000..916f3a44b --- /dev/null +++ b/comps/dataprep/arango/langchain/__init__.py @@ -0,0 +1,2 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 diff --git a/comps/dataprep/arango/langchain/config.py b/comps/dataprep/arango/langchain/config.py new file mode 100644 index 000000000..5b27a8166 --- /dev/null +++ b/comps/dataprep/arango/langchain/config.py @@ -0,0 +1,39 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +import os + +# ArangoDB configuration +ARANGO_URL = os.getenv("ARANGO_URL", "http://localhost:8529") +ARANGO_USERNAME = os.getenv("ARANGO_USERNAME", "root") +ARANGO_PASSWORD = os.getenv("ARANGO_PASSWORD", "test") +ARANGO_DB_NAME = os.getenv("ARANGO_DB_NAME", "_system") + +# ArangoDB graph configuration +USE_ONE_ENTITY_COLLECTION = os.getenv("USE_ONE_ENTITY_COLLECTION", True) +INSERT_ASYNC = os.getenv("INSERT_ASYNC", False) +ARANGO_BATCH_SIZE = os.getenv("ARANGO_BATCH_SIZE", 1000) +INCLUDE_SOURCE = os.getenv("INCLUDE_SOURCE", True) + +# Text Generation Inference configuration +TGI_LLM_ENDPOINT = os.getenv("TGI_LLM_ENDPOINT", "http://localhost:8080") + +# Text Embeddings Inference configuration +TEI_EMBEDDING_ENDPOINT = os.getenv("TEI_EMBEDDING_ENDPOINT") +HUGGINGFACEHUB_API_TOKEN = os.getenv("HUGGINGFACEHUB_API_TOKEN") +TEI_EMBED_MODEL = os.getenv("TEI_EMBED_MODEL", "BAAI/bge-base-en-v1.5") + +# OpenAI configuration (alternative to TGI & TEI) +OPENAI_API_KEY = os.getenv("OPENAI_API_KEY") +OPENAI_EMBED_MODEL = os.getenv("OPENAI_EMBED_MODEL", "text-embedding-3-small") +OPENAI_EMBED_DIMENSIONS = os.getenv("OPENAI_EMBED_DIMENSIONS", 512) +OPENAI_CHAT_MODEL = os.getenv("OPENAI_CHAT_MODEL", "gpt-4o") +OPENAI_CHAT_TEMPERATURE = os.getenv("OPENAI_CHAT_TEMPERATURE", 0) + +# LLMGraphTransformer configuration +ALLOWED_NODES = os.getenv("ALLOWED_NODES", []) # ["Person", "Organization"] +ALLOWED_RELATIONSHIPS = os.getenv("ALLOWED_RELATIONSHIPS", []) # [("Person", "knows", "Person"), ("Person", "works_at", "Organization")] +NODE_PROPERTIES = os.getenv("NODE_PROPERTIES", ['description']) +RELATIONSHIP_PROPERTIES = os.getenv("RELATIONSHIP_PROPERTIES", ['description']) + +SYSTEM_PROMPT_PATH = os.getenv("SYSTEM_PROMPT_PATH", "./prompt.txt") \ No newline at end of file diff --git a/comps/dataprep/arango/langchain/docker-compose-dataprep-arango.yaml b/comps/dataprep/arango/langchain/docker-compose-dataprep-arango.yaml new file mode 100644 index 000000000..08408c127 --- /dev/null +++ b/comps/dataprep/arango/langchain/docker-compose-dataprep-arango.yaml @@ -0,0 +1,53 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +version: "3" +services: + arango-vector-db: + image: arangodb/arangodb:latest + container_name: arango-graph-db + ports: + - "8529:8529" + environment: + ARANGO_ROOT_PASSWORD: ${ARANGO_PASSWORD} + tgi_gaudi_service: + image: ghcr.io/huggingface/tgi-gaudi:2.0.5 + container_name: 
tgi-service + ports: + - "8088:80" + volumes: + - "./data:/data" + shm_size: 1g + environment: + no_proxy: ${no_proxy} + http_proxy: ${http_proxy} + https_proxy: ${https_proxy} + HF_TOKEN: ${HF_TOKEN} + command: --model-id ${LLM_MODEL_ID} --auto-truncate --max-input-tokens 1024 --max-total-tokens 2048 + dataprep-arango: + image: opea/dataprep-arango:latest + container_name: dataprep-arango-server + depends_on: + - arango-vector-db + - tgi_gaudi_service + ports: + - "6007:6007" + ipc: host + environment: + no_proxy: ${no_proxy} + http_proxy: ${http_proxy} + https_proxy: ${https_proxy} + ARANGO_URL: http://arango-graph-db:8529 + ARANGO_USERNAME: ${ARANGO_USERNAME} + ARANGO_PASSWORD: ${ARANGO_PASSWORD} + ARANGO_DB_NAME: ${ARANGO_DB_NAME} + TGI_LLM_ENDPOINT: ${TEI_ENDPOINT} + TEI_EMBEDDING_ENDPOINT: ${TEI_EMBEDDING_ENDPOINT} + HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN} + TEI_EMBED_MODEL: ${TEI_EMBED_MODEL} + OPENAI_API_KEY: ${OPENAI_API_KEY} + restart: unless-stopped + +networks: + default: + driver: bridge diff --git a/comps/dataprep/arango/langchain/prepare_doc_arango.py b/comps/dataprep/arango/langchain/prepare_doc_arango.py new file mode 100644 index 000000000..500f728b0 --- /dev/null +++ b/comps/dataprep/arango/langchain/prepare_doc_arango.py @@ -0,0 +1,346 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +import json +import os +from typing import List, Optional, Union + +import openai +from arango import ArangoClient +from config import ( + ARANGO_DB_NAME, + ARANGO_PASSWORD, + ARANGO_URL, + ARANGO_USERNAME, + HUGGINGFACEHUB_API_TOKEN, + OPENAI_API_KEY, + TEI_EMBED_MODEL, + TEI_EMBEDDING_ENDPOINT, + TGI_LLM_ENDPOINT, + OPENAI_EMBED_MODEL, + OPENAI_EMBED_DIMENSIONS, + USE_ONE_ENTITY_COLLECTION, + INSERT_ASYNC, + ARANGO_BATCH_SIZE, + INCLUDE_SOURCE, + SYSTEM_PROMPT_PATH, + ALLOWED_NODES, + ALLOWED_RELATIONSHIPS, + NODE_PROPERTIES, + RELATIONSHIP_PROPERTIES, +) +from fastapi import File, Form, HTTPException, UploadFile +from langchain.text_splitter import RecursiveCharacterTextSplitter +from langchain_community.embeddings import HuggingFaceBgeEmbeddings, HuggingFaceHubEmbeddings +from langchain_community.graphs.arangodb_graph import ArangoGraph +from langchain_community.llms import HuggingFaceEndpoint +from langchain_core.documents import Document +from langchain_core.embeddings import Embeddings +from langchain_experimental.graph_transformers import LLMGraphTransformer +from langchain_openai import ChatOpenAI, OpenAIEmbeddings +from langchain_text_splitters import HTMLHeaderTextSplitter +from langchain_core.prompts import ChatPromptTemplate, BasePromptTemplate + +from comps import CustomLogger, DocPath, opea_microservices, register_microservice +from comps.dataprep.utils import ( + document_loader, + encode_filename, + get_separators, + get_tables_result, + parse_html, + save_content_to_local_disk, +) + +logger = CustomLogger("prepare_doc_arango") +logflag = os.getenv("LOGFLAG", False) + +upload_folder = "./uploaded_files/" + +PROMPT_TEMPLATE = None +if SYSTEM_PROMPT_PATH is not None: + try: + with open(SYSTEM_PROMPT_PATH, "r") as f: + PROMPT_TEMPLATE = ChatPromptTemplate.from_messages( + [ + ( + "system", + f.read(), + ), + ( + "human", + ( + "Tip: Make sure to answer in the correct format and do " + "not include any explanations. 
" + "Use the given format to extract information from the " + "following input: {input}" + ), + ), + ] + ) + except Exception: + print("Could not set custom Prompt") + +def ingest_data_to_arango(doc_path: DocPath, graph_name: str, create_embeddings: bool) -> bool: + """Ingest document to ArangoDB.""" + path = doc_path.path + if logflag: + logger.info(f"Parsing document {path}.") + + ############ + # ArangoDB # + ############ + + client = ArangoClient(hosts=ARANGO_URL) + sys_db = client.db(name="_system", username=ARANGO_USERNAME, password=ARANGO_PASSWORD, verify=True) + + if not sys_db.has_database(ARANGO_DB_NAME): + sys_db.create_database(ARANGO_DB_NAME) + + db = client.db(name=ARANGO_DB_NAME, username=ARANGO_USERNAME, password=ARANGO_PASSWORD, verify=True) + + graph = ArangoGraph( + db=db, + include_examples=False, + generate_schema_on_init=False, + ) + + ############################# + # Text Generation Inference # + ############################# + + if OPENAI_API_KEY: + if logflag: + logger.info("OpenAI API Key is set. Verifying its validity...") + openai.api_key = OPENAI_API_KEY + + try: + openai.models.list() + if logflag: + logger.info("OpenAI API Key is valid.") + llm = ChatOpenAI(temperature=0, model_name="gpt-4o") + except openai.error.AuthenticationError: + if logflag: + logger.info("OpenAI API Key is invalid.") + except Exception as e: + if logflag: + logger.info(f"An error occurred while verifying the API Key: {e}") + + elif TGI_LLM_ENDPOINT: + llm = HuggingFaceEndpoint( + endpoint_url=TGI_LLM_ENDPOINT, + max_new_tokens=512, + top_k=40, + top_p=0.9, + temperature=0.8, + timeout=600, + ) + else: + raise ValueError("No text generation inference endpoint is set.") + + try: + if not (NODE_PROPERTIES or RELATIONSHIP_PROPERTIES): + llm_transformer = LLMGraphTransformer( + llm=llm, + prompt=PROMPT_TEMPLATE, + allowed_nodes=ALLOWED_NODES, + allowed_relationships=ALLOWED_RELATIONSHIPS, + ) + else: + llm_transformer = LLMGraphTransformer( + llm=llm, + node_properties=NODE_PROPERTIES, + relationship_properties=RELATIONSHIP_PROPERTIES, + prompt=PROMPT_TEMPLATE, + allowed_nodes=ALLOWED_NODES, + allowed_relationships=ALLOWED_RELATIONSHIPS, + ) + except (TypeError, ValueError) as e: + if logflag: + logger.warning(f"Advanced LLMGraphTransformer failed: {e}") + # Fall back to basic config + try: + llm_transformer = LLMGraphTransformer(llm=llm) + except (TypeError, ValueError) as e: + if logflag: + logger.error(f"Failed to initialize LLMGraphTransformer: {e}") + raise + + ######################################## + # Text Embeddings Inference (optional) # + ######################################## + + if create_embeddings: + if OPENAI_API_KEY: + # Use OpenAI embeddings + embeddings = OpenAIEmbeddings( + model=OPENAI_EMBED_MODEL, + dimensions=OPENAI_EMBED_DIMENSIONS, + ) + + elif TEI_EMBEDDING_ENDPOINT and HUGGINGFACEHUB_API_TOKEN: + # Use TEI endpoint service + embeddings = HuggingFaceHubEmbeddings( + model=TEI_EMBEDDING_ENDPOINT, + huggingfacehub_api_token=HUGGINGFACEHUB_API_TOKEN, + ) + elif TEI_EMBED_MODEL: + # Use local embedding model + embeddings = HuggingFaceBgeEmbeddings(model_name=TEI_EMBED_MODEL) + else: + if logflag: + logger.error("No text embeddings inference endpoint is set.") + embeddings = None + else: + embeddings = None + + ############ + # Chunking # + ############ + + if path.endswith(".html"): + headers_to_split_on = [ + ("h1", "Header 1"), + ("h2", "Header 2"), + ("h3", "Header 3"), + ] + text_splitter = HTMLHeaderTextSplitter(headers_to_split_on=headers_to_split_on) + 
else: + text_splitter = RecursiveCharacterTextSplitter( + chunk_size=doc_path.chunk_size, + chunk_overlap=doc_path.chunk_overlap, + add_start_index=True, + separators=get_separators(), + ) + + content = document_loader(path) + + structured_types = [".xlsx", ".csv", ".json", "jsonl"] + _, ext = os.path.splitext(path) + + if ext in structured_types: + chunks = content + else: + chunks = text_splitter.split_text(content) + + if doc_path.process_table and path.endswith(".pdf"): + table_chunks = get_tables_result(path, doc_path.table_strategy) + if isinstance(table_chunks, list): + chunks = chunks + table_chunks + if logflag: + logger.info("Done preprocessing. Created ", len(chunks), " chunks of the original file.") + + ################################ + # Graph generation & insertion # + ################################ + + generate_chunk_embeddings = embeddings is not None + + for text in chunks: + document = Document(page_content=text) + graph_docs = llm_transformer.convert_to_graph_documents([document]) + + if generate_chunk_embeddings: + source = graph_docs[0].source + source.metadata["embeddings"] = embeddings.embed_documents([source.page_content])[0] + + graph.add_graph_documents( + graph_documents=graph_docs, + include_source=INCLUDE_SOURCE, + graph_name=graph_name, + update_graph_definition_if_exists=not USE_ONE_ENTITY_COLLECTION, + batch_size=ARANGO_BATCH_SIZE, + use_one_entity_collection=USE_ONE_ENTITY_COLLECTION, + insert_async=INSERT_ASYNC, + ) + + if logflag: + logger.info("The graph is built.") + + return True + + +@register_microservice( + name="opea_service@prepare_doc_arango", + endpoint="/v1/dataprep", + host="0.0.0.0", + port=6007, + input_datatype=DocPath, + output_datatype=None, +) +async def ingest_documents( + files: Optional[Union[UploadFile, List[UploadFile]]] = File(None), + link_list: Optional[str] = Form(None), + chunk_size: int = Form(1500), + chunk_overlap: int = Form(100), + process_table: bool = Form(False), + table_strategy: str = Form("fast"), + graph_name: str = Form("NewGraph"), + create_embeddings: bool = Form(True), +): + if logflag: + logger.info(f"files:{files}") + logger.info(f"link_list:{link_list}") + + if files: + if not isinstance(files, list): + files = [files] + uploaded_files = [] + for file in files: + encode_file = encode_filename(file.filename) + save_path = upload_folder + encode_file + await save_content_to_local_disk(save_path, file) + ingest_data_to_arango( + DocPath( + path=save_path, + chunk_size=chunk_size, + chunk_overlap=chunk_overlap, + process_table=process_table, + table_strategy=table_strategy, + ), + graph_name=graph_name, + create_embeddings=create_embeddings, + ) + uploaded_files.append(save_path) + if logflag: + logger.info(f"Successfully saved file {save_path}") + result = {"status": 200, "message": "Data preparation succeeded"} + if logflag: + logger.info(result) + return result + + if link_list: + link_list = json.loads(link_list) # Parse JSON string to list + if not isinstance(link_list, list): + raise HTTPException(status_code=400, detail="link_list should be a list.") + for link in link_list: + encoded_link = encode_filename(link) + save_path = upload_folder + encoded_link + ".txt" + content = parse_html([link])[0][0] + try: + await save_content_to_local_disk(save_path, content) + ingest_data_to_arango( + DocPath( + path=save_path, + chunk_size=chunk_size, + chunk_overlap=chunk_overlap, + process_table=process_table, + table_strategy=table_strategy, + ), + graph_name=graph_name, + ) + except json.JSONDecodeError: + raise 
HTTPException(status_code=500, detail="Fail to ingest data into qdrant.") + + if logflag: + logger.info(f"Successfully saved link {link}") + + result = {"status": 200, "message": "Data preparation succeeded"} + if logflag: + logger.info(result) + return result + + raise HTTPException(status_code=400, detail="Must provide either a file or a string list.") + + +if __name__ == "__main__": + opea_microservices["opea_service@prepare_doc_arango"].start() diff --git a/comps/dataprep/arango/langchain/requirements.txt b/comps/dataprep/arango/langchain/requirements.txt new file mode 100644 index 000000000..74d4a9f0d --- /dev/null +++ b/comps/dataprep/arango/langchain/requirements.txt @@ -0,0 +1,32 @@ +beautifulsoup4 +cairosvg +docarray[full] +docx2txt +easyocr +fastapi +huggingface_hub +langchain +git+https://github.com/arangoml/langchain.git@arangodb#subdirectory=libs/community +langchain-experimental +langchain-openai +langchain-text-splitters +langchain_huggingface +markdown +python-arango +cityhash +numpy +openai +opentelemetry-api +opentelemetry-exporter-otlp +opentelemetry-sdk +pandas +Pillow +prometheus-fastapi-instrumentator +pymupdf +pytesseract +python-docx +python-pptx +sentence_transformers +shortuuid +unstructured[all-docs]==0.15.7 +uvicorn diff --git a/comps/feedback_management/README.md b/comps/feedback_management/README.md index 2e68aa413..9cd4b42a5 100644 --- a/comps/feedback_management/README.md +++ b/comps/feedback_management/README.md @@ -20,3 +20,7 @@ The Feedback Management microservice able to support various database backends f ### Feedback Management with MongoDB For more detail, please refer to this [README](./mongo/README.md) + +### Feedback Management with ArangoDB + +For more detail, please refer to this [README](./arango/README.md) diff --git a/comps/feedback_management/arango/Dockerfile b/comps/feedback_management/arango/Dockerfile new file mode 100644 index 000000000..95ac359e6 --- /dev/null +++ b/comps/feedback_management/arango/Dockerfile @@ -0,0 +1,30 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +FROM python:3.11-slim + +ENV LANG=C.UTF-8 + +RUN apt-get update -y && apt-get install -y --no-install-recommends --fix-missing \ + build-essential \ + libjemalloc-dev \ + libgl1-mesa-glx + +RUN useradd -m -s /bin/bash user && \ + mkdir -p /home/user && \ + chown -R user /home/user/ + +USER user + +COPY comps /home/user/comps +COPY requirements.txt /home/user/ + +RUN pip install --no-cache-dir --upgrade pip setuptools && \ + pip install --no-cache-dir -r /home/user/comps/feedback_management/arango/requirements.txt && \ + pip install --no-cache-dir -r /home/user/requirements.txt + +ENV PYTHONPATH=$PYTHONPATH:/home/user + +WORKDIR /home/user/comps/feedback_management/arango + +ENTRYPOINT ["python", "feedback.py"] diff --git a/comps/feedback_management/arango/README.md b/comps/feedback_management/arango/README.md new file mode 100644 index 000000000..7e9a5f840 --- /dev/null +++ b/comps/feedback_management/arango/README.md @@ -0,0 +1,174 @@ +# 🗨 Feedback Management Microservice with ArangoDB + +This README provides setup guides and all the necessary information about the Feedback Management microservice with ArangoDB database. + +--- + +## Setup Environment Variables + +See `config.py` for default values. 
+ +```bash +export ARANGO_HOST=${ARANGO_HOST} +export ARANGO_PORT=${ARANGO_PORT} +export ARANGO_PROTOCOL=${ARANGO_PROTOCOL} +export ARANGO_USERNAME=${ARANGO_USERNAME} +export ARANGO_PASSWORD=${ARANGO_PASSWORD} +export DB_NAME=${DB_NAME} +export COLLECTION_NAME=${COLLECTION_NAME} +export PYTHONPATH={Path to base of directory} +``` + +--- + +## 🚀Start Microservice with Docker + +### Build Docker Image + +```bash +cd ~/GenAIComps +docker build -t opea/feedbackmanagement-arango-server:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/feedback_management/arango/Dockerfile . +``` + +### Run Docker with CLI + +- Run ArangoDB image container + + ```bash + docker run -d -p 8529:8529 --name=arango arangodb/arangodb:latest + ``` + +- Run Feedback Management microservice + + ```bash + docker run -d -p 6016:6016 \ + --name="feedbackmanagement-arango-server" \ + -e http_proxy=$http_proxy \ + -e https_proxy=$https_proxy \ + -e no_proxy=$no_proxy \ + -e ARANGO_HOST=${ARANGO_HOST} \ + -e ARANGO_PORT=${ARANGO_PORT} \ + -e ARANGO_PROTOCOL=${ARANGO_PROTOCOL} \ + -e ARANGO_USERNAME=${ARANGO_USERNAME} \ + -e ARANGO_PASSWORD=${ARANGO_PASSWORD} \ + -e DB_NAME=${DB_NAME} \ + -e COLLECTION_NAME=${COLLECTION_NAME} \ + opea/feedbackmanagement-arango-server:latest + + ``` + +--- + +### ✅ Invoke Microservice + +The Feedback Management microservice exposes the following API endpoints: + +- Save feedback data + + ```bash + curl -X 'POST' \ + http://${host_ip}:6016/v1/feedback/create \ + -H 'accept: application/json' \ + -H 'Content-Type: application/json' \ + -d '{ + "chat_id": "66445d4f71c7eff23d44f78d", + "chat_data": { + "user": "test", + "messages": [ + { + "role": "system", + "content": "You are helpful assistant" + }, + { + "role": "user", + "content": "hi", + "time": "1724915247" + }, + { + "role": "assistant", + "content": "Hi, may I help you?", + "time": "1724915249" + } + ] + }, + "feedback_data": { + "comment": "Moderate", + "rating": 3, + "is_thumbs_up": true + }}' + + + # Take note that chat_id here would be the id get from feedback_arango service + # If you do not wish to maintain chat history via feedback_arango service, you may generate some random uuid for it or just leave it empty. + ``` + +- Update feedback data by feedback_id + + ```bash + curl -X 'POST' \ + http://${host_ip}:6016/v1/feedback/create \ + -H 'accept: application/json' \ + -H 'Content-Type: application/json' \ + -d '{ + "chat_id": "66445d4f71c7eff23d44f78d", + "chat_data": { + "user": "test", + "messages": [ + { + "role": "system", + "content": "You are helpful assistant" + }, + { + "role": "user", + "content": "hi", + "time": "1724915247" + }, + { + "role": "assistant", + "content": "Hi, may I help you?", + "time": "1724915249" + } + ] + }, + "feedback_data": { + "comment": "Fair and Moderate answer", + "rating": 2, + "is_thumbs_up": true + }, + "feedback_id": "{feedback_id of the data that wanted to update}"}' + + # Just include any feedback_data field value that you wanted to update. 
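+  # feedback_id is the value returned by the /v1/feedback/create call above.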
+ ``` + +- Retrieve feedback data by user + + ```bash + curl -X 'POST' \ + http://${host_ip}:6016/v1/feedback/get \ + -H 'accept: application/json' \ + -H 'Content-Type: application/json' \ + -d '{ + "user": "test"}' + ``` + +- Retrieve feedback data by feedback_id + + ```bash + curl -X 'POST' \ + http://${host_ip}:6016/v1/feedback/get \ + -H 'accept: application/json' \ + -H 'Content-Type: application/json' \ + -d '{ + "user": "test", "feedback_id":"{feedback_id returned from save feedback route above}"}' + ``` + +- Delete feedback data by feedback_id + + ```bash + curl -X 'POST' \ + http://${host_ip}:6016/v1/feedback/delete \ + -H 'accept: application/json' \ + -H 'Content-Type: application/json' \ + -d '{ + "user": "test", "feedback_id":"{feedback_id to be deleted}"}' + ``` diff --git a/comps/feedback_management/arango/arango_conn.py b/comps/feedback_management/arango/arango_conn.py new file mode 100644 index 000000000..d6c4b5977 --- /dev/null +++ b/comps/feedback_management/arango/arango_conn.py @@ -0,0 +1,32 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +from arango import ArangoClient as PythonArangoClient +from arango.database import StandardDatabase +from config import ARANGO_HOST, ARANGO_PASSWORD, ARANGO_PORT, ARANGO_PROTOCOL, ARANGO_USERNAME, DB_NAME + + +class ArangoClient: + conn_url = f"{ARANGO_PROTOCOL}://{ARANGO_HOST}:{ARANGO_PORT}/" + + @staticmethod + def get_db_client() -> StandardDatabase: + try: + # Create client + client = PythonArangoClient(hosts=ArangoClient.conn_url) + + # First connect to _system database + sys_db = client.db("_system", username=ARANGO_USERNAME, password=ARANGO_PASSWORD, verify=True) + + # Create target database if it doesn't exist + if not sys_db.has_database(DB_NAME): + sys_db.create_database(DB_NAME) + + # Now connect to the target database + db = client.db(DB_NAME, username=ARANGO_USERNAME, password=ARANGO_PASSWORD, verify=True) + + return db + + except Exception as e: + print(e) + raise e diff --git a/comps/feedback_management/arango/arango_store.py b/comps/feedback_management/arango/arango_store.py new file mode 100644 index 000000000..cd22b8078 --- /dev/null +++ b/comps/feedback_management/arango/arango_store.py @@ -0,0 +1,186 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +from arango_conn import ArangoClient +from config import COLLECTION_NAME +from pydantic import BaseModel + + +class FeedbackStore: + + def __init__( + self, + user: str, + ): + self.user = user + + def initialize_storage(self) -> None: + self.db_client = ArangoClient.get_db_client() + + if not self.db_client.has_collection(COLLECTION_NAME): + self.db_client.create_collection(COLLECTION_NAME) + + self.collection = self.db_client.collection(COLLECTION_NAME) + + def save_feedback(self, feedback_data: BaseModel) -> str: + """Stores a new feedback data into the storage. + + Args: + feedback_data (object): The document to be stored. + + Returns: + str: The ID of the inserted feedback data. + + Raises: + Exception: If an error occurs while storing the feedback_data. + """ + try: + model_dump = feedback_data.model_dump(by_alias=True, mode="json", exclude={"feedback_id"}) + + inserted_feedback_data = self.collection.insert(model_dump) + + feedback_id = str(inserted_feedback_data["_key"]) + + return feedback_id + + except Exception as e: + print(e) + raise Exception(e) + + def update_feedback(self, feedback_data: BaseModel) -> bool: + """Update a feedback data in the collection with given id. 
+ + Args: + feedback_id (str): The ID of the data to be updated. + updated_data (object): The data to be updated in the entry. + + Returns: + bool: True if the data updated successfully, False otherwise. + + Raises: + KeyError: If the document with ID is not found. + Exception: If the user does not match with the document user. + Exception: If an error occurs while updating the feedback data. + """ + _key = feedback_data.feedback_id + document = self.collection.get(_key) + + if document is None: + raise KeyError(f"Document with ID: {_key} not found.") + + if document["chat_data"]["user"] != self.user: + raise Exception(f"User mismatch. Document with ID: {_key} does not belong to user: {self.user}") + + try: + model_dump = feedback_data.feedback_data.model_dump(by_alias=True, mode="json") + + self.collection.update( + {"_key": _key, "feedback_data": model_dump}, + merge=True, + keep_none=False, + ) + + print(f"Updated document: {_key} !") + + return True + + except Exception as e: + print("Not able to update the data.") + print(e) + raise Exception(e) + + def get_all_feedback_of_user(self) -> list[dict]: + """Retrieves all feedback data of a user from the collection. + + Returns: + list[dict] | None: List of dict of feedback data of the user, None otherwise. + + Raises: + Exception: If there is an error while retrieving data. + """ + try: + feedback_data_list: list = [] + + # TODO: Clarify if we actually want to omit the `feedback_data` field. + # Implemented using MongoDB Feedback Management as a reference. + cursor = """ + FOR doc IN @@collection + FILTER doc.chat_data.user == @user + RETURN UNSET(doc, "feedback_data") + """ + + cursor = self.db_client.aql.execute( + cursor, bind_vars={"@collection": self.collection.name, "user": self.user} + ) + + for document in cursor: + document["feedback_id"] = str(document["_key"]) + del document["_id"] + del document["_key"] + del document["_rev"] + + feedback_data_list.append(document) + + return feedback_data_list + + except Exception as e: + print(e) + raise Exception(e) + + def get_feedback_by_id(self, feedback_id: str) -> dict | None: + """Retrieves a user feedback data from the collection based on the given feedback ID. + + Args: + feedback_id (str): The ID of the feedback data to retrieve. + + Returns: + dict | None: The user's feedback data if found, None otherwise. + + Raises: + KeyError: If document with ID is not found. + Exception: If the user does not match with the document user. + """ + response = self.collection.get(feedback_id) + + if response is None: + raise KeyError(f"Feedback with ID: {feedback_id} not found.") + + if response["chat_data"]["user"] != self.user: + raise Exception(f"User mismatch. Feedback with ID: {feedback_id} does not belong to user: {self.user}") + + del response["_id"] + del response["_key"] + del response["_rev"] + + return response + + def delete_feedback(self, feedback_id: str) -> bool: + """Delete a feedback data from collection by given feedback_id. + + Args: + feedback_id(str): The ID of the feedback data to be deleted. + + Returns: + bool: True if feedback is successfully deleted, False otherwise. + + Raises: + KeyError: If the provided feedback_id is invalid: + Exception: If the user does not match with the document user. + Exception: If any errors occurs during delete process. + """ + response = self.collection.get(feedback_id) + + if response is None: + raise KeyError(f"Feedback with ID: {feedback_id} not found.") + + if response["chat_data"]["user"] != self.user: + raise Exception(f"User mismatch. 
Feedback with ID: {feedback_id} does not belong to user: {self.user}") + + try: + response = self.collection.delete(feedback_id) + print(f"Deleted document: {feedback_id} !") + + return True + except Exception as e: + print(e) + raise Exception("Not able to delete the data.") diff --git a/comps/feedback_management/arango/config.py b/comps/feedback_management/arango/config.py new file mode 100644 index 000000000..bb790eb38 --- /dev/null +++ b/comps/feedback_management/arango/config.py @@ -0,0 +1,13 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +import os + +# ARANGO configuration +ARANGO_HOST = os.getenv("ARANGO_HOST", "localhost") +ARANGO_PORT = os.getenv("ARANGO_PORT", 8529) +ARANGO_PROTOCOL = os.getenv("ARANGO_PROTOCOL", "http") +ARANGO_USERNAME = os.getenv("ARANGO_USERNAME", "root") +ARANGO_PASSWORD = os.getenv("ARANGO_PASSWORD", "test") +DB_NAME = os.getenv("DB_NAME", "OPEA") +COLLECTION_NAME = os.getenv("COLLECTION_NAME", "Feedback") diff --git a/comps/feedback_management/arango/docker-compose-user-feedback-arango.yaml b/comps/feedback_management/arango/docker-compose-user-feedback-arango.yaml new file mode 100644 index 000000000..8f9b3a85a --- /dev/null +++ b/comps/feedback_management/arango/docker-compose-user-feedback-arango.yaml @@ -0,0 +1,38 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +version: "3" +services: + arango: + image: arangodb/arangodb:latest + container_name: arangodb + ports: + - 8529:8529 + environment: + http_proxy: ${http_proxy} + https_proxy: ${https_proxy} + no_proxy: ${no_proxy} + ARANGO_ROOT_PASSWORD: ${ARANGO_ROOT_PASSWORD} + + feedbackmanagement-arango: + image: opea/feedbackmanagement-arango:latest + container_name: feedbackmanagement-arango-server + ports: + - "6016:6016" + ipc: host + environment: + http_proxy: ${http_proxy} + https_proxy: ${https_proxy} + no_proxy: ${no_proxy} + ARANGO_HOST: ${ARANGO_HOST} + ARANGO_PORT: ${ARANGO_PORT} + ARANGO_PROTOCOL: ${ARANGO_PROTOCOL} + ARANGO_USERNAME: ${ARANGO_USERNAME} + ARANGO_PASSWORD: ${ARANGO_PASSWORD} + DB_NAME: ${DB_NAME} + COLLECTION_NAME: ${COLLECTION_NAME} + restart: unless-stopped + +networks: + default: + driver: bridge diff --git a/comps/feedback_management/arango/feedback.py b/comps/feedback_management/arango/feedback.py new file mode 100644 index 000000000..f1efa6f43 --- /dev/null +++ b/comps/feedback_management/arango/feedback.py @@ -0,0 +1,172 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +import os +from typing import Annotated, Optional + +from arango_store import FeedbackStore +from fastapi import HTTPException +from pydantic import BaseModel, Field + +from comps import CustomLogger +from comps.cores.mega.micro_service import opea_microservices, register_microservice +from comps.cores.proto.api_protocol import ChatCompletionRequest + +logger = CustomLogger("feedback_arango") +logflag = os.getenv("LOGFLAG", False) + + +class FeedbackData(BaseModel): + """This class represents the data model of FeedbackData collected to store in database.". + + Attributes: + is_thumbs_up (bool): True if the response is satisfy, False otherwise. + rating: (int)[Optional]: Score rating. Range from 0 (bad rating) to 5(good rating). + comment (str)[Optional]: Comment given for response. 
+ """ + + is_thumbs_up: bool + rating: Annotated[Optional[int], Field(ge=0, le=5)] = None + comment: Optional[str] = None + + +class ChatFeedback(BaseModel): + """This class represents the model for chat to collect FeedbackData together with ChatCompletionRequest data to store in database. + + Attributes: + chat_data (ChatCompletionRequest): ChatCompletionRequest object containing chat data to be stored. + feedback_data (FeedbackData): FeedbackData object containing feedback data for chat to be stored. + chat_id (str)[Optional]: The chat_id associated to the chat to be store together with feedback data. + feedback_id (str)[Optional]: The feedback_id of feedback data to be retrieved from database. + """ + + chat_data: ChatCompletionRequest + feedback_data: FeedbackData + chat_id: Optional[str] = None + feedback_id: Optional[str] = None + + +class FeedbackId(BaseModel): + """This class represent the data model for retrieve feedback data stored in database. + + Attributes: + user (str): The user of the requested feedback data. + feedback_id (str): The feedback_id of feedback data to be retrieved from database. + """ + + user: str + feedback_id: Optional[str] = None + + +@register_microservice( + name="opea_service@feedback_arango", + endpoint="/v1/feedback/create", + host="0.0.0.0", + input_datatype=FeedbackData, + port=6016, +) +async def create_feedback_data(feedback: ChatFeedback): + """Creates and stores a feedback data in database. + + Args: + feedback (ChatFeedback): The ChatFeedback class object containing feedback data to be stored. + + Returns: + response (str/bool): FeedbackId of the object created in database. True if data update successfully. + """ + if logflag: + logger.info(feedback) + + try: + feedback_store = FeedbackStore(feedback.chat_data.user) + feedback_store.initialize_storage() + if feedback.feedback_id is None: + response = feedback_store.save_feedback(feedback) + else: + response = feedback_store.update_feedback(feedback) + + if logflag: + logger.info(response) + return response + + except Exception as e: + logger.info(f"An error occurred: {str(e)}") + raise HTTPException(status_code=500, detail=str(e)) + + +@register_microservice( + name="opea_service@feedback_arango", + endpoint="/v1/feedback/get", + host="0.0.0.0", + input_datatype=FeedbackId, + port=6016, +) +async def get_feedback(feedback: FeedbackId): + """Retrieves feedback_data from feedback store based on provided FeedbackId or user. + + Args: + feedback (FeedbackId): The FeedbackId object containing user and feedback_id or chat_id. + + Returns: + JSON: Retrieved feedback data if successful, error otherwise. + """ + if logflag: + logger.info(feedback) + + try: + feedback_store = FeedbackStore(feedback.user) + feedback_store.initialize_storage() + if feedback.feedback_id: + response = feedback_store.get_feedback_by_id(feedback.feedback_id) + else: + response = feedback_store.get_all_feedback_of_user() + + if logflag: + logger.info(response) + + return response + + except Exception as e: + logger.info(f"An error occurred: {str(e)}") + raise HTTPException(status_code=500, detail=str(e)) + + +@register_microservice( + name="opea_service@feedback_arango", + endpoint="/v1/feedback/delete", + host="0.0.0.0", + input_datatype=FeedbackId, + port=6016, +) +async def delete_feedback(feedback: FeedbackId): + """Delete a feedback data from feedback store by given feedback Id. 
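+
+    The request must include `feedback_id` (deleting all feedback of a user in one call is not supported);
+    an illustrative payload is {"user": "test", "feedback_id": "<_key returned by /v1/feedback/create>"}.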
+ + Args: + feedback (FeedbackId): The FeedbackId object containing user and feedback_id or chat_id + + Returns: + Result of deletion if successful, None otherwise. + """ + if logflag: + logger.info(feedback) + + try: + feedback_store = FeedbackStore(feedback.user) + feedback_store.initialize_storage() + if feedback.feedback_id is None: + raise Exception("feedback_id is required.") + else: + response = feedback_store.delete_feedback(feedback.feedback_id) + + if logflag: + logger.info(response) + + return response + + except Exception as e: + logger.info(f"An error occurred: {str(e)}") + raise HTTPException(status_code=500, detail=str(e)) + + +if __name__ == "__main__": + opea_microservices["opea_service@feedback_arango"].start() diff --git a/comps/feedback_management/arango/requirements.txt b/comps/feedback_management/arango/requirements.txt new file mode 100644 index 000000000..9e5d0de8e --- /dev/null +++ b/comps/feedback_management/arango/requirements.txt @@ -0,0 +1 @@ +python-arango \ No newline at end of file diff --git a/comps/prompt_registry/README.md b/comps/prompt_registry/README.md index 6332a1a13..a99b1b27b 100644 --- a/comps/prompt_registry/README.md +++ b/comps/prompt_registry/README.md @@ -19,3 +19,7 @@ The Prompt Registry microservice able to support various database backends for s ### Prompt Registry with MongoDB For more detail, please refer to this [README](./mongo/README.md) + +### Prompt Registry with ArangoDB + +For more detail, please refer to this [README](./arango/README.md) diff --git a/comps/prompt_registry/arango/DockerFile b/comps/prompt_registry/arango/DockerFile new file mode 100644 index 000000000..065920205 --- /dev/null +++ b/comps/prompt_registry/arango/DockerFile @@ -0,0 +1,30 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +FROM python:3.11-slim + +ENV LANG=C.UTF-8 + +RUN apt-get update -y && apt-get install -y --no-install-recommends --fix-missing \ + build-essential \ + libgl1-mesa-glx \ + libjemalloc-dev + +RUN useradd -m -s /bin/bash user && \ + mkdir -p /home/user && \ + chown -R user /home/user/ + +USER user + +COPY comps /home/user/comps +COPY requirements.txt /home/user/ + +RUN pip install --no-cache-dir --upgrade pip setuptools && \ + pip install --no-cache-dir -r /home/user/comps/prompt_registry/arango/requirements.txt && \ + pip install --no-cache-dir -r /home/user/requirements.txt + +ENV PYTHONPATH=$PYTHONPATH:/home/user + +WORKDIR /home/user/comps/prompt_registry/arango + +ENTRYPOINT ["python", "prompt.py"] diff --git a/comps/prompt_registry/arango/README.md b/comps/prompt_registry/arango/README.md new file mode 100644 index 000000000..e4bdd6c10 --- /dev/null +++ b/comps/prompt_registry/arango/README.md @@ -0,0 +1,120 @@ +# 🧾 Prompt Registry Microservice with ArangoDB + +This README provides setup guides and all the necessary information about the Prompt Registry microservice with ArangoDB database. + +--- + +## Setup Environment Variables + +See `config.py` for default values. 
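+
+As a quick sanity check that these settings are valid, you can connect with `python-arango` directly. The sketch below mirrors what `arango_conn.py` does at service startup; the defaults shown are the ones shipped in this PR (a local ArangoDB reachable as root/test):
+
+```python
+import os
+from arango import ArangoClient
+
+# Defaults mirror config.py: a local ArangoDB instance with root/test credentials.
+url = f"{os.getenv('ARANGO_PROTOCOL', 'http')}://{os.getenv('ARANGO_HOST', 'localhost')}:{os.getenv('ARANGO_PORT', 8529)}"
+client = ArangoClient(hosts=url)
+sys_db = client.db("_system", username=os.getenv("ARANGO_USERNAME", "root"), password=os.getenv("ARANGO_PASSWORD", "test"), verify=True)
+print(sys_db.version())  # raises if the server is unreachable or the credentials are wrong
+```
+
+Then export whichever values differ from the defaults: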
+ +```bash +export ARANGO_HOST=${ARANGO_HOST} +export ARANGO_PORT=${ARANGO_PORT} +export ARANGO_PROTOCOL=${ARANGO_PROTOCOL} +export ARANGO_USERNAME=${ARANGO_USERNAME} +export ARANGO_PASSWORD=${ARANGO_PASSWORD} +export DB_NAME=${DB_NAME} +export COLLECTION_NAME=${COLLECTION_NAME} +``` + +--- + +## 🚀Start Microservice with Docker + +### Build Docker Image + +```bash +cd ~/GenAIComps +docker build -t opea/promptregistry-arango-server:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/prompt_registry/arango/Dockerfile . +``` + +### Run Docker with CLI + + +- Run ArangoDB image container + + ```bash + docker run -d -p 8529:8529 --name=arango arangodb/arangodb:latest + ``` + +- Run Prompt Registry microservice + + ```bash + docker run -d -p 6018:6018 \ + --name="promptregistry-arango-server" \ + -e http_proxy=$http_proxy \ + -e https_proxy=$https_proxy \ + -e no_proxy=$no_proxy \ + -e ARANGO_HOST=${ARANGO_HOST} \ + -e ARANGO_PORT=${ARANGO_PORT} \ + -e ARANGO_PROTOCOL=${ARANGO_PROTOCOL} \ + -e ARANGO_USERNAME=${ARANGO_USERNAME} \ + -e ARANGO_PASSWORD=${ARANGO_PASSWORD} \ + -e DB_NAME=${DB_NAME} \ + -e COLLECTION_NAME=${COLLECTION_NAME} \ + opea/promptregistry-arango-server:latest + + ``` + +--- + +### ✅ Invoke Microservice + +The Prompt Registry microservice exposes the following API endpoints: + +- Save prompt + + ```bash + curl -X 'POST' \ + http://${host_ip}:6018/v1/prompt/create \ + -H 'accept: application/json' \ + -H 'Content-Type: application/json' \ + -d '{ + "prompt_text": "test prompt", "user": "test" + }' + ``` + +- Retrieve prompt from database by user + + ```bash + curl -X 'POST' \ + http://${host_ip}:6018/v1/prompt/get \ + -H 'accept: application/json' \ + -H 'Content-Type: application/json' \ + -d '{ + "user": "test"}' + ``` + +- Retrieve prompt from database by prompt_id + + ```bash + curl -X 'POST' \ + http://${host_ip}:6018/v1/prompt/get \ + -H 'accept: application/json' \ + -H 'Content-Type: application/json' \ + -d '{ + "user": "test", "prompt_id":"{_id returned from save prompt route above}"}' + ``` + +- Retrieve relevant prompt by keyword + + ```bash + curl -X 'POST' \ + http://${host_ip}:6018/v1/prompt/get \ + -H 'accept: application/json' \ + -H 'Content-Type: application/json' \ + -d '{ + "user": "test", "prompt_text": "{keyword to search}"}' + ``` + +- Delete prompt by prompt_id + + ```bash + curl -X 'POST' \ + http://${host_ip}:6018/v1/prompt/delete \ + -H 'accept: application/json' \ + -H 'Content-Type: application/json' \ + -d '{ + "user": "test", "prompt_id":"{prompt_id to be deleted}"}' + ``` diff --git a/comps/prompt_registry/arango/arango_conn.py b/comps/prompt_registry/arango/arango_conn.py new file mode 100644 index 000000000..d6c4b5977 --- /dev/null +++ b/comps/prompt_registry/arango/arango_conn.py @@ -0,0 +1,32 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +from arango import ArangoClient as PythonArangoClient +from arango.database import StandardDatabase +from config import ARANGO_HOST, ARANGO_PASSWORD, ARANGO_PORT, ARANGO_PROTOCOL, ARANGO_USERNAME, DB_NAME + + +class ArangoClient: + conn_url = f"{ARANGO_PROTOCOL}://{ARANGO_HOST}:{ARANGO_PORT}/" + + @staticmethod + def get_db_client() -> StandardDatabase: + try: + # Create client + client = PythonArangoClient(hosts=ArangoClient.conn_url) + + # First connect to _system database + sys_db = client.db("_system", username=ARANGO_USERNAME, password=ARANGO_PASSWORD, verify=True) + + # Create target database if it doesn't exist + if not 
sys_db.has_database(DB_NAME): + sys_db.create_database(DB_NAME) + + # Now connect to the target database + db = client.db(DB_NAME, username=ARANGO_USERNAME, password=ARANGO_PASSWORD, verify=True) + + return db + + except Exception as e: + print(e) + raise e diff --git a/comps/prompt_registry/arango/arango_store.py b/comps/prompt_registry/arango/arango_store.py new file mode 100644 index 000000000..fb80ccd20 --- /dev/null +++ b/comps/prompt_registry/arango/arango_store.py @@ -0,0 +1,213 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +import os + +from arango.exceptions import IndexGetError +from arango_conn import ArangoClient +from config import COLLECTION_NAME +from prompt import PromptCreate +from pydantic import BaseModel + +from comps import CustomLogger + +logger = CustomLogger("arango_store") +logflag = os.getenv("LOGFLAG", False) + + +class PromptStore: + + def __init__( + self, + user: str, + ): + self.user = user + self.inverted_index_exists = False + + def initialize_storage(self) -> None: + self.db_client = ArangoClient.get_db_client() + + if not self.db_client.has_collection(COLLECTION_NAME): + self.db_client.create_collection(COLLECTION_NAME) + + self.collection = self.db_client.collection(COLLECTION_NAME) + + def save_prompt(self, prompt: PromptCreate): + """Stores a new prompt into the storage. + + Args: + prompt: The document to be stored. It should be a Pydantic model. + + Returns: + str: The ID of the inserted prompt. + + Raises: + Exception: If an error occurs while storing the prompt. + """ + try: + model_dump = prompt.model_dump(by_alias=True, mode="json", exclude={"id"}) + + inserted_prompt_data = self.collection.insert(model_dump) + + prompt_id = str(inserted_prompt_data["_key"]) + + return prompt_id + + except Exception as e: + print(e) + raise Exception(e) + + def get_all_prompt_of_user(self) -> list[dict]: + """Retrieves all prompts of a user from the collection. + + Returns: + list[dict] | None: List of dict of prompts of the user, None otherwise. + + Raises: + Exception: If there is an error while retrieving data. + """ + try: + prompt_data_list: list = [] + + # TODO: Clarify if we actually want to omit the `data` field. + # Implemented using MongoDB Prompt Registry as a reference. + cursor = """ + FOR doc IN @@collection + FILTER doc.chat_data.user == @user + RETURN UNSET(doc, "data") + """ + + cursor = self.db_client.aql.execute( + cursor, bind_vars={"@collection": self.collection.name, "user": self.user} + ) + + for document in cursor: + document["id"] = str(document["_key"]) + del document["_id"] + del document["_key"] + del document["_rev"] + + prompt_data_list.append(document) + + return prompt_data_list + + except Exception as e: + print(e) + raise Exception(e) + + def get_user_prompt_by_id(self, prompt_id: str) -> dict | None: + """Retrieves a user prompt from the collection based on the given prompt ID. + + Args: + prompt_id (str): The ID of the prompt to retrieve. + + Returns: + dict | None: The user prompt if found, None otherwise. + + Raises: + KeyError: If document with ID is not found. + Exception: If the user does not match with the document user. + """ + response = self.collection.get(prompt_id) + + if response is None: + raise KeyError(f"Prompt with ID: {prompt_id} not found.") + + if response["user"] != self.user: + raise Exception(f"User mismatch. 
Prompt with ID: {prompt_id} does not belong to user: {self.user}") + + del response["_id"] + del response["_key"] + del response["_rev"] + + return response + + def prompt_search(self, keyword: str) -> list | None: + """Retrieves prompt from the collection based on keyword provided. + + Args: + keyword (str): The keyword of prompt to search for. + + Returns: + list | None: The list of relevant prompt if found, None otherwise. + + Raises: + Exception: If there is an error while searching data. + """ + try: + index_name = "prompt_text_index" + + if not self.inverted_index_exists: + try: + self.collection.get_index(index_name) + + except IndexGetError: + self.collection.add_inverted_index( + fields=["prompt_text"], + name=index_name, + # TODO: add more kwargs if needed + ) + + self.inverted_index_exists = True + + query = """ + FOR doc IN @@collection + OPTIONS { indexHint: @index_name, forceIndexHint: true } + FILTER PHRASE(doc.prompt_text, @keyword, "text_en") + RETURN doc + """ + + cursor = self.db_client.aql.execute( + query, + bind_vars={ + "@collection": self.collection.name, + "index_name": index_name, + "keyword": keyword, + }, + ) + + serialized_data = [] + for doc in cursor: + doc["id"] = str(doc["_key"]) + del doc["_id"] + del doc["_key"] + del doc["_rev"] + + serialized_data.append(doc) + + return serialized_data + + except Exception as e: + print(e) + raise Exception(e) + + def delete_prompt(self, prompt_id: str) -> bool: + """Delete a prompt from collection by given prompt_id. + + Args: + prompt_id(str): The ID of the prompt to be deleted. + + Returns: + bool: True if prompt is successfully deleted, False otherwise. + + Raises: + KeyError: If the provided feedback_id is invalid: + Exception: If the user does not match with the document user. + Exception: If any errors occurs during delete process. + """ + response = self.collection.get(prompt_id) + + if response is None: + raise KeyError(f"Feedback with ID: {prompt_id} not found.") + + if response["user"] != self.user: + raise Exception(f"User mismatch. 
Feedback with ID: {prompt_id} does not belong to user: {self.user}") + + try: + response = self.collection.delete(prompt_id) + print(f"Deleted document: {prompt_id} !") + + return True + except Exception as e: + print(e) + raise Exception("Not able to delete the data.") diff --git a/comps/prompt_registry/arango/config.py b/comps/prompt_registry/arango/config.py new file mode 100644 index 000000000..e597df0fb --- /dev/null +++ b/comps/prompt_registry/arango/config.py @@ -0,0 +1,13 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +import os + +# ARANGO configuration +ARANGO_HOST = os.getenv("ARANGO_HOST", "localhost") +ARANGO_PORT = os.getenv("ARANGO_PORT", 8529) +ARANGO_PROTOCOL = os.getenv("ARANGO_PROTOCOL", "http") +ARANGO_USERNAME = os.getenv("ARANGO_USERNAME", "root") +ARANGO_PASSWORD = os.getenv("ARANGO_PASSWORD", "test") +DB_NAME = os.getenv("DB_NAME", "OPEA") +COLLECTION_NAME = os.getenv("COLLECTION_NAME", "Prompt") diff --git a/comps/prompt_registry/arango/docker-compose-prompt-registry-arango.yaml b/comps/prompt_registry/arango/docker-compose-prompt-registry-arango.yaml new file mode 100644 index 000000000..b1aee077d --- /dev/null +++ b/comps/prompt_registry/arango/docker-compose-prompt-registry-arango.yaml @@ -0,0 +1,38 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +version: "3" +services: + arango: + image: arangodb/arangodb:latest + container_name: arangodb + ports: + - 8529:8529 + environment: + http_proxy: ${http_proxy} + https_proxy: ${https_proxy} + no_proxy: ${no_proxy} + ARANGO_ROOT_PASSWORD: ${ARANGO_ROOT_PASSWORD} + + promptregistry-arango: + image: opea/promptregistry-arango:latest + container_name: promptregistry-arango-server + ports: + - "6018:6018" + ipc: host + environment: + http_proxy: ${http_proxy} + https_proxy: ${https_proxy} + no_proxy: ${no_proxy} + ARANGO_HOST: ${ARANGO_HOST} + ARANGO_PORT: ${ARANGO_PORT} + ARANGO_PROTOCOL: ${ARANGO_PROTOCOL} + ARANGO_USERNAME: ${ARANGO_USERNAME} + ARANGO_PASSWORD: ${ARANGO_PASSWORD} + DB_NAME: ${DB_NAME} + COLLECTION_NAME: ${COLLECTION_NAME} + restart: unless-stopped + +networks: + default: + driver: bridge diff --git a/comps/prompt_registry/arango/prompt.py b/comps/prompt_registry/arango/prompt.py new file mode 100644 index 000000000..c46e0174c --- /dev/null +++ b/comps/prompt_registry/arango/prompt.py @@ -0,0 +1,148 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +import os +from typing import Optional + +from arango_store import PromptStore +from pydantic import BaseModel + +from comps import CustomLogger +from comps.cores.mega.micro_service import opea_microservices, register_microservice + +logger = CustomLogger("prompt_arango") +logflag = os.getenv("LOGFLAG", False) + + +class PromptCreate(BaseModel): + """This class represents the data model for creating and storing a new prompt in the database. + + Attributes: + prompt_text (str): The text content of the prompt. + user (str): The user or creator of the prompt. + """ + + prompt_text: str + user: str + + +class PromptId(BaseModel): + """This class represent the data model for retrieve prompt stored in database. + + Attributes: + user (str): The user of the requested prompt. + prompt_id (str): The prompt_id of prompt to be retrieved from database. 
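+        prompt_text (str)[Optional]: A keyword to search prompts by; when set (and prompt_id is not), /v1/prompt/get runs a text search instead of an ID lookup.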
+ """ + + user: str + prompt_id: Optional[str] = None + prompt_text: Optional[str] = None + + +@register_microservice( + name="opea_service@prompt_arango", + endpoint="/v1/prompt/create", + host="0.0.0.0", + input_datatype=PromptCreate, + port=6018, +) +async def create_prompt(prompt: PromptCreate): + """Creates and stores a prompt in prompt store. + + Args: + prompt (PromptCreate): The PromptCreate class object containing the data to be stored. + + Returns: + JSON (PromptResponse): PromptResponse class object, None otherwise. + """ + if logflag: + logger.info(prompt) + + try: + prompt_store = PromptStore(prompt.user) + prompt_store.initialize_storage() + response = prompt_store.save_prompt(prompt) + if logflag: + logger.info(response) + + return response + + except Exception as error: + logger.error(f"An error occurred: {str(error)}") + raise error + + +@register_microservice( + name="opea_service@prompt_arango", + endpoint="/v1/prompt/get", + host="0.0.0.0", + input_datatype=PromptId, + port=6018, +) +async def get_prompt(prompt: PromptId): + """Retrieves prompt from prompt store based on provided PromptId or user. + + Args: + prompt (PromptId): The PromptId object containing user and prompt_id. + + Returns: + JSON: Retrieved prompt data if successful, None otherwise. + """ + if logflag: + logger.info(prompt) + try: + + prompt_store = PromptStore(prompt.user) + prompt_store.initialize_storage() + + if prompt.prompt_id is not None: + response = prompt_store.get_user_prompt_by_id(prompt.prompt_id) + elif prompt.prompt_text: + response = prompt_store.prompt_search(prompt.prompt_text) + else: + response = prompt_store.get_all_prompt_of_user() + if logflag: + logger.info(response) + return response + + except Exception as error: + logger.error(f"An error occurred: {str(error)}") + raise error + + +@register_microservice( + name="opea_service@prompt_arango", + endpoint="/v1/prompt/delete", + host="0.0.0.0", + input_datatype=PromptId, + port=6018, +) +async def delete_prompt(prompt: PromptId): + """Delete a prompt from prompt store by given PromptId. + + Args: + prompt (PromptId): The PromptId object containing user and prompt_id. + + Returns: + Result of deletion if successful, None otherwise. 
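+
+    Example payload (illustrative): {"user": "test", "prompt_id": "<prompt_id returned by /v1/prompt/create>"}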
+ """ + if logflag: + logger.info(prompt) + try: + prompt_store = PromptStore(prompt.user) + prompt_store.initialize_storage() + if prompt.prompt_id is None: + raise Exception("Prompt id is required.") + else: + response = prompt_store.delete_prompt(prompt.prompt_id) + if logflag: + logger.info(response) + return response + + except Exception as error: + logger.error(f"An error occurred: {str(error)}") + raise error + + +if __name__ == "__main__": + opea_microservices["opea_service@prompt_arango"].start() diff --git a/comps/prompt_registry/arango/requirements.txt b/comps/prompt_registry/arango/requirements.txt new file mode 100644 index 000000000..9e5d0de8e --- /dev/null +++ b/comps/prompt_registry/arango/requirements.txt @@ -0,0 +1 @@ +python-arango \ No newline at end of file diff --git a/langchain_test.py b/langchain_test.py new file mode 100644 index 000000000..e33ea1687 --- /dev/null +++ b/langchain_test.py @@ -0,0 +1,101 @@ +from arango import ArangoClient +from langchain_community.chains.graph_qa.arangodb import ArangoGraphQAChain +from langchain_community.embeddings import OpenAIEmbeddings +from langchain_community.graphs.arangodb_graph import ArangoGraph +from langchain_community.graphs.graph_document import GraphDocument, Node, Relationship +from langchain_community.vectorstores.arangodb_vector import ArangoVector +from langchain_core.documents import Document +from langchain_openai import OpenAI + +system_db = ArangoClient().db("_system", password="test", verify=True) +system_db.delete_database("langchain_test", ignore_missing=True) +system_db.create_database("langchain_test") +db = ArangoClient().db("langchain_test", password="test", verify=True) + +#################### +# Test ArangoGraph # +#################### + +# Create nodes +node1 = Node(id="1", type="Person", properties={"name": "John", "age": 30}) +node2 = Node(id="2", type="Person", properties={"name": "Jane", "age": 28}) +node3 = Node(id="3", type="Club", properties={"name": "Karate Club"}) + +# Create relationships +relationship1 = Relationship(source=node1, target=node3, type="MEMBER_OF", properties={"joined_date": "2020-01-01"}) +relationship2 = Relationship(source=node2, target=node3, type="MEMBER_OF", properties={"joined_date": "2019-05-15"}) +relationship3 = Relationship(source=node1, target=node2, type="KNOWS", properties={"since": "2018-03-10"}) + +# Create source document +source_doc = Document( + page_content="John and Jane are members of the Karate Club. They know each other.", + metadata={"source": "club_records"}, +) + +# Create GraphDocument +graph_doc = GraphDocument( + nodes=[node1, node2, node3], relationships=[relationship1, relationship2, relationship3], source=source_doc +) + +arango_graph = ArangoGraph(db=db, include_examples=False) +arango_graph.add_graph_documents([graph_doc], graph_name="NewGraph", include_source=True) + +##################### +# Test ArangoVector # +##################### + +# Add some sample texts +texts = [ + "The quick brown fox jumps over the lazy dog", + "A journey of a thousand miles begins with a single step", + "To be or not to be, that is the question", + "All that glitters is not gold", + "hello what's up", +] + +vector_store = ArangoVector.from_texts( + texts, + OpenAIEmbeddings(), + database=db, + collection_name="vector_test", + index_name="vector_index", + distance_strategy="COSINE", +) + +texts_2 = ["the dog, cat, and mouse are all mammals"] +vector_store.add_texts(texts_2) + +# Perform a similarity search +query = "What animal is mentioned?" 
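+# similarity_search_with_score embeds the query with the same OpenAIEmbeddings instance used at
+# indexing time and returns (Document, score) pairs ranked by the COSINE distance configured above.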
+results = vector_store.similarity_search_with_score(query, k=2) + +print("Search results for query:", query) +for doc, score in results: + print(f"Content: {doc.page_content}") + print(f"Metadata: {doc.metadata}") + print(f"Score: {score}") + print("---") + +# Try another query +query2 = "What's a famous Shakespeare quote?" +results2 = vector_store.similarity_search_with_score(query2, k=1) + +print("\nSearch results for query:", query2) +for doc, score in results2: + print(f"Content: {doc.page_content}") + print(f"Metadata: {doc.metadata}") + print(f"Score: {score}") + print("---") + +########################### +# Test ArangoGraphQAChain # +########################### + +llm = OpenAI(temperature=0) +graph = ArangoGraph(db=db, include_examples=False, graph_name="NewGraph") +chain = ArangoGraphQAChain.from_llm(llm, graph=graph, allow_dangerous_requests=True) +chain.verbose = True +chain.execute_aql_query = False +chain.run("What is the name of the club?") +chain.execute_aql_query = True +chain.run("What is the name of the club?") diff --git a/tests/chathistory/test_chathistory_arango.sh b/tests/chathistory/test_chathistory_arango.sh new file mode 100644 index 000000000..50481262f --- /dev/null +++ b/tests/chathistory/test_chathistory_arango.sh @@ -0,0 +1,91 @@ +#!/bin/bash +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +set -x + +WORKPATH=$(dirname "$PWD") +ip_address=$(hostname -I | awk '{print $1}') + +export ARANGO_HOST=${ip_address} +export ARANGO_PORT=8529 +export ARANGO_PROTOCOL=${ARANGO_PROTOCOL:-"http"} +export ARANGO_USERNAME=${ARANGO_USERNAME:-"root"} +export ARANGO_PASSWORD=${ARANGO_PASSWORD:-"test"} +export DB_NAME=${DB_NAME:-"Conversations"} +export COLLECTION_NAME=${COLLECTION_NAME:-"test"} + +function build_docker_images() { + cd $WORKPATH + echo $(pwd) + docker run -d -p 8529:8529 --name=test-comps-arango arangodb/arangodb:latest + + docker build --no-cache -t opea/chathistory-arango-server:comps --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/chathistory/arango/Dockerfile . + if [ $? -ne 0 ]; then + echo "opea/chathistory-arango-server built fail" + exit 1 + else + echo "opea/chathistory-arango-server built successful" + fi +} + +function start_service() { + + docker run -d --name="test-comps-chathistory-arango-server" \ + -p 6012:6012 \ + -e http_proxy=$http_proxy \ + -e https_proxy=$https_proxy \ + -e no_proxy=$no_proxy \ + -e ARANGO_HOST=${ARANGO_HOST} \ + -e ARANGO_PORT=${ARANGO_PORT} \ + -e ARANGO_PROTOCOL=${ARANGO_PROTOCOL} \ + -e ARANGO_USERNAME=${ARANGO_USERNAME} \ + -e ARANGO_PASSWORD=${ARANGO_PASSWORD} \ + -e DB_NAME=${DB_NAME} \ + -e COLLECTION_NAME=${COLLECTION_NAME} \ + opea/chathistory-arango-server:comps + + sleep 10s +} + +function validate_microservice() { + result=$(curl -X 'POST' \ + http://${ip_address}:6012/v1/chathistory/create \ + -H 'accept: application/json' \ + -H 'Content-Type: application/json' \ + -d '{ + "data": { + "messages": "test Messages", "user": "test" + } +}') + echo $result + if [[ ${#result} -eq 26 ]]; then + echo "Result correct." + else + echo "Result wrong." + docker logs test-comps-chathistory-arango-server + exit 1 + fi + +} + +function stop_docker() { + cid=$(docker ps -aq --filter "name=test-comps*") + if [[ ! 
-z "$cid" ]]; then docker stop $cid && docker rm $cid && sleep 1s; fi +} + +function main() { + + stop_docker + + build_docker_images + start_service + + validate_microservice + + stop_docker + echo y | docker system prune + +} + +main diff --git a/tests/feedback_management/test_feedback_management_arango.sh b/tests/feedback_management/test_feedback_management_arango.sh new file mode 100644 index 000000000..925555030 --- /dev/null +++ b/tests/feedback_management/test_feedback_management_arango.sh @@ -0,0 +1,113 @@ +#!/bin/bash +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +set -xe + +WORKPATH=$(dirname "$PWD") +ip_address=$(hostname -I | awk '{print $1}') + +export ARANGO_HOST=${ip_address} +export ARANGO_PORT=8529 +export ARANGO_PROTOCOL=${ARANGO_PROTOCOL:-"http"} +export ARANGO_USERNAME=${ARANGO_USERNAME:-"root"} +export ARANGO_PASSWORD=${ARANGO_PASSWORD:-"test"} +export DB_NAME=${DB_NAME:-"Feedback"} +export COLLECTION_NAME=${COLLECTION_NAME:-"test"} + +function build_docker_images() { + cd $WORKPATH + echo $(pwd) + docker run -d -p 8529:8529 --name=test-comps-arango arangodb/arangodb:latest + + docker build --no-cache -t opea/feedbackmanagement-arango-server:comps --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/feedback_management/arango/Dockerfile . + if [ $? -ne 0 ]; then + echo "opea/feedbackmanagement-arango-server built fail" + exit 1 + else + echo "opea/feedbackmanagement-arango-server built successful" + fi +} + +function start_service() { + + docker run -d --name="test-comps-feedbackmanagement-arango-server" \ + -p 6016:6016 \ + -e http_proxy=$http_proxy \ + -e https_proxy=$https_proxy \ + -e no_proxy=$no_proxy \ + -e ARANGO_HOST=${ARANGO_HOST} \ + -e ARANGO_PORT=${ARANGO_PORT} \ + -e ARANGO_PROTOCOL=${ARANGO_PROTOCOL} \ + -e ARANGO_USERNAME=${ARANGO_USERNAME} \ + -e ARANGO_PASSWORD=${ARANGO_PASSWORD} \ + -e DB_NAME=${DB_NAME} \ + -e COLLECTION_NAME=${COLLECTION_NAME} \ + opea/feedbackmanagement-arango-server:comps + + sleep 10s +} + +function validate_microservice() { + result=$(curl -X 'POST' \ + http://$ip_address:6016/v1/feedback/create \ + -H 'accept: application/json' \ + -H 'Content-Type: application/json' \ + -d '{ + "chat_id": "66445d4f71c7eff23d44f78d", + "chat_data": { + "user": "test", + "messages": [ + { + "role": "system", + "content": "You are helpful assistant" + }, + { + "role": "user", + "content": "hi", + "time": "1724915247" + }, + { + "role": "assistant", + "content": "Hi, may I help you?", + "time": "1724915249" + } + ] + }, + "feedback_data": { + "comment": "Moderate", + "rating": 3, + "is_thumbs_up": true + } +}') + echo $result + if [[ ${#result} -eq 26 ]]; then + echo "Correct result." + else + echo "Incorrect result." + docker logs test-comps-feedbackmanagement-arango-server + exit 1 + fi + +} + +function stop_docker() { + cid=$(docker ps -aq --filter "name=test-comps*") + if [[ ! 
-z "$cid" ]]; then docker stop $cid && docker rm $cid && sleep 1s; fi +} + +function main() { + + stop_docker + + build_docker_images + start_service + + validate_microservice + + stop_docker + echo y | docker system prune + +} + +main diff --git a/tests/prompt_registry/test_prompt_registry_arango.sh b/tests/prompt_registry/test_prompt_registry_arango.sh new file mode 100644 index 000000000..abc15ee7f --- /dev/null +++ b/tests/prompt_registry/test_prompt_registry_arango.sh @@ -0,0 +1,89 @@ +#!/bin/bash +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +set -x + +WORKPATH=$(dirname "$PWD") +ip_address=$(hostname -I | awk '{print $1}') + +export ARANGO_HOST=${ip_address} +export ARANGO_PORT=8529 +export ARANGO_PROTOCOL=${ARANGO_PROTOCOL:-"http"} +export ARANGO_USERNAME=${ARANGO_USERNAME:-"root"} +export ARANGO_PASSWORD=${ARANGO_PASSWORD:-"test"} +export DB_NAME=${DB_NAME:-"Prompts"} +export COLLECTION_NAME=${COLLECTION_NAME:-"test"} + +function build_docker_images() { + cd $WORKPATH + echo $(pwd) + docker run -d -p 8529:8529 --name=test-comps-arango arangodb/arangodb:latest + + docker build --no-cache -t opea/promptregistry-arango-server:comps --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/prompt_registry/arango/Dockerfile . + if [ $? -ne 0 ]; then + echo "opea/promptregistry-arango-server built fail" + exit 1 + else + echo "opea/promptregistry-arango-server built successful" + fi +} + +function start_service() { + + docker run -d --name="test-comps-promptregistry-arango-server" \ + -p 6018:6018 \ + -e http_proxy=$http_proxy \ + -e https_proxy=$https_proxy \ + -e no_proxy=$no_proxy \ + -e ARANGO_HOST=${ARANGO_HOST} \ + -e ARANGO_PORT=${ARANGO_PORT} \ + -e ARANGO_PROTOCOL=${ARANGO_PROTOCOL} \ + -e ARANGO_USERNAME=${ARANGO_USERNAME} \ + -e ARANGO_PASSWORD=${ARANGO_PASSWORD} \ + -e DB_NAME=${DB_NAME} \ + -e COLLECTION_NAME=${COLLECTION_NAME} \ + opea/promptregistry-arango-server:comps + + sleep 10s +} + +function validate_microservice() { + result=$(curl -X 'POST' \ + http://$ip_address:6018/v1/prompt/create \ + -H 'accept: application/json' \ + -H 'Content-Type: application/json' \ + -d '{ + "prompt_text": "test prompt", "user": "test" +}') + echo $result + if [[ ${#result} -eq 26 ]]; then + echo "Correct result." + else + echo "Incorrect result." + docker logs test-comps-promptregistry-arango-server + exit 1 + fi + +} + +function stop_docker() { + cid=$(docker ps -aq --filter "name=test-comps*") + if [[ ! -z "$cid" ]]; then docker stop $cid && docker rm $cid && sleep 1s; fi +} + +function main() { + + stop_docker + + build_docker_images + start_service + + validate_microservice + + stop_docker + echo y | docker system prune + +} + +main
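
The shell tests above only exercise the create endpoints. For reference, a minimal Python sketch of the full prompt-registry round trip (create, fetch, delete) is shown below; it assumes the promptregistry-arango server from this PR is reachable on localhost:6018, and the `requests` dependency and `HOST` value are illustrative rather than part of the PR.

```python
import requests

HOST = "http://localhost:6018"  # assumed local deployment of the prompt registry service

# Create a prompt; the service returns the ArangoDB document _key as the prompt id.
prompt_id = requests.post(
    f"{HOST}/v1/prompt/create",
    json={"prompt_text": "test prompt", "user": "test"},
).json()

# Fetch the prompt back by id; internal _id/_key/_rev fields are stripped by the service.
prompt = requests.post(
    f"{HOST}/v1/prompt/get",
    json={"user": "test", "prompt_id": prompt_id},
).json()
print(prompt["prompt_text"])

# Delete it again; the service returns true on success.
deleted = requests.post(
    f"{HOST}/v1/prompt/delete",
    json={"user": "test", "prompt_id": prompt_id},
).json()
assert deleted is True
```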