From 0e9ed3ba98be04d106ead44c692fc215944997f5 Mon Sep 17 00:00:00 2001 From: Anthony Mahanna Date: Tue, 22 Oct 2024 21:30:29 -0400 Subject: [PATCH 01/25] arangodb prep | initial commit --- .gitignore | 1 + ARANGODB_README.md | 33 +++++++++++++++ langchain_test.py | 101 +++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 135 insertions(+) create mode 100644 ARANGODB_README.md create mode 100644 langchain_test.py diff --git a/.gitignore b/.gitignore index 1d1e0a389..9778bf8f7 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,4 @@ __pycache__ *.egg-info/ .DS_Store +.venv \ No newline at end of file diff --git a/ARANGODB_README.md b/ARANGODB_README.md new file mode 100644 index 000000000..76f4b7db3 --- /dev/null +++ b/ARANGODB_README.md @@ -0,0 +1,33 @@ +Instructions + +0. Create a virtual environment: + +```bash +python -m venv .venv + +source .venv/bin/activate +``` + +1. Install the required packages: + +```bash +pip install python-arango +pip install langchain_openai +pip install git+https://github.com/arangoml/langchain.git@arangodb#subdirectory=libs/community +``` + +2. Provision the ArangoDB with Vector Index image: + +```bash +docker create --name arango-vector -p 8529:8529 -e ARANGO_ROOT_PASSWORD=test jbajic/arangodb-arm:vector-index-preview + +docker start arango-vector +``` + +3. Set your `OPENAI_API_KEY` environment variable (contact Anthony for access) + +4. 
Run the test script to confirm LangChain is working: + +```bash +python langchain_test.py +``` \ No newline at end of file diff --git a/langchain_test.py b/langchain_test.py new file mode 100644 index 000000000..e33ea1687 --- /dev/null +++ b/langchain_test.py @@ -0,0 +1,101 @@ +from arango import ArangoClient +from langchain_community.chains.graph_qa.arangodb import ArangoGraphQAChain +from langchain_community.embeddings import OpenAIEmbeddings +from langchain_community.graphs.arangodb_graph import ArangoGraph +from langchain_community.graphs.graph_document import GraphDocument, Node, Relationship +from langchain_community.vectorstores.arangodb_vector import ArangoVector +from langchain_core.documents import Document +from langchain_openai import OpenAI + +system_db = ArangoClient().db("_system", password="test", verify=True) +system_db.delete_database("langchain_test", ignore_missing=True) +system_db.create_database("langchain_test") +db = ArangoClient().db("langchain_test", password="test", verify=True) + +#################### +# Test ArangoGraph # +#################### + +# Create nodes +node1 = Node(id="1", type="Person", properties={"name": "John", "age": 30}) +node2 = Node(id="2", type="Person", properties={"name": "Jane", "age": 28}) +node3 = Node(id="3", type="Club", properties={"name": "Karate Club"}) + +# Create relationships +relationship1 = Relationship(source=node1, target=node3, type="MEMBER_OF", properties={"joined_date": "2020-01-01"}) +relationship2 = Relationship(source=node2, target=node3, type="MEMBER_OF", properties={"joined_date": "2019-05-15"}) +relationship3 = Relationship(source=node1, target=node2, type="KNOWS", properties={"since": "2018-03-10"}) + +# Create source document +source_doc = Document( + page_content="John and Jane are members of the Karate Club. 
They know each other.", + metadata={"source": "club_records"}, +) + +# Create GraphDocument +graph_doc = GraphDocument( + nodes=[node1, node2, node3], relationships=[relationship1, relationship2, relationship3], source=source_doc +) + +arango_graph = ArangoGraph(db=db, include_examples=False) +arango_graph.add_graph_documents([graph_doc], graph_name="NewGraph", include_source=True) + +##################### +# Test ArangoVector # +##################### + +# Add some sample texts +texts = [ + "The quick brown fox jumps over the lazy dog", + "A journey of a thousand miles begins with a single step", + "To be or not to be, that is the question", + "All that glitters is not gold", + "hello what's up", +] + +vector_store = ArangoVector.from_texts( + texts, + OpenAIEmbeddings(), + database=db, + collection_name="vector_test", + index_name="vector_index", + distance_strategy="COSINE", +) + +texts_2 = ["the dog, cat, and mouse are all mammals"] +vector_store.add_texts(texts_2) + +# Perform a similarity search +query = "What animal is mentioned?" +results = vector_store.similarity_search_with_score(query, k=2) + +print("Search results for query:", query) +for doc, score in results: + print(f"Content: {doc.page_content}") + print(f"Metadata: {doc.metadata}") + print(f"Score: {score}") + print("---") + +# Try another query +query2 = "What's a famous Shakespeare quote?" 
+results2 = vector_store.similarity_search_with_score(query2, k=1) + +print("\nSearch results for query:", query2) +for doc, score in results2: + print(f"Content: {doc.page_content}") + print(f"Metadata: {doc.metadata}") + print(f"Score: {score}") + print("---") + +########################### +# Test ArangoGraphQAChain # +########################### + +llm = OpenAI(temperature=0) +graph = ArangoGraph(db=db, include_examples=False, graph_name="NewGraph") +chain = ArangoGraphQAChain.from_llm(llm, graph=graph, allow_dangerous_requests=True) +chain.verbose = True +chain.execute_aql_query = False +chain.run("What is the name of the club?") +chain.execute_aql_query = True +chain.run("What is the name of the club?") From 7bd5aaacdb43a90f6b8e2cb8ea08b184a03044d2 Mon Sep 17 00:00:00 2001 From: Ajay Kallepalli <72517322+ajaykallepalli@users.noreply.github.com> Date: Mon, 25 Nov 2024 14:28:02 -0800 Subject: [PATCH 02/25] ArangoDB: Feedback management (#11) * initial commit * updating feedback management readme to match arango * Removing comments above import * Working API test and updated readme * Working docker compose file * Docker compose creating network and docker image * code review * update readme & dev yaml * delete dev files * Delete arango_store.py --------- Co-authored-by: Anthony Mahanna --- comps/feedback_management/arango/Dockerfile | 30 +++ comps/feedback_management/arango/README.md | 172 ++++++++++++++++ .../feedback_management/arango/arango_conn.py | 32 +++ .../arango/arango_store.py | 186 ++++++++++++++++++ comps/feedback_management/arango/config.py | 13 ++ .../docker-compose-user-feedback-arango.yaml | 38 ++++ comps/feedback_management/arango/feedback.py | 172 ++++++++++++++++ .../arango/requirements.txt | 1 + 8 files changed, 644 insertions(+) create mode 100644 comps/feedback_management/arango/Dockerfile create mode 100644 comps/feedback_management/arango/README.md create mode 100644 comps/feedback_management/arango/arango_conn.py create mode 100644 
comps/feedback_management/arango/arango_store.py create mode 100644 comps/feedback_management/arango/config.py create mode 100644 comps/feedback_management/arango/docker-compose-user-feedback-arango.yaml create mode 100644 comps/feedback_management/arango/feedback.py create mode 100644 comps/feedback_management/arango/requirements.txt diff --git a/comps/feedback_management/arango/Dockerfile b/comps/feedback_management/arango/Dockerfile new file mode 100644 index 000000000..95ac359e6 --- /dev/null +++ b/comps/feedback_management/arango/Dockerfile @@ -0,0 +1,30 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +FROM python:3.11-slim + +ENV LANG=C.UTF-8 + +RUN apt-get update -y && apt-get install -y --no-install-recommends --fix-missing \ + build-essential \ + libjemalloc-dev \ + libgl1-mesa-glx + +RUN useradd -m -s /bin/bash user && \ + mkdir -p /home/user && \ + chown -R user /home/user/ + +USER user + +COPY comps /home/user/comps +COPY requirements.txt /home/user/ + +RUN pip install --no-cache-dir --upgrade pip setuptools && \ + pip install --no-cache-dir -r /home/user/comps/feedback_management/arango/requirements.txt && \ + pip install --no-cache-dir -r /home/user/requirements.txt + +ENV PYTHONPATH=$PYTHONPATH:/home/user + +WORKDIR /home/user/comps/feedback_management/arango + +ENTRYPOINT ["python", "feedback.py"] diff --git a/comps/feedback_management/arango/README.md b/comps/feedback_management/arango/README.md new file mode 100644 index 000000000..8eb223ce9 --- /dev/null +++ b/comps/feedback_management/arango/README.md @@ -0,0 +1,172 @@ +# 🗨 Feedback Management Microservice with ArangoDB + +This README provides setup guides and all the necessary information about the Feedback Management microservice with ArangoDB database. 
+ +--- + +## Setup Environment Variables + +```bash +export ARANGO_HOST=${ARANGO_HOST} +export ARANGO_PORT=${ARANGO_PORT} +export ARANGO_USERNAME=${ARANGO_USERNAME} +export ARANGO_PASSWORD=${ARANGO_PASSWORD} +export DB_NAME=${DB_NAME} +export COLLECTION_NAME=${COLLECTION_NAME} +export PROTOCOL=${PROTOCOL} +export PYTHONPATH={Path to base of directory} +``` + +--- + +## 🚀Start Microservice with Docker + +### Build Docker Image + +```bash +cd ~/GenAIComps +docker build -t opea/feedbackmanagement-arango-server:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/feedback_management/arango/Dockerfile . +``` + +### Run Docker with CLI + +- Run ArangoDB image container + + ```bash + docker run -d -p 8529:8529 --name=arango arangodb/arangodb:latest + ``` + +- Run Feedback Management microservice + + ```bash + docker run -d -p 6016:6016 \ + --name="feedbackmanagement-arango-server" \ + -e http_proxy=$http_proxy \ + -e https_proxy=$https_proxy \ + -e no_proxy=$no_proxy \ + -e ARANGO_HOST=${ARANGO_HOST} \ + -e ARANGO_PORT=${ARANGO_PORT} \ + -e ARANGO_USERNAME=${ARANGO_USERNAME} \ + -e ARANGO_PASSWORD=${ARANGO_PASSWORD} \ + -e DB_NAME=${DB_NAME} \ + -e PROTOCOL=${PROTOCOL} \ + -e COLLECTION_NAME=${COLLECTION_NAME} \ + opea/feedbackmanagement-arango-server:latest + + ``` + +--- + +### ✅ Invoke Microservice + +The Feedback Management microservice exposes the following API endpoints: + +- Save feedback data + + ```bash + curl -X 'POST' \ + http://${host_ip}:6016/v1/feedback/create \ + -H 'accept: application/json' \ + -H 'Content-Type: application/json' \ + -d '{ + "chat_id": "66445d4f71c7eff23d44f78d", + "chat_data": { + "user": "test", + "messages": [ + { + "role": "system", + "content": "You are helpful assistant" + }, + { + "role": "user", + "content": "hi", + "time": "1724915247" + }, + { + "role": "assistant", + "content": "Hi, may I help you?", + "time": "1724915249" + } + ] + }, + "feedback_data": { + "comment": "Moderate", + "rating": 
3, + "is_thumbs_up": true + }}' + + + # Take note that chat_id here would be the id get from feedback_arango service + # If you do not wish to maintain chat history via feedback_arango service, you may generate some random uuid for it or just leave it empty. + ``` + +- Update feedback data by feedback_id + + ```bash + curl -X 'POST' \ + http://${host_ip}:6016/v1/feedback/create \ + -H 'accept: application/json' \ + -H 'Content-Type: application/json' \ + -d '{ + "chat_id": "66445d4f71c7eff23d44f78d", + "chat_data": { + "user": "test", + "messages": [ + { + "role": "system", + "content": "You are helpful assistant" + }, + { + "role": "user", + "content": "hi", + "time": "1724915247" + }, + { + "role": "assistant", + "content": "Hi, may I help you?", + "time": "1724915249" + } + ] + }, + "feedback_data": { + "comment": "Fair and Moderate answer", + "rating": 2, + "is_thumbs_up": true + }, + "feedback_id": "{feedback_id of the data that wanted to update}"}' + + # Just include any feedback_data field value that you wanted to update. 
+ ``` + +- Retrieve feedback data by user + + ```bash + curl -X 'POST' \ + http://${host_ip}:6016/v1/feedback/get \ + -H 'accept: application/json' \ + -H 'Content-Type: application/json' \ + -d '{ + "user": "test"}' + ``` + +- Retrieve feedback data by feedback_id + + ```bash + curl -X 'POST' \ + http://${host_ip}:6016/v1/feedback/get \ + -H 'accept: application/json' \ + -H 'Content-Type: application/json' \ + -d '{ + "user": "test", "feedback_id":"{feedback_id returned from save feedback route above}"}' + ``` + +- Delete feedback data by feedback_id + + ```bash + curl -X 'POST' \ + http://${host_ip}:6016/v1/feedback/delete \ + -H 'accept: application/json' \ + -H 'Content-Type: application/json' \ + -d '{ + "user": "test", "feedback_id":"{feedback_id to be deleted}"}' + ``` diff --git a/comps/feedback_management/arango/arango_conn.py b/comps/feedback_management/arango/arango_conn.py new file mode 100644 index 000000000..f9ac9e411 --- /dev/null +++ b/comps/feedback_management/arango/arango_conn.py @@ -0,0 +1,32 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +from arango import ArangoClient as PythonArangoClient +from arango.database import StandardDatabase +from config import ARANGO_HOST, ARANGO_PASSWORD, ARANGO_PORT, ARANGO_USERNAME, DB_NAME, PROTOCOL + + +class ArangoClient: + conn_url = f"{PROTOCOL}://{ARANGO_HOST}:{ARANGO_PORT}/" + + @staticmethod + def get_db_client() -> StandardDatabase: + try: + # Create client + client = PythonArangoClient(hosts=ArangoClient.conn_url) + + # First connect to _system database + sys_db = client.db("_system", username=ARANGO_USERNAME, password=ARANGO_PASSWORD, verify=True) + + # Create target database if it doesn't exist + if not sys_db.has_database(DB_NAME): + sys_db.create_database(DB_NAME) + + # Now connect to the target database + db = client.db(DB_NAME, username=ARANGO_USERNAME, password=ARANGO_PASSWORD, verify=True) + + return db + + except Exception as e: + print(e) + raise e diff 
--git a/comps/feedback_management/arango/arango_store.py b/comps/feedback_management/arango/arango_store.py new file mode 100644 index 000000000..cd22b8078 --- /dev/null +++ b/comps/feedback_management/arango/arango_store.py @@ -0,0 +1,186 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +from arango_conn import ArangoClient +from config import COLLECTION_NAME +from pydantic import BaseModel + + +class FeedbackStore: + + def __init__( + self, + user: str, + ): + self.user = user + + def initialize_storage(self) -> None: + self.db_client = ArangoClient.get_db_client() + + if not self.db_client.has_collection(COLLECTION_NAME): + self.db_client.create_collection(COLLECTION_NAME) + + self.collection = self.db_client.collection(COLLECTION_NAME) + + def save_feedback(self, feedback_data: BaseModel) -> str: + """Stores a new feedback data into the storage. + + Args: + feedback_data (object): The document to be stored. + + Returns: + str: The ID of the inserted feedback data. + + Raises: + Exception: If an error occurs while storing the feedback_data. + """ + try: + model_dump = feedback_data.model_dump(by_alias=True, mode="json", exclude={"feedback_id"}) + + inserted_feedback_data = self.collection.insert(model_dump) + + feedback_id = str(inserted_feedback_data["_key"]) + + return feedback_id + + except Exception as e: + print(e) + raise Exception(e) + + def update_feedback(self, feedback_data: BaseModel) -> bool: + """Update a feedback data in the collection with given id. + + Args: + feedback_id (str): The ID of the data to be updated. + updated_data (object): The data to be updated in the entry. + + Returns: + bool: True if the data updated successfully, False otherwise. + + Raises: + KeyError: If the document with ID is not found. + Exception: If the user does not match with the document user. + Exception: If an error occurs while updating the feedback data. 
+ """ + _key = feedback_data.feedback_id + document = self.collection.get(_key) + + if document is None: + raise KeyError(f"Document with ID: {_key} not found.") + + if document["chat_data"]["user"] != self.user: + raise Exception(f"User mismatch. Document with ID: {_key} does not belong to user: {self.user}") + + try: + model_dump = feedback_data.feedback_data.model_dump(by_alias=True, mode="json") + + self.collection.update( + {"_key": _key, "feedback_data": model_dump}, + merge=True, + keep_none=False, + ) + + print(f"Updated document: {_key} !") + + return True + + except Exception as e: + print("Not able to update the data.") + print(e) + raise Exception(e) + + def get_all_feedback_of_user(self) -> list[dict]: + """Retrieves all feedback data of a user from the collection. + + Returns: + list[dict] | None: List of dict of feedback data of the user, None otherwise. + + Raises: + Exception: If there is an error while retrieving data. + """ + try: + feedback_data_list: list = [] + + # TODO: Clarify if we actually want to omit the `feedback_data` field. + # Implemented using MongoDB Feedback Management as a reference. + cursor = """ + FOR doc IN @@collection + FILTER doc.chat_data.user == @user + RETURN UNSET(doc, "feedback_data") + """ + + cursor = self.db_client.aql.execute( + cursor, bind_vars={"@collection": self.collection.name, "user": self.user} + ) + + for document in cursor: + document["feedback_id"] = str(document["_key"]) + del document["_id"] + del document["_key"] + del document["_rev"] + + feedback_data_list.append(document) + + return feedback_data_list + + except Exception as e: + print(e) + raise Exception(e) + + def get_feedback_by_id(self, feedback_id: str) -> dict | None: + """Retrieves a user feedback data from the collection based on the given feedback ID. + + Args: + feedback_id (str): The ID of the feedback data to retrieve. + + Returns: + dict | None: The user's feedback data if found, None otherwise. 
+ + Raises: + KeyError: If document with ID is not found. + Exception: If the user does not match with the document user. + """ + response = self.collection.get(feedback_id) + + if response is None: + raise KeyError(f"Feedback with ID: {feedback_id} not found.") + + if response["chat_data"]["user"] != self.user: + raise Exception(f"User mismatch. Feedback with ID: {feedback_id} does not belong to user: {self.user}") + + del response["_id"] + del response["_key"] + del response["_rev"] + + return response + + def delete_feedback(self, feedback_id: str) -> bool: + """Delete a feedback data from collection by given feedback_id. + + Args: + feedback_id(str): The ID of the feedback data to be deleted. + + Returns: + bool: True if feedback is successfully deleted, False otherwise. + + Raises: + KeyError: If the provided feedback_id is invalid: + Exception: If the user does not match with the document user. + Exception: If any errors occurs during delete process. + """ + response = self.collection.get(feedback_id) + + if response is None: + raise KeyError(f"Feedback with ID: {feedback_id} not found.") + + if response["chat_data"]["user"] != self.user: + raise Exception(f"User mismatch. 
Feedback with ID: {feedback_id} does not belong to user: {self.user}") + + try: + response = self.collection.delete(feedback_id) + print(f"Deleted document: {feedback_id} !") + + return True + except Exception as e: + print(e) + raise Exception("Not able to delete the data.") diff --git a/comps/feedback_management/arango/config.py b/comps/feedback_management/arango/config.py new file mode 100644 index 000000000..e3272febf --- /dev/null +++ b/comps/feedback_management/arango/config.py @@ -0,0 +1,13 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +import os + +# ARANGO configuration +ARANGO_HOST = os.getenv("ARANGO_HOST", "localhost") +ARANGO_PORT = os.getenv("ARANGO_PORT", 8529) +ARANGO_USERNAME = os.getenv("ARANGO_USERNAME", "root") +ARANGO_PASSWORD = os.getenv("ARANGO_PASSWORD", "test") +DB_NAME = os.getenv("DB_NAME", "OPEA") +COLLECTION_NAME = os.getenv("COLLECTION_NAME", "Feedback") +PROTOCOL = os.getenv("PROTOCOL", "http") diff --git a/comps/feedback_management/arango/docker-compose-user-feedback-arango.yaml b/comps/feedback_management/arango/docker-compose-user-feedback-arango.yaml new file mode 100644 index 000000000..f4be0c845 --- /dev/null +++ b/comps/feedback_management/arango/docker-compose-user-feedback-arango.yaml @@ -0,0 +1,38 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +version: "3" +services: + arangodb: + image: arangodb/arangodb:latest + container_name: arangodb + ports: + - 8529:8529 + environment: + http_proxy: ${http_proxy} + https_proxy: ${https_proxy} + no_proxy: ${no_proxy} + ARANGO_ROOT_PASSWORD: ${ARANGO_ROOT_PASSWORD} + + feedbackmanagement-arango: + image: opea/feedbackmanagement-arango:latest + container_name: feedbackmanagement-arango-server + ports: + - "6016:6016" + ipc: host + environment: + http_proxy: ${http_proxy} + https_proxy: ${https_proxy} + no_proxy: ${no_proxy} + ARANGO_HOST: ${ARANGO_HOST} + ARANGO_PORT: ${ARANGO_PORT} + ARANGO_USERNAME: 
${ARANGO_USERNAME} + ARANGO_PASSWORD: ${ARANGO_PASSWORD} + PROTOCOL: ${PROTOCOL} + DB_NAME: ${DB_NAME} + COLLECTION_NAME: ${COLLECTION_NAME} + restart: unless-stopped + +networks: + feedback_network: + driver: bridge diff --git a/comps/feedback_management/arango/feedback.py b/comps/feedback_management/arango/feedback.py new file mode 100644 index 000000000..f1efa6f43 --- /dev/null +++ b/comps/feedback_management/arango/feedback.py @@ -0,0 +1,172 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +import os +from typing import Annotated, Optional + +from arango_store import FeedbackStore +from fastapi import HTTPException +from pydantic import BaseModel, Field + +from comps import CustomLogger +from comps.cores.mega.micro_service import opea_microservices, register_microservice +from comps.cores.proto.api_protocol import ChatCompletionRequest + +logger = CustomLogger("feedback_arango") +logflag = os.getenv("LOGFLAG", False) + + +class FeedbackData(BaseModel): + """This class represents the data model of FeedbackData collected to store in database.". + + Attributes: + is_thumbs_up (bool): True if the response is satisfy, False otherwise. + rating: (int)[Optional]: Score rating. Range from 0 (bad rating) to 5(good rating). + comment (str)[Optional]: Comment given for response. + """ + + is_thumbs_up: bool + rating: Annotated[Optional[int], Field(ge=0, le=5)] = None + comment: Optional[str] = None + + +class ChatFeedback(BaseModel): + """This class represents the model for chat to collect FeedbackData together with ChatCompletionRequest data to store in database. + + Attributes: + chat_data (ChatCompletionRequest): ChatCompletionRequest object containing chat data to be stored. + feedback_data (FeedbackData): FeedbackData object containing feedback data for chat to be stored. + chat_id (str)[Optional]: The chat_id associated to the chat to be store together with feedback data. 
+ feedback_id (str)[Optional]: The feedback_id of feedback data to be retrieved from database. + """ + + chat_data: ChatCompletionRequest + feedback_data: FeedbackData + chat_id: Optional[str] = None + feedback_id: Optional[str] = None + + +class FeedbackId(BaseModel): + """This class represent the data model for retrieve feedback data stored in database. + + Attributes: + user (str): The user of the requested feedback data. + feedback_id (str): The feedback_id of feedback data to be retrieved from database. + """ + + user: str + feedback_id: Optional[str] = None + + +@register_microservice( + name="opea_service@feedback_arango", + endpoint="/v1/feedback/create", + host="0.0.0.0", + input_datatype=FeedbackData, + port=6016, +) +async def create_feedback_data(feedback: ChatFeedback): + """Creates and stores a feedback data in database. + + Args: + feedback (ChatFeedback): The ChatFeedback class object containing feedback data to be stored. + + Returns: + response (str/bool): FeedbackId of the object created in database. True if data update successfully. + """ + if logflag: + logger.info(feedback) + + try: + feedback_store = FeedbackStore(feedback.chat_data.user) + feedback_store.initialize_storage() + if feedback.feedback_id is None: + response = feedback_store.save_feedback(feedback) + else: + response = feedback_store.update_feedback(feedback) + + if logflag: + logger.info(response) + return response + + except Exception as e: + logger.info(f"An error occurred: {str(e)}") + raise HTTPException(status_code=500, detail=str(e)) + + +@register_microservice( + name="opea_service@feedback_arango", + endpoint="/v1/feedback/get", + host="0.0.0.0", + input_datatype=FeedbackId, + port=6016, +) +async def get_feedback(feedback: FeedbackId): + """Retrieves feedback_data from feedback store based on provided FeedbackId or user. + + Args: + feedback (FeedbackId): The FeedbackId object containing user and feedback_id or chat_id. 
+ + Returns: + JSON: Retrieved feedback data if successful, error otherwise. + """ + if logflag: + logger.info(feedback) + + try: + feedback_store = FeedbackStore(feedback.user) + feedback_store.initialize_storage() + if feedback.feedback_id: + response = feedback_store.get_feedback_by_id(feedback.feedback_id) + else: + response = feedback_store.get_all_feedback_of_user() + + if logflag: + logger.info(response) + + return response + + except Exception as e: + logger.info(f"An error occurred: {str(e)}") + raise HTTPException(status_code=500, detail=str(e)) + + +@register_microservice( + name="opea_service@feedback_arango", + endpoint="/v1/feedback/delete", + host="0.0.0.0", + input_datatype=FeedbackId, + port=6016, +) +async def delete_feedback(feedback: FeedbackId): + """Delete a feedback data from feedback store by given feedback Id. + + Args: + feedback (FeedbackId): The FeedbackId object containing user and feedback_id or chat_id + + Returns: + Result of deletion if successful, None otherwise. 
+ """ + if logflag: + logger.info(feedback) + + try: + feedback_store = FeedbackStore(feedback.user) + feedback_store.initialize_storage() + if feedback.feedback_id is None: + raise Exception("feedback_id is required.") + else: + response = feedback_store.delete_feedback(feedback.feedback_id) + + if logflag: + logger.info(response) + + return response + + except Exception as e: + logger.info(f"An error occurred: {str(e)}") + raise HTTPException(status_code=500, detail=str(e)) + + +if __name__ == "__main__": + opea_microservices["opea_service@feedback_arango"].start() diff --git a/comps/feedback_management/arango/requirements.txt b/comps/feedback_management/arango/requirements.txt new file mode 100644 index 000000000..9e5d0de8e --- /dev/null +++ b/comps/feedback_management/arango/requirements.txt @@ -0,0 +1 @@ +python-arango \ No newline at end of file From b6ded9fb924a5d3fbb1645114ddf874d409d7c57 Mon Sep 17 00:00:00 2001 From: Anthony Mahanna Date: Mon, 25 Nov 2024 19:05:21 -0500 Subject: [PATCH 03/25] remove: `PROTOCOL` env --- comps/feedback_management/arango/README.md | 2 -- comps/feedback_management/arango/arango_conn.py | 4 ++-- comps/feedback_management/arango/config.py | 3 +-- .../arango/docker-compose-user-feedback-arango.yaml | 3 +-- 4 files changed, 4 insertions(+), 8 deletions(-) diff --git a/comps/feedback_management/arango/README.md b/comps/feedback_management/arango/README.md index 8eb223ce9..e0f070b68 100644 --- a/comps/feedback_management/arango/README.md +++ b/comps/feedback_management/arango/README.md @@ -13,7 +13,6 @@ export ARANGO_USERNAME=${ARANGO_USERNAME} export ARANGO_PASSWORD=${ARANGO_PASSWORD} export DB_NAME=${DB_NAME} export COLLECTION_NAME=${COLLECTION_NAME} -export PROTOCOL=${PROTOCOL} export PYTHONPATH={Path to base of directory} ``` @@ -49,7 +48,6 @@ docker build -t opea/feedbackmanagement-arango-server:latest --build-arg https_p -e ARANGO_USERNAME=${ARANGO_USERNAME} \ -e ARANGO_PASSWORD=${ARANGO_PASSWORD} \ -e DB_NAME=${DB_NAME} \ - 
-e PROTOCOL=${PROTOCOL} \ -e COLLECTION_NAME=${COLLECTION_NAME} \ opea/feedbackmanagement-arango-server:latest diff --git a/comps/feedback_management/arango/arango_conn.py b/comps/feedback_management/arango/arango_conn.py index f9ac9e411..84ded0428 100644 --- a/comps/feedback_management/arango/arango_conn.py +++ b/comps/feedback_management/arango/arango_conn.py @@ -3,11 +3,11 @@ from arango import ArangoClient as PythonArangoClient from arango.database import StandardDatabase -from config import ARANGO_HOST, ARANGO_PASSWORD, ARANGO_PORT, ARANGO_USERNAME, DB_NAME, PROTOCOL +from config import ARANGO_HOST, ARANGO_PASSWORD, ARANGO_PORT, ARANGO_USERNAME, DB_NAME class ArangoClient: - conn_url = f"{PROTOCOL}://{ARANGO_HOST}:{ARANGO_PORT}/" + conn_url = f"arangodb://{ARANGO_HOST}:{ARANGO_PORT}/" @staticmethod def get_db_client() -> StandardDatabase: diff --git a/comps/feedback_management/arango/config.py b/comps/feedback_management/arango/config.py index e3272febf..c332de7e5 100644 --- a/comps/feedback_management/arango/config.py +++ b/comps/feedback_management/arango/config.py @@ -9,5 +9,4 @@ ARANGO_USERNAME = os.getenv("ARANGO_USERNAME", "root") ARANGO_PASSWORD = os.getenv("ARANGO_PASSWORD", "test") DB_NAME = os.getenv("DB_NAME", "OPEA") -COLLECTION_NAME = os.getenv("COLLECTION_NAME", "Feedback") -PROTOCOL = os.getenv("PROTOCOL", "http") +COLLECTION_NAME = os.getenv("COLLECTION_NAME", "Feedback") \ No newline at end of file diff --git a/comps/feedback_management/arango/docker-compose-user-feedback-arango.yaml b/comps/feedback_management/arango/docker-compose-user-feedback-arango.yaml index f4be0c845..62ab0df54 100644 --- a/comps/feedback_management/arango/docker-compose-user-feedback-arango.yaml +++ b/comps/feedback_management/arango/docker-compose-user-feedback-arango.yaml @@ -28,11 +28,10 @@ services: ARANGO_PORT: ${ARANGO_PORT} ARANGO_USERNAME: ${ARANGO_USERNAME} ARANGO_PASSWORD: ${ARANGO_PASSWORD} - PROTOCOL: ${PROTOCOL} DB_NAME: ${DB_NAME} COLLECTION_NAME: 
${COLLECTION_NAME} restart: unless-stopped networks: - feedback_network: + default: driver: bridge From 17467499a6fd22afcdb4d058984af6e8735cdae2 Mon Sep 17 00:00:00 2001 From: SLasyaN Date: Tue, 26 Nov 2024 13:47:41 -0800 Subject: [PATCH 04/25] ArangoDB: PromptRegistry (#8) * Initial commit * remove unnecessary files * code review * update: `prompt_search` * new: `ARANGO_PROTOCOL` * README * cleanup --------- Co-authored-by: lasyasn Co-authored-by: Anthony Mahanna --- comps/feedback_management/README.md | 4 + comps/feedback_management/arango/README.md | 4 + .../feedback_management/arango/arango_conn.py | 4 +- comps/feedback_management/arango/config.py | 3 +- .../docker-compose-user-feedback-arango.yaml | 3 +- comps/prompt_registry/README.md | 4 + comps/prompt_registry/arango/DockerFile | 30 +++ comps/prompt_registry/arango/README.md | 120 ++++++++++ comps/prompt_registry/arango/arango_conn.py | 32 +++ comps/prompt_registry/arango/arango_store.py | 213 ++++++++++++++++++ comps/prompt_registry/arango/config.py | 13 ++ ...docker-compose-prompt-registry-arango.yaml | 38 ++++ comps/prompt_registry/arango/prompt.py | 148 ++++++++++++ comps/prompt_registry/arango/requirements.txt | 1 + 14 files changed, 613 insertions(+), 4 deletions(-) create mode 100644 comps/prompt_registry/arango/DockerFile create mode 100644 comps/prompt_registry/arango/README.md create mode 100644 comps/prompt_registry/arango/arango_conn.py create mode 100644 comps/prompt_registry/arango/arango_store.py create mode 100644 comps/prompt_registry/arango/config.py create mode 100644 comps/prompt_registry/arango/docker-compose-prompt-registry-arango.yaml create mode 100644 comps/prompt_registry/arango/prompt.py create mode 100644 comps/prompt_registry/arango/requirements.txt diff --git a/comps/feedback_management/README.md b/comps/feedback_management/README.md index 2e68aa413..9cd4b42a5 100644 --- a/comps/feedback_management/README.md +++ b/comps/feedback_management/README.md @@ -20,3 +20,7 @@ The 
Feedback Management microservice able to support various database backends f ### Feedback Management with MongoDB For more detail, please refer to this [README](./mongo/README.md) + +### Feedback Management with ArangoDB + +For more detail, please refer to this [README](./arango/README.md) diff --git a/comps/feedback_management/arango/README.md b/comps/feedback_management/arango/README.md index e0f070b68..7e9a5f840 100644 --- a/comps/feedback_management/arango/README.md +++ b/comps/feedback_management/arango/README.md @@ -6,9 +6,12 @@ This README provides setup guides and all the necessary information about the Fe ## Setup Environment Variables +See `config.py` for default values. + ```bash export ARANGO_HOST=${ARANGO_HOST} export ARANGO_PORT=${ARANGO_PORT} +export ARANGO_PROTOCOL=${ARANGO_PROTOCOL} export ARANGO_USERNAME=${ARANGO_USERNAME} export ARANGO_PASSWORD=${ARANGO_PASSWORD} export DB_NAME=${DB_NAME} @@ -45,6 +48,7 @@ docker build -t opea/feedbackmanagement-arango-server:latest --build-arg https_p -e no_proxy=$no_proxy \ -e ARANGO_HOST=${ARANGO_HOST} \ -e ARANGO_PORT=${ARANGO_PORT} \ + -e ARANGO_PROTOCOL=${ARANGO_PROTOCOL} \ -e ARANGO_USERNAME=${ARANGO_USERNAME} \ -e ARANGO_PASSWORD=${ARANGO_PASSWORD} \ -e DB_NAME=${DB_NAME} \ diff --git a/comps/feedback_management/arango/arango_conn.py b/comps/feedback_management/arango/arango_conn.py index 84ded0428..d6c4b5977 100644 --- a/comps/feedback_management/arango/arango_conn.py +++ b/comps/feedback_management/arango/arango_conn.py @@ -3,11 +3,11 @@ from arango import ArangoClient as PythonArangoClient from arango.database import StandardDatabase -from config import ARANGO_HOST, ARANGO_PASSWORD, ARANGO_PORT, ARANGO_USERNAME, DB_NAME +from config import ARANGO_HOST, ARANGO_PASSWORD, ARANGO_PORT, ARANGO_PROTOCOL, ARANGO_USERNAME, DB_NAME class ArangoClient: - conn_url = f"arangodb://{ARANGO_HOST}:{ARANGO_PORT}/" + conn_url = f"{ARANGO_PROTOCOL}://{ARANGO_HOST}:{ARANGO_PORT}/" @staticmethod def get_db_client() -> 
StandardDatabase: diff --git a/comps/feedback_management/arango/config.py b/comps/feedback_management/arango/config.py index c332de7e5..bb790eb38 100644 --- a/comps/feedback_management/arango/config.py +++ b/comps/feedback_management/arango/config.py @@ -6,7 +6,8 @@ # ARANGO configuration ARANGO_HOST = os.getenv("ARANGO_HOST", "localhost") ARANGO_PORT = os.getenv("ARANGO_PORT", 8529) +ARANGO_PROTOCOL = os.getenv("ARANGO_PROTOCOL", "http") ARANGO_USERNAME = os.getenv("ARANGO_USERNAME", "root") ARANGO_PASSWORD = os.getenv("ARANGO_PASSWORD", "test") DB_NAME = os.getenv("DB_NAME", "OPEA") -COLLECTION_NAME = os.getenv("COLLECTION_NAME", "Feedback") \ No newline at end of file +COLLECTION_NAME = os.getenv("COLLECTION_NAME", "Feedback") diff --git a/comps/feedback_management/arango/docker-compose-user-feedback-arango.yaml b/comps/feedback_management/arango/docker-compose-user-feedback-arango.yaml index 62ab0df54..8f9b3a85a 100644 --- a/comps/feedback_management/arango/docker-compose-user-feedback-arango.yaml +++ b/comps/feedback_management/arango/docker-compose-user-feedback-arango.yaml @@ -3,7 +3,7 @@ version: "3" services: - arangodb: + arango: image: arangodb/arangodb:latest container_name: arangodb ports: @@ -26,6 +26,7 @@ services: no_proxy: ${no_proxy} ARANGO_HOST: ${ARANGO_HOST} ARANGO_PORT: ${ARANGO_PORT} + ARANGO_PROTOCOL: ${ARANGO_PROTOCOL} ARANGO_USERNAME: ${ARANGO_USERNAME} ARANGO_PASSWORD: ${ARANGO_PASSWORD} DB_NAME: ${DB_NAME} diff --git a/comps/prompt_registry/README.md b/comps/prompt_registry/README.md index 6332a1a13..a99b1b27b 100644 --- a/comps/prompt_registry/README.md +++ b/comps/prompt_registry/README.md @@ -19,3 +19,7 @@ The Prompt Registry microservice able to support various database backends for s ### Prompt Registry with MongoDB For more detail, please refer to this [README](./mongo/README.md) + +### Prompt Registry with ArangoDB + +For more detail, please refer to this [README](./arango/README.md) diff --git 
a/comps/prompt_registry/arango/DockerFile b/comps/prompt_registry/arango/DockerFile new file mode 100644 index 000000000..065920205 --- /dev/null +++ b/comps/prompt_registry/arango/DockerFile @@ -0,0 +1,30 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +FROM python:3.11-slim + +ENV LANG=C.UTF-8 + +RUN apt-get update -y && apt-get install -y --no-install-recommends --fix-missing \ + build-essential \ + libgl1-mesa-glx \ + libjemalloc-dev + +RUN useradd -m -s /bin/bash user && \ + mkdir -p /home/user && \ + chown -R user /home/user/ + +USER user + +COPY comps /home/user/comps +COPY requirements.txt /home/user/ + +RUN pip install --no-cache-dir --upgrade pip setuptools && \ + pip install --no-cache-dir -r /home/user/comps/prompt_registry/arango/requirements.txt && \ + pip install --no-cache-dir -r /home/user/requirements.txt + +ENV PYTHONPATH=$PYTHONPATH:/home/user + +WORKDIR /home/user/comps/prompt_registry/arango + +ENTRYPOINT ["python", "prompt.py"] diff --git a/comps/prompt_registry/arango/README.md b/comps/prompt_registry/arango/README.md new file mode 100644 index 000000000..e4bdd6c10 --- /dev/null +++ b/comps/prompt_registry/arango/README.md @@ -0,0 +1,120 @@ +# 🧾 Prompt Registry Microservice with ArangoDB + +This README provides setup guides and all the necessary information about the Prompt Registry microservice with ArangoDB database. + +--- + +## Setup Environment Variables + +See `config.py` for default values. 
+ +```bash +export ARANGO_HOST=${ARANGO_HOST} +export ARANGO_PORT=${ARANGO_PORT} +export ARANGO_PROTOCOL=${ARANGO_PROTOCOL} +export ARANGO_USERNAME=${ARANGO_USERNAME} +export ARANGO_PASSWORD=${ARANGO_PASSWORD} +export DB_NAME=${DB_NAME} +export COLLECTION_NAME=${COLLECTION_NAME} +``` + +--- + +## 🚀Start Microservice with Docker + +### Build Docker Image + +```bash +cd ~/GenAIComps +docker build -t opea/promptregistry-arango-server:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/prompt_registry/arango/Dockerfile . +``` + +### Run Docker with CLI + + +- Run ArangoDB image container + + ```bash + docker run -d -p 8529:8529 --name=arango arangodb/arangodb:latest + ``` + +- Run Prompt Registry microservice + + ```bash + docker run -d -p 6018:6018 \ + --name="promptregistry-arango-server" \ + -e http_proxy=$http_proxy \ + -e https_proxy=$https_proxy \ + -e no_proxy=$no_proxy \ + -e ARANGO_HOST=${ARANGO_HOST} \ + -e ARANGO_PORT=${ARANGO_PORT} \ + -e ARANGO_PROTOCOL=${ARANGO_PROTOCOL} \ + -e ARANGO_USERNAME=${ARANGO_USERNAME} \ + -e ARANGO_PASSWORD=${ARANGO_PASSWORD} \ + -e DB_NAME=${DB_NAME} \ + -e COLLECTION_NAME=${COLLECTION_NAME} \ + opea/promptregistry-arango-server:latest + + ``` + +--- + +### ✅ Invoke Microservice + +The Prompt Registry microservice exposes the following API endpoints: + +- Save prompt + + ```bash + curl -X 'POST' \ + http://${host_ip}:6018/v1/prompt/create \ + -H 'accept: application/json' \ + -H 'Content-Type: application/json' \ + -d '{ + "prompt_text": "test prompt", "user": "test" + }' + ``` + +- Retrieve prompt from database by user + + ```bash + curl -X 'POST' \ + http://${host_ip}:6018/v1/prompt/get \ + -H 'accept: application/json' \ + -H 'Content-Type: application/json' \ + -d '{ + "user": "test"}' + ``` + +- Retrieve prompt from database by prompt_id + + ```bash + curl -X 'POST' \ + http://${host_ip}:6018/v1/prompt/get \ + -H 'accept: application/json' \ + -H 'Content-Type: application/json' \ + -d 
'{ + "user": "test", "prompt_id":"{_id returned from save prompt route above}"}' + ``` + +- Retrieve relevant prompt by keyword + + ```bash + curl -X 'POST' \ + http://${host_ip}:6018/v1/prompt/get \ + -H 'accept: application/json' \ + -H 'Content-Type: application/json' \ + -d '{ + "user": "test", "prompt_text": "{keyword to search}"}' + ``` + +- Delete prompt by prompt_id + + ```bash + curl -X 'POST' \ + http://${host_ip}:6018/v1/prompt/delete \ + -H 'accept: application/json' \ + -H 'Content-Type: application/json' \ + -d '{ + "user": "test", "prompt_id":"{prompt_id to be deleted}"}' + ``` diff --git a/comps/prompt_registry/arango/arango_conn.py b/comps/prompt_registry/arango/arango_conn.py new file mode 100644 index 000000000..d6c4b5977 --- /dev/null +++ b/comps/prompt_registry/arango/arango_conn.py @@ -0,0 +1,32 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +from arango import ArangoClient as PythonArangoClient +from arango.database import StandardDatabase +from config import ARANGO_HOST, ARANGO_PASSWORD, ARANGO_PORT, ARANGO_PROTOCOL, ARANGO_USERNAME, DB_NAME + + +class ArangoClient: + conn_url = f"{ARANGO_PROTOCOL}://{ARANGO_HOST}:{ARANGO_PORT}/" + + @staticmethod + def get_db_client() -> StandardDatabase: + try: + # Create client + client = PythonArangoClient(hosts=ArangoClient.conn_url) + + # First connect to _system database + sys_db = client.db("_system", username=ARANGO_USERNAME, password=ARANGO_PASSWORD, verify=True) + + # Create target database if it doesn't exist + if not sys_db.has_database(DB_NAME): + sys_db.create_database(DB_NAME) + + # Now connect to the target database + db = client.db(DB_NAME, username=ARANGO_USERNAME, password=ARANGO_PASSWORD, verify=True) + + return db + + except Exception as e: + print(e) + raise e diff --git a/comps/prompt_registry/arango/arango_store.py b/comps/prompt_registry/arango/arango_store.py new file mode 100644 index 000000000..fb80ccd20 --- /dev/null +++ 
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

import os

from arango.exceptions import IndexGetError
from arango_conn import ArangoClient
from config import COLLECTION_NAME
from prompt import PromptCreate
from pydantic import BaseModel

from comps import CustomLogger

logger = CustomLogger("arango_store")
logflag = os.getenv("LOGFLAG", False)


class PromptStore:
    """ArangoDB-backed persistence for user prompts (collection: COLLECTION_NAME)."""

    def __init__(
        self,
        user: str,
    ):
        self.user = user
        # Lazily-verified flag so the inverted index lookup/creation for
        # prompt_search happens at most once per store instance.
        self.inverted_index_exists = False

    def initialize_storage(self) -> None:
        """Connect to the database and ensure the prompt collection exists."""
        self.db_client = ArangoClient.get_db_client()

        if not self.db_client.has_collection(COLLECTION_NAME):
            self.db_client.create_collection(COLLECTION_NAME)

        self.collection = self.db_client.collection(COLLECTION_NAME)

    def save_prompt(self, prompt: PromptCreate):
        """Stores a new prompt into the storage.

        Args:
            prompt: The document to be stored. It should be a Pydantic model.

        Returns:
            str: The ID (ArangoDB `_key`) of the inserted prompt.

        Raises:
            Exception: If an error occurs while storing the prompt.
        """
        try:
            # `id` is excluded: ArangoDB generates the `_key` on insert.
            model_dump = prompt.model_dump(by_alias=True, mode="json", exclude={"id"})

            inserted_prompt_data = self.collection.insert(model_dump)

            return str(inserted_prompt_data["_key"])

        except Exception as e:
            print(e)
            raise Exception(e)

    def get_all_prompt_of_user(self) -> list[dict]:
        """Retrieves all prompts of a user from the collection.

        Returns:
            list[dict]: List of prompt documents belonging to self.user.

        Raises:
            Exception: If there is an error while retrieving data.
        """
        try:
            prompt_data_list: list = []

            # BUG FIX: prompt documents store `user` at the top level (see
            # PromptCreate: prompt_text + user). The previous filter
            # `doc.chat_data.user` was copy-pasted from chat history and
            # matched no prompt document at all.
            query = """
                FOR doc IN @@collection
                    FILTER doc.user == @user
                    RETURN UNSET(doc, "data")
            """

            cursor = self.db_client.aql.execute(
                query, bind_vars={"@collection": self.collection.name, "user": self.user}
            )

            for document in cursor:
                # Expose the ArangoDB `_key` as `id`; strip internal fields.
                document["id"] = str(document["_key"])
                del document["_id"]
                del document["_key"]
                del document["_rev"]

                prompt_data_list.append(document)

            return prompt_data_list

        except Exception as e:
            print(e)
            raise Exception(e)

    def get_user_prompt_by_id(self, prompt_id: str) -> dict | None:
        """Retrieves a user prompt from the collection based on the given prompt ID.

        Args:
            prompt_id (str): The ID of the prompt to retrieve.

        Returns:
            dict | None: The user prompt if found, None otherwise.

        Raises:
            KeyError: If document with ID is not found.
            Exception: If the user does not match with the document user.
        """
        response = self.collection.get(prompt_id)

        if response is None:
            raise KeyError(f"Prompt with ID: {prompt_id} not found.")

        if response["user"] != self.user:
            raise Exception(f"User mismatch. Prompt with ID: {prompt_id} does not belong to user: {self.user}")

        del response["_id"]
        del response["_key"]
        del response["_rev"]

        return response

    def prompt_search(self, keyword: str) -> list | None:
        """Retrieves prompts from the collection matching the given keyword.

        Uses an inverted index on `prompt_text` (created on first use) with a
        forced index hint so PHRASE() is evaluated through the index.

        Args:
            keyword (str): The keyword of prompt to search for.

        Returns:
            list | None: The list of relevant prompts if found, None otherwise.

        Raises:
            Exception: If there is an error while searching data.
        """
        try:
            index_name = "prompt_text_index"

            if not self.inverted_index_exists:
                try:
                    self.collection.get_index(index_name)

                except IndexGetError:
                    self.collection.add_inverted_index(
                        fields=["prompt_text"],
                        name=index_name,
                        # TODO: add more kwargs if needed
                    )

                self.inverted_index_exists = True

            query = """
                FOR doc IN @@collection
                    OPTIONS { indexHint: @index_name, forceIndexHint: true }
                    FILTER PHRASE(doc.prompt_text, @keyword, "text_en")
                    RETURN doc
            """

            cursor = self.db_client.aql.execute(
                query,
                bind_vars={
                    "@collection": self.collection.name,
                    "index_name": index_name,
                    "keyword": keyword,
                },
            )

            serialized_data = []
            for doc in cursor:
                doc["id"] = str(doc["_key"])
                del doc["_id"]
                del doc["_key"]
                del doc["_rev"]

                serialized_data.append(doc)

            return serialized_data

        except Exception as e:
            print(e)
            raise Exception(e)

    def delete_prompt(self, prompt_id: str) -> bool:
        """Delete a prompt from collection by given prompt_id.

        Args:
            prompt_id (str): The ID of the prompt to be deleted.

        Returns:
            bool: True if prompt is successfully deleted, False otherwise.

        Raises:
            KeyError: If the provided prompt_id is invalid.
            Exception: If the user does not match with the document user.
            Exception: If any error occurs during the delete process.
        """
        response = self.collection.get(prompt_id)

        # BUG FIX: the error messages said "Feedback" (copy-pasted from the
        # feedback-management store); this is the prompt registry.
        if response is None:
            raise KeyError(f"Prompt with ID: {prompt_id} not found.")

        if response["user"] != self.user:
            raise Exception(f"User mismatch. Prompt with ID: {prompt_id} does not belong to user: {self.user}")

        try:
            self.collection.delete(prompt_id)
            print(f"Deleted document: {prompt_id} !")

            return True
        except Exception as e:
            print(e)
            raise Exception("Not able to delete the data.")
Feedback with ID: {prompt_id} does not belong to user: {self.user}") + + try: + response = self.collection.delete(prompt_id) + print(f"Deleted document: {prompt_id} !") + + return True + except Exception as e: + print(e) + raise Exception("Not able to delete the data.") diff --git a/comps/prompt_registry/arango/config.py b/comps/prompt_registry/arango/config.py new file mode 100644 index 000000000..9719f1358 --- /dev/null +++ b/comps/prompt_registry/arango/config.py @@ -0,0 +1,13 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +import os + +# ARANGO configuration +ARANGO_HOST = os.getenv("ARANGODB_HOST", "localhost") +ARANGO_PORT = os.getenv("ARANGO_PORT", 8529) +ARANGO_PROTOCOL = os.getenv("ARANGO_PROTOCOL", "http") +ARANGO_USERNAME = os.getenv("ARANGO_USERNAME", "root") +ARANGO_PASSWORD = os.getenv("ARANGO_PASSWORD", "test") +DB_NAME = os.getenv("DB_NAME", "OPEA") +COLLECTION_NAME = os.getenv("COLLECTION_NAME", "Prompt") diff --git a/comps/prompt_registry/arango/docker-compose-prompt-registry-arango.yaml b/comps/prompt_registry/arango/docker-compose-prompt-registry-arango.yaml new file mode 100644 index 000000000..b1aee077d --- /dev/null +++ b/comps/prompt_registry/arango/docker-compose-prompt-registry-arango.yaml @@ -0,0 +1,38 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +version: "3" +services: + arango: + image: arangodb/arangodb:latest + container_name: arangodb + ports: + - 8529:8529 + environment: + http_proxy: ${http_proxy} + https_proxy: ${https_proxy} + no_proxy: ${no_proxy} + ARANGO_ROOT_PASSWORD: ${ARANGO_ROOT_PASSWORD} + + promptregistry-arango: + image: opea/promptregistry-arango:latest + container_name: promptregistry-arango-server + ports: + - "6018:6018" + ipc: host + environment: + http_proxy: ${http_proxy} + https_proxy: ${https_proxy} + no_proxy: ${no_proxy} + ARANGO_HOST: ${ARANGO_HOST} + ARANGO_PORT: ${ARANGO_PORT} + ARANGO_PROTOCOL: ${ARANGO_PROTOCOL} + 
ARANGO_USERNAME: ${ARANGO_USERNAME} + ARANGO_PASSWORD: ${ARANGO_PASSWORD} + DB_NAME: ${DB_NAME} + COLLECTION_NAME: ${COLLECTION_NAME} + restart: unless-stopped + +networks: + default: + driver: bridge diff --git a/comps/prompt_registry/arango/prompt.py b/comps/prompt_registry/arango/prompt.py new file mode 100644 index 000000000..c46e0174c --- /dev/null +++ b/comps/prompt_registry/arango/prompt.py @@ -0,0 +1,148 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +import os +from typing import Optional + +from arango_store import PromptStore +from pydantic import BaseModel + +from comps import CustomLogger +from comps.cores.mega.micro_service import opea_microservices, register_microservice + +logger = CustomLogger("prompt_arango") +logflag = os.getenv("LOGFLAG", False) + + +class PromptCreate(BaseModel): + """This class represents the data model for creating and storing a new prompt in the database. + + Attributes: + prompt_text (str): The text content of the prompt. + user (str): The user or creator of the prompt. + """ + + prompt_text: str + user: str + + +class PromptId(BaseModel): + """This class represent the data model for retrieve prompt stored in database. + + Attributes: + user (str): The user of the requested prompt. + prompt_id (str): The prompt_id of prompt to be retrieved from database. + """ + + user: str + prompt_id: Optional[str] = None + prompt_text: Optional[str] = None + + +@register_microservice( + name="opea_service@prompt_arango", + endpoint="/v1/prompt/create", + host="0.0.0.0", + input_datatype=PromptCreate, + port=6018, +) +async def create_prompt(prompt: PromptCreate): + """Creates and stores a prompt in prompt store. + + Args: + prompt (PromptCreate): The PromptCreate class object containing the data to be stored. + + Returns: + JSON (PromptResponse): PromptResponse class object, None otherwise. 
+ """ + if logflag: + logger.info(prompt) + + try: + prompt_store = PromptStore(prompt.user) + prompt_store.initialize_storage() + response = prompt_store.save_prompt(prompt) + if logflag: + logger.info(response) + + return response + + except Exception as error: + logger.error(f"An error occurred: {str(error)}") + raise error + + +@register_microservice( + name="opea_service@prompt_arango", + endpoint="/v1/prompt/get", + host="0.0.0.0", + input_datatype=PromptId, + port=6018, +) +async def get_prompt(prompt: PromptId): + """Retrieves prompt from prompt store based on provided PromptId or user. + + Args: + prompt (PromptId): The PromptId object containing user and prompt_id. + + Returns: + JSON: Retrieved prompt data if successful, None otherwise. + """ + if logflag: + logger.info(prompt) + try: + + prompt_store = PromptStore(prompt.user) + prompt_store.initialize_storage() + + if prompt.prompt_id is not None: + response = prompt_store.get_user_prompt_by_id(prompt.prompt_id) + elif prompt.prompt_text: + response = prompt_store.prompt_search(prompt.prompt_text) + else: + response = prompt_store.get_all_prompt_of_user() + if logflag: + logger.info(response) + return response + + except Exception as error: + logger.error(f"An error occurred: {str(error)}") + raise error + + +@register_microservice( + name="opea_service@prompt_arango", + endpoint="/v1/prompt/delete", + host="0.0.0.0", + input_datatype=PromptId, + port=6018, +) +async def delete_prompt(prompt: PromptId): + """Delete a prompt from prompt store by given PromptId. + + Args: + prompt (PromptId): The PromptId object containing user and prompt_id. + + Returns: + Result of deletion if successful, None otherwise. 
+ """ + if logflag: + logger.info(prompt) + try: + prompt_store = PromptStore(prompt.user) + prompt_store.initialize_storage() + if prompt.prompt_id is None: + raise Exception("Prompt id is required.") + else: + response = prompt_store.delete_prompt(prompt.prompt_id) + if logflag: + logger.info(response) + return response + + except Exception as error: + logger.error(f"An error occurred: {str(error)}") + raise error + + +if __name__ == "__main__": + opea_microservices["opea_service@prompt_arango"].start() diff --git a/comps/prompt_registry/arango/requirements.txt b/comps/prompt_registry/arango/requirements.txt new file mode 100644 index 000000000..9e5d0de8e --- /dev/null +++ b/comps/prompt_registry/arango/requirements.txt @@ -0,0 +1 @@ +python-arango \ No newline at end of file From bf413276134fd8ca942049239caa071b75660876 Mon Sep 17 00:00:00 2001 From: Ajay Kallepalli <72517322+ajaykallepalli@users.noreply.github.com> Date: Tue, 26 Nov 2024 14:59:41 -0800 Subject: [PATCH 05/25] ArangoDB: Chathistory (#10) * Initial chat history implementation without API and docker implementation * make copy and remove async * API functionality matching MongoDB implementation Working API functionality, update to dockerfile required, and additional checks when updating document required. 
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

# Container image for the Chat History microservice backed by ArangoDB.

FROM python:3.11-slim

ENV LANG=C.UTF-8

# Build toolchain for wheels without prebuilt binaries, jemalloc, GL runtime.
RUN apt-get update -y && apt-get install -y --no-install-recommends --fix-missing \
    build-essential \
    libjemalloc-dev \
    libgl1-mesa-glx

# Run as an unprivileged user.
RUN useradd -m -s /bin/bash user && \
    mkdir -p /home/user && \
    chown -R user /home/user/

USER user

COPY comps /home/user/comps
COPY requirements.txt /home/user/

# Service-specific deps first, then the repo-wide requirements.
RUN pip install --no-cache-dir --upgrade pip setuptools && \
    pip install --no-cache-dir -r /home/user/comps/chathistory/arango/requirements.txt && \
    pip install --no-cache-dir -r /home/user/requirements.txt

ENV PYTHONPATH=/home/user

# BUG FIX: WORKDIR pointed at the MongoDB variant (comps/chathistory/mongo),
# so the ENTRYPOINT launched the Mongo chat.py instead of the Arango one.
WORKDIR /home/user/comps/chathistory/arango

ENTRYPOINT ["python", "chat.py"]
+``` + +### Run Docker with CLI + +- Run ArangoDB image container + + ```bash + docker run -d -p 8529:8529 --name=arango arangodb/arangodb:latest + ``` + +- Run the Chat History microservice + + ```bash + docker run -p 6012:6012 \ + -e http_proxy=$http_proxy \ + -e https_proxy=$https_proxy \ + -e no_proxy=$no_proxy \ + -e ARANGO_HOST=${ARANGO_HOST} \ + -e ARANGO_PORT=${ARANGO_PORT} \ + -e ARANGO_PROTOCOL=${ARANGO_PROTOCOL} \ + -e ARANGO_USERNAME=${ARANGO_USERNAME} \ + -e ARANGO_PASSWORD=${ARANGO_PASSWORD} \ + -e DB_NAME=${DB_NAME} \ + -e COLLECTION_NAME=${COLLECTION_NAME} \ + opea/chathistory-arango-server:latest + ``` + +--- + +## ✅ Invoke Microservice + +The Chat History microservice exposes the following API endpoints: + +- Create new chat conversation + + ```bash + curl -X 'POST' \ + http://${host_ip}:6012/v1/chathistory/create \ + -H 'accept: application/json' \ + -H 'Content-Type: application/json' \ + -d '{ + "data": { + "messages": "test Messages", "user": "test" + } + }' + ``` + +- Get all the Conversations for a user + + ```bash + curl -X 'POST' \ + http://${host_ip}:6012/v1/chathistory/get \ + -H 'accept: application/json' \ + -H 'Content-Type: application/json' \ + -d '{ + "user": "test"}' + ``` + +- Get a specific conversation by id. + + ```bash + curl -X 'POST' \ + http://${host_ip}:6012/v1/chathistory/get \ + -H 'accept: application/json' \ + -H 'Content-Type: application/json' \ + -d '{ + "user": "test", "id":"48918"}' + ``` + +- Update the conversation by id. + + ```bash + curl -X 'POST' \ + http://${host_ip}:6012/v1/chathistory/create \ + -H 'accept: application/json' \ + -H 'Content-Type: application/json' \ + -d '{ + "data": { + "messages": "test Messages Update", "user": "test" + }, + "id":"48918" + }' + ``` + +- Delete a stored conversation. 
+ + ```bash + curl -X 'POST' \ + http://${host_ip}:6012/v1/chathistory/delete \ + -H 'accept: application/json' \ + -H 'Content-Type: application/json' \ + -d '{ + "user": "test", "id":"48918"}' + ``` diff --git a/comps/chathistory/arango/arango_conn.py b/comps/chathistory/arango/arango_conn.py new file mode 100644 index 000000000..d6c4b5977 --- /dev/null +++ b/comps/chathistory/arango/arango_conn.py @@ -0,0 +1,32 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +from arango import ArangoClient as PythonArangoClient +from arango.database import StandardDatabase +from config import ARANGO_HOST, ARANGO_PASSWORD, ARANGO_PORT, ARANGO_PROTOCOL, ARANGO_USERNAME, DB_NAME + + +class ArangoClient: + conn_url = f"{ARANGO_PROTOCOL}://{ARANGO_HOST}:{ARANGO_PORT}/" + + @staticmethod + def get_db_client() -> StandardDatabase: + try: + # Create client + client = PythonArangoClient(hosts=ArangoClient.conn_url) + + # First connect to _system database + sys_db = client.db("_system", username=ARANGO_USERNAME, password=ARANGO_PASSWORD, verify=True) + + # Create target database if it doesn't exist + if not sys_db.has_database(DB_NAME): + sys_db.create_database(DB_NAME) + + # Now connect to the target database + db = client.db(DB_NAME, username=ARANGO_USERNAME, password=ARANGO_PASSWORD, verify=True) + + return db + + except Exception as e: + print(e) + raise e diff --git a/comps/chathistory/arango/arango_store.py b/comps/chathistory/arango/arango_store.py new file mode 100644 index 000000000..8ab6928eb --- /dev/null +++ b/comps/chathistory/arango/arango_store.py @@ -0,0 +1,186 @@ +from typing import Any + +from arango_conn import ArangoClient +from config import COLLECTION_NAME +from pydantic import BaseModel + + +class DocumentStore: + + def __init__( + self, + user: str, + ): + self.user = user + + def initialize_storage(self) -> None: + self.db_client = ArangoClient.get_db_client() + + if not self.db_client.has_collection(COLLECTION_NAME): + 
self.db_client.create_collection(COLLECTION_NAME) + + self.collection = self.db_client.collection(COLLECTION_NAME) + + def save_document(self, document: BaseModel) -> str: + """Stores a new document into the storage. + + Args: + document: The document to be stored. It should be a Pydantic model. + + Returns: + str: The ID of the inserted document. + + Raises: + Exception: If an error occurs while storing the document. + """ + try: + model_dump = document.model_dump(by_alias=True, mode="json", exclude={"id"}) + + inserted_document = self.collection.insert(model_dump) + + document_id = str(inserted_document["_key"]) + + return document_id + + except Exception as e: + print(e) + raise Exception(e) + + def update_document(self, document_id: str, updated_data: BaseModel, first_query: Any) -> str: + """Updates a document in the collection with the given document_id. + + Args: + document_id (str): The ID of the document to update. + updated_data (object): The updated data to be set in the document. + first_query (object): The first query to be set in the document. + + Returns: + bool: True if the document was successfully updated, False otherwise. + + Raises: + KeyError: If the document with ID is not found. + Exception: If the user does not match with the document user. + Exception: If an error occurs while updating the document data. 
+ """ + document = self.collection.get(document_id) + + if document is None: + raise Exception(f"Unable to find Document {document_id}") + + if document["data"]["user"] != self.user: + raise Exception(f"User {self.user} is not allowed to update Document {document_id}.") + + try: + self.collection.update( + { + "_key": document_id, + "data": updated_data.model_dump(by_alias=True, mode="json"), + "first_query": first_query, + }, + merge=True, + keep_none=True, + ) + + print(f"Updated document: {document_id} !") + + return True + + except Exception as e: + print("Not able to update the data.") + print(e) + raise Exception(e) + + def get_all_documents_of_user(self) -> list[dict]: + """Retrieves all documents of a specific user from the collection. + + Returns: + A list of dictionaries representing the conversation documents. + Raises: + Exception: If there is an error while retrieving the documents. + """ + try: + document_list: list = [] + + # TODO: Clarify if we actually want to omit the `data` field. + # Implemented using MongoDB Feedback Management as a reference. + cursor = """ + FOR doc IN @@collection + FILTER doc.data.user == @user + RETURN UNSET(doc, "data") + """ + + cursor = self.db_client.aql.execute( + cursor, bind_vars={"@collection": self.collection.name, "user": self.user} + ) + + for document in cursor: + document["id"] = str(document["_key"]) + del document["_id"] + del document["_key"] + del document["_rev"] + + document_list.append(document) + + return document_list + + except Exception as e: + print(e) + raise Exception(e) + + def get_user_documents_by_id(self, document_id: str) -> dict | None: + """Retrieves a user document from the collection based on the given document ID. + + Args: + document_id (str): The ID of the document to retrieve. + + Returns: + dict | None: The user document if found, None otherwise. + + Raises: + KeyError: If document with ID is not found. + Exception: If the user does not match with the document user. 
+ """ + response = self.collection.get(document_id) + + if response is None: + raise KeyError(f"Document with ID: {document_id} not found.") + + if response["data"]["user"] != self.user: + raise Exception(f"User mismatch. Document with ID: {document_id} does not belong to user: {self.user}") + + del response["_id"] + del response["_key"] + del response["_rev"] + + return response + + def delete_document(self, document_id: str) -> str: + """Deletes a document from the collection based on the provided document ID. + + Args: + document_id (str): The ID of the document to be deleted. + + Returns: + bool: True if the document is successfully deleted, False otherwise. + + Raises: + KeyError: If the provided document_id is invalid: + Exception: If the user does not match with the document user. + Exception: If any errors occurs during delete process. + """ + response = self.collection.get(document_id) + + if response is None: + raise KeyError(f"Document with ID: {document_id} not found.") + + if response["data"]["user"] != self.user: + raise Exception(f"User mismatch. 
Feedback with ID: {document_id} does not belong to user: {self.user}") + + try: + response = self.collection.delete(document_id) + print(f"Deleted document: {document_id} !") + + return True + except Exception as e: + print(e) + raise Exception("Not able to delete the data.") diff --git a/comps/chathistory/arango/chat.py b/comps/chathistory/arango/chat.py new file mode 100644 index 000000000..ce9c0a16e --- /dev/null +++ b/comps/chathistory/arango/chat.py @@ -0,0 +1,146 @@ +import os +from typing import Optional + +from arango_store import DocumentStore +from fastapi import HTTPException +from pydantic import BaseModel + +from comps import CustomLogger +from comps.cores.mega.micro_service import opea_microservices, register_microservice +from comps.cores.proto.api_protocol import ChatCompletionRequest + +logger = CustomLogger("chathistory_arango") +logflag = os.getenv("LOGFLAG", False) + + +class ChatMessage(BaseModel): + data: ChatCompletionRequest + first_query: Optional[str] = None + id: Optional[str] = None + + +class ChatId(BaseModel): + user: str + id: Optional[str] = None + + +def get_first_string(value): + if isinstance(value, str): + return value + elif isinstance(value, list): + # Assuming we want the first string from the first dictionary + if value and isinstance(value[0], dict): + first_dict = value[0] + if first_dict: + # Get the first value from the dictionary + first_key = next(iter(first_dict)) + return first_dict[first_key] + + +@register_microservice( + name="opea_service@chathistory_arango", + endpoint="/v1/chathistory/create", + host="0.0.0.0", + input_datatype=ChatMessage, + port=6012, +) +async def create_documents(document: ChatMessage): + """Creates or updates a document in the document store. + + Args: + document (ChatMessage): The ChatMessage object containing the data to be stored. + + Returns: + The result of the operation if successful, None otherwise. 
+ """ + if logflag: + logger.info(document) + try: + if document.data.user is None: + raise HTTPException(status_code=500, detail="Please provide the user information") + store = DocumentStore(document.data.user) + store.initialize_storage() + if document.first_query is None: + document.first_query = get_first_string(document.data.messages) + if document.id: + res = store.update_document(document.id, document.data, document.first_query) + else: + res = store.save_document(document) + if logflag: + logger.info(res) + return res + except Exception as e: + logger.info(f"An error occurred: {str(e)}") + raise HTTPException(status_code=500, detail=str(e)) + + +@register_microservice( + name="opea_service@chathistory_arango", + endpoint="/v1/chathistory/get", + host="0.0.0.0", + input_datatype=ChatId, + port=6012, +) +async def get_documents(document: ChatId): + """Retrieves documents from the document store based on the provided ChatId. + + Args: + document (ChatId): The ChatId object containing the user and optional document id. + + Returns: + The retrieved documents if successful, None otherwise. + """ + if logflag: + logger.info(document) + try: + store = DocumentStore(document.user) + store.initialize_storage() + if document.id is None: + res = store.get_all_documents_of_user() + else: + res = store.get_user_documents_by_id(document.id) + if logflag: + logger.info(res) + return res + except Exception as e: + # Handle the exception here + logger.info(f"An error occurred: {str(e)}") + raise HTTPException(status_code=500, detail=str(e)) + + +@register_microservice( + name="opea_service@chathistory_arango", + endpoint="/v1/chathistory/delete", + host="0.0.0.0", + input_datatype=ChatId, + port=6012, +) +async def delete_documents(document: ChatId): + """Deletes a document from the document store based on the provided ChatId. + + Args: + document (ChatId): The ChatId object containing the user and document id. 
+ + Returns: + The result of the deletion if successful, None otherwise. + """ + if logflag: + logger.info(document) + try: + store = DocumentStore(document.user) + store.initialize_storage() + if document.id is None: + raise Exception("Document id is required.") + else: + res = store.delete_document(document.id) + if logflag: + logger.info(res) + return res + except Exception as e: + # Handle the exception here + logger.info(f"An error occurred: {str(e)}") + raise HTTPException(status_code=500, detail=str(e)) + + +if __name__ == "__main__": + opea_microservices["opea_service@chathistory_arango"].start() diff --git a/comps/chathistory/arango/config.py b/comps/chathistory/arango/config.py new file mode 100644 index 000000000..9e66e8f1d --- /dev/null +++ b/comps/chathistory/arango/config.py @@ -0,0 +1,13 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +import os + +# ARANGO configuration +ARANGO_HOST = os.getenv("ARANGO_HOST", "localhost") +ARANGO_PORT = os.getenv("ARANGO_PORT", 8529) +ARANGO_PROTOCOL = os.getenv("ARANGO_PROTOCOL", "http") +ARANGO_USERNAME = os.getenv("ARANGO_USERNAME", "root") +ARANGO_PASSWORD = os.getenv("ARANGO_PASSWORD", "test") +DB_NAME = os.getenv("DB_NAME", "OPEA") +COLLECTION_NAME = os.getenv("COLLECTION_NAME", "ChatHistory") diff --git a/comps/chathistory/arango/docker-compose-chathistory-arango.yaml b/comps/chathistory/arango/docker-compose-chathistory-arango.yaml new file mode 100644 index 000000000..36819c99b --- /dev/null +++ b/comps/chathistory/arango/docker-compose-chathistory-arango.yaml @@ -0,0 +1,38 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +version: "3" +services: + arango: + image: arangodb/arangodb:latest + container_name: arangodb + ports: + - 8529:8529 + environment: + http_proxy: ${http_proxy} + https_proxy: ${https_proxy} + no_proxy: ${no_proxy} + ARANGO_ROOT_PASSWORD: ${ARANGO_ROOT_PASSWORD} + + chathistory-arango: + image: 
opea/chathistory-arango:latest + container_name: chathistory-arango-server + ports: + - "6012:6012" + ipc: host + environment: + http_proxy: ${http_proxy} + https_proxy: ${https_proxy} + no_proxy: ${no_proxy} + ARANGO_HOST: ${ARANGO_HOST} + ARANGO_PORT: ${ARANGO_PORT} + ARANGO_PROTOCOL: ${ARANGO_PROTOCOL} + ARANGO_USERNAME: ${ARANGO_USERNAME} + ARANGO_PASSWORD: ${ARANGO_PASSWORD} + DB_NAME: ${DB_NAME} + COLLECTION_NAME: ${COLLECTION_NAME} + restart: unless-stopped + +networks: + default: + driver: bridge diff --git a/comps/chathistory/arango/requirements.txt b/comps/chathistory/arango/requirements.txt new file mode 100644 index 000000000..9e5d0de8e --- /dev/null +++ b/comps/chathistory/arango/requirements.txt @@ -0,0 +1 @@ +python-arango \ No newline at end of file diff --git a/comps/prompt_registry/arango/config.py b/comps/prompt_registry/arango/config.py index 9719f1358..e597df0fb 100644 --- a/comps/prompt_registry/arango/config.py +++ b/comps/prompt_registry/arango/config.py @@ -4,7 +4,7 @@ import os # ARANGO configuration -ARANGO_HOST = os.getenv("ARANGODB_HOST", "localhost") +ARANGO_HOST = os.getenv("ARANGO_HOST", "localhost") ARANGO_PORT = os.getenv("ARANGO_PORT", 8529) ARANGO_PROTOCOL = os.getenv("ARANGO_PROTOCOL", "http") ARANGO_USERNAME = os.getenv("ARANGO_USERNAME", "root") From 9d46b278e46b31da63e360e4a4d41a600f6faa47 Mon Sep 17 00:00:00 2001 From: Anthony Mahanna Date: Tue, 26 Nov 2024 18:01:17 -0500 Subject: [PATCH 06/25] update ChatHistory README --- comps/chathistory/README.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/comps/chathistory/README.md b/comps/chathistory/README.md index 4f7bcbf71..754fd0bd8 100644 --- a/comps/chathistory/README.md +++ b/comps/chathistory/README.md @@ -24,3 +24,7 @@ The Chat History microservice able to support various database backends for stor ### Chat History with MongoDB For more detail, please refer to this [README](./mongo/README.md) + +### Chat History with ArangoDB + +For more detail, please refer to 
this [README](./arango/README.md) From 029c1fdfe44b85acdcb9d06a6a13020744504d69 Mon Sep 17 00:00:00 2001 From: Anthony Mahanna Date: Tue, 26 Nov 2024 18:09:20 -0500 Subject: [PATCH 07/25] new: tests --- tests/chathistory/test_chathistory_arango.sh | 91 ++++++++++++++ .../test_feedback_management_arango.sh | 113 ++++++++++++++++++ .../test_prompt_registry_arango.sh | 89 ++++++++++++++ 3 files changed, 293 insertions(+) create mode 100644 tests/chathistory/test_chathistory_arango.sh create mode 100644 tests/feedback_management/test_feedback_management_arango.sh create mode 100644 tests/prompt_registry/test_prompt_registry_arango.sh diff --git a/tests/chathistory/test_chathistory_arango.sh b/tests/chathistory/test_chathistory_arango.sh new file mode 100644 index 000000000..50481262f --- /dev/null +++ b/tests/chathistory/test_chathistory_arango.sh @@ -0,0 +1,91 @@ +#!/bin/bash +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +set -x + +WORKPATH=$(dirname "$PWD") +ip_address=$(hostname -I | awk '{print $1}') + +export ARANGO_HOST=${ip_address} +export ARANGO_PORT=8529 +export ARANGO_PROTOCOL=${ARANGO_PROTOCOL:-"http"} +export ARANGO_USERNAME=${ARANGO_USERNAME:-"root"} +export ARANGO_PASSWORD=${ARANGO_PASSWORD:-"test"} +export DB_NAME=${DB_NAME:-"Conversations"} +export COLLECTION_NAME=${COLLECTION_NAME:-"test"} + +function build_docker_images() { + cd $WORKPATH + echo $(pwd) + docker run -d -p 8529:8529 --name=test-comps-arango arangodb/arangodb:latest + + docker build --no-cache -t opea/chathistory-arango-server:comps --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/chathistory/arango/Dockerfile . + if [ $? 
-ne 0 ]; then + echo "opea/chathistory-arango-server built fail" + exit 1 + else + echo "opea/chathistory-arango-server built successful" + fi +} + +function start_service() { + + docker run -d --name="test-comps-chathistory-arango-server" \ + -p 6012:6012 \ + -e http_proxy=$http_proxy \ + -e https_proxy=$https_proxy \ + -e no_proxy=$no_proxy \ + -e ARANGO_HOST=${ARANGO_HOST} \ + -e ARANGO_PORT=${ARANGO_PORT} \ + -e ARANGO_PROTOCOL=${ARANGO_PROTOCOL} \ + -e ARANGO_USERNAME=${ARANGO_USERNAME} \ + -e ARANGO_PASSWORD=${ARANGO_PASSWORD} \ + -e DB_NAME=${DB_NAME} \ + -e COLLECTION_NAME=${COLLECTION_NAME} \ + opea/chathistory-arango-server:comps + + sleep 10s +} + +function validate_microservice() { + result=$(curl -X 'POST' \ + http://${ip_address}:6012/v1/chathistory/create \ + -H 'accept: application/json' \ + -H 'Content-Type: application/json' \ + -d '{ + "data": { + "messages": "test Messages", "user": "test" + } +}') + echo $result + if [[ ${#result} -eq 26 ]]; then + echo "Result correct." + else + echo "Result wrong." + docker logs test-comps-chathistory-arango-server + exit 1 + fi + +} + +function stop_docker() { + cid=$(docker ps -aq --filter "name=test-comps*") + if [[ ! 
-z "$cid" ]]; then docker stop $cid && docker rm $cid && sleep 1s; fi +} + +function main() { + + stop_docker + + build_docker_images + start_service + + validate_microservice + + stop_docker + echo y | docker system prune + +} + +main diff --git a/tests/feedback_management/test_feedback_management_arango.sh b/tests/feedback_management/test_feedback_management_arango.sh new file mode 100644 index 000000000..925555030 --- /dev/null +++ b/tests/feedback_management/test_feedback_management_arango.sh @@ -0,0 +1,113 @@ +#!/bin/bash +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +set -xe + +WORKPATH=$(dirname "$PWD") +ip_address=$(hostname -I | awk '{print $1}') + +export ARANGO_HOST=${ip_address} +export ARANGO_PORT=8529 +export ARANGO_PROTOCOL=${ARANGO_PROTOCOL:-"http"} +export ARANGO_USERNAME=${ARANGO_USERNAME:-"root"} +export ARANGO_PASSWORD=${ARANGO_PASSWORD:-"test"} +export DB_NAME=${DB_NAME:-"Feedback"} +export COLLECTION_NAME=${COLLECTION_NAME:-"test"} + +function build_docker_images() { + cd $WORKPATH + echo $(pwd) + docker run -d -p 8529:8529 --name=test-comps-arango arangodb/arangodb:latest + + docker build --no-cache -t opea/feedbackmanagement-arango-server:comps --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/feedback_management/arango/Dockerfile . + if [ $? 
-ne 0 ]; then + echo "opea/feedbackmanagement-arango-server built fail" + exit 1 + else + echo "opea/feedbackmanagement-arango-server built successful" + fi +} + +function start_service() { + + docker run -d --name="test-comps-feedbackmanagement-arango-server" \ + -p 6016:6016 \ + -e http_proxy=$http_proxy \ + -e https_proxy=$https_proxy \ + -e no_proxy=$no_proxy \ + -e ARANGO_HOST=${ARANGO_HOST} \ + -e ARANGO_PORT=${ARANGO_PORT} \ + -e ARANGO_PROTOCOL=${ARANGO_PROTOCOL} \ + -e ARANGO_USERNAME=${ARANGO_USERNAME} \ + -e ARANGO_PASSWORD=${ARANGO_PASSWORD} \ + -e DB_NAME=${DB_NAME} \ + -e COLLECTION_NAME=${COLLECTION_NAME} \ + opea/feedbackmanagement-arango-server:comps + + sleep 10s +} + +function validate_microservice() { + result=$(curl -X 'POST' \ + http://$ip_address:6016/v1/feedback/create \ + -H 'accept: application/json' \ + -H 'Content-Type: application/json' \ + -d '{ + "chat_id": "66445d4f71c7eff23d44f78d", + "chat_data": { + "user": "test", + "messages": [ + { + "role": "system", + "content": "You are helpful assistant" + }, + { + "role": "user", + "content": "hi", + "time": "1724915247" + }, + { + "role": "assistant", + "content": "Hi, may I help you?", + "time": "1724915249" + } + ] + }, + "feedback_data": { + "comment": "Moderate", + "rating": 3, + "is_thumbs_up": true + } +}') + echo $result + if [[ ${#result} -eq 26 ]]; then + echo "Correct result." + else + echo "Incorrect result." + docker logs test-comps-feedbackmanagement-arango-server + exit 1 + fi + +} + +function stop_docker() { + cid=$(docker ps -aq --filter "name=test-comps*") + if [[ ! 
-z "$cid" ]]; then docker stop $cid && docker rm $cid && sleep 1s; fi +} + +function main() { + + stop_docker + + build_docker_images + start_service + + validate_microservice + + stop_docker + echo y | docker system prune + +} + +main diff --git a/tests/prompt_registry/test_prompt_registry_arango.sh b/tests/prompt_registry/test_prompt_registry_arango.sh new file mode 100644 index 000000000..abc15ee7f --- /dev/null +++ b/tests/prompt_registry/test_prompt_registry_arango.sh @@ -0,0 +1,89 @@ +#!/bin/bash +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +set -x + +WORKPATH=$(dirname "$PWD") +ip_address=$(hostname -I | awk '{print $1}') + +export ARANGO_HOST=${ip_address} +export ARANGO_PORT=8529 +export ARANGO_PROTOCOL=${ARANGO_PROTOCOL:-"http"} +export ARANGO_USERNAME=${ARANGO_USERNAME:-"root"} +export ARANGO_PASSWORD=${ARANGO_PASSWORD:-"test"} +export DB_NAME=${DB_NAME:-"Prompts"} +export COLLECTION_NAME=${COLLECTION_NAME:-"test"} + +function build_docker_images() { + cd $WORKPATH + echo $(pwd) + docker run -d -p 8529:8529 --name=test-comps-arango arangodb/arangodb:latest + + docker build --no-cache -t opea/promptregistry-arango-server:comps --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/prompt_registry/arango/Dockerfile . + if [ $? 
-ne 0 ]; then + echo "opea/promptregistry-arango-server built fail" + exit 1 + else + echo "opea/promptregistry-arango-server built successful" + fi +} + +function start_service() { + + docker run -d --name="test-comps-promptregistry-arango-server" \ + -p 6018:6018 \ + -e http_proxy=$http_proxy \ + -e https_proxy=$https_proxy \ + -e no_proxy=$no_proxy \ + -e ARANGO_HOST=${ARANGO_HOST} \ + -e ARANGO_PORT=${ARANGO_PORT} \ + -e ARANGO_PROTOCOL=${ARANGO_PROTOCOL} \ + -e ARANGO_USERNAME=${ARANGO_USERNAME} \ + -e ARANGO_PASSWORD=${ARANGO_PASSWORD} \ + -e DB_NAME=${DB_NAME} \ + -e COLLECTION_NAME=${COLLECTION_NAME} \ + opea/promptregistry-arango-server:comps + + sleep 10s +} + +function validate_microservice() { + result=$(curl -X 'POST' \ + http://$ip_address:6018/v1/prompt/create \ + -H 'accept: application/json' \ + -H 'Content-Type: application/json' \ + -d '{ + "prompt_text": "test prompt", "user": "test" +}') + echo $result + if [[ ${#result} -eq 26 ]]; then + echo "Correct result." + else + echo "Incorrect result." + docker logs test-comps-promptregistry-arango-server + exit 1 + fi + +} + +function stop_docker() { + cid=$(docker ps -aq --filter "name=test-comps*") + if [[ ! 
-z "$cid" ]]; then docker stop $cid && docker rm $cid && sleep 1s; fi +} + +function main() { + + stop_docker + + build_docker_images + start_service + + validate_microservice + + stop_docker + echo y | docker system prune + +} + +main From 3a8060710d3bce0dd4a55bd1582a342bbc332822 Mon Sep 17 00:00:00 2001 From: Anthony Mahanna Date: Tue, 26 Nov 2024 18:16:14 -0500 Subject: [PATCH 08/25] update: docker compose workflows --- .github/workflows/docker/compose/chathistory-compose.yaml | 4 ++++ .../docker/compose/feedback_management-compose.yaml | 8 ++++++-- .../workflows/docker/compose/prompt_registry-compose.yaml | 4 ++++ 3 files changed, 14 insertions(+), 2 deletions(-) diff --git a/.github/workflows/docker/compose/chathistory-compose.yaml b/.github/workflows/docker/compose/chathistory-compose.yaml index 987447fee..64dc579fc 100644 --- a/.github/workflows/docker/compose/chathistory-compose.yaml +++ b/.github/workflows/docker/compose/chathistory-compose.yaml @@ -7,3 +7,7 @@ services: build: dockerfile: comps/chathistory/mongo/Dockerfile image: ${REGISTRY:-opea}/chathistory-mongo-server:${TAG:-latest} + chathistory-arango-server: + build: + dockerfile: comps/chathistory/arango/Dockerfile + image: ${REGISTRY:-opea}/chathistory-arango-server:${TAG:-latest} diff --git a/.github/workflows/docker/compose/feedback_management-compose.yaml b/.github/workflows/docker/compose/feedback_management-compose.yaml index 0a3cfce66..51f5ae343 100644 --- a/.github/workflows/docker/compose/feedback_management-compose.yaml +++ b/.github/workflows/docker/compose/feedback_management-compose.yaml @@ -3,7 +3,11 @@ # this file should be run in the root of the repo services: - feedbackmanagement: + feedbackmanagement-mongo-server: build: dockerfile: comps/feedback_management/mongo/Dockerfile - image: ${REGISTRY:-opea}/feedbackmanagement:${TAG:-latest} + image: ${REGISTRY:-opea}/feedbackmanagement-mongo-server:${TAG:-latest} + feedbackmanagement-arango-server: + build: + dockerfile: 
comps/feedback_management/arango/Dockerfile + image: ${REGISTRY:-opea}/feedbackmanagement-arango-server:${TAG:-latest} \ No newline at end of file diff --git a/.github/workflows/docker/compose/prompt_registry-compose.yaml b/.github/workflows/docker/compose/prompt_registry-compose.yaml index 34d8973df..4415a18a9 100644 --- a/.github/workflows/docker/compose/prompt_registry-compose.yaml +++ b/.github/workflows/docker/compose/prompt_registry-compose.yaml @@ -7,3 +7,7 @@ services: build: dockerfile: comps/prompt_registry/mongo/Dockerfile image: ${REGISTRY:-opea}/promptregistry-mongo-server:${TAG:-latest} + promptregistry-arango-server: + build: + dockerfile: comps/prompt_registry/arango/Dockerfile + image: ${REGISTRY:-opea}/promptregistry-arango-server:${TAG:-latest} From da290e96106d62587475528b010e1336de8582fb Mon Sep 17 00:00:00 2001 From: Anthony Mahanna Date: Tue, 26 Nov 2024 18:31:59 -0500 Subject: [PATCH 09/25] initial commit --- comps/dataprep/arango/__init__.py | 2 + comps/dataprep/arango/langchain/Dockerfile | 38 ++++ comps/dataprep/arango/langchain/README.md | 111 ++++++++++ comps/dataprep/arango/langchain/__init__.py | 2 + comps/dataprep/arango/langchain/config.py | 16 ++ .../docker-compose-dataprep-arango.yaml | 48 +++++ .../arango/langchain/prepare_doc_arango.py | 199 ++++++++++++++++++ .../arango/langchain/requirements.txt | 31 +++ 8 files changed, 447 insertions(+) create mode 100644 comps/dataprep/arango/__init__.py create mode 100644 comps/dataprep/arango/langchain/Dockerfile create mode 100644 comps/dataprep/arango/langchain/README.md create mode 100644 comps/dataprep/arango/langchain/__init__.py create mode 100644 comps/dataprep/arango/langchain/config.py create mode 100644 comps/dataprep/arango/langchain/docker-compose-dataprep-arango.yaml create mode 100644 comps/dataprep/arango/langchain/prepare_doc_arango.py create mode 100644 comps/dataprep/arango/langchain/requirements.txt diff --git a/comps/dataprep/arango/__init__.py 
b/comps/dataprep/arango/__init__.py new file mode 100644 index 000000000..916f3a44b --- /dev/null +++ b/comps/dataprep/arango/__init__.py @@ -0,0 +1,2 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 diff --git a/comps/dataprep/arango/langchain/Dockerfile b/comps/dataprep/arango/langchain/Dockerfile new file mode 100644 index 000000000..7bd07262a --- /dev/null +++ b/comps/dataprep/arango/langchain/Dockerfile @@ -0,0 +1,38 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +FROM python:3.11-slim + +ENV LANG=C.UTF-8 + +ARG ARCH="cpu" + +RUN apt-get update -y && apt-get install -y --no-install-recommends --fix-missing \ + build-essential \ + default-jre \ + libgl1-mesa-glx \ + libjemalloc-dev + +RUN useradd -m -s /bin/bash user && \ + mkdir -p /home/user && \ + chown -R user /home/user/ + +USER user + +COPY comps /home/user/comps + +RUN pip install --no-cache-dir --upgrade pip setuptools && \ + if [ ${ARCH} = "cpu" ]; then pip install --no-cache-dir torch torchvision --index-url https://download.pytorch.org/whl/cpu; fi && \ + pip install --no-cache-dir -r /home/user/comps/dataprep/arango/langchain/requirements.txt + +ENV PYTHONPATH=$PYTHONPATH:/home/user + +USER root + +RUN mkdir -p /home/user/comps/dataprep/arango/langchain/uploaded_files && chown -R user /home/user/comps/dataprep/arango/langchain/uploaded_files + +USER user + +WORKDIR /home/user/comps/dataprep/arango/langchain + +ENTRYPOINT ["python", "prepare_doc_arango.py"] diff --git a/comps/dataprep/arango/langchain/README.md b/comps/dataprep/arango/langchain/README.md new file mode 100644 index 000000000..35f3cdecd --- /dev/null +++ b/comps/dataprep/arango/langchain/README.md @@ -0,0 +1,111 @@ +# Dataprep Microservice with ArangoDB + +## 🚀Start Microservice with Python + +### Install Requirements + +```bash +pip install -r requirements.txt +apt-get install libtesseract-dev -y +apt-get install poppler-utils -y +``` + +### Start ArangoDB Server + 
+To launch ArangoDB locally, first ensure you have docker installed. Then, you can launch the database with the following docker command.
+
+```bash
+docker run -d --name arangodb -p 8529:8529 -e ARANGO_ROOT_PASSWORD=password arangodb/arangodb:latest
+```
+
+### Setup Environment Variables
+
+```bash
+export no_proxy=${your_no_proxy}
+export http_proxy=${your_http_proxy}
+export https_proxy=${your_https_proxy}
+export ARANGO_URI=${your_arango_url}
+export ARANGO_USERNAME=${your_arango_username}
+export ARANGO_PASSWORD=${your_arango_password}
+export DB_NAME=${your_db_name}
+export PYTHONPATH=${path_to_comps}
+```
+
+### Start Document Preparation Microservice for ArangoDB with Python Script
+
+Start the document preparation microservice for ArangoDB with the command below.
+
+```bash
+python prepare_doc_arango.py
+```
+
+## 🚀Start Microservice with Docker
+
+### Build Docker Image
+
+```bash
+cd ../../../../
+docker build -t opea/dataprep-arango:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/dataprep/arango/langchain/Dockerfile .
+```
+
+### Run Docker with CLI
+
+```bash
+docker run -d --name="dataprep-arango-server" -p 6007:6007 --ipc=host -e http_proxy=$http_proxy -e https_proxy=$https_proxy opea/dataprep-arango:latest
+```
+
+### Setup Environment Variables
+
+```bash
+export no_proxy=${your_no_proxy}
+export http_proxy=${your_http_proxy}
+export https_proxy=${your_https_proxy}
+export ARANGO_URI=${your_arango_url}
+export ARANGO_USERNAME=${your_arango_username}
+export ARANGO_PASSWORD=${your_arango_password}
+export DB_NAME=${your_db_name}
+```
+
+### Run Docker with Docker Compose
+
+```bash
+cd comps/dataprep/arango/langchain
+docker compose -f docker-compose-dataprep-arango.yaml up -d
+```
+
+## Invoke Microservice
+
+Once the document preparation microservice for ArangoDB is started, you can use the command below to invoke the microservice, which converts the document to embeddings and saves them to the database.
+
+```bash
+curl -X POST \
+  -H "Content-Type: multipart/form-data" \
+  -F "files=@./file1.txt" \
+  http://localhost:6007/v1/dataprep
+```
+
+You can specify chunk_size and chunk_overlap by the following commands.
+
+```bash
+curl -X POST \
+  -H "Content-Type: multipart/form-data" \
+  -F "files=@./file1.txt" \
+  -F "chunk_size=1500" \
+  -F "chunk_overlap=100" \
+  http://localhost:6007/v1/dataprep
+```
+
+We support table extraction from PDF documents. You can specify process_table and table_strategy by the following commands. "table_strategy" refers to the strategies to understand tables for table retrieval. As the setting progresses from "fast" to "hq" to "llm," the focus shifts towards deeper table understanding at the expense of processing speed. The default strategy is "fast".
+
+Note: If you specify "table_strategy=llm", you should first start the TGI Service, please refer to 1.2.1, 1.3.1 in https://github.com/opea-project/GenAIComps/tree/main/comps/llms/README.md, and then `export TGI_LLM_ENDPOINT="http://${your_ip}:8008"`.
+
+To ensure the quality and comprehensiveness of the extracted entities, we recommend using `gpt-4o` as the default model for parsing the document. To enable the OpenAI service, please `export OPENAI_KEY=xxxx` before using this service.
+ +```bash +curl -X POST \ + -H "Content-Type: multipart/form-data" \ + -F "files=@./your_file.pdf" \ + -F "process_table=true" \ + -F "table_strategy=hq" \ + http://localhost:6007/v1/dataprep +``` diff --git a/comps/dataprep/arango/langchain/__init__.py b/comps/dataprep/arango/langchain/__init__.py new file mode 100644 index 000000000..916f3a44b --- /dev/null +++ b/comps/dataprep/arango/langchain/__init__.py @@ -0,0 +1,2 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 diff --git a/comps/dataprep/arango/langchain/config.py b/comps/dataprep/arango/langchain/config.py new file mode 100644 index 000000000..8a3cbc501 --- /dev/null +++ b/comps/dataprep/arango/langchain/config.py @@ -0,0 +1,16 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +import os + +# ArangoDB configuration +ARANGO_URL = os.getenv("ARANGO_URI", "http://localhost:8529") +ARANGO_USERNAME = os.getenv("ARANGO_USERNAME", "root") +ARANGO_PASSWORD = os.getenv("NEO4J_PASSWORD", "test") +DB_NAME = os.getenv("DB_NAME", "_system") + +# LLM/Embedding endpoints +TGI_LLM_ENDPOINT = os.getenv("TGI_LLM_ENDPOINT", "http://localhost:8080") +TGI_LLM_ENDPOINT_NO_RAG = os.getenv("TGI_LLM_ENDPOINT_NO_RAG", "http://localhost:8081") +TEI_EMBEDDING_ENDPOINT = os.getenv("TEI_ENDPOINT") +OPENAI_KEY = os.getenv("OPENAI_API_KEY") diff --git a/comps/dataprep/arango/langchain/docker-compose-dataprep-arango.yaml b/comps/dataprep/arango/langchain/docker-compose-dataprep-arango.yaml new file mode 100644 index 000000000..970c7490e --- /dev/null +++ b/comps/dataprep/arango/langchain/docker-compose-dataprep-arango.yaml @@ -0,0 +1,48 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +version: "3" +services: + arango-vector-db: + image: arangodb/arangodb:latest + container_name: arango-graph-db + ports: + - "8529:8529" + tgi_gaudi_service: + image: ghcr.io/huggingface/tgi-gaudi:2.0.5 + container_name: tgi-service + ports: + - "8088:80" 
+ volumes: + - "./data:/data" + shm_size: 1g + environment: + no_proxy: ${no_proxy} + http_proxy: ${http_proxy} + https_proxy: ${https_proxy} + HF_TOKEN: ${HF_TOKEN} + command: --model-id ${LLM_MODEL_ID} --auto-truncate --max-input-tokens 1024 --max-total-tokens 2048 + dataprep-arango: + image: opea/gen-ai-comps:dataprep-arango-xeon-server + container_name: dataprep-arango-server + depends_on: + - arango-vector-db + - tgi_gaudi_service + ports: + - "6007:6007" + ipc: host + environment: + no_proxy: ${no_proxy} + http_proxy: ${http_proxy} + https_proxy: ${https_proxy} + ARANGO_URL: ${ARANGO_URL} + ARANGO_USERNAME: ${ARANGO_USERNAME} + ARANGO_PASSWORD: ${ARANGO_PASSWORD} + DB_NAME: ${DB_NAME} + TGI_LLM_ENDPOINT: ${TEI_ENDPOINT} + OPENAI_KEY: ${OPENAI_API_KEY} + restart: unless-stopped + +networks: + default: + driver: bridge diff --git a/comps/dataprep/arango/langchain/prepare_doc_arango.py b/comps/dataprep/arango/langchain/prepare_doc_arango.py new file mode 100644 index 000000000..240469e43 --- /dev/null +++ b/comps/dataprep/arango/langchain/prepare_doc_arango.py @@ -0,0 +1,199 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +import json +import os +from typing import List, Optional, Union + +import openai +from arango import ArangoClient +from config import ARANGO_PASSWORD, ARANGO_URL, ARANGO_USERNAME, DB_NAME, OPENAI_KEY, TGI_LLM_ENDPOINT +from fastapi import File, Form, HTTPException, UploadFile +from langchain.text_splitter import RecursiveCharacterTextSplitter +from langchain_community.graphs.arangodb_graph import ArangoGraph +from langchain_community.llms import HuggingFaceEndpoint +from langchain_core.documents import Document +from langchain_experimental.graph_transformers import LLMGraphTransformer +from langchain_openai import ChatOpenAI +from langchain_text_splitters import HTMLHeaderTextSplitter + +from comps import CustomLogger, DocPath, opea_microservices, register_microservice +from comps.dataprep.utils import ( + 
document_loader, + encode_filename, + get_separators, + get_tables_result, + parse_html, + save_content_to_local_disk, +) + +logger = CustomLogger("prepare_doc_arango") +logflag = os.getenv("LOGFLAG", False) + +upload_folder = "./uploaded_files/" + + +def ingest_data_to_arango(doc_path: DocPath): + """Ingest document to ArangoDB.""" + path = doc_path.path + if logflag: + logger.info(f"Parsing document {path}.") + + if path.endswith(".html"): + headers_to_split_on = [ + ("h1", "Header 1"), + ("h2", "Header 2"), + ("h3", "Header 3"), + ] + text_splitter = HTMLHeaderTextSplitter(headers_to_split_on=headers_to_split_on) + else: + text_splitter = RecursiveCharacterTextSplitter( + chunk_size=doc_path.chunk_size, + chunk_overlap=doc_path.chunk_overlap, + add_start_index=True, + separators=get_separators(), + ) + + content = document_loader(path) + + structured_types = [".xlsx", ".csv", ".json", "jsonl"] + _, ext = os.path.splitext(path) + + if ext in structured_types: + chunks = content + else: + chunks = text_splitter.split_text(content) + + if doc_path.process_table and path.endswith(".pdf"): + table_chunks = get_tables_result(path, doc_path.table_strategy) + chunks = chunks + table_chunks + if logflag: + logger.info("Done preprocessing. Created ", len(chunks), " chunks of the original file.") + + if OPENAI_KEY: + logger.info("OpenAI API Key is set. 
Verifying its validity...") + openai.api_key = OPENAI_KEY + + try: + response = openai.Engine.list() + logger.info("OpenAI API Key is valid.") + llm = ChatOpenAI(temperature=0, model_name="gpt-4o") + except openai.error.AuthenticationError: + logger.info("OpenAI API Key is invalid.") + except Exception as e: + logger.info(f"An error occurred while verifying the API Key: {e}") + else: + llm = HuggingFaceEndpoint( + endpoint_url=TGI_LLM_ENDPOINT, + max_new_tokens=512, + top_k=40, + top_p=0.9, + temperature=0.8, + timeout=600, + ) + + llm_transformer = LLMGraphTransformer( + llm=llm, node_properties=["description"], relationship_properties=["description"] + ) + + doc_list = [Document(page_content=text) for text in chunks] + graph_doc = llm_transformer.convert_to_graph_documents(doc_list) + + db = ArangoClient(hosts=ARANGO_URL).db( + name=DB_NAME, username=ARANGO_USERNAME, password=ARANGO_PASSWORD, verify=True + ) + graph = ArangoGraph(db=db, include_examples=True) + + graph.add_graph_documents( + graph_doc, + # baseEntityLabel=True, # <<< TODO: Figure out if this is required in ArangoGraph + include_source=True, # <<< TODO: Make this customizable for the user to set somehow + graph_name="NewGraph", # <<< TODO: Make this customizable for the user to set somehow + ) + + if logflag: + logger.info("The graph is built.") + + return True + + +@register_microservice( + name="opea_service@prepare_doc_arango", + endpoint="/v1/dataprep", + host="0.0.0.0", + port=6007, + input_datatype=DocPath, + output_datatype=None, +) +async def ingest_documents( + files: Optional[Union[UploadFile, List[UploadFile]]] = File(None), + link_list: Optional[str] = Form(None), + chunk_size: int = Form(1500), + chunk_overlap: int = Form(100), + process_table: bool = Form(False), + table_strategy: str = Form("fast"), +): + if logflag: + logger.info(f"files:{files}") + logger.info(f"link_list:{link_list}") + + if files: + if not isinstance(files, list): + files = [files] + uploaded_files = [] + for 
file in files: + encode_file = encode_filename(file.filename) + save_path = upload_folder + encode_file + await save_content_to_local_disk(save_path, file) + ingest_data_to_arango( + DocPath( + path=save_path, + chunk_size=chunk_size, + chunk_overlap=chunk_overlap, + process_table=process_table, + table_strategy=table_strategy, + ) + ) + uploaded_files.append(save_path) + if logflag: + logger.info(f"Successfully saved file {save_path}") + result = {"status": 200, "message": "Data preparation succeeded"} + if logflag: + logger.info(result) + return result + + if link_list: + link_list = json.loads(link_list) # Parse JSON string to list + if not isinstance(link_list, list): + raise HTTPException(status_code=400, detail="link_list should be a list.") + for link in link_list: + encoded_link = encode_filename(link) + save_path = upload_folder + encoded_link + ".txt" + content = parse_html([link])[0][0] + try: + await save_content_to_local_disk(save_path, content) + ingest_data_to_arango( + DocPath( + path=save_path, + chunk_size=chunk_size, + chunk_overlap=chunk_overlap, + process_table=process_table, + table_strategy=table_strategy, + ) + ) + except json.JSONDecodeError: + raise HTTPException(status_code=500, detail="Fail to ingest data into qdrant.") + + if logflag: + logger.info(f"Successfully saved link {link}") + + result = {"status": 200, "message": "Data preparation succeeded"} + if logflag: + logger.info(result) + return result + + raise HTTPException(status_code=400, detail="Must provide either a file or a string list.") + + +if __name__ == "__main__": + opea_microservices["opea_service@prepare_doc_arango"].start() diff --git a/comps/dataprep/arango/langchain/requirements.txt b/comps/dataprep/arango/langchain/requirements.txt new file mode 100644 index 000000000..921d1f6e0 --- /dev/null +++ b/comps/dataprep/arango/langchain/requirements.txt @@ -0,0 +1,31 @@ +beautifulsoup4 +cairosvg +docarray[full] +docx2txt +easyocr +fastapi +huggingface_hub +langchain 
+git+https://github.com/arangoml/langchain.git@arangodb#subdirectory=libs/community +langchain-experimental +langchain-openai +langchain-text-splitters +langchain_huggingface +markdown +python-arango +numpy +openai +opentelemetry-api +opentelemetry-exporter-otlp +opentelemetry-sdk +pandas +Pillow +prometheus-fastapi-instrumentator +pymupdf +pytesseract +python-docx +python-pptx +sentence_transformers +shortuuid +unstructured[all-docs]==0.15.7 +uvicorn From c5eadbb427320e94cfa3e5fee02e5c0a63459e82 Mon Sep 17 00:00:00 2001 From: Anthony Mahanna Date: Tue, 26 Nov 2024 18:33:10 -0500 Subject: [PATCH 10/25] fix: env --- comps/dataprep/arango/langchain/config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/comps/dataprep/arango/langchain/config.py b/comps/dataprep/arango/langchain/config.py index 8a3cbc501..16de3d16a 100644 --- a/comps/dataprep/arango/langchain/config.py +++ b/comps/dataprep/arango/langchain/config.py @@ -6,7 +6,7 @@ # ArangoDB configuration ARANGO_URL = os.getenv("ARANGO_URI", "http://localhost:8529") ARANGO_USERNAME = os.getenv("ARANGO_USERNAME", "root") -ARANGO_PASSWORD = os.getenv("NEO4J_PASSWORD", "test") +ARANGO_PASSWORD = os.getenv("ARANGO_PASSWORD", "test") DB_NAME = os.getenv("DB_NAME", "_system") # LLM/Embedding endpoints From 8f750e4472e33d9c11bdc39606ea6b6e33fef892 Mon Sep 17 00:00:00 2001 From: Anthony Mahanna Date: Wed, 27 Nov 2024 16:53:52 -0500 Subject: [PATCH 11/25] Update README.md --- comps/dataprep/arango/langchain/README.md | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/comps/dataprep/arango/langchain/README.md b/comps/dataprep/arango/langchain/README.md index 35f3cdecd..db860804c 100644 --- a/comps/dataprep/arango/langchain/README.md +++ b/comps/dataprep/arango/langchain/README.md @@ -14,8 +14,12 @@ apt-get install poppler-utils -y To launch ArangoDB locally, first ensure you have docker installed. Then, you can launch the database with the following docker command. 
-```bash + + +```bash +docker run -d --name arangodb -p 8529:8529 -e ARANGO_ROOT_PASSWORD=password jbajic/arango-preview:vector-index-preview --experimental-vector-index=true ``` ### Setup Environment Variables From ea222e78ce829769e5fdec339336ee9035dfe27d Mon Sep 17 00:00:00 2001 From: Anthony Mahanna Date: Wed, 27 Nov 2024 17:01:41 -0500 Subject: [PATCH 12/25] Revert "Update README.md" This reverts commit 8f750e4472e33d9c11bdc39606ea6b6e33fef892. --- comps/dataprep/arango/langchain/README.md | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/comps/dataprep/arango/langchain/README.md b/comps/dataprep/arango/langchain/README.md index db860804c..35f3cdecd 100644 --- a/comps/dataprep/arango/langchain/README.md +++ b/comps/dataprep/arango/langchain/README.md @@ -14,12 +14,8 @@ apt-get install poppler-utils -y To launch ArangoDB locally, first ensure you have docker installed. Then, you can launch the database with the following docker command. - - ```bash -docker run -d --name arangodb -p 8529:8529 -e ARANGO_ROOT_PASSWORD=password jbajic/arango-preview:vector-index-preview --experimental-vector-index=true +docker run -d --name arangodb -p 8529:8529 -e ARANGO_ROOT_PASSWORD=password arangodb/arangodb:latest ``` ### Setup Environment Variables From 204457e4c4d21c01d538c65a9f5164213a7473e8 Mon Sep 17 00:00:00 2001 From: Anthony Mahanna Date: Tue, 3 Dec 2024 16:03:55 -0500 Subject: [PATCH 13/25] fix: create database --- .../dataprep/arango/langchain/prepare_doc_arango.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/comps/dataprep/arango/langchain/prepare_doc_arango.py b/comps/dataprep/arango/langchain/prepare_doc_arango.py index 240469e43..6e5997231 100644 --- a/comps/dataprep/arango/langchain/prepare_doc_arango.py +++ b/comps/dataprep/arango/langchain/prepare_doc_arango.py @@ -99,9 +99,19 @@ def ingest_data_to_arango(doc_path: DocPath): doc_list = [Document(page_content=text) for text in chunks] graph_doc = 
llm_transformer.convert_to_graph_documents(doc_list) - db = ArangoClient(hosts=ARANGO_URL).db( + client = ArangoClient(hosts=ARANGO_URL) + + sys_db = client.db( + name="_system", username=ARANGO_USERNAME, password=ARANGO_PASSWORD, verify=True + ) + + if not sys_db.has_database(DB_NAME): + sys_db.create_database(DB_NAME) + + db = client.db( name=DB_NAME, username=ARANGO_USERNAME, password=ARANGO_PASSWORD, verify=True ) + graph = ArangoGraph(db=db, include_examples=True) graph.add_graph_documents( From bbf368275575c0e8f52aa5f7b8537a367d404bd8 Mon Sep 17 00:00:00 2001 From: Anthony Mahanna Date: Sun, 8 Dec 2024 22:28:11 -0500 Subject: [PATCH 14/25] cleanup --- .../arango/langchain/prepare_doc_arango.py | 72 ++++++++++++++----- 1 file changed, 54 insertions(+), 18 deletions(-) diff --git a/comps/dataprep/arango/langchain/prepare_doc_arango.py b/comps/dataprep/arango/langchain/prepare_doc_arango.py index 6e5997231..35af1e9d5 100644 --- a/comps/dataprep/arango/langchain/prepare_doc_arango.py +++ b/comps/dataprep/arango/langchain/prepare_doc_arango.py @@ -92,34 +92,70 @@ def ingest_data_to_arango(doc_path: DocPath): timeout=600, ) + client = ArangoClient(hosts=ARANGO_URL) + sys_db = client.db(name="_system", username=ARANGO_USERNAME, password=ARANGO_PASSWORD, verify=True) + + if not sys_db.has_database(DB_NAME): + sys_db.create_database(DB_NAME) + + db = client.db(name=DB_NAME, username=ARANGO_USERNAME, password=ARANGO_PASSWORD, verify=True) + + graph = ArangoGraph( + db=db, + include_examples=False, + generate_schema_on_init=False, + ) + llm_transformer = LLMGraphTransformer( llm=llm, node_properties=["description"], relationship_properties=["description"] ) - doc_list = [Document(page_content=text) for text in chunks] - graph_doc = llm_transformer.convert_to_graph_documents(doc_list) + ########################## + # Option A: Batch insert # + ########################## - client = ArangoClient(hosts=ARANGO_URL) + for text in chunks: + document = 
Document(page_content=text) - sys_db = client.db( - name="_system", username=ARANGO_USERNAME, password=ARANGO_PASSWORD, verify=True - ) + # TODO: + # if generate_chunk_embeddings: + # document.metadata["embeddings"] = llm.generate_embeddings(text) - if not sys_db.has_database(DB_NAME): - sys_db.create_database(DB_NAME) + graph_docs = llm_transformer.convert_to_graph_documents([document]) - db = client.db( - name=DB_NAME, username=ARANGO_USERNAME, password=ARANGO_PASSWORD, verify=True - ) + graph.add_graph_documents( + graph_documents=graph_docs, + include_source=True, # TODO: Parameterize + graph_name="NewGraph", # TODO: Parameterize + batch_size=1000, # TODO: Parameterize + use_one_entity_collection=True, # TODO: Parameterize + insert_async=False, # TODO: Parameterize + ) - graph = ArangoGraph(db=db, include_examples=True) + # ######################### + # # Option B: Bulk insert # + # ######################### - graph.add_graph_documents( - graph_doc, - # baseEntityLabel=True, # <<< TODO: Figure out if this is required in ArangoGraph - include_source=True, # <<< TODO: Make this customizable for the user to set somehow - graph_name="NewGraph", # <<< TODO: Make this customizable for the user to set somehow - ) + # doc_list = [] + # for text in chunks: + # doc = Document(page_content=text) + + # # TODO: + # # if generate_chunk_embeddings: + # # doc.metadata["embeddings"] = llm.generate_embeddings(text) + + # doc_list.append(doc) + + # graph_docs = llm_transformer.convert_to_graph_documents(doc_list) + + # graph.add_graph_documents( + # graph_documents=graph_docs, + # include_source=True, # TODO: Parameterize + # graph_name="NewGraph", # TODO: Parameterize + # batch_size=1000, # TODO: Parameterize + # use_one_entity_collection=True, # TODO: Parameterize + # insert_async=False, # TODO: Parameterize + # ) if logflag: logger.info("The graph is built.") From a44536348aa3a5a40b37e28cb4724849f7dc3e19 Mon Sep 17 00:00:00 2001 From: Anthony Mahanna Date: Sun, 8 Dec 2024 
22:44:52 -0500 Subject: [PATCH 15/25] new: chunk embedding generation --- comps/dataprep/arango/langchain/config.py | 1 + .../arango/langchain/prepare_doc_arango.py | 52 ++++++++++++++----- 2 files changed, 39 insertions(+), 14 deletions(-) diff --git a/comps/dataprep/arango/langchain/config.py b/comps/dataprep/arango/langchain/config.py index 16de3d16a..945656722 100644 --- a/comps/dataprep/arango/langchain/config.py +++ b/comps/dataprep/arango/langchain/config.py @@ -13,4 +13,5 @@ TGI_LLM_ENDPOINT = os.getenv("TGI_LLM_ENDPOINT", "http://localhost:8080") TGI_LLM_ENDPOINT_NO_RAG = os.getenv("TGI_LLM_ENDPOINT_NO_RAG", "http://localhost:8081") TEI_EMBEDDING_ENDPOINT = os.getenv("TEI_ENDPOINT") +TEI_EMBED_MODEL = os.getenv("TEI_EMBED_MODEL", "BAAI/bge-base-en-v1.5") OPENAI_KEY = os.getenv("OPENAI_API_KEY") diff --git a/comps/dataprep/arango/langchain/prepare_doc_arango.py b/comps/dataprep/arango/langchain/prepare_doc_arango.py index 35af1e9d5..ab595b048 100644 --- a/comps/dataprep/arango/langchain/prepare_doc_arango.py +++ b/comps/dataprep/arango/langchain/prepare_doc_arango.py @@ -7,12 +7,23 @@ import openai from arango import ArangoClient -from config import ARANGO_PASSWORD, ARANGO_URL, ARANGO_USERNAME, DB_NAME, OPENAI_KEY, TGI_LLM_ENDPOINT +from config import ( + ARANGO_PASSWORD, + ARANGO_URL, + ARANGO_USERNAME, + DB_NAME, + OPENAI_KEY, + TEI_EMBED_MODEL, + TEI_EMBEDDING_ENDPOINT, + TGI_LLM_ENDPOINT, +) from fastapi import File, Form, HTTPException, UploadFile from langchain.text_splitter import RecursiveCharacterTextSplitter +from langchain_community.embeddings import HuggingFaceBgeEmbeddings, HuggingFaceHubEmbeddings from langchain_community.graphs.arangodb_graph import ArangoGraph from langchain_community.llms import HuggingFaceEndpoint from langchain_core.documents import Document +from langchain_core.embeddings import Embeddings from langchain_experimental.graph_transformers import LLMGraphTransformer from langchain_openai import ChatOpenAI from 
langchain_text_splitters import HTMLHeaderTextSplitter @@ -33,7 +44,7 @@ upload_folder = "./uploaded_files/" -def ingest_data_to_arango(doc_path: DocPath): +def ingest_data_to_arango(doc_path: DocPath, embeddings: Embeddings | None) -> bool: """Ingest document to ArangoDB.""" path = doc_path.path if logflag: @@ -114,15 +125,16 @@ def ingest_data_to_arango(doc_path: DocPath): # Option A: Batch insert # ########################## + generate_chunk_embeddings = embeddings is not None + for text in chunks: document = Document(page_content=text) - - # TODO: - # if generate_chunk_embeddings: - # document.metadata["embeddings"] = llm.generate_embeddings(text) - graph_docs = llm_transformer.convert_to_graph_documents([document]) + if generate_chunk_embeddings: + source = graph_docs[0].source + source.metadata["embeddings"] = embeddings.embed_documents([source.page_content])[0] + graph.add_graph_documents( graph_documents=graph_docs, include_source=True, # TODO: Parameterize @@ -140,9 +152,9 @@ def ingest_data_to_arango(doc_path: DocPath): # for text in chunks: # doc = Document(page_content=text) - # # TODO: - # # if generate_chunk_embeddings: - # # doc.metadata["embeddings"] = llm.generate_embeddings(text) + # if generate_chunk_embeddings: + # source = graph_docs[0].source + # doc.metadata["embeddings"] = embeddings.embed_documents([text])[0] # doc_list.append(doc) @@ -183,6 +195,16 @@ async def ingest_documents( logger.info(f"files:{files}") logger.info(f"link_list:{link_list}") + if TEI_EMBEDDING_ENDPOINT: + # create embeddings using TEI endpoint service + hf_token = os.getenv("HUGGINGFACEHUB_API_TOKEN") + embeddings = HuggingFaceHubEmbeddings(model=TEI_EMBEDDING_ENDPOINT, huggingfacehub_api_token=hf_token) + elif TEI_EMBED_MODEL: + # create embeddings using local embedding model + embeddings = HuggingFaceBgeEmbeddings(model_name=TEI_EMBED_MODEL) + else: + embeddings = None + if files: if not isinstance(files, list): files = [files] @@ -192,13 +214,14 @@ async def 
ingest_documents( save_path = upload_folder + encode_file await save_content_to_local_disk(save_path, file) ingest_data_to_arango( - DocPath( + doc_path=DocPath( path=save_path, chunk_size=chunk_size, chunk_overlap=chunk_overlap, process_table=process_table, table_strategy=table_strategy, - ) + ), + embeddings=embeddings, ) uploaded_files.append(save_path) if logflag: @@ -219,13 +242,14 @@ async def ingest_documents( try: await save_content_to_local_disk(save_path, content) ingest_data_to_arango( - DocPath( + doc_path=DocPath( path=save_path, chunk_size=chunk_size, chunk_overlap=chunk_overlap, process_table=process_table, table_strategy=table_strategy, - ) + ), + embeddings=embeddings, ) except json.JSONDecodeError: raise HTTPException(status_code=500, detail="Fail to ingest data into qdrant.") From c58e6b6fc7095fc46a00c5e9856c576032f1ff3e Mon Sep 17 00:00:00 2001 From: Anthony Mahanna Date: Mon, 9 Dec 2024 09:57:04 -0500 Subject: [PATCH 16/25] new: `cithash` dep --- comps/dataprep/arango/langchain/requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/comps/dataprep/arango/langchain/requirements.txt b/comps/dataprep/arango/langchain/requirements.txt index 921d1f6e0..74d4a9f0d 100644 --- a/comps/dataprep/arango/langchain/requirements.txt +++ b/comps/dataprep/arango/langchain/requirements.txt @@ -13,6 +13,7 @@ langchain-text-splitters langchain_huggingface markdown python-arango +cityhash numpy openai opentelemetry-api From fdc531c4136862710efdc40fa4cb7f1e5e3ed525 Mon Sep 17 00:00:00 2001 From: Anthony Mahanna Date: Mon, 9 Dec 2024 09:57:17 -0500 Subject: [PATCH 17/25] cleanup: `ingest_data_to_arango` --- .../arango/langchain/prepare_doc_arango.py | 175 +++++++++--------- 1 file changed, 90 insertions(+), 85 deletions(-) diff --git a/comps/dataprep/arango/langchain/prepare_doc_arango.py b/comps/dataprep/arango/langchain/prepare_doc_arango.py index ab595b048..d99cf7147 100644 --- a/comps/dataprep/arango/langchain/prepare_doc_arango.py +++ 
b/comps/dataprep/arango/langchain/prepare_doc_arango.py @@ -12,6 +12,7 @@ ARANGO_URL, ARANGO_USERNAME, DB_NAME, + HUGGINGFACEHUB_API_TOKEN, OPENAI_KEY, TEI_EMBED_MODEL, TEI_EMBEDDING_ENDPOINT, @@ -25,7 +26,7 @@ from langchain_core.documents import Document from langchain_core.embeddings import Embeddings from langchain_experimental.graph_transformers import LLMGraphTransformer -from langchain_openai import ChatOpenAI +from langchain_openai import ChatOpenAI, OpenAIEmbeddings from langchain_text_splitters import HTMLHeaderTextSplitter from comps import CustomLogger, DocPath, opea_microservices, register_microservice @@ -50,36 +51,27 @@ def ingest_data_to_arango(doc_path: DocPath, embeddings: Embeddings | None) -> b if logflag: logger.info(f"Parsing document {path}.") - if path.endswith(".html"): - headers_to_split_on = [ - ("h1", "Header 1"), - ("h2", "Header 2"), - ("h3", "Header 3"), - ] - text_splitter = HTMLHeaderTextSplitter(headers_to_split_on=headers_to_split_on) - else: - text_splitter = RecursiveCharacterTextSplitter( - chunk_size=doc_path.chunk_size, - chunk_overlap=doc_path.chunk_overlap, - add_start_index=True, - separators=get_separators(), - ) + ############ + # ArangoDB # + ############ - content = document_loader(path) + client = ArangoClient(hosts=ARANGO_URL) + sys_db = client.db(name="_system", username=ARANGO_USERNAME, password=ARANGO_PASSWORD, verify=True) - structured_types = [".xlsx", ".csv", ".json", "jsonl"] - _, ext = os.path.splitext(path) + if not sys_db.has_database(DB_NAME): + sys_db.create_database(DB_NAME) - if ext in structured_types: - chunks = content - else: - chunks = text_splitter.split_text(content) + db = client.db(name=DB_NAME, username=ARANGO_USERNAME, password=ARANGO_PASSWORD, verify=True) - if doc_path.process_table and path.endswith(".pdf"): - table_chunks = get_tables_result(path, doc_path.table_strategy) - chunks = chunks + table_chunks - if logflag: - logger.info("Done preprocessing. 
Created ", len(chunks), " chunks of the original file.") + graph = ArangoGraph( + db=db, + include_examples=False, + generate_schema_on_init=False, + ) + + ############################# + # Text Generation Inference # + ############################# if OPENAI_KEY: logger.info("OpenAI API Key is set. Verifying its validity...") @@ -93,7 +85,7 @@ def ingest_data_to_arango(doc_path: DocPath, embeddings: Embeddings | None) -> b logger.info("OpenAI API Key is invalid.") except Exception as e: logger.info(f"An error occurred while verifying the API Key: {e}") - else: + elif TGI_LLM_ENDPOINT: llm = HuggingFaceEndpoint( endpoint_url=TGI_LLM_ENDPOINT, max_new_tokens=512, @@ -102,28 +94,77 @@ def ingest_data_to_arango(doc_path: DocPath, embeddings: Embeddings | None) -> b temperature=0.8, timeout=600, ) + else: + raise ValueError("No text generation inference endpoint is set.") - client = ArangoClient(hosts=ARANGO_URL) - sys_db = client.db(name="_system", username=ARANGO_USERNAME, password=ARANGO_PASSWORD, verify=True) + llm_transformer = LLMGraphTransformer( + llm=llm, + # prompt=..., # TODO: Parameterize + # allowed_nodes=..., # TODO: Parameterize + # allowed_relationships=..., # TODO: Parameterize + ) - if not sys_db.has_database(DB_NAME): - sys_db.create_database(DB_NAME) + ######################################## + # Text Embeddings Inference (optional) # + ######################################## - db = client.db(name=DB_NAME, username=ARANGO_USERNAME, password=ARANGO_PASSWORD, verify=True) + if OPENAI_KEY: + # Use OpenAI embeddings + embeddings = OpenAIEmbeddings( + model="text-embedding-3-small", # TODO: Parameterize + dimensions=512, # TODO: Parameterize + ) - graph = ArangoGraph( - db=db, - include_examples=False, - generate_schema_on_init=False, - ) + if TEI_EMBEDDING_ENDPOINT and HUGGINGFACEHUB_API_TOKEN: + # Use TEI endpoint service + embeddings = HuggingFaceHubEmbeddings( + model=TEI_EMBEDDING_ENDPOINT, + huggingfacehub_api_token=HUGGINGFACEHUB_API_TOKEN, + ) + 
elif TEI_EMBED_MODEL: + # Use local embedding model + embeddings = HuggingFaceBgeEmbeddings(model_name=TEI_EMBED_MODEL) + else: + embeddings = None - llm_transformer = LLMGraphTransformer( - llm=llm, node_properties=["description"], relationship_properties=["description"] - ) + ############ + # Chunking # + ############ + + if path.endswith(".html"): + headers_to_split_on = [ + ("h1", "Header 1"), + ("h2", "Header 2"), + ("h3", "Header 3"), + ] + text_splitter = HTMLHeaderTextSplitter(headers_to_split_on=headers_to_split_on) + else: + text_splitter = RecursiveCharacterTextSplitter( + chunk_size=doc_path.chunk_size, + chunk_overlap=doc_path.chunk_overlap, + add_start_index=True, + separators=get_separators(), + ) - ########################## - # Option A: Batch insert # - ########################## + content = document_loader(path) + + structured_types = [".xlsx", ".csv", ".json", "jsonl"] + _, ext = os.path.splitext(path) + + if ext in structured_types: + chunks = content + else: + chunks = text_splitter.split_text(content) + + if doc_path.process_table and path.endswith(".pdf"): + table_chunks = get_tables_result(path, doc_path.table_strategy) + chunks = chunks + table_chunks + if logflag: + logger.info("Done preprocessing. 
Created ", len(chunks), " chunks of the original file.") + + ################################ + # Graph generation & insertion # + ################################ generate_chunk_embeddings = embeddings is not None @@ -139,36 +180,12 @@ def ingest_data_to_arango(doc_path: DocPath, embeddings: Embeddings | None) -> b graph_documents=graph_docs, include_source=True, # TODO: Parameterize graph_name="NewGraph", # TODO: Parameterize + update_graph_definition_if_exists=False, # TODO: Set as reverse of `use_one_entity_collection` batch_size=1000, # TODO: Parameterize use_one_entity_collection=True, # TODO: Parameterize insert_async=False, # TODO: Parameterize ) - # ######################### - # # Option B: Bulk insert # - # ######################### - - # doc_list = [] - # for text in chunks: - # doc = Document(page_content=text) - - # if generate_chunk_embeddings: - # source = graph_docs[0].source - # doc.metadata["embeddings"] = embeddings.embed_documents([text])[0] - - # doc_list.append(doc) - - # graph_docs = llm_transformer.convert_to_graph_documents(doc_list) - - # graph.add_graph_documents( - # graph_documents=graph_docs, - # include_source=True, # TODO: Parameterize - # graph_name="NewGraph", # TODO: Parameterize - # batch_size=1000, # TODO: Parameterize - # use_one_entity_collection=True, # TODO: Parameterize - # insert_async=False, # TODO: Parameterize - # ) - if logflag: logger.info("The graph is built.") @@ -195,16 +212,6 @@ async def ingest_documents( logger.info(f"files:{files}") logger.info(f"link_list:{link_list}") - if TEI_EMBEDDING_ENDPOINT: - # create embeddings using TEI endpoint service - hf_token = os.getenv("HUGGINGFACEHUB_API_TOKEN") - embeddings = HuggingFaceHubEmbeddings(model=TEI_EMBEDDING_ENDPOINT, huggingfacehub_api_token=hf_token) - elif TEI_EMBED_MODEL: - # create embeddings using local embedding model - embeddings = HuggingFaceBgeEmbeddings(model_name=TEI_EMBED_MODEL) - else: - embeddings = None - if files: if not isinstance(files, list): 
files = [files] @@ -214,14 +221,13 @@ async def ingest_documents( save_path = upload_folder + encode_file await save_content_to_local_disk(save_path, file) ingest_data_to_arango( - doc_path=DocPath( + DocPath( path=save_path, chunk_size=chunk_size, chunk_overlap=chunk_overlap, process_table=process_table, table_strategy=table_strategy, - ), - embeddings=embeddings, + ) ) uploaded_files.append(save_path) if logflag: @@ -242,14 +248,13 @@ async def ingest_documents( try: await save_content_to_local_disk(save_path, content) ingest_data_to_arango( - doc_path=DocPath( + DocPath( path=save_path, chunk_size=chunk_size, chunk_overlap=chunk_overlap, process_table=process_table, table_strategy=table_strategy, - ), - embeddings=embeddings, + ) ) except json.JSONDecodeError: raise HTTPException(status_code=500, detail="Fail to ingest data into qdrant.") From 7b5612cd520ffa131a84588951096cf19099630b Mon Sep 17 00:00:00 2001 From: Anthony Mahanna Date: Mon, 9 Dec 2024 09:57:28 -0500 Subject: [PATCH 18/25] new: envs in `config` --- comps/dataprep/arango/langchain/config.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/comps/dataprep/arango/langchain/config.py b/comps/dataprep/arango/langchain/config.py index 945656722..69d3b7872 100644 --- a/comps/dataprep/arango/langchain/config.py +++ b/comps/dataprep/arango/langchain/config.py @@ -9,9 +9,13 @@ ARANGO_PASSWORD = os.getenv("ARANGO_PASSWORD", "test") DB_NAME = os.getenv("DB_NAME", "_system") -# LLM/Embedding endpoints +# Text Generation Inference configuration TGI_LLM_ENDPOINT = os.getenv("TGI_LLM_ENDPOINT", "http://localhost:8080") -TGI_LLM_ENDPOINT_NO_RAG = os.getenv("TGI_LLM_ENDPOINT_NO_RAG", "http://localhost:8081") -TEI_EMBEDDING_ENDPOINT = os.getenv("TEI_ENDPOINT") + +# Text Embeddings Inference configuration +TEI_EMBEDDING_ENDPOINT = os.getenv("TEI_EMBEDDING_ENDPOINT") +HUGGINGFACEHUB_API_TOKEN = os.getenv("HUGGINGFACEHUB_API_TOKEN") TEI_EMBED_MODEL = os.getenv("TEI_EMBED_MODEL", 
"BAAI/bge-base-en-v1.5") + +# OpenAI configuration (alternative to TGI & TEI) OPENAI_KEY = os.getenv("OPENAI_API_KEY") From ba27252ce2fc10975b29c217ef41803adfcecf77 Mon Sep 17 00:00:00 2001 From: Anthony Mahanna Date: Mon, 9 Dec 2024 09:59:50 -0500 Subject: [PATCH 19/25] fix: more envs --- comps/dataprep/arango/langchain/README.md | 8 ++++---- comps/dataprep/arango/langchain/config.py | 4 ++-- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/comps/dataprep/arango/langchain/README.md b/comps/dataprep/arango/langchain/README.md index 35f3cdecd..f15283323 100644 --- a/comps/dataprep/arango/langchain/README.md +++ b/comps/dataprep/arango/langchain/README.md @@ -24,10 +24,10 @@ docker run -d --name arangodb -p 8529:8529 -e ARANGO_ROOT_PASSWORD=password aran export no_proxy=${your_no_proxy} export http_proxy=${your_http_proxy} export https_proxy=${your_http_proxy} -export ARANGO_URI=${your_arango_url} +export ARANGO_URL=${your_arango_url} export ARANGO_USERNAME=${your_arango_username} export ARANGO_PASSWORD=${your_arango_password} -export DB_NAME=${your_db_name} +export ARANGO_DB_NAME=${your_db_name} export PYTHONPATH=${path_to_comps} ``` @@ -60,10 +60,10 @@ docker run -d --name="dataprep-arango-server" -p 6007:6007 --ipc=host -e http_pr export no_proxy=${your_no_proxy} export http_proxy=${your_http_proxy} export https_proxy=${your_http_proxy} -export ARANGO_URI=${your_arango_url} +export ARANGO_URL=${your_arango_url} export ARANGO_USERNAME=${your_arango_username} export ARANGO_PASSWORD=${your_arango_password} -export DB_NAME=${your_db_name} +export ARANGO_DB_NAME=${your_db_name} ``` ### Run Docker with Docker Compose diff --git a/comps/dataprep/arango/langchain/config.py b/comps/dataprep/arango/langchain/config.py index 69d3b7872..a6d5a6634 100644 --- a/comps/dataprep/arango/langchain/config.py +++ b/comps/dataprep/arango/langchain/config.py @@ -4,10 +4,10 @@ import os # ArangoDB configuration -ARANGO_URL = os.getenv("ARANGO_URI", "http://localhost:8529") 
+ARANGO_URL = os.getenv("ARANGO_URL", "http://localhost:8529") ARANGO_USERNAME = os.getenv("ARANGO_USERNAME", "root") ARANGO_PASSWORD = os.getenv("ARANGO_PASSWORD", "test") -DB_NAME = os.getenv("DB_NAME", "_system") +ARANGO_DB_NAME = os.getenv("ARANGO_DB_NAME", "_system") # Text Generation Inference configuration TGI_LLM_ENDPOINT = os.getenv("TGI_LLM_ENDPOINT", "http://localhost:8080") From 3e75cd169b2f16696242835817f157a257fc3838 Mon Sep 17 00:00:00 2001 From: Anthony Mahanna Date: Mon, 9 Dec 2024 10:25:24 -0500 Subject: [PATCH 20/25] more env cleanup --- comps/dataprep/arango/langchain/README.md | 2 +- comps/dataprep/arango/langchain/config.py | 2 +- .../docker-compose-dataprep-arango.yaml | 7 +++++-- .../arango/langchain/prepare_doc_arango.py | 16 ++++++++-------- 4 files changed, 15 insertions(+), 12 deletions(-) diff --git a/comps/dataprep/arango/langchain/README.md b/comps/dataprep/arango/langchain/README.md index f15283323..095a6da73 100644 --- a/comps/dataprep/arango/langchain/README.md +++ b/comps/dataprep/arango/langchain/README.md @@ -99,7 +99,7 @@ We support table extraction from pdf documents. You can specify process_table an Note: If you specify "table_strategy=llm", You should first start TGI Service, please refer to 1.2.1, 1.3.1 in https://github.com/opea-project/GenAIComps/tree/main/comps/llms/README.md, and then `export TGI_LLM_ENDPOINT="http://${your_ip}:8008"`. -For ensure the quality and comprehensiveness of the extracted entities, we recommend to use `gpt-4o` as the default model for parsing the document. To enable the openai service, please `export OPENAI_KEY=xxxx` before using this services. +For ensure the quality and comprehensiveness of the extracted entities, we recommend to use `gpt-4o` as the default model for parsing the document. To enable the openai service, please `export OPENAI_API_KEY=xxxx` before using this services. 
```bash curl -X POST \ diff --git a/comps/dataprep/arango/langchain/config.py b/comps/dataprep/arango/langchain/config.py index a6d5a6634..45eb39a80 100644 --- a/comps/dataprep/arango/langchain/config.py +++ b/comps/dataprep/arango/langchain/config.py @@ -18,4 +18,4 @@ TEI_EMBED_MODEL = os.getenv("TEI_EMBED_MODEL", "BAAI/bge-base-en-v1.5") # OpenAI configuration (alternative to TGI & TEI) -OPENAI_KEY = os.getenv("OPENAI_API_KEY") +OPENAI_API_KEY = os.getenv("OPENAI_API_KEY") diff --git a/comps/dataprep/arango/langchain/docker-compose-dataprep-arango.yaml b/comps/dataprep/arango/langchain/docker-compose-dataprep-arango.yaml index 970c7490e..c766b5c03 100644 --- a/comps/dataprep/arango/langchain/docker-compose-dataprep-arango.yaml +++ b/comps/dataprep/arango/langchain/docker-compose-dataprep-arango.yaml @@ -38,9 +38,12 @@ services: ARANGO_URL: ${ARANGO_URL} ARANGO_USERNAME: ${ARANGO_USERNAME} ARANGO_PASSWORD: ${ARANGO_PASSWORD} - DB_NAME: ${DB_NAME} + ARANGO_DB_NAME: ${ARANGO_DB_NAME} TGI_LLM_ENDPOINT: ${TEI_ENDPOINT} - OPENAI_KEY: ${OPENAI_API_KEY} + TEI_EMBEDDING_ENDPOINT: ${TEI_EMBEDDING_ENDPOINT} + HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN} + TEI_EMBED_MODEL: ${TEI_EMBED_MODEL} + OPENAI_API_KEY: ${OPENAI_API_KEY} restart: unless-stopped networks: diff --git a/comps/dataprep/arango/langchain/prepare_doc_arango.py b/comps/dataprep/arango/langchain/prepare_doc_arango.py index d99cf7147..08197fbd3 100644 --- a/comps/dataprep/arango/langchain/prepare_doc_arango.py +++ b/comps/dataprep/arango/langchain/prepare_doc_arango.py @@ -8,12 +8,12 @@ import openai from arango import ArangoClient from config import ( + ARANGO_DB_NAME, ARANGO_PASSWORD, ARANGO_URL, ARANGO_USERNAME, - DB_NAME, HUGGINGFACEHUB_API_TOKEN, - OPENAI_KEY, + OPENAI_API_KEY, TEI_EMBED_MODEL, TEI_EMBEDDING_ENDPOINT, TGI_LLM_ENDPOINT, @@ -58,10 +58,10 @@ def ingest_data_to_arango(doc_path: DocPath, embeddings: Embeddings | None) -> b client = ArangoClient(hosts=ARANGO_URL) sys_db = 
client.db(name="_system", username=ARANGO_USERNAME, password=ARANGO_PASSWORD, verify=True) - if not sys_db.has_database(DB_NAME): - sys_db.create_database(DB_NAME) + if not sys_db.has_database(ARANGO_DB_NAME): + sys_db.create_database(ARANGO_DB_NAME) - db = client.db(name=DB_NAME, username=ARANGO_USERNAME, password=ARANGO_PASSWORD, verify=True) + db = client.db(name=ARANGO_DB_NAME, username=ARANGO_USERNAME, password=ARANGO_PASSWORD, verify=True) graph = ArangoGraph( db=db, @@ -73,9 +73,9 @@ def ingest_data_to_arango(doc_path: DocPath, embeddings: Embeddings | None) -> b # Text Generation Inference # ############################# - if OPENAI_KEY: + if OPENAI_API_KEY: logger.info("OpenAI API Key is set. Verifying its validity...") - openai.api_key = OPENAI_KEY + openai.api_key = OPENAI_API_KEY try: response = openai.Engine.list() @@ -108,7 +108,7 @@ def ingest_data_to_arango(doc_path: DocPath, embeddings: Embeddings | None) -> b # Text Embeddings Inference (optional) # ######################################## - if OPENAI_KEY: + if OPENAI_API_KEY: # Use OpenAI embeddings embeddings = OpenAIEmbeddings( model="text-embedding-3-small", # TODO: Parameterize From 049b782b89866cd35d367316729ef8229466126c Mon Sep 17 00:00:00 2001 From: Anthony Mahanna Date: Mon, 9 Dec 2024 10:50:11 -0500 Subject: [PATCH 21/25] fix: deprecated line --- comps/dataprep/arango/langchain/prepare_doc_arango.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/comps/dataprep/arango/langchain/prepare_doc_arango.py b/comps/dataprep/arango/langchain/prepare_doc_arango.py index 08197fbd3..ab84a847c 100644 --- a/comps/dataprep/arango/langchain/prepare_doc_arango.py +++ b/comps/dataprep/arango/langchain/prepare_doc_arango.py @@ -78,13 +78,14 @@ def ingest_data_to_arango(doc_path: DocPath, embeddings: Embeddings | None) -> b openai.api_key = OPENAI_API_KEY try: - response = openai.Engine.list() + openai.models.list() logger.info("OpenAI API Key is valid.") llm = ChatOpenAI(temperature=0, 
model_name="gpt-4o") except openai.error.AuthenticationError: logger.info("OpenAI API Key is invalid.") except Exception as e: logger.info(f"An error occurred while verifying the API Key: {e}") + elif TGI_LLM_ENDPOINT: llm = HuggingFaceEndpoint( endpoint_url=TGI_LLM_ENDPOINT, From 5ae912e7c48370f3387d18beec1cff220c023879 Mon Sep 17 00:00:00 2001 From: Ajay Kallepalli Date: Wed, 11 Dec 2024 12:35:44 -0800 Subject: [PATCH 22/25] Initial readme and prepare doc arango, with embeddings by Anthony --- comps/dataprep/arango/langchain/README.md | 31 +++-- comps/dataprep/arango/langchain/config.py | 16 +++ .../arango/langchain/prepare_doc_arango.py | 126 +++++++++++++----- 3 files changed, 130 insertions(+), 43 deletions(-) diff --git a/comps/dataprep/arango/langchain/README.md b/comps/dataprep/arango/langchain/README.md index 095a6da73..1d1e696d4 100644 --- a/comps/dataprep/arango/langchain/README.md +++ b/comps/dataprep/arango/langchain/README.md @@ -54,18 +54,6 @@ docker build -t opea/dataprep-arango:latest --build-arg https_proxy=$https_proxy docker run -d --name="dataprep-arango-server" -p 6007:6007 --ipc=host -e http_proxy=$http_proxy -e https_proxy=$https_proxy opea/dataprep-arango:latest ``` -### Setup Environment Variables - -```bash -export no_proxy=${your_no_proxy} -export http_proxy=${your_http_proxy} -export https_proxy=${your_http_proxy} -export ARANGO_URL=${your_arango_url} -export ARANGO_USERNAME=${your_arango_username} -export ARANGO_PASSWORD=${your_arango_password} -export ARANGO_DB_NAME=${your_db_name} -``` - ### Run Docker with Docker Compose ```bash @@ -77,10 +65,15 @@ docker compose -f docker-compose-dataprep-arango.yaml up -d Once document preparation microservice for ArangoDB is started, user can use below command to invoke the microservice to convert the document to embedding and save to the database. +After the service is complete a Graph is created in ArangoDB. 
The default graph name is `NewGraph`, you can specify the graph name by `-F "graph_name=${your_graph_name}"` in the curl command. + +By default, the microservice will create embeddings for the documents. You can specify `-F "create_embeddings=false"` to skip the embedding creation. + ```bash curl -X POST \ -H "Content-Type: multipart/form-data" \ -F "files=@./file1.txt" \ + -F "graph_name=${your_graph_name}" \ http://localhost:6007/v1/dataprep ``` @@ -92,9 +85,22 @@ curl -X POST \ -F "files=@./file1.txt" \ -F "chunk_size=1500" \ -F "chunk_overlap=100" \ + -F "graph_name=${your_graph_name}" \ http://localhost:6007/v1/dataprep ``` +Additional options that can be specified from the config.py file are as follows: + +OpenAI Configuration: +- `OPENAI_EMBED_MODEL`: The embedding model to use for the OpenAI service. +- `OPENAI_EMBED_DIMENSIONS`: The dimensions for the OpenAI service. + +ArangoDB Configuration: +- `USE_ONE_ENTITY_COLLECTION`: If set to True, the microservice will use a single entity collection for all nodes. If set to False, the microservice will use a separate collection for each node type. +- `INSERT_ASYNC`: If set to True, the microservice will insert the data asynchronously. +- `ARANGO_BATCH_SIZE`: The batch size for the microservice to insert the data. +- `INCLUDE_SOURCE`: If set to True, the microservice will include the source in the data. + We support table extraction from pdf documents. You can specify process_table and table_strategy by the following commands. "table_strategy" refers to the strategies to understand tables for table retrieval. As the setting progresses from "fast" to "hq" to "llm," the focus shifts towards deeper table understanding at the expense of processing speed. The default strategy is "fast". 
Note: If you specify "table_strategy=llm", You should first start TGI Service, please refer to 1.2.1, 1.3.1 in https://github.com/opea-project/GenAIComps/tree/main/comps/llms/README.md, and then `export TGI_LLM_ENDPOINT="http://${your_ip}:8008"`. @@ -107,5 +113,6 @@ curl -X POST \ -F "files=@./your_file.pdf" \ -F "process_table=true" \ -F "table_strategy=hq" \ + -F "graph_name=${your_graph_name}" \ http://localhost:6007/v1/dataprep ``` diff --git a/comps/dataprep/arango/langchain/config.py b/comps/dataprep/arango/langchain/config.py index 45eb39a80..525862a9d 100644 --- a/comps/dataprep/arango/langchain/config.py +++ b/comps/dataprep/arango/langchain/config.py @@ -9,6 +9,12 @@ ARANGO_PASSWORD = os.getenv("ARANGO_PASSWORD", "test") ARANGO_DB_NAME = os.getenv("ARANGO_DB_NAME", "_system") +# ArangoDB graph configuration +USE_ONE_ENTITY_COLLECTION = True +INSERT_ASYNC = False +ARANGO_BATCH_SIZE = 1000 +INCLUDE_SOURCE = True + # Text Generation Inference configuration TGI_LLM_ENDPOINT = os.getenv("TGI_LLM_ENDPOINT", "http://localhost:8080") @@ -19,3 +25,13 @@ # OpenAI configuration (alternative to TGI & TEI) OPENAI_API_KEY = os.getenv("OPENAI_API_KEY") +OPENAI_EMBED_MODEL = os.getenv("OPENAI_EMBED_MODEL", "text-embedding-3-small") +OPENAI_EMBED_DIMENSIONS = os.getenv("OPENAI_EMBED_DIMENSIONS", 512) + +# LLMGraphTransformer configuration +ALLOWED_NODES = [] # ["Person", "Organization"] +ALLOWED_RELATIONSHIPS = [] # [("Person", "knows", "Person"), ("Person", "works_at", "Organization")] +NODE_PROPERTIES = False # ["description"] +RELATIONSHIP_PROPERTIES = False # ["description"] + +SYSTEM_PROMPT_PATH = os.getenv("SYSTEM_PROMPT_PATH", "./prompt.txt") \ No newline at end of file diff --git a/comps/dataprep/arango/langchain/prepare_doc_arango.py b/comps/dataprep/arango/langchain/prepare_doc_arango.py index ab84a847c..80c777225 100644 --- a/comps/dataprep/arango/langchain/prepare_doc_arango.py +++ b/comps/dataprep/arango/langchain/prepare_doc_arango.py @@ -17,6 +17,17 @@ 
TEI_EMBED_MODEL, TEI_EMBEDDING_ENDPOINT, TGI_LLM_ENDPOINT, + OPENAI_EMBED_MODEL, + OPENAI_EMBED_DIMENSIONS, + USE_ONE_ENTITY_COLLECTION, + INSERT_ASYNC, + ARANGO_BATCH_SIZE, + INCLUDE_SOURCE, + SYSTEM_PROMPT_PATH, + ALLOWED_NODES, + ALLOWED_RELATIONSHIPS, + NODE_PROPERTIES, + RELATIONSHIP_PROPERTIES, ) from fastapi import File, Form, HTTPException, UploadFile from langchain.text_splitter import RecursiveCharacterTextSplitter @@ -28,6 +39,7 @@ from langchain_experimental.graph_transformers import LLMGraphTransformer from langchain_openai import ChatOpenAI, OpenAIEmbeddings from langchain_text_splitters import HTMLHeaderTextSplitter +from langchain_core.prompts import ChatPromptTemplate, BasePromptTemplate from comps import CustomLogger, DocPath, opea_microservices, register_microservice from comps.dataprep.utils import ( @@ -44,8 +56,31 @@ upload_folder = "./uploaded_files/" +PROMPT_TEMPLATE = None +if SYSTEM_PROMPT_PATH is not None: + try: + with open(SYSTEM_PROMPT_PATH, "r") as f: + PROMPT_TEMPLATE = ChatPromptTemplate.from_messages( + [ + ( + "system", + f.read(), + ), + ( + "human", + ( + "Tip: Make sure to answer in the correct format and do " + "not include any explanations. 
" + "Use the given format to extract information from the " + "following input: {input}" + ), + ), + ] + ) + except Exception: + print("Could not set custom Prompt") -def ingest_data_to_arango(doc_path: DocPath, embeddings: Embeddings | None) -> bool: +def ingest_data_to_arango(doc_path: DocPath, graph_name: str, create_embeddings: bool) -> bool: """Ingest document to ArangoDB.""" path = doc_path.path if logflag: @@ -98,33 +133,56 @@ def ingest_data_to_arango(doc_path: DocPath, embeddings: Embeddings | None) -> b else: raise ValueError("No text generation inference endpoint is set.") - llm_transformer = LLMGraphTransformer( - llm=llm, - # prompt=..., # TODO: Parameterize - # allowed_nodes=..., # TODO: Parameterize - # allowed_relationships=..., # TODO: Parameterize - ) + try: + if not (NODE_PROPERTIES or RELATIONSHIP_PROPERTIES): + llm_transformer = LLMGraphTransformer( + llm=llm, + prompt=PROMPT_TEMPLATE, + allowed_nodes=ALLOWED_NODES, + allowed_relationships=ALLOWED_RELATIONSHIPS, + ) + else: + llm_transformer = LLMGraphTransformer( + llm=llm, + node_properties=NODE_PROPERTIES, + relationship_properties=RELATIONSHIP_PROPERTIES, + prompt=PROMPT_TEMPLATE, + allowed_nodes=ALLOWED_NODES, + allowed_relationships=ALLOWED_RELATIONSHIPS, + ) + except (TypeError, ValueError) as e: + logger.warning(f"Advanced LLMGraphTransformer failed: {e}") + # Fall back to basic config + try: + llm_transformer = LLMGraphTransformer(llm=llm) + except (TypeError, ValueError) as e: + logger.error(f"Failed to initialize LLMGraphTransformer: {e}") + raise ######################################## # Text Embeddings Inference (optional) # ######################################## - if OPENAI_API_KEY: - # Use OpenAI embeddings - embeddings = OpenAIEmbeddings( - model="text-embedding-3-small", # TODO: Parameterize - dimensions=512, # TODO: Parameterize - ) + if create_embeddings: + if OPENAI_API_KEY: + # Use OpenAI embeddings + embeddings = OpenAIEmbeddings( + model=OPENAI_EMBED_MODEL, + 
dimensions=OPENAI_EMBED_DIMENSIONS, + ) - if TEI_EMBEDDING_ENDPOINT and HUGGINGFACEHUB_API_TOKEN: - # Use TEI endpoint service - embeddings = HuggingFaceHubEmbeddings( - model=TEI_EMBEDDING_ENDPOINT, - huggingfacehub_api_token=HUGGINGFACEHUB_API_TOKEN, - ) - elif TEI_EMBED_MODEL: - # Use local embedding model - embeddings = HuggingFaceBgeEmbeddings(model_name=TEI_EMBED_MODEL) + elif TEI_EMBEDDING_ENDPOINT and HUGGINGFACEHUB_API_TOKEN: + # Use TEI endpoint service + embeddings = HuggingFaceHubEmbeddings( + model=TEI_EMBEDDING_ENDPOINT, + huggingfacehub_api_token=HUGGINGFACEHUB_API_TOKEN, + ) + elif TEI_EMBED_MODEL: + # Use local embedding model + embeddings = HuggingFaceBgeEmbeddings(model_name=TEI_EMBED_MODEL) + else: + logger.error("No text embeddings inference endpoint is set.") + embeddings = None else: embeddings = None @@ -159,7 +217,8 @@ def ingest_data_to_arango(doc_path: DocPath, embeddings: Embeddings | None) -> b if doc_path.process_table and path.endswith(".pdf"): table_chunks = get_tables_result(path, doc_path.table_strategy) - chunks = chunks + table_chunks + if isinstance(table_chunks, list): + chunks = chunks + table_chunks if logflag: logger.info("Done preprocessing. 
Created ", len(chunks), " chunks of the original file.") @@ -179,12 +238,12 @@ def ingest_data_to_arango(doc_path: DocPath, embeddings: Embeddings | None) -> b graph.add_graph_documents( graph_documents=graph_docs, - include_source=True, # TODO: Parameterize - graph_name="NewGraph", # TODO: Parameterize - update_graph_definition_if_exists=False, # TODO: Set as reverse of `use_one_entity_collection` - batch_size=1000, # TODO: Parameterize - use_one_entity_collection=True, # TODO: Parameterize - insert_async=False, # TODO: Parameterize + include_source=INCLUDE_SOURCE, + graph_name=graph_name, + update_graph_definition_if_exists=not USE_ONE_ENTITY_COLLECTION, + batch_size=ARANGO_BATCH_SIZE, + use_one_entity_collection=USE_ONE_ENTITY_COLLECTION, + insert_async=INSERT_ASYNC, ) if logflag: @@ -208,6 +267,8 @@ async def ingest_documents( chunk_overlap: int = Form(100), process_table: bool = Form(False), table_strategy: str = Form("fast"), + graph_name: str = Form("NewGraph"), + create_embeddings: bool = Form(True), ): if logflag: logger.info(f"files:{files}") @@ -228,7 +289,9 @@ async def ingest_documents( chunk_overlap=chunk_overlap, process_table=process_table, table_strategy=table_strategy, - ) + ), + graph_name=graph_name, + create_embeddings=create_embeddings, ) uploaded_files.append(save_path) if logflag: @@ -255,7 +318,8 @@ async def ingest_documents( chunk_overlap=chunk_overlap, process_table=process_table, table_strategy=table_strategy, - ) + ), + graph_name=graph_name, ) except json.JSONDecodeError: raise HTTPException(status_code=500, detail="Fail to ingest data into qdrant.") From 678fcf118a2d40b970098a16828df738602bb857 Mon Sep 17 00:00:00 2001 From: Ajay Kallepalli Date: Sun, 15 Dec 2024 21:12:00 -0800 Subject: [PATCH 23/25] Adding git to Dockerfile, tested dockerfile and dockercompose. 
Also parametrized variables in prepare_doc_arango.py --- comps/dataprep/arango/langchain/Dockerfile | 5 +++-- comps/dataprep/arango/langchain/README.md | 2 +- comps/dataprep/arango/langchain/config.py | 18 ++++++++++-------- .../docker-compose-dataprep-arango.yaml | 6 ++++-- 4 files changed, 18 insertions(+), 13 deletions(-) diff --git a/comps/dataprep/arango/langchain/Dockerfile b/comps/dataprep/arango/langchain/Dockerfile index 7bd07262a..5d8aa7a48 100644 --- a/comps/dataprep/arango/langchain/Dockerfile +++ b/comps/dataprep/arango/langchain/Dockerfile @@ -11,7 +11,8 @@ RUN apt-get update -y && apt-get install -y --no-install-recommends --fix-missin build-essential \ default-jre \ libgl1-mesa-glx \ - libjemalloc-dev + libjemalloc-dev \ + git RUN useradd -m -s /bin/bash user && \ mkdir -p /home/user && \ @@ -35,4 +36,4 @@ USER user WORKDIR /home/user/comps/dataprep/arango/langchain -ENTRYPOINT ["python", "prepare_doc_arango.py"] +ENTRYPOINT ["python", "prepare_doc_arango.py"] \ No newline at end of file diff --git a/comps/dataprep/arango/langchain/README.md b/comps/dataprep/arango/langchain/README.md index 1d1e696d4..d57249510 100644 --- a/comps/dataprep/arango/langchain/README.md +++ b/comps/dataprep/arango/langchain/README.md @@ -89,7 +89,7 @@ curl -X POST \ http://localhost:6007/v1/dataprep ``` -Additional options that can be specified from the config.py file are as follows: +Additional options that can be specified from the environment variables are as follows (default values are in the config.py file): OpenAI Configuration: - `OPENAI_EMBED_MODEL`: The embedding model to use for the OpenAI service. 
diff --git a/comps/dataprep/arango/langchain/config.py b/comps/dataprep/arango/langchain/config.py index 525862a9d..5b27a8166 100644 --- a/comps/dataprep/arango/langchain/config.py +++ b/comps/dataprep/arango/langchain/config.py @@ -10,10 +10,10 @@ ARANGO_DB_NAME = os.getenv("ARANGO_DB_NAME", "_system") # ArangoDB graph configuration -USE_ONE_ENTITY_COLLECTION = True -INSERT_ASYNC = False -ARANGO_BATCH_SIZE = 1000 -INCLUDE_SOURCE = True +USE_ONE_ENTITY_COLLECTION = os.getenv("USE_ONE_ENTITY_COLLECTION", True) +INSERT_ASYNC = os.getenv("INSERT_ASYNC", False) +ARANGO_BATCH_SIZE = os.getenv("ARANGO_BATCH_SIZE", 1000) +INCLUDE_SOURCE = os.getenv("INCLUDE_SOURCE", True) # Text Generation Inference configuration TGI_LLM_ENDPOINT = os.getenv("TGI_LLM_ENDPOINT", "http://localhost:8080") @@ -27,11 +27,13 @@ OPENAI_API_KEY = os.getenv("OPENAI_API_KEY") OPENAI_EMBED_MODEL = os.getenv("OPENAI_EMBED_MODEL", "text-embedding-3-small") OPENAI_EMBED_DIMENSIONS = os.getenv("OPENAI_EMBED_DIMENSIONS", 512) +OPENAI_CHAT_MODEL = os.getenv("OPENAI_CHAT_MODEL", "gpt-4o") +OPENAI_CHAT_TEMPERATURE = os.getenv("OPENAI_CHAT_TEMPERATURE", 0) # LLMGraphTransformer configuration -ALLOWED_NODES = [] # ["Person", "Organization"] -ALLOWED_RELATIONSHIPS = [] # [("Person", "knows", "Person"), ("Person", "works_at", "Organization")] -NODE_PROPERTIES = False # ["description"] -RELATIONSHIP_PROPERTIES = False # ["description"] +ALLOWED_NODES = os.getenv("ALLOWED_NODES", []) # ["Person", "Organization"] +ALLOWED_RELATIONSHIPS = os.getenv("ALLOWED_RELATIONSHIPS", []) # [("Person", "knows", "Person"), ("Person", "works_at", "Organization")] +NODE_PROPERTIES = os.getenv("NODE_PROPERTIES", ['description']) +RELATIONSHIP_PROPERTIES = os.getenv("RELATIONSHIP_PROPERTIES", ['description']) SYSTEM_PROMPT_PATH = os.getenv("SYSTEM_PROMPT_PATH", "./prompt.txt") \ No newline at end of file diff --git a/comps/dataprep/arango/langchain/docker-compose-dataprep-arango.yaml 
b/comps/dataprep/arango/langchain/docker-compose-dataprep-arango.yaml index c766b5c03..08408c127 100644 --- a/comps/dataprep/arango/langchain/docker-compose-dataprep-arango.yaml +++ b/comps/dataprep/arango/langchain/docker-compose-dataprep-arango.yaml @@ -8,6 +8,8 @@ services: container_name: arango-graph-db ports: - "8529:8529" + environment: + ARANGO_ROOT_PASSWORD: ${ARANGO_PASSWORD} tgi_gaudi_service: image: ghcr.io/huggingface/tgi-gaudi:2.0.5 container_name: tgi-service @@ -23,7 +25,7 @@ services: HF_TOKEN: ${HF_TOKEN} command: --model-id ${LLM_MODEL_ID} --auto-truncate --max-input-tokens 1024 --max-total-tokens 2048 dataprep-arango: - image: opea/gen-ai-comps:dataprep-arango-xeon-server + image: opea/dataprep-arango:latest container_name: dataprep-arango-server depends_on: - arango-vector-db @@ -35,7 +37,7 @@ services: no_proxy: ${no_proxy} http_proxy: ${http_proxy} https_proxy: ${https_proxy} - ARANGO_URL: ${ARANGO_URL} + ARANGO_URL: http://arango-graph-db:8529 ARANGO_USERNAME: ${ARANGO_USERNAME} ARANGO_PASSWORD: ${ARANGO_PASSWORD} ARANGO_DB_NAME: ${ARANGO_DB_NAME} From 1de1c0e3f2b83cbbbb932e9d1ffec88efd153857 Mon Sep 17 00:00:00 2001 From: Ajay Kallepalli Date: Sun, 15 Dec 2024 21:26:37 -0800 Subject: [PATCH 24/25] Updating readme with adjustable parameters listed --- comps/dataprep/arango/langchain/README.md | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/comps/dataprep/arango/langchain/README.md b/comps/dataprep/arango/langchain/README.md index d57249510..bdd6f71b0 100644 --- a/comps/dataprep/arango/langchain/README.md +++ b/comps/dataprep/arango/langchain/README.md @@ -94,13 +94,25 @@ Additional options that can be specified from the environment variables are as f OpenAI Configuration: - `OPENAI_EMBED_MODEL`: The embedding model to use for the OpenAI service. - `OPENAI_EMBED_DIMENSIONS`: The dimensions for the OpenAI service. +- `OPENAI_CHAT_MODEL`: The chat model to use for the OpenAI service. 
+- `OPENAI_CHAT_TEMPERATURE`: The temperature for the OpenAI service. ArangoDB Configuration: +- `YOUR_GRAPH_NAME`: The name of the graph to create. Must be passed in the curl command. - `USE_ONE_ENTITY_COLLECTION`: If set to True, the microservice will use a single entity collection for all nodes. If set to False, the microservice will use a separate collection for each node type. - `INSERT_ASYNC`: If set to True, the microservice will insert the data asynchronously. - `ARANGO_BATCH_SIZE`: The batch size for the microservice to insert the data. - `INCLUDE_SOURCE`: If set to True, the microservice will include the source in the data. +Graph Configuration: +- `ALLOWED_NODES`: The nodes to allow in the graph. +- `ALLOWED_RELATIONSHIPS`: The relationships to allow in the graph. +- `NODE_PROPERTIES`: The properties to allow in the nodes. +- `RELATIONSHIP_PROPERTIES`: The properties to allow in the relationships. + +Prompt Configuration: +- `SYSTEM_PROMPT_PATH`: The path to the system prompt text file. This can be used to specify the specific system prompt for the entity extraction and graph generation steps. + We support table extraction from pdf documents. You can specify process_table and table_strategy by the following commands. "table_strategy" refers to the strategies to understand tables for table retrieval. As the setting progresses from "fast" to "hq" to "llm," the focus shifts towards deeper table understanding at the expense of processing speed. The default strategy is "fast". Note: If you specify "table_strategy=llm", You should first start TGI Service, please refer to 1.2.1, 1.3.1 in https://github.com/opea-project/GenAIComps/tree/main/comps/llms/README.md, and then `export TGI_LLM_ENDPOINT="http://${your_ip}:8008"`. 
From 9dec2af5c44ea3fda1dc453fc744dcd97367f0a1 Mon Sep 17 00:00:00 2001 From: Ajay Kallepalli Date: Sun, 15 Dec 2024 21:41:51 -0800 Subject: [PATCH 25/25] Only printing debug statements if log flag is on --- .../arango/langchain/prepare_doc_arango.py | 21 ++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/comps/dataprep/arango/langchain/prepare_doc_arango.py b/comps/dataprep/arango/langchain/prepare_doc_arango.py index 80c777225..500f728b0 100644 --- a/comps/dataprep/arango/langchain/prepare_doc_arango.py +++ b/comps/dataprep/arango/langchain/prepare_doc_arango.py @@ -109,17 +109,21 @@ def ingest_data_to_arango(doc_path: DocPath, graph_name: str, create_embeddings: ############################# if OPENAI_API_KEY: - logger.info("OpenAI API Key is set. Verifying its validity...") + if logflag: + logger.info("OpenAI API Key is set. Verifying its validity...") openai.api_key = OPENAI_API_KEY try: openai.models.list() - logger.info("OpenAI API Key is valid.") + if logflag: + logger.info("OpenAI API Key is valid.") llm = ChatOpenAI(temperature=0, model_name="gpt-4o") except openai.error.AuthenticationError: - logger.info("OpenAI API Key is invalid.") + if logflag: + logger.info("OpenAI API Key is invalid.") except Exception as e: - logger.info(f"An error occurred while verifying the API Key: {e}") + if logflag: + logger.info(f"An error occurred while verifying the API Key: {e}") elif TGI_LLM_ENDPOINT: llm = HuggingFaceEndpoint( @@ -151,12 +155,14 @@ def ingest_data_to_arango(doc_path: DocPath, graph_name: str, create_embeddings: allowed_relationships=ALLOWED_RELATIONSHIPS, ) except (TypeError, ValueError) as e: - logger.warning(f"Advanced LLMGraphTransformer failed: {e}") + if logflag: + logger.warning(f"Advanced LLMGraphTransformer failed: {e}") # Fall back to basic config try: llm_transformer = LLMGraphTransformer(llm=llm) except (TypeError, ValueError) as e: - logger.error(f"Failed to initialize LLMGraphTransformer: {e}") + if logflag: + 
logger.error(f"Failed to initialize LLMGraphTransformer: {e}") raise ######################################## @@ -181,7 +187,8 @@ def ingest_data_to_arango(doc_path: DocPath, graph_name: str, create_embeddings: # Use local embedding model embeddings = HuggingFaceBgeEmbeddings(model_name=TEI_EMBED_MODEL) else: - logger.error("No text embeddings inference endpoint is set.") + if logflag: + logger.error("No text embeddings inference endpoint is set.") embeddings = None else: embeddings = None