diff --git a/comps/__init__.py b/comps/__init__.py
index 95f780b720..6f753aaeb5 100644
--- a/comps/__init__.py
+++ b/comps/__init__.py
@@ -34,6 +34,7 @@
     CodeTransGateway,
     DocSumGateway,
     TranslationGateway,
+    SearchQnAGateway,
     AudioQnAGateway,
 )
diff --git a/comps/cores/mega/constants.py b/comps/cores/mega/constants.py
index ae0eeeec83..5535a90312 100644
--- a/comps/cores/mega/constants.py
+++ b/comps/cores/mega/constants.py
@@ -28,6 +28,7 @@ class ServiceType(Enum):
     RAGAS = 11
     LVM = 12
     KNOWLEDGE_GRAPH = 13
+    WEB_RETRIEVER = 14
 
 
 class MegaServiceEndpoint(Enum):
diff --git a/comps/cores/mega/gateway.py b/comps/cores/mega/gateway.py
index ea6341072f..40dda88aa4 100644
--- a/comps/cores/mega/gateway.py
+++ b/comps/cores/mega/gateway.py
@@ -350,3 +350,45 @@ async def handle_request(self, request: Request):
         response = result_dict[last_node]["byte_str"]
 
         return response
+
+
+class SearchQnAGateway(Gateway):
+    def __init__(self, megaservice, host="0.0.0.0", port=8888):
+        super().__init__(
+            megaservice, host, port, str(MegaServiceEndpoint.SEARCH_QNA), ChatCompletionRequest, ChatCompletionResponse
+        )
+
+    async def handle_request(self, request: Request):
+        data = await request.json()
+        stream_opt = data.get("stream", True)
+        chat_request = ChatCompletionRequest.parse_obj(data)
+        prompt = self._handle_message(chat_request.messages)
+        parameters = LLMParams(
+            max_new_tokens=chat_request.max_tokens if chat_request.max_tokens else 1024,
+            top_k=chat_request.top_k if chat_request.top_k else 10,
+            top_p=chat_request.top_p if chat_request.top_p else 0.95,
+            temperature=chat_request.temperature if chat_request.temperature else 0.01,
+            repetition_penalty=chat_request.presence_penalty if chat_request.presence_penalty else 1.03,
+            streaming=stream_opt,
+        )
+        result_dict = await self.megaservice.schedule(initial_inputs={"text": prompt}, llm_parameters=parameters)
+        for node, response in result_dict.items():
+            # Here it is assumed that the last microservice in the megaservice is the LLM.
+            if (
+                isinstance(response, StreamingResponse)
+                and node == list(self.megaservice.services.keys())[-1]
+                and self.megaservice.services[node].service_type == ServiceType.LLM
+            ):
+                return response
+        last_node = self.megaservice.all_leaves()[-1]
+        response = result_dict[last_node]["text"]
+        choices = []
+        usage = UsageInfo()
+        choices.append(
+            ChatCompletionResponseChoice(
+                index=0,
+                message=ChatMessage(role="assistant", content=response),
+                finish_reason="stop",
+            )
+        )
+        return ChatCompletionResponse(model="searchqna", choices=choices, usage=usage)
diff --git a/comps/web_retrievers/__init__.py b/comps/web_retrievers/__init__.py
new file mode 100644
index 0000000000..916f3a44b2
--- /dev/null
+++ b/comps/web_retrievers/__init__.py
@@ -0,0 +1,2 @@
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
diff --git a/comps/web_retrievers/langchain/chroma/README.md b/comps/web_retrievers/langchain/chroma/README.md
new file mode 100644
index 0000000000..47a308837e
--- /dev/null
+++ b/comps/web_retrievers/langchain/chroma/README.md
@@ -0,0 +1,50 @@
+# Web Retriever Microservice
+
+The Web Retriever Microservice searches the web for pages relevant to the prompt, saves them into the vector database, and retrieves the documents that best match the query embedding. The retrieved documents are then used as context in the prompt to the LLM. Unlike the normal RAG process, which queries a fixed knowledge base, a web retriever can leverage a search engine to serve demands such as real-time news and fresh, verifiable sources.
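+
+Conceptually, each request runs a search-index-retrieve loop. The sketch below is illustrative only (the query string and result count are made up, and this is not the service API); it uses the same LangChain components as the service to show the first half of that loop:
+
+```python
+# Sketch: turn search-engine hits into plain-text documents ready for indexing.
+from langchain_community.document_loaders import AsyncHtmlLoader
+from langchain_community.document_transformers import Html2TextTransformer
+from langchain_community.utilities import GoogleSearchAPIWrapper
+
+search = GoogleSearchAPIWrapper()  # reads GOOGLE_API_KEY / GOOGLE_CSE_ID from the environment
+results = search.results("What is OPEA?", 3)  # top-3 search hits
+urls = [r["link"] for r in results if "link" in r]
+docs = AsyncHtmlLoader(urls, ignore_load_errors=True).load()  # download the pages
+docs = list(Html2TextTransformer().transform_documents(docs))  # strip HTML down to text
+# The service then splits and embeds these docs into Chroma, and answers each
+# request with a similarity search against the request's embedding vector.
+```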
+
+# Start Microservice with Docker
+
+## Build Docker Image
+
+```bash
+cd ../../
+docker build -t opea/web-retriever-chroma:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/web_retrievers/langchain/chroma/docker/Dockerfile .
+```
+
+## Start TEI Service
+
+```bash
+model=BAAI/bge-base-en-v1.5
+revision=refs/pr/4
+volume=$PWD/data
+docker run -d -p 6060:80 -v $volume:/data -e http_proxy=$http_proxy -e https_proxy=$https_proxy --pull always ghcr.io/huggingface/text-embeddings-inference:cpu-1.2 --model-id $model --revision $revision
+```
+
+## Start Web Retriever Service
+
+```bash
+# set TEI endpoint
+export TEI_EMBEDDING_ENDPOINT="http://${your_ip}:6060"
+
+# set search engine env variables
+export GOOGLE_API_KEY=xxx
+export GOOGLE_CSE_ID=xxx
+```
+
+```bash
+docker run -d --name="web-retriever-chroma-server" -p 7077:7077 --ipc=host -e http_proxy=$http_proxy -e https_proxy=$https_proxy -e no_proxy=$no_proxy -e GOOGLE_API_KEY=$GOOGLE_API_KEY -e GOOGLE_CSE_ID=$GOOGLE_CSE_ID -e TEI_EMBEDDING_ENDPOINT=$TEI_EMBEDDING_ENDPOINT opea/web-retriever-chroma:latest
+```
+
+## Consume Web Retriever Service
+
+To consume the Web Retriever Microservice, you can generate a mock embedding vector of length 768 with Python:
+
+```bash
+# Test
+your_embedding=$(python -c "import random; embedding = [random.uniform(-1, 1) for _ in range(768)]; print(embedding)")
+
+curl http://${your_ip}:7077/v1/web_retrieval \
+  -X POST \
+  -d "{\"text\":\"What is OPEA?\",\"embedding\":${your_embedding}}" \
+  -H 'Content-Type: application/json'
+```
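+
+A successful call returns a `SearchedDoc` JSON object carrying the retrieved documents and the original query. The shape below is an abridged, illustrative sketch (ids and text are shortened placeholders); the exact fields follow the `SearchedDoc` protocol in `comps`:
+
+```json
+{
+  "id": "...",
+  "retrieved_docs": [
+    {"id": "...", "text": "OPEA is ... \n title: ... \n source: https://... \n"}
+  ],
+  "initial_query": "What is OPEA?",
+  "top_n": 1
+}
+```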
diff --git a/comps/web_retrievers/langchain/chroma/docker/Dockerfile b/comps/web_retrievers/langchain/chroma/docker/Dockerfile
new file mode 100644
index 0000000000..cf3cee0ea0
--- /dev/null
+++ b/comps/web_retrievers/langchain/chroma/docker/Dockerfile
@@ -0,0 +1,21 @@
+
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+FROM langchain/langchain:latest
+
+RUN apt-get update -y && apt-get install -y --no-install-recommends --fix-missing \
+    libgl1-mesa-glx \
+    libjemalloc-dev \
+    vim
+
+COPY comps /home/user/comps
+
+RUN pip install --no-cache-dir --upgrade pip && \
+    pip install --no-cache-dir -r /home/user/comps/web_retrievers/langchain/chroma/requirements.txt
+
+ENV PYTHONPATH=$PYTHONPATH:/home/user
+
+WORKDIR /home/user/comps/web_retrievers/langchain/chroma
+
+ENTRYPOINT ["python", "retriever_chroma.py"]
diff --git a/comps/web_retrievers/langchain/chroma/requirements.txt b/comps/web_retrievers/langchain/chroma/requirements.txt
new file mode 100644
index 0000000000..e72c136508
--- /dev/null
+++ b/comps/web_retrievers/langchain/chroma/requirements.txt
@@ -0,0 +1,14 @@
+bs4
+chromadb
+docarray[full]
+fastapi
+google-api-python-client>=2.100.0
+html2text
+langchain-huggingface
+langchain_community
+opentelemetry-api
+opentelemetry-exporter-otlp
+opentelemetry-sdk
+prometheus-fastapi-instrumentator
+sentence_transformers
+shortuuid
diff --git a/comps/web_retrievers/langchain/chroma/retriever_chroma.py b/comps/web_retrievers/langchain/chroma/retriever_chroma.py
new file mode 100644
index 0000000000..d699b3eb3c
--- /dev/null
+++ b/comps/web_retrievers/langchain/chroma/retriever_chroma.py
@@ -0,0 +1,121 @@
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+import os
+import time
+
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+from langchain_community.document_loaders import AsyncHtmlLoader
+from langchain_community.document_transformers import Html2TextTransformer
+from langchain_community.utilities import GoogleSearchAPIWrapper
+from langchain_community.vectorstores import Chroma
+from langchain_huggingface import HuggingFaceEndpointEmbeddings
+
+from comps import (
+    EmbedDoc768,
+    SearchedDoc,
+    ServiceType,
+    TextDoc,
+    opea_microservices,
+    register_microservice,
+    register_statistics,
+    statistics_dict,
+)
+
+tei_embedding_endpoint = os.getenv("TEI_EMBEDDING_ENDPOINT")
+
+
+def get_urls(query, num_search_result=1):
+    result = search.results(query, num_search_result)
+    return result
+
+
+def retrieve_htmls(all_urls):
+    loader = AsyncHtmlLoader(all_urls, ignore_load_errors=True, trust_env=True)
+    docs = loader.load()
+    return docs
+
+
+def parse_htmls(docs):
+    print("Indexing new urls...")
+
+    html2text = Html2TextTransformer()
+    docs = list(html2text.transform_documents(docs))
+    docs = text_splitter.split_documents(docs)
+
+    return docs
+
+
+def dump_docs(docs):
+    vector_db.add_documents(docs)
+
+
+@register_microservice(
+    name="opea_service@web_retriever_chroma",
+    service_type=ServiceType.RETRIEVER,
+    endpoint="/v1/web_retrieval",
+    host="0.0.0.0",
+    port=7077,
+)
+@register_statistics(names=["opea_service@web_retriever_chroma", "opea_service@search"])
+def web_retrieve(input: EmbedDoc768) -> SearchedDoc:
+    start = time.time()
+    query = input.text
+    embedding = input.embedding
+
+    # Search the web with Google, then fetch and parse the result pages
+    search_results = get_urls(query)
+    urls_to_look = []
+    for res in search_results:
+        if res.get("link", None):
+            urls_to_look.append(res["link"])
+    urls = list(set(urls_to_look))
+    print(f"urls: {urls}")
+    docs = retrieve_htmls(urls)
+    docs = parse_htmls(docs)
+    print(docs)
+    # Remove duplicated docs
+    unique_documents_dict = {(doc.page_content, tuple(sorted(doc.metadata.items()))): doc for doc in docs}
+    unique_documents = list(unique_documents_dict.values())
+    statistics_dict["opea_service@search"].append_latency(time.time() - start, None)
+
+    # dump to vector_db
+    dump_docs(unique_documents)
+
+    # Do the retrieval
+    search_res = vector_db.similarity_search_by_vector(embedding=embedding)
+
+    searched_docs = []
+
+    for r in search_res:
+        # include the metadata into the retrieved docs content
+        description_str = f"\n description: {r.metadata['description']} \n" if "description" in r.metadata else ""
+        title_str = f"\n title: {r.metadata['title']} \n" if "title" in r.metadata else ""
+        source_str = f"\n source: {r.metadata['source']} \n" if "source" in r.metadata else ""
+        text_with_meta = f"{r.page_content} {description_str} {title_str} {source_str}"
+        searched_docs.append(TextDoc(text=text_with_meta))
+
+    result = SearchedDoc(retrieved_docs=searched_docs, initial_query=query)
+    statistics_dict["opea_service@web_retriever_chroma"].append_latency(time.time() - start, None)
+
+    # Chat history is not supported for now; clear the vector store after each request
+    if vector_db.get()["ids"]:
+        vector_db.delete(vector_db.get()["ids"])
+    return result
+
+
+if __name__ == "__main__":
+    # Create vectorstore
+    tei_embedding_endpoint = os.getenv("TEI_EMBEDDING_ENDPOINT")
+    # vectordb_persistent_directory = os.getenv("VECTORDB_PERSISTENT_DIR", "/home/user/chroma_db_oai")
+    vector_db = Chroma(
+        embedding_function=HuggingFaceEndpointEmbeddings(model=tei_embedding_endpoint),
+        # persist_directory=vectordb_persistent_directory
+    )
+
+    google_api_key = os.environ.get("GOOGLE_API_KEY")
+    google_cse_id = os.environ.get("GOOGLE_CSE_ID")
+    search = GoogleSearchAPIWrapper(google_api_key=google_api_key, google_cse_id=google_cse_id, k=10)
+    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1500, chunk_overlap=50)
+
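+    # Note: `search`, `vector_db`, and `text_splitter` above are module-level globals
+    # consumed by web_retrieve(); they must be initialized before the service starts.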
+    opea_microservices["opea_service@web_retriever_chroma"].start()
diff --git a/tests/test_retrievers_langchain_redis.sh b/tests/test_retrievers_langchain_redis.sh
index 451385aa63..b540b10d9c 100644
--- a/tests/test_retrievers_langchain_redis.sh
+++ b/tests/test_retrievers_langchain_redis.sh
@@ -47,7 +47,7 @@ function validate_microservice() {
 }
 
 function stop_docker() {
-    cid_retrievers=$(docker ps -aq --filter "name=test-comps-retrievers*")
+    cid_retrievers=$(docker ps -aq --filter "name=test-comps-retriever*")
     if [[ ! -z "$cid_retrievers" ]]; then
         docker stop $cid_retrievers && docker rm $cid_retrievers && sleep 1s
     fi
diff --git a/tests/test_web_retrievers_langchain.sh b/tests/test_web_retrievers_langchain.sh
new file mode 100644
index 0000000000..d1e2c3ed53
--- /dev/null
+++ b/tests/test_web_retrievers_langchain.sh
@@ -0,0 +1,62 @@
+#!/bin/bash
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+set -xe
+
+WORKPATH=$(dirname "$PWD")
+ip_address=$(hostname -I | awk '{print $1}')
+
+function build_docker_images() {
+    cd $WORKPATH
+    docker build --no-cache -t opea/web-retriever-chroma:comps --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/web_retrievers/langchain/chroma/docker/Dockerfile .
+}
+
+function start_service() {
+
+    # tei endpoint
+    tei_endpoint=5018
+    model="BAAI/bge-base-en-v1.5"
+    docker run -d --name="test-comps-web-retriever-tei-endpoint" -p $tei_endpoint:80 -v ./data:/data --pull always ghcr.io/huggingface/text-embeddings-inference:cpu-1.2 --model-id $model
+    export TEI_EMBEDDING_ENDPOINT="http://${ip_address}:${tei_endpoint}"
+
+    # chroma web retriever
+    retriever_port=5019
+    unset http_proxy
+    docker run -d --name="test-comps-web-retriever-chroma-server" -p ${retriever_port}:7077 --ipc=host -e GOOGLE_API_KEY=$GOOGLE_API_KEY -e GOOGLE_CSE_ID=$GOOGLE_CSE_ID -e TEI_EMBEDDING_ENDPOINT=$TEI_EMBEDDING_ENDPOINT -e http_proxy=$http_proxy -e https_proxy=$https_proxy opea/web-retriever-chroma:comps
+
+    sleep 3m
+}
+
+function validate_microservice() {
+    retriever_port=5019
+    export PATH="${HOME}/miniforge3/bin:$PATH"
+    test_embedding=$(python -c "import random; embedding = [random.uniform(-1, 1) for _ in range(768)]; print(embedding)")
+    http_proxy='' curl http://${ip_address}:$retriever_port/v1/web_retrieval \
+        -X POST \
+        -d "{\"text\":\"What is OPEA?\",\"embedding\":${test_embedding}}" \
+        -H 'Content-Type: application/json'
+    docker logs test-comps-web-retriever-tei-endpoint
+}
+
+function stop_docker() {
+    cid_retrievers=$(docker ps -aq --filter "name=test-comps-web*")
+    if [[ ! -z "$cid_retrievers" ]]; then
+        docker stop $cid_retrievers && docker rm $cid_retrievers && sleep 1s
+    fi
+}
+
+function main() {
+
+    stop_docker
+
+    build_docker_images
+    start_service
+
+    validate_microservice
+
+    stop_docker
+    echo y | docker system prune
+
+}
+
+main