diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index 972b2a181a..2636246aec 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -1,9 +1,21 @@ +/comps/agent/ xuhui.ren@intel.com +/comps/finetuning/ xinyu.ye@intel.com +/comps/guardrails/ liang1.lv@intel.com /comps/asr/ sihan.chen@intel.com +/comps/intent_detection/ liang1.lv@intel.com +/comps/knowledgegraphs/ xuhui.ren@intel.com /comps/cores/ liang1.lv@intel.com /comps/dataprep/ xinyu.ye@intel.com /comps/embeddings/ xuhui.ren@intel.com /comps/guardrails/ letong.han@intel.com /comps/llms/ liang1.lv@intel.com +/comps/lvms/ sihan.chen@intel.com +/comps/nginx/ letong.han@intel.com +/comps/prompt_registry/ hoong.tee.yeoh@intel.com +/comps/chathistory/ yogesh.pandey@intel.com /comps/reranks/ xuhui.ren@intel.com /comps/retrievers/ xuhui.ren@intel.com /comps/tts/ sihan.chen@intel.com +/comps/ragas/ xuhui.ren@intel.com +/comps/vectorstores/ xinyu.ye@intel.com +/comps/web_retrievers/ sihan.chen@intel.com diff --git a/.github/workflows/_comps-workflow.yml b/.github/workflows/_comps-workflow.yml index 58cb02dbac..ca402c445b 100644 --- a/.github/workflows/_comps-workflow.yml +++ b/.github/workflows/_comps-workflow.yml @@ -20,10 +20,17 @@ on: default: true required: false type: boolean - # scan: - # default: true - # required: false - # type: boolean + test: + default: true + description: "Test comps with docker compose" + required: false + type: boolean + mode: + default: "CD" + description: "Whether the test range is CI or CD" + required: false + type: string + jobs: #################################################################################################### # Image Build @@ -31,6 +38,8 @@ jobs: build-images: runs-on: "docker-build-${{ inputs.node }}" continue-on-error: true + outputs: + file_exists: ${{ steps.get-yaml-path.outputs.file_exists }} steps: - name: Clean Up Working Directory run: sudo rm -rf ${{github.workspace}}/* @@ -39,82 +48,45 @@ jobs: uses: actions/checkout@v4 - name: Clone required Repo + id: get-yaml-path run: | cd ${{ github.workspace }}/.github/workflows/docker/compose # service=$(echo ${{ inputs.service }} | cut -d'_' -f1) - docker_compose_yml=${{ github.workspace }}/.github/workflows/docker/compose/${{ inputs.service }}-compose.yaml - echo ${docker_compose_yml} + if [[ "${{ inputs.mode }}" == "CD" ]]; then + docker_compose_yml=${{ github.workspace }}/.github/workflows/docker/compose/${{ inputs.service }}-compose-cd.yaml + else + docker_compose_yml=${{ github.workspace }}/.github/workflows/docker/compose/${{ inputs.service }}-compose.yaml + fi + echo "docker_compose_path=${docker_compose_yml}" >> $GITHUB_OUTPUT + if [ -f "$docker_compose_yml" ]; then + echo "file_exists=true" >> $GITHUB_OUTPUT + else + echo "There is no ${{ inputs.mode }} part of ${{ inputs.service }} that needs to be executed." 
+ echo "file_exists=false" >> $GITHUB_OUTPUT + fi + if [[ $(grep -c "llava-tgi:" ${docker_compose_yml}) != 0 ]]; then git clone https://github.com/yuanwu2017/tgi-gaudi.git && cd tgi-gaudi && git checkout v2.0.4 fi if [[ $(grep -c "vllm-openvino:" ${docker_compose_yml}) != 0 ]]; then git clone https://github.com/vllm-project/vllm.git vllm-openvino fi - # echo "service=$service" >> $GITHUB_ENV - name: Build Image - if: ${{ fromJSON(inputs.build) }} + if: ${{ fromJSON(inputs.build) && steps.get-yaml-path.outputs.file_exists == 'true' }} uses: opea-project/validation/actions/image-build@main with: - work_dir: ${{ github.workspace }}/ - docker_compose_path: ${{ github.workspace }}/.github/workflows/docker/compose/${{ inputs.service }}-compose.yaml + work_dir: ${{ github.workspace }} + docker_compose_path: ${{ steps.get-yaml-path.outputs.docker_compose_path }} registry: ${OPEA_IMAGE_REPO}opea tag: ${{ inputs.tag }} - # #################################################################################################### - # # Trivy Scan - # #################################################################################################### - # get-image-list: - # needs: [build-images] - # if: ${{ fromJSON(inputs.scan) && inputs.node == 'gaudi' }} - # runs-on: ubuntu-latest - # outputs: - # matrix: ${{ steps.scan-matrix.outputs.matrix }} - # steps: - # - name: Checkout out Repo - # uses: actions/checkout@v4 - - # - name: Set Matrix - # id: scan-matrix - # run: | - # pip install yq - # compose_path=${{ github.workspace }}/${{ inputs.example }}/docker/docker_build_compose.yaml - # echo "matrix=$(cat ${compose_path} | yq -r '.[]' | jq 'keys' | jq -c '.')" >> $GITHUB_OUTPUT - - # scan-images: - # needs: [get-image-list, build-images] - # if: ${{ fromJSON(inputs.scan) && inputs.node == 'gaudi'}} - # runs-on: "docker-build-${{ inputs.node }}" - # strategy: - # matrix: - # image: ${{ fromJSON(needs.get-image-list.outputs.matrix) }} - # fail-fast: false - # steps: - # - name: Pull Image - # run: | - # docker pull ${OPEA_IMAGE_REPO}opea/${{ matrix.image }}:${{ inputs.tag }} - # echo "OPEA_IMAGE_REPO=${OPEA_IMAGE_REPO}" >> $GITHUB_ENV - - # - name: Scan Container - # uses: opea-project/validation/actions/trivy-scan@main - # with: - # image-ref: ${{ env.OPEA_IMAGE_REPO }}opea/${{ matrix.image }}:${{ inputs.tag }} - # output: ${{ matrix.image }}-scan.txt - - # - name: Cleanup - # if: always() - # run: docker rmi -f ${OPEA_IMAGE_REPO}opea/${{ matrix.image }}:${{ inputs.tag }} - - # - uses: actions/upload-artifact@v4.3.4 - # with: - # name: ${{ matrix.image }}-scan - # path: ${{ matrix.image }}-scan.txt - # overwrite: true #################################################################################################### # Docker Compose Test #################################################################################################### test-service-compose: needs: [build-images] + if: ${{ fromJSON(inputs.test) && needs.build-images.outputs.file_exists == 'true' }} uses: ./.github/workflows/_run-docker-compose.yml with: tag: ${{ inputs.tag }} diff --git a/.github/workflows/_get-test-matrix.yml b/.github/workflows/_get-test-matrix.yml index deedaa949a..301835fc89 100644 --- a/.github/workflows/_get-test-matrix.yml +++ b/.github/workflows/_get-test-matrix.yml @@ -25,7 +25,6 @@ jobs: else echo "CHECKOUT_REF=${{ github.ref }}" >> $GITHUB_ENV fi - echo "checkout ref ${{ env.CHECKOUT_REF }}" - name: Checkout out Repo uses: actions/checkout@v4 @@ -39,7 +38,7 @@ jobs: set -xe if [ "${{ github.event_name }}" == 
"pull_request" ] || [ "${{ github.event_name }}" == "pull_request_target" ]; then LATEST_COMMIT_SHA=$(curl -H "Authorization: token ${{ secrets.GITHUB_TOKEN }}" \ - "https://api.github.com/repos/opea-project/GenAIComps/commits?sha=main" | jq -r '.[0].sha') + "https://api.github.com/repos/opea-project/GenAIComps/commits?sha=${{ github.event.pull_request.base.ref }}" | jq -r '.[0].sha') echo "Latest commit SHA is $LATEST_COMMIT_SHA" base_commit=$LATEST_COMMIT_SHA else diff --git a/.github/workflows/docker/compose/agent-compose-cd.yaml b/.github/workflows/docker/compose/agent-compose-cd.yaml new file mode 100644 index 0000000000..a285ecc34c --- /dev/null +++ b/.github/workflows/docker/compose/agent-compose-cd.yaml @@ -0,0 +1,9 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +# this file should be run in the root of the repo +services: + comps-agent-langchain: + build: + dockerfile: comps/agent/langchain/docker/Dockerfile + image: ${REGISTRY}opea/comps-agent-langchain:${TAG:-latest} diff --git a/.github/workflows/docker/compose/chathistory-compose-cd.yaml b/.github/workflows/docker/compose/chathistory-compose-cd.yaml new file mode 100644 index 0000000000..f8930cde85 --- /dev/null +++ b/.github/workflows/docker/compose/chathistory-compose-cd.yaml @@ -0,0 +1,9 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +# this file should be run in the root of the repo +services: + chathistory-mongo-server: + build: + dockerfile: comps/chathistory/mongo/docker/Dockerfile + image: ${REGISTRY}opea/chathistory-mongo-server:${TAG:-latest} diff --git a/.github/workflows/docker/compose/dataprep-compose-cd.yaml b/.github/workflows/docker/compose/dataprep-compose-cd.yaml new file mode 100644 index 0000000000..19f4c063da --- /dev/null +++ b/.github/workflows/docker/compose/dataprep-compose-cd.yaml @@ -0,0 +1,25 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +# others: dataprep-redis-llama-index,dataprep-on-ray-redis +services: + dataprep-redis-llama-index: + build: + dockerfile: comps/dataprep/redis/llama_index/docker/Dockerfile + image: ${REGISTRY}opea/dataprep-redis-llama-index:${TAG:-latest} + dataprep-on-ray-redis: + build: + dockerfile: comps/dataprep/redis/langchain_ray/docker/Dockerfile + image: ${REGISTRY}opea/dataprep-on-ray-redis:${TAG:-latest} + dataprep-milvus: + build: + dockerfile: comps/dataprep/milvus/docker/Dockerfile + image: ${REGISTRY}opea/dataprep-milvus:${TAG:-latest} + dataprep-pgvector: + build: + dockerfile: comps/dataprep/pgvector/langchain/docker/Dockerfile + image: ${REGISTRY}opea/dataprep-pgvector:${TAG:-latest} + dataprep-pinecone: + build: + dockerfile: comps/dataprep/pinecone/docker/Dockerfile + image: ${REGISTRY}opea/dataprep-pinecone:${TAG:-latest} diff --git a/.github/workflows/docker/compose/dataprep-compose.yaml b/.github/workflows/docker/compose/dataprep-compose.yaml index 5cca84cb4b..1671235f41 100644 --- a/.github/workflows/docker/compose/dataprep-compose.yaml +++ b/.github/workflows/docker/compose/dataprep-compose.yaml @@ -13,11 +13,3 @@ services: build: dockerfile: comps/dataprep/qdrant/docker/Dockerfile image: ${REGISTRY}opea/dataprep-qdrant:${TAG:-latest} - dataprep-redis-llama-index: - build: - dockerfile: comps/dataprep/redis/llama_index/docker/Dockerfile - image: ${REGISTRY}opea/dataprep-redis-llama-index:${TAG:-latest} - dataprep-on-ray-redis: - build: - dockerfile: comps/dataprep/redis/langchain_ray/docker/Dockerfile - image: 
${REGISTRY}opea/dataprep-on-ray-redis:${TAG:-latest} diff --git a/.github/workflows/docker/compose/embeddings-compose-cd.yaml b/.github/workflows/docker/compose/embeddings-compose-cd.yaml new file mode 100644 index 0000000000..3d08a1b53e --- /dev/null +++ b/.github/workflows/docker/compose/embeddings-compose-cd.yaml @@ -0,0 +1,16 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +services: + embedding-langchain-mosec-endpoint: + build: + dockerfile: comps/embeddings/langchain-mosec/mosec-docker/Dockerfile + image: ${REGISTRY}opea/embedding-langchain-mosec-endpoint:${TAG:-latest} + embedding-langchain-mosec: + build: + dockerfile: comps/embeddings/langchain-mosec/docker/Dockerfile + image: ${REGISTRY}opea/embedding-langchain-mosec:${TAG:-latest} + embedding-tei-llama-index: + build: + dockerfile: comps/embeddings/llama_index/docker/Dockerfile + image: ${REGISTRY}opea/embedding-tei-llama-index:${TAG:-latest} diff --git a/.github/workflows/docker/compose/guardrails-compose-cd.yaml b/.github/workflows/docker/compose/guardrails-compose-cd.yaml new file mode 100644 index 0000000000..e6365a99d5 --- /dev/null +++ b/.github/workflows/docker/compose/guardrails-compose-cd.yaml @@ -0,0 +1,8 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +services: + guardrails-pii-detection: + build: + dockerfile: comps/guardrails/pii_detection/docker/Dockerfile + image: ${REGISTRY}opea/guardrails-pii-detection:${TAG:-latest} diff --git a/.github/workflows/docker/compose/llms-compose-cd.yaml b/.github/workflows/docker/compose/llms-compose-cd.yaml new file mode 100644 index 0000000000..f60e0e921d --- /dev/null +++ b/.github/workflows/docker/compose/llms-compose-cd.yaml @@ -0,0 +1,8 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +services: + llm-native: + build: + dockerfile: comps/llms/text-generation/native/docker/Dockerfile + image: ${REGISTRY}opea/llm-native:${TAG:-latest} diff --git a/.github/workflows/docker/compose/lvms-compose-cd.yaml b/.github/workflows/docker/compose/lvms-compose-cd.yaml new file mode 100644 index 0000000000..fbdad3011e --- /dev/null +++ b/.github/workflows/docker/compose/lvms-compose-cd.yaml @@ -0,0 +1,23 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +# this file should be run in the root of the repo +services: + lvm: + build: + dockerfile: comps/lvms/Dockerfile + image: ${REGISTRY}opea/lvm:${TAG:-latest} + # Xeon CPU + llava: + build: + dockerfile: comps/lvms/llava/Dockerfile + image: ${REGISTRY}opea/llava:${TAG:-latest} + # Gaudi2 HPU + llava_hpu: + build: + dockerfile: comps/lvms/llava/Dockerfile_hpu + image: ${REGISTRY}opea/llava_hpu:${TAG:-latest} + lvm-tgi: + build: + dockerfile: comps/lvms/Dockerfile_tgi + image: ${REGISTRY}opea/lvm-tgi:${TAG:-latest} diff --git a/.github/workflows/docker/compose/nginx-compose-cd.yaml b/.github/workflows/docker/compose/nginx-compose-cd.yaml new file mode 100644 index 0000000000..e6cf05aa4d --- /dev/null +++ b/.github/workflows/docker/compose/nginx-compose-cd.yaml @@ -0,0 +1,9 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +# this file should be run in the root of the repo +services: + nginx: + build: + dockerfile: comps/nginx/docker/Dockerfile + image: ${REGISTRY}opea/nginx:${TAG:-latest} diff --git a/.github/workflows/docker/compose/prompt_registry-compose-cd.yaml b/.github/workflows/docker/compose/prompt_registry-compose-cd.yaml new file mode 100644 index 
0000000000..52923a2f1d --- /dev/null +++ b/.github/workflows/docker/compose/prompt_registry-compose-cd.yaml @@ -0,0 +1,9 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +# this file should be run in the root of the repo +services: + promptregistry-mongo-server: + build: + dockerfile: comps/prompt_registry/mongo/docker/Dockerfile + image: ${REGISTRY}opea/promptregistry-mongo-server:${TAG:-latest} diff --git a/.github/workflows/docker/compose/reranks-compose-cd.yaml b/.github/workflows/docker/compose/reranks-compose-cd.yaml new file mode 100644 index 0000000000..85339c8b85 --- /dev/null +++ b/.github/workflows/docker/compose/reranks-compose-cd.yaml @@ -0,0 +1,16 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +services: + reranking-fastrag: + build: + dockerfile: comps/reranks/fastrag/docker/Dockerfile + image: ${REGISTRY}opea/reranking-fastrag:${TAG:-latest} + reranking-langchain-mosec-endpoint: + build: + dockerfile: comps/reranks/langchain-mosec/mosec-docker/Dockerfile + image: ${REGISTRY}opea/reranking-langchain-mosec-endpoint:${TAG:-latest} + reranking-langchain-mosec: + build: + dockerfile: comps/reranks/langchain-mosec/docker/Dockerfile + image: ${REGISTRY}opea/reranking-langchain-mosec:${TAG:-latest} diff --git a/.github/workflows/docker/compose/retrievers-compose.yaml b/.github/workflows/docker/compose/retrievers-compose.yaml index d9de4b27f4..289871ed51 100644 --- a/.github/workflows/docker/compose/retrievers-compose.yaml +++ b/.github/workflows/docker/compose/retrievers-compose.yaml @@ -11,3 +11,19 @@ services: build: dockerfile: comps/retrievers/haystack/qdrant/docker/Dockerfile image: ${REGISTRY}opea/retriever-qdrant:${TAG:-latest} + retriever-pgvector: + build: + dockerfile: comps/retrievers/langchain/pgvector/docker/Dockerfile + image: ${REGISTRY}opea/retriever-qdrant:${TAG:-latest} + retriever-pinecone: + build: + dockerfile: comps/retrievers/langchain/pinecone/docker/Dockerfile + image: ${REGISTRY}opea/retriever-pinecone:${TAG:-latest} + retriever-milvus: + build: + dockerfile: comps/retrievers/langchain/milvus/docker/Dockerfile + image: ${REGISTRY}opea/retriever-milvus:${TAG:-latest} + retriever-redis-llamaindex: + build: + dockerfile: comps/retrievers/llamaindex/docker/Dockerfile + image: ${REGISTRY}opea/retriever-redis-llamaindex:${TAG:-latest} diff --git a/.github/workflows/manual-bom-scan.yml b/.github/workflows/manual-bom-scan.yml new file mode 100644 index 0000000000..4a781b2549 --- /dev/null +++ b/.github/workflows/manual-bom-scan.yml @@ -0,0 +1,102 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +name: Comps docker images BoM scan on manual event +on: + workflow_dispatch: + inputs: + services: + default: "asr" + description: "List of services to test [agent_langchain,asr,chathistory_mongo,dataprep_milvus...]" #,embeddings,guardrails,knowledgegraphs,llms,lvms,prompt_registry,ragas,reranks,retrievers,tts,vectorstores,web_retrievers]" + required: true + type: string + tag: + default: "comps" + description: "Tag to apply to images" + required: true + type: string + sbom-scan: + default: true + description: "Enable sbom-scan" + required: false + type: boolean + trivy-scan: + default: true + description: "Enable trivy-scan" + required: false + type: boolean + +permissions: read-all +jobs: + get-image-list: + runs-on: ubuntu-latest + outputs: + matrix: ${{ steps.scan-matrix.outputs.matrix }} + steps: + - name: Checkout out Repo + uses: actions/checkout@v4 + + - name: 
Set Matrix + id: scan-matrix + run: | + pip install yq + services=($(echo ${{ inputs.services }} | tr ',' ' ')) + image_list=[] + for service in ${services[@]} + do + images=$(cat ${{ github.workspace }}/.github/workflows/docker/compose/${service}-compose.yaml | yq -r '.[]' | jq 'keys' | jq -c '.') + image_list=$(echo ${image_list} | jq -s '.[0] + .[1] | unique' - <(echo ${images})) + done + echo "matrix=$(echo ${image_list} | jq -c '.')" >> $GITHUB_OUTPUT + + scan-license: + needs: get-image-list + runs-on: "docker-build-gaudi" + strategy: + matrix: + image: ${{ fromJson(needs.get-image-list.outputs.matrix) }} + fail-fast: false + steps: + - name: Pull Image + run: | + docker pull ${OPEA_IMAGE_REPO}opea/${{ matrix.image }}:latest + # ${{ inputs.tag }} + echo "OPEA_IMAGE_REPO=${OPEA_IMAGE_REPO}" >> $GITHUB_ENV + + - name: SBOM Scan Container + uses: anchore/sbom-action@v0.17.1 + if: ${{ fromJSON(inputs.sbom-scan) }} + with: + image: ${{ env.OPEA_IMAGE_REPO }}opea/${{ matrix.image }}:${{ inputs.tag }} + output-file: ${{ matrix.image }}-sbom-scan.txt + format: "spdx-json" + + - name: Security Scan Container + uses: aquasecurity/trivy-action@0.24.0 + if: ${{ fromJSON(inputs.trivy-scan) }} + with: + image-ref: ${{ env.OPEA_IMAGE_REPO }}opea/${{ matrix.image }}:${{ inputs.tag }} + output: ${{ matrix.image }}-trivy-scan.txt + format: "table" + exit-code: "1" + ignore-unfixed: true + vuln-type: "os,library" + severity: "CRITICAL,HIGH" + + - name: Cleanup + if: always() + run: docker rmi -f ${OPEA_IMAGE_REPO}opea/${{ matrix.image }}:${{ inputs.tag }} + + - uses: actions/upload-artifact@v4.3.4 + if: always() + with: + name: sbom-scan + path: ${{ matrix.image }}-sbom-scan.txt + overwrite: true + + - uses: actions/upload-artifact@v4.3.4 + if: always() + with: + name: trivy-scan + path: ${{ matrix.image }}-trivy-scan.txt + overwrite: true diff --git a/.github/workflows/manual-comps-test.yml b/.github/workflows/manual-comps-test.yml index 010cd0d7ab..bde3bf9fad 100644 --- a/.github/workflows/manual-comps-test.yml +++ b/.github/workflows/manual-comps-test.yml @@ -7,7 +7,7 @@ on: inputs: services: default: "asr" - description: "List of services to test [agent_langchain,asr,chathistory_mongo,dataprep_milvus...]" #,embeddings,guardrails,knowledgegraphs,llms,lvms,prompt_registry,ragas,reranks,retrievers,tts,vectorstores,web_retrievers]" + description: "List of services to test [agent,asr,chathistory,dataprep,embeddings,guardrails,llms,lvms,nginx,prompt_registry,reranks,retrievers,tts,web_retrievers]" required: true type: string build: @@ -15,6 +15,21 @@ on: description: "Build test required images for Comps" required: false type: boolean + test: + default: true + description: "Test comps with docker compose" + required: false + type: boolean + tag: + default: "comps" + description: "Tag to apply to images" + required: true + type: string + mode: + default: "CD" + description: "Whether the test range is CI or CD" + required: false + type: string permissions: read-all @@ -40,6 +55,8 @@ jobs: uses: ./.github/workflows/_comps-workflow.yml with: service: ${{ matrix.service }} - tag: "comps" + tag: ${{ inputs.tag }} node: gaudi + mode: ${{ inputs.mode }} + test: ${{ inputs.test }} secrets: inherit diff --git a/.github/workflows/pr-dockerfile-path-scan.yaml b/.github/workflows/pr-dockerfile-path-scan.yaml index 0595e4bcba..b5e1ce753f 100644 --- a/.github/workflows/pr-dockerfile-path-scan.yaml +++ b/.github/workflows/pr-dockerfile-path-scan.yaml @@ -27,7 +27,7 @@ jobs: - name: Check for changed Dockerfile paths in 
yaml run: | - set -xe + set -e shopt -s globstar cd ${{github.workspace}} is_use="FALSE" @@ -52,7 +52,7 @@ jobs: - name: Check for changed Dockerfile paths in readme run: | - set -xe + set -e shopt -s globstar cd ${{github.workspace}} is_use="FALSE" @@ -93,7 +93,7 @@ jobs: - name: Check for changed Dockerfile paths run: | - set -xe + set -e shopt -s globstar cd ${{github.workspace}} is_use="FALSE" @@ -102,9 +102,12 @@ jobs: changed_files="$(git diff --name-status --diff-filter=DR ${{ github.event.pull_request.base.sha }} ${merged_commit} -- '**/Dockerfile' | cut -f2)" if [ -n "$changed_files" ]; then for file in $changed_files; do - if grep -q "$file" ../GenAIExamples/**/*.md; then + matching_files=$(grep -rl "$file" ../GenAIExamples/**/*.md) + if [ -n "$matching_files" ]; then is_use="TRUE" used_files+="$file " + echo "Modified Dockerfile '$file' is referenced in:" + echo "$matching_files" fi done fi diff --git a/.github/workflows/pr-examples-test.yml b/.github/workflows/pr-examples-test.yml index a3eb505399..e3fe2b6abb 100644 --- a/.github/workflows/pr-examples-test.yml +++ b/.github/workflows/pr-examples-test.yml @@ -52,11 +52,10 @@ jobs: cat test_chatqna_on_gaudi.sh echo "Run test..." + echo "LOG_DIR=$(pwd)" >> $GITHUB_ENV export IMAGE_TAG="comps" timeout 50m bash test_chatqna_on_gaudi.sh - echo "LOG_PATH=$(pwd)/*.log" >> $GITHUB_ENV - - name: Clean up container if: cancelled() || failure() run: | @@ -69,4 +68,4 @@ jobs: uses: actions/upload-artifact@v4 with: name: "Examples-Test-Logs" - path: ${{ env.LOG_PATH }} + path: ${{ env.LOG_DIR }}/*.log diff --git a/.github/workflows/pr-microservice-test.yml b/.github/workflows/pr-microservice-test.yml index f9dd45d8f9..786b887c56 100644 --- a/.github/workflows/pr-microservice-test.yml +++ b/.github/workflows/pr-microservice-test.yml @@ -5,7 +5,7 @@ name: MicroService-test on: pull_request_target: - branches: [main] + branches: ["main", "*rc"] types: [opened, reopened, ready_for_review, synchronize] # added `ready_for_review` since draft is skipped paths: - comps/** diff --git a/comps/__init__.py b/comps/__init__.py index a5d00f9e07..c58ae42fe1 100644 --- a/comps/__init__.py +++ b/comps/__init__.py @@ -12,6 +12,7 @@ GeneratedDoc, LLMParamsDoc, SearchedDoc, + SearchedMultimodalDoc, RerankedDoc, TextDoc, RAGASParams, @@ -19,6 +20,10 @@ GraphDoc, LVMDoc, LVMVideoDoc, + ImageDoc, + TextImageDoc, + MultimodalDoc, + EmbedMultimodalDoc, ) # Constants diff --git a/comps/chathistory/mongo/docker/Dockerfile b/comps/chathistory/mongo/docker/Dockerfile index 5209af8358..81e0fde5ed 100644 --- a/comps/chathistory/mongo/docker/Dockerfile +++ b/comps/chathistory/mongo/docker/Dockerfile @@ -7,9 +7,8 @@ ENV LANG=C.UTF-8 RUN apt-get update -y && apt-get install -y --no-install-recommends --fix-missing \ build-essential \ - libgl1-mesa-glx \ libjemalloc-dev \ - vim + libgl1-mesa-glx RUN useradd -m -s /bin/bash user && \ mkdir -p /home/user && \ diff --git a/comps/cores/mega/orchestrator.py b/comps/cores/mega/orchestrator.py index 616af41c83..92063d4981 100644 --- a/comps/cores/mega/orchestrator.py +++ b/comps/cores/mega/orchestrator.py @@ -72,6 +72,18 @@ async def schedule(self, initial_inputs: Dict, llm_parameters: LLMParams = LLMPa downstreams.remove(downstream) except re.error as e: print("Pattern invalid! 
Operation cancelled.") + if len(downstreams) == 0 and llm_parameters.streaming: + # turn the response to a StreamingResponse + # to make the response uniform to UI + def fake_stream(text): + yield "data: b'" + text + "'\n\n" + yield "data: [DONE]\n\n" + + self.dump_outputs( + node, + StreamingResponse(fake_stream(response["text"]), media_type="text/event-stream"), + result_dict, + ) for d_node in downstreams: if all(i in result_dict for i in runtime_graph.predecessors(d_node)): @@ -176,18 +188,20 @@ def extract_chunk_str(self, chunk_str): if chunk_str == "data: [DONE]\n\n": return "" prefix = "data: b'" + prefix_2 = 'data: b"' suffix = "'\n\n" - if chunk_str.startswith(prefix): + suffix_2 = '"\n\n' + if chunk_str.startswith(prefix) or chunk_str.startswith(prefix_2): chunk_str = chunk_str[len(prefix) :] - if chunk_str.endswith(suffix): + if chunk_str.endswith(suffix) or chunk_str.endswith(suffix_2): chunk_str = chunk_str[: -len(suffix)] return chunk_str def token_generator(self, sentence, is_last=False): prefix = "data: " suffix = "\n\n" - tokens = re.findall(r"\S+\s?", sentence, re.UNICODE) + tokens = re.findall(r"\s?\S+\s?", sentence, re.UNICODE) for token in tokens: - yield prefix + repr(token.encode("utf-8")) + suffix + yield prefix + repr(token.replace("\\n", "\n").encode("utf-8")) + suffix if is_last: yield "data: [DONE]\n\n" diff --git a/comps/cores/proto/docarray.py b/comps/cores/proto/docarray.py index 0d397dcb7b..1a29aa3291 100644 --- a/comps/cores/proto/docarray.py +++ b/comps/cores/proto/docarray.py @@ -1,12 +1,12 @@ # Copyright (C) 2024 Intel Corporation # SPDX-License-Identifier: Apache-2.0 -from typing import Dict, List, Optional, Union +from typing import Any, Dict, List, Optional, Union import numpy as np from docarray import BaseDoc, DocList -from docarray.documents import AudioDoc, VideoDoc -from docarray.typing import AudioUrl +from docarray.documents import AudioDoc +from docarray.typing import AudioUrl, ImageUrl from pydantic import Field, conint, conlist, field_validator @@ -17,7 +17,30 @@ class TopologyInfo: class TextDoc(BaseDoc, TopologyInfo): - text: str + text: str = None + + +class ImageDoc(BaseDoc): + url: Optional[ImageUrl] = Field( + description="The path to the image. It can be remote (Web) URL, or a local file path", + default=None, + ) + base64_image: Optional[str] = Field( + description="The base64-based encoding of the image", + default=None, + ) + + +class TextImageDoc(BaseDoc): + image: ImageDoc = None + text: TextDoc = None + + +MultimodalDoc = Union[ + TextDoc, + ImageDoc, + TextImageDoc, +] class Base64ByteStrDoc(BaseDoc): @@ -43,6 +66,18 @@ class EmbedDoc(BaseDoc): score_threshold: float = 0.2 +class EmbedMultimodalDoc(EmbedDoc): + # extend EmbedDoc with these attributes + url: Optional[ImageUrl] = Field( + description="The path to the image. 
It can be remote (Web) URL, or a local file path.", + default=None, + ) + base64_image: Optional[str] = Field( + description="The base64-based encoding of the image.", + default=None, + ) + + class Audio2TextDoc(AudioDoc): url: Optional[AudioUrl] = Field( description="The path to the audio.", @@ -67,6 +102,10 @@ class Config: json_encoders = {np.ndarray: lambda x: x.tolist()} +class SearchedMultimodalDoc(SearchedDoc): + metadata: List[Dict[str, Any]] + + class GeneratedDoc(BaseDoc): text: str prompt: str diff --git a/comps/dataprep/milvus/README.md b/comps/dataprep/milvus/README.md index 9941dbaa69..cc958bea06 100644 --- a/comps/dataprep/milvus/README.md +++ b/comps/dataprep/milvus/README.md @@ -1,8 +1,8 @@ # Dataprep Microservice with Milvus -## πŸš€Start Microservice with Python +## πŸš€1. Start Microservice with Python (Option 1) -### Install Requirements +### 1.1 Requirements ```bash pip install -r requirements.txt @@ -11,11 +11,11 @@ apt-get install libtesseract-dev -y apt-get install poppler-utils -y ``` -### Start Milvus Server +### 1.2 Start Milvus Server Please refer to this [readme](../../../vectorstores/langchain/milvus/README.md). -### Setup Environment Variables +### 1.3 Setup Environment Variables ```bash export no_proxy=${your_no_proxy} @@ -27,7 +27,30 @@ export COLLECTION_NAME=${your_collection_name} export MOSEC_EMBEDDING_ENDPOINT=${your_embedding_endpoint} ``` -### Start Document Preparation Microservice for Milvus with Python Script +### 1.4 Start Mosec Embedding Service + +First, you need to build a mosec embedding serving docker image. + +```bash +cd ../../.. +docker build --build-arg http_proxy=$http_proxy --build-arg https_proxy=$https_proxy -t opea/embedding-mosec-endpoint:latest -f comps/embeddings/langchain-mosec/mosec-docker/Dockerfile . +``` + +Then start the mosec embedding server. + +```bash +your_port=6010 +docker run -d --name="embedding-mosec-endpoint" -p $your_port:8000 opea/embedding-mosec-endpoint:latest +``` + +Setup environment variables: + +```bash +export MOSEC_EMBEDDING_ENDPOINT="http://localhost:$your_port" +export MILVUS=${your_host_ip} +``` + +### 1.5 Start Document Preparation Microservice for Milvus with Python Script Start document preparation microservice for Milvus with below command. @@ -35,22 +58,45 @@ Start document preparation microservice for Milvus with below command. python prepare_doc_milvus.py ``` -## πŸš€Start Microservice with Docker +## πŸš€2. Start Microservice with Docker (Option 2) + +### 2.1 Start Milvus Server + +Please refer to this [readme](../../../vectorstores/langchain/milvus/README.md). -### Build Docker Image +### 2.2 Build Docker Image ```bash -cd ../../../../ +cd ../../.. +# build mosec embedding docker image +docker build --build-arg http_proxy=$http_proxy --build-arg https_proxy=$https_proxy -t opea/embedding-langchain-mosec-endpoint:latest -f comps/embeddings/langchain-mosec/mosec-docker/Dockerfile . +# build dataprep milvus docker image docker build -t opea/dataprep-milvus:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy --build-arg no_proxy=$no_proxy -f comps/dataprep/milvus/docker/Dockerfile . 
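# (Optional) sanity check, assuming the default image names/tags from the two
# build commands above: both images should now show up locally.
docker images | grep -E "opea/embedding-langchain-mosec-endpoint|opea/dataprep-milvus"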
```
 
-### Run Docker with CLI
+### 2.3 Setup Environment Variables
+
+```bash
+export MOSEC_EMBEDDING_ENDPOINT="http://localhost:$your_port"
+export MILVUS=${your_host_ip}
+```
+
+### 2.4 Run Docker with CLI (Option A)
+
+```bash
+docker run -d --name="dataprep-milvus-server" -p 6010:6010 --ipc=host -e http_proxy=$http_proxy -e https_proxy=$https_proxy -e no_proxy=$no_proxy -e MOSEC_EMBEDDING_ENDPOINT=${MOSEC_EMBEDDING_ENDPOINT} -e MILVUS=${MILVUS} opea/dataprep-milvus:latest
+```
+
+### 2.5 Run with Docker Compose (Option B)
 
 ```bash
-docker run -d --name="dataprep-milvus-server" -p 6010:6010 --ipc=host -e http_proxy=$http_proxy -e https_proxy=$https_proxy -e no_proxy=$no_proxy -e MOSEC_EMBEDDING_ENDPOINT=${your_embedding_endpoint} -e MILVUS=${your_milvus_host_ip} opea/dataprep-milvus:latest
+cd docker
+docker compose -f docker-compose-dataprep-milvus.yaml up -d
 ```
 
-## Invoke Microservice
+## πŸš€3. Consume Microservice
+
+### 3.1 Consume Upload API
 
 Once document preparation microservice for Milvus is started, user can use below command to invoke the microservice to convert the document to embedding and save to the database.
@@ -65,13 +111,13 @@
   http://localhost:6010/v1/dataprep
 ```
 
-You can specify chunk_size and chunk_size by the following commands.
+You can specify chunk_size and chunk_overlap by the following commands. To avoid big chunks, pass a small chunk_size like 500 as below (the default is 1500).
 
 ```bash
 curl -X POST \
   -H "Content-Type: multipart/form-data" \
   -F "files=@./file.pdf" \
-  -F "chunk_size=1500" \
+  -F "chunk_size=500" \
   -F "chunk_overlap=100" \
   http://localhost:6010/v1/dataprep
 ```
@@ -132,3 +178,70 @@ Note: If you specify "table_strategy=llm", You should first start TGI Service, p
 ```bash
 curl -X POST -H "Content-Type: application/json" -d '{"path":"/home/user/doc/your_document_name","process_table":true,"table_strategy":"hq"}' http://localhost:6010/v1/dataprep
 ```
+
+### 3.2 Consume get_file API
+
+To get uploaded file structures, use the following command:
+
+```bash
+curl -X POST \
+  -H "Content-Type: application/json" \
+  http://localhost:6010/v1/dataprep/get_file
+```
+
+Then you will get the response JSON like this:
+
+```json
+[
+  {
+    "name": "uploaded_file_1.txt",
+    "id": "uploaded_file_1.txt",
+    "type": "File",
+    "parent": ""
+  },
+  {
+    "name": "uploaded_file_2.txt",
+    "id": "uploaded_file_2.txt",
+    "type": "File",
+    "parent": ""
+  }
+]
+```
+
+### 3.3 Consume delete_file API
+
+To delete an uploaded file/link, use the following command.
+
+The `file_path` here should be the `id` returned by the `/v1/dataprep/get_file` API.
+
+```bash
+# delete link
+curl -X POST \
+  -H "Content-Type: application/json" \
+  -d '{"file_path": "https://www.ces.tech/.txt"}' \
+  http://localhost:6010/v1/dataprep/delete_file
+
+# delete file
+curl -X POST \
+  -H "Content-Type: application/json" \
+  -d '{"file_path": "uploaded_file_1.txt"}' \
+  http://localhost:6010/v1/dataprep/delete_file
+
+# delete all files and links, will drop the entire db collection
+curl -X POST \
+  -H "Content-Type: application/json" \
+  -d '{"file_path": "all"}' \
+  http://localhost:6010/v1/dataprep/delete_file
+```
+
+## πŸš€4. Troubleshooting
+
+1. If you get errors from the Mosec embedding endpoint like `cannot find this task, maybe it has expired` while uploading files, try to reduce the `chunk_size` in the upload request, as in the examples below (the default chunk_size is 1500).
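The same remedy (and the list/delete calls from section 3) can also be scripted from Python; a minimal sketch using `requests`, assuming the dataprep service from this README is reachable at `http://localhost:6010` and using `file.pdf` as a stand-in document:

```python
# Hedged sketch: upload with a reduced chunk_size, then list and delete files.
# Assumes the dataprep-milvus service above is running on localhost:6010 and
# that the `requests` package is installed; "file.pdf" is a placeholder.
import requests

BASE = "http://localhost:6010/v1/dataprep"

# Upload with a smaller chunk_size to avoid oversized chunks.
with open("file.pdf", "rb") as f:
    resp = requests.post(
        BASE,
        files={"files": ("file.pdf", f)},
        data={"chunk_size": 500, "chunk_overlap": 100},
    )
resp.raise_for_status()
print(resp.json())

# List what has been ingested (mirrors /v1/dataprep/get_file).
print(requests.post(f"{BASE}/get_file", json={}).json())

# Delete one uploaded file by the id returned from get_file.
print(requests.post(f"{BASE}/delete_file", json={"file_path": "file.pdf"}).json())
```

The equivalent curl invocation with the reduced `chunk_size` is shown below.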
+ + ```bash + curl -X POST \ + -H "Content-Type: multipart/form-data" \ + -F "files=@./file.pdf" \ + -F "chunk_size=500" \ + http://localhost:6010/v1/dataprep + ``` diff --git a/comps/dataprep/milvus/docker/Dockerfile b/comps/dataprep/milvus/docker/Dockerfile index 7e2f2202b9..7ce117641d 100644 --- a/comps/dataprep/milvus/docker/Dockerfile +++ b/comps/dataprep/milvus/docker/Dockerfile @@ -1,4 +1,3 @@ - # Copyright (C) 2024 Intel Corporation # SPDX-License-Identifier: Apache-2.0 @@ -10,10 +9,9 @@ ARG ARCH="cpu" RUN apt-get update -y && apt-get install -y --no-install-recommends --fix-missing \ build-essential \ - libgl1-mesa-glx \ - libjemalloc-dev \ default-jre \ - vim + libgl1-mesa-glx \ + libjemalloc-dev RUN useradd -m -s /bin/bash user && \ mkdir -p /home/user && \ @@ -37,4 +35,3 @@ USER user WORKDIR /home/user/comps/dataprep/milvus ENTRYPOINT ["python", "prepare_doc_milvus.py"] - diff --git a/comps/dataprep/milvus/docker/docker-compose-dataprep-milvus.yaml b/comps/dataprep/milvus/docker/docker-compose-dataprep-milvus.yaml new file mode 100644 index 0000000000..2b4a05cff2 --- /dev/null +++ b/comps/dataprep/milvus/docker/docker-compose-dataprep-milvus.yaml @@ -0,0 +1,93 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +version: "3" +services: + etcd: + container_name: milvus-etcd + image: quay.io/coreos/etcd:v3.5.5 + environment: + - ETCD_AUTO_COMPACTION_MODE=revision + - ETCD_AUTO_COMPACTION_RETENTION=1000 + - ETCD_QUOTA_BACKEND_BYTES=4294967296 + - ETCD_SNAPSHOT_COUNT=50000 + volumes: + - ${DOCKER_VOLUME_DIRECTORY:-.}/volumes/etcd:/etcd + command: etcd -advertise-client-urls=http://127.0.0.1:2379 -listen-client-urls http://0.0.0.0:2379 --data-dir /etcd + healthcheck: + test: ["CMD", "etcdctl", "endpoint", "health"] + interval: 30s + timeout: 20s + retries: 3 + + minio: + container_name: milvus-minio + image: minio/minio:RELEASE.2023-03-20T20-16-18Z + environment: + MINIO_ACCESS_KEY: minioadmin + MINIO_SECRET_KEY: minioadmin + ports: + - "9001:9001" + - "9000:9000" + volumes: + - ${DOCKER_VOLUME_DIRECTORY:-.}/volumes/minio:/minio_data + command: minio server /minio_data --console-address ":9001" + healthcheck: + test: ["CMD", "curl", "-f", "http://localhost:9000/minio/health/live"] + interval: 30s + timeout: 20s + retries: 3 + + standalone: + container_name: milvus-standalone + image: milvusdb/milvus:v2.4.6 + command: ["milvus", "run", "standalone"] + security_opt: + - seccomp:unconfined + environment: + ETCD_ENDPOINTS: etcd:2379 + MINIO_ADDRESS: minio:9000 + volumes: + - ${DOCKER_VOLUME_DIRECTORY:-.}/milvus.yaml:/milvus/configs/milvus.yaml + - ${DOCKER_VOLUME_DIRECTORY:-.}/volumes/milvus:/var/lib/milvus + healthcheck: + test: ["CMD", "curl", "-f", "http://localhost:9091/healthz"] + interval: 30s + start_period: 90s + timeout: 20s + retries: 3 + ports: + - "19530:19530" + - "9091:9091" + depends_on: + - "etcd" + - "minio" + + mosec-embedding: + image: opea/embedding-mosec-endpoint:latest + container_name: embedding-mosec-server + ports: + - "6009:8000" + ipc: host + environment: + http_proxy: ${http_proxy} + https_proxy: ${https_proxy} + restart: unless-stopped + + dataprep-milvus: + image: opea/dataprep-milvus:latest + container_name: dataprep-milvus-server + ports: + - "6010:6010" + ipc: host + environment: + no_proxy: ${no_proxy} + http_proxy: ${http_proxy} + https_proxy: ${https_proxy} + MOSEC_EMBEDDING_ENDPOINT: ${MOSEC_EMBEDDING_ENDPOINT} + MILVUS: ${MILVUS} + restart: unless-stopped + +networks: + default: + driver: bridge diff --git 
a/comps/dataprep/milvus/docker/milvus.yaml b/comps/dataprep/milvus/docker/milvus.yaml new file mode 100644 index 0000000000..b9f22cb3d1 --- /dev/null +++ b/comps/dataprep/milvus/docker/milvus.yaml @@ -0,0 +1,811 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +# Licensed to the LF AI & Data foundation under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Related configuration of etcd, used to store Milvus metadata & service discovery. +etcd: + endpoints: localhost:2379 + rootPath: by-dev # The root path where data is stored in etcd + metaSubPath: meta # metaRootPath = rootPath + '/' + metaSubPath + kvSubPath: kv # kvRootPath = rootPath + '/' + kvSubPath + log: + level: info # Only supports debug, info, warn, error, panic, or fatal. Default 'info'. + # path is one of: + # - "default" as os.Stderr, + # - "stderr" as os.Stderr, + # - "stdout" as os.Stdout, + # - file path to append server logs to. + # please adjust in embedded Milvus: /tmp/milvus/logs/etcd.log + path: stdout + ssl: + enabled: false # Whether to support ETCD secure connection mode + tlsCert: /path/to/etcd-client.pem # path to your cert file + tlsKey: /path/to/etcd-client-key.pem # path to your key file + tlsCACert: /path/to/ca.pem # path to your CACert file + # TLS min version + # Optional values: 1.0, 1.1, 1.2, 1.3。 + # We recommend using version 1.2 and above. + tlsMinVersion: 1.3 + requestTimeout: 10000 # Etcd operation timeout in milliseconds + use: + embed: false # Whether to enable embedded Etcd (an in-process EtcdServer). + data: + dir: default.etcd # Embedded Etcd only. please adjust in embedded Milvus: /tmp/milvus/etcdData/ + auth: + enabled: false # Whether to enable authentication + userName: # username for etcd authentication + password: # password for etcd authentication + +metastore: + type: etcd # Default value: etcd, Valid values: [etcd, tikv] + +# Related configuration of tikv, used to store Milvus metadata. +# Notice that when TiKV is enabled for metastore, you still need to have etcd for service discovery. +# TiKV is a good option when the metadata size requires better horizontal scalability. +tikv: + endpoints: 127.0.0.1:2389 # Note that the default pd port of tikv is 2379, which conflicts with etcd. 
+ rootPath: by-dev # The root path where data is stored in tikv + metaSubPath: meta # metaRootPath = rootPath + '/' + metaSubPath + kvSubPath: kv # kvRootPath = rootPath + '/' + kvSubPath + requestTimeout: 10000 # ms, tikv request timeout + snapshotScanSize: 256 # batch size of tikv snapshot scan + ssl: + enabled: false # Whether to support TiKV secure connection mode + tlsCert: # path to your cert file + tlsKey: # path to your key file + tlsCACert: # path to your CACert file + +localStorage: + path: /var/lib/milvus/data/ # please adjust in embedded Milvus: /tmp/milvus/data/ + +# Related configuration of MinIO/S3/GCS or any other service supports S3 API, which is responsible for data persistence for Milvus. +# We refer to the storage service as MinIO/S3 in the following description for simplicity. +minio: + address: localhost # Address of MinIO/S3 + port: 9000 # Port of MinIO/S3 + accessKeyID: minioadmin # accessKeyID of MinIO/S3 + secretAccessKey: minioadmin # MinIO/S3 encryption string + useSSL: false # Access to MinIO/S3 with SSL + ssl: + tlsCACert: /path/to/public.crt # path to your CACert file + bucketName: a-bucket # Bucket name in MinIO/S3 + rootPath: files # The root path where the message is stored in MinIO/S3 + # Whether to useIAM role to access S3/GCS instead of access/secret keys + # For more information, refer to + # aws: https://docs.aws.amazon.com/IAM/latest/UserGuide/id_roles_use.html + # gcp: https://cloud.google.com/storage/docs/access-control/iam + # aliyun (ack): https://www.alibabacloud.com/help/en/container-service-for-kubernetes/latest/use-rrsa-to-enforce-access-control + # aliyun (ecs): https://www.alibabacloud.com/help/en/elastic-compute-service/latest/attach-an-instance-ram-role + useIAM: false + # Cloud Provider of S3. Supports: "aws", "gcp", "aliyun". + # You can use "aws" for other cloud provider supports S3 API with signature v4, e.g.: minio + # You can use "gcp" for other cloud provider supports S3 API with signature v2 + # You can use "aliyun" for other cloud provider uses virtual host style bucket + # When useIAM enabled, only "aws", "gcp", "aliyun" is supported for now + cloudProvider: aws + # Custom endpoint for fetch IAM role credentials. when useIAM is true & cloudProvider is "aws". + # Leave it empty if you want to use AWS default endpoint + iamEndpoint: + logLevel: fatal # Log level for aws sdk log. Supported level: off, fatal, error, warn, info, debug, trace + region: # Specify minio storage system location region + useVirtualHost: false # Whether use virtual host mode for bucket + requestTimeoutMs: 10000 # minio timeout for request time in milliseconds + # The maximum number of objects requested per batch in minio ListObjects rpc, + # 0 means using oss client by default, decrease these configuration if ListObjects timeout + listObjectsMaxKeys: 0 + +# Milvus supports four MQ: rocksmq(based on RockDB), natsmq(embedded nats-server), Pulsar and Kafka. +# You can change your mq by setting mq.type field. +# If you don't set mq.type field as default, there is a note about enabling priority if we config multiple mq in this file. +# 1. standalone(local) mode: rocksmq(default) > natsmq > Pulsar > Kafka +# 2. 
cluster mode: Pulsar(default) > Kafka (rocksmq and natsmq is unsupported in cluster mode) +mq: + # Default value: "default" + # Valid values: [default, pulsar, kafka, rocksmq, natsmq] + type: default + enablePursuitMode: true # Default value: "true" + pursuitLag: 10 # time tick lag threshold to enter pursuit mode, in seconds + pursuitBufferSize: 8388608 # pursuit mode buffer size in bytes + mqBufSize: 16 # MQ client consumer buffer length + dispatcher: + mergeCheckInterval: 1 # the interval time(in seconds) for dispatcher to check whether to merge + targetBufSize: 16 # the length of channel buffer for targe + maxTolerantLag: 3 # Default value: "3", the timeout(in seconds) that target sends msgPack + +# Related configuration of pulsar, used to manage Milvus logs of recent mutation operations, output streaming log, and provide log publish-subscribe services. +pulsar: + address: localhost # Address of pulsar + port: 6650 # Port of Pulsar + webport: 80 # Web port of pulsar, if you connect directly without proxy, should use 8080 + maxMessageSize: 5242880 # 5 * 1024 * 1024 Bytes, Maximum size of each message in pulsar. + tenant: public + namespace: default + requestTimeout: 60 # pulsar client global request timeout in seconds + enableClientMetrics: false # Whether to register pulsar client metrics into milvus metrics path. + +# If you want to enable kafka, needs to comment the pulsar configs +# kafka: +# brokerList: +# saslUsername: +# saslPassword: +# saslMechanisms: +# securityProtocol: +# ssl: +# enabled: false # whether to enable ssl mode +# tlsCert: # path to client's public key (PEM) used for authentication +# tlsKey: # path to client's private key (PEM) used for authentication +# tlsCaCert: # file or directory path to CA certificate(s) for verifying the broker's key +# tlsKeyPassword: # private key passphrase for use with ssl.key.location and set_ssl_cert(), if any +# readTimeout: 10 + +rocksmq: + # The path where the message is stored in rocksmq + # please adjust in embedded Milvus: /tmp/milvus/rdb_data + path: /var/lib/milvus/rdb_data + lrucacheratio: 0.06 # rocksdb cache memory ratio + rocksmqPageSize: 67108864 # 64 MB, 64 * 1024 * 1024 bytes, The size of each page of messages in rocksmq + retentionTimeInMinutes: 4320 # 3 days, 3 * 24 * 60 minutes, The retention time of the message in rocksmq. + retentionSizeInMB: 8192 # 8 GB, 8 * 1024 MB, The retention size of the message in rocksmq. + compactionInterval: 86400 # 1 day, trigger rocksdb compaction every day to remove deleted data + compressionTypes: 0,0,7,7,7 # compaction compression type, only support use 0,7. 0 means not compress, 7 will use zstd. Length of types means num of rocksdb level. + +# natsmq configuration. +# more detail: https://docs.nats.io/running-a-nats-service/configuration +natsmq: + server: + port: 4222 # Port for nats server listening + storeDir: /var/lib/milvus/nats # Directory to use for JetStream storage of nats + maxFileStore: 17179869184 # Maximum size of the 'file' storage + maxPayload: 8388608 # Maximum number of bytes in a message payload + maxPending: 67108864 # Maximum number of bytes buffered for a connection Applies to client connections + initializeTimeout: 4000 # waiting for initialization of natsmq finished + monitor: + trace: false # If true enable protocol trace log messages + debug: false # If true enable debug log messages + logTime: true # If set to false, log without timestamps. + logFile: /tmp/milvus/logs/nats.log # Log file path relative to .. 
of milvus binary if use relative path + logSizeLimit: 536870912 # Size in bytes after the log file rolls over to a new one + retention: + maxAge: 4320 # Maximum age of any message in the P-channel + maxBytes: # How many bytes the single P-channel may contain. Removing oldest messages if the P-channel exceeds this size + maxMsgs: # How many message the single P-channel may contain. Removing oldest messages if the P-channel exceeds this limit + +# Related configuration of rootCoord, used to handle data definition language (DDL) and data control language (DCL) requests +rootCoord: + dmlChannelNum: 16 # The number of dml channels created at system startup + maxPartitionNum: 1024 # Maximum number of partitions in a collection + minSegmentSizeToEnableIndex: 1024 # It's a threshold. When the segment size is less than this value, the segment will not be indexed + enableActiveStandby: false + maxDatabaseNum: 64 # Maximum number of database + maxGeneralCapacity: 65536 # upper limit for the sum of of product of partitionNumber and shardNumber + gracefulStopTimeout: 5 # seconds. force stop node without graceful stop + ip: # if not specified, use the first unicastable address + port: 53100 + grpc: + serverMaxSendSize: 536870912 + serverMaxRecvSize: 268435456 + clientMaxSendSize: 268435456 + clientMaxRecvSize: 536870912 + +# Related configuration of proxy, used to validate client requests and reduce the returned results. +proxy: + timeTickInterval: 200 # ms, the interval that proxy synchronize the time tick + healthCheckTimeout: 3000 # ms, the interval that to do component healthy check + msgStream: + timeTick: + bufSize: 512 + maxNameLength: 255 # Maximum length of name for a collection or alias + # Maximum number of fields in a collection. + # As of today (2.2.0 and after) it is strongly DISCOURAGED to set maxFieldNum >= 64. + # So adjust at your risk! + maxFieldNum: 64 + maxVectorFieldNum: 4 # Maximum number of vector fields in a collection. + maxShardNum: 16 # Maximum number of shards in a collection + maxDimension: 32768 # Maximum dimension of a vector + # Whether to produce gin logs.\n + # please adjust in embedded Milvus: false + ginLogging: true + ginLogSkipPaths: / # skip url path for gin log + maxTaskNum: 1024 # max task number of proxy task queue + mustUsePartitionKey: false # switch for whether proxy must use partition key for the collection + accessLog: + enable: false # if use access log + minioEnable: false # if upload sealed access log file to minio + localPath: /tmp/milvus_access + filename: # Log filename, leave empty to use stdout. + maxSize: 64 # Max size for a single file, in MB. 
+ cacheSize: 10240 # Size of log of memory cache, in B + rotatedTime: 0 # Max time for single access log file in seconds + remotePath: access_log/ # File path in minIO + remoteMaxTime: 0 # Max time for log file in minIO, in hours + formatters: + base: + format: "[$time_now] [ACCESS] <$user_name: $user_addr> $method_name [status: $method_status] [code: $error_code] [sdk: $sdk_version] [msg: $error_msg] [traceID: $trace_id] [timeCost: $time_cost]" + query: + format: "[$time_now] [ACCESS] <$user_name: $user_addr> $method_name [status: $method_status] [code: $error_code] [sdk: $sdk_version] [msg: $error_msg] [traceID: $trace_id] [timeCost: $time_cost] [database: $database_name] [collection: $collection_name] [partitions: $partition_name] [expr: $method_expr]" + methods: "Query,Search,Delete" + connectionCheckIntervalSeconds: 120 # the interval time(in seconds) for connection manager to scan inactive client info + connectionClientInfoTTLSeconds: 86400 # inactive client info TTL duration, in seconds + maxConnectionNum: 10000 # the max client info numbers that proxy should manage, avoid too many client infos + gracefulStopTimeout: 30 # seconds. force stop node without graceful stop + slowQuerySpanInSeconds: 5 # query whose executed time exceeds the `slowQuerySpanInSeconds` can be considered slow, in seconds. + http: + enabled: true # Whether to enable the http server + debug_mode: false # Whether to enable http server debug mode + port: # high-level restful api + acceptTypeAllowInt64: true # high-level restful api, whether http client can deal with int64 + enablePprof: true # Whether to enable pprof middleware on the metrics port + ip: # if not specified, use the first unicastable address + port: 19530 + internalPort: 19529 + grpc: + serverMaxSendSize: 268435456 + serverMaxRecvSize: 67108864 + clientMaxSendSize: 268435456 + clientMaxRecvSize: 67108864 + +# Related configuration of queryCoord, used to manage topology and load balancing for the query nodes, and handoff from growing segments to sealed segments. 
+queryCoord: + taskMergeCap: 1 + taskExecutionCap: 256 + autoHandoff: true # Enable auto handoff + autoBalance: true # Enable auto balance + autoBalanceChannel: true # Enable auto balance channel + balancer: ScoreBasedBalancer # auto balancer used for segments on queryNodes + globalRowCountFactor: 0.1 # the weight used when balancing segments among queryNodes + scoreUnbalanceTolerationFactor: 0.05 # the least value for unbalanced extent between from and to nodes when doing balance + reverseUnBalanceTolerationFactor: 1.3 # the largest value for unbalanced extent between from and to nodes after doing balance + overloadedMemoryThresholdPercentage: 90 # The threshold percentage that memory overload + balanceIntervalSeconds: 60 + memoryUsageMaxDifferencePercentage: 30 + rowCountFactor: 0.4 # the row count weight used when balancing segments among queryNodes + segmentCountFactor: 0.4 # the segment count weight used when balancing segments among queryNodes + globalSegmentCountFactor: 0.1 # the segment count weight used when balancing segments among queryNodes + segmentCountMaxSteps: 50 # segment count based plan generator max steps + rowCountMaxSteps: 50 # segment count based plan generator max steps + randomMaxSteps: 10 # segment count based plan generator max steps + growingRowCountWeight: 4 # the memory weight of growing segment row count + balanceCostThreshold: 0.001 # the threshold of balance cost, if the difference of cluster's cost after executing the balance plan is less than this value, the plan will not be executed + checkSegmentInterval: 1000 + checkChannelInterval: 1000 + checkBalanceInterval: 10000 + checkIndexInterval: 10000 + channelTaskTimeout: 60000 # 1 minute + segmentTaskTimeout: 120000 # 2 minute + distPullInterval: 500 + collectionObserverInterval: 200 + checkExecutedFlagInterval: 100 + heartbeatAvailableInterval: 10000 # 10s, Only QueryNodes which fetched heartbeats within the duration are available + loadTimeoutSeconds: 600 + distRequestTimeout: 5000 # the request timeout for querycoord fetching data distribution from querynodes, in milliseconds + heatbeatWarningLag: 5000 # the lag value for querycoord report warning when last heartbeat is too old, in milliseconds + checkHandoffInterval: 5000 + enableActiveStandby: false + checkInterval: 1000 + checkHealthInterval: 3000 # 3s, the interval when query coord try to check health of query node + checkHealthRPCTimeout: 2000 # 100ms, the timeout of check health rpc to query node + brokerTimeout: 5000 # 5000ms, querycoord broker rpc timeout + collectionRecoverTimes: 3 # if collection recover times reach the limit during loading state, release it + observerTaskParallel: 16 # the parallel observer dispatcher task number + checkAutoBalanceConfigInterval: 10 # the interval of check auto balance config + checkNodeSessionInterval: 60 # the interval(in seconds) of check querynode cluster session + gracefulStopTimeout: 5 # seconds. 
force stop node without graceful stop + enableStoppingBalance: true # whether enable stopping balance + channelExclusiveNodeFactor: 4 # the least node number for enable channel's exclusive mode + cleanExcludeSegmentInterval: 60 # the time duration of clean pipeline exclude segment which used for filter invalid data, in seconds + ip: # if not specified, use the first unicastable address + port: 19531 + grpc: + serverMaxSendSize: 536870912 + serverMaxRecvSize: 268435456 + clientMaxSendSize: 268435456 + clientMaxRecvSize: 536870912 + +# Related configuration of queryNode, used to run hybrid search between vector and scalar data. +queryNode: + stats: + publishInterval: 1000 # Interval for querynode to report node information (milliseconds) + segcore: + knowhereThreadPoolNumRatio: 4 # The number of threads in knowhere's thread pool. If disk is enabled, the pool size will multiply with knowhereThreadPoolNumRatio([1, 32]). + chunkRows: 128 # The number of vectors in a chunk. + interimIndex: + enableIndex: true # Enable segment build with index to accelerate vector search when segment is in growing or binlog. + nlist: 128 # temp index nlist, recommend to set sqrt(chunkRows), must smaller than chunkRows/8 + nprobe: 16 # nprobe to search small index, based on your accuracy requirement, must smaller than nlist + memExpansionRate: 1.15 # extra memory needed by building interim index + buildParallelRate: 0.5 # the ratio of building interim index parallel matched with cpu num + knowhereScoreConsistency: false # Enable knowhere strong consistency score computation logic + loadMemoryUsageFactor: 1 # The multiply factor of calculating the memory usage while loading segments + enableDisk: false # enable querynode load disk index, and search on disk index + maxDiskUsagePercentage: 95 + cache: + enabled: true + memoryLimit: 2147483648 # 2 GB, 2 * 1024 *1024 *1024 + readAheadPolicy: willneed # The read ahead policy of chunk cache, options: `normal, random, sequential, willneed, dontneed` + # options: async, sync, disable. + # Specifies the necessity for warming up the chunk cache. + # 1. If set to "sync" or "async" the original vector data will be synchronously/asynchronously loaded into the + # chunk cache during the load process. This approach has the potential to substantially reduce query/search latency + # for a specific duration post-load, albeit accompanied by a concurrent increase in disk usage; + # 2. If set to "disable" original vector data will only be loaded into the chunk cache during search/query. + warmup: disable + mmap: + mmapEnabled: false # Enable mmap for loading data + lazyload: + enabled: false # Enable lazyload for loading data + waitTimeout: 30000 # max wait timeout duration in milliseconds before start to do lazyload search and retrieve + requestResourceTimeout: 5000 # max timeout in milliseconds for waiting request resource for lazy load, 5s by default + requestResourceRetryInterval: 2000 # retry interval in milliseconds for waiting request resource for lazy load, 2s by default + maxRetryTimes: 1 # max retry times for lazy load, 1 by default + maxEvictPerRetry: 1 # max evict count for lazy load, 1 by default + grouping: + enabled: true + maxNQ: 1000 + topKMergeRatio: 20 + scheduler: + receiveChanSize: 10240 + unsolvedQueueSize: 10240 + # maxReadConcurrentRatio is the concurrency ratio of read task (search task and query task). + # Max read concurrency would be the value of hardware.GetCPUNum * maxReadConcurrentRatio. 
+ # It defaults to 2.0, which means max read concurrency would be the value of hardware.GetCPUNum * 2. + # Max read concurrency must greater than or equal to 1, and less than or equal to hardware.GetCPUNum * 100. + # (0, 100] + maxReadConcurrentRatio: 1 + cpuRatio: 10 # ratio used to estimate read task cpu usage. + maxTimestampLag: 86400 + scheduleReadPolicy: + # fifo: A FIFO queue support the schedule. + # user-task-polling: + # The user's tasks will be polled one by one and scheduled. + # Scheduling is fair on task granularity. + # The policy is based on the username for authentication. + # And an empty username is considered the same user. + # When there are no multi-users, the policy decay into FIFO" + name: fifo + taskQueueExpire: 60 # Control how long (many seconds) that queue retains since queue is empty + enableCrossUserGrouping: false # Enable Cross user grouping when using user-task-polling policy. (Disable it if user's task can not merge each other) + maxPendingTaskPerUser: 1024 # Max pending task per user in scheduler + dataSync: + flowGraph: + maxQueueLength: 16 # Maximum length of task queue in flowgraph + maxParallelism: 1024 # Maximum number of tasks executed in parallel in the flowgraph + enableSegmentPrune: false # use partition prune function on shard delegator + ip: # if not specified, use the first unicastable address + port: 21123 + grpc: + serverMaxSendSize: 536870912 + serverMaxRecvSize: 268435456 + clientMaxSendSize: 268435456 + clientMaxRecvSize: 536870912 + +indexCoord: + bindIndexNodeMode: + enable: false + address: localhost:22930 + withCred: false + nodeID: 0 + segment: + minSegmentNumRowsToEnableIndex: 1024 # It's a threshold. When the segment num rows is less than this value, the segment will not be indexed + +indexNode: + scheduler: + buildParallel: 1 + enableDisk: true # enable index node build disk vector index + maxDiskUsagePercentage: 95 + ip: # if not specified, use the first unicastable address + port: 21121 + grpc: + serverMaxSendSize: 536870912 + serverMaxRecvSize: 268435456 + clientMaxSendSize: 268435456 + clientMaxRecvSize: 536870912 + +dataCoord: + channel: + watchTimeoutInterval: 300 # Timeout on watching channels (in seconds). Datanode tickler update watch progress will reset timeout timer. + balanceWithRpc: true # Whether to enable balance with RPC, default to use etcd watch + legacyVersionWithoutRPCWatch: 2.4.1 # Datanodes <= this version are considered as legacy nodes, which doesn't have rpc based watch(). This is only used during rolling upgrade where legacy nodes won't get new channels + balanceSilentDuration: 300 # The duration after which the channel manager start background channel balancing + balanceInterval: 360 # The interval with which the channel manager check dml channel balance status + checkInterval: 1 # The interval in seconds with which the channel manager advances channel states + notifyChannelOperationTimeout: 5 # Timeout notifing channel operations (in seconds). 
+ segment: + maxSize: 1024 # Maximum size of a segment in MB + diskSegmentMaxSize: 2048 # Maximum size of a segment in MB for collection which has Disk index + sealProportion: 0.12 + assignmentExpiration: 2000 # The time of the assignment expiration in ms + allocLatestExpireAttempt: 200 # The time attempting to alloc latest lastExpire from rootCoord after restart + maxLife: 86400 # The max lifetime of segment in seconds, 24*60*60 + # If a segment didn't accept dml records in maxIdleTime and the size of segment is greater than + # minSizeFromIdleToSealed, Milvus will automatically seal it. + # The max idle time of segment in seconds, 10*60. + maxIdleTime: 600 + minSizeFromIdleToSealed: 16 # The min size in MB of segment which can be idle from sealed. + # The max number of binlog file for one segment, the segment will be sealed if + # the number of binlog file reaches to max value. + maxBinlogFileNumber: 32 + smallProportion: 0.5 # The segment is considered as "small segment" when its # of rows is smaller than + # (smallProportion * segment max # of rows). + # A compaction will happen on small segments if the segment after compaction will have + compactableProportion: 0.85 + # over (compactableProportion * segment max # of rows) rows. + # MUST BE GREATER THAN OR EQUAL TO !!! + # During compaction, the size of segment # of rows is able to exceed segment max # of rows by (expansionRate-1) * 100%. + expansionRate: 1.25 + autoUpgradeSegmentIndex: false # whether auto upgrade segment index to index engine's version + enableCompaction: true # Enable data segment compaction + compaction: + enableAutoCompaction: true + indexBasedCompaction: true + rpcTimeout: 10 + maxParallelTaskNum: 10 + workerMaxParallelTaskNum: 2 + levelzero: + forceTrigger: + minSize: 8388608 # The minimum size in bytes to force trigger a LevelZero Compaction, default as 8MB + maxSize: 67108864 # The maxmum size in bytes to force trigger a LevelZero Compaction, default as 64MB + deltalogMinNum: 10 # The minimum number of deltalog files to force trigger a LevelZero Compaction + deltalogMaxNum: 30 # The maxmum number of deltalog files to force trigger a LevelZero Compaction, default as 30 + enableGarbageCollection: true + gc: + interval: 3600 # gc interval in seconds + missingTolerance: 86400 # file meta missing tolerance duration in seconds, default to 24hr(1d) + dropTolerance: 10800 # file belongs to dropped entity tolerance duration in seconds. 3600 + removeConcurrent: 32 # number of concurrent goroutines to remove dropped s3 objects + scanInterval: 168 # garbage collection scan residue interval in hours + enableActiveStandby: false + brokerTimeout: 5000 # 5000ms, dataCoord broker rpc timeout + autoBalance: true # Enable auto balance + checkAutoBalanceConfigInterval: 10 # the interval of check auto balance config + import: + filesPerPreImportTask: 2 # The maximum number of files allowed per pre-import task. + taskRetention: 10800 # The retention period in seconds for tasks in the Completed or Failed state. + maxSizeInMBPerImportTask: 6144 # To prevent generating of small segments, we will re-group imported files. This parameter represents the sum of file sizes in each group (each ImportTask). + scheduleInterval: 2 # The interval for scheduling import, measured in seconds. + checkIntervalHigh: 2 # The interval for checking import, measured in seconds, is set to a high frequency for the import checker. 
+ checkIntervalLow: 120 # The interval for checking import, measured in seconds, is set to a low frequency for the import checker. + maxImportFileNumPerReq: 1024 # The maximum number of files allowed per single import request. + waitForIndex: true # Indicates whether the import operation waits for the completion of index building. + gracefulStopTimeout: 5 # seconds. force stop node without graceful stop + ip: # if not specified, use the first unicastable address + port: 13333 + grpc: + serverMaxSendSize: 536870912 + serverMaxRecvSize: 268435456 + clientMaxSendSize: 268435456 + clientMaxRecvSize: 536870912 + +dataNode: + dataSync: + flowGraph: + maxQueueLength: 16 # Maximum length of task queue in flowgraph + maxParallelism: 1024 # Maximum number of tasks executed in parallel in the flowgraph + maxParallelSyncMgrTasks: 256 # The max concurrent sync task number of datanode sync mgr globally + skipMode: + enable: true # Support skip some timetick message to reduce CPU usage + skipNum: 4 # Consume one for every n records skipped + coldTime: 60 # Turn on skip mode after there are only timetick msg for x seconds + segment: + insertBufSize: 16777216 # Max buffer size to flush for a single segment. + deleteBufBytes: 16777216 # Max buffer size in bytes to flush del for a single channel, default as 16MB + syncPeriod: 600 # The period to sync segments if buffer is not empty. + memory: + forceSyncEnable: true # Set true to force sync if memory usage is too high + forceSyncSegmentNum: 1 # number of segments to sync, segments with top largest buffer will be synced. + checkInterval: 3000 # the interval to check datanode memory usage, in milliseconds + forceSyncWatermark: 0.5 # memory watermark for standalone, upon reaching this watermark, segments will be synced. + timetick: + byRPC: true + interval: 500 + channel: + # specify the size of global work pool of all channels + # if this parameter <= 0, will set it as the maximum number of CPUs that can be executing + # suggest to set it bigger on large collection numbers to avoid blocking + workPoolSize: -1 + # specify the size of global work pool for channel checkpoint updating + # if this parameter <= 0, will set it as 10 + updateChannelCheckpointMaxParallel: 10 + updateChannelCheckpointInterval: 60 # the interval duration(in seconds) for datanode to update channel checkpoint of each channel + updateChannelCheckpointRPCTimeout: 20 # timeout in seconds for UpdateChannelCheckpoint RPC call + maxChannelCheckpointsPerPRC: 128 # The maximum number of channel checkpoints per UpdateChannelCheckpoint RPC. + channelCheckpointUpdateTickInSeconds: 10 # The frequency, in seconds, at which the channel checkpoint updater executes updates. + import: + maxConcurrentTaskNum: 16 # The maximum number of import/pre-import tasks allowed to run concurrently on a datanode. + maxImportFileSizeInGB: 16 # The maximum file size (in GB) for an import file, where an import file refers to either a Row-Based file or a set of Column-Based files. + readBufferSizeInMB: 16 # The data block size (in MB) read from chunk manager by the datanode during import. + compaction: + levelZeroBatchMemoryRatio: 0.05 # The minimal memory ratio of free memory for level zero compaction executing in batch mode + gracefulStopTimeout: 1800 # seconds. 
force stop node without graceful stop + ip: # if not specified, use the first unicastable address + port: 21124 + grpc: + serverMaxSendSize: 536870912 + serverMaxRecvSize: 268435456 + clientMaxSendSize: 268435456 + clientMaxRecvSize: 536870912 + +# Configures the system log output. +log: + level: info # Only supports debug, info, warn, error, panic, or fatal. Default 'info'. + file: + rootPath: # root dir path to put logs, default "" means no log file will print. please adjust in embedded Milvus: /tmp/milvus/logs + maxSize: 300 # MB + maxAge: 10 # Maximum time for log retention in day. + maxBackups: 20 + format: text # text or json + stdout: true # Stdout enable or not + +grpc: + log: + level: WARNING + gracefulStopTimeout: 10 # second, time to wait graceful stop finish + client: + compressionEnabled: false + dialTimeout: 200 + keepAliveTime: 10000 + keepAliveTimeout: 20000 + maxMaxAttempts: 10 + initialBackoff: 0.2 + maxBackoff: 10 + minResetInterval: 1000 + maxCancelError: 32 + minSessionCheckInterval: 200 + +# Configure the proxy tls enable. +tls: + serverPemPath: configs/cert/server.pem + serverKeyPath: configs/cert/server.key + caPemPath: configs/cert/ca.pem + +common: + defaultPartitionName: _default # default partition name for a collection + defaultIndexName: _default_idx # default index name + entityExpiration: -1 # Entity expiration in seconds, CAUTION -1 means never expire + indexSliceSize: 16 # MB + threadCoreCoefficient: + highPriority: 10 # This parameter specify how many times the number of threads is the number of cores in high priority pool + middlePriority: 5 # This parameter specify how many times the number of threads is the number of cores in middle priority pool + lowPriority: 1 # This parameter specify how many times the number of threads is the number of cores in low priority pool + buildIndexThreadPoolRatio: 0.75 + DiskIndex: + MaxDegree: 56 + SearchListSize: 100 + PQCodeBudgetGBRatio: 0.125 + BuildNumThreadsRatio: 1 + SearchCacheBudgetGBRatio: 0.1 + LoadNumThreadRatio: 8 + BeamWidthRatio: 4 + gracefulTime: 5000 # milliseconds. it represents the interval (in ms) by which the request arrival time needs to be subtracted in the case of Bounded Consistency. + gracefulStopTimeout: 1800 # seconds. it will force quit the server if the graceful stop process is not completed during this time. + storageType: remote # please adjust in embedded Milvus: local, available values are [local, remote, opendal], value minio is deprecated, use remote instead + # Default value: auto + # Valid values: [auto, avx512, avx2, avx, sse4_2] + # This configuration is only used by querynode and indexnode, it selects CPU instruction set for Searching and Index-building. 
+ simdType: auto + security: + authorizationEnabled: false + # The superusers will ignore some system check processes, + # like the old password verification when updating the credential + superUsers: + tlsMode: 0 + session: + ttl: 30 # ttl value when session granting a lease to register service + retryTimes: 30 # retry times when session sending etcd requests + locks: + metrics: + enable: false # whether gather statistics for metrics locks + threshold: + info: 500 # minimum milliseconds for printing durations in info level + warn: 1000 # minimum milliseconds for printing durations in warn level + storage: + scheme: s3 + enablev2: false + ttMsgEnabled: true # Whether the instance disable sending ts messages + traceLogMode: 0 # trace request info + bloomFilterSize: 100000 # bloom filter initial size + maxBloomFalsePositive: 0.001 # max false positive rate for bloom filter + +# QuotaConfig, configurations of Milvus quota and limits. +# By default, we enable: +# 1. TT protection; +# 2. Memory protection. +# 3. Disk quota protection. +# You can enable: +# 1. DML throughput limitation; +# 2. DDL, DQL qps/rps limitation; +# 3. DQL Queue length/latency protection; +# 4. DQL result rate protection; +# If necessary, you can also manually force to deny RW requests. +quotaAndLimits: + enabled: true # `true` to enable quota and limits, `false` to disable. + # quotaCenterCollectInterval is the time interval that quotaCenter + # collects metrics from Proxies, Query cluster and Data cluster. + # seconds, (0 ~ 65536) + quotaCenterCollectInterval: 3 + ddl: + enabled: false + collectionRate: -1 # qps, default no limit, rate for CreateCollection, DropCollection, LoadCollection, ReleaseCollection + partitionRate: -1 # qps, default no limit, rate for CreatePartition, DropPartition, LoadPartition, ReleasePartition + db: + collectionRate: -1 # qps of db level , default no limit, rate for CreateCollection, DropCollection, LoadCollection, ReleaseCollection + partitionRate: -1 # qps of db level, default no limit, rate for CreatePartition, DropPartition, LoadPartition, ReleasePartition + indexRate: + enabled: false + max: -1 # qps, default no limit, rate for CreateIndex, DropIndex + db: + max: -1 # qps of db level, default no limit, rate for CreateIndex, DropIndex + flushRate: + enabled: true + max: -1 # qps, default no limit, rate for flush + collection: + max: 0.1 # qps, default no limit, rate for flush at collection level. + db: + max: -1 # qps of db level, default no limit, rate for flush + compactionRate: + enabled: false + max: -1 # qps, default no limit, rate for manualCompaction + db: + max: -1 # qps of db level, default no limit, rate for manualCompaction + dml: + # dml limit rates, default no limit. + # The maximum rate will not be greater than max. + enabled: false + insertRate: + max: -1 # MB/s, default no limit + db: + max: -1 # MB/s, default no limit + collection: + max: -1 # MB/s, default no limit + partition: + max: -1 # MB/s, default no limit + upsertRate: + max: -1 # MB/s, default no limit + db: + max: -1 # MB/s, default no limit + collection: + max: -1 # MB/s, default no limit + partition: + max: -1 # MB/s, default no limit + deleteRate: + max: -1 # MB/s, default no limit + db: + max: -1 # MB/s, default no limit + collection: + max: -1 # MB/s, default no limit + partition: + max: -1 # MB/s, default no limit + bulkLoadRate: + max: -1 # MB/s, default no limit, not support yet. TODO: limit bulkLoad rate + db: + max: -1 # MB/s, default no limit, not support yet. 
TODO: limit db bulkLoad rate + collection: + max: -1 # MB/s, default no limit, not support yet. TODO: limit collection bulkLoad rate + partition: + max: -1 # MB/s, default no limit, not support yet. TODO: limit partition bulkLoad rate + dql: + # dql limit rates, default no limit. + # The maximum rate will not be greater than max. + enabled: false + searchRate: + max: -1 # vps (vectors per second), default no limit + db: + max: -1 # vps (vectors per second), default no limit + collection: + max: -1 # vps (vectors per second), default no limit + partition: + max: -1 # vps (vectors per second), default no limit + queryRate: + max: -1 # qps, default no limit + db: + max: -1 # qps, default no limit + collection: + max: -1 # qps, default no limit + partition: + max: -1 # qps, default no limit + limits: + maxCollectionNum: 65536 + maxCollectionNumPerDB: 65536 + maxInsertSize: -1 # maximum size of a single insert request, in bytes, -1 means no limit + maxResourceGroupNumOfQueryNode: 1024 # maximum number of resource groups of query nodes + limitWriting: + # forceDeny false means dml requests are allowed (except for some + # specific conditions, such as memory of nodes to water marker), true means always reject all dml requests. + forceDeny: false + ttProtection: + enabled: false + # maxTimeTickDelay indicates the backpressure for DML Operations. + # DML rates would be reduced according to the ratio of time tick delay to maxTimeTickDelay, + # if time tick delay is greater than maxTimeTickDelay, all DML requests would be rejected. + # seconds + maxTimeTickDelay: 300 + memProtection: + # When memory usage > memoryHighWaterLevel, all dml requests would be rejected; + # When memoryLowWaterLevel < memory usage < memoryHighWaterLevel, reduce the dml rate; + # When memory usage < memoryLowWaterLevel, no action. + enabled: true + dataNodeMemoryLowWaterLevel: 0.85 # (0, 1], memoryLowWaterLevel in DataNodes + dataNodeMemoryHighWaterLevel: 0.95 # (0, 1], memoryHighWaterLevel in DataNodes + queryNodeMemoryLowWaterLevel: 0.85 # (0, 1], memoryLowWaterLevel in QueryNodes + queryNodeMemoryHighWaterLevel: 0.95 # (0, 1], memoryHighWaterLevel in QueryNodes + growingSegmentsSizeProtection: + # No action will be taken if the growing segments size is less than the low watermark. + # When the growing segments size exceeds the low watermark, the dml rate will be reduced, + # but the rate will not be lower than minRateRatio * dmlRate. + enabled: false + minRateRatio: 0.5 + lowWaterLevel: 0.2 + highWaterLevel: 0.4 + diskProtection: + enabled: true # When the total file size of object storage is greater than `diskQuota`, all dml requests would be rejected; + diskQuota: -1 # MB, (0, +inf), default no limit + diskQuotaPerDB: -1 # MB, (0, +inf), default no limit + diskQuotaPerCollection: -1 # MB, (0, +inf), default no limit + diskQuotaPerPartition: -1 # MB, (0, +inf), default no limit + limitReading: + # forceDeny false means dql requests are allowed (except for some + # specific conditions, such as collection has been dropped), true means always reject all dql requests. + forceDeny: false + queueProtection: + enabled: false + # nqInQueueThreshold indicated that the system was under backpressure for Search/Query path. + # If NQ in any QueryNode's queue is greater than nqInQueueThreshold, search&query rates would gradually cool off + # until the NQ in queue no longer exceeds nqInQueueThreshold. We think of the NQ of query request as 1. 
+ # int, default no limit + nqInQueueThreshold: -1 + # queueLatencyThreshold indicated that the system was under backpressure for Search/Query path. + # If dql latency of queuing is greater than queueLatencyThreshold, search&query rates would gradually cool off + # until the latency of queuing no longer exceeds queueLatencyThreshold. + # The latency here refers to the averaged latency over a period of time. + # milliseconds, default no limit + queueLatencyThreshold: -1 + resultProtection: + enabled: false + # maxReadResultRate indicated that the system was under backpressure for Search/Query path. + # If dql result rate is greater than maxReadResultRate, search&query rates would gradually cool off + # until the read result rate no longer exceeds maxReadResultRate. + # MB/s, default no limit + maxReadResultRate: -1 + maxReadResultRatePerDB: -1 + maxReadResultRatePerCollection: -1 + # colOffSpeed is the speed of search&query rates cool off. + # (0, 1] + coolOffSpeed: 0.9 + +trace: + # trace exporter type, default is stdout, + # optional values: ['noop','stdout', 'jaeger', 'otlp'] + exporter: noop + # fraction of traceID based sampler, + # optional values: [0, 1] + # Fractions >= 1 will always sample. Fractions < 0 are treated as zero. + sampleFraction: 0 + jaeger: + url: # when exporter is jaeger should set the jaeger's URL + otlp: + endpoint: # example: "127.0.0.1:4318" + secure: true + +#when using GPU indexing, Milvus will utilize a memory pool to avoid frequent memory allocation and deallocation. +#here, you can set the size of the memory occupied by the memory pool, with the unit being MB. +#note that there is a possibility of Milvus crashing when the actual memory demand exceeds the value set by maxMemSize. +#if initMemSize and MaxMemSize both set zero, +#milvus will automatically initialize half of the available GPU memory, +#maxMemSize will the whole available GPU memory. +gpu: + initMemSize: # Gpu Memory Pool init size + maxMemSize: # Gpu Memory Pool Max size diff --git a/comps/dataprep/milvus/prepare_doc_milvus.py b/comps/dataprep/milvus/prepare_doc_milvus.py index 72cbf2424c..38ad4ef42c 100644 --- a/comps/dataprep/milvus/prepare_doc_milvus.py +++ b/comps/dataprep/milvus/prepare_doc_milvus.py @@ -27,6 +27,7 @@ from comps import CustomLogger, DocPath, opea_microservices, register_microservice from comps.dataprep.utils import ( create_upload_folder, + decode_filename, document_loader, encode_filename, get_file_structure, @@ -72,7 +73,44 @@ def empty_embedding() -> List[float]: return [e if e is not None else empty_embedding() for e in batched_embeddings] -def ingest_data_to_milvus(doc_path: DocPath): +def ingest_chunks_to_milvus(file_name: str, chunks: List, embedder): + if logflag: + logger.info(f"[ ingest chunks ] file name: {file_name}") + + # insert documents to Milvus + insert_docs = [] + for chunk in chunks: + insert_docs.append(Document(page_content=chunk, metadata={partition_field_name: file_name})) + + # Batch size + batch_size = 32 + num_chunks = len(chunks) + + for i in range(0, num_chunks, batch_size): + if logflag: + logger.info(f"[ ingest chunks ] Current batch: {i}") + batch_docs = insert_docs[i : i + batch_size] + + try: + _ = Milvus.from_documents( + batch_docs, + embedder, + collection_name=COLLECTION_NAME, + connection_args={"host": MILVUS_HOST, "port": MILVUS_PORT}, + partition_key_field=partition_field_name, + ) + except Exception as e: + if logflag: + logger.info(f"[ ingest chunks ] fail to ingest chunks into Milvus. 
error: {e}") + raise HTTPException(status_code=500, detail=f"Fail to store chunks of file {file_name}.") + + if logflag: + logger.info(f"[ ingest chunks ] Docs ingested file {file_name} to Milvus collection {COLLECTION_NAME}.") + + return True + + +def ingest_data_to_milvus(doc_path: DocPath, embedder): """Ingest document to Milvus.""" path = doc_path.path file_name = path.split("/")[-1] @@ -88,10 +126,15 @@ def ingest_data_to_milvus(doc_path: DocPath): text_splitter = HTMLHeaderTextSplitter(headers_to_split_on=headers_to_split_on) else: text_splitter = RecursiveCharacterTextSplitter( - chunk_size=doc_path.chunk_size, chunk_overlap=100, add_start_index=True, separators=get_separators() + chunk_size=doc_path.chunk_size, + chunk_overlap=doc_path.chunk_overlap, + add_start_index=True, + separators=get_separators(), ) content = document_loader(path) + if logflag: + logger.info("[ ingest data ] file content loaded") structured_types = [".xlsx", ".csv", ".json", "jsonl"] _, ext = os.path.splitext(path) @@ -105,210 +148,291 @@ def ingest_data_to_milvus(doc_path: DocPath): table_chunks = get_tables_result(path, doc_path.table_strategy) chunks = chunks + table_chunks if logflag: - logger.info("[ ingest data ] Done preprocessing. Created ", len(chunks), " chunks of the original file.") + logger.info(f"[ ingest data ] Done preprocessing. Created {len(chunks)} chunks of the original file.") - # Create vectorstore - if MOSEC_EMBEDDING_ENDPOINT: - # create embeddings using MOSEC endpoint service - if logflag: - logger.info( - f"[ ingest data ] MOSEC_EMBEDDING_ENDPOINT:{MOSEC_EMBEDDING_ENDPOINT}, MOSEC_EMBEDDING_MODEL:{MOSEC_EMBEDDING_MODEL}" - ) - embedder = MosecEmbeddings(model=MOSEC_EMBEDDING_MODEL) - elif TEI_EMBEDDING_ENDPOINT: - # create embeddings using TEI endpoint service - if logflag: - logger.info(f"[ ingest data ] TEI_EMBEDDING_ENDPOINT:{TEI_EMBEDDING_ENDPOINT}") - embedder = HuggingFaceHubEmbeddings(model=TEI_EMBEDDING_ENDPOINT) - else: - # create embeddings using local embedding model - if logflag: - logger.info(f"[ ingest data ] Local TEI_EMBEDDING_MODEL:{TEI_EMBEDDING_MODEL}") - embedder = HuggingFaceBgeEmbeddings(model_name=TEI_EMBEDDING_MODEL) + return ingest_chunks_to_milvus(file_name, chunks, embedder) - # insert documents to Milvus - insert_docs = [] - for chunk in chunks: - insert_docs.append(Document(page_content=chunk, metadata={partition_field_name: file_name})) - try: - _ = Milvus.from_documents( - insert_docs, - embedder, - collection_name=COLLECTION_NAME, - connection_args={"host": MILVUS_HOST, "port": MILVUS_PORT}, - partition_key_field=partition_field_name, - ) - except Exception as e: - if logflag: - logger.info(f"[ ingest data ] fail to ingest data into Milvus. 
error: {e}") - return False +def search_by_file(collection, file_name): + query = f"{partition_field_name} == '{file_name}'" + results = collection.query( + expr=query, + output_fields=[partition_field_name, "pk"], + ) + if logflag: + logger.info(f"[ search by file ] searched by {file_name}") + logger.info(f"[ search by file ] {len(results)} results: {results}") + return results + +def search_all(collection): + results = collection.query(expr="pk >= 0", output_fields=[partition_field_name, "pk"]) if logflag: - logger.info(f"[ ingest data ] Docs ingested from {path} to Milvus collection {COLLECTION_NAME}.") + logger.info(f"[ search all ] {len(results)} results: {results}") + return results - return True + +def delete_all_data(my_milvus): + if logflag: + logger.info("[ delete all ] deleting all data in milvus") + if my_milvus.col: + my_milvus.col.drop() + if logflag: + logger.info("[ delete all ] delete success: all data") -async def ingest_link_to_milvus(link_list: List[str]): +def delete_by_partition_field(my_milvus, partition_field): + if logflag: + logger.info(f"[ delete partition ] deleting {partition_field_name} {partition_field}") + pks = my_milvus.get_pks(f'{partition_field_name} == "{partition_field}"') + if logflag: + logger.info(f"[ delete partition ] target pks: {pks}") + res = my_milvus.delete(pks) + my_milvus.col.flush() + if logflag: + logger.info(f"[ delete partition ] delete success: {res}") + + +@register_microservice(name="opea_service@prepare_doc_milvus", endpoint="/v1/dataprep", host="0.0.0.0", port=6010) +async def ingest_documents( + files: Optional[Union[UploadFile, List[UploadFile]]] = File(None), + link_list: Optional[str] = Form(None), + chunk_size: int = Form(1000), + chunk_overlap: int = Form(100), + process_table: bool = Form(False), + table_strategy: str = Form("fast"), +): + if logflag: + logger.info(f"[ upload ] files:{files}") + logger.info(f"[ upload ] link_list:{link_list}") + + if files and link_list: + raise HTTPException(status_code=400, detail="Provide either a file or a string list, not both.") + # Create vectorstore if MOSEC_EMBEDDING_ENDPOINT: # create embeddings using MOSEC endpoint service if logflag: logger.info( - f"MOSEC_EMBEDDING_ENDPOINT:{MOSEC_EMBEDDING_ENDPOINT},MOSEC_EMBEDDING_MODEL:{MOSEC_EMBEDDING_MODEL}" + f"[ upload ] MOSEC_EMBEDDING_ENDPOINT:{MOSEC_EMBEDDING_ENDPOINT}, MOSEC_EMBEDDING_MODEL:{MOSEC_EMBEDDING_MODEL}" ) embedder = MosecEmbeddings(model=MOSEC_EMBEDDING_MODEL) elif TEI_EMBEDDING_ENDPOINT: # create embeddings using TEI endpoint service if logflag: - logger.info(f"TEI_EMBEDDING_ENDPOINT:{TEI_EMBEDDING_ENDPOINT}") + logger.info(f"[ upload ] TEI_EMBEDDING_ENDPOINT:{TEI_EMBEDDING_ENDPOINT}") embedder = HuggingFaceHubEmbeddings(model=TEI_EMBEDDING_ENDPOINT) else: # create embeddings using local embedding model if logflag: - logger.info(f"Local TEI_EMBEDDING_MODEL:{TEI_EMBEDDING_MODEL}") + logger.info(f"[ upload ] Local TEI_EMBEDDING_MODEL:{TEI_EMBEDDING_MODEL}") embedder = HuggingFaceBgeEmbeddings(model_name=TEI_EMBEDDING_MODEL) - for link in link_list: - content = parse_html([link])[0][0] - if logflag: - logger.info(f"[ ingest link ] link: {link} content: {content}") - encoded_link = encode_filename(link) - save_path = upload_folder + encoded_link + ".txt" - if logflag: - logger.info(f"[ ingest link ] save_path: {save_path}") - await save_content_to_local_disk(save_path, content) - - document = Document(page_content=content, metadata={partition_field_name: encoded_link + ".txt"}) - _ = Milvus.from_documents( - document, - 
embedder, - collection_name=COLLECTION_NAME, - connection_args={"host": MILVUS_HOST, "port": MILVUS_PORT}, - partition_key_field=partition_field_name, - ) - - -@register_microservice(name="opea_service@prepare_doc_milvus", endpoint="/v1/dataprep", host="0.0.0.0", port=6010) -async def ingest_documents( - files: Optional[Union[UploadFile, List[UploadFile]]] = File(None), - link_list: Optional[str] = Form(None), - chunk_size: int = Form(1500), - chunk_overlap: int = Form(100), - process_table: bool = Form(False), - table_strategy: str = Form("fast"), -): - if logflag: - logger.info(f"files:{files}") - logger.info(f"link_list:{link_list}") - if files and link_list: - raise HTTPException(status_code=400, detail="Provide either a file or a string list, not both.") + # define Milvus obj + my_milvus = Milvus( + embedding_function=embedder, + collection_name=COLLECTION_NAME, + connection_args={"host": MILVUS_HOST, "port": MILVUS_PORT}, + index_params=index_params, + auto_id=True, + ) if files: if not isinstance(files, list): files = [files] uploaded_files = [] + for file in files: - save_path = upload_folder + file.filename - await save_content_to_local_disk(save_path, file) - uploaded_files.append(save_path) + encode_file = encode_filename(file.filename) + save_path = upload_folder + encode_file if logflag: - logger.info(f"Successfully saved file {save_path}") - - def process_files_wrapper(files): - if not isinstance(files, list): - files = [files] - for file in files: - assert ingest_data_to_milvus( - DocPath( - path=file, - chunk_size=chunk_size, - chunk_overlap=chunk_overlap, - process_table=process_table, - table_strategy=table_strategy, + logger.info(f"[ upload ] processing file {save_path}") + + if my_milvus.col: + # check whether the file is already uploaded + try: + search_res = search_by_file(my_milvus.col, encode_file) + except Exception as e: + raise HTTPException( + status_code=500, detail=f"Failed when searching in Milvus db for file {file.filename}." + ) + if len(search_res) > 0: + if logflag: + logger.info(f"[ upload ] File {file.filename} already exists.") + raise HTTPException( + status_code=400, + detail=f"Uploaded file {file.filename} already exists. 
Please change file name.", ) - ) - try: - # Create a SparkContext - conf = SparkConf().setAppName("Parallel-dataprep").setMaster("local[*]") - sc = SparkContext(conf=conf) - # Create an RDD with parallel processing - parallel_num = min(len(uploaded_files), os.cpu_count()) - rdd = sc.parallelize(uploaded_files, parallel_num) - # Perform a parallel operation - rdd_trans = rdd.map(process_files_wrapper) - rdd_trans.collect() - # Stop the SparkContext - sc.stop() - except: - # Stop the SparkContext - sc.stop() + await save_content_to_local_disk(save_path, file) + ingest_data_to_milvus( + DocPath( + path=save_path, + chunk_size=chunk_size, + chunk_overlap=chunk_overlap, + process_table=process_table, + table_strategy=table_strategy, + ), + embedder, + ) + uploaded_files.append(save_path) + if logflag: + logger.info(f"Saved file {save_path} into local disk.") + + # def process_files_wrapper(files): + # if not isinstance(files, list): + # files = [files] + # for file in files: + # encode_file = encode_filename(file.filename) + # save_path = upload_folder + encode_file + # ingest_data_to_milvus( + # DocPath( + # path=save_path, + # chunk_size=chunk_size, + # chunk_overlap=chunk_overlap, + # process_table=process_table, + # table_strategy=table_strategy, + # ), + # embedder + # ) + + # try: + # # Create a SparkContext + # conf = SparkConf().setAppName("Parallel-dataprep").setMaster("local[*]") + # sc = SparkContext(conf=conf) + # # Create an RDD with parallel processing + # parallel_num = min(len(uploaded_files), os.cpu_count()) + # rdd = sc.parallelize(uploaded_files, parallel_num) + # print(uploaded_files) + # # Perform a parallel operation + # rdd_trans = rdd.map(process_files_wrapper) + # rdd_trans.collect() + # # Stop the SparkContext + # sc.stop() + # except: + # # Stop the SparkContext + # sc.stop() results = {"status": 200, "message": "Data preparation succeeded"} if logflag: logger.info(results) return results if link_list: - try: - link_list = json.loads(link_list) # Parse JSON string to list - if not isinstance(link_list, list): - raise HTTPException(status_code=400, detail="link_list should be a list.") - await ingest_link_to_milvus(link_list) - if logflag: - logger.info(f"Successfully saved link list {link_list}") - results = {"status": 200, "message": "Data preparation succeeded"} + link_list = json.loads(link_list) # Parse JSON string to list + if not isinstance(link_list, list): + raise HTTPException(status_code=400, detail="link_list should be a list.") + + for link in link_list: + encoded_link = encode_filename(link) if logflag: - logger.info(results) - return results - except json.JSONDecodeError: - raise HTTPException(status_code=400, detail="Invalid JSON format for link_list.") + logger.info(f"[ upload ] processing link {encoded_link}") + + # check whether the link file already exists + if my_milvus.col: + try: + search_res = search_by_file(my_milvus.col, encoded_link + ".txt") + except Exception as e: + raise HTTPException(status_code=500, detail=f"Failed when searching in Milvus db for link {link}.") + if len(search_res) > 0: + if logflag: + logger.info(f"[ upload ] Link {link} already exists.") + raise HTTPException( + status_code=400, detail=f"Uploaded link {link} already exists. Please change link." 
+ ) + + save_path = upload_folder + encoded_link + ".txt" + content = parse_html([link])[0][0] + await save_content_to_local_disk(save_path, content) + ingest_data_to_milvus( + DocPath( + path=save_path, + chunk_size=chunk_size, + chunk_overlap=chunk_overlap, + process_table=process_table, + table_strategy=table_strategy, + ), + embedder, + ) + if logflag: + logger.info(f"[ upload ] Successfully saved link list {link_list}") + return {"status": 200, "message": "Data preparation succeeded"} raise HTTPException(status_code=400, detail="Must provide either a file or a string list.") @register_microservice( - name="opea_service@prepare_doc_milvus_file", endpoint="/v1/dataprep/get_file", host="0.0.0.0", port=6011 + name="opea_service@prepare_doc_milvus", endpoint="/v1/dataprep/get_file", host="0.0.0.0", port=6010 ) async def rag_get_file_structure(): if logflag: - logger.info("[ dataprep - get file ] start to get file structure") + logger.info("[ get ] start to get file structure") - if not Path(upload_folder).exists(): + # Create vectorstore + if MOSEC_EMBEDDING_ENDPOINT: + # create embeddings using MOSEC endpoint service if logflag: - logger.info("No file uploaded, return empty list.") - return [] + logger.info( + f"[ get ] MOSEC_EMBEDDING_ENDPOINT:{MOSEC_EMBEDDING_ENDPOINT}, MOSEC_EMBEDDING_MODEL:{MOSEC_EMBEDDING_MODEL}" + ) + embedder = MosecEmbeddings(model=MOSEC_EMBEDDING_MODEL) + elif TEI_EMBEDDING_ENDPOINT: + # create embeddings using TEI endpoint service + if logflag: + logger.info(f"[ get ] TEI_EMBEDDING_ENDPOINT:{TEI_EMBEDDING_ENDPOINT}") + embedder = HuggingFaceHubEmbeddings(model=TEI_EMBEDDING_ENDPOINT) + else: + # create embeddings using local embedding model + if logflag: + logger.info(f"[ get ] Local TEI_EMBEDDING_MODEL:{TEI_EMBEDDING_MODEL}") + embedder = HuggingFaceBgeEmbeddings(model_name=TEI_EMBEDDING_MODEL) - file_content = get_file_structure(upload_folder) - if logflag: - logger.info(file_content) - return file_content + # define Milvus obj + my_milvus = Milvus( + embedding_function=embedder, + collection_name=COLLECTION_NAME, + connection_args={"host": MILVUS_HOST, "port": MILVUS_PORT}, + index_params=index_params, + auto_id=True, + ) + # collection does not exist + if not my_milvus.col: + logger.info(f"[ get ] collection {COLLECTION_NAME} does not exist.") + return [] -def delete_all_data(my_milvus): - if logflag: - logger.info("[ delete ] deleting all data in milvus") - my_milvus.delete(expr="pk >= 0") - my_milvus.col.flush() - if logflag: - logger.info("[ delete ] delete success: all data") + # get all files from db + try: + all_data = search_all(my_milvus.col) + except Exception as e: + raise HTTPException(status_code=500, detail="Failed when searching in Milvus db for all files.") + # return [] if no data in db + if len(all_data) == 0: + return [] -def delete_by_partition_field(my_milvus, partition_field): - if logflag: - logger.info(f"[ delete ] deleting {partition_field_name} {partition_field}") - pks = my_milvus.get_pks(f'{partition_field_name} == "{partition_field}"') + res_file = [res["filename"] for res in all_data] + unique_list = list(set(res_file)) if logflag: - logger.info(f"[ delete ] target pks: {pks}") - res = my_milvus.delete(pks) - my_milvus.col.flush() + logger.info(f"[ get ] unique list from db: {unique_list}") + + # construct result file list in format + file_list = [] + for file_name in unique_list: + file_dict = { + "name": decode_filename(file_name), + "id": decode_filename(file_name), + "type": "File", + "parent": "", + } + 
file_list.append(file_dict) + if logflag: - logger.info(f"[ delete ] delete success: {res}") + logger.info(f"[ get ] final file list: {file_list}") + return file_list @register_microservice( - name="opea_service@prepare_doc_milvus_del", endpoint="/v1/dataprep/delete_file", host="0.0.0.0", port=6012 + name="opea_service@prepare_doc_milvus", endpoint="/v1/dataprep/delete_file", host="0.0.0.0", port=6010 ) async def delete_single_file(file_path: str = Body(..., embed=True)): """Delete file according to `file_path`. @@ -319,23 +443,24 @@ async def delete_single_file(file_path: str = Body(..., embed=True)): """ if logflag: logger.info(file_path) - # create embedder obj + + # Create vectorstore if MOSEC_EMBEDDING_ENDPOINT: # create embeddings using MOSEC endpoint service if logflag: logger.info( - f"[ dataprep - del ] MOSEC_EMBEDDING_ENDPOINT:{MOSEC_EMBEDDING_ENDPOINT},MOSEC_EMBEDDING_MODEL:{MOSEC_EMBEDDING_MODEL}" + f"[ delete ] MOSEC_EMBEDDING_ENDPOINT:{MOSEC_EMBEDDING_ENDPOINT}, MOSEC_EMBEDDING_MODEL:{MOSEC_EMBEDDING_MODEL}" ) embedder = MosecEmbeddings(model=MOSEC_EMBEDDING_MODEL) elif TEI_EMBEDDING_ENDPOINT: # create embeddings using TEI endpoint service if logflag: - logger.info(f"[ dataprep - del ] TEI_EMBEDDING_ENDPOINT:{TEI_EMBEDDING_ENDPOINT}") + logger.info(f"[ delete ] TEI_EMBEDDING_ENDPOINT:{TEI_EMBEDDING_ENDPOINT}") embedder = HuggingFaceHubEmbeddings(model=TEI_EMBEDDING_ENDPOINT) else: # create embeddings using local embedding model if logflag: - logger.info(f"[ dataprep - del ] Local TEI_EMBEDDING_MODEL:{TEI_EMBEDDING_MODEL}") + logger.info(f"[ delete ] Local TEI_EMBEDDING_MODEL:{TEI_EMBEDDING_MODEL}") embedder = HuggingFaceBgeEmbeddings(model_name=TEI_EMBEDDING_MODEL) # define Milvus obj @@ -350,51 +475,61 @@ async def delete_single_file(file_path: str = Body(..., embed=True)): # delete all uploaded files if file_path == "all": if logflag: - logger.info("[ dataprep - del ] deleting all files") + logger.info("[ delete ] deleting all files") + delete_all_data(my_milvus) - remove_folder_with_ignore(upload_folder) + + # delete files on local disk + try: + remove_folder_with_ignore(upload_folder) + except Exception as e: + if logflag: + logger.info(f"[ delete ] {e}. 
Fail to delete {upload_folder}.") + raise HTTPException(status_code=500, detail=f"Fail to delete {upload_folder}.") + if logflag: - logger.info("[ dataprep - del ] successfully delete all files.") + logger.info("[ delete ] successfully delete all files.") + create_upload_folder(upload_folder) if logflag: - logger.info({"status": True}) + logger.info("[ delete ] new upload folder created.") return {"status": True} encode_file_name = encode_filename(file_path) delete_path = Path(upload_folder + "/" + encode_file_name) if logflag: - logger.info(f"[dataprep - del] delete_path: {delete_path}") + logger.info(f"[delete] delete_path: {delete_path}") # partially delete files if delete_path.exists(): - # file + + # TODO: check existence before delete + + # delete file if delete_path.is_file(): if logflag: - logger.info(f"[dataprep - del] deleting file {encode_file_name}") + logger.info(f"[delete] deleting file {encode_file_name}") try: delete_by_partition_field(my_milvus, encode_file_name) - delete_path.unlink() - if logflag: - logger.info(f"[dataprep - del] file {encode_file_name} deleted") - logger.info({"status": True}) - return {"status": True} except Exception as e: if logflag: - logger.info(f"[dataprep - del] fail to delete file {delete_path}: {e}") - logger.info({"status": False}) + logger.info(f"[delete] fail to delete file {delete_path}: {e}") return {"status": False} - # folder + delete_path.unlink() + if logflag: + logger.info(f"[delete] file {file_path} deleted") + return {"status": True} + + # delete folder else: if logflag: - logger.info("[dataprep - del] delete folder is not supported for now.") - logger.info({"status": False}) - return {"status": False} + logger.info(f"[delete] delete folder {file_path} is not supported for now.") + raise HTTPException(status_code=404, detail=f"Delete folder {file_path} is not supported for now.") else: raise HTTPException(status_code=404, detail="File/folder not found. 
Please check del_path.") if __name__ == "__main__": create_upload_folder(upload_folder) + opea_microservices["opea_service@prepare_doc_milvus"].start() - opea_microservices["opea_service@prepare_doc_milvus_file"].start() - opea_microservices["opea_service@prepare_doc_milvus_del"].start() diff --git a/comps/dataprep/multimodal_utils.py b/comps/dataprep/multimodal_utils.py new file mode 100644 index 0000000000..cd71c5fc31 --- /dev/null +++ b/comps/dataprep/multimodal_utils.py @@ -0,0 +1,258 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +import base64 +import json +import os +import uuid +from pathlib import Path +from typing import Iterator + +import cv2 +import requests +import webvtt +import whisper +from moviepy.editor import VideoFileClip + + +def create_upload_folder(upload_path): + """Create a directory to store uploaded video data.""" + if not os.path.exists(upload_path): + Path(upload_path).mkdir(parents=True, exist_ok=True) + + +def load_json_file(file_path): + """Read contents of json file.""" + with open(file_path, "r") as file: + data = json.load(file) + return data + + +def clear_upload_folder(upload_path): + """Clear the upload directory.""" + for root, dirs, files in os.walk(upload_path, topdown=False): + for file in files: + file_path = os.path.join(root, file) + os.remove(file_path) + for dir in dirs: + dir_path = os.path.join(root, dir) + os.rmdir(dir_path) + + +def generate_video_id(): + """Generates a unique identifier for a video file.""" + return str(uuid.uuid4()) + + +def convert_video_to_audio(video_path: str, output_audio_path: str): + """Converts video to audio using MoviePy library that uses `ffmpeg` under the hood. + + :param video_path: file path of video file (.mp4) + :param output_audio_path: file path of audio file (.wav) to be created + """ + video_clip = VideoFileClip(video_path) + audio_clip = video_clip.audio + audio_clip.write_audiofile(output_audio_path) + video_clip.close() + audio_clip.close() + + +def load_whisper_model(model_name: str = "base"): + """Load a whisper model for generating video transcripts.""" + return whisper.load_model(model_name) + + +def extract_transcript_from_audio(whisper_model, audio_path: str): + """Generate transcript from audio file. 
+ + :param whisper_model: a pre-loaded whisper model object + :param audio_path: file path of audio file (.wav) + """ + options = dict(task="translate", best_of=5, language="en") + return whisper_model.transcribe(audio_path, **options) + + +def format_timestamp_for_transcript(seconds: float, always_include_hours: bool = True, fractionalSeperator: str = "."): + """Format timestamp for video transcripts.""" + milliseconds = round(seconds * 1000.0) + + hours = milliseconds // 3_600_000 + milliseconds -= hours * 3_600_000 + + minutes = milliseconds // 60_000 + milliseconds -= minutes * 60_000 + + seconds = milliseconds // 1_000 + milliseconds -= seconds * 1_000 + + hours_marker = f"{hours:02d}:" if always_include_hours or hours > 0 else "" + return f"{hours_marker}{minutes:02d}:{seconds:02d}{fractionalSeperator}{milliseconds:03d}" + + +def write_vtt(transcript: Iterator[dict], vtt_path: str): + """Write transcripts to a .vtt file.""" + with open(vtt_path, "a") as file: + file.write("WEBVTT\n\n") + for segment in transcript["segments"]: + text = (segment["text"]).replace("-->", "->") + file.write( + f"{format_timestamp_for_transcript(segment['start'])} --> {format_timestamp_for_transcript(segment['end'])}\n" + ) + file.write(f"{text.strip()}\n\n") + + +def delete_audio_file(audio_path: str): + """Delete audio file after extracting transcript.""" + os.remove(audio_path) + + +def time_to_frame(time: float, fps: float): + """Convert time in seconds into frame number.""" + return int(time * fps - 1) + + +def str2time(strtime: str): + """Get time in seconds from string.""" + strtime = strtime.strip('"') + hrs, mins, seconds = [float(c) for c in strtime.split(":")] + + total_seconds = hrs * 60**2 + mins * 60 + seconds + + return total_seconds + + +def convert_img_to_base64(image): + "Convert image to base64 string" + _, buffer = cv2.imencode(".jpg", image) + encoded_string = base64.b64encode(buffer) + return encoded_string.decode() + + +def extract_frames_and_annotations_from_transcripts(video_id: str, video_path: str, vtt_path: str, output_dir: str): + """Extract frames (.jpg) and annotations (.json) from video file (.mp4) and captions file (.vtt)""" + # Set up location to store frames and annotations + os.makedirs(output_dir, exist_ok=True) + os.makedirs(os.path.join(output_dir, "frames"), exist_ok=True) + + # Load video and get fps + vidcap = cv2.VideoCapture(video_path) + fps = vidcap.get(cv2.CAP_PROP_FPS) + + # read captions file + captions = webvtt.read(vtt_path) + + annotations = [] + for idx, caption in enumerate(captions): + start_time = str2time(caption.start) + end_time = str2time(caption.end) + + mid_time = (end_time + start_time) / 2 + text = caption.text.replace("\n", " ") + + frame_no = time_to_frame(mid_time, fps) + mid_time_ms = mid_time * 1000 + vidcap.set(cv2.CAP_PROP_POS_MSEC, mid_time_ms) + success, frame = vidcap.read() + + if success: + # Save frame for further processing + img_fname = f"frame_{idx}" + img_fpath = os.path.join(output_dir, "frames", img_fname + ".jpg") + cv2.imwrite(img_fpath, frame) + + # Convert image to base64 encoded string + b64_img_str = convert_img_to_base64(frame) + + # Create annotations for frame from transcripts + annotations.append( + { + "video_id": video_id, + "video_name": os.path.basename(video_path), + "b64_img_str": b64_img_str, + "caption": text, + "time": mid_time_ms, + "frame_no": frame_no, + "sub_video_id": idx, + } + ) + + # Save transcript annotations as json file for further processing + with open(os.path.join(output_dir, 
"annotations.json"), "w") as f: + json.dump(annotations, f) + + vidcap.release() + return annotations + + +def use_lvm(endpoint: str, img_b64_string: str, prompt: str = "Provide a short description for this scene."): + """Generate image captions/descriptions using LVM microservice.""" + inputs = {"image": img_b64_string, "prompt": prompt, "max_new_tokens": 32} + response = requests.post(url=endpoint, data=json.dumps(inputs)) + print(response) + return response.json()["text"] + + +def extract_frames_and_generate_captions( + video_id: str, video_path: str, lvm_endpoint: str, output_dir: str, key_frame_per_second: int = 1 +): + """Extract frames (.jpg) and annotations (.json) from video file (.mp4) by generating captions using LVM microservice.""" + # Set up location to store frames and annotations + os.makedirs(output_dir, exist_ok=True) + os.makedirs(os.path.join(output_dir, "frames"), exist_ok=True) + + # Load video and get fps + vidcap = cv2.VideoCapture(video_path) + fps = vidcap.get(cv2.CAP_PROP_FPS) + + annotations = [] + hop = round(fps / key_frame_per_second) + curr_frame = 0 + idx = -1 + + while True: + ret, frame = vidcap.read() + if not ret: + break + + if curr_frame % hop == 0: + idx += 1 + + mid_time = vidcap.get(cv2.CAP_PROP_POS_MSEC) + mid_time_ms = mid_time * 1000 + + frame_no = curr_frame + frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB) + + # Save frame for further processing + img_fname = f"frame_{idx}" + img_fpath = os.path.join(output_dir, "frames", img_fname + ".jpg") + cv2.imwrite(img_fpath, frame) + + # Convert image to base64 encoded string + b64_img_str = convert_img_to_base64(frame) + + # Caption generation using LVM microservice + caption = use_lvm(lvm_endpoint, b64_img_str) + caption = caption.strip() + text = caption.replace("\n", " ") + + # Create annotations for frame from transcripts + annotations.append( + { + "video_id": video_id, + "video_name": os.path.basename(video_path), + "b64_img_str": b64_img_str, + "caption": text, + "time": mid_time_ms, + "frame_no": frame_no, + "sub_video_id": idx, + } + ) + + curr_frame += 1 + + # Save caption annotations as json file for further processing + with open(os.path.join(output_dir, "annotations.json"), "w") as f: + json.dump(annotations, f) + + vidcap.release() diff --git a/comps/dataprep/pgvector/langchain/docker/Dockerfile b/comps/dataprep/pgvector/langchain/docker/Dockerfile index 75e70c524d..897d15564c 100644 --- a/comps/dataprep/pgvector/langchain/docker/Dockerfile +++ b/comps/dataprep/pgvector/langchain/docker/Dockerfile @@ -1,4 +1,3 @@ - # Copyright (C) 2024 Intel Corporation # SPDX-License-Identifier: Apache-2.0 @@ -10,10 +9,9 @@ ARG ARCH="cpu" RUN apt-get update -y && apt-get install -y --no-install-recommends --fix-missing \ build-essential \ - libgl1-mesa-glx \ - libjemalloc-dev \ default-jre \ - vim + libgl1-mesa-glx \ + libjemalloc-dev RUN useradd -m -s /bin/bash user && \ mkdir -p /home/user && \ @@ -38,4 +36,3 @@ USER user WORKDIR /home/user/comps/dataprep/pgvector/langchain ENTRYPOINT ["python", "prepare_doc_pgvector.py"] - diff --git a/comps/dataprep/pinecone/docker/Dockerfile b/comps/dataprep/pinecone/docker/Dockerfile index d61ecf65fd..4bb51956be 100644 --- a/comps/dataprep/pinecone/docker/Dockerfile +++ b/comps/dataprep/pinecone/docker/Dockerfile @@ -1,4 +1,3 @@ - # Copyright (C) 2024 Intel Corporation # SPDX-License-Identifier: Apache-2.0 @@ -10,12 +9,10 @@ ARG ARCH="cpu" RUN apt-get update -y && apt-get install -y --no-install-recommends --fix-missing \ build-essential \ - libgl1-mesa-glx \ - 
libjemalloc-dev \ default-jre \ - vim \ - libcairo2 - + libcairo2 \ + libgl1-mesa-glx \ + libjemalloc-dev RUN useradd -m -s /bin/bash user && \ mkdir -p /home/user && \ @@ -43,4 +40,3 @@ USER user WORKDIR /home/user/comps/dataprep/pinecone ENTRYPOINT ["python", "prepare_doc_pinecone.py"] - diff --git a/comps/dataprep/qdrant/docker/Dockerfile b/comps/dataprep/qdrant/docker/Dockerfile index ff9f6b2533..f36b80bc95 100644 --- a/comps/dataprep/qdrant/docker/Dockerfile +++ b/comps/dataprep/qdrant/docker/Dockerfile @@ -1,4 +1,3 @@ - # Copyright (C) 2024 Intel Corporation # SPDX-License-Identifier: Apache-2.0 @@ -10,10 +9,9 @@ ARG ARCH="cpu" RUN apt-get update -y && apt-get install -y --no-install-recommends --fix-missing \ build-essential \ - libgl1-mesa-glx \ - libjemalloc-dev \ default-jre \ - vim + libgl1-mesa-glx \ + libjemalloc-dev RUN useradd -m -s /bin/bash user && \ mkdir -p /home/user && \ diff --git a/comps/dataprep/redis/README.md b/comps/dataprep/redis/README.md index 4617dfa25c..76361a236f 100644 --- a/comps/dataprep/redis/README.md +++ b/comps/dataprep/redis/README.md @@ -1,6 +1,8 @@ # Dataprep Microservice with Redis -For dataprep microservice, we provide two frameworks: `Langchain` and `LlamaIndex`. We also provide `Langchain_ray` which uses ray to parallel the data prep for multi-file performance improvement(observed 5x - 15x speedup by processing 1000 files/links.). +We have provided dataprep microservice for multimodal data input (e.g., text and image) [here](multimodal_langchain/README.md). + +For dataprep microservice for text input, we provide here two frameworks: `Langchain` and `LlamaIndex`. We also provide `Langchain_ray` which uses ray to parallel the data prep for multi-file performance improvement(observed 5x - 15x speedup by processing 1000 files/links.). We organized these two folders in the same way, so you can use either framework for dataprep microservice with the following constructions. 
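Editorial aside, for orientation only: the README paragraph above describes the text dataprep microservices without showing an invocation. A minimal upload request is sketched below; the port (6007), route (`/v1/dataprep`), and file name are assumptions carried over from the Milvus and multimodal services elsewhere in this change set, so defer to the framework-specific sections of the Redis README for the exact command.

```bash
# Hypothetical example: ingest one local file into a running text dataprep service.
# Adjust the host/port to wherever the chosen framework's container is exposed.
curl -X POST \
  -H "Content-Type: multipart/form-data" \
  -F "files=@./sample.pdf" \
  http://localhost:6007/v1/dataprep
```

The multipart `files` field mirrors the upload APIs shown later in this diff, so the same pattern applies whether the Langchain, LlamaIndex, or Langchain_ray variant is used.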
diff --git a/comps/dataprep/redis/langchain/docker/Dockerfile b/comps/dataprep/redis/langchain/docker/Dockerfile index bcd4f99173..61620b88fa 100644 --- a/comps/dataprep/redis/langchain/docker/Dockerfile +++ b/comps/dataprep/redis/langchain/docker/Dockerfile @@ -1,4 +1,3 @@ - # Copyright (C) 2024 Intel Corporation # SPDX-License-Identifier: Apache-2.0 @@ -10,13 +9,12 @@ ARG ARCH="cpu" RUN apt-get update -y && apt-get install -y --no-install-recommends --fix-missing \ build-essential \ + default-jre \ libgl1-mesa-glx \ libjemalloc-dev \ - default-jre \ - vim \ + libreoffice \ poppler-utils \ - tesseract-ocr \ - libreoffice + tesseract-ocr RUN useradd -m -s /bin/bash user && \ mkdir -p /home/user && \ @@ -41,4 +39,3 @@ USER user WORKDIR /home/user/comps/dataprep/redis/langchain ENTRYPOINT ["python", "prepare_doc_redis.py"] - diff --git a/comps/dataprep/redis/langchain_ray/docker/Dockerfile b/comps/dataprep/redis/langchain_ray/docker/Dockerfile index f9f91521fc..3f6b10b61c 100644 --- a/comps/dataprep/redis/langchain_ray/docker/Dockerfile +++ b/comps/dataprep/redis/langchain_ray/docker/Dockerfile @@ -1,4 +1,3 @@ - # Copyright (C) 2024 Intel Corporation # SPDX-License-Identifier: Apache-2.0 @@ -10,10 +9,9 @@ ARG ARCH="cpu" RUN apt-get update -y && apt-get install -y --no-install-recommends --fix-missing \ build-essential \ + libcairo2 \ libgl1-mesa-glx \ libjemalloc-dev \ - vim \ - libcairo2 \ poppler-utils \ tesseract-ocr @@ -41,4 +39,3 @@ USER user WORKDIR /home/user/comps/dataprep/redis/langchain_ray ENTRYPOINT ["python", "prepare_doc_redis_on_ray.py"] - diff --git a/comps/dataprep/redis/llama_index/docker/Dockerfile b/comps/dataprep/redis/llama_index/docker/Dockerfile index 111bdbd0b5..f34930e71e 100644 --- a/comps/dataprep/redis/llama_index/docker/Dockerfile +++ b/comps/dataprep/redis/llama_index/docker/Dockerfile @@ -1,4 +1,3 @@ - # Copyright (C) 2024 Intel Corporation # SPDX-License-Identifier: Apache-2.0 @@ -10,10 +9,9 @@ ARG ARCH="cpu" RUN apt-get update -y && apt-get install -y --no-install-recommends --fix-missing \ build-essential \ + libcairo2 \ libgl1-mesa-glx \ libjemalloc-dev \ - vim \ - libcairo2 \ poppler-utils \ tesseract-ocr @@ -40,4 +38,3 @@ USER user WORKDIR /home/user/comps/dataprep/redis/llama_index ENTRYPOINT ["python", "prepare_doc_redis.py"] - diff --git a/comps/dataprep/redis/multimodal_langchain/README.md b/comps/dataprep/redis/multimodal_langchain/README.md new file mode 100644 index 0000000000..19042c6ae4 --- /dev/null +++ b/comps/dataprep/redis/multimodal_langchain/README.md @@ -0,0 +1,213 @@ +# Dataprep Microservice for Multimodal Data with Redis + +This `dataprep` microservice accepts videos (mp4 files) and their transcripts (optional) from the user and ingests them into Redis vectorstore. + +# πŸš€1. Start Microservice with Python(Option 1οΌ‰ + +## 1.1 Install Requirements + +```bash +# Install ffmpeg static build +wget https://johnvansickle.com/ffmpeg/builds/ffmpeg-git-amd64-static.tar.xz +mkdir ffmpeg-git-amd64-static +tar -xvf ffmpeg-git-amd64-static.tar.xz -C ffmpeg-git-amd64-static --strip-components 1 +export PATH=$(pwd)/ffmpeg-git-amd64-static:$PATH +cp $(pwd)/ffmpeg-git-amd64-static/ffmpeg /usr/local/bin/ + +pip install -r requirements.txt +``` + +## 1.2 Start Redis Stack Server + +Please refer to this [readme](../../../vectorstores/langchain/redis/README.md). 
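Section 1.2 defers to the linked vectorstore readme for the exact command. As a hedged convenience, a typical way to bring up a Redis Stack server locally is sketched below; the image tag and container name are assumptions, not values taken from this repository.

```bash
# Launch Redis Stack (Redis with RediSearch and friends) for local development.
# Port 6379 backs the REDIS_URL exported in section 1.3; 8001 exposes RedisInsight.
docker run -d --name redis-vector-db -p 6379:6379 -p 8001:8001 redis/redis-stack:latest
```

Once the container is up, continue with the environment variables in section 1.3.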
+ +## 1.3 Setup Environment Variables + +```bash +export your_ip=$(hostname -I | awk '{print $1}') +export REDIS_URL="redis://${your_ip}:6379" +export INDEX_NAME=${your_redis_index_name} +export PYTHONPATH=${path_to_comps} +``` + +## 1.4 Start LVM Microservice (Optional) + +This is required only if you are going to consume the _generate_captions_ API of this microservice as described in [Section 4.3](#43-consume-generate_captions-api). + +Please refer to this [readme](../../../lvms/README.md) to start the LVM microservice. +After LVM is up, set up environment variables. + +```bash +export your_ip=$(hostname -I | awk '{print $1}') +export LVM_ENDPOINT="http://${your_ip}:9399/v1/lvm" +``` + +## 1.5 Start Data Preparation Microservice for Redis with Python Script + +Start the document preparation microservice for Redis with the command below. + +```bash +python prepare_videodoc_redis.py +``` + +# πŸš€2. Start Microservice with Docker (Option 2) + +## 2.1 Start Redis Stack Server + +Please refer to this [readme](../../../vectorstores/langchain/redis/README.md). + +## 2.2 Start LVM Microservice (Optional) + +This is required only if you are going to consume the _generate_captions_ API of this microservice as described [here](#43-consume-generate_captions-api). + +Please refer to this [readme](../../../lvms/README.md) to start the LVM microservice. +After LVM is up, set up environment variables. + +```bash +export your_ip=$(hostname -I | awk '{print $1}') +export LVM_ENDPOINT="http://${your_ip}:9399/v1/lvm" +``` + +## 2.3 Setup Environment Variables + +```bash +export your_ip=$(hostname -I | awk '{print $1}') +export EMBEDDING_MODEL_ID="BridgeTower/bridgetower-large-itm-mlm-itc" +export REDIS_URL="redis://${your_ip}:6379" +export WHISPER_MODEL="base" +export INDEX_NAME=${your_redis_index_name} +export HUGGINGFACEHUB_API_TOKEN=${your_hf_api_token} +``` + +## 2.4 Build Docker Image + +```bash +cd ../../../../ +docker build -t opea/dataprep-redis:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/dataprep/redis/multimodal_langchain/docker/Dockerfile . +``` + +## 2.5 Run Docker with CLI (Option A) + +```bash +docker run -d --name="dataprep-redis-server" -p 6007:6007 --runtime=runc --ipc=host -e http_proxy=$http_proxy -e https_proxy=$https_proxy -e REDIS_URL=$REDIS_URL -e INDEX_NAME=$INDEX_NAME -e LVM_ENDPOINT=$LVM_ENDPOINT -e HUGGINGFACEHUB_API_TOKEN=$HUGGINGFACEHUB_API_TOKEN opea/dataprep-redis:latest +``` + +## 2.6 Run with Docker Compose (Option B - deprecated, will be moved to GenAIExamples in the future) + +```bash +cd comps/dataprep/redis/multimodal_langchain/docker +docker compose -f docker-compose-dataprep-redis.yaml up -d +``` + +# πŸš€3. Check Microservice Status + +```bash +docker container logs -f dataprep-redis-server +``` + +# πŸš€4. Consume Microservice + +Once this dataprep microservice is started, users can use the commands below to invoke the microservice, which converts videos and their (optional) transcripts into embeddings and saves them to the Redis vector store. + +This microservice provides three different ways for users to ingest videos into the Redis vector store, corresponding to the three use cases below. + +## 4.1 Consume _videos_with_transcripts_ API + +**Use case:** This API is used when a transcript file (in `.vtt` format) is available for each video. + +**Important notes:** + +- Make sure the file paths after `files=@` are correct. +- Every transcript file's name must be identical to its corresponding video file's name (except for the extensions .vtt and .mp4).
For example, `video1.mp4` and `video1.vtt`. Otherwise, if `video1.vtt` is not included correctly in this API call, this microservice will return error `No captions file video1.vtt found for video1.mp4`. + +### Single video-transcript pair upload + +```bash +curl -X POST \ + -H "Content-Type: multipart/form-data" \ + -F "files=@./video1.mp4" \ + -F "files=@./video1.vtt" \ + http://localhost:6007/v1/videos_with_transcripts +``` + +### Multiple video-transcript pair upload + +```bash +curl -X POST \ + -H "Content-Type: multipart/form-data" \ + -F "files=@./video1.mp4" \ + -F "files=@./video1.vtt" \ + -F "files=@./video2.mp4" \ + -F "files=@./video2.vtt" \ + http://localhost:6007/v1/videos_with_transcripts +``` + +## 4.2 Consume _generate_transcripts_ API + +**Use case:** This API should be used when a video has meaningful audio or recognizable speech but its transcript file is not available. + +In this use case, this microservice will use [`whisper`](https://openai.com/index/whisper/) model to generate the `.vtt` transcript for the video. + +### Single video upload + +```bash +curl -X POST \ + -H "Content-Type: multipart/form-data" \ + -F "files=@./video1.mp4" \ + http://localhost:6007/v1/generate_transcripts +``` + +### Multiple video upload + +```bash +curl -X POST \ + -H "Content-Type: multipart/form-data" \ + -F "files=@./video1.mp4" \ + -F "files=@./video2.mp4" \ + http://localhost:6007/v1/generate_transcripts +``` + +## 4.3 Consume _generate_captions_ API + +**Use case:** This API should be used when a video does not have meaningful audio or does not have audio. + +In this use case, transcript either does not provide any meaningful information or does not exist. Thus, it is preferred to leverage a LVM microservice to summarize the video frames. + +- Single video upload + +```bash +curl -X POST \ + -H "Content-Type: multipart/form-data" \ + -F "files=@./video1.mp4" \ + http://localhost:6007/v1/generate_captions +``` + +- Multiple video upload + +```bash +curl -X POST \ + -H "Content-Type: multipart/form-data" \ + -F "files=@./video1.mp4" \ + -F "files=@./video2.mp4" \ + http://localhost:6007/v1/generate_captions +``` + +## 4.4 Consume get_videos API + +To get names of uploaded videos, use the following command. + +```bash +curl -X POST \ + -H "Content-Type: application/json" \ + http://localhost:6007/v1/dataprep/get_videos +``` + +## 4.5 Consume delete_videos API + +To delete uploaded videos and clear the database, use the following command. 
+ +```bash +curl -X POST \ + -H "Content-Type: application/json" \ + http://localhost:6007/v1/dataprep/delete_videos +``` diff --git a/comps/dataprep/redis/multimodal_langchain/__init__.py b/comps/dataprep/redis/multimodal_langchain/__init__.py new file mode 100644 index 0000000000..916f3a44b2 --- /dev/null +++ b/comps/dataprep/redis/multimodal_langchain/__init__.py @@ -0,0 +1,2 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 diff --git a/comps/dataprep/redis/multimodal_langchain/config.py b/comps/dataprep/redis/multimodal_langchain/config.py new file mode 100644 index 0000000000..0cae533788 --- /dev/null +++ b/comps/dataprep/redis/multimodal_langchain/config.py @@ -0,0 +1,70 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +import os + +# Models +EMBED_MODEL = os.getenv("EMBED_MODEL", "BridgeTower/bridgetower-large-itm-mlm-itc") +WHISPER_MODEL = os.getenv("WHISPER_MODEL", "small") + +# Redis Connection Information +REDIS_HOST = os.getenv("REDIS_HOST", "localhost") +REDIS_PORT = int(os.getenv("REDIS_PORT", 6379)) + +# Lvm Microservice Information +LVM_ENDPOINT = os.getenv("LVM_ENDPOINT", "http://localhost:9399/v1/lvm") + + +def get_boolean_env_var(var_name, default_value=False): + """Retrieve the boolean value of an environment variable. + + Args: + var_name (str): The name of the environment variable to retrieve. + default_value (bool): The default value to return if the variable + is not found. + Returns: + bool: The value of the environment variable, interpreted as a boolean. + """ + true_values = {"true", "1", "t", "y", "yes"} + false_values = {"false", "0", "f", "n", "no"} + + # Retrieve the environment variable's value + value = os.getenv(var_name, "").lower() + + # Decide the boolean value based on the content of the string + if value in true_values: + return True + elif value in false_values: + return False + else: + return default_value + + +def format_redis_conn_from_env(): + redis_url = os.getenv("REDIS_URL", None) + if redis_url: + return redis_url + else: + using_ssl = get_boolean_env_var("REDIS_SSL", False) + start = "rediss://" if using_ssl else "redis://" + + # if using RBAC + password = os.getenv("REDIS_PASSWORD", None) + username = os.getenv("REDIS_USERNAME", "default") + if password is not None: + start += f"{username}:{password}@" + + return start + f"{REDIS_HOST}:{REDIS_PORT}" + + +REDIS_URL = format_redis_conn_from_env() + +# Vector Index Configuration +INDEX_NAME = os.getenv("INDEX_NAME", "mm-rag-redis") + +current_file_path = os.path.abspath(__file__) +parent_dir = os.path.dirname(current_file_path) +REDIS_SCHEMA = os.getenv("REDIS_SCHEMA", "schema.yml") +TIMEOUT_SECONDS = int(os.getenv("TIMEOUT_SECONDS", 600)) +schema_path = os.path.join(parent_dir, REDIS_SCHEMA) +INDEX_SCHEMA = schema_path diff --git a/comps/dataprep/redis/multimodal_langchain/docker/Dockerfile b/comps/dataprep/redis/multimodal_langchain/docker/Dockerfile new file mode 100644 index 0000000000..a6c2be7e3b --- /dev/null +++ b/comps/dataprep/redis/multimodal_langchain/docker/Dockerfile @@ -0,0 +1,37 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +FROM python:3.11-slim + +ENV LANG=C.UTF-8 + +ARG ARCH="cpu" + +RUN apt-get update -y && apt-get install -y --no-install-recommends --fix-missing \ + build-essential \ + libgl1-mesa-glx \ + libjemalloc-dev \ + default-jre \ + wget \ + vim + +# Install ffmpeg static build +RUN cd /root && wget 
https://johnvansickle.com/ffmpeg/builds/ffmpeg-git-amd64-static.tar.xz && \ + mkdir ffmpeg-git-amd64-static && tar -xvf ffmpeg-git-amd64-static.tar.xz -C ffmpeg-git-amd64-static --strip-components 1 && \ + export PATH=/root/ffmpeg-git-amd64-static:$PATH && \ + cp /root/ffmpeg-git-amd64-static/ffmpeg /usr/local/bin/ + +RUN mkdir -p /home/user + +COPY comps /home/user/comps + +RUN pip install --no-cache-dir --upgrade pip setuptools && \ + if [ ${ARCH} = "cpu" ]; then pip install torch torchvision --index-url https://download.pytorch.org/whl/cpu; fi && \ + pip install --no-cache-dir -r /home/user/comps/dataprep/redis/multimodal_langchain/requirements.txt + +ENV PYTHONPATH=$PYTHONPATH:/home/user + +WORKDIR /home/user/comps/dataprep/redis/multimodal_langchain + +ENTRYPOINT ["python", "prepare_videodoc_redis.py"] + diff --git a/comps/dataprep/redis/multimodal_langchain/docker/docker-compose-dataprep-redis.yaml b/comps/dataprep/redis/multimodal_langchain/docker/docker-compose-dataprep-redis.yaml new file mode 100644 index 0000000000..d98ddbd878 --- /dev/null +++ b/comps/dataprep/redis/multimodal_langchain/docker/docker-compose-dataprep-redis.yaml @@ -0,0 +1,29 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +version: "3" +services: + redis-vector-db: + image: redis/redis-stack:7.2.0-v9 + container_name: redis-vector-db + ports: + - "6379:6379" + - "8001:8001" + dataprep-redis: + image: opea/dataprep-redis:latest + container_name: dataprep-redis-server + ports: + - "6007:6007" + ipc: host + environment: + no_proxy: ${no_proxy} + http_proxy: ${http_proxy} + https_proxy: ${https_proxy} + REDIS_URL: ${REDIS_URL} + INDEX_NAME: ${INDEX_NAME} + LVM_ENDPOINT: ${LVM_ENDPOINT} + restart: unless-stopped + +networks: + default: + driver: bridge diff --git a/comps/dataprep/redis/multimodal_langchain/prepare_videodoc_redis.py b/comps/dataprep/redis/multimodal_langchain/prepare_videodoc_redis.py new file mode 100644 index 0000000000..d658c58b0c --- /dev/null +++ b/comps/dataprep/redis/multimodal_langchain/prepare_videodoc_redis.py @@ -0,0 +1,527 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +import os +import shutil +import subprocess +import time +import uuid +from pathlib import Path +from typing import Any, Dict, Iterable, List, Optional, Type, Union + +from config import EMBED_MODEL, INDEX_NAME, INDEX_SCHEMA, LVM_ENDPOINT, REDIS_URL, WHISPER_MODEL +from fastapi import File, HTTPException, UploadFile +from langchain_community.utilities.redis import _array_to_buffer +from langchain_community.vectorstores import Redis +from langchain_community.vectorstores.redis.base import _generate_field_schema, _prepare_metadata +from langchain_community.vectorstores.redis.schema import read_schema +from langchain_core.embeddings import Embeddings +from langchain_core.utils import get_from_dict_or_env +from PIL import Image + +from comps import opea_microservices, register_microservice +from comps.dataprep.multimodal_utils import ( + clear_upload_folder, + convert_video_to_audio, + create_upload_folder, + delete_audio_file, + extract_frames_and_annotations_from_transcripts, + extract_frames_and_generate_captions, + extract_transcript_from_audio, + generate_video_id, + load_json_file, + load_whisper_model, + write_vtt, +) +from comps.embeddings.multimodal_embeddings.bridgetower.bridgetower_embedding import BridgeTowerEmbedding + +device = "cpu" +upload_folder = "./uploaded_files/" + + +class MultimodalRedis(Redis): + """Redis vector database to process 
multimodal data.""" + + @classmethod + def from_text_image_pairs_return_keys( + cls: Type[Redis], + texts: List[str], + images: List[str], + embedding: Embeddings = BridgeTowerEmbedding, + metadatas: Optional[List[dict]] = None, + index_name: Optional[str] = None, + index_schema: Optional[Union[Dict[str, str], str, os.PathLike]] = None, + vector_schema: Optional[Dict[str, Union[str, int]]] = None, + **kwargs: Any, + ): + """ + Args: + texts (List[str]): List of texts to add to the vectorstore. + images (List[str]): List of path-to-images to add to the vectorstore. + embedding (Embeddings): Embeddings to use for the vectorstore. + metadatas (Optional[List[dict]], optional): Optional list of metadata + dicts to add to the vectorstore. Defaults to None. + index_name (Optional[str], optional): Optional name of the index to + create or add to. Defaults to None. + index_schema (Optional[Union[Dict[str, str], str, os.PathLike]], optional): + Optional fields to index within the metadata. Overrides generated + schema. Defaults to None. + vector_schema (Optional[Dict[str, Union[str, int]]], optional): Optional + vector schema to use. Defaults to None. + **kwargs (Any): Additional keyword arguments to pass to the Redis client. + Returns: + Tuple[Redis, List[str]]: Tuple of the Redis instance and the keys of + the newly created documents. + Raises: + ValueError: If the number of texts does not equal the number of images. + ValueError: If the number of metadatas does not match the number of texts. + """ + # the length of texts must be equal to the length of images + if len(texts) != len(images): + raise ValueError(f"the len of captions {len(texts)} does not equal the len of images {len(images)}") + + redis_url = get_from_dict_or_env(kwargs, "redis_url", "REDIS_URL") + + if "redis_url" in kwargs: + kwargs.pop("redis_url") + + # flag to use generated schema + if "generate" in kwargs: + kwargs.pop("generate") + + # see if the user specified keys + keys = None + if "keys" in kwargs: + keys = kwargs.pop("keys") + + # Name of the search index if not given + if not index_name: + index_name = uuid.uuid4().hex + + # type check for metadata + if metadatas: + if isinstance(metadatas, list) and len(metadatas) != len(texts): # type: ignore # noqa: E501 + raise ValueError("Number of metadatas must match number of texts") + if not (isinstance(metadatas, list) and isinstance(metadatas[0], dict)): + raise ValueError("Metadatas must be a list of dicts") + generated_schema = _generate_field_schema(metadatas[0]) + + if not index_schema: + index_schema = generated_schema + + # Create instance + instance = cls( + redis_url, + index_name, + embedding, + index_schema=index_schema, + vector_schema=vector_schema, + **kwargs, + ) + # Add data to Redis + keys = instance.add_text_image_pairs(texts, images, metadatas, keys=keys) + return instance, keys + + def add_text_image_pairs( + self, + texts: Iterable[str], + images: Iterable[str], + metadatas: Optional[List[dict]] = None, + embeddings: Optional[List[List[float]]] = None, + batch_size: int = 2, + clean_metadata: bool = True, + **kwargs: Any, + ) -> List[str]: + """Add more embeddings of text-image pairs to the vectorstore. + + Args: + texts (Iterable[str]): Iterable of strings/text to add to the vectorstore. + images: Iterable[str]: Iterable of strings/text of path-to-image to add to the vectorstore. + metadatas (Optional[List[dict]], optional): Optional list of metadatas. + Defaults to None. 
+ embeddings (Optional[List[List[float]]], optional): Optional pre-generated + embeddings. Defaults to None. + keys (List[str]) or ids (List[str]): Identifiers of entries. + Defaults to None. + batch_size (int, optional): Batch size to use for writes. Defaults to 1000. + Returns: + List[str]: List of ids added to the vectorstore + """ + ids = [] + # Get keys or ids from kwargs + # Other vectorstores use ids + keys_or_ids = kwargs.get("keys", kwargs.get("ids")) + + # type check for metadata + if metadatas: + if isinstance(metadatas, list) and len(metadatas) != len(texts): # type: ignore # noqa: E501 + raise ValueError("Number of metadatas must match number of texts") + if not (isinstance(metadatas, list) and isinstance(metadatas[0], dict)): + raise ValueError("Metadatas must be a list of dicts") + pil_imgs = [Image.open(img) for img in images] + if not embeddings: + embeddings = self._embeddings.embed_image_text_pairs(list(texts), pil_imgs, batch_size=batch_size) + self._create_index_if_not_exist(dim=len(embeddings[0])) + + # Write data to redis + pipeline = self.client.pipeline(transaction=False) + for i, text in enumerate(texts): + # Use provided values by default or fallback + key = keys_or_ids[i] if keys_or_ids else str(uuid.uuid4().hex) + if not key.startswith(self.key_prefix + ":"): + key = self.key_prefix + ":" + key + metadata = metadatas[i] if metadatas else {} + metadata = _prepare_metadata(metadata) if clean_metadata else metadata + pipeline.hset( + key, + mapping={ + self._schema.content_key: text, + self._schema.content_vector_key: _array_to_buffer(embeddings[i], self._schema.vector_dtype), + **metadata, + }, + ) + ids.append(key) + + # Write batch + if i % batch_size == 0: + pipeline.execute() + + # Cleanup final batch + pipeline.execute() + return ids + + +def prepare_data_and_metadata_from_annotation( + annotation, path_to_frames, title, num_transcript_concat_for_ingesting=2, num_transcript_concat_for_inference=7 +): + text_list = [] + image_list = [] + metadatas = [] + for i, frame in enumerate(annotation): + frame_index = frame["sub_video_id"] + path_to_frame = os.path.join(path_to_frames, f"frame_{frame_index}.jpg") + # augment this frame's transcript with a reasonable number of neighboring frames' transcripts helps semantic retrieval + lb_ingesting = max(0, i - num_transcript_concat_for_ingesting) + ub_ingesting = min(len(annotation), i + num_transcript_concat_for_ingesting + 1) + caption_for_ingesting = " ".join([annotation[j]["caption"] for j in range(lb_ingesting, ub_ingesting)]) + + # augment this frame's transcript with more neighboring frames' transcript to provide more context to LVM for question answering + lb_inference = max(0, i - num_transcript_concat_for_inference) + ub_inference = min(len(annotation), i + num_transcript_concat_for_inference + 1) + caption_for_inference = " ".join([annotation[j]["caption"] for j in range(lb_inference, ub_inference)]) + + video_id = frame["video_id"] + b64_img_str = frame["b64_img_str"] + time_of_frame = frame["time"] + embedding_type = "pair" + source_video = frame["video_name"] + + text_list.append(caption_for_ingesting) + image_list.append(path_to_frame) + metadatas.append( + { + "content": caption_for_ingesting, + "b64_img_str": b64_img_str, + "video_id": video_id, + "source_video": source_video, + "time_of_frame_ms": float(time_of_frame), + "embedding_type": embedding_type, + "title": title, + "transcript_for_inference": caption_for_inference, + } + ) + + return text_list, image_list, metadatas + + +def 
ingest_multimodal(videoname, data_folder, embeddings): + """Ingest text image pairs to Redis from the data/ directory that consists of frames and annotations.""" + data_folder = os.path.abspath(data_folder) + annotation_file_path = os.path.join(data_folder, "annotations.json") + path_to_frames = os.path.join(data_folder, "frames") + + annotation = load_json_file(annotation_file_path) + + # prepare data to ingest + text_list, image_list, metadatas = prepare_data_and_metadata_from_annotation(annotation, path_to_frames, videoname) + + MultimodalRedis.from_text_image_pairs_return_keys( + texts=[f"From {videoname}. " + text for text in text_list], + images=image_list, + embedding=embeddings, + metadatas=metadatas, + index_name=INDEX_NAME, + index_schema=INDEX_SCHEMA, + redis_url=REDIS_URL, + ) + + +def drop_index(index_name, redis_url=REDIS_URL): + print(f"dropping index {index_name}") + try: + assert Redis.drop_index(index_name=index_name, delete_documents=True, redis_url=redis_url) + print(f"index {index_name} deleted") + except Exception as e: + print(f"index {index_name} delete failed: {e}") + return False + return True + + +@register_microservice( + name="opea_service@prepare_videodoc_redis", endpoint="/v1/generate_transcripts", host="0.0.0.0", port=6007 +) +async def ingest_videos_generate_transcripts(files: List[UploadFile] = File(None)): + """Upload videos with speech, generate transcripts using whisper and ingest into redis.""" + + if files: + video_files = [] + for file in files: + if os.path.splitext(file.filename)[1] == ".mp4": + video_files.append(file) + else: + raise HTTPException( + status_code=400, detail=f"File {file.filename} is not an mp4 file. Please upload mp4 files only." + ) + + for video_file in video_files: + st = time.time() + print(f"Processing video {video_file.filename}") + + # Assign unique identifier to video + video_id = generate_video_id() + + # Create video file name by appending identifier + video_name = os.path.splitext(video_file.filename)[0] + video_file_name = f"{video_name}_{video_id}.mp4" + video_dir_name = os.path.splitext(video_file_name)[0] + + # Save video file in upload_directory + with open(os.path.join(upload_folder, video_file_name), "wb") as f: + shutil.copyfileobj(video_file.file, f) + + # Extract temporary audio wav file from video mp4 + audio_file = video_dir_name + ".wav" + print(f"Extracting {audio_file}") + convert_video_to_audio( + os.path.join(upload_folder, video_file_name), os.path.join(upload_folder, audio_file) + ) + print(f"Done extracting {audio_file}") + + # Load whisper model + print("Loading whisper model....") + whisper_model = load_whisper_model(model_name=WHISPER_MODEL) + print("Done loading whisper!") + + # Extract transcript from audio + print("Extracting transcript from audio") + transcripts = extract_transcript_from_audio(whisper_model, os.path.join(upload_folder, audio_file)) + + # Save transcript as vtt file and delete audio file + vtt_file = video_dir_name + ".vtt" + write_vtt(transcripts, os.path.join(upload_folder, vtt_file)) + delete_audio_file(os.path.join(upload_folder, audio_file)) + print("Done extracting transcript.") + + # Store frames and caption annotations in a new directory + print("Extracting frames and generating annotation") + extract_frames_and_annotations_from_transcripts( + video_id, + os.path.join(upload_folder, video_file_name), + os.path.join(upload_folder, vtt_file), + os.path.join(upload_folder, video_dir_name), + ) + print("Done extracting frames and generating annotation") + # Delete 
temporary vtt file + os.remove(os.path.join(upload_folder, vtt_file)) + + # Ingest multimodal data into redis + print("Ingesting data to redis vector store") + ingest_multimodal(video_name, os.path.join(upload_folder, video_dir_name), embeddings) + + # Delete temporary video directory containing frames and annotations + shutil.rmtree(os.path.join(upload_folder, video_dir_name)) + + print(f"Processed video {video_file.filename}") + end = time.time() + print(str(end - st)) + + return {"status": 200, "message": "Data preparation succeeded"} + + raise HTTPException(status_code=400, detail="Must provide at least one video (.mp4) file.") + + +@register_microservice( + name="opea_service@prepare_videodoc_redis", endpoint="/v1/generate_captions", host="0.0.0.0", port=6007 +) +async def ingest_videos_generate_caption(files: List[UploadFile] = File(None)): + """Upload videos without speech (only background music or no audio), generate captions using lvm microservice and ingest into redis.""" + + if files: + video_files = [] + for file in files: + if os.path.splitext(file.filename)[1] == ".mp4": + video_files.append(file) + else: + raise HTTPException( + status_code=400, detail=f"File {file.filename} is not an mp4 file. Please upload mp4 files only." + ) + + for video_file in video_files: + print(f"Processing video {video_file.filename}") + + # Assign unique identifier to video + video_id = generate_video_id() + + # Create video file name by appending identifier + video_name = os.path.splitext(video_file.filename)[0] + video_file_name = f"{video_name}_{video_id}.mp4" + video_dir_name = os.path.splitext(video_file_name)[0] + + # Save video file in upload_directory + with open(os.path.join(upload_folder, video_file_name), "wb") as f: + shutil.copyfileobj(video_file.file, f) + + # Store frames and caption annotations in a new directory + extract_frames_and_generate_captions( + video_id, + os.path.join(upload_folder, video_file_name), + LVM_ENDPOINT, + os.path.join(upload_folder, video_dir_name), + ) + + # Ingest multimodal data into redis + ingest_multimodal(video_name, os.path.join(upload_folder, video_dir_name), embeddings) + + # Delete temporary video directory containing frames and annotations + # shutil.rmtree(os.path.join(upload_folder, video_dir_name)) + + print(f"Processed video {video_file.filename}") + + return {"status": 200, "message": "Data preparation succeeded"} + + raise HTTPException(status_code=400, detail="Must provide at least one video (.mp4) file.") + + +@register_microservice( + name="opea_service@prepare_videodoc_redis", + endpoint="/v1/videos_with_transcripts", + host="0.0.0.0", + port=6007, +) +async def ingest_videos_with_transcripts(files: List[UploadFile] = File(None)): + + if files: + video_files, video_file_names = [], [] + captions_files, captions_file_names = [], [] + for file in files: + if os.path.splitext(file.filename)[1] == ".mp4": + video_files.append(file) + video_file_names.append(file.filename) + elif os.path.splitext(file.filename)[1] == ".vtt": + captions_files.append(file) + captions_file_names.append(file.filename) + else: + print(f"Skipping file {file.filename} because of unsupported format.") + + # Check if every video file has a captions file + for video_file_name in video_file_names: + file_prefix = os.path.splitext(video_file_name)[0] + if (file_prefix + ".vtt") not in captions_file_names: + raise HTTPException( + status_code=400, detail=f"No captions file {file_prefix}.vtt found for {video_file_name}" + ) + + if len(video_files) == 0: + return 
HTTPException( + status_code=400, + detail="The uploaded files have unsupported formats. Please upload at least one video file (.mp4) with captions (.vtt)", + ) + + for video_file in video_files: + print(f"Processing video {video_file.filename}") + + # Assign unique identifier to video + video_id = generate_video_id() + + # Create video file name by appending identifier + video_name = os.path.splitext(video_file.filename)[0] + video_file_name = f"{video_name}_{video_id}.mp4" + video_dir_name = os.path.splitext(video_file_name)[0] + + # Save video file in upload_directory + with open(os.path.join(upload_folder, video_file_name), "wb") as f: + shutil.copyfileobj(video_file.file, f) + + # Save captions file in upload directory + vtt_file_name = os.path.splitext(video_file.filename)[0] + ".vtt" + vtt_idx = None + for idx, caption_file in enumerate(captions_files): + if caption_file.filename == vtt_file_name: + vtt_idx = idx + break + vtt_file = video_dir_name + ".vtt" + with open(os.path.join(upload_folder, vtt_file), "wb") as f: + shutil.copyfileobj(captions_files[vtt_idx].file, f) + + # Store frames and caption annotations in a new directory + extract_frames_and_annotations_from_transcripts( + video_id, + os.path.join(upload_folder, video_file_name), + os.path.join(upload_folder, vtt_file), + os.path.join(upload_folder, video_dir_name), + ) + + # Delete temporary vtt file + os.remove(os.path.join(upload_folder, vtt_file)) + + # Ingest multimodal data into redis + ingest_multimodal(video_name, os.path.join(upload_folder, video_dir_name), embeddings) + + # Delete temporary video directory containing frames and annotations + shutil.rmtree(os.path.join(upload_folder, video_dir_name)) + + print(f"Processed video {video_file.filename}") + + return {"status": 200, "message": "Data preparation succeeded"} + + raise HTTPException( + status_code=400, detail="Must provide at least one pair consisting of video (.mp4) and captions (.vtt)" + ) + + +@register_microservice( + name="opea_service@prepare_videodoc_redis", endpoint="/v1/dataprep/get_videos", host="0.0.0.0", port=6007 +) +async def rag_get_file_structure(): + """Returns list of names of uploaded videos saved on the server.""" + + if not Path(upload_folder).exists(): + print("No file uploaded, return empty list.") + return [] + + uploaded_videos = os.listdir(upload_folder) + return uploaded_videos + + +@register_microservice( + name="opea_service@prepare_videodoc_redis", endpoint="/v1/dataprep/delete_videos", host="0.0.0.0", port=6007 +) +async def delete_videos(): + """Delete all uploaded videos along with redis index.""" + index_deleted = drop_index(index_name=INDEX_NAME) + + if not index_deleted: + raise HTTPException(status_code=409, detail="Uploaded videos could not be deleted. 
Index does not exist") + + clear_upload_folder(upload_folder) + print("Successfully deleted all uploaded videos.") + return {"status": True} + + +if __name__ == "__main__": + create_upload_folder(upload_folder) + # Load embeddings model + print("Initializing BridgeTower model as embedder...") + embeddings = BridgeTowerEmbedding(model_name=EMBED_MODEL, device=device) + print("Done initialization of embedder!") + opea_microservices["opea_service@prepare_videodoc_redis"].start() diff --git a/comps/dataprep/redis/multimodal_langchain/requirements.txt b/comps/dataprep/redis/multimodal_langchain/requirements.txt new file mode 100644 index 0000000000..574d2952aa --- /dev/null +++ b/comps/dataprep/redis/multimodal_langchain/requirements.txt @@ -0,0 +1,19 @@ +docarray[full] +fastapi +langchain==0.1.12 +langchain_benchmarks +moviepy +openai-whisper +opencv-python +opentelemetry-api +opentelemetry-exporter-otlp +opentelemetry-sdk +Pillow +prometheus-fastapi-instrumentator +pydantic==2.8.2 +python-multipart +redis +shortuuid +transformers +uvicorn +webvtt-py diff --git a/comps/dataprep/redis/multimodal_langchain/schema.yml b/comps/dataprep/redis/multimodal_langchain/schema.yml new file mode 100644 index 0000000000..32f4a79ae4 --- /dev/null +++ b/comps/dataprep/redis/multimodal_langchain/schema.yml @@ -0,0 +1,19 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +text: + - name: content + - name: b64_img_str + - name: video_id + - name: source_video + - name: embedding_type + - name: title + - name: transcript_for_inference +numeric: + - name: time_of_frame_ms +vector: + - name: content_vector + algorithm: HNSW + datatype: FLOAT32 + dims: 512 + distance_metric: COSINE diff --git a/comps/embeddings/README.md b/comps/embeddings/README.md index 07be2fcbb8..537f6aa034 100644 --- a/comps/embeddings/README.md +++ b/comps/embeddings/README.md @@ -80,8 +80,7 @@ First, you need to start a TEI service. ```bash your_port=8090 model="BAAI/bge-large-en-v1.5" -revision="refs/pr/5" -docker run -p $your_port:80 -v ./data:/data --name tei_server -e http_proxy=$http_proxy -e https_proxy=$https_proxy --pull always ghcr.io/huggingface/text-embeddings-inference:cpu-1.5 --model-id $model --revision $revision +docker run -p $your_port:80 -v ./data:/data --name tei_server -e http_proxy=$http_proxy -e https_proxy=$https_proxy --pull always ghcr.io/huggingface/text-embeddings-inference:cpu-1.5 --model-id $model ``` Then you need to test your TEI service using the following commands: @@ -124,8 +123,7 @@ First, you need to start a TEI service. 
```bash your_port=8090 model="BAAI/bge-large-en-v1.5" -revision="refs/pr/5" -docker run -p $your_port:80 -v ./data:/data --name tei_server -e http_proxy=$http_proxy -e https_proxy=$https_proxy --pull always ghcr.io/huggingface/text-embeddings-inference:cpu-1.5 --model-id $model --revision $revision +docker run -p $your_port:80 -v ./data:/data --name tei_server -e http_proxy=$http_proxy -e https_proxy=$https_proxy --pull always ghcr.io/huggingface/text-embeddings-inference:cpu-1.5 --model-id $model ``` Then you need to test your TEI service using the following commands: diff --git a/comps/embeddings/langchain-mosec/docker/Dockerfile b/comps/embeddings/langchain-mosec/docker/Dockerfile index 2fa2e7036f..4628216f1a 100644 --- a/comps/embeddings/langchain-mosec/docker/Dockerfile +++ b/comps/embeddings/langchain-mosec/docker/Dockerfile @@ -1,4 +1,3 @@ - # Copyright (C) 2024 Intel Corporation # SPDX-License-Identifier: Apache-2.0 @@ -6,8 +5,7 @@ FROM langchain/langchain:latest RUN apt-get update -y && apt-get install -y --no-install-recommends --fix-missing \ libgl1-mesa-glx \ - libjemalloc-dev \ - vim + libjemalloc-dev RUN useradd -m -s /bin/bash user && \ mkdir -p /home/user && \ @@ -25,4 +23,3 @@ ENV PYTHONPATH=$PYTHONPATH:/home/user WORKDIR /home/user/comps/embeddings/langchain-mosec ENTRYPOINT ["python", "embedding_mosec.py"] - diff --git a/comps/embeddings/langchain/docker/Dockerfile b/comps/embeddings/langchain/docker/Dockerfile index 464bacf660..365c738117 100644 --- a/comps/embeddings/langchain/docker/Dockerfile +++ b/comps/embeddings/langchain/docker/Dockerfile @@ -1,4 +1,3 @@ - # Copyright (C) 2024 Intel Corporation # SPDX-License-Identifier: Apache-2.0 @@ -8,8 +7,7 @@ ARG ARCH="cpu" RUN apt-get update -y && apt-get install -y --no-install-recommends --fix-missing \ libgl1-mesa-glx \ - libjemalloc-dev \ - vim + libjemalloc-dev RUN useradd -m -s /bin/bash user && \ mkdir -p /home/user && \ @@ -28,4 +26,3 @@ ENV PYTHONPATH=$PYTHONPATH:/home/user WORKDIR /home/user/comps/embeddings/langchain ENTRYPOINT ["python", "embedding_tei.py"] - diff --git a/comps/embeddings/llama_index/docker/Dockerfile b/comps/embeddings/llama_index/docker/Dockerfile index 914293db82..8d17b0dfae 100644 --- a/comps/embeddings/llama_index/docker/Dockerfile +++ b/comps/embeddings/llama_index/docker/Dockerfile @@ -1,4 +1,3 @@ - # Copyright (C) 2024 Intel Corporation # SPDX-License-Identifier: Apache-2.0 @@ -7,7 +6,6 @@ FROM ubuntu:22.04 RUN apt-get update -y && apt-get install -y --no-install-recommends --fix-missing \ libgl1-mesa-glx \ libjemalloc-dev \ - vim \ python3 \ python3-pip @@ -27,4 +25,3 @@ ENV PYTHONPATH=$PYTHONPATH:/home/user WORKDIR /home/user/comps/embeddings/llama_index ENTRYPOINT ["python3", "embedding_tei.py"] - diff --git a/comps/embeddings/multimodal_embeddings/README.md b/comps/embeddings/multimodal_embeddings/README.md new file mode 100644 index 0000000000..c2cf2b875c --- /dev/null +++ b/comps/embeddings/multimodal_embeddings/README.md @@ -0,0 +1,185 @@ +# Multimodal Embeddings Microservice + +The Multimodal Embedding Microservice is designed to efficiently convert pairs of textual string and image into vectorized embeddings, facilitating seamless integration into various machine learning and data processing workflows. This service utilizes advanced algorithms to generate high-quality embeddings that capture the joint semantic essence of the input text-and-image pairs, making it ideal for applications in multi-modal data processing, information retrieval, and similar fields. 
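To make the idea of a joint text-image embedding concrete before the feature list and setup steps that follow, here is a small local sketch using the `BridgeTowerEmbedding` wrapper added later in this patch. It assumes the `multimodal_langchain` requirements are installed, the repository root is on `PYTHONPATH`, and `apple.png` stands in for any local image you supply.

```python
# Local illustration only; the sections below describe the supported microservice deployments.
from PIL import Image

from comps.embeddings.multimodal_embeddings.bridgetower import BridgeTowerEmbedding

# Downloads BridgeTower/bridgetower-large-itm-mlm-itc on first use.
embedder = BridgeTowerEmbedding(device="cpu")

# Joint embedding of one caption/image pair (the image path is a stand-in).
pair_vectors = embedder.embed_image_text_pairs(
    texts=["a red apple on a table"],
    images=[Image.open("apple.png")],
    batch_size=1,
)

# Text-only embedding through the same wrapper.
text_vector = embedder.embed_query("a red apple on a table")
print(len(pair_vectors[0]), len(text_vector))
```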
+ +Key Features: + +**High Performance**: Optimized for quick and reliable conversion of textual data and image inputs into vector embeddings. + +**Scalability**: Built to handle high volumes of requests simultaneously, ensuring robust performance even under heavy loads. + +**Ease of Integration**: Provides a simple and intuitive API, allowing for straightforward integration into existing systems and workflows. + +**Customizable**: Supports configuration and customization to meet specific use case requirements, including different embedding models and preprocessing techniques. + +Users are able to configure and build embedding-related services according to their actual needs. + +## πŸš€1. Start Microservice with Python (Option 1) + +Currently, we provide two ways to implement the multimodal embedding service: + +1. Run the multimodal embedding model **locally** on the server, which is faster but takes up memory on the local server. +2. Build it on top of a multimodal embedding inference endpoint (**MMEI endpoint**), which provides more flexibility but may add some network latency. + +For both implementations, you need to install the requirements first. + +### 1.1 Install Requirements + +```bash +# run with langchain +pip install -r multimodal_langchain/requirements.txt +``` + +### 1.2 Start Embedding Service + +You can select one of the following to start the multimodal embedding service: + +**Start Multimodal Embedding Service with MMEI** + +First, you need to start an MMEI service. + +```bash +export your_mmei_port=8080 +export EMBEDDER_PORT=$your_mmei_port +``` + +Currently, we employ the [**BridgeTower**](https://huggingface.co/BridgeTower/bridgetower-large-itm-mlm-gaudi) model for MMEI and provide two ways to start it: + +1. Start MMEI on Gaudi2 HPU +2. Start MMEI on Xeon CPU (if Gaudi2 HPU is not available) + +- Gaudi2 HPU + +```bash +cd ../../.. +docker build -t opea/bridgetower-embedder:latest --build-arg EMBEDDER_PORT=$EMBEDDER_PORT --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/embeddings/multimodal_embeddings/bridgetower/docker/Dockerfile_hpu . +cd comps/embeddings/multimodal_embeddings/bridgetower/docker/ +docker compose -f docker_compose_bridgetower_embedding_endpoint.yaml up -d +``` + +- Xeon CPU + +```bash +cd ../../.. +docker build -t opea/bridgetower-embedder:latest --build-arg EMBEDDER_PORT=$EMBEDDER_PORT --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/embeddings/multimodal_embeddings/bridgetower/docker/Dockerfile . +cd comps/embeddings/multimodal_embeddings/bridgetower/docker/ +docker compose -f docker_compose_bridgetower_embedding_endpoint.yaml up -d +``` + +Then you need to test your MMEI service using the following commands: + +```bash +curl http://localhost:$your_mmei_port/v1/encode \ + -X POST \ + -H "Content-Type:application/json" \ + -d '{"text":"This is example"}' +``` + +Then start the embedding service with `MMEI_EMBEDDING_ENDPOINT` set to the MMEI endpoint. + +```bash +# run with langchain +cd multimodal_langchain +export MMEI_EMBEDDING_ENDPOINT="http://localhost:$your_mmei_port/v1/encode" +export your_embedding_port_microservice=6600 +export MM_EMBEDDING_PORT_MICROSERVICE=$your_embedding_port_microservice +python mm_embedding_mmei.py +``` + +**Start Embedding Service with Local Model** + +```bash +# run with langchain +cd multimodal_langchain +export your_embedding_port_microservice=6600 +export MM_EMBEDDING_PORT_MICROSERVICE=$your_embedding_port_microservice +python local_mm_embedding.py +``` + +## πŸš€2. 
Start Microservice with Docker (Option 2) + +### 2.1 Start Multimodal Embedding Inference (MMEI) Service + +First, you need to start a MMEI service. + +```bash +export your_mmei_port=8080 +export EMBEDDER_PORT=$your_mmei_port +``` + +Currently, we employ [**BridgeTower**](https://huggingface.co/BridgeTower/bridgetower-large-itm-mlm-gaudi) model for MMEI and provide two ways to start MMEI: + +1. Start MMEI on Gaudi2 HPU +2. Start MMEI on Xeon CPU (if Gaudi2 HPU is not available) + +- Gaudi2 HPU + +```bash +cd ../../.. +docker build -t opea/bridgetower-embedder:latest --build-arg EMBEDDER_PORT=$EMBEDDER_PORT --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/embeddings/multimodal_embeddings/bridgetower/docker/Dockerfile_hpu . +cd comps/embeddings/multimodal_embeddings/bridgetower/docker/ +docker compose -f docker_compose_bridgetower_embedding_endpoint.yaml up -d +``` + +- Xeon CPU + +```bash +cd ../../.. +docker build -t opea/bridgetower-embedder:latest --build-arg EMBEDDER_PORT=$EMBEDDER_PORT --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/embeddings/multimodal_embeddings/bridgetower/docker/Dockerfile . +cd comps/embeddings/multimodal_embeddings/bridgetower/docker/ +docker compose -f docker_compose_bridgetower_embedding_endpoint.yaml up -d +``` + +Then you need to test your MMEI service using the following commands: + +```bash +curl http://localhost:$your_mmei_port/v1/encode \ + -X POST \ + -H "Content-Type:application/json" \ + -d '{"text":"This is example"}' +``` + +Export the `MMEI_EMBEDDING_ENDPOINT` for later usage: + +```bash +export ip_address=$(hostname -I | awk '{print $1}') +export MMEI_EMBEDDING_ENDPOINT="http://$ip_address:$your_mmei_port/v1/encode" +``` + +### 2.2 Build Docker Image + +#### Build Langchain Docker + +```bash +cd ../../.. +docker build -t opea/embedding-multimodal:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/embeddings/multimodal_embeddings/multimodal_langchain/docker/Dockerfile . +``` + +### 2.3 Run Docker with Docker Compose + +```bash +cd multimodal_langchain/docker +export your_embedding_port_microservice=6600 +export MM_EMBEDDING_PORT_MICROSERVICE=$your_embedding_port_microservice +docker compose -f docker_compose_multimodal_embedding.yaml up -d +``` + +## πŸš€3. 
Consume Embedding Service + +### 2.2 Consume Embedding Service + +**Compute a joint embedding of an image-text pair** + +```bash +curl -X POST http://0.0.0.0:6600/v1/embeddings \ + -H "Content-Type: application/json" \ + -d '{"text": {"text" : "This is some sample text."}, "image" : {"url": "https://github.com/docarray/docarray/blob/main/tests/toydata/image-data/apple.png?raw=true"}}' +``` + +**Compute an embedding of a text** + +```bash +curl -X POST http://0.0.0.0:6600/v1/embeddings \ + -H "Content-Type: application/json" \ + -d '{"text" : "This is some sample text."}' +``` diff --git a/comps/embeddings/multimodal_embeddings/__init__.py b/comps/embeddings/multimodal_embeddings/__init__.py new file mode 100644 index 0000000000..916f3a44b2 --- /dev/null +++ b/comps/embeddings/multimodal_embeddings/__init__.py @@ -0,0 +1,2 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 diff --git a/comps/embeddings/multimodal_embeddings/bridgetower/__init__.py b/comps/embeddings/multimodal_embeddings/bridgetower/__init__.py new file mode 100644 index 0000000000..e64366189a --- /dev/null +++ b/comps/embeddings/multimodal_embeddings/bridgetower/__init__.py @@ -0,0 +1,5 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +from .bridgetower_embedding import BridgeTowerEmbedding +from .bridgetower_custom import BridgeTowerTextFeatureExtractor, BridgeTowerForITC diff --git a/comps/embeddings/multimodal_embeddings/bridgetower/bridgetower_custom.py b/comps/embeddings/multimodal_embeddings/bridgetower/bridgetower_custom.py new file mode 100644 index 0000000000..0a89c3fa9a --- /dev/null +++ b/comps/embeddings/multimodal_embeddings/bridgetower/bridgetower_custom.py @@ -0,0 +1,243 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +from collections import OrderedDict +from typing import List, Optional, Tuple, Union + +import torch +import torch.nn.functional as F +from torch import nn +from torchvision import transforms +from torchvision.transforms import CenterCrop, Compose, Normalize, Resize, ToTensor +from transformers import BridgeTowerModel, BridgeTowerPreTrainedModel +from transformers.modeling_outputs import SequenceClassifierOutput +from transformers.models.bridgetower.modeling_bridgetower import ( + BridgeTowerContrastiveHead, + BridgeTowerTextModel, + BridgeTowerVisionModel, +) + + +class LayerNorm(nn.LayerNorm): + """Subclass torch's LayerNorm to handle fp16.""" + + def forward(self, x: torch.Tensor): + orig_type = x.dtype + ret = super().forward(x.type(torch.float32)) + return ret.type(orig_type) + + +class BridgeTowerImageFeatureExtractor(nn.Module): + def __init__( + self, + patch_size=14, + width=1024, + resolution_after=294, + ckpt_path=None, + ): + super().__init__() + + self.conv1 = nn.Conv2d(in_channels=3, out_channels=width, kernel_size=patch_size, stride=patch_size, bias=False) + + scale = width**-0.5 + self.class_embedding = nn.Parameter(scale * torch.randn(width)) + self.positional_embedding = nn.Parameter(scale * torch.randn((resolution_after // patch_size) ** 2 + 1, width)) + self.ln_pre = LayerNorm(width) + + if ckpt_path is not None: + sd = torch.load(ckpt_path) + if "state_dict" in sd: + sd = sd["state_dict"] + print(f"Loading feature extractor checkpoint from {ckpt_path}") + self.load_state_dict(sd) + + def forward(self, x: torch.Tensor): + x = self.conv1(x) # shape = [*, width, grid, grid] + x = x.reshape(x.shape[0], x.shape[1], -1) # shape = [*, width, grid ** 2] + x = x.permute(0, 2, 1) # 
shape = [*, grid ** 2, width] + t = self.class_embedding.to(x.dtype) + torch.zeros(x.shape[0], 1, x.shape[-1], dtype=x.dtype, device=x.device) + x = torch.cat([t, x], dim=1) # shape = [*, grid ** 2 + 1, width] + x = x + self.positional_embedding.to(x.dtype) + x = self.ln_pre(x) + x = x.permute(1, 0, 2) # NLD -> LND + return x + + +class BridgeTowerITCHead(nn.Module): + def __init__(self, hidden_size, embed_size): + super().__init__() + self.fc = nn.Linear(hidden_size, embed_size) + + def forward(self, x): + x = self.fc(x) + return x + + +class _BridgeTowerTextModelWrapper(nn.Module): + def __init__(self, config): + super().__init__() + self.text_model = BridgeTowerTextModel(config) + + def forward(self, **kwargs): + return self.text_model(**kwargs) + + +class _BridgeTowerVisionModelWrapper(nn.Module): + def __init__(self, config): + super().__init__() + self.vision_model = BridgeTowerVisionModel(config.vision_config) + + if config.share_cross_modal_transformer_layers: + self.cross_modal_image_transform = nn.Linear(config.vision_config.hidden_size, config.hidden_size) + else: + self.cross_modal_image_transform = nn.ModuleList( + [ + nn.Linear(config.vision_config.hidden_size, config.hidden_size) + for _ in range(config.num_hidden_layers) + ] + ) + self.token_type_embeddings = nn.Embedding(2, config.hidden_size) + + def forward(self, **kwargs): + return self.vision_model(**kwargs) + + +class BridgeTowerVisionFeatureExtractor(BridgeTowerPreTrainedModel): + def __init__(self, config): + super().__init__(config) + + self.bridgetower = _BridgeTowerVisionModelWrapper(config) + self.itc_image_head = BridgeTowerContrastiveHead(config.hidden_size, config.contrastive_hidden_size) + + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.FloatTensor] = None, + token_type_ids: Optional[torch.LongTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + labels: Optional[torch.LongTensor] = None, + ): + + outputs = self.bridgetower(input_ids=input_ids, attention_mask=attention_mask, output_hidden_states=True) + final_hidden_cls = outputs.hidden_states[-1][:, 0, :] + + image_embeds_with_ln = self.bridgetower.vision_model.visual.forward_post(final_hidden_cls) + image_token_type_embeddings = self.bridgetower.token_type_embeddings( + torch.full((1,), 1, dtype=torch.long, device=self.bridgetower.token_type_embeddings.weight.device) + ).expand_as(image_embeds_with_ln) + + image_embeds = self.bridgetower.cross_modal_image_transform(image_embeds_with_ln) + image_token_type_embeddings + + final_hidden_cls = F.normalize(self.itc_image_head(image_embeds), dim=-1, p=2) + + return final_hidden_cls + + +class BridgeTowerTextFeatureExtractor(BridgeTowerPreTrainedModel): + def __init__(self, config): + super().__init__(config) + + self.bridgetower = _BridgeTowerTextModelWrapper(config.text_config) + self.itc_text_head = BridgeTowerITCHead(config.hidden_size, config.contrastive_hidden_size) + + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.FloatTensor] = None, + token_type_ids: Optional[torch.LongTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] 
= None, + labels: Optional[torch.LongTensor] = None, + ): + + outputs = self.bridgetower(input_ids=input_ids, attention_mask=attention_mask, output_hidden_states=True) + final_hidden_cls = outputs.hidden_states[-1][:, 0, :] + final_hidden_cls = F.normalize(self.itc_text_head(final_hidden_cls), dim=-1, p=2) + + return final_hidden_cls + + +class BridgeTowerForITC(BridgeTowerPreTrainedModel): + def __init__(self, config): + super().__init__(config) + + self.bridgetower = BridgeTowerModel(config) + + self.itc_text_head = BridgeTowerITCHead(config.hidden_size, config.contrastive_hidden_size) + self.itc_image_head = BridgeTowerITCHead(config.hidden_size, config.contrastive_hidden_size) + self.itc_cross_modal_head = BridgeTowerITCHead(config.hidden_size * 2, config.contrastive_hidden_size) + + # Initialize weights and apply final processing + self.post_init() + + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.FloatTensor] = None, + token_type_ids: Optional[torch.LongTensor] = None, + pixel_values: Optional[torch.FloatTensor] = None, + pixel_mask: Optional[torch.LongTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + image_embeds: Optional[torch.FloatTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + labels: Optional[torch.LongTensor] = None, + ) -> Union[SequenceClassifierOutput, Tuple[torch.FloatTensor]]: + + assert output_hidden_states, "output_hidden_states should be set to True for BridgeTowerForITC" + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.bridgetower( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + pixel_values=pixel_values, + pixel_mask=pixel_mask, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + image_embeds=image_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + pooler_output = outputs.pooler_output if return_dict else outputs[2] + + hidden_states_txt, hidden_states_img, hidden_states_cross_modal = outputs.hidden_states + + final_hidden_txt = hidden_states_txt[-1] + final_hidden_img = hidden_states_img[-1] + + image_embeds_with_ln = self.bridgetower.vision_model.visual.forward_post(final_hidden_img) + image_token_type_embeddings = self.bridgetower.token_type_embeddings( + torch.full((1,), 1, dtype=torch.long, device=self.bridgetower.token_type_embeddings.weight.device) + ).expand_as(image_embeds_with_ln) + + final_hidden_img = ( + self.bridgetower.cross_modal_image_transform(image_embeds_with_ln) + image_token_type_embeddings + ) + + final_hidden_txt = F.normalize(self.itc_text_head(final_hidden_txt[:, 0, :]), dim=-1, p=2) + final_hidden_img = F.normalize(self.itc_image_head(final_hidden_img[:, 0, :]), dim=-1, p=2) + final_hidden_cross = F.normalize(self.itc_cross_modal_head(pooler_output), dim=-1, p=2) + + logits = torch.stack([final_hidden_txt, final_hidden_img, final_hidden_cross], dim=-2) + + if not return_dict: + return tuple(logits) + + return SequenceClassifierOutput( + loss=None, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) diff --git a/comps/embeddings/multimodal_embeddings/bridgetower/bridgetower_embedding.py b/comps/embeddings/multimodal_embeddings/bridgetower/bridgetower_embedding.py new file mode 100644 index 0000000000..f61d8e1c33 
--- /dev/null +++ b/comps/embeddings/multimodal_embeddings/bridgetower/bridgetower_embedding.py @@ -0,0 +1,122 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +from typing import Any, List + +import torch +from langchain_core.embeddings import Embeddings +from langchain_core.pydantic_v1 import BaseModel, Extra +from PIL import Image +from transformers import BridgeTowerProcessor + +from .bridgetower_custom import BridgeTowerForITC, BridgeTowerTextFeatureExtractor + + +class BridgeTowerEmbedding(BaseModel, Embeddings): + """BridgeTower embedding model.""" + + model_name: str = "BridgeTower/bridgetower-large-itm-mlm-itc" + device: str = "cpu" + TEXT_MODEL: Any + PROCESSOR: Any + MODEL: Any + + def __init__(self, **kwargs: Any): + """Initialize the BridgeTowerEmbedding class.""" + super().__init__(**kwargs) + + if "device" in kwargs: + if kwargs["device"] == "hpu": + try: + import habana_frameworks.torch.core as htcore + + self.device = torch.device("hpu") + except ImportError: + self.device = "cpu" + elif kwargs["device"] == "gpu": + if torch.cuda.is_available(): + self.device = "cuda" + else: + self.device = "cpu" + + self.TEXT_MODEL = BridgeTowerTextFeatureExtractor.from_pretrained(self.model_name).to(self.device) + self.PROCESSOR = BridgeTowerProcessor.from_pretrained(self.model_name) + self.MODEL = BridgeTowerForITC.from_pretrained(self.model_name).to(self.device) + + class Config: + """Configuration for this pydantic object.""" + + extra = Extra.forbid + + def embed_documents(self, texts: List[str]) -> List[List[float]]: + """Embed a list of documents using BridgeTower. + + Args: + texts: The list of texts to embed. + Returns: + List of embeddings, one for each text. + """ + encodings = self.PROCESSOR.tokenizer(texts, return_tensors="pt").to(self.device) + with torch.no_grad(): + outputs = self.TEXT_MODEL(**encodings) + embeddings = outputs.cpu().numpy().tolist() + return embeddings + + def embed_query(self, text: str) -> List[float]: + """Embed a query using BridgeTower. + + Args: + text: The text to embed. + Returns: + Embeddings for the text. + """ + return self.embed_documents([text])[0] + + def embed_image_text_pairs(self, texts: List[str], images: list[Image], batch_size=2) -> List[List[float]]: # type: ignore + """Embed a list of image-text pairs using BridgeTower. + + Args: + texts: The list of texts to embed. + images: The list of path-to-images to embed + batch_size: the batch size to process, default to 2 + Returns: + List of embeddings, one for each image-text pairs. 
+ """ + + # the length of texts must be equal to the length of images + assert len(texts) == len(images), "the number of captions should be equal to the number of images" + + image_list = [] + text_list = [] + embeddings = [] + for pil_img, text in zip(images, texts): + # print(path_to_img) + # img = read_image(path_to_img, mode=ImageReadMode.RGB) + # img = transform.to_pil_image(img) + + img = pil_img.convert("RGB") + image_list.append(img) + text_list.append(text) + if len(text_list) == batch_size: + batch = self.PROCESSOR( + image_list, text_list, return_tensors="pt", max_length=200, padding="max_length", truncation=True + ).to(self.device) + with torch.no_grad(): + batch_embeddings = self.MODEL(**batch, output_hidden_states=True) + + for i in range(len(text_list)): + embeddings.append(batch_embeddings.logits[i, 2, :].detach().cpu().numpy().tolist()) + image_list = [] + text_list = [] + # embedding the remaining + if len(text_list) > 0: + batch = self.PROCESSOR( + image_list, text_list, return_tensors="pt", max_length=100, padding="max_length", truncation=True + ).to(self.device) + with torch.no_grad(): + batch_embeddings = self.MODEL(**batch, output_hidden_states=True) + for i in range(len(text_list)): + embeddings.append(batch_embeddings.logits[i, 2, :].detach().cpu().numpy().tolist()) + image_list = [] + text_list = [] + return embeddings diff --git a/comps/embeddings/multimodal_embeddings/bridgetower/bridgetower_server.py b/comps/embeddings/multimodal_embeddings/bridgetower/bridgetower_server.py new file mode 100644 index 0000000000..62e70c74fe --- /dev/null +++ b/comps/embeddings/multimodal_embeddings/bridgetower/bridgetower_server.py @@ -0,0 +1,153 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +import argparse +import asyncio +import base64 +import os +import uuid +from functools import partial +from io import BytesIO +from typing import List + +import PIL +import PIL.Image +import requests +import uvicorn +from fastapi import BackgroundTasks, FastAPI, Request +from fastapi.responses import JSONResponse, Response +from utils import build_logger + +from comps.embeddings.multimodal_embeddings.bridgetower import BridgeTowerEmbedding + +worker_id = str(uuid.uuid4())[:6] +print(f"worker_id: {worker_id}") +logger = build_logger("embedding_worker", f"bridgetower_embedding_worker_{worker_id}.log") +model_semaphore = None +global_counter = 0 + +model_name_or_path = None +model_dtype = None +use_hpu_graphs = True + + +app = FastAPI() + + +def release_model_semaphore(fn=None): + model_semaphore.release() + if fn is not None: + fn() + + +def get_queue_length(): + if model_semaphore is None: + return 0 + else: + return ( + args.limit_model_concurrency + - model_semaphore._value + + (len(model_semaphore._waiters) if model_semaphore._waiters is not None else 0) + ) + + +def get_status(): + return { + "model_names": [model_name_or_path], + "speed": 1, + "queue_length": get_queue_length(), + "global_counter": global_counter, + } + + +@app.get("/v1/health_check") +async def health() -> Response: + """Health check.""" + return Response(status_code=200, content=b'{"message" : "BridgeTower server is running..."}') + + +@app.post("/v1/encode") +async def encode(request: Request) -> Response: + global model_semaphore, global_counter + global_counter += 1 + + request_dict = await request.json() + if model_semaphore is None: + model_semaphore = asyncio.Semaphore(args.limit_model_concurrency) + await model_semaphore.acquire() + + text = request_dict.pop("text") + image 
= None + if "img_b64_str" in request_dict.keys(): + img_b64_str = request_dict.pop("img_b64_str") + image = PIL.Image.open(BytesIO(base64.b64decode(img_b64_str))) + if image is None: + # embed text only + embeddings = embedder.embed_documents([text])[0] + else: + # embed image and text pair + embeddings = embedder.embed_image_text_pairs([text], [image], batch_size=1)[0] + + background_tasks = BackgroundTasks() + background_tasks.add_task(partial(release_model_semaphore)) + return JSONResponse( + status_code=200, + content={ + "embedding": embeddings, + }, + background=background_tasks, + ) + + +@app.post("/v1/worker_get_status") +async def get_woker_status(): + return get_status() + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--host", type=str, default="0.0.0.0") + parser.add_argument("--model_name_or_path", type=str, default="BridgeTower/bridgetower-large-itm-mlm-itc") + parser.add_argument("--warmup", type=int, default=1, help="Number of warmup iterations for benchmarking.") + parser.add_argument("--device", type=str, default="cpu") + parser.add_argument("--limit-model-concurrency", type=int, default=5) + + args = parser.parse_args() + # get port from env variable if exist + args.port = int(os.getenv("PORT", 8080)) + + print(f"device: {args.device}") + logger.info(f"args: {args}") + + if args.device == "hpu": + try: + import habana_frameworks.torch.core as htcore + except ImportError: + print("device: hpu is not available. Using cpu instead!") + args.device = "cpu" + + model_name_or_path = args.model_name_or_path + + embedder = BridgeTowerEmbedding(device=args.device) + + # warmup + print("Warmup...") + image_paths = ["https://llava-vl.github.io/static/images/view.jpg"] + example_prompts = ["This is test image!"] + images = [] + for image_path in image_paths: + images.append(PIL.Image.open(requests.get(image_path, stream=True, timeout=3000).raw)) + for i in range(args.warmup): + embedder.embed_image_text_pairs( + example_prompts, + images, + batch_size=1, + ) + print("Done warmup...") + + uvicorn.run( + app, + host=args.host, + port=args.port, + log_level="debug", + ) diff --git a/comps/embeddings/multimodal_embeddings/bridgetower/docker/Dockerfile b/comps/embeddings/multimodal_embeddings/bridgetower/docker/Dockerfile new file mode 100644 index 0000000000..83cd41ae18 --- /dev/null +++ b/comps/embeddings/multimodal_embeddings/bridgetower/docker/Dockerfile @@ -0,0 +1,25 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +FROM python:3.10-slim +RUN useradd -m -s /bin/bash user && \ + mkdir -p /home/user && \ + chown -R user /home/user/ +USER user +# Set environment variables +ENV LANG=en_US.UTF-8 +ENV PYTHONPATH=/home/user:/usr/lib/habanalabs/:/optimum-habana + +COPY --chown=user comps /home/user/comps + +RUN pip install --no-cache-dir --upgrade pip && \ + pip install --no-cache-dir -r /home/user/comps/embeddings/multimodal_embeddings/multimodal_langchain/requirements.txt + +ENV PYTHONPATH=$PYTHONPATH:/home/user + +ARG EMBEDDER_PORT=8080 +ENV PORT=$EMBEDDER_PORT + +WORKDIR /home/user/comps/embeddings/multimodal_embeddings/bridgetower + +ENTRYPOINT ["python", "bridgetower_server.py", "--device", "cpu"] \ No newline at end of file diff --git a/comps/embeddings/multimodal_embeddings/bridgetower/docker/Dockerfile_hpu b/comps/embeddings/multimodal_embeddings/bridgetower/docker/Dockerfile_hpu new file mode 100644 index 0000000000..e571ab2538 --- /dev/null +++ 
b/comps/embeddings/multimodal_embeddings/bridgetower/docker/Dockerfile_hpu @@ -0,0 +1,29 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +# HABANA environment +FROM vault.habana.ai/gaudi-docker/1.16.1/ubuntu22.04/habanalabs/pytorch-installer-2.2.2:latest AS hpu +RUN useradd -m -s /bin/bash user && \ + mkdir -p /home/user && \ + chown -R user /home/user/ + +RUN rm -rf /etc/ssh/ssh_host* +USER user +# Set environment variables +ENV LANG=en_US.UTF-8 +ENV PYTHONPATH=/home/user:/usr/lib/habanalabs/:/optimum-habana + +COPY --chown=user comps /home/user/comps + +# Install requirements and optimum habana +RUN pip install --no-cache-dir --upgrade pip && \ + pip install --no-cache-dir -r /home/user/comps/embeddings/multimodal_embeddings/multimodal_langchain/requirements.txt && \ + pip install optimum[habana] + +ENV PYTHONPATH=$PYTHONPATH:/home/user + +ARG EMBEDDER_PORT=8080 +ENV PORT=$EMBEDDER_PORT + +WORKDIR /home/user/comps/embeddings/multimodal_embeddings/bridgetower +ENTRYPOINT ["python", "bridgetower_server.py", "--device", "hpu"] \ No newline at end of file diff --git a/comps/embeddings/multimodal_embeddings/bridgetower/docker/docker_compose_bridgetower_embedding_endpoint.yaml b/comps/embeddings/multimodal_embeddings/bridgetower/docker/docker_compose_bridgetower_embedding_endpoint.yaml new file mode 100644 index 0000000000..9767490d0a --- /dev/null +++ b/comps/embeddings/multimodal_embeddings/bridgetower/docker/docker_compose_bridgetower_embedding_endpoint.yaml @@ -0,0 +1,19 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +services: + bridgetower: + image: opea/bridgetower-embedder:latest + container_name: bridgetower-embedding-server + ports: + - ${EMBEDDER_PORT}:${EMBEDDER_PORT} + ipc: host + environment: + no_proxy: ${no_proxy} + http_proxy: ${http_proxy} + https_proxy: ${https_proxy} + restart: unless-stopped + +networks: + default: + driver: bridge diff --git a/comps/embeddings/multimodal_embeddings/bridgetower/utils.py b/comps/embeddings/multimodal_embeddings/bridgetower/utils.py new file mode 100644 index 0000000000..673d54dbcc --- /dev/null +++ b/comps/embeddings/multimodal_embeddings/bridgetower/utils.py @@ -0,0 +1,90 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +import logging +import os +import sys + +handler = None +save_log = True +LOGDIR = "." 
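+# Module-level logging state (descriptive note): build_logger() creates the daily-rotating file handler once, stores it in `handler`, and attaches it to every logger when save_log is True, so all records land in a single log file under LOGDIR.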
+ + +def build_logger(logger_name, logger_filename): + global handler + + formatter = logging.Formatter( + fmt="%(asctime)s | %(levelname)s | %(name)s | %(message)s", + datefmt="%Y-%m-%d %H:%M:%S", + ) + + # Set the format of root handlers + if not logging.getLogger().handlers: + logging.basicConfig(level=logging.INFO) + logging.getLogger().handlers[0].setFormatter(formatter) + + # Redirect stdout and stderr to loggers + stdout_logger = logging.getLogger("stdout") + stdout_logger.setLevel(logging.INFO) + sl = StreamToLogger(stdout_logger, logging.INFO) + sys.stdout = sl + + stderr_logger = logging.getLogger("stderr") + stderr_logger.setLevel(logging.ERROR) + sl = StreamToLogger(stderr_logger, logging.ERROR) + sys.stderr = sl + + # Get logger + logger = logging.getLogger(logger_name) + logger.setLevel(logging.INFO) + + # Add a file handler for all loggers + if handler is None and save_log: + os.makedirs(LOGDIR, exist_ok=True) + filename = os.path.join(LOGDIR, logger_filename) + handler = logging.handlers.TimedRotatingFileHandler(filename, when="D", utc=True, encoding="UTF-8") + handler.setFormatter(formatter) + + for name, item in logging.root.manager.loggerDict.items(): + if isinstance(item, logging.Logger): + item.addHandler(handler) + + return logger + + +class StreamToLogger(object): + """Fake file-like stream object that redirects writes to a logger instance.""" + + def __init__(self, logger, log_level=logging.INFO): + self.terminal = sys.stdout + self.logger = logger + self.log_level = log_level + self.linebuf = "" + + def __getattr__(self, attr): + return getattr(self.terminal, attr) + + def write(self, buf): + temp_linebuf = self.linebuf + buf + self.linebuf = "" + for line in temp_linebuf.splitlines(True): + # From the io.TextIOWrapper docs: + # On output, if newline is None, any '\n' characters written + # are translated to the system default line separator. + # By default sys.stdout.write() expects '\n' newlines and then + # translates them so this is still cross platform. 
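+ # Emit complete lines immediately; buffer any trailing partial line in self.linebuf until the next write() or flush().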
+ if line[-1] == "\n": + self.logger.log(self.log_level, line.rstrip()) + else: + self.linebuf += line + + def flush(self): + if self.linebuf != "": + self.logger.log(self.log_level, self.linebuf.rstrip()) + self.linebuf = "" + + +def pretty_print_semaphore(semaphore): + if semaphore is None: + return "None" + return f"Semaphore(value={semaphore._value}, locked={semaphore.locked()})" diff --git a/comps/embeddings/multimodal_embeddings/multimodal_langchain/__init__.py b/comps/embeddings/multimodal_embeddings/multimodal_langchain/__init__.py new file mode 100644 index 0000000000..916f3a44b2 --- /dev/null +++ b/comps/embeddings/multimodal_embeddings/multimodal_langchain/__init__.py @@ -0,0 +1,2 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 diff --git a/comps/embeddings/multimodal_embeddings/multimodal_langchain/docker/Dockerfile b/comps/embeddings/multimodal_embeddings/multimodal_langchain/docker/Dockerfile new file mode 100644 index 0000000000..97d5906ecb --- /dev/null +++ b/comps/embeddings/multimodal_embeddings/multimodal_langchain/docker/Dockerfile @@ -0,0 +1,29 @@ + +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +FROM langchain/langchain:latest + +RUN apt-get update -y && apt-get install -y --no-install-recommends --fix-missing \ + libgl1-mesa-glx \ + libjemalloc-dev \ + vim + +RUN useradd -m -s /bin/bash user && \ + mkdir -p /home/user && \ + chown -R user /home/user/ + +USER user + +COPY comps /home/user/comps + +RUN pip install --no-cache-dir --upgrade pip && \ + pip install --no-cache-dir -r /home/user/comps/embeddings/multimodal_embeddings/multimodal_langchain/requirements.txt + +# RUN pip install --upgrade pydantic + +ENV PYTHONPATH=$PYTHONPATH:/home/user + +WORKDIR /home/user/comps/embeddings/multimodal_embeddings/multimodal_langchain + +ENTRYPOINT ["python", "mm_embedding_mmei.py"] diff --git a/comps/embeddings/multimodal_embeddings/multimodal_langchain/docker/docker_compose_multimodal_embedding.yaml b/comps/embeddings/multimodal_embeddings/multimodal_langchain/docker/docker_compose_multimodal_embedding.yaml new file mode 100644 index 0000000000..314233f931 --- /dev/null +++ b/comps/embeddings/multimodal_embeddings/multimodal_langchain/docker/docker_compose_multimodal_embedding.yaml @@ -0,0 +1,21 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +services: + embedding: + image: opea/embedding-multimodal:latest + container_name: embedding-multimodal-server + ports: + - ${MM_EMBEDDING_PORT_MICROSERVICE}:${MM_EMBEDDING_PORT_MICROSERVICE} + ipc: host + environment: + no_proxy: ${no_proxy} + http_proxy: ${http_proxy} + https_proxy: ${https_proxy} + MMEI_EMBEDDING_ENDPOINT: ${MMEI_EMBEDDING_ENDPOINT} + MM_EMBEDDING_PORT_MICROSERVICE: ${MM_EMBEDDING_PORT_MICROSERVICE} + restart: unless-stopped + +networks: + default: + driver: bridge diff --git a/comps/embeddings/multimodal_embeddings/multimodal_langchain/local_mm_embedding.py b/comps/embeddings/multimodal_embeddings/multimodal_langchain/local_mm_embedding.py new file mode 100644 index 0000000000..7327284a8f --- /dev/null +++ b/comps/embeddings/multimodal_embeddings/multimodal_langchain/local_mm_embedding.py @@ -0,0 +1,58 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +import os + +from comps import ( + CustomLogger, + EmbedDoc, + EmbedMultimodalDoc, + MultimodalDoc, + ServiceType, + TextDoc, + TextImageDoc, + opea_microservices, + register_microservice, +) +from 
comps.embeddings.multimodal_embeddings.bridgetower import BridgeTowerEmbedding + +logger = CustomLogger("local_multimodal_embedding") +logflag = os.getenv("LOGFLAG", False) + +port = int(os.getenv("MM_EMBEDDING_PORT_MICROSERVICE", 6600)) + + +@register_microservice( + name="opea_service@local_multimodal_embedding", + service_type=ServiceType.EMBEDDING, + endpoint="/v1/embeddings", + host="0.0.0.0", + port=port, + input_datatype=MultimodalDoc, + output_datatype=EmbedMultimodalDoc, +) +def embedding(input: MultimodalDoc) -> EmbedDoc: + if logflag: + logger.info(input) + + if isinstance(input, TextDoc): + # Handle text input + embed_vector = embeddings.embed_query(input.text) + res = EmbedDoc(text=input.text, embedding=embed_vector) + + elif isinstance(input, TextImageDoc): + # Handle text + image input + pil_image = input.image.url.load_pil() + embed_vector = embeddings.embed_image_text_pairs([input.text.text], [pil_image], batch_size=1)[0] + res = EmbedMultimodalDoc(text=input.text.text, url=input.image.url, embedding=embed_vector) + else: + raise ValueError("Invalid input type") + + if logflag: + logger.info(res) + return res + + +if __name__ == "__main__": + embeddings = BridgeTowerEmbedding() + opea_microservices["opea_service@local_multimodal_embedding"].start() diff --git a/comps/embeddings/multimodal_embeddings/multimodal_langchain/mm_embedding_mmei.py b/comps/embeddings/multimodal_embeddings/multimodal_langchain/mm_embedding_mmei.py new file mode 100644 index 0000000000..fbd972a202 --- /dev/null +++ b/comps/embeddings/multimodal_embeddings/multimodal_langchain/mm_embedding_mmei.py @@ -0,0 +1,84 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +import base64 +import os +import time + +import requests +from fastapi.responses import JSONResponse + +from comps import ( + CustomLogger, + EmbedDoc, + EmbedMultimodalDoc, + MultimodalDoc, + ServiceType, + TextDoc, + TextImageDoc, + opea_microservices, + register_microservice, + register_statistics, + statistics_dict, +) + +logger = CustomLogger("multimodal_embedding_mmei_langchain") +logflag = os.getenv("LOGFLAG", False) +port = int(os.getenv("MM_EMBEDDING_PORT_MICROSERVICE", 6600)) +headers = {"Content-Type": "application/json"} + + +@register_microservice( + name="opea_service@multimodal_embedding_mmei_langchain", + service_type=ServiceType.EMBEDDING, + endpoint="/v1/embeddings", + host="0.0.0.0", + port=port, + input_datatype=MultimodalDoc, + output_datatype=EmbedMultimodalDoc, +) +@register_statistics(names=["opea_service@multimodal_embedding_mmei_langchain"]) +def embedding(input: MultimodalDoc) -> EmbedDoc: + start = time.time() + if logflag: + logger.info(input) + + json = {} + if isinstance(input, TextDoc): + json["text"] = input.text + elif isinstance(input, TextImageDoc): + json["text"] = input.text.text + img_bytes = input.image.url.load_bytes() + base64_img = base64.b64encode(img_bytes).decode("utf-8") + json["img_b64_str"] = base64_img + else: + return JSONResponse(status_code=400, content={"message": "Bad request!"}) + + # call multimodal embedding endpoint + try: + response = requests.post(mmei_embedding_endpoint, headers=headers, json=json) + if response.status_code != 200: + return JSONResponse(status_code=503, content={"message": "Multimodal embedding endpoint failed!"}) + + response_json = response.json() + embed_vector = response_json["embedding"] + if isinstance(input, TextDoc): + res = EmbedDoc(text=input.text, embedding=embed_vector) + elif isinstance(input, TextImageDoc): + res = 
EmbedMultimodalDoc(text=input.text.text, url=input.image.url, embedding=embed_vector) + except requests.exceptions.ConnectionError: + res = JSONResponse(status_code=503, content={"message": "Multimodal embedding endpoint not started!"}) + statistics_dict["opea_service@multimodal_embedding_mmei_langchain"].append_latency(time.time() - start, None) + if logflag: + logger.info(res) + return res + + +if __name__ == "__main__": + url_endpoint = os.getenv("MMEI_EMBEDDING_HOST_ENDPOINT", "http://0.0.0.0") + port_endpoint = os.getenv("MMEI_EMBEDDING_PORT_ENDPOINT", "8080") + path_endpoint = os.getenv("MMEI_EMBEDDING_PATH_ENDPOINT", "/v1/encode") + + mmei_embedding_endpoint = os.getenv("MMEI_EMBEDDING_ENDPOINT", f"{url_endpoint}:{port_endpoint}{path_endpoint}") + logger.info(f"MMEI Gaudi Embedding initialized at {mmei_embedding_endpoint}") + opea_microservices["opea_service@multimodal_embedding_mmei_langchain"].start() diff --git a/comps/embeddings/multimodal_embeddings/multimodal_langchain/requirements.txt b/comps/embeddings/multimodal_embeddings/multimodal_langchain/requirements.txt new file mode 100644 index 0000000000..cc9d77a432 --- /dev/null +++ b/comps/embeddings/multimodal_embeddings/multimodal_langchain/requirements.txt @@ -0,0 +1,14 @@ +docarray[full] +fastapi +huggingface_hub +langchain +opentelemetry-api +opentelemetry-exporter-otlp +opentelemetry-sdk +prometheus-fastapi-instrumentator +pydantic==2.8.2 +shortuuid +torch +torchvision +transformers +uvicorn diff --git a/comps/finetuning/README.md b/comps/finetuning/README.md index 411395ec95..44ee3d10ca 100644 --- a/comps/finetuning/README.md +++ b/comps/finetuning/README.md @@ -61,7 +61,7 @@ docker build -t opea/finetuning:latest --build-arg https_proxy=$https_proxy --bu Start docker container with below command: ```bash -docker run -d --name="finetuning-server" -p 8005:8005 --runtime=runc --ipc=host -e http_proxy=$http_proxy -e https_proxy=$https_proxy opea/finetuning:latest +docker run -d --name="finetuning-server" -p 8015:8015 --runtime=runc --ipc=host -e http_proxy=$http_proxy -e https_proxy=$https_proxy opea/finetuning:latest ``` ## 2.2 Setup on Gaudi2 @@ -81,7 +81,7 @@ Start docker container with below command: ```bash export HF_TOKEN=${your_huggingface_token} -docker run --runtime=habana -e HABANA_VISIBLE_DEVICES=all -p 8005:8005 -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --net=host --ipc=host -e https_proxy=$https_proxy -e http_proxy=$http_proxy -e no_proxy=$no_proxy -e HF_TOKEN=$HF_TOKEN opea/finetuning-gaudi:latest +docker run --runtime=habana -e HABANA_VISIBLE_DEVICES=all -p 8015:8015 -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --net=host --ipc=host -e https_proxy=$https_proxy -e http_proxy=$http_proxy -e no_proxy=$no_proxy -e HF_TOKEN=$HF_TOKEN opea/finetuning-gaudi:latest ``` # πŸš€3. 
Consume Finetuning Service @@ -92,10 +92,10 @@ Assuming a training file `alpaca_data.json` is uploaded, it can be downloaded in ```bash # upload a training file -curl http://${your_ip}:8005/v1/finetune/upload_training_files -X POST -H "Content-Type: multipart/form-data" -F "files=@./alpaca_data.json" +curl http://${your_ip}:8015/v1/finetune/upload_training_files -X POST -H "Content-Type: multipart/form-data" -F "files=@./alpaca_data.json" # create a finetuning job -curl http://${your_ip}:8005/v1/fine_tuning/jobs \ +curl http://${your_ip}:8015/v1/fine_tuning/jobs \ -X POST \ -H "Content-Type: application/json" \ -d '{ @@ -104,18 +104,18 @@ curl http://${your_ip}:8005/v1/fine_tuning/jobs \ }' # list finetuning jobs -curl http://${your_ip}:8005/v1/fine_tuning/jobs -X GET +curl http://${your_ip}:8015/v1/fine_tuning/jobs -X GET # retrieve one finetuning job -curl http://localhost:8005/v1/fine_tuning/jobs/retrieve -X POST -H "Content-Type: application/json" -d '{ +curl http://localhost:8015/v1/fine_tuning/jobs/retrieve -X POST -H "Content-Type: application/json" -d '{ "fine_tuning_job_id": ${fine_tuning_job_id}}' # cancel one finetuning job -curl http://localhost:8005/v1/fine_tuning/jobs/cancel -X POST -H "Content-Type: application/json" -d '{ +curl http://localhost:8015/v1/fine_tuning/jobs/cancel -X POST -H "Content-Type: application/json" -d '{ "fine_tuning_job_id": ${fine_tuning_job_id}}' # list checkpoints of a finetuning job -curl http://${your_ip}:8005/v1/finetune/list_checkpoints -X POST -H "Content-Type: application/json" -d '{"fine_tuning_job_id": ${fine_tuning_job_id}}' +curl http://${your_ip}:8015/v1/finetune/list_checkpoints -X POST -H "Content-Type: application/json" -d '{"fine_tuning_job_id": ${fine_tuning_job_id}}' ``` diff --git a/comps/finetuning/datasets/.gitkeep b/comps/finetuning/datasets/.gitkeep deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/comps/finetuning/finetuning_service.py b/comps/finetuning/finetuning_service.py index fabb32bc40..031380a5d0 100644 --- a/comps/finetuning/finetuning_service.py +++ b/comps/finetuning/finetuning_service.py @@ -20,20 +20,20 @@ ) -@register_microservice(name="opea_service@finetuning", endpoint="/v1/fine_tuning/jobs", host="0.0.0.0", port=8005) +@register_microservice(name="opea_service@finetuning", endpoint="/v1/fine_tuning/jobs", host="0.0.0.0", port=8015) def create_finetuning_jobs(request: FineTuningJobsRequest, background_tasks: BackgroundTasks): return handle_create_finetuning_jobs(request, background_tasks) @register_microservice( - name="opea_service@finetuning", endpoint="/v1/fine_tuning/jobs", host="0.0.0.0", port=8005, methods=["GET"] + name="opea_service@finetuning", endpoint="/v1/fine_tuning/jobs", host="0.0.0.0", port=8015, methods=["GET"] ) def list_finetuning_jobs(): return handle_list_finetuning_jobs() @register_microservice( - name="opea_service@finetuning", endpoint="/v1/fine_tuning/jobs/retrieve", host="0.0.0.0", port=8005 + name="opea_service@finetuning", endpoint="/v1/fine_tuning/jobs/retrieve", host="0.0.0.0", port=8015 ) def retrieve_finetuning_job(request: FineTuningJobIDRequest): job = handle_retrieve_finetuning_job(request) @@ -41,7 +41,7 @@ def retrieve_finetuning_job(request: FineTuningJobIDRequest): @register_microservice( - name="opea_service@finetuning", endpoint="/v1/fine_tuning/jobs/cancel", host="0.0.0.0", port=8005 + name="opea_service@finetuning", endpoint="/v1/fine_tuning/jobs/cancel", host="0.0.0.0", port=8015 ) def cancel_finetuning_job(request: FineTuningJobIDRequest): job = 
handle_cancel_finetuning_job(request) @@ -52,7 +52,7 @@ def cancel_finetuning_job(request: FineTuningJobIDRequest): name="opea_service@finetuning", endpoint="/v1/finetune/upload_training_files", host="0.0.0.0", - port=8005, + port=8015, ) async def upload_training_files( files: Optional[Union[UploadFile, List[UploadFile]]] = File(None), @@ -69,7 +69,7 @@ async def upload_training_files( @register_microservice( - name="opea_service@finetuning", endpoint="/v1/finetune/list_checkpoints", host="0.0.0.0", port=8005 + name="opea_service@finetuning", endpoint="/v1/finetune/list_checkpoints", host="0.0.0.0", port=8015 ) def list_checkpoints(request: FineTuningJobIDRequest): checkpoints = handle_list_finetuning_checkpoints(request) diff --git a/comps/finetuning/handlers.py b/comps/finetuning/handlers.py index 2bdab42a92..6aa7e5d3e2 100644 --- a/comps/finetuning/handlers.py +++ b/comps/finetuning/handlers.py @@ -12,6 +12,7 @@ from pydantic_yaml import parse_yaml_raw_as, to_yaml_file from ray.job_submission import JobSubmissionClient +from comps import CustomLogger from comps.cores.proto.api_protocol import ( FineTuningJob, FineTuningJobIDRequest, @@ -20,6 +21,8 @@ ) from comps.finetuning.llm_on_ray.finetune.finetune_config import FinetuneConfig +logger = CustomLogger("finetuning_handlers") + MODEL_CONFIG_FILE_MAP = { "meta-llama/Llama-2-7b-chat-hf": "./models/llama-2-7b-chat-hf.yaml", "mistralai/Mistral-7B-v0.1": "./models/mistral-7b-v0.1.yaml", @@ -50,7 +53,7 @@ def update_job_status(job_id: FineTuningJobID): status = str(job_status).lower() # Ray status "stopped" is OpenAI status "cancelled" status = "cancelled" if status == "stopped" else status - print(f"Status of job {job_id} is '{status}'") + logger.info(f"Status of job {job_id} is '{status}'") running_finetuning_jobs[job_id].status = status if status == "finished" or status == "cancelled" or status == "failed": break @@ -102,7 +105,7 @@ def handle_create_finetuning_jobs(request: FineTuningJobsRequest, background_tas ) finetune_config.General.output_dir = os.path.join(JOBS_PATH, job.id) if os.getenv("DEVICE", ""): - print(f"specific device: {os.getenv('DEVICE')}") + logger.info(f"specific device: {os.getenv('DEVICE')}") finetune_config.Training.device = os.getenv("DEVICE") finetune_config_file = f"{JOBS_PATH}/{job.id}.yaml" @@ -117,7 +120,7 @@ def handle_create_finetuning_jobs(request: FineTuningJobsRequest, background_tas # Path to the local directory that contains the script.py file runtime_env={"working_dir": "./"}, ) - print(f"Submitted Ray job: {ray_job_id} ...") + logger.info(f"Submitted Ray job: {ray_job_id} ...") running_finetuning_jobs[job.id] = job finetuning_job_to_ray_job[job.id] = ray_job_id @@ -169,7 +172,7 @@ async def save_content_to_local_disk(save_path: str, content): content = await content.read() fout.write(content) except Exception as e: - print(f"Write file failed. Exception: {e}") + logger.info(f"Write file failed. Exception: {e}") raise Exception(status_code=500, detail=f"Write file {save_path} failed. 
Exception: {e}") diff --git a/comps/finetuning/jobs/.gitkeep b/comps/finetuning/jobs/.gitkeep deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/comps/finetuning/lanuch.sh b/comps/finetuning/launch.sh similarity index 68% rename from comps/finetuning/lanuch.sh rename to comps/finetuning/launch.sh index a7e249b6f3..bb5042ac6a 100644 --- a/comps/finetuning/lanuch.sh +++ b/comps/finetuning/launch.sh @@ -2,11 +2,11 @@ # SPDX-License-Identifier: Apache-2.0 if [[ -n "$RAY_PORT" ]];then - export RAY_ADDRESS=http://127.0.0.1:$RAY_PORT ray start --head --port $RAY_PORT else - export RAY_ADDRESS=http://127.0.0.1:8265 ray start --head + export RAY_PORT=8265 fi +export RAY_ADDRESS=http://127.0.0.1:$RAY_PORT python finetuning_service.py diff --git a/comps/finetuning/llm_on_ray/common/__init__.py b/comps/finetuning/llm_on_ray/common/__init__.py index a4ad1e878e..954b7baa4b 100644 --- a/comps/finetuning/llm_on_ray/common/__init__.py +++ b/comps/finetuning/llm_on_ray/common/__init__.py @@ -3,5 +3,4 @@ # # Copyright 2023 The LLM-on-Ray Authors. -from .logging import logger from .torch_config import TorchConfig diff --git a/comps/finetuning/llm_on_ray/common/common.py b/comps/finetuning/llm_on_ray/common/common.py index 136d2526f8..ac01ae12e1 100644 --- a/comps/finetuning/llm_on_ray/common/common.py +++ b/comps/finetuning/llm_on_ray/common/common.py @@ -7,7 +7,9 @@ import importlib import os -from .logging import logger +from comps import CustomLogger + +logger = CustomLogger("llm_on_ray") def import_all_modules(basedir, prefix=None): diff --git a/comps/finetuning/llm_on_ray/common/logging.py b/comps/finetuning/llm_on_ray/common/logging.py deleted file mode 100644 index e2aec567a2..0000000000 --- a/comps/finetuning/llm_on_ray/common/logging.py +++ /dev/null @@ -1,56 +0,0 @@ -# Copyright (C) 2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 -# -# Copyright 2023 The LLM-on-Ray Authors. 
- -import functools -import logging -import logging.config -import traceback - -__all__ = ["logger", "get_logger"] - -use_accelerate_log = False -logger_name = "common" - -logging_config = { - "version": 1, - "loggers": { - "root": {"level": "INFO", "handlers": ["consoleHandler"]}, - "common": { - "level": "INFO", - "handlers": ["consoleHandler"], - "qualname": "common", - "propagate": 0, - }, - }, - "handlers": { - "consoleHandler": { - "class": "logging.StreamHandler", - "level": "INFO", - "formatter": "standardFormatter", - }, - }, - "formatters": { - "standardFormatter": { - "format": "%(asctime)s - %(name)s - %(levelname)s - %(message)s", - "datefmt": "", - } - }, -} - -if logging_config is not None: - try: - logging.config.dictConfig(logging_config) - except Exception: - traceback.print_exc() - exit(1) - -if use_accelerate_log: - import accelerate - - get_logger = functools.partial(accelerate.logging.get_logger, name=logger_name) -else: - get_logger = functools.partial(logging.getLogger, name=logger_name) - -logger = get_logger() diff --git a/comps/finetuning/llm_on_ray/finetune/finetune.py b/comps/finetuning/llm_on_ray/finetune/finetune.py index f268800f23..03b8adfaae 100644 --- a/comps/finetuning/llm_on_ray/finetune/finetune.py +++ b/comps/finetuning/llm_on_ray/finetune/finetune.py @@ -23,10 +23,13 @@ from ray.air.config import ScalingConfig from ray.train.torch import TorchTrainer +from comps import CustomLogger from comps.finetuning.llm_on_ray import common from comps.finetuning.llm_on_ray.finetune.data_process import DataProcessor from comps.finetuning.llm_on_ray.finetune.finetune_config import FinetuneConfig +logger = CustomLogger("llm_on_ray/finetune") + def adapt_transformers_to_device(config: Dict): device = config["Training"]["device"] @@ -332,10 +335,10 @@ def train_func(config: Dict[str, Any]): training_args, trainer = get_trainer(config, model, tokenizer, tokenized_dataset, data_collator) - common.logger.info("train start") + logger.info("train start") trainer.train(resume_from_checkpoint=training_args.resume_from_checkpoint) trainer.save_model() - common.logger.info("train finish") + logger.info("train finish") def get_finetune_config(): @@ -401,7 +404,7 @@ def main(external_config=None): else: ray.init(runtime_env=runtime_env) - common.logger.info(f"ray available resources = {ray.available_resources()}") + logger.info(f"ray available resources = {ray.available_resources()}") use_gpu = True if device == "gpu" else False scaling_config = ScalingConfig( num_workers=num_training_workers, diff --git a/comps/guardrails/llama_guard/docker/Dockerfile b/comps/guardrails/llama_guard/docker/Dockerfile index aaec44a079..491a4171fd 100644 --- a/comps/guardrails/llama_guard/docker/Dockerfile +++ b/comps/guardrails/llama_guard/docker/Dockerfile @@ -9,8 +9,7 @@ ARG ARCH="cpu" RUN apt-get update -y && apt-get install -y --no-install-recommends --fix-missing \ libgl1-mesa-glx \ - libjemalloc-dev \ - vim + libjemalloc-dev RUN useradd -m -s /bin/bash user && \ mkdir -p /home/user && \ diff --git a/comps/guardrails/pii_detection/custom/docker/Dockerfile b/comps/guardrails/pii_detection/custom/docker/Dockerfile index f7a69757cc..b29ff09baf 100644 --- a/comps/guardrails/pii_detection/custom/docker/Dockerfile +++ b/comps/guardrails/pii_detection/custom/docker/Dockerfile @@ -10,8 +10,7 @@ ENV LANG=C.UTF-8 RUN apt-get update -y && apt-get install -y --no-install-recommends --fix-missing \ build-essential \ libgl1-mesa-glx \ - libjemalloc-dev \ - vim + libjemalloc-dev RUN useradd -m -s /bin/bash 
user && \ mkdir -p /home/user && \ diff --git a/comps/guardrails/toxicity_detection/docker/Dockerfile b/comps/guardrails/toxicity_detection/docker/Dockerfile index 18c3726564..369b5e5af6 100644 --- a/comps/guardrails/toxicity_detection/docker/Dockerfile +++ b/comps/guardrails/toxicity_detection/docker/Dockerfile @@ -9,8 +9,7 @@ ARG ARCH="cpu" RUN apt-get update -y && apt-get install -y --no-install-recommends --fix-missing \ libgl1-mesa-glx \ - libjemalloc-dev \ - vim + libjemalloc-dev RUN useradd -m -s /bin/bash user && \ mkdir -p /home/user && \ diff --git a/comps/intent_detection/README.md b/comps/intent_detection/README.md new file mode 100644 index 0000000000..fa9062bb6a --- /dev/null +++ b/comps/intent_detection/README.md @@ -0,0 +1,99 @@ +# Intent Detection Microservice by TGI + +# πŸš€1. Start Microservice with Python(Option 1οΌ‰ + +## 1.1 Install Requirements + +```bash +pip install -r requirements.txt +``` + +## 1.2 Start TGI Service + +```bash +export HUGGINGFACEHUB_API_TOKEN=${your_hf_api_token} +export LANGCHAIN_TRACING_V2=true +export LANGCHAIN_API_KEY=${your_langchain_api_key} +export LANGCHAIN_PROJECT="opea/gen-ai-comps:llms" +docker run -p 8008:80 -v ./data:/data --name tgi_service --shm-size 1g ghcr.io/huggingface/text-generation-inference:1.4 --model-id ${your_hf_llm_model} +``` + +## 1.3 Verify the TGI Service + +```bash +curl http://${your_ip}:8008/generate \ + -X POST \ + -d '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":17, "do_sample": true}}' \ + -H 'Content-Type: application/json' +``` + +## 1.4 Setup Environment Variables + +```bash +export TGI_LLM_ENDPOINT="http://${your_ip}:8008" +export LANGCHAIN_TRACING_V2=true +export LANGCHAIN_API_KEY=${your_langchain_api_key} +export LANGCHAIN_PROJECT="opea/intent" +``` + +## 1.5 Start Intent Detection Microservice with Python Script + +Start intent detection microservice with below command. + +```bash +cd /your_project_path/GenAIComps/ +cp comps/intent_detection/langchain/intent_detection.py . +python intent_detection.py +``` + +# πŸš€2. Start Microservice with Docker (Option 2) + +## 2.1 Start TGI Service + +Please refer to 1.2. + +## 2.2 Setup Environment Variables + +```bash +export TGI_LLM_ENDPOINT="http://${your_ip}:8008" +export LANGCHAIN_TRACING_V2=true +export LANGCHAIN_API_KEY=${your_langchain_api_key} +export LANGCHAIN_PROJECT="opea/intent" +``` + +## 2.3 Build Docker Image + +```bash +cd /your_project_path/GenAIComps +docker build --no-cache -t opea/llm-tgi:latest -f comps/intent_detection/langchain/Dockerfile . +``` + +## 2.4 Run Docker with CLI (Option A) + +```bash +docker run -it --name="intent-tgi-server" --net=host --ipc=host -e http_proxy=$http_proxy -e https_proxy=$https_proxy -e TGI_LLM_ENDPOINT=$TGI_LLM_ENDPOINT -e HUGGINGFACEHUB_API_TOKEN=$HUGGINGFACEHUB_API_TOKEN opea/llm-tgi:latest +``` + +## 2.5 Run with Docker Compose (Option B) + +```bash +cd /your_project_path/GenAIComps/comps/intent_detection/langchain +export LLM_MODEL_ID=${your_hf_llm_model} +export http_proxy=${your_http_proxy} +export https_proxy=${your_http_proxy} +export TGI_LLM_ENDPOINT="http://tgi-service:80" +export HUGGINGFACEHUB_API_TOKEN=${your_hf_api_token} +export LANGCHAIN_API_KEY=${your_langchain_api_key} +docker compose -f docker_compose_intent.yaml up -d +``` + +# πŸš€3. Consume Microservice + +Once intent detection microservice is started, user can use below command to invoke the microservice. 
+ +```bash +curl http://${your_ip}:9000/v1/chat/intent\ + -X POST \ + -d '{"query":"What is Deep Learning?","max_new_tokens":10,"top_k":1,"temperature":0.001,"streaming":false}' \ + -H 'Content-Type: application/json' +``` diff --git a/comps/intent_detection/langchain/Dockerfile b/comps/intent_detection/langchain/Dockerfile new file mode 100644 index 0000000000..297b1b88f4 --- /dev/null +++ b/comps/intent_detection/langchain/Dockerfile @@ -0,0 +1,25 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +FROM langchain/langchain:latest + +RUN apt-get update -y && apt-get install -y --no-install-recommends --fix-missing \ + libgl1-mesa-glx \ + libjemalloc-dev \ + vim + +RUN useradd -m -s /bin/bash user && \ + mkdir -p /home/user && \ + chown -R user /home/user/ + +USER user + +COPY comps /home/user/comps + +RUN pip install --no-cache-dir --upgrade pip && \ + pip install --no-cache-dir -r /home/user/comps/intent_detection/langchain/requirements.txt + +ENV PYTHONPATH=$PYTHONPATH:/home/user + +WORKDIR /home/user/comps/intent_detection/langchain +ENTRYPOINT ["python", "intent_detection.py"] diff --git a/comps/intent_detection/langchain/docker_compose_intent.yaml b/comps/intent_detection/langchain/docker_compose_intent.yaml new file mode 100644 index 0000000000..2a15242734 --- /dev/null +++ b/comps/intent_detection/langchain/docker_compose_intent.yaml @@ -0,0 +1,32 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +version: "3.8" + +services: + tgi_service: + image: ghcr.io/huggingface/text-generation-inference:1.4 + container_name: tgi-service + ports: + - "8008:80" + volumes: + - "./data:/data" + shm_size: 1g + command: --model-id ${LLM_MODEL_ID} + llm: + image: opea/llm-tgi:latest + container_name: intent-tgi-server + ports: + - "9000:9000" + ipc: host + environment: + http_proxy: ${http_proxy} + https_proxy: ${https_proxy} + TGI_LLM_ENDPOINT: ${TGI_LLM_ENDPOINT} + HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN} + LANGCHAIN_API_KEY: ${LANGCHAIN_API_KEY} + restart: unless-stopped + +networks: + default: + driver: bridge diff --git a/comps/intent_detection/langchain/intent_detection.py b/comps/intent_detection/langchain/intent_detection.py new file mode 100644 index 0000000000..bf2e430c60 --- /dev/null +++ b/comps/intent_detection/langchain/intent_detection.py @@ -0,0 +1,47 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +import os + +from langchain import LLMChain, PromptTemplate +from langchain_community.llms import HuggingFaceEndpoint +from langsmith import traceable + +from comps import GeneratedDoc, LLMParamsDoc, ServiceType, opea_microservices, register_microservice +from comps.intent_detection.langchain.template import IntentTemplate + + +@register_microservice( + name="opea_service@llm_intent", + service_type=ServiceType.LLM, + endpoint="/v1/chat/intent", + host="0.0.0.0", + port=9000, +) +@traceable(run_type="llm") +def llm_generate(input: LLMParamsDoc): + llm_endpoint = os.getenv("TGI_LLM_ENDPOINT", "http://localhost:8080") + llm = HuggingFaceEndpoint( + endpoint_url=llm_endpoint, + max_new_tokens=input.max_new_tokens, + top_k=input.top_k, + top_p=input.top_p, + typical_p=input.typical_p, + temperature=input.temperature, + repetition_penalty=input.repetition_penalty, + streaming=input.streaming, + timeout=600, + ) + + prompt = PromptTemplate(template=IntentTemplate.generate_intent_template, input_variables=["query"]) + + llm_chain = LLMChain(prompt=prompt, llm=llm) + + response = 
llm_chain.invoke(input.query) + response = response["text"] + print("response", response) + return GeneratedDoc(text=response, prompt=input.query) + + +if __name__ == "__main__": + opea_microservices["opea_service@llm_intent"].start() diff --git a/comps/intent_detection/langchain/requirements.txt b/comps/intent_detection/langchain/requirements.txt new file mode 100644 index 0000000000..55cf47ae72 --- /dev/null +++ b/comps/intent_detection/langchain/requirements.txt @@ -0,0 +1,9 @@ +docarray[full] +fastapi +huggingface_hub +langchain==0.1.16 +opentelemetry-api +opentelemetry-exporter-otlp +opentelemetry-sdk +prometheus-fastapi-instrumentator +shortuuid diff --git a/comps/intent_detection/langchain/template.py b/comps/intent_detection/langchain/template.py new file mode 100644 index 0000000000..1a425ce43e --- /dev/null +++ b/comps/intent_detection/langchain/template.py @@ -0,0 +1,8 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + + +class IntentTemplate: + def generate_intent_template(query): + return f"""Please identify the intent of the user query. You may only respond with "chitchat" or "QA" without explanations or engaging in conversation. +### User Query: {query}, ### Response: """ diff --git a/comps/knowledgegraphs/langchain/docker/Dockerfile b/comps/knowledgegraphs/langchain/docker/Dockerfile index dd96f0dbea..655b44d241 100755 --- a/comps/knowledgegraphs/langchain/docker/Dockerfile +++ b/comps/knowledgegraphs/langchain/docker/Dockerfile @@ -7,8 +7,7 @@ ARG ARCH="cpu" # Set this to "cpu" or "gpu" RUN apt-get update -y && apt-get install -y --no-install-recommends --fix-missing \ libgl1-mesa-glx \ - libjemalloc-dev \ - vim + libjemalloc-dev RUN useradd -m -s /bin/bash user && \ mkdir -p /home/user && \ diff --git a/comps/llms/faq-generation/tgi/Dockerfile b/comps/llms/faq-generation/tgi/Dockerfile index ff48db4713..0d6bb9d61f 100644 --- a/comps/llms/faq-generation/tgi/Dockerfile +++ b/comps/llms/faq-generation/tgi/Dockerfile @@ -1,4 +1,3 @@ - # Copyright (C) 2024 Intel Corporation # SPDX-License-Identifier: Apache-2.0 @@ -6,8 +5,7 @@ FROM langchain/langchain:latest RUN apt-get update -y && apt-get install -y --no-install-recommends --fix-missing \ libgl1-mesa-glx \ - libjemalloc-dev \ - vim + libjemalloc-dev RUN useradd -m -s /bin/bash user && \ mkdir -p /home/user && \ diff --git a/comps/llms/summarization/tgi/Dockerfile b/comps/llms/summarization/tgi/Dockerfile index c1e1fdcca2..da449312bb 100644 --- a/comps/llms/summarization/tgi/Dockerfile +++ b/comps/llms/summarization/tgi/Dockerfile @@ -1,4 +1,3 @@ - # Copyright (C) 2024 Intel Corporation # SPDX-License-Identifier: Apache-2.0 @@ -6,8 +5,7 @@ FROM langchain/langchain:latest RUN apt-get update -y && apt-get install -y --no-install-recommends --fix-missing \ libgl1-mesa-glx \ - libjemalloc-dev \ - vim + libjemalloc-dev RUN useradd -m -s /bin/bash user && \ mkdir -p /home/user && \ diff --git a/comps/llms/text-generation/native/docker/Dockerfile b/comps/llms/text-generation/native/docker/Dockerfile index 69b7a73f57..f7d32cdb9b 100644 --- a/comps/llms/text-generation/native/docker/Dockerfile +++ b/comps/llms/text-generation/native/docker/Dockerfile @@ -1,5 +1,3 @@ - - # Copyright (C) 2024 Intel Corporation # SPDX-License-Identifier: Apache-2.0 @@ -11,13 +9,10 @@ ENV LANG=en_US.UTF-8 ARG REPO=https://github.com/huggingface/optimum-habana.git ARG REPO_VER=v1.12.1 -RUN apt-get update && \ - apt-get install git-lfs && \ - git-lfs install && \ - apt-get install -y --no-install-recommends --fix-missing \ 
+RUN apt-get update && apt-get install -y --no-install-recommends --fix-missing \ + git-lfs \ libgl1-mesa-glx \ - libjemalloc-dev \ - vim + libjemalloc-dev RUN useradd -m -s /bin/bash user && \ mkdir -p /home/user && \ @@ -25,6 +20,8 @@ RUN useradd -m -s /bin/bash user && \ USER user +RUN git lfs install + COPY comps /home/user/comps RUN pip install --upgrade-strategy eager optimum[habana] && \ diff --git a/comps/llms/text-generation/ollama/Dockerfile b/comps/llms/text-generation/ollama/Dockerfile index 876ca1eefb..bf78ff3948 100644 --- a/comps/llms/text-generation/ollama/Dockerfile +++ b/comps/llms/text-generation/ollama/Dockerfile @@ -1,14 +1,12 @@ - # Copyright (C) 2024 Intel Corporation # SPDX-License-Identifier: Apache-2.0 FROM langchain/langchain:latest RUN apt-get update -y && apt-get install -y --no-install-recommends --fix-missing \ + curl \ libgl1-mesa-glx \ - libjemalloc-dev \ - vim \ - curl + libjemalloc-dev RUN useradd -m -s /bin/bash user && \ mkdir -p /home/user && \ diff --git a/comps/llms/text-generation/tgi/Dockerfile b/comps/llms/text-generation/tgi/Dockerfile index 545af59df0..6797f86032 100644 --- a/comps/llms/text-generation/tgi/Dockerfile +++ b/comps/llms/text-generation/tgi/Dockerfile @@ -1,4 +1,3 @@ - # Copyright (C) 2024 Intel Corporation # SPDX-License-Identifier: Apache-2.0 @@ -6,8 +5,7 @@ FROM langchain/langchain:latest RUN apt-get update -y && apt-get install -y --no-install-recommends --fix-missing \ libgl1-mesa-glx \ - libjemalloc-dev \ - vim + libjemalloc-dev RUN useradd -m -s /bin/bash user && \ mkdir -p /home/user && \ diff --git a/comps/llms/text-generation/vllm-ray/docker/Dockerfile.microservice b/comps/llms/text-generation/vllm-ray/docker/Dockerfile.microservice index 10d6500a1c..516ad1a4bc 100644 --- a/comps/llms/text-generation/vllm-ray/docker/Dockerfile.microservice +++ b/comps/llms/text-generation/vllm-ray/docker/Dockerfile.microservice @@ -1,23 +1,11 @@ -# Copyright (c) 2024 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
+# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 FROM langchain/langchain:latest RUN apt-get update -y && apt-get install -y --no-install-recommends --fix-missing \ libgl1-mesa-glx \ - libjemalloc-dev \ - vim + libjemalloc-dev RUN useradd -m -s /bin/bash user && \ mkdir -p /home/user && \ @@ -34,4 +22,4 @@ ENV PYTHONPATH=$PYTHONPATH:/home/user WORKDIR /home/user/comps/llms/text-generation/vllm-ray -ENTRYPOINT ["python", "llm.py"] \ No newline at end of file +ENTRYPOINT ["python", "llm.py"] diff --git a/comps/llms/text-generation/vllm/docker/Dockerfile.microservice b/comps/llms/text-generation/vllm/docker/Dockerfile.microservice index ccd9772736..ebc1638e03 100644 --- a/comps/llms/text-generation/vllm/docker/Dockerfile.microservice +++ b/comps/llms/text-generation/vllm/docker/Dockerfile.microservice @@ -7,8 +7,7 @@ ARG ARCH="cpu" # Set this to "cpu" or "gpu" RUN apt-get update -y && apt-get install -y --no-install-recommends --fix-missing \ libgl1-mesa-glx \ - libjemalloc-dev \ - vim + libjemalloc-dev RUN useradd -m -s /bin/bash user && \ mkdir -p /home/user && \ diff --git a/comps/llms/utils/lm-eval/Dockerfile.cpu b/comps/llms/utils/lm-eval/Dockerfile.cpu index ceb98887d8..5f419bfbfe 100644 --- a/comps/llms/utils/lm-eval/Dockerfile.cpu +++ b/comps/llms/utils/lm-eval/Dockerfile.cpu @@ -1,24 +1,30 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + ARG UBUNTU_VER=22.04 FROM ubuntu:${UBUNTU_VER} as devel + +ENV LANG=C.UTF-8 + RUN useradd -m -s /bin/bash user && \ mkdir -p /home/user && \ chown -R user /home/user/ -ARG REPO_COMPS=https://github.com/opea-project/GenAIComps.git -ARG BRANCH=main -ENV LANG=C.UTF-8 RUN apt-get update && apt-get install -y --no-install-recommends --fix-missing \ aspell \ aspell-en \ build-essential \ + git \ python3 \ - python3-pip \ python3-dev \ python3-distutils \ - git \ - vim \ + python3-pip \ wget + USER user + +ARG REPO_COMPS=https://github.com/opea-project/GenAIComps.git +ARG BRANCH=main RUN git clone --single-branch --branch=${BRANCH} ${REPO_COMPS} /home/user/GenAIComps/ && \ cd /home/user/GenAIComps/ && python3 setup.py install && \ pip install --no-cache-dir -r /home/user/GenAIComps/comps/llms/utils/lm-eval/requirements.txt diff --git a/comps/lvms/video-llama/server/requirements.txt b/comps/lvms/video-llama/server/requirements.txt index 41dacfbd21..afbac6004b 100644 --- a/comps/lvms/video-llama/server/requirements.txt +++ b/comps/lvms/video-llama/server/requirements.txt @@ -31,4 +31,6 @@ torchaudio==0.13.1 --index-url https://download.pytorch.org/whl/cpu torchvision==0.14.1 --index-url https://download.pytorch.org/whl/cpu transformers uvicorn +validators webdataset +werkzeug diff --git a/comps/lvms/video-llama/server/server.py b/comps/lvms/video-llama/server/server.py index f54cdc65e4..20841732c5 100644 --- a/comps/lvms/video-llama/server/server.py +++ b/comps/lvms/video-llama/server/server.py @@ -5,12 +5,14 @@ import argparse import logging import os +import re from threading import Thread from urllib.parse import urlparse import decord import requests import uvicorn +import validators from extract_vl_embedding import VLEmbeddingExtractor as VL from fastapi import FastAPI, Query from fastapi.middleware.cors import CORSMiddleware @@ -21,6 +23,7 @@ from transformers import TextIteratorStreamer, set_seed from video_llama.common.registry import registry from video_llama.conversation.conversation_video import Chat +from werkzeug.utils import secure_filename # Initialize decord bridge and seed 
decord.bridge.set_bridge("torch") @@ -33,7 +36,7 @@ context_db = None streamer = None chat = None -VIDEO_DIR = "/home/user/videos" +VIDEO_DIR = "/home/user/comps/lvms/video-llama/server/data" CFG_PATH = "video_llama_config/video_llama_eval_only_vl.yaml" MODEL_TYPE = "llama_v2" @@ -161,6 +164,43 @@ def is_local_file(url): return not url.startswith("http://") and not url.startswith("https://") +def is_valid_url(url): + # Validate the URL's structure + validation = validators.url(url) + if not validation: + logging.error("URL is invalid") + return False + + # Parse the URL to components + parsed_url = urlparse(url) + + # Check the scheme + if parsed_url.scheme not in ["http", "https"]: + logging.error("URL scheme is invalid") + return False + + # Check for "../" in the path + if "../" in parsed_url.path: + logging.error("URL contains '../', which is not allowed") + return False + + # Check that the path only contains one "." for the file extension + if parsed_url.path.count(".") != 1: + logging.error("URL path does not meet the requirement of having only one '.'") + return False + + # If all checks pass, the URL is valid + logging.info("URL is valid") + return True + + +def is_valid_video(filename): + if re.match(r"^[a-zA-Z0-9-_]+\.(mp4)$", filename, re.IGNORECASE): + return secure_filename(filename) + else: + return False + + @app.get("/health") async def health() -> Response: """Health check.""" @@ -175,46 +215,54 @@ async def generate( prompt: str = Query(..., description="Query for Video-LLama", examples="What is the man doing?"), max_new_tokens: int = Query(150, description="Maximum number of tokens to generate", examples=150), ) -> StreamingResponse: - if not is_local_file(video_url): - parsed_url = urlparse(video_url) - video_name = os.path.basename(parsed_url.path) - else: - video_name = os.path.basename(video_url) - if video_name.lower().endswith(".mp4"): - logging.info(f"Format check passed, the file '{video_name}' is an MP4 file.") + if video_url.lower().endswith(".mp4"): + logging.info(f"Format check passed, the file '{video_url}' is an MP4 file.") else: - logging.info(f"Format check failed, the file '{video_name}' is not an MP4 file.") - return JSONResponse(status_code=400, content={"message": "Invalid file type. Only mp4 videos are allowed."}) - - if not is_local_file(video_url): - try: - video_path = os.path.join(VIDEO_DIR, video_name) - response = requests.get(video_url, stream=True) - - if response.status_code == 200: - with open(video_path, "wb") as file: - for chunk in response.iter_content(chunk_size=1024): - if chunk: # filter out keep-alive new chunks - file.write(chunk) - logging.info(f"File downloaded: {video_path}") - else: + logging.info(f"Format check failed, the file '{video_url}' is not an MP4 file.") + return JSONResponse(status_code=500, content={"message": "Invalid file type. Only mp4 videos are allowed."}) + + if is_local_file(video_url): + # validate the video name + if is_valid_video(video_url): + secure_video_name = is_valid_video(video_url) # only support video name without path + else: + return JSONResponse(status_code=500, content={"message": "Invalid file name."}) + + video_path = os.path.join(VIDEO_DIR, secure_video_name) + if os.path.exists(video_path): + logging.info(f"File found: {video_path}") + else: + logging.error(f"File not found: {video_path}") + return JSONResponse( + status_code=404, content={"message": "File not found. 
Only local files under data folder are allowed."} + ) + else: + # validate the remote URL + if not is_valid_url(video_url): + return JSONResponse(status_code=500, content={"message": "Invalid URL."}) + else: + parsed_url = urlparse(video_url) + video_path = os.path.join(VIDEO_DIR, os.path.basename(parsed_url.path)) + try: + response = requests.get(video_url, stream=True) + if response.status_code == 200: + with open(video_path, "wb") as file: + for chunk in response.iter_content(chunk_size=1024): + if chunk: # filter out keep-alive new chunks + file.write(chunk) + logging.info(f"File downloaded: {video_path}") + else: + logging.info(f"Error downloading file: {response.status_code}") + return JSONResponse(status_code=500, content={"message": "Error downloading file."}) + except Exception as e: logging.info(f"Error downloading file: {response.status_code}") return JSONResponse(status_code=500, content={"message": "Error downloading file."}) - except Exception as e: - logging.info(f"Error downloading file: {response.status_code}") - return JSONResponse(status_code=500, content={"message": "Error downloading file."}) - else: - # check if the video exist - video_path = video_url - if not os.path.exists(video_path): - logging.info(f"File not found: {video_path}") - return JSONResponse(status_code=404, content={"message": "File not found."}) + video_info = videoInfo(start_time=start, duration=duration, video_path=video_path) # format context and instruction instruction = f"{get_context(prompt,context_db)[0]}: {prompt}" - # logging.info("instruction:",instruction) return StreamingResponse(stream_res(video_info, instruction, max_new_tokens)) diff --git a/comps/nginx/README.md b/comps/nginx/README.md index 416b9b2e1c..cae340331b 100644 --- a/comps/nginx/README.md +++ b/comps/nginx/README.md @@ -7,8 +7,8 @@ In GenAIComps, we utilize nginx to streamline our network services. We provide a ## πŸš€1. Build Docker Image ```bash -cd docker -docker build -t opea/nginx:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f ./Dockerfile . +cd ../.. +docker build -t opea/nginx:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/nginx/docker/Dockerfile . ``` ## πŸš€2. 
Environment Settings diff --git a/comps/nginx/docker/Dockerfile b/comps/nginx/docker/Dockerfile index 6816fb0a5b..447d3946a4 100644 --- a/comps/nginx/docker/Dockerfile +++ b/comps/nginx/docker/Dockerfile @@ -6,7 +6,7 @@ FROM nginx:alpine RUN apk add --no-cache gettext -COPY nginx.conf.template /etc/nginx/nginx.conf.template +COPY comps/nginx/docker/nginx.conf.template /etc/nginx/nginx.conf.template ENV FRONTEND_SERVICE_IP=localhost ENV FRONTEND_SERVICE_PORT=5173 @@ -14,7 +14,7 @@ ENV BACKEND_SERVICE_NAME=chatqna ENV BACKEND_SERVICE_IP=localhost ENV BACKEND_SERVICE_PORT=8888 -COPY start-nginx.sh /usr/local/bin/start-nginx.sh +COPY comps/nginx/docker/start-nginx.sh /usr/local/bin/start-nginx.sh RUN chmod +x /usr/local/bin/start-nginx.sh CMD ["/usr/local/bin/start-nginx.sh"] diff --git a/comps/prompt_registry/mongo/docker/Dockerfile b/comps/prompt_registry/mongo/docker/Dockerfile index db2e9c59d9..a2845430c7 100644 --- a/comps/prompt_registry/mongo/docker/Dockerfile +++ b/comps/prompt_registry/mongo/docker/Dockerfile @@ -8,8 +8,7 @@ ENV LANG=C.UTF-8 RUN apt-get update -y && apt-get install -y --no-install-recommends --fix-missing \ build-essential \ libgl1-mesa-glx \ - libjemalloc-dev \ - vim + libjemalloc-dev RUN useradd -m -s /bin/bash user && \ mkdir -p /home/user && \ @@ -28,4 +27,4 @@ ENV PYTHONPATH=$PYTHONPATH:/home/user WORKDIR /home/user/comps/prompt_registry/mongo -ENTRYPOINT ["python", "prompt.py"] \ No newline at end of file +ENTRYPOINT ["python", "prompt.py"] diff --git a/comps/ragas/tgi/Dockerfile b/comps/ragas/tgi/Dockerfile index 55d4229a04..f55d8de0e4 100644 --- a/comps/ragas/tgi/Dockerfile +++ b/comps/ragas/tgi/Dockerfile @@ -5,8 +5,7 @@ FROM langchain/langchain:latest RUN apt-get update -y && apt-get install -y --no-install-recommends --fix-missing \ libgl1-mesa-glx \ - libjemalloc-dev \ - vim + libjemalloc-dev RUN useradd -m -s /bin/bash user && \ mkdir -p /home/user && \ diff --git a/comps/reranks/fastrag/docker/Dockerfile b/comps/reranks/fastrag/docker/Dockerfile index 8372e33b76..2bf8e02b53 100644 --- a/comps/reranks/fastrag/docker/Dockerfile +++ b/comps/reranks/fastrag/docker/Dockerfile @@ -1,4 +1,3 @@ - # Copyright (C) 2024 Intel Corporation # SPDX-License-Identifier: Apache-2.0 @@ -7,10 +6,9 @@ FROM python:3.10-slim ENV LANG=C.UTF-8 RUN apt-get update -y && apt-get install -y --no-install-recommends --fix-missing \ + git \ libgl1-mesa-glx \ - libjemalloc-dev \ - vim \ - git + libjemalloc-dev RUN useradd -m -s /bin/bash user && \ mkdir -p /home/user && \ @@ -33,4 +31,3 @@ ENV PYTHONPATH=$PYTHONPH:/home/user WORKDIR /home/user/comps/reranks/fastrag ENTRYPOINT ["python", "local_reranking.py"] - diff --git a/comps/reranks/langchain-mosec/docker/Dockerfile b/comps/reranks/langchain-mosec/docker/Dockerfile index 9a678dc4ad..7f3714e606 100644 --- a/comps/reranks/langchain-mosec/docker/Dockerfile +++ b/comps/reranks/langchain-mosec/docker/Dockerfile @@ -1,4 +1,3 @@ - # Copyright (C) 2024 Intel Corporation # SPDX-License-Identifier: Apache-2.0 @@ -6,8 +5,7 @@ FROM langchain/langchain:latest RUN apt-get update -y && apt-get install -y --no-install-recommends --fix-missing \ libgl1-mesa-glx \ - libjemalloc-dev \ - vim + libjemalloc-dev RUN useradd -m -s /bin/bash user && \ mkdir -p /home/user && \ @@ -25,4 +23,3 @@ ENV PYTHONPATH=$PYTHONPATH:/home/user WORKDIR /home/user/comps/reranks/langchain-mosec ENTRYPOINT ["python", "reranking_mosec_xeon.py"] - diff --git a/comps/reranks/tei/docker/Dockerfile b/comps/reranks/tei/docker/Dockerfile index 851fbfd581..2a8e7959f8 100644 --- 
a/comps/reranks/tei/docker/Dockerfile +++ b/comps/reranks/tei/docker/Dockerfile @@ -1,4 +1,3 @@ - # Copyright (C) 2024 Intel Corporation # SPDX-License-Identifier: Apache-2.0 @@ -10,8 +9,7 @@ ARG ARCH="cpu" RUN apt-get update -y && apt-get install -y --no-install-recommends --fix-missing \ libgl1-mesa-glx \ - libjemalloc-dev \ - vim + libjemalloc-dev RUN useradd -m -s /bin/bash user && \ mkdir -p /home/user && \ @@ -30,4 +28,3 @@ ENV PYTHONPATH=$PYTHONPATH:/home/user WORKDIR /home/user/comps/reranks/tei ENTRYPOINT ["python", "reranking_tei.py"] - diff --git a/comps/reranks/video-rag-qna/README.md b/comps/reranks/video-rag-qna/README.md new file mode 100644 index 0000000000..9edfe41188 --- /dev/null +++ b/comps/reranks/video-rag-qna/README.md @@ -0,0 +1,62 @@ +# Rerank Microservice + +This is a Docker-based microservice that reranks retrieval results for the VideoRAGQnA use case. A local rerank function is used rather than a reranking model. + +For the `VideoRAGQnA` use case, during the data preparation phase, frames are extracted from videos and stored in a vector database. To identify the most relevant video, we count the occurrences of each video source among the retrieved data with the rerank function `get_top_doc`. This yields a descending list of video names, ranked by their degree of match with the query. The `top_n` videos are then sent to the downstream LVM. + +# 🚀1. Start Microservice with Docker + +## 1.1 Build Images + +```bash +cd GenAIComps +docker build --no-cache -t opea/reranking-videoragqna:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/reranks/video-rag-qna/docker/Dockerfile . +``` + +## 1.2 Start Rerank Service + +```bash +docker compose -f comps/reranks/video-rag-qna/docker/docker_compose_reranking.yaml up -d +# wait until ready +until docker logs reranking-videoragqna-server 2>&1 | grep -q "Uvicorn running on"; do + sleep 2 +done +``` + +Available configuration via environment variable: + +- CHUNK_DURATION: target chunk duration; should be aligned with VideoRAGQnA dataprep. Default 10s. + +# ✅ 2. Test + +```bash +export ip_address=$(hostname -I | awk '{print $1}') +curl -X 'POST' \ +"http://${ip_address}:8000/v1/reranking" \ +-H 'accept: application/json' \ +-H 'Content-Type: application/json' \ +-d '{ + "retrieved_docs": [{"doc": [{"text": "this is the retrieved text"}]}], + "initial_query": "this is the query", + "top_n": 1, + "metadata": [ + {"other_key": "value", "video":"top_video_name", "timestamp":"20"}, + {"other_key": "value", "video":"second_video_name", "timestamp":"40"}, + {"other_key": "value", "video":"top_video_name", "timestamp":"20"} + ] +}' +``` + +The result should be: + +```bash +{"id":"random number","video_url":"http://0.0.0.0:6005/top_video_name","chunk_start":20.0,"chunk_duration":10.0,"prompt":"this is the query","max_new_tokens":512} +``` + +# ♻️ 3. Clean + +```bash +# remove the container +cid=$(docker ps -aq --filter "name=reranking-videoragqna-server") +if [[ ! 
-z "$cid" ]]; then docker stop $cid && docker rm $cid && sleep 1s; fi +``` diff --git a/comps/reranks/video-rag-qna/docker/Dockerfile b/comps/reranks/video-rag-qna/docker/Dockerfile new file mode 100644 index 0000000000..617f47b6af --- /dev/null +++ b/comps/reranks/video-rag-qna/docker/Dockerfile @@ -0,0 +1,24 @@ + +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +FROM python:3.11-slim + +ENV LANG=C.UTF-8 + +RUN useradd -m -s /bin/bash user && \ + mkdir -p /home/user && \ + chown -R user /home/user/ + +USER user + +COPY comps /home/user/comps + +RUN pip install --no-cache-dir --upgrade pip && \ + pip install --no-cache-dir -r /home/user/comps/reranks/video-rag-qna/requirements.txt + +ENV PYTHONPATH=$PYTHONPATH:/home/user + +WORKDIR /home/user/comps/reranks/video-rag-qna + +ENTRYPOINT ["python", "local_reranking.py"] \ No newline at end of file diff --git a/comps/reranks/video-rag-qna/docker/docker_compose_reranking.yaml b/comps/reranks/video-rag-qna/docker/docker_compose_reranking.yaml new file mode 100644 index 0000000000..d819f331a1 --- /dev/null +++ b/comps/reranks/video-rag-qna/docker/docker_compose_reranking.yaml @@ -0,0 +1,21 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +services: + reranking: + image: opea/reranking-videoragqna:latest + container_name: reranking-videoragqna-server + ports: + - "8000:8000" + ipc: host + environment: + no_proxy: ${no_proxy} + http_proxy: ${http_proxy} + https_proxy: ${https_proxy} + CHUNK_DURATION: ${CHUNK_DURATION} + FILE_SERVER_ENDPOINT: ${FILE_SERVER_ENDPOINT} + restart: unless-stopped + +networks: + default: + driver: bridge diff --git a/comps/reranks/video-rag-qna/local_reranking.py b/comps/reranks/video-rag-qna/local_reranking.py new file mode 100644 index 0000000000..3a3043ca8a --- /dev/null +++ b/comps/reranks/video-rag-qna/local_reranking.py @@ -0,0 +1,89 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +import logging +import os +import time + +from comps import ( + LVMVideoDoc, + SearchedMultimodalDoc, + ServiceType, + opea_microservices, + register_microservice, + register_statistics, + statistics_dict, +) + +chunk_duration = os.getenv("CHUNK_DURATION", "10") or "10" +chunk_duration = float(chunk_duration) if chunk_duration.isdigit() else 10.0 + +file_server_endpoint = os.getenv("FILE_SERVER_ENDPOINT") or "http://0.0.0.0:6005" + +logging.basicConfig( + level=logging.INFO, format="%(levelname)s: [%(asctime)s] %(message)s", datefmt="%d/%m/%Y %I:%M:%S" +) + + +def get_top_doc(top_n, videos) -> list: + hit_score = {} + if videos is None: + return None + for video_name in videos: + try: + if video_name not in hit_score.keys(): + hit_score[video_name] = 0 + hit_score[video_name] += 1 + except KeyError as r: + logging.info(f"no video name {r}") + + x = dict(sorted(hit_score.items(), key=lambda item: -item[1])) # sorted dict of video name and score + top_n_names = list(x.keys())[:top_n] + logging.info(f"top docs = {x}") + logging.info(f"top n docs names = {top_n_names}") + + return top_n_names + + +def find_timestamp_from_video(metadata_list, video): + return next( + (metadata["timestamp"] for metadata in metadata_list if metadata["video"] == video), + None, + ) + + +@register_microservice( + name="opea_service@reranking_visual_rag", + service_type=ServiceType.RERANK, + endpoint="/v1/reranking", + host="0.0.0.0", + port=8000, + input_datatype=SearchedMultimodalDoc, + output_datatype=LVMVideoDoc, +) 
+@register_statistics(names=["opea_service@reranking_visual_rag"]) +def reranking(input: SearchedMultimodalDoc) -> LVMVideoDoc: + start = time.time() + + # get top video name from metadata + video_names = [meta["video"] for meta in input.metadata] + top_video_names = get_top_doc(input.top_n, video_names) + + # only use the first top video + timestamp = find_timestamp_from_video(input.metadata, top_video_names[0]) + video_url = f"{file_server_endpoint.rstrip('/')}/{top_video_names[0]}" + + result = LVMVideoDoc( + video_url=video_url, + prompt=input.initial_query, + chunk_start=timestamp, + chunk_duration=float(chunk_duration), + max_new_tokens=512, + ) + statistics_dict["opea_service@reranking_visual_rag"].append_latency(time.time() - start, None) + + return result + + +if __name__ == "__main__": + opea_microservices["opea_service@reranking_visual_rag"].start() diff --git a/comps/reranks/video-rag-qna/requirements.txt b/comps/reranks/video-rag-qna/requirements.txt new file mode 100644 index 0000000000..c7cc250eba --- /dev/null +++ b/comps/reranks/video-rag-qna/requirements.txt @@ -0,0 +1,11 @@ +datasets +docarray +fastapi +opentelemetry-api +opentelemetry-exporter-otlp +opentelemetry-sdk +Pillow +prometheus-fastapi-instrumentator +pydub +shortuuid +uvicorn diff --git a/comps/retrievers/haystack/qdrant/docker/Dockerfile b/comps/retrievers/haystack/qdrant/docker/Dockerfile index e9916c8db0..eaeba479af 100644 --- a/comps/retrievers/haystack/qdrant/docker/Dockerfile +++ b/comps/retrievers/haystack/qdrant/docker/Dockerfile @@ -1,4 +1,3 @@ - # Copyright (C) 2024 Intel Corporation # SPDX-License-Identifier: Apache-2.0 @@ -6,8 +5,7 @@ FROM python:3.11-slim RUN apt-get update -y && apt-get install -y --no-install-recommends --fix-missing \ libgl1-mesa-glx \ - libjemalloc-dev \ - vim + libjemalloc-dev RUN useradd -m -s /bin/bash user && \ mkdir -p /home/user && \ @@ -24,4 +22,4 @@ ENV PYTHONPATH=$PYTHONPATH:/home/user WORKDIR /home/user/comps/retrievers/haystack/qdrant -ENTRYPOINT ["python", "retriever_qdrant.py"] \ No newline at end of file +ENTRYPOINT ["python", "retriever_qdrant.py"] diff --git a/comps/retrievers/langchain/milvus/docker/Dockerfile b/comps/retrievers/langchain/milvus/docker/Dockerfile index 99c977e6f4..233a0ec132 100644 --- a/comps/retrievers/langchain/milvus/docker/Dockerfile +++ b/comps/retrievers/langchain/milvus/docker/Dockerfile @@ -1,4 +1,3 @@ - # Copyright (C) 2024 Intel Corporation # SPDX-License-Identifier: Apache-2.0 @@ -11,8 +10,7 @@ ARG ARCH="cpu" RUN apt-get update -y && apt-get install -y --no-install-recommends --fix-missing \ build-essential \ libgl1-mesa-glx \ - libjemalloc-dev \ - vim + libjemalloc-dev RUN useradd -m -s /bin/bash user && \ mkdir -p /home/user && \ @@ -31,4 +29,3 @@ ENV PYTHONPATH=$PYTHONPATH:/home/user WORKDIR /home/user/comps/retrievers/langchain/milvus ENTRYPOINT ["python", "retriever_milvus.py"] - diff --git a/comps/retrievers/langchain/pathway/README.md b/comps/retrievers/langchain/pathway/README.md new file mode 100644 index 0000000000..6f8e953f05 --- /dev/null +++ b/comps/retrievers/langchain/pathway/README.md @@ -0,0 +1,104 @@ +# Retriever Microservice with Pathway + +## πŸš€Start Microservices + +### With the Docker CLI + +We suggest using `docker compose` to run this app, refer to [`docker compose`](#with-the-docker-compose) section below. + +If you prefer to run them separately, refer to this section. 
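Whichever way you start the services, the retriever itself is a thin wrapper around LangChain's Pathway client: it connects to the Pathway vector store at `PATHWAY_HOST:PATHWAY_PORT` and runs a similarity search for each incoming query. The snippet below is an illustrative sketch only (it is not one of the files added by this change) and assumes a Pathway vector store is already listening on that address:

```python
# Illustrative sketch: query a running Pathway vector store the same way
# retriever_pathway.py does, via langchain_community's PathwayVectorClient.
import os

from langchain_community.vectorstores import PathwayVectorClient

host = os.getenv("PATHWAY_HOST", "127.0.0.1")
port = int(os.getenv("PATHWAY_PORT", 8666))

client = PathwayVectorClient(host=host, port=port)

# similarity_search(query, k) returns LangChain Documents ranked by similarity
docs = client.similarity_search("What is the revenue of Nike in 2023?", 4)
for doc in docs:
    print(doc.page_content[:80])
```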
+ +#### (Optionally) Start the TEI (embedder) service separately + +> Note that Docker compose will start this service as well, this step is thus optional. + +```bash +export LANGCHAIN_TRACING_V2=true +export LANGCHAIN_API_KEY=${your_langchain_api_key} +export LANGCHAIN_PROJECT="opea/retriever" +model=BAAI/bge-base-en-v1.5 +revision=refs/pr/4 +# TEI_EMBEDDING_ENDPOINT="http://${your_ip}:6060" # if you want to use the hosted embedding service, example: "http://127.0.0.1:6060" + +# then run: +docker run -p 6060:80 -e http_proxy=$http_proxy -e https_proxy=$https_proxy --pull always ghcr.io/huggingface/text-embeddings-inference:cpu-1.2 --model-id $model --revision $revision +``` + +Health check the embedding service with: + +```bash +curl 127.0.0.1:6060/embed -X POST -d '{"inputs":"What is Deep Learning?"}' -H 'Content-Type: application/json' +``` + +If the model supports re-ranking, you can also use: + +```bash +curl 127.0.0.1:6060/rerank -X POST -d '{"query":"What is Deep Learning?", "texts": ["Deep Learning is not...", "Deep learning is..."]}' -H 'Content-Type: application/json' +``` + +#### Start Retriever Service + +Retriever service queries the Pathway vector store on incoming requests. +Make sure that Pathway vector store is already running, [see Pathway vector store here](../../../vectorstores/langchain/pathway/README.md). + +Retriever service expects the Pathway host and port variables to connect to the vector DB. Set the Pathway vector store environment variables. + +```bash +export PATHWAY_HOST=0.0.0.0 +export PATHWAY_PORT=8666 +``` + +```bash +# make sure you are in the root folder of the repo +docker build -t opea/retriever-pathway:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/retrievers/langchain/pathway/docker/Dockerfile . + +docker run -p 7000:7000 -e PATHWAY_HOST=${PATHWAY_HOST} -e PATHWAY_PORT=${PATHWAY_PORT} -e http_proxy=$http_proxy -e https_proxy=$https_proxy --network="host" opea/retriever-pathway:latest +``` + +### With the Docker compose + +First, set the env variables: + +```bash +export PATHWAY_HOST=0.0.0.0 +export PATHWAY_PORT=8666 +export LANGCHAIN_TRACING_V2=true +export LANGCHAIN_API_KEY=${your_langchain_api_key} +export LANGCHAIN_PROJECT="opea/retriever" +model=BAAI/bge-base-en-v1.5 +revision=refs/pr/4 +# TEI_EMBEDDING_ENDPOINT="http://${your_ip}:6060" # if you want to use the hosted embedding service, example: "http://127.0.0.1:6060" +``` + +Text embeddings inference service expects the `RETRIEVE_MODEL_ID` variable to be set. + +```bash +export RETRIEVE_MODEL_ID=BAAI/bge-base-en-v1.5 +``` + +Note that following docker compose sets the `network_mode: host` in retriever image to allow local vector store connection. 
+This will start both the embedding and retriever services: + +```bash +cd comps/retrievers/langchain/pathway/docker + +docker compose -f docker_compose_retriever.yaml build +docker compose -f docker_compose_retriever.yaml up + +# shut down the containers +docker compose -f docker_compose_retriever.yaml down +``` + +Make sure the retriever service is working as expected: + +```bash +curl http://0.0.0.0:7000/v1/health_check -X GET -H 'Content-Type: application/json' +``` + +Send an example query: + +```bash +exm_embeddings=$(python -c "import random; embedding = [random.uniform(-1, 1) for _ in range(768)]; print(embedding)") + +curl http://0.0.0.0:7000/v1/retrieval -X POST -d "{\"text\":\"What is the revenue of Nike in 2023?\",\"embedding\":${exm_embeddings}}" -H 'Content-Type: application/json' +``` diff --git a/comps/retrievers/langchain/pathway/docker/Dockerfile b/comps/retrievers/langchain/pathway/docker/Dockerfile new file mode 100644 index 0000000000..b70c01f0eb --- /dev/null +++ b/comps/retrievers/langchain/pathway/docker/Dockerfile @@ -0,0 +1,30 @@ + +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +FROM langchain/langchain:latest + +ARG ARCH="cpu" + +RUN apt-get update -y && apt-get install -y --no-install-recommends --fix-missing \ + libgl1-mesa-glx \ + libjemalloc-dev \ + vim + +RUN useradd -m -s /bin/bash user && \ + mkdir -p /home/user && \ + chown -R user /home/user/ + +COPY comps /home/user/comps + +USER user + +RUN pip install --no-cache-dir --upgrade pip && \ + if [ ${ARCH} = "cpu" ]; then pip install torch torchvision --index-url https://download.pytorch.org/whl/cpu; fi && \ + pip install --no-cache-dir -r /home/user/comps/retrievers/langchain/pathway/requirements.txt + +ENV PYTHONPATH=$PYTHONPATH:/home/user + +WORKDIR /home/user/comps/retrievers/langchain/pathway + +ENTRYPOINT ["bash", "entrypoint.sh"] diff --git a/comps/retrievers/langchain/pathway/docker/docker_compose_retriever.yaml b/comps/retrievers/langchain/pathway/docker/docker_compose_retriever.yaml new file mode 100644 index 0000000000..b2b9383d6b --- /dev/null +++ b/comps/retrievers/langchain/pathway/docker/docker_compose_retriever.yaml @@ -0,0 +1,35 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +version: "3.8" + +services: + tei_xeon_service: + image: ghcr.io/huggingface/text-embeddings-inference:cpu-1.2 + container_name: tei-xeon-server + ports: + - "6060:80" + volumes: + - "./data:/data" + shm_size: 1g + command: --model-id ${RETRIEVE_MODEL_ID} + retriever: + image: opea/retriever-pathway:latest + container_name: retriever-pathway-server + ports: + - "7000:7000" + ipc: host + network_mode: host + environment: + no_proxy: ${no_proxy} + http_proxy: ${http_proxy} + https_proxy: ${https_proxy} + PATHWAY_HOST: ${PATHWAY_HOST} + PATHWAY_PORT: ${PATHWAY_PORT} + TEI_EMBEDDING_ENDPOINT: ${TEI_EMBEDDING_ENDPOINT} + LANGCHAIN_API_KEY: ${LANGCHAIN_API_KEY} + restart: unless-stopped + +networks: + default: + driver: bridge diff --git a/comps/retrievers/langchain/pathway/entrypoint.sh b/comps/retrievers/langchain/pathway/entrypoint.sh new file mode 100644 index 0000000000..f5c8fc1511 --- /dev/null +++ b/comps/retrievers/langchain/pathway/entrypoint.sh @@ -0,0 +1,6 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +pip --no-cache-dir install -r requirements-runtime.txt + +python retriever_pathway.py diff --git a/comps/retrievers/langchain/pathway/requirements-runtime.txt
b/comps/retrievers/langchain/pathway/requirements-runtime.txt new file mode 100644 index 0000000000..53d49066d5 --- /dev/null +++ b/comps/retrievers/langchain/pathway/requirements-runtime.txt @@ -0,0 +1 @@ +langsmith diff --git a/comps/retrievers/langchain/pathway/requirements.txt b/comps/retrievers/langchain/pathway/requirements.txt new file mode 100644 index 0000000000..98fe20fd1e --- /dev/null +++ b/comps/retrievers/langchain/pathway/requirements.txt @@ -0,0 +1,12 @@ +docarray[full] +fastapi +frontend==0.0.3 +huggingface_hub +langchain_community == 0.2.0 +opentelemetry-api +opentelemetry-exporter-otlp +opentelemetry-sdk +pathway +prometheus-fastapi-instrumentator +sentence_transformers +shortuuid diff --git a/comps/retrievers/langchain/pathway/retriever_pathway.py b/comps/retrievers/langchain/pathway/retriever_pathway.py new file mode 100644 index 0000000000..72b7babaa0 --- /dev/null +++ b/comps/retrievers/langchain/pathway/retriever_pathway.py @@ -0,0 +1,52 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +import os +import time + +from langchain_community.vectorstores import PathwayVectorClient +from langsmith import traceable + +from comps import ( + EmbedDoc, + SearchedDoc, + ServiceType, + TextDoc, + opea_microservices, + register_microservice, + register_statistics, + statistics_dict, +) + +host = os.getenv("PATHWAY_HOST", "127.0.0.1") +port = int(os.getenv("PATHWAY_PORT", 8666)) + +EMBED_MODEL = os.getenv("EMBED_MODEL", "BAAI/bge-base-en-v1.5") + +tei_embedding_endpoint = os.getenv("TEI_EMBEDDING_ENDPOINT") + + +@register_microservice( + name="opea_service@retriever_pathway", + service_type=ServiceType.RETRIEVER, + endpoint="/v1/retrieval", + host="0.0.0.0", + port=7000, +) +@traceable(run_type="retriever") +@register_statistics(names=["opea_service@retriever_pathway"]) +def retrieve(input: EmbedDoc) -> SearchedDoc: + start = time.time() + documents = pw_client.similarity_search(input.text, input.fetch_k) + + docs = [TextDoc(text=r.page_content) for r in documents] + + time_spent = time.time() - start + statistics_dict["opea_service@retriever_pathway"].append_latency(time_spent, None) # noqa: E501 + return SearchedDoc(retrieved_docs=docs, initial_query=input.text) + + +if __name__ == "__main__": + # Create the vectorstore client + pw_client = PathwayVectorClient(host=host, port=port) + opea_microservices["opea_service@retriever_pathway"].start() diff --git a/comps/retrievers/langchain/pgvector/docker/Dockerfile b/comps/retrievers/langchain/pgvector/docker/Dockerfile index 0b935d7a6a..84122d6291 100644 --- a/comps/retrievers/langchain/pgvector/docker/Dockerfile +++ b/comps/retrievers/langchain/pgvector/docker/Dockerfile @@ -1,4 +1,3 @@ - # Copyright (C) 2024 Intel Corporation # SPDX-License-Identifier: Apache-2.0 @@ -6,8 +5,7 @@ FROM langchain/langchain:latest RUN apt-get update -y && apt-get install -y --no-install-recommends --fix-missing \ libgl1-mesa-glx \ - libjemalloc-dev \ - vim + libjemalloc-dev RUN useradd -m -s /bin/bash user && \ mkdir -p /home/user && \ diff --git a/comps/retrievers/langchain/pinecone/docker/Dockerfile b/comps/retrievers/langchain/pinecone/docker/Dockerfile index dbb6d57c26..6d36c0f55f 100644 --- a/comps/retrievers/langchain/pinecone/docker/Dockerfile +++ b/comps/retrievers/langchain/pinecone/docker/Dockerfile @@ -1,4 +1,3 @@ - # Copyright (C) 2024 Intel Corporation # SPDX-License-Identifier: Apache-2.0 @@ -8,8 +7,7 @@ ARG ARCH="cpu" RUN apt-get update -y && apt-get install -y --no-install-recommends --fix-missing \ 
libgl1-mesa-glx \ - libjemalloc-dev \ - vim + libjemalloc-dev RUN useradd -m -s /bin/bash user && \ mkdir -p /home/user && \ @@ -28,4 +26,4 @@ ENV PYTHONPATH=$PYTHONPATH:/home/user WORKDIR /home/user/comps/retrievers/langchain/pinecone -ENTRYPOINT ["python", "retriever_pinecone.py"] \ No newline at end of file +ENTRYPOINT ["python", "retriever_pinecone.py"] diff --git a/comps/retrievers/langchain/redis/docker/Dockerfile b/comps/retrievers/langchain/redis/docker/Dockerfile index e3d519910a..1993e5fd90 100644 --- a/comps/retrievers/langchain/redis/docker/Dockerfile +++ b/comps/retrievers/langchain/redis/docker/Dockerfile @@ -1,4 +1,3 @@ - # Copyright (C) 2024 Intel Corporation # SPDX-License-Identifier: Apache-2.0 @@ -8,8 +7,7 @@ ARG ARCH="cpu" RUN apt-get update -y && apt-get install -y --no-install-recommends --fix-missing \ libgl1-mesa-glx \ - libjemalloc-dev \ - vim + libjemalloc-dev RUN useradd -m -s /bin/bash user && \ mkdir -p /home/user && \ diff --git a/comps/retrievers/llamaindex/docker/Dockerfile b/comps/retrievers/llamaindex/docker/Dockerfile index 7d9cd64be1..4b022718a8 100644 --- a/comps/retrievers/llamaindex/docker/Dockerfile +++ b/comps/retrievers/llamaindex/docker/Dockerfile @@ -1,13 +1,12 @@ - # Copyright (C) 2024 Intel Corporation # SPDX-License-Identifier: Apache-2.0 FROM ubuntu:22.04 RUN apt-get update -y && apt-get install -y --no-install-recommends --fix-missing \ + python3-pip \ libgl1-mesa-glx \ - libjemalloc-dev \ - vim + libjemalloc-dev RUN useradd -m -s /bin/bash user && \ mkdir -p /home/user && \ diff --git a/comps/vectorstores/README.md b/comps/vectorstores/README.md index bdccf5bc06..492ef970f5 100644 --- a/comps/vectorstores/README.md +++ b/comps/vectorstores/README.md @@ -17,3 +17,7 @@ For details, please refer to this [readme](langchain/pgvector/README.md) ## Vectorstores Microservice with Pinecone For details, please refer to this [readme](langchain/pinecone/README.md) + +## Vectorstores Microservice with Pathway + +For details, please refer to this [readme](langchain/pathway/README.md) diff --git a/comps/vectorstores/langchain/lancedb/README.md b/comps/vectorstores/langchain/lancedb/README.md new file mode 100644 index 0000000000..bfe01585c2 --- /dev/null +++ b/comps/vectorstores/langchain/lancedb/README.md @@ -0,0 +1,139 @@ +# LanceDB + +LanceDB is an embedded vector database for AI applications. It is open source and distributed with an Apache-2.0 license. + +LanceDB datasets are persisted to disk and can be shared in Python. 
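Before the LangChain-based examples below, here is a minimal standalone sketch of that embedded, on-disk workflow using the `lancedb` Python package; the path, table name, and toy vectors are illustrative only:

```python
# Illustrative sketch: create a local LanceDB table and run a vector search.
import lancedb

db = lancedb.connect("/tmp/lancedb-demo")  # data is persisted under this path
table = db.create_table(
    "vectors",
    data=[
        {"vector": [0.1, 0.2], "text": "Hello world", "id": 1},
        {"vector": [0.9, 0.8], "text": "Bye bye", "id": 2},
    ],
)

# Nearest-neighbour search against the stored rows
hits = table.search([0.1, 0.25]).limit(1).to_pandas()
print(hits["text"][0])  # expected: "Hello world"
```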
+ +## Setup + +```bash +npm install -S vectordb +``` + +## Usage + +### Create a new index from texts + +```python +import os +import tempfile +from langchain.vectorstores import LanceDB +from langchain.embeddings.openai import OpenAIEmbeddings +from vectordb import connect + + +async def run(): + dir = tempfile.mkdtemp(prefix="lancedb-") + db = await connect(dir) + table = await db.create_table("vectors", [{"vector": [0] * 1536, "text": "sample", "id": 1}]) + + vector_store = await LanceDB.from_texts( + ["Hello world", "Bye bye", "hello nice world"], + [{"id": 2}, {"id": 1}, {"id": 3}], + OpenAIEmbeddings(), + table=table, + ) + + result_one = await vector_store.similarity_search("hello world", 1) + print(result_one) + # [ Document(page_content='hello nice world', metadata={'id': 3}) ] + + +# Run the function +import asyncio + +asyncio.run(run()) +``` + +API Reference: + +- `LanceDB` from `@langchain/community/vectorstores/lancedb` +- `OpenAIEmbeddings` from `@langchain/openai` + +### Create a new index from a loader + +```python +import os +import tempfile +from langchain.vectorstores import LanceDB +from langchain.embeddings.openai import OpenAIEmbeddings +from langchain.document_loaders.fs import TextLoader +from vectordb import connect + +# Create docs with a loader +loader = TextLoader("src/document_loaders/example_data/example.txt") +docs = loader.load() + + +async def run(): + dir = tempfile.mkdtemp(prefix="lancedb-") + db = await connect(dir) + table = await db.create_table("vectors", [{"vector": [0] * 1536, "text": "sample", "source": "a"}]) + + vector_store = await LanceDB.from_documents(docs, OpenAIEmbeddings(), table=table) + + result_one = await vector_store.similarity_search("hello world", 1) + print(result_one) + # [ + # Document(page_content='Foo\nBar\nBaz\n\n', metadata={'source': 'src/document_loaders/example_data/example.txt'}) + # ] + + +# Run the function +import asyncio + +asyncio.run(run()) +``` + +API Reference: + +- `LanceDB` from `@langchain/community/vectorstores/lancedb` +- `OpenAIEmbeddings` from `@langchain/openai` +- `TextLoader` from `langchain/document_loaders/fs/text` + +### Open an existing dataset + +```python +import os +import tempfile +from langchain.vectorstores import LanceDB +from langchain.embeddings.openai import OpenAIEmbeddings +from vectordb import connect + + +async def run(): + uri = await create_test_db() + db = await connect(uri) + table = await db.open_table("vectors") + + vector_store = LanceDB(OpenAIEmbeddings(), table=table) + + result_one = await vector_store.similarity_search("hello world", 1) + print(result_one) + # [ Document(page_content='Hello world', metadata={'id': 1}) ] + + +async def create_test_db(): + dir = tempfile.mkdtemp(prefix="lancedb-") + db = await connect(dir) + await db.create_table( + "vectors", + [ + {"vector": [0] * 1536, "text": "Hello world", "id": 1}, + {"vector": [0] * 1536, "text": "Bye bye", "id": 2}, + {"vector": [0] * 1536, "text": "hello nice world", "id": 3}, + ], + ) + return dir + + +# Run the function +import asyncio + +asyncio.run(run()) +``` + +API Reference: + +- `LanceDB` from `@langchain/community/vectorstores/lancedb` +- `OpenAIEmbeddings` from `@langchain/openai` diff --git a/comps/vectorstores/langchain/pathway/Dockerfile b/comps/vectorstores/langchain/pathway/Dockerfile new file mode 100644 index 0000000000..31cd06a824 --- /dev/null +++ b/comps/vectorstores/langchain/pathway/Dockerfile @@ -0,0 +1,25 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +FROM 
pathwaycom/pathway:0.13.2-slim + +ENV DOCKER_BUILDKIT=1 +ENV PYTHONUNBUFFERED=1 + +RUN apt-get update && apt-get install -y \ + poppler-utils \ + libreoffice \ + libmagic-dev \ + && rm -rf /var/lib/apt/lists/* + +WORKDIR /app + +COPY requirements.txt /app/ + +RUN pip install --no-cache-dir -r requirements.txt + +COPY vectorstore_pathway.py /app/ + + +CMD ["python", "vectorstore_pathway.py"] + diff --git a/comps/vectorstores/langchain/pathway/README.md b/comps/vectorstores/langchain/pathway/README.md new file mode 100644 index 0000000000..fb0be01523 --- /dev/null +++ b/comps/vectorstores/langchain/pathway/README.md @@ -0,0 +1,84 @@ +# Start the Pathway Vector DB Server + +Set the environment variables for Pathway, and the embedding model. + +> Note: If you are using `TEI_EMBEDDING_ENDPOINT`, make sure embedding service is already running. +> See the instructions under [here](../../../retrievers/langchain/pathway/README.md) + +```bash +export PATHWAY_HOST=0.0.0.0 +export PATHWAY_PORT=8666 +# TEI_EMBEDDING_ENDPOINT="http://${your_ip}:6060" # uncomment if you want to use the hosted embedding service, example: "http://127.0.0.1:6060" +``` + +## Configuration + +### Setting up the Pathway data sources + +Pathway can listen to many sources simultaneously, such as local files, S3 folders, cloud storage, and any data stream. Whenever a new file is added or an existing file is modified, Pathway parses, chunks and indexes the documents in real-time. + +See [pathway-io](https://pathway.com/developers/api-docs/pathway-io) for more information. + +You can easily connect to the data inside the folder with the Pathway file system connector. The data will automatically be updated by Pathway whenever the content of the folder changes. In this example, we create a single data source that reads the files under the `./data` folder. + +You can manage your data sources by configuring the `data_sources` in `vectorstore_pathway.py`. + +```python +import pathway as pw + +data = pw.io.fs.read( + "./data", + format="binary", + mode="streaming", + with_metadata=True, +) # This creates a Pathway connector that tracks +# all the files in the ./data directory + +data_sources = [data] +``` + +### Other configs (parser, splitter and the embedder) + +Pathway vectorstore handles the ingestion and processing of the documents. +This allows you to configure the parser, splitter and the embedder. +Whenever a file is added or modified in one of the sources, Pathway will automatically ingest the file. + +By default, `ParseUnstructured` parser, `langchain.text_splitter.CharacterTextSplitter` splitter and `BAAI/bge-base-en-v1.5` embedder are used. + +For more information, see the relevant Pathway docs: + +- [Vector store docs](https://pathway.com/developers/api-docs/pathway-xpacks-llm/vectorstore) +- [parsers docs](https://pathway.com/developers/api-docs/pathway-xpacks-llm/parsers) +- [splitters docs](https://pathway.com/developers/api-docs/pathway-xpacks-llm/splitters) +- [embedders docs](https://pathway.com/developers/api-docs/pathway-xpacks-llm/embedders) + +## Building and running + +Build the Docker and run the Pathway Vector Store: + +```bash +cd comps/vectorstores/langchain/pathway + +docker build --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -t opea/vectorstore-pathway:latest . + +# with locally loaded model, you may add `EMBED_MODEL` env variable to configure the model. 
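+# e.g. append `-e EMBED_MODEL=BAAI/bge-base-en-v1.5` (the default in vectorstore_pathway.py) to the command below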
+docker run -e PATHWAY_HOST=${PATHWAY_HOST} -e PATHWAY_PORT=${PATHWAY_PORT} -e http_proxy=$http_proxy -e https_proxy=$https_proxy -v ./data:/app/data -p ${PATHWAY_PORT}:${PATHWAY_PORT} opea/vectorstore-pathway:latest + +# with the hosted embedder (network argument is needed for the vector server to reach to the embedding service) +docker run -e PATHWAY_HOST=${PATHWAY_HOST} -e PATHWAY_PORT=${PATHWAY_PORT} -e TEI_EMBEDDING_ENDPOINT=${TEI_EMBEDDING_ENDPOINT} -e http_proxy=$http_proxy -e https_proxy=$https_proxy -v ./data:/app/data -p ${PATHWAY_PORT}:${PATHWAY_PORT} --network="host" opea/vectorstore-pathway:latest +``` + +## Health check the vector store + +Wait until the server finishes indexing the docs, and send the following request to check it. + +```bash +curl -X 'POST' \ + "http://$PATHWAY_HOST:$PATHWAY_PORT/v1/statistics" \ + -H 'accept: */*' \ + -H 'Content-Type: application/json' +``` + +This should respond with something like: + +> `{"file_count": 1, "last_indexed": 1724325093, "last_modified": 1724317365}` diff --git a/comps/vectorstores/langchain/pathway/data/nke-10k-2023.pdf b/comps/vectorstores/langchain/pathway/data/nke-10k-2023.pdf new file mode 100644 index 0000000000..6ade8863e8 Binary files /dev/null and b/comps/vectorstores/langchain/pathway/data/nke-10k-2023.pdf differ diff --git a/comps/vectorstores/langchain/pathway/requirements.txt b/comps/vectorstores/langchain/pathway/requirements.txt new file mode 100644 index 0000000000..3d88eddf6c --- /dev/null +++ b/comps/vectorstores/langchain/pathway/requirements.txt @@ -0,0 +1,4 @@ +langchain_openai +pathway[xpack-llm] >= 0.14.1 +sentence_transformers +unstructured[all-docs] >= 0.10.28,<0.15 diff --git a/comps/vectorstores/langchain/pathway/vectorstore_pathway.py b/comps/vectorstores/langchain/pathway/vectorstore_pathway.py new file mode 100644 index 0000000000..c6cac04b7a --- /dev/null +++ b/comps/vectorstores/langchain/pathway/vectorstore_pathway.py @@ -0,0 +1,62 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +import logging +import os + +import pathway as pw +from langchain import text_splitter +from langchain_community.embeddings import HuggingFaceBgeEmbeddings, HuggingFaceHubEmbeddings +from pathway.xpacks.llm.parsers import ParseUnstructured +from pathway.xpacks.llm.vector_store import VectorStoreServer + +logging.basicConfig( + level=logging.INFO, + format="%(asctime)s %(name)s %(levelname)s %(message)s", + datefmt="%Y-%m-%d %H:%M:%S", +) + +# This creates a Pathway connector that tracks all the files in the `data/` directory. +# Additions and modifications will be reflected on the index automatically. 
+ +data = pw.io.fs.read( + "./data", + format="binary", + mode="streaming", + with_metadata=True, +) + +data_sources = [data] + +splitter = text_splitter.TokenTextSplitter(chunk_size=450, chunk_overlap=50) + +host = os.getenv("PATHWAY_HOST", "127.0.0.1") +port = int(os.getenv("PATHWAY_PORT", 8666)) + +EMBED_MODEL = os.getenv("EMBED_MODEL", "BAAI/bge-base-en-v1.5") + +tei_embedding_endpoint = os.getenv("TEI_EMBEDDING_ENDPOINT") + +if __name__ == "__main__": + # Create vectorstore + if tei_embedding_endpoint: + # create embeddings using TEI endpoint service + logging.info(f"Initializing the embedder from tei_embedding_endpoint: {tei_embedding_endpoint}") + embeddings = HuggingFaceHubEmbeddings(model=tei_embedding_endpoint) + else: + # create embeddings using local embedding model + embeddings = HuggingFaceBgeEmbeddings(model_name=EMBED_MODEL) + + server = VectorStoreServer.from_langchain_components( + *data_sources, + embedder=embeddings, + parser=ParseUnstructured(), + splitter=splitter, + ) + + server.run_server( + host, + port=port, + with_cache=True, + cache_backend=pw.persistence.Backend.filesystem("./Cache"), + ) diff --git a/comps/web_retrievers/langchain/chroma/docker/Dockerfile b/comps/web_retrievers/langchain/chroma/docker/Dockerfile index c391fefe2d..a5a65bbd5d 100644 --- a/comps/web_retrievers/langchain/chroma/docker/Dockerfile +++ b/comps/web_retrievers/langchain/chroma/docker/Dockerfile @@ -2,16 +2,19 @@ # SPDX-License-Identifier: Apache-2.0 FROM langchain/langchain:latest + +ARG ARCH="cpu" # Set this to "cpu" or "gpu" + RUN useradd -m -s /bin/bash user && \ mkdir -p /home/user && \ chown -R user /home/user/ -ARG ARCH="cpu" # Set this to "cpu" or "gpu" RUN apt-get update -y && apt-get install -y --no-install-recommends --fix-missing \ libgl1-mesa-glx \ - libjemalloc-dev \ - vim + libjemalloc-dev + USER user + COPY comps /home/user/comps RUN pip install --no-cache-dir --upgrade pip && \ diff --git a/tests/test_dataprep_milvus.sh b/tests/test_dataprep_milvus.sh index 727ef81e11..d9d9b6cab1 100644 --- a/tests/test_dataprep_milvus.sh +++ b/tests/test_dataprep_milvus.sh @@ -33,16 +33,14 @@ function start_service() { # start milvus vector db mkdir $WORKPATH/milvus cd $WORKPATH/milvus - wget https://raw.githubusercontent.com/milvus-io/milvus/v2.4.6/configs/milvus.yaml - wget https://github.com/milvus-io/milvus/releases/download/v2.4.6/milvus-standalone-docker-compose.yml -O docker-compose.yml + wget https://raw.githubusercontent.com/milvus-io/milvus/v2.4.9/configs/milvus.yaml + wget https://github.com/milvus-io/milvus/releases/download/v2.4.9/milvus-standalone-docker-compose.yml -O docker-compose.yml sed '/- \${DOCKER_VOLUME_DIRECTORY:-\.}\/volumes\/milvus:\/var\/lib\/milvus/a \ \ \ \ \ \ - \${DOCKER_VOLUME_DIRECTORY:-\.}\/milvus.yaml:\/milvus\/configs\/milvus.yaml' -i docker-compose.yml docker compose up -d # set service ports mosec_embedding_port=5021 dataprep_service_port=5022 - dataprep_file_service_port=5023 - dataprep_del_service_port=5024 # start mosec embedding service docker run -d --name="test-comps-dataprep-milvus-mosec-server" -p $mosec_embedding_port:8000 -e http_proxy=$http_proxy -e https_proxy=$https_proxy opea/langchain-mosec:comps @@ -50,69 +48,89 @@ function start_service() { # start dataprep service MOSEC_EMBEDDING_ENDPOINT="http://${ip_address}:${mosec_embedding_port}" MILVUS=${ip_address} - docker run -d --name="test-comps-dataprep-milvus-server" -p ${dataprep_service_port}:6010 -p ${dataprep_file_service_port}:6011 -p ${dataprep_del_service_port}:6012 -e 
http_proxy=$http_proxy -e https_proxy=$https_proxy -e no_proxy=$no_proxy -e MOSEC_EMBEDDING_ENDPOINT=${MOSEC_EMBEDDING_ENDPOINT} -e MILVUS=${MILVUS} --ipc=host opea/dataprep-milvus:comps + docker run -d --name="test-comps-dataprep-milvus-server" -p ${dataprep_service_port}:6010 -e http_proxy=$http_proxy -e https_proxy=$https_proxy -e no_proxy=$no_proxy -e MOSEC_EMBEDDING_ENDPOINT=${MOSEC_EMBEDDING_ENDPOINT} -e MILVUS=${MILVUS} -e LOGFLAG=true --ipc=host opea/dataprep-milvus:comps sleep 1m } -function validate_microservice() { - cd $LOG_PATH - - # test /v1/dataprep - dataprep_service_port=5022 - URL="http://${ip_address}:$dataprep_service_port/v1/dataprep" - echo "Deep learning is a subset of machine learning that utilizes neural networks with multiple layers to analyze various levels of abstract data representations. It enables computers to identify patterns and make decisions with minimal human intervention by learning from large amounts of data." > $LOG_PATH/dataprep_file.txt - HTTP_STATUS=$(curl -s -o /dev/null -w "%{http_code}" -X POST -F 'files=@./dataprep_file.txt' -H 'Content-Type: multipart/form-data' "$URL") - if [ "$HTTP_STATUS" -eq 200 ]; then - echo "[ dataprep ] HTTP status is 200. Checking content..." - cp ./dataprep_file.txt ./dataprep_file2.txt - local CONTENT=$(curl -s -X POST -F 'files=@./dataprep_file2.txt' -H 'Content-Type: multipart/form-data' "$URL" | tee ${LOG_PATH}/dataprep.log) - - if echo "$CONTENT" | grep -q "Data preparation succeeded"; then - echo "[ dataprep ] Content is as expected." - else - echo "[ dataprep ] Content does not match the expected result: $CONTENT" - docker logs test-comps-dataprep-milvus-server >> ${LOG_PATH}/dataprep.log - exit 1 - fi +function validate_service() { + local URL="$1" + local EXPECTED_RESULT="$2" + local SERVICE_NAME="$3" + local DOCKER_NAME="$4" + local INPUT_DATA="$5" + + if [[ $SERVICE_NAME == *"dataprep_upload_file"* ]]; then + cd $LOG_PATH + HTTP_RESPONSE=$(curl --silent --write-out "HTTPSTATUS:%{http_code}" -X POST -F 'files=@./dataprep_file.txt' -H 'Content-Type: multipart/form-data' "$URL") + elif [[ $SERVICE_NAME == *"dataprep_upload_link"* ]]; then + HTTP_RESPONSE=$(curl --silent --write-out "HTTPSTATUS:%{http_code}" -X POST -F 'link_list=["https://www.ces.tech/"]' -F "chunk_size=500" "$URL") + elif [[ $SERVICE_NAME == *"dataprep_get"* ]]; then + HTTP_RESPONSE=$(curl --silent --write-out "HTTPSTATUS:%{http_code}" -X POST -H 'Content-Type: application/json' "$URL") + elif [[ $SERVICE_NAME == *"dataprep_del"* ]]; then + HTTP_RESPONSE=$(curl --silent --write-out "HTTPSTATUS:%{http_code}" -X POST -d '{"file_path": "all"}' -H 'Content-Type: application/json' "$URL") else - echo "[ dataprep ] HTTP status is not 200. Received status was $HTTP_STATUS" - docker logs test-comps-dataprep-milvus-server >> ${LOG_PATH}/dataprep.log - exit 1 + HTTP_RESPONSE=$(curl --silent --write-out "HTTPSTATUS:%{http_code}" -X POST -d "$INPUT_DATA" -H 'Content-Type: application/json' "$URL") fi + HTTP_STATUS=$(echo $HTTP_RESPONSE | tr -d '\n' | sed -e 's/.*HTTPSTATUS://') + RESPONSE_BODY=$(echo $HTTP_RESPONSE | sed -e 's/HTTPSTATUS\:.*//g') - # test /v1/dataprep/get_file - dataprep_file_service_port=5023 - URL="http://${ip_address}:$dataprep_file_service_port/v1/dataprep/get_file" - HTTP_STATUS=$(curl -s -o /dev/null -w "%{http_code}" -X POST -H 'Content-Type: application/json' "$URL") - if [ "$HTTP_STATUS" -eq 200 ]; then - echo "[ dataprep - file ] HTTP status is 200. Checking content..." 
- local CONTENT=$(curl -s -X POST -H 'Content-Type: application/json' "$URL" | tee ${LOG_PATH}/dataprep_file.log) - - if echo "$CONTENT" | grep -q '{"name":'; then - echo "[ dataprep - file ] Content is as expected." - else - echo "[ dataprep - file ] Content does not match the expected result: $CONTENT" - docker logs test-comps-dataprep-milvus-server >> ${LOG_PATH}/dataprep_file.log - exit 1 + docker logs ${DOCKER_NAME} >> ${LOG_PATH}/${SERVICE_NAME}.log + + # check response status + if [ "$HTTP_STATUS" -ne "200" ]; then + echo "[ $SERVICE_NAME ] HTTP status is not 200. Received status was $HTTP_STATUS" + ##################### + if [[ $SERVICE_NAME == *"dataprep_upload_link"* ]]; then + docker logs test-comps-dataprep-milvus-mosec-server >> ${LOG_PATH}/mosec-embedding.log fi + exit 1 else - echo "[ dataprep - file ] HTTP status is not 200. Received status was $HTTP_STATUS" - docker logs test-comps-dataprep-milvus-server >> ${LOG_PATH}/dataprep_file.log + echo "[ $SERVICE_NAME ] HTTP status is 200. Checking content..." + fi + # check response body + if [[ "$RESPONSE_BODY" != *"$EXPECTED_RESULT"* ]]; then + echo "[ $SERVICE_NAME ] Content does not match the expected result: $RESPONSE_BODY" exit 1 + else + echo "[ $SERVICE_NAME ] Content is as expected." fi + sleep 5s +} + +function validate_microservice() { + cd $LOG_PATH + dataprep_service_port=5022 + # test /v1/dataprep/delete_file - dataprep_del_service_port=5024 - URL="http://${ip_address}:$dataprep_del_service_port/v1/dataprep/delete_file" - HTTP_STATUS=$(curl -s -o /dev/null -w "%{http_code}" -X POST -d '{"file_path": "dataprep_file.txt"}' -H 'Content-Type: application/json' "$URL") - if [ "$HTTP_STATUS" -eq 200 ]; then - echo "[ dataprep - del ] HTTP status is 200." - docker logs test-comps-dataprep-milvus-server >> ${LOG_PATH}/dataprep_del.log - else - echo "[ dataprep - del ] HTTP status is not 200. Received status was $HTTP_STATUS" - docker logs test-comps-dataprep-milvus-server >> ${LOG_PATH}/dataprep_del.log - exit 1 - fi + validate_service \ + "http://${ip_address}:${dataprep_service_port}/v1/dataprep/delete_file" \ + '{"status":true}' \ + "dataprep_del" \ + "test-comps-dataprep-milvus-server" + + # test /v1/dataprep upload file + echo "Deep learning is a subset of machine learning that utilizes neural networks with multiple layers to analyze various levels of abstract data representations. It enables computers to identify patterns and make decisions with minimal human intervention by learning from large amounts of data." 
> $LOG_PATH/dataprep_file.txt + validate_service \ + "http://${ip_address}:${dataprep_service_port}/v1/dataprep" \ + "Data preparation succeeded" \ + "dataprep_upload_file" \ + "test-comps-dataprep-milvus-server" + + # test /v1/dataprep upload link + validate_service \ + "http://${ip_address}:${dataprep_service_port}/v1/dataprep" \ + "Data preparation succeeded" \ + "dataprep_upload_link" \ + "test-comps-dataprep-milvus-server" + + # test /v1/dataprep/get_file + validate_service \ + "http://${ip_address}:${dataprep_service_port}/v1/dataprep/get_file" \ + '{"name":' \ + "dataprep_get" \ + "test-comps-dataprep-milvus-server" + } function stop_docker() { diff --git a/tests/test_dataprep_redis_multimodal_langchain.sh b/tests/test_dataprep_redis_multimodal_langchain.sh new file mode 100644 index 0000000000..e5a75f8604 --- /dev/null +++ b/tests/test_dataprep_redis_multimodal_langchain.sh @@ -0,0 +1,278 @@ +#!/bin/bash +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +set -x + +WORKPATH=$(dirname "$PWD") +LOG_PATH="$WORKPATH/tests" +ip_address=$(hostname -I | awk '{print $1}') +LVM_PORT=5028 +LVM_ENDPOINT="http://${ip_address}:${LVM_PORT}/v1/lvm" +WHISPER_MODEL="base" +INDEX_NAME="dataprep" +video_name="WeAreGoingOnBullrun" +transcript_fn="${video_name}.vtt" +video_fn="${video_name}.mp4" + +function build_docker_images() { + cd $WORKPATH + echo $(pwd) + docker build --no-cache -t opea/dataprep-redis:comps --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/dataprep/redis/multimodal_langchain/docker/Dockerfile . + + if [ $? -ne 0 ]; then + echo "opea/dataprep-redis built fail" + exit 1 + else + echo "opea/dataprep-redis built successful" + fi +} + +function build_lvm_docker_images() { + cd $WORKPATH + echo $(pwd) + docker build --no-cache -t opea/llava:comps -f comps/lvms/llava/Dockerfile . + if [ $? -ne 0 ]; then + echo "opea/llava built fail" + exit 1 + else + echo "opea/llava built successful" + fi + docker build --no-cache -t opea/lvm:comps -f comps/lvms/Dockerfile . + if [ $? 
-ne 0 ]; then + echo "opea/lvm built fail" + exit 1 + else + echo "opea/lvm built successful" + fi +} + +function start_lvm_service() { + unset http_proxy + docker run -d --name="test-comps-lvm-llava" -e http_proxy=$http_proxy -e https_proxy=$https_proxy -p 5029:8399 --ipc=host opea/llava:comps + docker run -d --name="test-comps-lvm" -e LVM_ENDPOINT=http://$ip_address:5029 -e http_proxy=$http_proxy -e https_proxy=$https_proxy -p ${LVM_PORT}:9399 --ipc=host opea/lvm:comps + sleep 5m +} + +function start_lvm() { + cd $WORKPATH + echo $(pwd) + echo "Building LVM Docker Images" + build_lvm_docker_images + echo "Starting LVM Services" + start_lvm_service + +} + +function start_service() { + # start redis + echo "Starting Redis server" + REDIS_PORT=6380 + docker run -d --name="test-comps-dataprep-redis-multimodal-langchain" -e http_proxy=$http_proxy -e https_proxy=$https_proxy -p $REDIS_PORT:6379 -p 8002:8001 --ipc=host redis/redis-stack:7.2.0-v9 + + # start dataprep microservice + echo "Starting dataprep microservice" + dataprep_service_port=5013 + REDIS_URL="redis://${ip_address}:${REDIS_PORT}" + docker run -d --name="test-comps-dataprep-redis-multimodal-langchain-server" -e http_proxy=$http_proxy -e https_proxy=$https_proxy -e REDIS_URL=$REDIS_URL -e INDEX_NAME=$INDEX_NAME -e LVM_ENDPOINT=$LVM_ENDPOINT -p ${dataprep_service_port}:6007 --runtime=runc --ipc=host opea/dataprep-redis:comps + + sleep 1m +} + +function prepare_data() { + echo "Prepare Transcript .vtt" + cd ${LOG_PATH} + echo $(pwd) + echo """WEBVTT + +00:00:00.000 --> 00:00:03.400 +Last year the smoking tire went on the bull run live rally in the + +00:00:03.400 --> 00:00:09.760 +2010 Ford SBT Raptor. I liked it so much. I bought one. Here it is. We're going back + +00:00:09.760 --> 00:00:12.920 +to bull run this year of course we'll help from our friends at Black Magic and + +00:00:12.920 --> 00:00:19.560 +we're so serious about it. We got two Valentine one radar detectors. Oh yeah. + +00:00:19.560 --> 00:00:23.760 +So we're all set up and the reason we got two is because we're going to be going + +00:00:23.760 --> 00:00:29.920 +a little bit faster. We got a 2011 Shelby GT500. The 550 horsepower + +00:00:29.920 --> 00:00:34.560 +all-luminum V8. We are going to be right in the action bringing you guys a video + +00:00:34.560 --> 00:00:40.120 +every single day live from the bull run rally July 9th to 16th and the only + +00:00:40.120 --> 00:00:45.240 +place to watch it is on BlackmagicShine.com. We're right here on the smoking + +00:00:45.240 --> 00:00:47.440 +tire.""" > ${transcript_fn} + + echo "Downloading Video" + wget http://commondatastorage.googleapis.com/gtv-videos-bucket/sample/WeAreGoingOnBullrun.mp4 -O ${video_fn} + +} + +function validate_microservice() { + cd $LOG_PATH + + # test v1/generate_transcripts upload file + echo "Testing generate_transcripts API" + URL="http://${ip_address}:$dataprep_service_port/v1/generate_transcripts" + HTTP_RESPONSE=$(curl --silent --write-out "HTTPSTATUS:%{http_code}" -X POST -F "files=@./$video_fn" -H 'Content-Type: multipart/form-data' "$URL") + HTTP_STATUS=$(echo $HTTP_RESPONSE | tr -d '\n' | sed -e 's/.*HTTPSTATUS://') + RESPONSE_BODY=$(echo $HTTP_RESPONSE | sed -e 's/HTTPSTATUS\:.*//g') + SERVICE_NAME="dataprep - upload - file" + + if [ "$HTTP_STATUS" -ne "200" ]; then + echo "[ $SERVICE_NAME ] HTTP status is not 200. 
Received status was $HTTP_STATUS" + docker logs test-comps-dataprep-redis-multimodal-langchain-server >> ${LOG_PATH}/dataprep_upload_file.log + exit 1 + else + echo "[ $SERVICE_NAME ] HTTP status is 200. Checking content..." + fi + if [[ "$RESPONSE_BODY" != *"Data preparation succeeded"* ]]; then + echo "[ $SERVICE_NAME ] Content does not match the expected result: $RESPONSE_BODY" + docker logs test-comps-dataprep-redis-multimodal-langchain-server >> ${LOG_PATH}/dataprep_upload_file.log + exit 1 + else + echo "[ $SERVICE_NAME ] Content is as expected." + fi + + # test v1/videos_with_transcripts upload file + echo "Testing videos_with_transcripts API" + URL="http://${ip_address}:$dataprep_service_port/v1/videos_with_transcripts" + + HTTP_RESPONSE=$(curl --silent --write-out "HTTPSTATUS:%{http_code}" -X POST -F "files=@./$video_fn" -F "files=@./$transcript_fn" -H 'Content-Type: multipart/form-data' "$URL") + HTTP_STATUS=$(echo $HTTP_RESPONSE | tr -d '\n' | sed -e 's/.*HTTPSTATUS://') + RESPONSE_BODY=$(echo $HTTP_RESPONSE | sed -e 's/HTTPSTATUS\:.*//g') + SERVICE_NAME="dataprep - upload - file" + + if [ "$HTTP_STATUS" -ne "200" ]; then + echo "[ $SERVICE_NAME ] HTTP status is not 200. Received status was $HTTP_STATUS" + docker logs test-comps-dataprep-redis-multimodal-langchain-server >> ${LOG_PATH}/dataprep_upload_file.log + exit 1 + else + echo "[ $SERVICE_NAME ] HTTP status is 200. Checking content..." + fi + if [[ "$RESPONSE_BODY" != *"Data preparation succeeded"* ]]; then + echo "[ $SERVICE_NAME ] Content does not match the expected result: $RESPONSE_BODY" + docker logs test-comps-dataprep-redis-multimodal-langchain-server >> ${LOG_PATH}/dataprep_upload_file.log + exit 1 + else + echo "[ $SERVICE_NAME ] Content is as expected." + fi + + # test v1/generate_captions upload file + echo "Testing generate_captions API" + URL="http://${ip_address}:$dataprep_service_port/v1/generate_captions" + + HTTP_RESPONSE=$(curl --silent --write-out "HTTPSTATUS:%{http_code}" -X POST -F "files=@./$video_fn" -H 'Content-Type: multipart/form-data' "$URL") + HTTP_STATUS=$(echo $HTTP_RESPONSE | tr -d '\n' | sed -e 's/.*HTTPSTATUS://') + RESPONSE_BODY=$(echo $HTTP_RESPONSE | sed -e 's/HTTPSTATUS\:.*//g') + SERVICE_NAME="dataprep - upload - file" + + if [ "$HTTP_STATUS" -ne "200" ]; then + echo "[ $SERVICE_NAME ] HTTP status is not 200. Received status was $HTTP_STATUS" + docker logs test-comps-dataprep-redis-multimodal-langchain-server >> ${LOG_PATH}/dataprep_upload_file.log + exit 1 + else + echo "[ $SERVICE_NAME ] HTTP status is 200. Checking content..." + fi + if [[ "$RESPONSE_BODY" != *"Data preparation succeeded"* ]]; then + echo "[ $SERVICE_NAME ] Content does not match the expected result: $RESPONSE_BODY" + docker logs test-comps-dataprep-redis-multimodal-langchain-server >> ${LOG_PATH}/dataprep_upload_file.log + exit 1 + else + echo "[ $SERVICE_NAME ] Content is as expected." + fi + + + + # test /v1/dataprep/get_videos + echo "Testing get_videos API" + URL="http://${ip_address}:$dataprep_service_port/v1/dataprep/get_videos" + HTTP_RESPONSE=$(curl --silent --write-out "HTTPSTATUS:%{http_code}" -X POST "$URL") + HTTP_STATUS=$(echo $HTTP_RESPONSE | tr -d '\n' | sed -e 's/.*HTTPSTATUS://') + RESPONSE_BODY=$(echo $HTTP_RESPONSE | sed -e 's/HTTPSTATUS\:.*//g') + SERVICE_NAME="dataprep - get" + + if [ "$HTTP_STATUS" -ne "200" ]; then + echo "[ $SERVICE_NAME ] HTTP status is not 200. 
Received status was $HTTP_STATUS" + docker logs test-comps-dataprep-redis-multimodal-langchain-server >> ${LOG_PATH}/dataprep_file.log + exit 1 + else + echo "[ $SERVICE_NAME ] HTTP status is 200. Checking content..." + fi + if [[ "$RESPONSE_BODY" != *${video_name}* ]]; then + echo "[ $SERVICE_NAME ] Content does not match the expected result: $RESPONSE_BODY" + docker logs test-comps-dataprep-redis-multimodal-langchain-server >> ${LOG_PATH}/dataprep_file.log + exit 1 + else + echo "[ $SERVICE_NAME ] Content is as expected." + fi + + # test /v1/dataprep/delete_videos + echo "Testing delete_videos API" + URL="http://${ip_address}:$dataprep_service_port/v1/dataprep/delete_videos" + HTTP_RESPONSE=$(curl --silent --write-out "HTTPSTATUS:%{http_code}" -X POST -d '{"file_path": "dataprep_file.txt"}' -H 'Content-Type: application/json' "$URL") + HTTP_STATUS=$(echo $HTTP_RESPONSE | tr -d '\n' | sed -e 's/.*HTTPSTATUS://') + RESPONSE_BODY=$(echo $HTTP_RESPONSE | sed -e 's/HTTPSTATUS\:.*//g') + SERVICE_NAME="dataprep - del" + + # check response status + if [ "$HTTP_STATUS" -ne "200" ]; then + echo "[ $SERVICE_NAME ] HTTP status is not 200. Received status was $HTTP_STATUS" + docker logs test-comps-dataprep-redis-multimodal-langchain-server >> ${LOG_PATH}/dataprep_del.log + exit 1 + else + echo "[ $SERVICE_NAME ] HTTP status is 200. Checking content..." + fi + # check response body + if [[ "$RESPONSE_BODY" != *'{"status":true}'* ]]; then + echo "[ $SERVICE_NAME ] Content does not match the expected result: $RESPONSE_BODY" + docker logs test-comps-dataprep-redis-multimodal-langchain-server >> ${LOG_PATH}/dataprep_del.log + exit 1 + else + echo "[ $SERVICE_NAME ] Content is as expected." + fi +} + +function stop_docker() { + cid=$(docker ps -aq --filter "name=test-comps-dataprep-redis-multimodal-langchain*") + if [[ ! -z "$cid" ]]; then docker stop $cid && docker rm $cid && sleep 1s; fi + cid=$(docker ps -aq --filter "name=test-comps-lvm*") + if [[ ! -z "$cid" ]]; then docker stop $cid && docker rm $cid && sleep 1s; fi + +} + +function delete_data() { + cd ${LOG_PATH} + rm -rf WeAreGoingOnBullrun.vtt + rm -rf WeAreGoingOnBullrun.mp4 + sleep 1s +} + +function main() { + + stop_docker + start_lvm + build_docker_images + start_service + prepare_data + + validate_microservice + delete_data + stop_docker + # echo y | docker system prune + +} + +main diff --git a/tests/test_embeddings_llama_index.sh b/tests/test_embeddings_llama_index.sh index 0487260441..81eac442ba 100644 --- a/tests/test_embeddings_llama_index.sh +++ b/tests/test_embeddings_llama_index.sh @@ -11,12 +11,12 @@ ip_address=$(hostname -I | awk '{print $1}') function build_docker_images() { cd $WORKPATH echo $(pwd) - docker build --no-cache -t opea/embedding-tei-llamaindex:comps --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/embeddings/llama_index/docker/Dockerfile . + docker build --no-cache -t opea/embedding-tei-llama-index:comps --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/embeddings/llama_index/docker/Dockerfile . if [ $? 
-ne 0 ]; then - echo "opea/embedding-tei-llamaindex built fail" + echo "opea/embedding-tei-llama-index built fail" exit 1 else - echo "opea/embedding-tei-llamaindex built successful" + echo "opea/embedding-tei-llama-index built successful" fi } @@ -24,17 +24,17 @@ function start_service() { tei_endpoint=5001 model="BAAI/bge-large-en-v1.5" revision="refs/pr/5" - docker run -d --name="test-comps-embedding-tei-llamaindex-endpoint" -p $tei_endpoint:80 -v ./data:/data -e http_proxy=$http_proxy -e https_proxy=$https_proxy --pull always ghcr.io/huggingface/text-embeddings-inference:cpu-1.2 --model-id $model --revision $revision + docker run -d --name="test-comps-embedding-tei-llama-index-endpoint" -p $tei_endpoint:80 -v ./data:/data -e http_proxy=$http_proxy -e https_proxy=$https_proxy --pull always ghcr.io/huggingface/text-embeddings-inference:cpu-1.2 --model-id $model --revision $revision export TEI_EMBEDDING_ENDPOINT="http://${ip_address}:${tei_endpoint}" tei_service_port=5010 - docker run -d --name="test-comps-embedding-tei-llamaindex-server" -e http_proxy=$http_proxy -e https_proxy=$https_proxy -p ${tei_service_port}:6000 --ipc=host -e TEI_EMBEDDING_ENDPOINT=$TEI_EMBEDDING_ENDPOINT opea/embedding-tei-llamaindex:comps + docker run -d --name="test-comps-embedding-tei-llama-index-server" -e http_proxy=$http_proxy -e https_proxy=$https_proxy -p ${tei_service_port}:6000 --ipc=host -e TEI_EMBEDDING_ENDPOINT=$TEI_EMBEDDING_ENDPOINT opea/embedding-tei-llama-index:comps sleep 3m } function validate_microservice() { tei_service_port=5010 URL="http://${ip_address}:$tei_service_port/v1/embeddings" - docker logs test-comps-embedding-tei-llamaindex-server >> ${LOG_PATH}/embedding.log + docker logs test-comps-embedding-tei-llama-index-server >> ${LOG_PATH}/embedding.log HTTP_STATUS=$(curl -s -o /dev/null -w "%{http_code}" -X POST -d '{"text":"What is Deep Learning?"}' -H 'Content-Type: application/json' "$URL") if [ "$HTTP_STATUS" -eq 200 ]; then echo "[ embedding - llama_index ] HTTP status is 200. Checking content..." @@ -44,12 +44,12 @@ function validate_microservice() { echo "[ embedding - llama_index ] Content is as expected." else echo "[ embedding - llama_index ] Content does not match the expected result: $CONTENT" - docker logs test-comps-embedding-tei-llamaindex-server >> ${LOG_PATH}/embedding.log + docker logs test-comps-embedding-tei-llama-index-server >> ${LOG_PATH}/embedding.log exit 1 fi else echo "[ embedding - llama_index ] HTTP status is not 200. Received status was $HTTP_STATUS" - docker logs test-comps-embedding-tei-llamaindex-server >> ${LOG_PATH}/embedding.log + docker logs test-comps-embedding-tei-llama-index-server >> ${LOG_PATH}/embedding.log exit 1 fi } diff --git a/tests/test_intent_detection_langchain.sh b/tests/test_intent_detection_langchain.sh new file mode 100644 index 0000000000..45910ca7f8 --- /dev/null +++ b/tests/test_intent_detection_langchain.sh @@ -0,0 +1,57 @@ +#!/bin/bash +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +set -xe + +WORKPATH=$(dirname "$PWD") +ip_address=$(hostname -I | awk '{print $1}') +function build_docker_images() { + cd $WORKPATH + docker build --no-cache -t opea/llm-tgi:latest -f comps/intent_detection/langchain/Dockerfile . +} + +function start_service() { + tgi_endpoint=5004 + # Remember to set HF_TOKEN before invoking this test! 
+ export HUGGINGFACEHUB_API_TOKEN=${HF_TOKEN} + model=Intel/neural-chat-7b-v3-3 + docker run -d --name="test-comps-intent-tgi-endpoint" -p $tgi_endpoint:80 -v ./data:/data --shm-size 1g ghcr.io/huggingface/text-generation-inference:1.4 --model-id $model + + export TGI_LLM_ENDPOINT="http://${ip_address}:${tgi_endpoint}" + tei_service_port=5005 + unset http_proxy + docker run -d --name="test-comps-intent-tei-server" -p ${tei_service_port}:9000 --ipc=host -e http_proxy=$http_proxy -e https_proxy=$https_proxy -e TGI_LLM_ENDPOINT=$TGI_LLM_ENDPOINT -e HUGGINGFACEHUB_API_TOKEN=$HUGGINGFACEHUB_API_TOKEN opea/llm-tgi:latest + sleep 3m +} + +function validate_microservice() { + tei_service_port=5005 + http_proxy="" curl http://${ip_address}:${tei_service_port}/v1/chat/intent\ + -X POST \ + -d '{"query":"What is Deep Learning?","max_new_tokens":10,"top_k":1,"temperature":0.001,"streaming":false}' \ + -H 'Content-Type: application/json' + docker logs test-comps-intent-tei-server + docker logs test-comps-intent-tgi-endpoint +} + +function stop_docker() { + cid=$(docker ps -aq --filter "name=test-comps-intent*") + if [[ ! -z "$cid" ]]; then docker stop $cid && docker rm $cid && sleep 1s; fi +} + +function main() { + + stop_docker + + build_docker_images + start_service + + validate_microservice + + stop_docker + echo y | docker system prune + +} + +main diff --git a/tests/test_lvms_video-llama.sh b/tests/test_lvms_video-llama.sh index 1e94982fb3..a9dcbf3a7f 100755 --- a/tests/test_lvms_video-llama.sh +++ b/tests/test_lvms_video-llama.sh @@ -62,7 +62,7 @@ function start_service() { } function validate_microservice() { - result=$(http_proxy="" curl http://localhost:5031/v1/lvm -X POST -d '{"video_url":"./data/silence_girl.mp4","chunk_start": 0,"chunk_duration": 7,"prompt":"What is the person doing?","max_new_tokens": 50}' -H 'Content-Type: application/json') + result=$(http_proxy="" curl http://localhost:5031/v1/lvm -X POST -d '{"video_url":"silence_girl.mp4","chunk_start": 0,"chunk_duration": 7,"prompt":"What is the person doing?","max_new_tokens": 50}' -H 'Content-Type: application/json') if [[ $result == *"silence"* ]]; then echo "Result correct." else diff --git a/tests/test_multimodal_embeddings_langchain_cpu.sh b/tests/test_multimodal_embeddings_langchain_cpu.sh new file mode 100644 index 0000000000..77a7b6d993 --- /dev/null +++ b/tests/test_multimodal_embeddings_langchain_cpu.sh @@ -0,0 +1,111 @@ +#!/bin/bash +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +set -x + +WORKPATH=$(dirname "$PWD") +ip_address=$(hostname -I | awk '{print $1}') +export your_mmei_port=8089 +export EMBEDDER_PORT=$your_mmei_port +export MMEI_EMBEDDING_ENDPOINT="http://$ip_address:$your_mmei_port/v1/encode" +export your_embedding_port_microservice=6609 +export MM_EMBEDDING_PORT_MICROSERVICE=$your_embedding_port_microservice +unset http_proxy + +function build_mmei_docker_images() { + cd $WORKPATH + echo $(pwd) + docker build --no-cache -t opea/bridgetower-embedder:latest --build-arg EMBEDDER_PORT=$EMBEDDER_PORT --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/embeddings/multimodal_embeddings/bridgetower/docker/Dockerfile . + + if [ $? 
-ne 0 ]; then + echo "opea/bridgetower-embedder built fail" + exit 1 + else + echo "opea/bridgetower-embedder built successful" + fi +} + +function build_embedding_service_images() { + cd $WORKPATH + echo $(pwd) + docker build --no-cache -t opea/embedding-multimodal:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/embeddings/multimodal_embeddings/multimodal_langchain/docker/Dockerfile . + + if [ $? -ne 0 ]; then + echo "opea/embedding-multimodal built fail" + exit 1 + else + echo "opea/embedding-multimodal built successful" + fi +} + +function build_docker_images() { + build_mmei_docker_images + build_embedding_service_images +} + +function start_service() { + cd $WORKPATH + cd comps/embeddings/multimodal_embeddings/bridgetower/docker/ + docker compose -f docker_compose_bridgetower_embedding_endpoint.yaml up -d + cd $WORKPATH + cd comps/embeddings/multimodal_embeddings/multimodal_langchain/docker/ + docker compose -f docker_compose_multimodal_embedding.yaml up -d + sleep 2m +} +function validate_microservice_text_embedding() { + result=$(http_proxy="" curl http://${ip_address}:$MM_EMBEDDING_PORT_MICROSERVICE/v1/embeddings \ + -X POST \ + -H "Content-Type: application/json" \ + -d '{"text" : "This is some sample text."}') + + if [[ $result == *"embedding"* ]]; then + echo "Result correct." + else + echo "Result wrong. Received was $result" + docker logs bridgetower-embedding-server + docker logs embedding-multimodal-server + exit 1 + fi +} + +function validate_microservice_image_text_pair_embedding() { + result=$(http_proxy="" curl http://${ip_address}:$MM_EMBEDDING_PORT_MICROSERVICE/v1/embeddings \ + -X POST \ + -H "Content-Type: application/json" \ + -d '{"text": {"text" : "This is some sample text."}, "image" : {"url": "https://github.com/docarray/docarray/blob/main/tests/toydata/image-data/apple.png?raw=true"}}') + + if [[ $result == *"embedding"* ]]; then + echo "Result correct." + else + echo "Result wrong. Received was $result" + docker logs bridgetower-embedding-server + docker logs embedding-multimodal-server + exit 1 + fi +} + +function validate_microservice() { + validate_microservice_text_embedding + validate_microservice_image_text_pair_embedding +} + +function stop_docker() { + cid=$(docker ps -aq --filter "name=bridgetower-embedding-server" --filter "name=embedding-multimodal-server") + if [[ ! 
-z "$cid" ]]; then docker stop $cid && docker rm $cid && sleep 1s; fi +} + +function main() { + + stop_docker + + build_docker_images + start_service + + validate_microservice + + stop_docker + echo y | docker system prune +} + +main diff --git a/tests/test_multimodal_embeddings_langchain_hpu.sh b/tests/test_multimodal_embeddings_langchain_hpu.sh new file mode 100644 index 0000000000..50c789c7d2 --- /dev/null +++ b/tests/test_multimodal_embeddings_langchain_hpu.sh @@ -0,0 +1,111 @@ +#!/bin/bash +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +set -x + +WORKPATH=$(dirname "$PWD") +ip_address=$(hostname -I | awk '{print $1}') +export your_mmei_port=8089 +export EMBEDDER_PORT=$your_mmei_port +export MMEI_EMBEDDING_ENDPOINT="http://$ip_address:$your_mmei_port/v1/encode" +export your_embedding_port_microservice=6609 +export MM_EMBEDDING_PORT_MICROSERVICE=$your_embedding_port_microservice +unset http_proxy + +function build_mmei_docker_images() { + cd $WORKPATH + echo $(pwd) + docker build --no-cache -t opea/bridgetower-embedder:latest --build-arg EMBEDDER_PORT=$EMBEDDER_PORT --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/embeddings/multimodal_embeddings/bridgetower/docker/Dockerfile_hpu . + + if [ $? -ne 0 ]; then + echo "opea/bridgetower-embedder built fail" + exit 1 + else + echo "opea/bridgetower-embedder built successful" + fi +} + +function build_embedding_service_images() { + cd $WORKPATH + echo $(pwd) + docker build --no-cache -t opea/embedding-multimodal:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/embeddings/multimodal_embeddings/multimodal_langchain/docker/Dockerfile . + + if [ $? -ne 0 ]; then + echo "opea/embedding-multimodal built fail" + exit 1 + else + echo "opea/embedding-multimodal built successful" + fi +} + +function build_docker_images() { + build_mmei_docker_images + build_embedding_service_images +} + +function start_service() { + cd $WORKPATH + cd comps/embeddings/multimodal_embeddings/bridgetower/docker/ + docker compose -f docker_compose_bridgetower_embedding_endpoint.yaml up -d + cd $WORKPATH + cd comps/embeddings/multimodal_embeddings/multimodal_langchain/docker/ + docker compose -f docker_compose_multimodal_embedding.yaml up -d + sleep 2m +} +function validate_microservice_text_embedding() { + result=$(http_proxy="" curl http://${ip_address}:$MM_EMBEDDING_PORT_MICROSERVICE/v1/embeddings \ + -X POST \ + -H "Content-Type: application/json" \ + -d '{"text" : "This is some sample text."}') + + if [[ $result == *"embedding"* ]]; then + echo "Result correct." + else + echo "Result wrong. Received was $result" + docker logs bridgetower-embedding-server + docker logs embedding-multimodal-server + exit 1 + fi +} + +function validate_microservice_image_text_pair_embedding() { + result=$(http_proxy="" curl http://${ip_address}:$MM_EMBEDDING_PORT_MICROSERVICE/v1/embeddings \ + -X POST \ + -H "Content-Type: application/json" \ + -d '{"text": {"text" : "This is some sample text."}, "image" : {"url": "https://github.com/docarray/docarray/blob/main/tests/toydata/image-data/apple.png?raw=true"}}') + + if [[ $result == *"embedding"* ]]; then + echo "Result correct." + else + echo "Result wrong. 
Received was $result" + docker logs bridgetower-embedding-server + docker logs embedding-multimodal-server + exit 1 + fi +} + +function validate_microservice() { + validate_microservice_text_embedding + validate_microservice_image_text_pair_embedding +} + +function stop_docker() { + cid=$(docker ps -aq --filter "name=bridgetower-embedding-server" --filter "name=embedding-multimodal-server") + if [[ ! -z "$cid" ]]; then docker stop $cid && docker rm $cid && sleep 1s; fi +} + +function main() { + + stop_docker + + build_docker_images + start_service + + validate_microservice + + stop_docker + echo y | docker system prune +} + +main diff --git a/tests/test_nginx.sh b/tests/test_nginx.sh index 3675a57537..626c6974a1 100644 --- a/tests/test_nginx.sh +++ b/tests/test_nginx.sh @@ -9,8 +9,8 @@ LOG_PATH="$WORKPATH/tests" ip_address=$(hostname -I | awk '{print $1}') function build_docker_images() { - cd $WORKPATH/comps/nginx/docker - docker build --no-cache --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -t opea/nginx:comps -f ./Dockerfile . + cd $WORKPATH + docker build --no-cache --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -t opea/nginx:comps -f comps/nginx/docker/Dockerfile . if [ $? -ne 0 ]; then echo "opea/nginx built fail" exit 1 diff --git a/tests/test_reranks_video-rag-qna.sh b/tests/test_reranks_video-rag-qna.sh new file mode 100755 index 0000000000..cf4d0c5c8a --- /dev/null +++ b/tests/test_reranks_video-rag-qna.sh @@ -0,0 +1,78 @@ +#!/bin/bash +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +set -xe + +WORKPATH=$(dirname "$PWD") +ip_address=$(hostname -I | awk '{print $1}') + +function build_docker_images() { + cd $WORKPATH + docker build --no-cache -t opea/reranking-videoragqna:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/reranks/video-rag-qna/docker/Dockerfile . +} + +function start_service() { + docker run -d --name "test-comps-reranking-videoragqna-server" \ + -p 5037:8000 \ + --ipc=host \ + -e no_proxy=${no_proxy} \ + -e http_proxy=${http_proxy} \ + -e https_proxy=${https_proxy} \ + -e CHUNK_DURATION=${CHUNK_DURATION} \ + -e FILE_SERVER_ENDPOINT=${FILE_SERVER_ENDPOINT} \ + opea/reranking-videoragqna:latest + + + until docker logs test-comps-reranking-videoragqna-server 2>&1 | grep -q "Uvicorn running on"; do + sleep 2 + done +} + +function validate_microservice() { + result=$(\ + http_proxy="" \ + curl -X 'POST' \ + "http://${ip_address}:5037/v1/reranking" \ + -H 'accept: application/json' \ + -H 'Content-Type: application/json' \ + -d '{ + "retrieved_docs": [ + {"doc": [{"text": "this is the retrieved text"}]} + ], + "initial_query": "this is the query", + "top_n": 1, + "metadata": [ + {"other_key": "value", "video":"top_video_name", "timestamp":"20"}, + {"other_key": "value", "video":"second_video_name", "timestamp":"40"}, + {"other_key": "value", "video":"top_video_name", "timestamp":"20"} + ] + }') + if [[ $result == *"this is the query"* ]]; then + echo "Result correct." + else + echo "Result wrong." + exit 1 + fi +} + +function stop_docker() { + cid=$(docker ps -aq --filter "name=test-comps-reranking*") + if [[ ! 
-z "$cid" ]]; then docker stop $cid && docker rm $cid && sleep 1s; fi +} + +function main() { + + stop_docker + + build_docker_images + start_service + + validate_microservice + + stop_docker + echo y | docker system prune + +} + +main diff --git a/tests/test_retrievers_langchain_pathway.sh b/tests/test_retrievers_langchain_pathway.sh new file mode 100644 index 0000000000..4db471b569 --- /dev/null +++ b/tests/test_retrievers_langchain_pathway.sh @@ -0,0 +1,103 @@ +#!/bin/bash +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +set -x + +WORKPATH=$(dirname "$PWD") +LOG_PATH="$WORKPATH/tests" +ip_address=$(hostname -I | awk '{print $1}') +function build_docker_images() { + cd $WORKPATH + + cd comps/vectorstores/langchain/pathway + + docker build --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -t opea/vectorstore-pathway:comps . + + cd $WORKPATH + + docker build --no-cache -t opea/retriever-pathway:comps --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/retrievers/langchain/pathway/docker/Dockerfile . + if [ $? -ne 0 ]; then + echo "opea/retriever-pathway built fail" + exit 1 + else + echo "opea/retriever-pathway built successful" + fi +} + +function start_service() { + cd $WORKPATH + + # tei endpoint + tei_endpoint=5008 + model="BAAI/bge-base-en-v1.5" + docker run -d --name="test-comps-retriever-tei-endpoint" -e http_proxy=$http_proxy -e https_proxy=$https_proxy -p $tei_endpoint:80 -v ./data:/data --pull always ghcr.io/huggingface/text-embeddings-inference:cpu-1.2 --model-id $model + + sleep 30s + export TEI_EMBEDDING_ENDPOINT="http://${ip_address}:${tei_endpoint}" + + result=$(http_proxy='' + curl $TEI_EMBEDDING_ENDPOINT -X POST -d '{"inputs":"Hey,"}' -H 'Content-Type: application/json') + + echo "embed_result:" + echo $result + + sleep 30s + + # pathway + export PATHWAY_HOST="0.0.0.0" + export PATHWAY_PORT=5432 + + docker run -d --name="test-comps-vectorstore-pathway" -e PATHWAY_HOST=${PATHWAY_HOST} -e PATHWAY_PORT=${PATHWAY_PORT} -e TEI_EMBEDDING_ENDPOINT=${TEI_EMBEDDING_ENDPOINT} -e http_proxy=$http_proxy -e https_proxy=$https_proxy -v $WORKPATH/comps/vectorstores/langchain/pathway/README.md:/app/data/README.md -p ${PATHWAY_PORT}:${PATHWAY_PORT} --network="host" opea/vectorstore-pathway:comps + + sleep 45s + + export PATHWAY_HOST=$ip_address # needed in order to reach to vector store + + docker run -d --name="test-comps-retriever-pathway" -p 5009:7000 -e PATHWAY_HOST=${PATHWAY_HOST} -e PATHWAY_PORT=${PATHWAY_PORT} -e http_proxy=$http_proxy -e https_proxy=$https_proxy opea/retriever-pathway:comps + + sleep 10s +} + +function validate_microservice() { + retriever_port=5009 + export PATH="${HOME}/miniforge3/bin:$PATH" + + test_embedding=$(python -c "import random; embedding = [random.uniform(-1, 1) for _ in range(768)]; print(embedding)") + + result=$(http_proxy='' + curl http://${ip_address}:$retriever_port/v1/retrieval \ + -X POST \ + -d "{\"text\":\"test\",\"embedding\":${test_embedding}}" \ + -H 'Content-Type: application/json') + if [[ $result == *"retrieved_docs"* ]]; then + echo "Result correct." + else + echo "Result wrong. Received was $result" + docker logs test-comps-vectorstore-pathway >> ${LOG_PATH}/vectorstore-pathway.log + docker logs test-comps-retriever-tei-endpoint >> ${LOG_PATH}/tei-endpoint.log + docker logs test-comps-retriever-pathway >> ${LOG_PATH}/retriever-pathway.log + exit 1 + fi +} + +function stop_docker() { + cid=$(docker ps -aq --filter "name=test-comps*") + if [[ ! 
-z "$cid" ]]; then docker stop $cid && docker rm $cid && sleep 1s; fi +} + +function main() { + + stop_docker + + build_docker_images + start_service + + validate_microservice + + stop_docker + echo y | docker system prune + +} + +main diff --git a/tests/test_vectorstores_langchain_pathway.sh b/tests/test_vectorstores_langchain_pathway.sh new file mode 100644 index 0000000000..0b1eab5adf --- /dev/null +++ b/tests/test_vectorstores_langchain_pathway.sh @@ -0,0 +1,96 @@ +#!/bin/bash +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +set -x + +WORKPATH=$(dirname "$PWD") +LOG_PATH="$WORKPATH/tests" +ip_address=$(hostname -I | awk '{print $1}') +function build_docker_images() { + cd $WORKPATH + + cd comps/vectorstores/langchain/pathway + + docker build --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -t opea/vectorstore-pathway:comps . + + cd $WORKPATH + + if [ $? -ne 0 ]; then + echo "opea/retriever-pathway built fail" + exit 1 + else + echo "opea/retriever-pathway built successful" + fi +} + +function start_service() { + cd $WORKPATH + + # tei endpoint + tei_endpoint=5008 + model="BAAI/bge-base-en-v1.5" + docker run -d --name="test-comps-retriever-tei-endpoint" -e http_proxy=$http_proxy -e https_proxy=$https_proxy -p $tei_endpoint:80 -v ./data:/data --pull always ghcr.io/huggingface/text-embeddings-inference:cpu-1.2 --model-id $model + + sleep 30s + export TEI_EMBEDDING_ENDPOINT="http://${ip_address}:${tei_endpoint}" + + result=$(http_proxy='' + curl $TEI_EMBEDDING_ENDPOINT -X POST -d '{"inputs":"Hey,"}' -H 'Content-Type: application/json') + + echo "embed_result:" + echo $result + + sleep 30s + + # pathway + export PATHWAY_HOST="0.0.0.0" + export PATHWAY_PORT=5432 + + docker run -d --name="test-comps-vectorstore-pathway" -e PATHWAY_HOST=${PATHWAY_HOST} -e PATHWAY_PORT=${PATHWAY_PORT} -e TEI_EMBEDDING_ENDPOINT=${TEI_EMBEDDING_ENDPOINT} -e http_proxy=$http_proxy -e https_proxy=$https_proxy -v $WORKPATH/comps/vectorstores/langchain/pathway/README.md:/app/data/README.md -p ${PATHWAY_PORT}:${PATHWAY_PORT} --network="host" opea/vectorstore-pathway:comps + + sleep 45s + + export PATHWAY_HOST=$ip_address # needed in order to reach to vector store + + sleep 10s +} + +function validate_microservice() { + export PATH="${HOME}/miniforge3/bin:$PATH" + + result=$(http_proxy='' + curl http://${PATHWAY_HOST}:$PATHWAY_PORT/v1/retrieve \ + -X POST \ + -d "{\"query\":\"test\",\"k\":3}" \ + -H 'Content-Type: application/json') + if [[ $result == *"Pathway"* ]]; then + echo "Result correct." + else + echo "Result wrong. Received was $result" + docker logs test-comps-vectorstore-pathway >> ${LOG_PATH}/vectorstore-pathway.log + docker logs test-comps-retriever-tei-endpoint >> ${LOG_PATH}/tei-endpoint.log + exit 1 + fi +} + +function stop_docker() { + cid=$(docker ps -aq --filter "name=test-comps*") + if [[ ! -z "$cid" ]]; then docker stop $cid && docker rm $cid && sleep 1s; fi +} + +function main() { + + stop_docker + + build_docker_images + start_service + + validate_microservice + + stop_docker + echo y | docker system prune + +} + +main