diff --git a/AgentQnA/docker_compose/amd/gpu/rocm/compose.yaml b/AgentQnA/docker_compose/amd/gpu/rocm/compose.yaml
index 49c88328d..965a61bd4 100644
--- a/AgentQnA/docker_compose/amd/gpu/rocm/compose.yaml
+++ b/AgentQnA/docker_compose/amd/gpu/rocm/compose.yaml
@@ -49,7 +49,7 @@ services:
       model: ${LLM_MODEL_ID}
       temperature: ${temperature}
       max_new_tokens: ${max_new_tokens}
-      streaming: false
+      stream: false
       tools: /home/user/tools/worker_agent_tools.yaml
       require_human_feedback: false
       RETRIEVAL_TOOL_URL: ${RETRIEVAL_TOOL_URL}
@@ -83,7 +83,7 @@ services:
       model: ${LLM_MODEL_ID}
       temperature: ${temperature}
       max_new_tokens: ${max_new_tokens}
-      streaming: false
+      stream: false
       tools: /home/user/tools/supervisor_agent_tools.yaml
       require_human_feedback: false
       no_proxy: ${no_proxy}
diff --git a/AgentQnA/docker_compose/intel/cpu/xeon/compose_openai.yaml b/AgentQnA/docker_compose/intel/cpu/xeon/compose_openai.yaml
index 837f2a087..a9032cce3 100644
--- a/AgentQnA/docker_compose/intel/cpu/xeon/compose_openai.yaml
+++ b/AgentQnA/docker_compose/intel/cpu/xeon/compose_openai.yaml
@@ -19,7 +19,7 @@ services:
       model: ${model}
       temperature: ${temperature}
       max_new_tokens: ${max_new_tokens}
-      streaming: false
+      stream: false
       tools: /home/user/tools/worker_agent_tools.yaml
       require_human_feedback: false
       RETRIEVAL_TOOL_URL: ${RETRIEVAL_TOOL_URL}
@@ -51,7 +51,7 @@ services:
       model: ${model}
       temperature: ${temperature}
       max_new_tokens: ${max_new_tokens}
-      streaming: false
+      stream: false
       tools: /home/user/tools/supervisor_agent_tools.yaml
       require_human_feedback: false
       no_proxy: ${no_proxy}
diff --git a/AgentQnA/docker_compose/intel/hpu/gaudi/compose.yaml b/AgentQnA/docker_compose/intel/hpu/gaudi/compose.yaml
index 6a9d0b465..02b30c07a 100644
--- a/AgentQnA/docker_compose/intel/hpu/gaudi/compose.yaml
+++ b/AgentQnA/docker_compose/intel/hpu/gaudi/compose.yaml
@@ -21,7 +21,7 @@ services:
       model: ${LLM_MODEL_ID}
       temperature: ${temperature}
       max_new_tokens: ${max_new_tokens}
-      streaming: false
+      stream: false
       tools: /home/user/tools/worker_agent_tools.yaml
       require_human_feedback: false
       RETRIEVAL_TOOL_URL: ${RETRIEVAL_TOOL_URL}
@@ -55,7 +55,7 @@ services:
       model: ${LLM_MODEL_ID}
       temperature: ${temperature}
       max_new_tokens: ${max_new_tokens}
-      streaming: false
+      stream: false
       tools: /home/user/tools/supervisor_agent_tools.yaml
       require_human_feedback: false
       no_proxy: ${no_proxy}
diff --git a/AgentQnA/tests/step2_start_retrieval_tool.sh b/AgentQnA/tests/step2_start_retrieval_tool.sh
index df080e40d..91fb1ea0a 100644
--- a/AgentQnA/tests/step2_start_retrieval_tool.sh
+++ b/AgentQnA/tests/step2_start_retrieval_tool.sh
@@ -7,6 +7,7 @@ WORKPATH=$(dirname "$PWD")
 export WORKDIR=$WORKPATH/../../
 echo "WORKDIR=${WORKDIR}"
 export ip_address=$(hostname -I | awk '{print $1}')
+export host_ip=${ip_address}
 export HF_CACHE_DIR=$WORKDIR/hf_cache
 
 if [ ! -d "$HF_CACHE_DIR" ]; then
diff --git a/AgentQnA/tests/test_compose_on_gaudi.sh b/AgentQnA/tests/test_compose_on_gaudi.sh
index 5f7e899dc..0720a9b2b 100644
--- a/AgentQnA/tests/test_compose_on_gaudi.sh
+++ b/AgentQnA/tests/test_compose_on_gaudi.sh
@@ -2,7 +2,7 @@
 # Copyright (C) 2024 Intel Corporation
 # SPDX-License-Identifier: Apache-2.0
 
-set -e
+set -xe
 
 WORKPATH=$(dirname "$PWD")
 export WORKDIR=$WORKPATH/../../
@@ -82,4 +82,4 @@ echo "=================== #5 Agent and API server stopped===================="
 
 echo y | docker system prune
 
-echo "ALL DONE!"
+echo "ALL DONE!!"
diff --git a/AgentQnA/tests/test_compose_on_rocm.sh b/AgentQnA/tests/test_compose_on_rocm.sh
index 204de7ead..1ff501396 100644
--- a/AgentQnA/tests/test_compose_on_rocm.sh
+++ b/AgentQnA/tests/test_compose_on_rocm.sh
@@ -2,7 +2,7 @@
 # Copyright (C) 2024 Advanced Micro Devices, Inc.
 # SPDX-License-Identifier: Apache-2.0
 
-set -e
+set -xe
 
 WORKPATH=$(dirname "$PWD")
 export WORKDIR=$WORKPATH/../../
@@ -72,4 +72,4 @@ echo "=================== #5 Agent and API server stopped===================="
 
 echo y | docker system prune
 
-echo "ALL DONE!"
+echo "ALL DONE!!"
diff --git a/AudioQnA/audioqna.py b/AudioQnA/audioqna.py
index efbd5ddc5..79abcccb9 100644
--- a/AudioQnA/audioqna.py
+++ b/AudioQnA/audioqna.py
@@ -26,7 +26,7 @@ def align_inputs(self, inputs, cur_node, runtime_graph, llm_parameters_dict, **k
         next_inputs["messages"] = [{"role": "user", "content": inputs["asr_result"]}]
         next_inputs["max_tokens"] = llm_parameters_dict["max_tokens"]
         next_inputs["top_p"] = llm_parameters_dict["top_p"]
-        next_inputs["stream"] = inputs["streaming"]  # False as default
+        next_inputs["stream"] = inputs["stream"]  # False as default
         next_inputs["frequency_penalty"] = inputs["frequency_penalty"]
         # next_inputs["presence_penalty"] = inputs["presence_penalty"]
         # next_inputs["repetition_penalty"] = inputs["repetition_penalty"]
@@ -91,7 +91,7 @@ async def handle_request(self, request: Request):
             frequency_penalty=chat_request.frequency_penalty if chat_request.frequency_penalty else 0.0,
             presence_penalty=chat_request.presence_penalty if chat_request.presence_penalty else 0.0,
             repetition_penalty=chat_request.repetition_penalty if chat_request.repetition_penalty else 1.03,
-            streaming=False,  # TODO add streaming LLM output as input to TTS
+            stream=False,  # TODO add stream LLM output as input to TTS
         )
         result_dict, runtime_graph = await self.megaservice.schedule(
             initial_inputs={"audio": chat_request.audio},
diff --git a/AudioQnA/audioqna_multilang.py b/AudioQnA/audioqna_multilang.py
index f7e51c0a7..66c2ad1a3 100644
--- a/AudioQnA/audioqna_multilang.py
+++ b/AudioQnA/audioqna_multilang.py
@@ -28,7 +28,7 @@ def align_inputs(self, inputs, cur_node, runtime_graph, llm_parameters_dict, **k
         next_inputs["messages"] = [{"role": "user", "content": inputs["asr_result"]}]
         next_inputs["max_tokens"] = llm_parameters_dict["max_tokens"]
         next_inputs["top_p"] = llm_parameters_dict["top_p"]
-        next_inputs["stream"] = inputs["streaming"]  # False as default
+        next_inputs["stream"] = inputs["stream"]  # False as default
         next_inputs["frequency_penalty"] = inputs["frequency_penalty"]
         # next_inputs["presence_penalty"] = inputs["presence_penalty"]
         # next_inputs["repetition_penalty"] = inputs["repetition_penalty"]
@@ -103,7 +103,7 @@ async def handle_request(self, request: Request):
             frequency_penalty=chat_request.frequency_penalty if chat_request.frequency_penalty else 0.0,
             presence_penalty=chat_request.presence_penalty if chat_request.presence_penalty else 0.0,
             repetition_penalty=chat_request.repetition_penalty if chat_request.repetition_penalty else 1.03,
-            streaming=False,  # TODO add streaming LLM output as input to TTS
+            stream=False,  # TODO add stream LLM output as input to TTS
         )
         result_dict, runtime_graph = await self.megaservice.schedule(
             initial_inputs={"audio": chat_request.audio}, llm_parameters=parameters
diff --git a/AudioQnA/benchmark/performance/benchmark.yaml b/AudioQnA/benchmark/performance/benchmark.yaml
index 659a99a75..3b7c5e672 100644
--- a/AudioQnA/benchmark/performance/benchmark.yaml
+++ b/AudioQnA/benchmark/performance/benchmark.yaml
@@ -40,7 +40,7 @@ test_cases:
         top_k: 10
         top_p: 0.95
         repetition_penalty: 1.03
-        streaming: true
+        stream: true
     llmserve:
       run_test: true
       service_name: "llm-svc" # Replace with your service name
diff --git a/AudioQnA/docker_compose/amd/gpu/rocm/compose.yaml b/AudioQnA/docker_compose/amd/gpu/rocm/compose.yaml
index 44f320b92..4cef1598c 100644
--- a/AudioQnA/docker_compose/amd/gpu/rocm/compose.yaml
+++ b/AudioQnA/docker_compose/amd/gpu/rocm/compose.yaml
@@ -53,7 +53,7 @@ services:
     ipc: host
   audioqna-backend-server:
     image: ${REGISTRY:-opea}/audioqna:${TAG:-latest}
-    container_name: audioqna-xeon-backend-server
+    container_name: audioqna-rocm-backend-server
    depends_on:
      - whisper-service
      - tgi-service
diff --git a/AudioQnA/kubernetes/intel/README_gmc.md b/AudioQnA/kubernetes/intel/README_gmc.md
index 767fdf366..6c9e7394c 100644
--- a/AudioQnA/kubernetes/intel/README_gmc.md
+++ b/AudioQnA/kubernetes/intel/README_gmc.md
@@ -66,7 +66,7 @@ This involves deploying the AudioQnA custom resource. You can use audioQnA_xeon.
 ```sh
 export CLIENT_POD=$(kubectl get pod -n audioqa -l app=client-test -o jsonpath={.items..metadata.name})
 export accessUrl=$(kubectl get gmc -n audioqa -o jsonpath="{.items[?(@.metadata.name=='audioqa')].status.accessUrl}")
-kubectl exec "$CLIENT_POD" -n audioqa -- curl -s --no-buffer $accessUrl -X POST -d '{"byte_str": "UklGRigAAABXQVZFZm10IBIAAAABAAEARKwAAIhYAQACABAAAABkYXRhAgAAAAEA", "parameters":{"max_new_tokens":64, "do_sample": true, "streaming":false}}' -H 'Content-Type: application/json'
+kubectl exec "$CLIENT_POD" -n audioqa -- curl -s --no-buffer $accessUrl -X POST -d '{"byte_str": "UklGRigAAABXQVZFZm10IBIAAAABAAEARKwAAIhYAQACABAAAABkYXRhAgAAAAEA", "parameters":{"max_new_tokens":64, "do_sample": true, "stream":false}}' -H 'Content-Type: application/json'
 ```
 
 > [NOTE]
diff --git a/AudioQnA/tests/test_compose_on_gaudi.sh b/AudioQnA/tests/test_compose_on_gaudi.sh
index ab480d55c..5947e5106 100644
--- a/AudioQnA/tests/test_compose_on_gaudi.sh
+++ b/AudioQnA/tests/test_compose_on_gaudi.sh
@@ -44,6 +44,7 @@ function start_services() {
     # sed -i "s/backend_address/$ip_address/g" $WORKPATH/ui/svelte/.env
 
     # Start Docker Containers
+    sed -i "s|container_name: audioqna-gaudi-backend-server|container_name: audioqna-gaudi-backend-server\n volumes:\n - \"${WORKPATH}\/docker_image_build\/GenAIComps:\/home\/user\/GenAIComps\"|g" compose.yaml
     docker compose up -d > ${LOG_PATH}/start_services_with_compose.log
     n=0
     until [[ "$n" -ge 200 ]]; do
diff --git a/AudioQnA/tests/test_compose_on_rocm.sh b/AudioQnA/tests/test_compose_on_rocm.sh
index a2d2ef234..faae0b67f 100644
--- a/AudioQnA/tests/test_compose_on_rocm.sh
+++ b/AudioQnA/tests/test_compose_on_rocm.sh
@@ -46,6 +46,7 @@ function start_services() {
     # sed -i "s/backend_address/$ip_address/g" $WORKPATH/ui/svelte/.env
 
     # Start Docker Containers
+    sed -i "s|container_name: audioqna-rocm-backend-server|container_name: audioqna-rocm-backend-server\n volumes:\n - \"${WORKPATH}\/docker_image_build\/GenAIComps:\/home\/user\/GenAIComps\"|g" compose.yaml
     docker compose up -d > ${LOG_PATH}/start_services_with_compose.log
     n=0
     until [[ "$n" -ge 200 ]]; do
@@ -63,7 +64,7 @@ function validate_megaservice() {
     docker logs whisper-service > $LOG_PATH/whisper-service.log
     docker logs speecht5-service > $LOG_PATH/tts-service.log
     docker logs tgi-service > $LOG_PATH/tgi-service.log
-    docker logs audioqna-xeon-backend-server > $LOG_PATH/audioqna-xeon-backend-server.log
+    docker logs audioqna-rocm-backend-server > $LOG_PATH/audioqna-rocm-backend-server.log
 
     echo "$response" | sed 's/^"//;s/"$//' | base64 -d > speech.mp3
     if [[ $(file speech.mp3) == *"RIFF"* ]]; then
diff --git a/AudioQnA/tests/test_compose_on_xeon.sh b/AudioQnA/tests/test_compose_on_xeon.sh
index 04ed04e06..9f0fd5ae5 100644
--- a/AudioQnA/tests/test_compose_on_xeon.sh
+++ b/AudioQnA/tests/test_compose_on_xeon.sh
@@ -45,6 +45,7 @@ function start_services() {
     # sed -i "s/backend_address/$ip_address/g" $WORKPATH/ui/svelte/.env
 
     # Start Docker Containers
+    sed -i "s|container_name: audioqna-xeon-backend-server|container_name: audioqna-xeon-backend-server\n volumes:\n - \"${WORKPATH}\/docker_image_build\/GenAIComps:\/home\/user\/GenAIComps\"|g" compose.yaml
     docker compose up -d > ${LOG_PATH}/start_services_with_compose.log
     n=0
     until [[ "$n" -ge 200 ]]; do
diff --git a/AudioQnA/tests/test_gmc_on_gaudi.sh b/AudioQnA/tests/test_gmc_on_gaudi.sh
index d90bd3624..aaf915a46 100755
--- a/AudioQnA/tests/test_gmc_on_gaudi.sh
+++ b/AudioQnA/tests/test_gmc_on_gaudi.sh
@@ -34,7 +34,7 @@ function validate_audioqa() {
     export CLIENT_POD=$(kubectl get pod -n $APP_NAMESPACE -l app=client-test -o jsonpath={.items..metadata.name})
     echo "$CLIENT_POD"
     accessUrl=$(kubectl get gmc -n $APP_NAMESPACE -o jsonpath="{.items[?(@.metadata.name=='audioqa')].status.accessUrl}")
-    byte_str=$(kubectl exec "$CLIENT_POD" -n $APP_NAMESPACE -- curl $accessUrl -s -X POST -d '{"byte_str": "UklGRigAAABXQVZFZm10IBIAAAABAAEARKwAAIhYAQACABAAAABkYXRhAgAAAAEA", "parameters":{"max_tokens":64, "do_sample": true, "streaming":false}}' -H 'Content-Type: application/json' | jq .byte_str)
+    byte_str=$(kubectl exec "$CLIENT_POD" -n $APP_NAMESPACE -- curl $accessUrl -s -X POST -d '{"byte_str": "UklGRigAAABXQVZFZm10IBIAAAABAAEARKwAAIhYAQACABAAAABkYXRhAgAAAAEA", "parameters":{"max_tokens":64, "do_sample": true, "stream":false}}' -H 'Content-Type: application/json' | jq .byte_str)
     echo "$byte_str" > $LOG_PATH/curl_audioqa.log
     if [ -z "$byte_str" ]; then
         echo "audioqa failed, please check the logs in ${LOG_PATH}!"
diff --git a/AudioQnA/tests/test_gmc_on_xeon.sh b/AudioQnA/tests/test_gmc_on_xeon.sh
index 15e04e62c..ae09d99a9 100755
--- a/AudioQnA/tests/test_gmc_on_xeon.sh
+++ b/AudioQnA/tests/test_gmc_on_xeon.sh
@@ -34,7 +34,7 @@ function validate_audioqa() {
     export CLIENT_POD=$(kubectl get pod -n $APP_NAMESPACE -l app=client-test -o jsonpath={.items..metadata.name})
     echo "$CLIENT_POD"
     accessUrl=$(kubectl get gmc -n $APP_NAMESPACE -o jsonpath="{.items[?(@.metadata.name=='audioqa')].status.accessUrl}")
-    byte_str=$(kubectl exec "$CLIENT_POD" -n $APP_NAMESPACE -- curl $accessUrl -s -X POST -d '{"byte_str": "UklGRigAAABXQVZFZm10IBIAAAABAAEARKwAAIhYAQACABAAAABkYXRhAgAAAAEA", "parameters":{"max_tokens":64, "do_sample": true, "streaming":false}}' -H 'Content-Type: application/json' | jq .byte_str)
+    byte_str=$(kubectl exec "$CLIENT_POD" -n $APP_NAMESPACE -- curl $accessUrl -s -X POST -d '{"byte_str": "UklGRigAAABXQVZFZm10IBIAAAABAAEARKwAAIhYAQACABAAAABkYXRhAgAAAAEA", "parameters":{"max_tokens":64, "do_sample": true, "stream":false}}' -H 'Content-Type: application/json' | jq .byte_str)
     echo "$byte_str" > $LOG_PATH/curl_audioqa.log
     if [ -z "$byte_str" ]; then
         echo "audioqa failed, please check the logs in ${LOG_PATH}!"
diff --git a/AvatarChatbot/avatarchatbot.py b/AvatarChatbot/avatarchatbot.py
index 46c64e809..d9ee0fd6c 100644
--- a/AvatarChatbot/avatarchatbot.py
+++ b/AvatarChatbot/avatarchatbot.py
@@ -29,7 +29,7 @@ def align_inputs(self, inputs, cur_node, runtime_graph, llm_parameters_dict, **k
         next_inputs["messages"] = [{"role": "user", "content": inputs["asr_result"]}]
         next_inputs["max_tokens"] = llm_parameters_dict["max_tokens"]
         next_inputs["top_p"] = llm_parameters_dict["top_p"]
-        next_inputs["stream"] = inputs["streaming"]  # False as default
+        next_inputs["stream"] = inputs["stream"]  # False as default
         next_inputs["frequency_penalty"] = inputs["frequency_penalty"]
         # next_inputs["presence_penalty"] = inputs["presence_penalty"]
         # next_inputs["repetition_penalty"] = inputs["repetition_penalty"]
@@ -112,7 +112,7 @@ async def handle_request(self, request: Request):
             top_p=chat_request.top_p if chat_request.top_p else 0.95,
             temperature=chat_request.temperature if chat_request.temperature else 0.01,
             repetition_penalty=chat_request.presence_penalty if chat_request.presence_penalty else 1.03,
-            streaming=False,  # TODO add streaming LLM output as input to TTS
+            stream=False,  # TODO add stream LLM output as input to TTS
         )
 
         # print(parameters)
diff --git a/AvatarChatbot/tests/test_compose_on_gaudi.sh b/AvatarChatbot/tests/test_compose_on_gaudi.sh
index f73c854c9..032cd3d11 100755
--- a/AvatarChatbot/tests/test_compose_on_gaudi.sh
+++ b/AvatarChatbot/tests/test_compose_on_gaudi.sh
@@ -71,6 +71,7 @@ function start_services() {
     export FPS=10
 
     # Start Docker Containers
+    sed -i "s|container_name: avatarchatbot-gaudi-backend-server|container_name: avatarchatbot-gaudi-backend-server\n volumes:\n - \"${WORKPATH}\/docker_image_build\/GenAIComps:\/home\/user\/GenAIComps\"|g" compose.yaml
     docker compose up -d > ${LOG_PATH}/start_services_with_compose.log
     n=0
     until [[ "$n" -ge 200 ]]; do
diff --git a/AvatarChatbot/tests/test_compose_on_xeon.sh b/AvatarChatbot/tests/test_compose_on_xeon.sh
index 7f9e5533f..5700c523c 100755
--- a/AvatarChatbot/tests/test_compose_on_xeon.sh
+++ b/AvatarChatbot/tests/test_compose_on_xeon.sh
@@ -71,6 +71,7 @@ function start_services() {
     export FPS=10
 
     # Start Docker Containers
+    sed -i "s|container_name: avatarchatbot-xeon-backend-server|container_name: avatarchatbot-xeon-backend-server\n volumes:\n - \"${WORKPATH}\/docker_image_build\/GenAIComps:\/home\/user\/GenAIComps\"|g" compose.yaml
     docker compose up -d
     n=0
     until [[ "$n" -ge 100 ]]; do
diff --git a/ChatQnA/benchmark/performance/kubernetes/intel/gaudi/benchmark.yaml b/ChatQnA/benchmark/performance/kubernetes/intel/gaudi/benchmark.yaml
index f1c35e67f..07945c220 100644
--- a/ChatQnA/benchmark/performance/kubernetes/intel/gaudi/benchmark.yaml
+++ b/ChatQnA/benchmark/performance/kubernetes/intel/gaudi/benchmark.yaml
@@ -58,7 +58,7 @@ test_cases:
         top_k: 10
         top_p: 0.95
         repetition_penalty: 1.03
-        streaming: true
+        stream: true
     llmserve:
       run_test: false
       service_name: "chatqna-tgi" # Replace with your service name
diff --git a/ChatQnA/chatqna.py b/ChatQnA/chatqna.py
index e9ace6088..30e154c9e 100644
--- a/ChatQnA/chatqna.py
+++ b/ChatQnA/chatqna.py
@@ -76,7 +76,7 @@ def align_inputs(self, inputs, cur_node, runtime_graph, llm_parameters_dict, **k
         next_inputs["messages"] = [{"role": "user", "content": inputs["inputs"]}]
         next_inputs["max_tokens"] = llm_parameters_dict["max_tokens"]
         next_inputs["top_p"] = llm_parameters_dict["top_p"]
-        next_inputs["stream"] = inputs["streaming"]
+        next_inputs["stream"] = inputs["stream"]
         next_inputs["frequency_penalty"] = inputs["frequency_penalty"]
         # next_inputs["presence_penalty"] = inputs["presence_penalty"]
         # next_inputs["repetition_penalty"] = inputs["repetition_penalty"]
@@ -158,7 +158,7 @@ def align_outputs(self, data, cur_node, inputs, runtime_graph, llm_parameters_di
 
         next_data["inputs"] = prompt
 
-    elif self.services[cur_node].service_type == ServiceType.LLM and not llm_parameters_dict["streaming"]:
+    elif self.services[cur_node].service_type == ServiceType.LLM and not llm_parameters_dict["stream"]:
         next_data["text"] = data["choices"][0]["message"]["content"]
     else:
         next_data = data
@@ -342,7 +342,7 @@ async def handle_request(self, request: Request):
             frequency_penalty=chat_request.frequency_penalty if chat_request.frequency_penalty else 0.0,
             presence_penalty=chat_request.presence_penalty if chat_request.presence_penalty else 0.0,
             repetition_penalty=chat_request.repetition_penalty if chat_request.repetition_penalty else 1.03,
-            streaming=stream_opt,
+            stream=stream_opt,
             chat_template=chat_request.chat_template if chat_request.chat_template else None,
         )
         retriever_parameters = RetrieverParms(
diff --git a/ChatQnA/chatqna_wrapper.py b/ChatQnA/chatqna_wrapper.py
index 5e4f13b47..971d9edb4 100644
--- a/ChatQnA/chatqna_wrapper.py
+++ b/ChatQnA/chatqna_wrapper.py
@@ -86,7 +86,7 @@ async def handle_request(self, request: Request):
             frequency_penalty=chat_request.frequency_penalty if chat_request.frequency_penalty else 0.0,
             presence_penalty=chat_request.presence_penalty if chat_request.presence_penalty else 0.0,
             repetition_penalty=chat_request.repetition_penalty if chat_request.repetition_penalty else 1.03,
-            streaming=stream_opt,
+            stream=stream_opt,
             chat_template=chat_request.chat_template if chat_request.chat_template else None,
         )
         retriever_parameters = RetrieverParms(
diff --git a/ChatQnA/tests/test_compose_on_gaudi.sh b/ChatQnA/tests/test_compose_on_gaudi.sh
index a2392d59e..d81513ed3 100644
--- a/ChatQnA/tests/test_compose_on_gaudi.sh
+++ b/ChatQnA/tests/test_compose_on_gaudi.sh
@@ -38,6 +38,7 @@ function start_services() {
     export HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN}
 
     # Start Docker Containers
+    sed -i "s|container_name: chatqna-gaudi-backend-server|container_name: chatqna-gaudi-backend-server\n volumes:\n - \"${WORKPATH}\/docker_image_build\/GenAIComps:\/home\/user\/GenAIComps\"|g" compose.yaml
     docker compose -f compose.yaml up -d > ${LOG_PATH}/start_services_with_compose.log
 
     n=0
diff --git a/ChatQnA/tests/test_compose_on_rocm.sh b/ChatQnA/tests/test_compose_on_rocm.sh
index 9744731d2..c7d47ffb0 100644
--- a/ChatQnA/tests/test_compose_on_rocm.sh
+++ b/ChatQnA/tests/test_compose_on_rocm.sh
@@ -65,6 +65,7 @@ function start_services() {
     cd "$WORKPATH"/docker_compose/amd/gpu/rocm
 
     # Start Docker Containers
+    sed -i "s|container_name: chatqna-backend-server|container_name: chatqna-backend-server\n volumes:\n - \"${WORKPATH}\/docker_image_build\/GenAIComps:\/home\/user\/GenAIComps\"|g" compose.yaml
     docker compose -f compose.yaml up -d > "${LOG_PATH}"/start_services_with_compose.log
 
     n=0
diff --git a/ChatQnA/tests/test_compose_on_xeon.sh b/ChatQnA/tests/test_compose_on_xeon.sh
index 189816cac..5a22bbe70 100644
--- a/ChatQnA/tests/test_compose_on_xeon.sh
+++ b/ChatQnA/tests/test_compose_on_xeon.sh
@@ -38,6 +38,7 @@ function start_services() {
     export HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN}
 
     # Start Docker Containers
+    sed -i "s|container_name: chatqna-xeon-backend-server|container_name: chatqna-xeon-backend-server\n volumes:\n - \"${WORKPATH}\/docker_image_build\/GenAIComps:\/home\/user\/GenAIComps\"|g" compose.yaml
     docker compose -f compose.yaml up -d > ${LOG_PATH}/start_services_with_compose.log
 
     n=0
diff --git a/CodeGen/benchmark/performance/benchmark.yaml b/CodeGen/benchmark/performance/benchmark.yaml
index 90d74d02b..7ec64fbe7 100644
--- a/CodeGen/benchmark/performance/benchmark.yaml
+++ b/CodeGen/benchmark/performance/benchmark.yaml
@@ -38,7 +38,7 @@ test_cases:
         top_k: 10
         top_p: 0.95
         repetition_penalty: 1.03
-        streaming: true
+        stream: true
     llmserve:
       run_test: true
       service_name: "llm-svc" # Replace with your service name
diff --git a/CodeGen/codegen.py b/CodeGen/codegen.py
index 9769d682d..16db9aa26 100644
--- a/CodeGen/codegen.py
+++ b/CodeGen/codegen.py
@@ -53,7 +53,7 @@ async def handle_request(self, request: Request):
             frequency_penalty=chat_request.frequency_penalty if chat_request.frequency_penalty else 0.0,
             presence_penalty=chat_request.presence_penalty if chat_request.presence_penalty else 0.0,
             repetition_penalty=chat_request.repetition_penalty if chat_request.repetition_penalty else 1.03,
-            streaming=stream_opt,
+            stream=stream_opt,
         )
         result_dict, runtime_graph = await self.megaservice.schedule(
             initial_inputs={"query": prompt}, llm_parameters=parameters
diff --git a/CodeGen/docker_compose/amd/gpu/rocm/README.md b/CodeGen/docker_compose/amd/gpu/rocm/README.md
index 46e24f16a..a1ac0ce60 100644
--- a/CodeGen/docker_compose/amd/gpu/rocm/README.md
+++ b/CodeGen/docker_compose/amd/gpu/rocm/README.md
@@ -113,7 +113,7 @@ curl http://${HOST_IP}:${CODEGEN_TGI_SERVICE_PORT}/generate \
 ```bash
 curl http://${HOST_IP}:${CODEGEN_LLM_SERVICE_PORT}/v1/chat/completions\
   -X POST \
-  -d '{"query":"Implement a high-level API for a TODO list application. The API takes as input an operation request and updates the TODO list in place. If the request is invalid, raise an exception.","max_tokens":256,"top_k":10,"top_p":0.95,"typical_p":0.95,"temperature":0.01,"repetition_penalty":1.03,"streaming":true}' \
+  -d '{"query":"Implement a high-level API for a TODO list application. The API takes as input an operation request and updates the TODO list in place. If the request is invalid, raise an exception.","max_tokens":256,"top_k":10,"top_p":0.95,"typical_p":0.95,"temperature":0.01,"repetition_penalty":1.03,"stream":true}' \
   -H 'Content-Type: application/json'
 ```
 
diff --git a/CodeGen/docker_compose/intel/cpu/xeon/README.md b/CodeGen/docker_compose/intel/cpu/xeon/README.md
index d44adc91d..e5ea8e7d5 100644
--- a/CodeGen/docker_compose/intel/cpu/xeon/README.md
+++ b/CodeGen/docker_compose/intel/cpu/xeon/README.md
@@ -138,7 +138,7 @@ docker compose up -d
 ```bash
 curl http://${host_ip}:9000/v1/chat/completions\
   -X POST \
-  -d '{"query":"Implement a high-level API for a TODO list application. The API takes as input an operation request and updates the TODO list in place. If the request is invalid, raise an exception.","max_tokens":256,"top_k":10,"top_p":0.95,"typical_p":0.95,"temperature":0.01,"repetition_penalty":1.03,"streaming":true}' \
+  -d '{"query":"Implement a high-level API for a TODO list application. The API takes as input an operation request and updates the TODO list in place. If the request is invalid, raise an exception.","max_tokens":256,"top_k":10,"top_p":0.95,"typical_p":0.95,"temperature":0.01,"repetition_penalty":1.03,"stream":true}' \
   -H 'Content-Type: application/json'
 ```
 
@@ -250,7 +250,7 @@ There are 4 areas worth noting as shown in the screenshot above:
 
 1. Enter and submit your question
 2. Your previous questions
-3. Answers from AI assistant (Code will be highlighted properly according to the programming language it is written in, also support streaming output)
+3. Answers from AI assistant (Code will be highlighted properly according to the programming language it is written in, also support stream output)
 4. Copy or replace code with one click (Note that you need to select the code in the editor first and then click "replace", otherwise the code will be inserted)
 
 You can also select the code in the editor and ask the AI assistant questions about the code directly.
diff --git a/CodeGen/docker_compose/intel/hpu/gaudi/README.md b/CodeGen/docker_compose/intel/hpu/gaudi/README.md
index ad6835925..ef2692f0d 100644
--- a/CodeGen/docker_compose/intel/hpu/gaudi/README.md
+++ b/CodeGen/docker_compose/intel/hpu/gaudi/README.md
@@ -119,7 +119,7 @@ docker compose up -d
 ```bash
 curl http://${host_ip}:9000/v1/chat/completions\
   -X POST \
-  -d '{"query":"Implement a high-level API for a TODO list application. The API takes as input an operation request and updates the TODO list in place. If the request is invalid, raise an exception.","max_tokens":256,"top_k":10,"top_p":0.95,"typical_p":0.95,"temperature":0.01,"repetition_penalty":1.03,"streaming":true}' \
+  -d '{"query":"Implement a high-level API for a TODO list application. The API takes as input an operation request and updates the TODO list in place. If the request is invalid, raise an exception.","max_tokens":256,"top_k":10,"top_p":0.95,"typical_p":0.95,"temperature":0.01,"repetition_penalty":1.03,"stream":true}' \
   -H 'Content-Type: application/json'
 ```
 
@@ -227,7 +227,7 @@ There are 4 areas worth noting as shown in the screenshot above:
 
 1. Enter and submit your question
 2. Your previous questions
-3. Answers from AI assistant (Code will be highlighted properly according to the programming language it is written in, also support streaming output)
+3. Answers from AI assistant (Code will be highlighted properly according to the programming language it is written in, also support stream output)
 4. Copy or replace code with one click (Note that you need to select the code in the editor first and then click "replace", otherwise the code will be inserted)
 
 You can also select the code in the editor and ask the AI assistant questions about the code directly.
diff --git a/CodeTrans/benchmark/performance/benchmark.yaml b/CodeTrans/benchmark/performance/benchmark.yaml
index 8680e886d..a8bff2f3d 100644
--- a/CodeTrans/benchmark/performance/benchmark.yaml
+++ b/CodeTrans/benchmark/performance/benchmark.yaml
@@ -38,7 +38,7 @@ test_cases:
         top_k: 10
         top_p: 0.95
         repetition_penalty: 1.03
-        streaming: true
+        stream: true
     llmserve:
       run_test: true
       service_name: "codetrans-llm-svc" # Replace with your service name
diff --git a/DocSum/docker_compose/intel/cpu/xeon/README.md b/DocSum/docker_compose/intel/cpu/xeon/README.md
index f87aedc6b..98aaad918 100644
--- a/DocSum/docker_compose/intel/cpu/xeon/README.md
+++ b/DocSum/docker_compose/intel/cpu/xeon/README.md
@@ -289,7 +289,7 @@ You will have the following Docker Images:
 
 **summary_type=map_reduce**
 
-Map_reduce mode will split the inputs into multiple chunks, map each document to an individual summary, then consolidate those summaries into a single global summary. `streaming=True` is not allowed here.
+Map_reduce mode will split the inputs into multiple chunks, map each document to an individual summary, then consolidate those summaries into a single global summary. `stream=True` is not allowed here.
 
 In this mode, default `chunk_size` is set to be `min(MAX_TOTAL_TOKENS - input.max_tokens - 50, MAX_INPUT_TOKENS)`
 
diff --git a/DocSum/docker_compose/intel/hpu/gaudi/README.md b/DocSum/docker_compose/intel/hpu/gaudi/README.md
index 171bf37d9..65a1799d3 100644
--- a/DocSum/docker_compose/intel/hpu/gaudi/README.md
+++ b/DocSum/docker_compose/intel/hpu/gaudi/README.md
@@ -280,7 +280,7 @@ You will have the following Docker Images:
 
 **summary_type=map_reduce**
 
-Map_reduce mode will split the inputs into multiple chunks, map each document to an individual summary, then consolidate those summaries into a single global summary. `streaming=True` is not allowed here.
+Map_reduce mode will split the inputs into multiple chunks, map each document to an individual summary, then consolidate those summaries into a single global summary. `stream=True` is not allowed here.
 
 In this mode, default `chunk_size` is set to be `min(MAX_TOTAL_TOKENS - input.max_tokens - 50, MAX_INPUT_TOKENS)`
 
diff --git a/DocSum/docsum.py b/DocSum/docsum.py
index a640c0f08..d1689d92a 100644
--- a/DocSum/docsum.py
+++ b/DocSum/docsum.py
@@ -231,7 +231,7 @@ async def handle_request(self, request: Request, files: List[UploadFile] = File(
             frequency_penalty=chat_request.frequency_penalty if chat_request.frequency_penalty else 0.0,
             presence_penalty=chat_request.presence_penalty if chat_request.presence_penalty else 0.0,
             repetition_penalty=chat_request.repetition_penalty if chat_request.repetition_penalty else 1.03,
-            streaming=stream_opt,
+            stream=stream_opt,
             model=chat_request.model if chat_request.model else None,
             language=chat_request.language if chat_request.language else "auto",
             summary_type=summary_type,
diff --git a/EdgeCraftRAG/chatqna.py b/EdgeCraftRAG/chatqna.py
index d9441d09f..bc6f0a643 100644
--- a/EdgeCraftRAG/chatqna.py
+++ b/EdgeCraftRAG/chatqna.py
@@ -52,7 +52,7 @@ async def handle_request(self, request: Request):
             frequency_penalty=chat_request.frequency_penalty if chat_request.frequency_penalty else 0.0,
             presence_penalty=chat_request.presence_penalty if chat_request.presence_penalty else 0.0,
             repetition_penalty=chat_request.repetition_penalty if chat_request.repetition_penalty else 1.03,
-            streaming=stream_opt,
+            stream=stream_opt,
             chat_template=chat_request.chat_template if chat_request.chat_template else None,
         )
         result_dict, runtime_graph = await self.megaservice.schedule(initial_inputs=input, llm_parameters=parameters)
diff --git a/EdgeCraftRAG/ui/gradio/ecragui.py b/EdgeCraftRAG/ui/gradio/ecragui.py
index f8e46ff0c..8e46dee03 100644
--- a/EdgeCraftRAG/ui/gradio/ecragui.py
+++ b/EdgeCraftRAG/ui/gradio/ecragui.py
@@ -175,7 +175,7 @@ async def bot(
     }
 
     server_addr = f"http://{MEGA_SERVICE_HOST_IP}:{MEGA_SERVICE_PORT}"
-    # Async for streaming response
+    # Async for stream response
     partial_text = ""
     link_urls = []
     image_paths = []
diff --git a/FaqGen/benchmark/performance/benchmark.yaml b/FaqGen/benchmark/performance/benchmark.yaml
index 2c9c914de..30e92ed8c 100644
--- a/FaqGen/benchmark/performance/benchmark.yaml
+++ b/FaqGen/benchmark/performance/benchmark.yaml
@@ -38,7 +38,7 @@ test_cases:
         top_k: 10
         top_p: 0.95
         repetition_penalty: 1.03
-        streaming: true
+        stream: true
     llmserve:
       run_test: false
       service_name: "faq-micro-svc" # Replace with your service name
diff --git a/FaqGen/docker_compose/intel/hpu/gaudi/README.md b/FaqGen/docker_compose/intel/hpu/gaudi/README.md
index b2ef57e77..4b8ebb523 100644
--- a/FaqGen/docker_compose/intel/hpu/gaudi/README.md
+++ b/FaqGen/docker_compose/intel/hpu/gaudi/README.md
@@ -124,7 +124,7 @@ docker compose up -d
 ```
 
 ```bash
-##enable streaming
+##enable stream
 curl http://${host_ip}:8888/v1/faqgen \
   -H "Content-Type: multipart/form-data" \
   -F "messages=Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5." \
diff --git a/FaqGen/faqgen.py b/FaqGen/faqgen.py
index da96bc1bd..f4b0a5803 100644
--- a/FaqGen/faqgen.py
+++ b/FaqGen/faqgen.py
@@ -109,7 +109,7 @@ async def handle_request(self, request: Request, files: List[UploadFile] = File(
             frequency_penalty=chat_request.frequency_penalty if chat_request.frequency_penalty else 0.0,
             presence_penalty=chat_request.presence_penalty if chat_request.presence_penalty else 0.0,
             repetition_penalty=chat_request.repetition_penalty if chat_request.repetition_penalty else 1.03,
-            streaming=stream_opt,
+            stream=stream_opt,
             model=chat_request.model if chat_request.model else None,
         )
         result_dict, runtime_graph = await self.megaservice.schedule(
diff --git a/GraphRAG/graphrag.py b/GraphRAG/graphrag.py
index 6d3e3b982..8ed4471f3 100644
--- a/GraphRAG/graphrag.py
+++ b/GraphRAG/graphrag.py
@@ -64,7 +64,7 @@ def align_inputs(self, inputs, cur_node, runtime_graph, llm_parameters_dict, **k
         next_inputs["messages"] = [{"role": "user", "content": inputs["inputs"]}]
         next_inputs["max_tokens"] = llm_parameters_dict["max_tokens"]
         next_inputs["top_p"] = llm_parameters_dict["top_p"]
-        next_inputs["stream"] = inputs["streaming"]
+        next_inputs["stream"] = inputs["stream"]
         next_inputs["frequency_penalty"] = inputs["frequency_penalty"]
         # next_inputs["presence_penalty"] = inputs["presence_penalty"]
         # next_inputs["repetition_penalty"] = inputs["repetition_penalty"]
@@ -191,7 +191,7 @@ def parser_input(data, TypeClass, key):
             frequency_penalty=chat_request.frequency_penalty if chat_request.frequency_penalty else 0.0,
             presence_penalty=chat_request.presence_penalty if chat_request.presence_penalty else 0.0,
             repetition_penalty=chat_request.repetition_penalty if chat_request.repetition_penalty else 1.03,
-            streaming=stream_opt,
+            stream=stream_opt,
             chat_template=chat_request.chat_template if chat_request.chat_template else None,
         )
         retriever_parameters = RetrieverParms(
diff --git a/GraphRAG/tests/test_compose_on_gaudi.sh b/GraphRAG/tests/test_compose_on_gaudi.sh
index df14dee2c..3525936ae 100755
--- a/GraphRAG/tests/test_compose_on_gaudi.sh
+++ b/GraphRAG/tests/test_compose_on_gaudi.sh
@@ -40,6 +40,7 @@ function start_services() {
     export host_ip=${ip_address}
 
     # Start Docker Containers
+    sed -i "s|container_name: graphrag-gaudi-backend-server|container_name: graphrag-gaudi-backend-server\n volumes:\n - \"${WORKPATH}\/docker_image_build\/GenAIComps:\/home\/user\/GenAIComps\"|g" compose.yaml
     docker compose -f compose.yaml up -d > ${LOG_PATH}/start_services_with_compose.log
 
     n=0
diff --git a/MultimodalQnA/multimodalqna.py b/MultimodalQnA/multimodalqna.py
index 3b248246e..02b8334ae 100644
--- a/MultimodalQnA/multimodalqna.py
+++ b/MultimodalQnA/multimodalqna.py
@@ -220,7 +220,7 @@ async def handle_request(self, request: Request):
         data = await request.json()
         stream_opt = bool(data.get("stream", False))
         if stream_opt:
-            print("[ MultimodalQnAService ] stream=True not used, this has not support streaming yet!")
+            print("[ MultimodalQnAService ] stream=True not used, this has not support stream yet!")
             stream_opt = False
         chat_request = ChatCompletionRequest.model_validate(data)
         # Multimodal RAG QnA With Videos has not yet accepts image as input during QnA.
@@ -263,7 +263,7 @@ async def handle_request(self, request: Request):
             frequency_penalty=chat_request.frequency_penalty if chat_request.frequency_penalty else 0.0,
             presence_penalty=chat_request.presence_penalty if chat_request.presence_penalty else 0.0,
             repetition_penalty=chat_request.repetition_penalty if chat_request.repetition_penalty else 1.03,
-            streaming=stream_opt,
+            stream=stream_opt,
             chat_template=chat_request.chat_template if chat_request.chat_template else None,
         )
         result_dict, runtime_graph = await cur_megaservice.schedule(
@@ -272,8 +272,8 @@ async def handle_request(self, request: Request):
         for node, response in result_dict.items():
             # the last microservice in this megaservice is LVM.
             # checking if LVM returns StreamingResponse
-            # Currently, LVM with LLAVA has not yet supported streaming.
-            # @TODO: Will need to test this once LVM with LLAVA supports streaming
+            # Currently, LVM with LLAVA has not yet supported stream.
+            # @TODO: Will need to test this once LVM with LLAVA supports stream
             if (
                 isinstance(response, StreamingResponse)
                 and node == runtime_graph.all_leaves()[-1]
diff --git a/MultimodalQnA/tests/test_compose_on_gaudi.sh b/MultimodalQnA/tests/test_compose_on_gaudi.sh
index da34708b2..a0279d5b4 100644
--- a/MultimodalQnA/tests/test_compose_on_gaudi.sh
+++ b/MultimodalQnA/tests/test_compose_on_gaudi.sh
@@ -62,6 +62,7 @@ function start_services() {
     cd $WORKPATH/docker_compose/intel/hpu/gaudi
 
     # Start Docker Containers
+    sed -i "s|container_name: multimodalqna-backend-server|container_name: multimodalqna-backend-server\n volumes:\n - \"${WORKPATH}\/docker_image_build\/GenAIComps:\/home\/user\/GenAIComps\"|g" compose.yaml
     docker compose -f compose.yaml up -d > ${LOG_PATH}/start_services_with_compose.log
     sleep 2m
 }
diff --git a/MultimodalQnA/tests/test_compose_on_rocm.sh b/MultimodalQnA/tests/test_compose_on_rocm.sh
index eabbbfeb9..ade7187ed 100644
--- a/MultimodalQnA/tests/test_compose_on_rocm.sh
+++ b/MultimodalQnA/tests/test_compose_on_rocm.sh
@@ -68,6 +68,7 @@ function start_services() {
 
 
     # Start Docker Containers
+    sed -i "s|container_name: multimodalqna-backend-server|container_name: multimodalqna-backend-server\n volumes:\n - \"${WORKPATH}\/docker_image_build\/GenAIComps:\/home\/user\/GenAIComps\"|g" compose.yaml
     docker compose -f compose.yaml up -d > ${LOG_PATH}/start_services_with_compose.log
     sleep 2m
 }
diff --git a/MultimodalQnA/tests/test_compose_on_xeon.sh b/MultimodalQnA/tests/test_compose_on_xeon.sh
index 904a3e7a7..e49f265ab 100644
--- a/MultimodalQnA/tests/test_compose_on_xeon.sh
+++ b/MultimodalQnA/tests/test_compose_on_xeon.sh
@@ -61,6 +61,7 @@ function start_services() {
 
 
     # Start Docker Containers
+    sed -i "s|container_name: multimodalqna-backend-server|container_name: multimodalqna-backend-server\n volumes:\n - \"${WORKPATH}\/docker_image_build\/GenAIComps:\/home\/user\/GenAIComps\"|g" compose.yaml
     docker compose -f compose.yaml up -d > ${LOG_PATH}/start_services_with_compose.log
     sleep 2m
 }
diff --git a/ProductivitySuite/docker_compose/intel/cpu/xeon/README.md b/ProductivitySuite/docker_compose/intel/cpu/xeon/README.md
index 66b62d010..ac2e40492 100644
--- a/ProductivitySuite/docker_compose/intel/cpu/xeon/README.md
+++ b/ProductivitySuite/docker_compose/intel/cpu/xeon/README.md
@@ -277,7 +277,7 @@ Please refer to **[keycloak_setup_guide](keycloak_setup_guide.md)** for more det
 ```bash
 curl http://${host_ip}:9000/v1/chat/completions\
   -X POST \
-  -d '{"query":"What is Deep Learning?","max_tokens":17,"top_k":10,"top_p":0.95,"typical_p":0.95,"temperature":0.01,"repetition_penalty":1.03,"streaming":true}' \
+  -d '{"query":"What is Deep Learning?","max_tokens":17,"top_k":10,"top_p":0.95,"typical_p":0.95,"temperature":0.01,"repetition_penalty":1.03,"stream":true}' \
   -H 'Content-Type: application/json'
 ```
 
diff --git a/SearchQnA/docker_compose/intel/cpu/xeon/README.md b/SearchQnA/docker_compose/intel/cpu/xeon/README.md
index c2c148656..e669dd4d9 100644
--- a/SearchQnA/docker_compose/intel/cpu/xeon/README.md
+++ b/SearchQnA/docker_compose/intel/cpu/xeon/README.md
@@ -140,7 +140,7 @@ curl http://${host_ip}:3006/generate \
 # llm microservice
 curl http://${host_ip}:3007/v1/chat/completions\
   -X POST \
-  -d '{"query":"What is Deep Learning?","max_tokens":17,"top_k":10,"top_p":0.95,"typical_p":0.95,"temperature":0.01,"repetition_penalty":1.03,"streaming":true}' \
+  -d '{"query":"What is Deep Learning?","max_tokens":17,"top_k":10,"top_p":0.95,"typical_p":0.95,"temperature":0.01,"repetition_penalty":1.03,"stream":true}' \
   -H 'Content-Type: application/json'
 ```
 
diff --git a/SearchQnA/docker_compose/intel/hpu/gaudi/README.md b/SearchQnA/docker_compose/intel/hpu/gaudi/README.md
index 49ecd4272..1bf646bb3 100644
--- a/SearchQnA/docker_compose/intel/hpu/gaudi/README.md
+++ b/SearchQnA/docker_compose/intel/hpu/gaudi/README.md
@@ -138,7 +138,7 @@ curl http://${host_ip}:3006/generate \
 # llm microservice
 curl http://${host_ip}:3007/v1/chat/completions\
   -X POST \
-  -d '{"query":"What is Deep Learning?","max_tokens":17,"top_k":10,"top_p":0.95,"typical_p":0.95,"temperature":0.01,"repetition_penalty":1.03,"streaming":true}' \
+  -d '{"query":"What is Deep Learning?","max_tokens":17,"top_k":10,"top_p":0.95,"typical_p":0.95,"temperature":0.01,"repetition_penalty":1.03,"stream":true}' \
   -H 'Content-Type: application/json'
 ```
 
diff --git a/SearchQnA/searchqna.py b/SearchQnA/searchqna.py
index 21c7bf019..1eb0b4db0 100644
--- a/SearchQnA/searchqna.py
+++ b/SearchQnA/searchqna.py
@@ -96,7 +96,7 @@ async def handle_request(self, request: Request):
             frequency_penalty=chat_request.frequency_penalty if chat_request.frequency_penalty else 0.0,
             presence_penalty=chat_request.presence_penalty if chat_request.presence_penalty else 0.0,
             repetition_penalty=chat_request.repetition_penalty if chat_request.repetition_penalty else 1.03,
-            streaming=stream_opt,
+            stream=stream_opt,
         )
         result_dict, runtime_graph = await self.megaservice.schedule(
             initial_inputs={"input": prompt}, llm_parameters=parameters
diff --git a/VideoQnA/videoqna.py b/VideoQnA/videoqna.py
index 3b699faa7..c447dd2ab 100644
--- a/VideoQnA/videoqna.py
+++ b/VideoQnA/videoqna.py
@@ -85,7 +85,7 @@ async def handle_request(self, request: Request):
            frequency_penalty=chat_request.frequency_penalty if chat_request.frequency_penalty else 0.0,
            presence_penalty=chat_request.presence_penalty if chat_request.presence_penalty else 0.0,
            repetition_penalty=chat_request.repetition_penalty if chat_request.repetition_penalty else 1.03,
-            streaming=stream_opt,
+            stream=stream_opt,
         )
         result_dict, runtime_graph = await self.megaservice.schedule(
             initial_inputs={"text": prompt}, llm_parameters=parameters
diff --git a/VisualQnA/benchmark/performance/benchmark.yaml b/VisualQnA/benchmark/performance/benchmark.yaml
index 9ddf92293..179317bdb 100644
--- a/VisualQnA/benchmark/performance/benchmark.yaml
+++ b/VisualQnA/benchmark/performance/benchmark.yaml
@@ -38,7 +38,7 @@ test_cases:
         top_k: 10
         top_p: 0.95
         repetition_penalty: 1.03
-        streaming: true
+        stream: true
     lvmserve:
       run_test: true
       service_name: "lvm-serving-svc" # Replace with your service name
diff --git a/VisualQnA/visualqna.py b/VisualQnA/visualqna.py
index 312239615..6d7fae154 100644
--- a/VisualQnA/visualqna.py
+++ b/VisualQnA/visualqna.py
@@ -52,7 +52,7 @@ async def handle_request(self, request: Request):
             frequency_penalty=chat_request.frequency_penalty if chat_request.frequency_penalty else 0.0,
             presence_penalty=chat_request.presence_penalty if chat_request.presence_penalty else 0.0,
             repetition_penalty=chat_request.repetition_penalty if chat_request.repetition_penalty else 1.03,
-            streaming=stream_opt,
+            stream=stream_opt,
         )
         result_dict, runtime_graph = await self.megaservice.schedule(
             initial_inputs={"prompt": prompt, "image": images[0]}, llm_parameters=parameters
diff --git a/WorkflowExecAgent/docker_compose/intel/cpu/xeon/compose_vllm.yaml b/WorkflowExecAgent/docker_compose/intel/cpu/xeon/compose_vllm.yaml
index f72ef10e0..4304831f2 100644
--- a/WorkflowExecAgent/docker_compose/intel/cpu/xeon/compose_vllm.yaml
+++ b/WorkflowExecAgent/docker_compose/intel/cpu/xeon/compose_vllm.yaml
@@ -20,7 +20,7 @@ services:
       model: ${model}
      temperature: ${temperature}
      max_new_tokens: ${max_new_tokens}
-      streaming: false
+      stream: false
      tools: /home/user/tools/tools.yaml
      no_proxy: ${no_proxy}
      http_proxy: ${http_proxy}