
Adapt to latest vLLM changes
- Remove --enforce-eager on HPU to improve performance (eager mode disables HPU graph capture)
- Adapt to the upstream Docker entrypoint changes: pass bare vLLM flags in extraCmdArgs instead of a wrapped /bin/bash -c command

Fixes issue opea-project#631.

Signed-off-by: Lianhao Lu <[email protected]>
lianhao committed Dec 10, 2024
1 parent 7219249 commit cde4096
Showing 4 changed files with 4 additions and 6 deletions.
2 changes: 1 addition & 1 deletion helm-charts/common/agent/values.yaml
@@ -14,7 +14,7 @@ tgi:
 vllm:
   enabled: false
   LLM_MODEL_ID: "mistralai/Mistral-7B-Instruct-v0.3"
-  extraCmdArgs: ["/bin/bash", "-c", "python3 -m vllm.entrypoints.openai.api_server --enforce-eager --model mistralai/Mistral-7B-Instruct-v0.3 --tensor-parallel-size 1 --host 0.0.0.0 --port 2080 --download-dir /data --block-size 128 --max-num-seqs 4096 --max-seq_len-to-capture 8192 --enable-auto-tool-choice --tool-call-parser mistral"]
+  extraCmdArgs: ["--tensor-parallel-size","1","--block-size","128","--max-num-seqs","4096","--max-seq_len-to-capture","8192","--enable-auto-tool-choice","--tool-call-parser","mistral"]
 
 replicaCount: 1
 llm_endpoint_url: ""
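With the new entrypoint, the image itself launches the OpenAI API server and appends extraCmdArgs to it. Assuming the entrypoint still supplies the model, host, port, and download directory that the old wrapped command passed explicitly, the effective invocation for the agent chart should be roughly:

    # Sketch of the effective server command; --model/--host/--port/--download-dir
    # are assumed to come from the image entrypoint, as in the old /bin/bash -c form.
    python3 -m vllm.entrypoints.openai.api_server \
      --model mistralai/Mistral-7B-Instruct-v0.3 \
      --host 0.0.0.0 --port 2080 --download-dir /data \
      --tensor-parallel-size 1 --block-size 128 --max-num-seqs 4096 \
      --max-seq_len-to-capture 8192 --enable-auto-tool-choice --tool-call-parser mistral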
2 changes: 1 addition & 1 deletion helm-charts/common/llm-uservice/ci-vllm-gaudi-values.yaml
@@ -13,7 +13,7 @@ vllm:
     tag: "latest"
   LLM_MODEL_ID: Intel/neural-chat-7b-v3-3
   OMPI_MCA_btl_vader_single_copy_mechanism: none
-  extraCmdArgs: ["--enforce-eager","--tensor-parallel-size","1","--block-size","128","--max-num-seqs","256","--max-seq_len-to-capture","2048"]
+  extraCmdArgs: ["--tensor-parallel-size","1","--block-size","128","--max-num-seqs","256","--max-seq_len-to-capture","2048"]
   resources:
     limits:
       habana.ai/gaudi: 1
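For context, charts like these typically splice extraCmdArgs into the container spec from the deployment template; a minimal sketch of the pattern (hypothetical excerpt, the repo's actual template may differ):

    # templates/deployment.yaml (hypothetical excerpt, not the repo's actual file)
    containers:
      - name: vllm
        image: "{{ .Values.image.repository }}:{{ .Values.image.tag }}"
        {{- if .Values.extraCmdArgs }}
        args:
          {{- range .Values.extraCmdArgs }}
          - {{ . | quote }}
          {{- end }}
        {{- end }}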
4 changes: 1 addition & 3 deletions helm-charts/common/vllm/gaudi-values.yaml
@@ -11,9 +11,7 @@ image:
 
 # VLLM_CPU_KVCACHE_SPACE: "40"
 OMPI_MCA_btl_vader_single_copy_mechanism: none
-extraCmdArgs: ["--enforce-eager","--tensor-parallel-size","1","--block-size","128","--max-num-seqs","256","--max-seq_len-to-capture","2048"]
-# Workaround for current HPU image with start command /bin/bash
-# extraCmdArgs: ["/bin/bash","-c","python3 -m vllm.entrypoints.openai.api_server --enforce-eager --model Intel/neural-chat-7b-v3-3 --tensor-parallel-size 1 --host 0.0.0.0 --port 2080 --download-dir /data --block-size 128 --max-num-seqs 256 --max-seq_len-to-capture 2048"]
+extraCmdArgs: ["--tensor-parallel-size","1","--block-size","128","--max-num-seqs","256","--max-seq_len-to-capture","2048"]
 resources:
   limits:
     habana.ai/gaudi: 1
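Since the /bin/bash workaround is dropped along with --enforce-eager, a quick smoke test confirms the server still comes up under the new entrypoint (port 2080 as in the old wrapped command; adjust if your service maps it differently):

    # Hypothetical smoke test; assumes the vLLM service is reachable on localhost:2080,
    # e.g. via: kubectl port-forward svc/vllm 2080:2080 (service name may differ)
    curl http://localhost:2080/v1/models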
2 changes: 1 addition & 1 deletion helm-charts/common/vllm/values.yaml
@@ -51,7 +51,7 @@ resources: {}
 # cpu: 100m
 # memory: 128Mi
 
-extraCmdArgs: ["--enforce-eager", "--dtype", "auto"]
+extraCmdArgs: []
 
 livenessProbe:
   httpGet:
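With the default now empty, deployments that still want the old CPU-oriented flags can set them back at install time; a minimal sketch, assuming a release named vllm and an override file you create yourself:

    # my-values.yaml (hypothetical override file)
    extraCmdArgs: ["--enforce-eager", "--dtype", "auto"]

    # then:
    helm upgrade --install vllm ./helm-charts/common/vllm -f my-values.yaml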
