From 5b42ca889b9afec8fb86c6853d0886097cc63879 Mon Sep 17 00:00:00 2001
From: XinyaoWa
Date: Fri, 13 Dec 2024 17:56:24 +0800
Subject: [PATCH] vllm comps support openai API ChatCompletionRequest (#1032)

* vllm support openai API

Signed-off-by: Xinyao Wang

* fix bug

Signed-off-by: Xinyao Wang

* fix bug

Signed-off-by: Xinyao Wang

* test_llms_text-generation_vllm_langchain_on_intel_hpu.sh

Signed-off-by: Xinyao Wang

* fix time

Signed-off-by: Xinyao Wang

* fix bug

Signed-off-by: Xinyao Wang

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* fix bug

Signed-off-by: Xinyao Wang

---------

Signed-off-by: Xinyao Wang
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
---
 .../text-generation/vllm/langchain/README.md  | 30 +++---
 .../text-generation/vllm/langchain/llm.py     | 93 +++++++++++++++++++
 .../text-generation/vllm/langchain/query.sh   | 20 ----
 ...-generation_vllm_langchain_on_intel_hpu.sh | 19 +++-
 4 files changed, 121 insertions(+), 41 deletions(-)
 delete mode 100644 comps/llms/text-generation/vllm/langchain/query.sh

diff --git a/comps/llms/text-generation/vllm/langchain/README.md b/comps/llms/text-generation/vllm/langchain/README.md
index 1405273b0..bb83f0dc5 100644
--- a/comps/llms/text-generation/vllm/langchain/README.md
+++ b/comps/llms/text-generation/vllm/langchain/README.md
@@ -223,29 +223,21 @@ User can set the following model parameters according to needs:
 - streaming(true/false): return text response in streaming mode or non-streaming mode
 
 ```bash
-# 1. Non-streaming mode
+# Stream mode
 curl http://${your_ip}:9000/v1/chat/completions \
-  -X POST \
-  -d '{"query":"What is Deep Learning?","max_tokens":17,"top_p":1,"temperature":0.7,"frequency_penalty":0,"presence_penalty":0, "streaming":false}' \
-  -H 'Content-Type: application/json'
+    -X POST \
+    -d '{"model": "${model_name}", "messages": "What is Deep Learning?", "max_tokens":17}' \
+    -H 'Content-Type: application/json'
 
-# 2. Streaming mode
 curl http://${your_ip}:9000/v1/chat/completions \
-  -X POST \
-  -d '{"query":"What is Deep Learning?","max_tokens":17,"top_k":10,"top_p":0.95,"typical_p":0.95,"temperature":0.01,"repetition_penalty":1.03,"streaming":true}' \
-  -H 'Content-Type: application/json'
+    -X POST \
+    -d '{"model": "${model_name}", "messages": [{"role": "user", "content": "What is Deep Learning?"}], "max_tokens":17}' \
+    -H 'Content-Type: application/json'
 
-# 3. Custom chat template with streaming mode
+# Non-stream mode
 curl http://${your_ip}:9000/v1/chat/completions \
-  -X POST \
-  -d '{"query":"What is Deep Learning?","max_tokens":17,"top_k":10,"top_p":0.95,"typical_p":0.95,"temperature":0.01,"repetition_penalty":1.03,"streaming":true, "chat_template":"### You are a helpful, respectful and honest assistant to help the user with questions.\n### Context: {context}\n### Question: {question}\n### Answer:"}' \
-  -H 'Content-Type: application/json'
+    -X POST \
+    -d '{"model": "${model_name}", "messages": "What is Deep Learning?", "max_tokens":17, "stream":false}' \
+    -H 'Content-Type: application/json'
 
-# 4. Chat with SearchedDoc (Retrieval context)
-curl http://${your_ip}:9000/v1/chat/completions \
-  -X POST \
-  -d '{"initial_query":"What is Deep Learning?","retrieved_docs":[{"text":"Deep Learning is a ..."},{"text":"Deep Learning is b ..."}]}' \
-  -H 'Content-Type: application/json'
 ```
-
-For parameters, can refer to [LangChain VLLMOpenAI API](https://api.python.langchain.com/en/latest/llms/langchain_community.llms.vllm.VLLMOpenAI.html)
diff --git a/comps/llms/text-generation/vllm/langchain/llm.py b/comps/llms/text-generation/vllm/langchain/llm.py
index ccedec451..143c9b9d0 100644
--- a/comps/llms/text-generation/vllm/langchain/llm.py
+++ b/comps/llms/text-generation/vllm/langchain/llm.py
@@ -7,6 +7,7 @@
 from fastapi.responses import StreamingResponse
 from langchain_community.llms import VLLMOpenAI
 from langchain_core.prompts import PromptTemplate
+from openai import OpenAI
 from template import ChatTemplate
 
 from comps import (
@@ -194,6 +195,98 @@ async def stream_generator():
             if logflag:
                 logger.info(response)
             return GeneratedDoc(text=response, prompt=input.query)
+    else:
+        if logflag:
+            logger.info("[ ChatCompletionRequest ] input in opea format")
+        client = OpenAI(
+            api_key="EMPTY",
+            base_url=llm_endpoint + "/v1",
+        )
+
+        if isinstance(input.messages, str):
+            prompt = input.messages
+            if prompt_template:
+                if sorted(input_variables) == ["context", "question"]:
+                    prompt = prompt_template.format(question=input.messages, context="\n".join(input.documents))
+                elif input_variables == ["question"]:
+                    prompt = prompt_template.format(question=input.messages)
+                else:
+                    logger.info(
+                        f"[ ChatCompletionRequest ] {prompt_template} not used, we only support 2 input variables ['question', 'context']"
+                    )
+            else:
+                if input.documents:
+                    # use rag default template
+                    prompt = ChatTemplate.generate_rag_prompt(input.messages, input.documents, input.model)
+
+            chat_completion = client.completions.create(
+                model=model_name,
+                prompt=prompt,
+                echo=input.echo,
+                frequency_penalty=input.frequency_penalty,
+                max_tokens=input.max_tokens,
+                n=input.n,
+                presence_penalty=input.presence_penalty,
+                seed=input.seed,
+                stop=input.stop,
+                stream=input.stream,
+                suffix=input.suffix,
+                temperature=input.temperature,
+                top_p=input.top_p,
+                user=input.user,
+            )
+        else:
+            if input.messages[0]["role"] == "system":
+                if "{context}" in input.messages[0]["content"]:
+                    if input.documents is None or input.documents == []:
+                        input.messages[0]["content"] = input.messages[0]["content"].format(context="")
+                    else:
+                        input.messages[0]["content"] = input.messages[0]["content"].format(context="\n".join(input.documents))
+            else:
+                if prompt_template:
+                    system_prompt = prompt_template
+                    if input_variables == ["context"]:
+                        system_prompt = prompt_template.format(context="\n".join(input.documents))
+                    else:
+                        logger.info(
+                            f"[ ChatCompletionRequest ] {prompt_template} not used, we only support 1 input variable ['context']"
+                        )
+
+                    input.messages.insert(0, {"role": "system", "content": system_prompt})
+
+            chat_completion = client.chat.completions.create(
+                model=model_name,
+                messages=input.messages,
+                frequency_penalty=input.frequency_penalty,
+                max_tokens=input.max_tokens,
+                n=input.n,
+                presence_penalty=input.presence_penalty,
+                response_format=input.response_format,
+                seed=input.seed,
+                stop=input.stop,
+                stream=input.stream,
+                stream_options=input.stream_options,
+                temperature=input.temperature,
+                top_p=input.top_p,
+                user=input.user,
+            )
+
+        if input.stream:
+
+            def stream_generator():
+                for c in chat_completion:
+                    if logflag:
+                        logger.info(c)
+                    chunk = c.model_dump_json()
+                    if chunk not in ["<|im_end|>", "<|endoftext|>"]:
"<|endoftext|>"]: + yield f"data: {chunk}\n\n" + yield "data: [DONE]\n\n" + + return StreamingResponse(stream_generator(), media_type="text/event-stream") + else: + if logflag: + logger.info(chat_completion) + return chat_completion if __name__ == "__main__": diff --git a/comps/llms/text-generation/vllm/langchain/query.sh b/comps/llms/text-generation/vllm/langchain/query.sh deleted file mode 100644 index 31fa18750..000000000 --- a/comps/llms/text-generation/vllm/langchain/query.sh +++ /dev/null @@ -1,20 +0,0 @@ -# Copyright (C) 2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - -your_ip="0.0.0.0" -model=$(curl http://localhost:8008/v1/models -s|jq -r '.data[].id') - -curl http://${your_ip}:8008/v1/completions \ - -H "Content-Type: application/json" \ - -d '{ - "model": "'$model'", - "prompt": "What is Deep Learning?", - "max_tokens": 32, - "temperature": 0 - }' - -##query microservice -curl http://${your_ip}:9000/v1/chat/completions \ - -X POST \ - -d '{"query":"What is Deep Learning?","max_tokens":17,"top_p":1,"temperature":0.7,"frequency_penalty":0,"presence_penalty":0, "streaming":false}' \ - -H 'Content-Type: application/json' diff --git a/tests/llms/test_llms_text-generation_vllm_langchain_on_intel_hpu.sh b/tests/llms/test_llms_text-generation_vllm_langchain_on_intel_hpu.sh index 6b8e468f8..c83799128 100644 --- a/tests/llms/test_llms_text-generation_vllm_langchain_on_intel_hpu.sh +++ b/tests/llms/test_llms_text-generation_vllm_langchain_on_intel_hpu.sh @@ -44,6 +44,7 @@ function start_service() { -p $port_number:80 \ -e HABANA_VISIBLE_DEVICES=all \ -e OMPI_MCA_btl_vader_single_copy_mechanism=none \ + -e VLLM_SKIP_WARMUP=true \ --cap-add=sys_nice \ --ipc=host \ -e HF_TOKEN=${HUGGINGFACEHUB_API_TOKEN} \ @@ -62,7 +63,7 @@ function start_service() { # check whether vllm ray is fully ready n=0 - until [[ "$n" -ge 160 ]] || [[ $ready == true ]]; do + until [[ "$n" -ge 70 ]] || [[ $ready == true ]]; do docker logs test-comps-vllm-service > ${WORKPATH}/tests/test-comps-vllm-service.log n=$((n+1)) if grep -q throughput ${WORKPATH}/tests/test-comps-vllm-service.log; then @@ -90,9 +91,23 @@ function validate_microservice() { docker logs test-comps-vllm-microservice exit 1 fi + + result=$(http_proxy="" curl http://${ip_address}:5030/v1/chat/completions \ + -X POST \ + -d '{"model": "Intel/neural-chat-7b-v3-3", "messages": [{"role": "user", "content": "What is Deep Learning?"}], "max_tokens":17, "stream":false}' \ + -H 'Content-Type: application/json') + if [[ $result == *"content"* ]]; then + echo "Result correct." + else + echo "Result wrong. Received was $result" + docker logs test-comps-vllm-service + docker logs test-comps-vllm-microservice + exit 1 + fi + result=$(http_proxy="" curl http://${ip_address}:5030/v1/chat/completions \ -X POST \ - -d '{"query":"What is Deep Learning?","max_tokens":17,"top_p":1,"temperature":0.7,"frequency_penalty":0,"presence_penalty":0, "streaming":false}' \ + -d '{"model": "Intel/neural-chat-7b-v3-3", "messages": "What is Deep Learning?", "max_tokens":17, "stream":false}' \ -H 'Content-Type: application/json') if [[ $result == *"text"* ]]; then echo "Result correct."