vllm comps support openai API ChatCompletionRequest (opea-project#1032)

* vllm support openai API

Signed-off-by: Xinyao Wang <[email protected]>

* fix bug

Signed-off-by: Xinyao Wang <[email protected]>

* fix bug

Signed-off-by: Xinyao Wang <[email protected]>

* test_llms_text-generation_vllm_langchain_on_intel_hpu.sh

Signed-off-by: Xinyao Wang <[email protected]>

* fix time

Signed-off-by: Xinyao Wang <[email protected]>

* fix bug

Signed-off-by: Xinyao Wang <[email protected]>

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* fix bug

Signed-off-by: Xinyao Wang <[email protected]>

---------

Signed-off-by: Xinyao Wang <[email protected]>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
2 people authored and jjmaturino committed Dec 16, 2024
1 parent da99090 commit 5b42ca8
Showing 4 changed files with 121 additions and 41 deletions.
30 changes: 11 additions & 19 deletions comps/llms/text-generation/vllm/langchain/README.md
@@ -223,29 +223,21 @@ User can set the following model parameters according to needs:
- streaming(true/false): return text response in streaming mode or non-streaming mode

```bash
-# 1. Non-streaming mode
+# Stream mode
 curl http://${your_ip}:9000/v1/chat/completions \
-    -X POST \
-    -d '{"query":"What is Deep Learning?","max_tokens":17,"top_p":1,"temperature":0.7,"frequency_penalty":0,"presence_penalty":0, "streaming":false}' \
-    -H 'Content-Type: application/json'
+    -X POST \
+    -d '{"model": "${model_name}", "messages": "What is Deep Learning?", "max_tokens":17}' \
+    -H 'Content-Type: application/json'

-# 2. Streaming mode
 curl http://${your_ip}:9000/v1/chat/completions \
-    -X POST \
-    -d '{"query":"What is Deep Learning?","max_tokens":17,"top_k":10,"top_p":0.95,"typical_p":0.95,"temperature":0.01,"repetition_penalty":1.03,"streaming":true}' \
-    -H 'Content-Type: application/json'
+    -X POST \
+    -d '{"model": "${model_name}", "messages": [{"role": "user", "content": "What is Deep Learning?"}], "max_tokens":17}' \
+    -H 'Content-Type: application/json'

-# 3. Custom chat template with streaming mode
+# Non-stream mode
 curl http://${your_ip}:9000/v1/chat/completions \
-    -X POST \
-    -d '{"query":"What is Deep Learning?","max_tokens":17,"top_k":10,"top_p":0.95,"typical_p":0.95,"temperature":0.01,"repetition_penalty":1.03,"streaming":true, "chat_template":"### You are a helpful, respectful and honest assistant to help the user with questions.\n### Context: {context}\n### Question: {question}\n### Answer:"}' \
-    -H 'Content-Type: application/json'
+    -X POST \
+    -d '{"model": "${model_name}", "messages": "What is Deep Learning?", "max_tokens":17, "stream":false}' \
+    -H 'Content-Type: application/json'

+# Chat with SearchedDoc (Retrieval context)
+curl http://${your_ip}:9000/v1/chat/completions \
+    -X POST \
+    -d '{"initial_query":"What is Deep Learning?","retrieved_docs":[{"text":"Deep Learning is a ..."},{"text":"Deep Learning is b ..."}]}' \
+    -H 'Content-Type: application/json'
```

For parameter details, refer to the [LangChain VLLMOpenAI API](https://api.python.langchain.com/en/latest/llms/langchain_community.llms.vllm.VLLMOpenAI.html).
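For readers who prefer Python over curl, the same two call patterns can be sketched with the `requests` library. This is a minimal sketch, not part of the commit: the host, port, model name, and timeout are placeholders taken from the curl examples above.

```python
import requests

base = "http://localhost:9000/v1/chat/completions"  # substitute ${your_ip} for localhost
headers = {"Content-Type": "application/json"}

# Non-stream mode: "messages" may be a plain string or an OpenAI-style message list.
payload = {
    "model": "${model_name}",  # replace with the model actually served by vLLM
    "messages": [{"role": "user", "content": "What is Deep Learning?"}],
    "max_tokens": 17,
    "stream": False,
}
resp = requests.post(base, json=payload, headers=headers, timeout=120)
print(resp.json())

# Stream mode: the service replies with server-sent events ("data: ..." lines).
payload["stream"] = True
with requests.post(base, json=payload, headers=headers, stream=True, timeout=120) as resp:
    for line in resp.iter_lines():
        if line:
            print(line.decode("utf-8"))
```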
93 changes: 93 additions & 0 deletions comps/llms/text-generation/vllm/langchain/llm.py
@@ -7,6 +7,7 @@
from fastapi.responses import StreamingResponse
from langchain_community.llms import VLLMOpenAI
from langchain_core.prompts import PromptTemplate
from openai import OpenAI
from template import ChatTemplate

from comps import (
@@ -194,6 +195,98 @@ async def stream_generator():
                logger.info(response)

            return GeneratedDoc(text=response, prompt=input.query)
    else:
        if logflag:
            logger.info("[ ChatCompletionRequest ] input in opea format")
        client = OpenAI(
            api_key="EMPTY",
            base_url=llm_endpoint + "/v1",
        )

        if isinstance(input.messages, str):
            prompt = input.messages
            if prompt_template:
                if sorted(input_variables) == ["context", "question"]:
                    prompt = prompt_template.format(question=input.messages, context="\n".join(input.documents))
                elif input_variables == ["question"]:
                    prompt = prompt_template.format(question=input.messages)
                else:
                    logger.info(
                        f"[ ChatCompletionRequest ] {prompt_template} not used, we only support 2 input variables ['question', 'context']"
                    )
            else:
                if input.documents:
                    # use rag default template
                    prompt = ChatTemplate.generate_rag_prompt(input.messages, input.documents, input.model)

            chat_completion = client.completions.create(
                model=model_name,
                prompt=prompt,
                echo=input.echo,
                frequency_penalty=input.frequency_penalty,
                max_tokens=input.max_tokens,
                n=input.n,
                presence_penalty=input.presence_penalty,
                seed=input.seed,
                stop=input.stop,
                stream=input.stream,
                suffix=input.suffix,
                temperature=input.temperature,
                top_p=input.top_p,
                user=input.user,
            )
        else:
            if input.messages[0]["role"] == "system":
                if "{context}" in input.messages[0]["content"]:
                    if input.documents is None or input.documents == []:
                        input.messages[0]["content"] = input.messages[0]["content"].format(context="")
                    else:
                        input.messages[0]["content"] = input.messages[0]["content"].format(context="\n".join(input.documents))
            else:
                if prompt_template:
                    system_prompt = prompt_template
                    if input_variables == ["context"]:
                        system_prompt = prompt_template.format(context="\n".join(input.documents))
                    else:
                        logger.info(
                            f"[ ChatCompletionRequest ] {prompt_template} not used, we only support 1 input variable ['context']"
                        )

                    input.messages.insert(0, {"role": "system", "content": system_prompt})

            chat_completion = client.chat.completions.create(
                model=model_name,
                messages=input.messages,
                frequency_penalty=input.frequency_penalty,
                max_tokens=input.max_tokens,
                n=input.n,
                presence_penalty=input.presence_penalty,
                response_format=input.response_format,
                seed=input.seed,
                stop=input.stop,
                stream=input.stream,
                stream_options=input.stream_options,
                temperature=input.temperature,
                top_p=input.top_p,
                user=input.user,
            )

        if input.stream:

            def stream_generator():
                for c in chat_completion:
                    if logflag:
                        logger.info(c)
                    chunk = c.model_dump_json()
                    if chunk not in ["<|im_end|>", "<|endoftext|>"]:
                        yield f"data: {chunk}\n\n"
                yield "data: [DONE]\n\n"

            return StreamingResponse(stream_generator(), media_type="text/event-stream")
        else:
            if logflag:
                logger.info(chat_completion)
            return chat_completion


if __name__ == "__main__":
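The new handler routes a plain-string `messages` through the text-completions endpoint, building a RAG prompt when `documents` are supplied, while an OpenAI-style message list goes to the chat-completions endpoint. A hypothetical request exercising the documents path from Python is sketched below; the field names mirror the handler above, but the host, model, and document text are illustrative values, not part of the commit.

```python
import requests

payload = {
    "model": "Intel/neural-chat-7b-v3-3",  # must match the model served by vLLM
    "messages": "What is Deep Learning?",  # plain string -> routed to client.completions.create
    "documents": ["Deep Learning is a subset of machine learning ..."],  # triggers the default RAG template
    "max_tokens": 32,
    "stream": False,
}
resp = requests.post("http://localhost:9000/v1/chat/completions", json=payload, timeout=120)
print(resp.json())
```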
20 changes: 0 additions & 20 deletions comps/llms/text-generation/vllm/langchain/query.sh

This file was deleted.

19 changes: 17 additions & 2 deletions test_llms_text-generation_vllm_langchain_on_intel_hpu.sh
@@ -44,6 +44,7 @@ function start_service() {
-p $port_number:80 \
-e HABANA_VISIBLE_DEVICES=all \
-e OMPI_MCA_btl_vader_single_copy_mechanism=none \
-e VLLM_SKIP_WARMUP=true \
--cap-add=sys_nice \
--ipc=host \
-e HF_TOKEN=${HUGGINGFACEHUB_API_TOKEN} \
@@ -62,7 +63,7 @@

# check whether vllm ray is fully ready
n=0
until [[ "$n" -ge 160 ]] || [[ $ready == true ]]; do
until [[ "$n" -ge 70 ]] || [[ $ready == true ]]; do
docker logs test-comps-vllm-service > ${WORKPATH}/tests/test-comps-vllm-service.log
n=$((n+1))
if grep -q throughput ${WORKPATH}/tests/test-comps-vllm-service.log; then
@@ -90,9 +91,23 @@ function validate_microservice() {
docker logs test-comps-vllm-microservice
exit 1
fi

result=$(http_proxy="" curl http://${ip_address}:5030/v1/chat/completions \
-X POST \
-d '{"model": "Intel/neural-chat-7b-v3-3", "messages": [{"role": "user", "content": "What is Deep Learning?"}], "max_tokens":17, "stream":false}' \
-H 'Content-Type: application/json')
if [[ $result == *"content"* ]]; then
echo "Result correct."
else
echo "Result wrong. Received was $result"
docker logs test-comps-vllm-service
docker logs test-comps-vllm-microservice
exit 1
fi

result=$(http_proxy="" curl http://${ip_address}:5030/v1/chat/completions \
-X POST \
-d '{"query":"What is Deep Learning?","max_tokens":17,"top_p":1,"temperature":0.7,"frequency_penalty":0,"presence_penalty":0, "streaming":false}' \
-d '{"model": "Intel/neural-chat-7b-v3-3", "messages": "What is Deep Learning?", "max_tokens":17, "stream":false}' \
-H 'Content-Type: application/json')
if [[ $result == *"text"* ]]; then
echo "Result correct."
Expand Down
