From 5b42ca889b9afec8fb86c6853d0886097cc63879 Mon Sep 17 00:00:00 2001
From: XinyaoWa
Date: Fri, 13 Dec 2024 17:56:24 +0800
Subject: [PATCH] vllm comps support openai API ChatCompletionRequest (#1032)

* vllm support openai API

Signed-off-by: Xinyao Wang

* fix bug

Signed-off-by: Xinyao Wang

* fix bug

Signed-off-by: Xinyao Wang

* test_llms_text-generation_vllm_langchain_on_intel_hpu.sh

Signed-off-by: Xinyao Wang

* fix time

Signed-off-by: Xinyao Wang

* fix bug

Signed-off-by: Xinyao Wang

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* fix bug

Signed-off-by: Xinyao Wang

---------

Signed-off-by: Xinyao Wang
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
---
 .../text-generation/vllm/langchain/README.md  | 30 +++---
 .../text-generation/vllm/langchain/llm.py     | 93 +++++++++++++++++++
 .../text-generation/vllm/langchain/query.sh   | 20 ----
 ...-generation_vllm_langchain_on_intel_hpu.sh | 19 +++-
 4 files changed, 121 insertions(+), 41 deletions(-)
 delete mode 100644 comps/llms/text-generation/vllm/langchain/query.sh

diff --git a/comps/llms/text-generation/vllm/langchain/README.md b/comps/llms/text-generation/vllm/langchain/README.md
index 1405273b0..bb83f0dc5 100644
--- a/comps/llms/text-generation/vllm/langchain/README.md
+++ b/comps/llms/text-generation/vllm/langchain/README.md
@@ -223,29 +223,21 @@ User can set the following model parameters according to needs:
 - streaming(true/false): return text response in streaming mode or non-streaming mode
 
 ```bash
-# 1. Non-streaming mode
+# Stream mode
 curl http://${your_ip}:9000/v1/chat/completions \
-  -X POST \
-  -d '{"query":"What is Deep Learning?","max_tokens":17,"top_p":1,"temperature":0.7,"frequency_penalty":0,"presence_penalty":0, "streaming":false}' \
-  -H 'Content-Type: application/json'
+    -X POST \
+    -d '{"model": "${model_name}", "messages": "What is Deep Learning?", "max_tokens":17}' \
+    -H 'Content-Type: application/json'
 
-# 2. Streaming mode
 curl http://${your_ip}:9000/v1/chat/completions \
-  -X POST \
-  -d '{"query":"What is Deep Learning?","max_tokens":17,"top_k":10,"top_p":0.95,"typical_p":0.95,"temperature":0.01,"repetition_penalty":1.03,"streaming":true}' \
-  -H 'Content-Type: application/json'
+    -X POST \
+    -d '{"model": "${model_name}", "messages": [{"role": "user", "content": "What is Deep Learning?"}], "max_tokens":17}' \
+    -H 'Content-Type: application/json'
 
-# 3. Custom chat template with streaming mode
+# Non-stream mode
 curl http://${your_ip}:9000/v1/chat/completions \
-  -X POST \
-  -d '{"query":"What is Deep Learning?","max_tokens":17,"top_k":10,"top_p":0.95,"typical_p":0.95,"temperature":0.01,"repetition_penalty":1.03,"streaming":true, "chat_template":"### You are a helpful, respectful and honest assistant to help the user with questions.\n### Context: {context}\n### Question: {question}\n### Answer:"}' \
-  -H 'Content-Type: application/json'
+    -X POST \
+    -d '{"model": "${model_name}", "messages": "What is Deep Learning?", "max_tokens":17, "stream":false}' \
+    -H 'Content-Type: application/json'
 
-# 4. Chat with SearchedDoc (Retrieval context)
-curl http://${your_ip}:9000/v1/chat/completions \
-  -X POST \
-  -d '{"initial_query":"What is Deep Learning?","retrieved_docs":[{"text":"Deep Learning is a ..."},{"text":"Deep Learning is b ..."}]}' \
-  -H 'Content-Type: application/json'
 ```
-
-For parameters, can refer to [LangChain VLLMOpenAI API](https://api.python.langchain.com/en/latest/llms/langchain_community.llms.vllm.VLLMOpenAI.html)
diff --git a/comps/llms/text-generation/vllm/langchain/llm.py b/comps/llms/text-generation/vllm/langchain/llm.py
index ccedec451..143c9b9d0 100644
--- a/comps/llms/text-generation/vllm/langchain/llm.py
+++ b/comps/llms/text-generation/vllm/langchain/llm.py
@@ -7,6 +7,7 @@
 from fastapi.responses import StreamingResponse
 from langchain_community.llms import VLLMOpenAI
 from langchain_core.prompts import PromptTemplate
+from openai import OpenAI
 from template import ChatTemplate
 
 from comps import (
@@ -194,6 +195,98 @@ async def stream_generator():
             if logflag:
                 logger.info(response)
             return GeneratedDoc(text=response, prompt=input.query)
+    else:
+        if logflag:
+            logger.info("[ ChatCompletionRequest ] input in opea format")
+        client = OpenAI(
+            api_key="EMPTY",
+            base_url=llm_endpoint + "/v1",
+        )
+
+        if isinstance(input.messages, str):
+            prompt = input.messages
+            if prompt_template:
+                if sorted(input_variables) == ["context", "question"]:
+                    prompt = prompt_template.format(question=input.messages, context="\n".join(input.documents))
+                elif input_variables == ["question"]:
+                    prompt = prompt_template.format(question=input.messages)
+                else:
+                    logger.info(
+                        f"[ ChatCompletionRequest ] {prompt_template} not used, we only support 2 input variables ['question', 'context']"
+                    )
+            else:
+                if input.documents:
+                    # use rag default template
+                    prompt = ChatTemplate.generate_rag_prompt(input.messages, input.documents, input.model)
+
+            chat_completion = client.completions.create(
+                model=model_name,
+                prompt=prompt,
+                echo=input.echo,
+                frequency_penalty=input.frequency_penalty,
+                max_tokens=input.max_tokens,
+                n=input.n,
+                presence_penalty=input.presence_penalty,
+                seed=input.seed,
+                stop=input.stop,
+                stream=input.stream,
+                suffix=input.suffix,
+                temperature=input.temperature,
+                top_p=input.top_p,
+                user=input.user,
+            )
+        else:
+            if input.messages[0]["role"] == "system":
+                if "{context}" in input.messages[0]["content"]:
+                    if input.documents is None or input.documents == []:
+                        input.messages[0]["content"] = input.messages[0]["content"].format(context="")
+                    else:
+                        input.messages[0]["content"] = input.messages[0]["content"].format(context="\n".join(input.documents))
+            else:
+                if prompt_template:
+                    system_prompt = prompt_template
+                    if input_variables == ["context"]:
+                        system_prompt = prompt_template.format(context="\n".join(input.documents))
+                    else:
+                        logger.info(
+                            f"[ ChatCompletionRequest ] {prompt_template} not used, we only support 1 input variable ['context']"
+                        )
+
+                    input.messages.insert(0, {"role": "system", "content": system_prompt})
+
+            chat_completion = client.chat.completions.create(
+                model=model_name,
+                messages=input.messages,
+                frequency_penalty=input.frequency_penalty,
+                max_tokens=input.max_tokens,
+                n=input.n,
+                presence_penalty=input.presence_penalty,
+                response_format=input.response_format,
+                seed=input.seed,
+                stop=input.stop,
+                stream=input.stream,
+                stream_options=input.stream_options,
+                temperature=input.temperature,
+                top_p=input.top_p,
+                user=input.user,
+            )
+
+        if input.stream:
+
+            def stream_generator():
+                for c in chat_completion:
+                    if logflag:
+                        logger.info(c)
+                    chunk = c.model_dump_json()
+                    if chunk not in ["<|im_end|>", "<|endoftext|>"]:
"<|endoftext|>"]: + yield f"data: {chunk}\n\n" + yield "data: [DONE]\n\n" + + return StreamingResponse(stream_generator(), media_type="text/event-stream") + else: + if logflag: + logger.info(chat_completion) + return chat_completion if __name__ == "__main__": diff --git a/comps/llms/text-generation/vllm/langchain/query.sh b/comps/llms/text-generation/vllm/langchain/query.sh deleted file mode 100644 index 31fa18750..000000000 --- a/comps/llms/text-generation/vllm/langchain/query.sh +++ /dev/null @@ -1,20 +0,0 @@ -# Copyright (C) 2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - -your_ip="0.0.0.0" -model=$(curl http://localhost:8008/v1/models -s|jq -r '.data[].id') - -curl http://${your_ip}:8008/v1/completions \ - -H "Content-Type: application/json" \ - -d '{ - "model": "'$model'", - "prompt": "What is Deep Learning?", - "max_tokens": 32, - "temperature": 0 - }' - -##query microservice -curl http://${your_ip}:9000/v1/chat/completions \ - -X POST \ - -d '{"query":"What is Deep Learning?","max_tokens":17,"top_p":1,"temperature":0.7,"frequency_penalty":0,"presence_penalty":0, "streaming":false}' \ - -H 'Content-Type: application/json' diff --git a/tests/llms/test_llms_text-generation_vllm_langchain_on_intel_hpu.sh b/tests/llms/test_llms_text-generation_vllm_langchain_on_intel_hpu.sh index 6b8e468f8..c83799128 100644 --- a/tests/llms/test_llms_text-generation_vllm_langchain_on_intel_hpu.sh +++ b/tests/llms/test_llms_text-generation_vllm_langchain_on_intel_hpu.sh @@ -44,6 +44,7 @@ function start_service() { -p $port_number:80 \ -e HABANA_VISIBLE_DEVICES=all \ -e OMPI_MCA_btl_vader_single_copy_mechanism=none \ + -e VLLM_SKIP_WARMUP=true \ --cap-add=sys_nice \ --ipc=host \ -e HF_TOKEN=${HUGGINGFACEHUB_API_TOKEN} \ @@ -62,7 +63,7 @@ function start_service() { # check whether vllm ray is fully ready n=0 - until [[ "$n" -ge 160 ]] || [[ $ready == true ]]; do + until [[ "$n" -ge 70 ]] || [[ $ready == true ]]; do docker logs test-comps-vllm-service > ${WORKPATH}/tests/test-comps-vllm-service.log n=$((n+1)) if grep -q throughput ${WORKPATH}/tests/test-comps-vllm-service.log; then @@ -90,9 +91,23 @@ function validate_microservice() { docker logs test-comps-vllm-microservice exit 1 fi + + result=$(http_proxy="" curl http://${ip_address}:5030/v1/chat/completions \ + -X POST \ + -d '{"model": "Intel/neural-chat-7b-v3-3", "messages": [{"role": "user", "content": "What is Deep Learning?"}], "max_tokens":17, "stream":false}' \ + -H 'Content-Type: application/json') + if [[ $result == *"content"* ]]; then + echo "Result correct." + else + echo "Result wrong. Received was $result" + docker logs test-comps-vllm-service + docker logs test-comps-vllm-microservice + exit 1 + fi + result=$(http_proxy="" curl http://${ip_address}:5030/v1/chat/completions \ -X POST \ - -d '{"query":"What is Deep Learning?","max_tokens":17,"top_p":1,"temperature":0.7,"frequency_penalty":0,"presence_penalty":0, "streaming":false}' \ + -d '{"model": "Intel/neural-chat-7b-v3-3", "messages": "What is Deep Learning?", "max_tokens":17, "stream":false}' \ -H 'Content-Type: application/json') if [[ $result == *"text"* ]]; then echo "Result correct."