vllm comps support openai API ChatCompletionRequest (opea-project#1032)

* vllm support openai API

Signed-off-by: Xinyao Wang <[email protected]>

* fix bug

Signed-off-by: Xinyao Wang <[email protected]>

* fix bug

Signed-off-by: Xinyao Wang <[email protected]>

* test_llms_text-generation_vllm_langchain_on_intel_hpu.sh

Signed-off-by: Xinyao Wang <[email protected]>

* fix time

Signed-off-by: Xinyao Wang <[email protected]>

* fix bug

Signed-off-by: Xinyao Wang <[email protected]>

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* fix bug

Signed-off-by: Xinyao Wang <[email protected]>

---------

Signed-off-by: Xinyao Wang <[email protected]>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
2 people authored and jjmaturino committed Dec 16, 2024
1 parent da99090 commit 5b42ca8
Showing 4 changed files with 121 additions and 41 deletions.
30 changes: 11 additions & 19 deletions comps/llms/text-generation/vllm/langchain/README.md
@@ -223,29 +223,21 @@ User can set the following model parameters according to needs:
- streaming(true/false): return text response in streaming mode or non-streaming mode

```bash
-# 1. Non-streaming mode
+# Stream mode
 curl http://${your_ip}:9000/v1/chat/completions \
-    -X POST \
-    -d '{"query":"What is Deep Learning?","max_tokens":17,"top_p":1,"temperature":0.7,"frequency_penalty":0,"presence_penalty":0, "streaming":false}' \
-    -H 'Content-Type: application/json'
+    -X POST \
+    -d '{"model": "${model_name}", "messages": "What is Deep Learning?", "max_tokens":17}' \
+    -H 'Content-Type: application/json'

-# 2. Streaming mode
 curl http://${your_ip}:9000/v1/chat/completions \
-    -X POST \
-    -d '{"query":"What is Deep Learning?","max_tokens":17,"top_k":10,"top_p":0.95,"typical_p":0.95,"temperature":0.01,"repetition_penalty":1.03,"streaming":true}' \
-    -H 'Content-Type: application/json'
+    -X POST \
+    -d '{"model": "${model_name}", "messages": [{"role": "user", "content": "What is Deep Learning?"}], "max_tokens":17}' \
+    -H 'Content-Type: application/json'

-# 3. Custom chat template with streaming mode
+# Non-stream mode
 curl http://${your_ip}:9000/v1/chat/completions \
-    -X POST \
-    -d '{"query":"What is Deep Learning?","max_tokens":17,"top_k":10,"top_p":0.95,"typical_p":0.95,"temperature":0.01,"repetition_penalty":1.03,"streaming":true, "chat_template":"### You are a helpful, respectful and honest assistant to help the user with questions.\n### Context: {context}\n### Question: {question}\n### Answer:"}' \
-    -H 'Content-Type: application/json'
+    -X POST \
+    -d '{"model": "${model_name}", "messages": "What is Deep Learning?", "max_tokens":17, "stream":false}' \
+    -H 'Content-Type: application/json'

+# Chat with SearchedDoc (Retrieval context)
+curl http://${your_ip}:9000/v1/chat/completions \
+    -X POST \
+    -d '{"initial_query":"What is Deep Learning?","retrieved_docs":[{"text":"Deep Learning is a ..."},{"text":"Deep Learning is b ..."}]}' \
+    -H 'Content-Type: application/json'
```

For parameter details, refer to the [LangChain VLLMOpenAI API](https://api.python.langchain.com/en/latest/llms/langchain_community.llms.vllm.VLLMOpenAI.html).
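For readers who prefer Python over curl, the same two call patterns can be sketched with the `requests` library. This is a minimal sketch, not part of the commit: the host, port, model name, and timeout are placeholders taken from the curl examples above.

```python
import requests

base = "http://localhost:9000/v1/chat/completions"  # substitute ${your_ip} for localhost
headers = {"Content-Type": "application/json"}

# Non-stream mode: "messages" may be a plain string or an OpenAI-style message list.
payload = {
    "model": "${model_name}",  # replace with the model actually served by vLLM
    "messages": [{"role": "user", "content": "What is Deep Learning?"}],
    "max_tokens": 17,
    "stream": False,
}
resp = requests.post(base, json=payload, headers=headers, timeout=120)
print(resp.json())

# Stream mode: the service replies with server-sent events ("data: ..." lines).
payload["stream"] = True
with requests.post(base, json=payload, headers=headers, stream=True, timeout=120) as resp:
    for line in resp.iter_lines():
        if line:
            print(line.decode("utf-8"))
```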
93 changes: 93 additions & 0 deletions comps/llms/text-generation/vllm/langchain/llm.py
@@ -7,6 +7,7 @@
from fastapi.responses import StreamingResponse
from langchain_community.llms import VLLMOpenAI
from langchain_core.prompts import PromptTemplate
from openai import OpenAI
from template import ChatTemplate

from comps import (
@@ -194,6 +195,98 @@ async def stream_generator():
                logger.info(response)

            return GeneratedDoc(text=response, prompt=input.query)
    else:
        if logflag:
            logger.info("[ ChatCompletionRequest ] input in opea format")
        client = OpenAI(
            api_key="EMPTY",
            base_url=llm_endpoint + "/v1",
        )

        if isinstance(input.messages, str):
            prompt = input.messages
            if prompt_template:
                if sorted(input_variables) == ["context", "question"]:
                    prompt = prompt_template.format(question=input.messages, context="\n".join(input.documents))
                elif input_variables == ["question"]:
                    prompt = prompt_template.format(question=input.messages)
                else:
                    logger.info(
                        f"[ ChatCompletionRequest ] {prompt_template} not used, we only support 2 input variables ['question', 'context']"
                    )
            else:
                if input.documents:
                    # use rag default template
                    prompt = ChatTemplate.generate_rag_prompt(input.messages, input.documents, input.model)

            chat_completion = client.completions.create(
                model=model_name,
                prompt=prompt,
                echo=input.echo,
                frequency_penalty=input.frequency_penalty,
                max_tokens=input.max_tokens,
                n=input.n,
                presence_penalty=input.presence_penalty,
                seed=input.seed,
                stop=input.stop,
                stream=input.stream,
                suffix=input.suffix,
                temperature=input.temperature,
                top_p=input.top_p,
                user=input.user,
            )
        else:
            if input.messages[0]["role"] == "system":
                if "{context}" in input.messages[0]["content"]:
                    if input.documents is None or input.documents == []:
                        input.messages[0]["content"] = input.messages[0]["content"].format(context="")
                    else:
                        input.messages[0]["content"] = input.messages[0]["content"].format(context="\n".join(input.documents))
            else:
                if prompt_template:
                    system_prompt = prompt_template
                    if input_variables == ["context"]:
                        system_prompt = prompt_template.format(context="\n".join(input.documents))
                    else:
                        logger.info(
                            f"[ ChatCompletionRequest ] {prompt_template} not used, we only support 1 input variable ['context']"
                        )

                    input.messages.insert(0, {"role": "system", "content": system_prompt})

            chat_completion = client.chat.completions.create(
                model=model_name,
                messages=input.messages,
                frequency_penalty=input.frequency_penalty,
                max_tokens=input.max_tokens,
                n=input.n,
                presence_penalty=input.presence_penalty,
                response_format=input.response_format,
                seed=input.seed,
                stop=input.stop,
                stream=input.stream,
                stream_options=input.stream_options,
                temperature=input.temperature,
                top_p=input.top_p,
                user=input.user,
            )

        if input.stream:

            def stream_generator():
                for c in chat_completion:
                    if logflag:
                        logger.info(c)
                    chunk = c.model_dump_json()
                    if chunk not in ["<|im_end|>", "<|endoftext|>"]:
                        yield f"data: {chunk}\n\n"
                yield "data: [DONE]\n\n"

            return StreamingResponse(stream_generator(), media_type="text/event-stream")
        else:
            if logflag:
                logger.info(chat_completion)
            return chat_completion


if __name__ == "__main__":
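The new handler routes a plain-string `messages` through the text-completions endpoint, building a RAG prompt when `documents` are supplied, while an OpenAI-style message list goes to the chat-completions endpoint. A hypothetical request exercising the documents path from Python is sketched below; the field names mirror the handler above, but the host, model, and document text are illustrative values, not part of the commit.

```python
import requests

payload = {
    "model": "Intel/neural-chat-7b-v3-3",  # must match the model served by vLLM
    "messages": "What is Deep Learning?",  # plain string -> routed to client.completions.create
    "documents": ["Deep Learning is a subset of machine learning ..."],  # triggers the default RAG template
    "max_tokens": 32,
    "stream": False,
}
resp = requests.post("http://localhost:9000/v1/chat/completions", json=payload, timeout=120)
print(resp.json())
```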
20 changes: 0 additions & 20 deletions comps/llms/text-generation/vllm/langchain/query.sh

This file was deleted.

19 changes: 17 additions & 2 deletions test_llms_text-generation_vllm_langchain_on_intel_hpu.sh
@@ -44,6 +44,7 @@ function start_service() {
-p $port_number:80 \
-e HABANA_VISIBLE_DEVICES=all \
-e OMPI_MCA_btl_vader_single_copy_mechanism=none \
-e VLLM_SKIP_WARMUP=true \
--cap-add=sys_nice \
--ipc=host \
-e HF_TOKEN=${HUGGINGFACEHUB_API_TOKEN} \
@@ -62,7 +63,7 @@

# check whether vllm ray is fully ready
n=0
until [[ "$n" -ge 160 ]] || [[ $ready == true ]]; do
until [[ "$n" -ge 70 ]] || [[ $ready == true ]]; do
docker logs test-comps-vllm-service > ${WORKPATH}/tests/test-comps-vllm-service.log
n=$((n+1))
if grep -q throughput ${WORKPATH}/tests/test-comps-vllm-service.log; then
@@ -90,9 +91,23 @@ function validate_microservice() {
docker logs test-comps-vllm-microservice
exit 1
fi

result=$(http_proxy="" curl http://${ip_address}:5030/v1/chat/completions \
-X POST \
-d '{"model": "Intel/neural-chat-7b-v3-3", "messages": [{"role": "user", "content": "What is Deep Learning?"}], "max_tokens":17, "stream":false}' \
-H 'Content-Type: application/json')
if [[ $result == *"content"* ]]; then
echo "Result correct."
else
echo "Result wrong. Received was $result"
docker logs test-comps-vllm-service
docker logs test-comps-vllm-microservice
exit 1
fi

result=$(http_proxy="" curl http://${ip_address}:5030/v1/chat/completions \
-X POST \
-d '{"query":"What is Deep Learning?","max_tokens":17,"top_p":1,"temperature":0.7,"frequency_penalty":0,"presence_penalty":0, "streaming":false}' \
-d '{"model": "Intel/neural-chat-7b-v3-3", "messages": "What is Deep Learning?", "max_tokens":17, "stream":false}' \
-H 'Content-Type: application/json')
if [[ $result == *"text"* ]]; then
echo "Result correct."
Expand Down
