feat: accept list as prompt and use first string (huggingface#1702)
This PR allows `CompletionRequest.prompt` to be sent as a string or an
array of strings. When an array is sent, the first value is used if it
is a string; otherwise an appropriate error is thrown.

Fixes: huggingface#1690
Similar to: https://github.com/vllm-project/vllm/pull/323/files
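As a rough usage sketch (the base URL, port, and `model` value here are assumptions, not taken from this diff; the `/v1/completions` route follows the OpenAI-compatible convention), the new request shape can be exercised like this:

```python
import requests

# Assumed local deployment; adjust the base URL to your setup.
BASE_URL = "http://localhost:3000"

# A plain string prompt works as before.
resp = requests.post(
    f"{BASE_URL}/v1/completions",
    json={"model": "tgi", "prompt": "Say this is a test", "max_tokens": 10},
)
print(resp.json()["choices"][0]["text"])

# An array of strings is now accepted as well. Judging by the snapshot
# later in this diff, each entry yields its own choice, distinguished
# by its "index" field in the response.
resp = requests.post(
    f"{BASE_URL}/v1/completions",
    json={
        "model": "tgi",
        "prompt": ["What color is the sky?", "Is water wet?"],
        "max_tokens": 10,
    },
)
for choice in resp.json()["choices"]:
    print(choice["index"], choice["text"])
```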
drbh authored and kdamaszk committed Jun 3, 2024
1 parent fea0f2f commit 9189169
Showing 11 changed files with 1,188 additions and 107 deletions.
21 changes: 21 additions & 0 deletions clients/python/text_generation/types.py
@@ -59,6 +59,17 @@ class ChatCompletionComplete(BaseModel):
    usage: Optional[Any] = None


class CompletionComplete(BaseModel):
    # Index of the completion choice
    index: int
    # Generated text for this choice
    text: str
    # Log probabilities for the generated tokens
    logprobs: Optional[Any]
    # Reason the generation finished
    finish_reason: str


class Function(BaseModel):
    name: Optional[str]
    arguments: str

@@ -104,6 +115,16 @@ class ChatComplete(BaseModel):
    usage: Any


class Completion(BaseModel):
    # Unique identifier of the completion
    id: str
    # Object type, "text_completion" for this endpoint
    object: str
    # Unix timestamp of when the completion was created
    created: int
    # Model that produced the completion
    model: str
    # Fingerprint of the serving configuration
    system_fingerprint: str
    # One entry per generated completion
    choices: List[CompletionComplete]


class ChatRequest(BaseModel):
    # Model identifier
    model: str
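To illustrate the new models, here is a minimal sketch that validates a payload shaped like the snapshot added later in this diff (the single choice shown is lifted from that snapshot):

```python
from text_generation.types import Completion

# Field values taken from the snapshot file added later in this diff.
payload = {
    "id": "",
    "object": "text_completion",
    "created": 1713284455,
    "model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
    "system_fingerprint": "2.0.0-native",
    "choices": [
        {
            "index": 0,
            "text": " PR for more information?",
            "logprobs": None,
            "finish_reason": "eos_token",
        }
    ],
}

# Pydantic coerces the nested choice dicts into CompletionComplete models.
completion = Completion(**payload)
print(completion.choices[0].finish_reason)  # "eos_token"
```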
9 changes: 9 additions & 0 deletions docs/source/basic_tutorials/launcher.md
@@ -398,6 +398,15 @@ Options:
-e, --env
Display a lot of information about your runtime environment
```
## MAX_CLIENT_BATCH_SIZE
```shell
--max-client-batch-size <MAX_CLIENT_BATCH_SIZE>
Control the maximum number of inputs that a client can send in a single request
[env: MAX_CLIENT_BATCH_SIZE=]
[default: 4]
```
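A sketch of probing this limit from a client, assuming a local deployment at the default port and the OpenAI-style `/v1/completions` route; the exact rejection status code and message are not shown in this diff:

```python
import requests

BASE_URL = "http://localhost:3000"  # assumed local deployment

# Five prompts against the default --max-client-batch-size of 4 should
# be rejected rather than silently truncated; the precise error shape
# depends on the server's validation error format.
resp = requests.post(
    f"{BASE_URL}/v1/completions",
    json={"model": "tgi", "prompt": ["a", "b", "c", "d", "e"], "max_tokens": 5},
)
if not resp.ok:
    print(resp.status_code, resp.text)
```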
## HELP
```shell
27 changes: 21 additions & 6 deletions integration-tests/conftest.py
@@ -9,6 +9,7 @@
import math
import time
import random
import re

from docker.errors import NotFound
from typing import Optional, List, Dict
@@ -26,6 +27,7 @@
    ChatComplete,
    ChatCompletionChunk,
    ChatCompletionComplete,
    Completion,
)

DOCKER_IMAGE = os.getenv("DOCKER_IMAGE", None)
@@ -69,17 +71,22 @@ def convert_data(data):
    data = json.loads(data)
    if isinstance(data, Dict) and "choices" in data:
        choices = data["choices"]
        if (
            isinstance(choices, List)
            and len(choices) >= 1
            and "delta" in choices[0]
        ):
            return ChatCompletionChunk(**data)
        if isinstance(choices, List) and len(choices) >= 1:
            if "delta" in choices[0]:
                return ChatCompletionChunk(**data)
            if "text" in choices[0]:
                return Completion(**data)
        return ChatComplete(**data)

    if isinstance(data, Dict):
        return Response(**data)
    if isinstance(data, List):
        if (
            len(data) > 0
            and "object" in data[0]
            and data[0]["object"] == "text_completion"
        ):
            return [Completion(**d) for d in data]
        return [Response(**d) for d in data]
    raise NotImplementedError
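The dispatch above keys off which field the first choice carries: `delta` marks a streaming chat chunk, `text` a plain completion, and anything else falls through to `ChatComplete`. A self-contained sketch of that decision order, with invented sample payloads:

```python
# Each sample mimics the discriminating key convert_data looks for.
samples = {
    "ChatCompletionChunk": {"choices": [{"delta": {"content": "Hi"}}]},
    "Completion": {"choices": [{"text": " PR for more information?"}]},
    "ChatComplete": {"choices": [{"message": {"content": "Hello"}}]},
}

def classify(data: dict) -> str:
    choices = data.get("choices", [])
    if choices and "delta" in choices[0]:
        return "ChatCompletionChunk"  # streaming chat chunk
    if choices and "text" in choices[0]:
        return "Completion"           # plain text completion
    return "ChatComplete"             # full chat completion

for expected, payload in samples.items():
    assert classify(payload) == expected
```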

@@ -161,6 +168,9 @@ def eq_details(details: Details, other: Details) -> bool:
        )
    )

def eq_completion(response: Completion, other: Completion) -> bool:
    return response.choices[0].text == other.choices[0].text

def eq_chat_complete(response: ChatComplete, other: ChatComplete) -> bool:
    return (
        response.choices[0].message.content == other.choices[0].message.content
@@ -184,6 +194,11 @@ def eq_response(response: Response, other: Response) -> bool:
    if not isinstance(snapshot_data, List):
        snapshot_data = [snapshot_data]

    if isinstance(serialized_data[0], Completion):
        return len(snapshot_data) == len(serialized_data) and all(
            [eq_completion(r, o) for r, o in zip(serialized_data, snapshot_data)]
        )

    if isinstance(serialized_data[0], ChatComplete):
        return len(snapshot_data) == len(serialized_data) and all(
            [eq_chat_complete(r, o) for r, o in zip(serialized_data, snapshot_data)]
@@ -0,0 +1,38 @@
{
  "choices": [
    {
      "finish_reason": "eos_token",
      "index": 1,
      "logprobs": null,
      "text": " PR for more information?"
    },
    {
      "finish_reason": "length",
      "index": 0,
      "logprobs": null,
      "text": "le Business Incubator is providing a workspace"
    },
    {
      "finish_reason": "length",
      "index": 2,
      "logprobs": null,
      "text": " severely flawed and often has a substandard"
    },
    {
      "finish_reason": "length",
      "index": 3,
      "logprobs": null,
      "text": "hd20220811-"
    }
  ],
  "created": 1713284455,
  "id": "",
  "model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
  "object": "text_completion",
  "system_fingerprint": "2.0.0-native",
  "usage": {
    "completion_tokens": 36,
    "prompt_tokens": 8,
    "total_tokens": 44
  }
}