Complete the switch to vllm backend
Scott Davidson committed Oct 27, 2023
1 parent fae060a commit ebe84b0
Showing 3 changed files with 34 additions and 26 deletions.
templates/api/deployment.yml: 8 changes (6 additions, 2 deletions)
@@ -21,11 +21,15 @@ spec:
image: {{ printf "%s:%s" .Values.api.image.repository .Values.api.image.version }}
ports:
- name: api
containerPort: 80
containerPort: 8000
volumeMounts:
- name: data
mountPath: /root/.cache/huggingface
command:
- python3.11
args:
- -m
- vllm.entrypoints.api_server
- --model
- {{ .Values.huggingface.model }}
{{- if .Values.huggingface.secretName }}
@@ -41,7 +45,7 @@
{{- end }}
readinessProbe:
tcpSocket:
port: 80
port: 8000
initialDelaySeconds: 15
periodSeconds: 10
resources:
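For context: python3.11 -m vllm.entrypoints.api_server starts vLLM's bundled demo API server, which listens on port 8000 by default (hence the containerPort and readinessProbe changes above) and exposes a POST /generate endpoint. A minimal non-streaming request against it, sketched here under the assumption that the pod has been port-forwarded to localhost:8000 (the prompt text is arbitrary), might look like:

import requests

# Assumed local address; in-cluster, the UI resolves the backend via the
# chart's service name and namespace instead (see app-config-map.yml below).
backend_url = "http://localhost:8000"

payload = {
    "prompt": "What is Kubernetes?",
    "max_tokens": 128,
    "stream": False,
}
response = requests.post(f"{backend_url}/generate", json=payload)
response.raise_for_status()
# The demo server returns the completion(s) as a JSON list under "text".
print(response.json()["text"][0])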
templates/ui/app-config-map.yml: 41 changes (23 additions, 18 deletions)
@@ -6,40 +6,45 @@ metadata:
{{- include "azimuth-llm.labels" . | nindent 4 }}
data:
app.py: |
import huggingface_hub
from huggingface_hub import InferenceClient
import requests, json
import gradio as gr
from startup import wait_for_backend
# NOTE: This url should match the chart's api service name & namespace
backend_url = "http://{{ .Values.api.service.name }}.{{ .Release.Namespace }}.svc"
wait_for_backend(backend_url)
client = InferenceClient(model=backend_url)
def inference(message, history):
if message == "":
yield ""
headers = {"User-Agent": "vLLM Client"}
pload = {
"prompt": message,
"stream": True,
"max_tokens": 128,
}
response = requests.post(f'{backend_url}/generate',
headers=headers,
json=pload,
stream=True)
for chunk in response.iter_lines(chunk_size=8192,
decode_unicode=False,
delimiter=b"\0"):
if chunk:
data = json.loads(chunk.decode("utf-8"))
output = data["text"][0]
yield output
partial_message = ""
try:
for token in client.text_generation(message, max_new_tokens=500, stream=True):
partial_message += token
# Strip text marker from generated output
partial_message = partial_message.replace('<|endoftext|>', '')
yield partial_message
except huggingface_hub.inference._text_generation.ValidationError as e:
raise gr.Error("Context length exceeded. Please clear the chat window.")
gr.ChatInterface(
inference,
chatbot=gr.Chatbot(
height=500,
show_copy_button=True,
# layout='panel',
),
title="Azimuth LLM",
description="This is the demo UI for the Azimuth LLM application.",
textbox=gr.Textbox(placeholder="Ask me anything...", container=False, scale=7),
title="Large Language Model",
retry_btn="Retry",
undo_btn="Undo",
clear_btn="Clear",
@@ -51,7 +56,7 @@ data:
ready = False
while not ready:
try:
ready = (requests.get(f'{url}/health').status_code == 200)
ready = (requests.get(f'{url}/docs').status_code == 200)
print('Waiting for backend API to start')
time.sleep(5)
except requests.exceptions.ConnectionError as e:
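Two details of this hunk are worth noting. The new streaming loop splits the response on b"\0" because vLLM's demo server separates streamed JSON chunks with null bytes rather than newlines. And the readiness check in startup.py now polls /docs, the OpenAPI page FastAPI generates automatically, presumably because the demo server exposed no working /health route for this backend at the time. The except branch is truncated in the diff above; a self-contained sketch of the same polling pattern (a cleaned-up variant with assumed defaults, not taken verbatim from the commit) could read:

import time
import requests

def wait_for_backend(url, poll_seconds=5):
    # Poll until FastAPI's auto-generated docs page answers with HTTP 200.
    ready = False
    while not ready:
        try:
            ready = requests.get(f"{url}/docs").status_code == 200
        except requests.exceptions.ConnectionError:
            # Backend is not accepting connections yet.
            pass
        if not ready:
            print("Waiting for backend API to start")
            time.sleep(poll_seconds)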
values.yaml: 11 changes (5 additions, 6 deletions)
@@ -23,24 +23,23 @@ huggingface:
secretName:
# OR FOR TESTING PURPOSES ONLY, you can instead provide the secret directly
# as a chart value here (if secretName is set above then it will take priority)
token:
token: ""

# Configuration for the backend model serving API
api:
# Container image config
image:
repository: ghcr.io/huggingface/text-generation-inference
# NOTE: versions > 0.9.4 are no longer Apache licensed :(
version: 0.9.4
repository: ghcr.io/stackhpc/azimuth-llm-api-base
version: fae060a
# Service config
service:
name: text-generation-inference
type: ClusterIP
zenith:
enabled: true
enabled: false
skipAuth: false
label: Inference API
iconUrl:
iconUrl:
description: |
The raw inference API endpoints for the deployed LLM.
Public API docs are available [here](https://huggingface.github.io/text-generation-inference/#/Text%20Generation%20Inference)