Complete the switch to vllm backend
Scott Davidson committed Oct 27, 2023
1 parent fae060a commit ebe84b0
Showing 3 changed files with 34 additions and 26 deletions.
templates/api/deployment.yml: 8 changes (6 additions, 2 deletions)
@@ -21,11 +21,15 @@ spec:
image: {{ printf "%s:%s" .Values.api.image.repository .Values.api.image.version }}
ports:
- name: api
containerPort: 80
containerPort: 8000
volumeMounts:
- name: data
mountPath: /root/.cache/huggingface
command:
- python3.11
args:
- -m
- vllm.entrypoints.api_server
- --model
- {{ .Values.huggingface.model }}
{{- if .Values.huggingface.secretName }}
@@ -41,7 +45,7 @@
{{- end }}
readinessProbe:
tcpSocket:
port: 80
port: 8000
initialDelaySeconds: 15
periodSeconds: 10
resources:
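For context: python3.11 -m vllm.entrypoints.api_server starts vLLM's bundled demo API server, which listens on port 8000 by default (hence the containerPort and readinessProbe changes above) and exposes a POST /generate endpoint. A minimal non-streaming request against it, sketched here under the assumption that the pod has been port-forwarded to localhost:8000 (the prompt text is arbitrary), might look like:

import requests

# Assumed local address; in-cluster, the UI resolves the backend via the
# chart's service name and namespace instead (see app-config-map.yml below).
backend_url = "http://localhost:8000"

payload = {
    "prompt": "What is Kubernetes?",
    "max_tokens": 128,
    "stream": False,
}
response = requests.post(f"{backend_url}/generate", json=payload)
response.raise_for_status()
# The demo server returns the completion(s) as a JSON list under "text".
print(response.json()["text"][0])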
templates/ui/app-config-map.yml: 41 changes (23 additions, 18 deletions)
@@ -6,40 +6,45 @@ metadata:
{{- include "azimuth-llm.labels" . | nindent 4 }}
data:
app.py: |
import huggingface_hub
from huggingface_hub import InferenceClient
import requests, json
import gradio as gr
from startup import wait_for_backend
# NOTE: This url should match the chart's api service name & namespace
backend_url = "http://{{ .Values.api.service.name }}.{{ .Release.Namespace }}.svc"
wait_for_backend(backend_url)
client = InferenceClient(model=backend_url)
def inference(message, history):
if message == "":
yield ""
headers = {"User-Agent": "vLLM Client"}
pload = {
"prompt": message,
"stream": True,
"max_tokens": 128,
}
response = requests.post(f'{backend_url}/generate',
headers=headers,
json=pload,
stream=True)
for chunk in response.iter_lines(chunk_size=8192,
decode_unicode=False,
delimiter=b"\0"):
if chunk:
data = json.loads(chunk.decode("utf-8"))
output = data["text"][0]
yield output
partial_message = ""
try:
for token in client.text_generation(message, max_new_tokens=500, stream=True):
partial_message += token
# Strip text marker from generated output
partial_message = partial_message.replace('<|endoftext|>', '')
yield partial_message
except huggingface_hub.inference._text_generation.ValidationError as e:
raise gr.Error("Context length exceeded. Please clear the chat window.")
gr.ChatInterface(
inference,
chatbot=gr.Chatbot(
height=500,
show_copy_button=True,
# layout='panel',
),
title="Azimuth LLM",
description="This is the demo UI for the Azimuth LLM application.",
textbox=gr.Textbox(placeholder="Ask me anything...", container=False, scale=7),
title="Large Language Model",
retry_btn="Retry",
undo_btn="Undo",
clear_btn="Clear",
@@ -51,7 +56,7 @@ data:
ready = False
while not ready:
try:
ready = (requests.get(f'{url}/health').status_code == 200)
ready = (requests.get(f'{url}/docs').status_code == 200)
print('Waiting for backend API to start')
time.sleep(5)
except requests.exceptions.ConnectionError as e:
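Two details of this hunk are worth noting. The new streaming loop splits the response on b"\0" because vLLM's demo server separates streamed JSON chunks with null bytes rather than newlines. And the readiness check in startup.py now polls /docs, the OpenAPI page FastAPI generates automatically, presumably because the demo server exposed no working /health route for this backend at the time. The except branch is truncated in the diff above; a self-contained sketch of the same polling pattern (a cleaned-up variant with assumed defaults, not taken verbatim from the commit) could read:

import time
import requests

def wait_for_backend(url, poll_seconds=5):
    # Poll until FastAPI's auto-generated docs page answers with HTTP 200.
    ready = False
    while not ready:
        try:
            ready = requests.get(f"{url}/docs").status_code == 200
        except requests.exceptions.ConnectionError:
            # Backend is not accepting connections yet.
            pass
        if not ready:
            print("Waiting for backend API to start")
            time.sleep(poll_seconds)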
values.yaml: 11 changes (5 additions, 6 deletions)
@@ -23,24 +23,23 @@ huggingface:
secretName:
# OR FOR TESTING PURPOSES ONLY, you can instead provide the secret directly
# as a chart value here (if secretName is set above then it will take priority)
token:
token: ""

# Configuration for the backend model serving API
api:
# Container image config
image:
repository: ghcr.io/huggingface/text-generation-inference
# NOTE: versions > 0.9.4 are no longer Apache licensed :(
version: 0.9.4
repository: ghcr.io/stackhpc/azimuth-llm-api-base
version: fae060a
# Service config
service:
name: text-generation-inference
type: ClusterIP
zenith:
enabled: true
enabled: false
skipAuth: false
label: Inference API
iconUrl:
iconUrl:
description: |
The raw inference API endpoints for the deployed LLM.
Public API docs are available [here](https://huggingface.github.io/text-generation-inference/#/Text%20Generation%20Inference)