Add example frontend apps and related config options
sd109 committed Dec 12, 2023
1 parent d3083cf commit da7d504
Showing 6 changed files with 135 additions and 58 deletions.
58 changes: 1 addition & 57 deletions templates/ui/app-config-map.yml
@@ -5,60 +5,4 @@ metadata:
  labels:
    {{- include "azimuth-llm.labels" . | nindent 4 }}
data:
  app.py: |
    import requests, json
    import gradio as gr
    from startup import wait_for_backend
    # NOTE: This url should match the chart's api service name & namespace
    backend_url = "http://{{ .Values.api.service.name }}.{{ .Release.Namespace }}.svc"
    wait_for_backend(backend_url)
    def inference(message, history):
        headers = {"User-Agent": "vLLM Client"}
        pload = {
            "prompt": message,
            "stream": True,
            "max_tokens": 128,
        }
        response = requests.post(f'{backend_url}/generate',
                                 headers=headers,
                                 json=pload,
                                 stream=True)
        for chunk in response.iter_lines(chunk_size=8192,
                                         decode_unicode=False,
                                         delimiter=b"\0"):
            if chunk:
                data = json.loads(chunk.decode("utf-8"))
                output = data["text"][0]
                yield output
    gr.ChatInterface(
        inference,
        chatbot=gr.Chatbot(
            height=500,
            show_copy_button=True,
            # layout='panel',
        ),
        textbox=gr.Textbox(placeholder="Ask me anything...", container=False, scale=7),
        title="Large Language Model",
        retry_btn="Retry",
        undo_btn="Undo",
        clear_btn="Clear",
    ).queue().launch(server_name="0.0.0.0")
  startup.py: |
    import requests, time
    def wait_for_backend(url):
        ready = False
        while not ready:
            try:
                ready = (requests.get(f'{url}/docs').status_code == 200)
                print('Waiting for backend API to start')
                time.sleep(5)
            except requests.exceptions.ConnectionError as e:
                pass
        return
{{ (.Files.Glob "web-app-utils/*").AsConfig | nindent 2 }}
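The inline app.py and startup.py scripts are replaced by a Helm file glob, so every file under web-app-utils/ in the chart is shipped in this ConfigMap. A rough sketch of the rendered data section, assuming the three files added in this commit are the only ones under web-app-utils/ (file contents elided):

data:
  api_startup_check.py: |
    import requests, time
    ...
  example_app_playful.py: |
    import requests, json
    ...
  example_app_vanilla.py: |
    import requests, json
    ...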
2 changes: 1 addition & 1 deletion templates/ui/deployment.yml
@@ -28,7 +28,7 @@ spec:
        command:
          - python
        args:
          - /etc/web-app/app.py
          - {{ printf "/etc/web-app/%s" .Values.ui.entrypoint }}
        env:
          - name: PYTHONUNBUFFERED
            value: "1"
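The hard-coded app.py path becomes a templated entrypoint. With the chart's new default value (example_app_vanilla.py, see values.yaml below), the args block would render to roughly:

        args:
          - /etc/web-app/example_app_vanilla.py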
2 changes: 2 additions & 0 deletions values.yaml
@@ -66,6 +66,8 @@ api:

# Configuration for the frontend web interface
ui:
  # The file from the UI config map to execute as the frontend app
  entrypoint: example_app_vanilla.py
  # Container image config
  image:
    repository: ghcr.io/stackhpc/azimuth-llm-ui-base
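Switching frontends is then just a values override; a minimal sketch, assuming a user-supplied overrides file (the file name is illustrative):

# overrides.yml (hypothetical)
ui:
  entrypoint: example_app_playful.py

Any file placed under web-app-utils/ is picked up by the ConfigMap glob above, so it can be referenced here without further template changes.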
17 changes: 17 additions & 0 deletions web-app-utils/api_startup_check.py
@@ -0,0 +1,17 @@
import requests, time


def wait_for_backend(url):
    """
    This function acts as a startup check so that the frontend web app does not
    accept requests until the backend API is up and running.
    """
    ready = False
    while not ready:
        try:
            ready = requests.get(f"{url}/docs").status_code == 200
            print("Waiting for backend API to start")
            time.sleep(1)
        except requests.exceptions.ConnectionError as e:
            pass
    return
57 changes: 57 additions & 0 deletions web-app-utils/example_app_playful.py
@@ -0,0 +1,57 @@
import requests, json
import gradio as gr
from api_startup_check import wait_for_backend

# NOTE: This url should match the chart's api service name & namespace
backend_url = "http://text-generation-inference.default.svc"
wait_for_backend(backend_url)

prompt = """
[INST] <<SYS>>
You are a cheeky, disrespectful and comedic assistant. Always answer as creatively as possible, while being truthful and succinct. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature. If a question does not make any sense, or is not factually coherent, tell the user that they are being stupid. If you don't know the answer to a question, please don't share false information.
<</SYS>>
{prompt}[/INST]
"""


def inference(message, history):
    context = ""
    user_template = "<<USER>>\n{user_input}\n<</USER>>\n"
    sys_template = "<<SYS>>{response}\n<</SYS>>\n"
    for user_input, response in history:
        # context += user_template.format(user_input=user_input)
        context += sys_template.format(response=response)
    context += user_template.format(user_input=message)

    headers = {"User-Agent": "vLLM Client"}
    pload = {
        "prompt": prompt.format(prompt=context),
        "stream": True,
        "max_tokens": 1000,
    }
    response = requests.post(
        f"{backend_url}/generate", headers=headers, json=pload, stream=True
    )

    for chunk in response.iter_lines(
        chunk_size=8192, decode_unicode=False, delimiter=b"\0"
    ):
        if chunk:
            data = json.loads(chunk.decode("utf-8"))
            output = data["text"][0].split("[/INST]")[-1]
            yield output


gr.ChatInterface(
    inference,
    chatbot=gr.Chatbot(
        height=500,
        show_copy_button=True,
        # layout='panel',
    ),
    textbox=gr.Textbox(placeholder="Ask me anything...", container=False, scale=7),
    title="Large Language Model",
    retry_btn="Retry",
    undo_btn="Undo",
    clear_btn="Clear",
).queue().launch(server_name="0.0.0.0")
57 changes: 57 additions & 0 deletions web-app-utils/example_app_vanilla.py
@@ -0,0 +1,57 @@
import requests, json
import gradio as gr
from api_startup_check import wait_for_backend

# NOTE: This url should match the chart's api service name & namespace
backend_url = "http://text-generation-inference.default.svc"
wait_for_backend(backend_url)

prompt = """
[INST] <<SYS>>
You are a cheeky, disrespectful and comedic assistant. Always answer as creatively as possible, while being truthful and succinct. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature. If a question does not make any sense, or is not factually coherent, tell the user that they are being stupid. If you don't know the answer to a question, please don't share false information.
<</SYS>>
{prompt}[/INST]
"""


def inference(message, history):
    context = ""
    user_template = "<<USER>>\n{user_input}\n<</USER>>\n"
    sys_template = "<<SYS>>{response}\n<</SYS>>\n"
    for user_input, response in history:
        # context += user_template.format(user_input=user_input)
        context += sys_template.format(response=response)
    context += user_template.format(user_input=message)

    headers = {"User-Agent": "vLLM Client"}
    pload = {
        "prompt": prompt.format(prompt=context),
        "stream": True,
        "max_tokens": 1000,
    }
    response = requests.post(
        f"{backend_url}/generate", headers=headers, json=pload, stream=True
    )

    for chunk in response.iter_lines(
        chunk_size=8192, decode_unicode=False, delimiter=b"\0"
    ):
        if chunk:
            data = json.loads(chunk.decode("utf-8"))
            output = data["text"][0].split("[/INST]")[-1]
            yield output


gr.ChatInterface(
    inference,
    chatbot=gr.Chatbot(
        height=500,
        show_copy_button=True,
        # layout='panel',
    ),
    textbox=gr.Textbox(placeholder="Ask me anything...", container=False, scale=7),
    title="Large Language Model",
    retry_btn="Retry",
    undo_btn="Undo",
    clear_btn="Clear",
).queue().launch(server_name="0.0.0.0")
