Serving LLMs (#22)

truskovskiyk authored Sep 18, 2024
1 parent 89daf7b commit 78636c6
Showing 9 changed files with 260 additions and 18 deletions.
3 changes: 2 additions & 1 deletion .gitignore
@@ -162,4 +162,5 @@ taskfile.yml
TODO.md
README.p.md
.DS_Store
-README.private.md
+README.private.md
+vllm
5 changes: 5 additions & 0 deletions module-3/README.md
@@ -90,3 +90,8 @@ python generative-api/pipeline_phi3.py ./data/test.json

- https://github.com/microsoft/nni
- https://github.com/autogluon/autogluon


## Updated design doc

[Google doc](https://docs.google.com/document/d/1vkjE5QohSkxkcWCWahciqR43K4RjCjXMpixx3hoYjXo/edit?usp=sharing)
2 changes: 1 addition & 1 deletion module-3/generative-example/run_training_job.py
@@ -8,7 +8,7 @@
"WANDB_PROJECT": os.getenv("WANDB_PROJECT"),
"WANDB_API_KEY": os.getenv("WANDB_API_KEY"),
}
-custom_image = Image.from_registry("ghcr.io/kyryl-opens-ml/generative-example:pr-11").env(env)
+custom_image = Image.from_registry("ghcr.io/kyryl-opens-ml/generative-example:main").env(env)


@app.function(image=custom_image, gpu="A100", timeout=10 * 60 * 60)
7 changes: 6 additions & 1 deletion module-4/README.md
@@ -157,4 +157,9 @@ dagster dev -f dagster_pipelines/text2sql_pipeline.py -p 3000 -h 0.0.0.0
### References:

- [Introducing Asset Checks](https://dagster.io/blog/dagster-asset-checks)
-- [Anomaly Detection](https://dagster.io/glossary/anomaly-detection)
+- [Anomaly Detection](https://dagster.io/glossary/anomaly-detection)


## Updated design doc

[Google doc](https://docs.google.com/document/d/1j9-RFCrLRQy54TsywHxvje56EuntAbUbSlw_POsWl5Q/edit?usp=sharing)
78 changes: 65 additions & 13 deletions module-5/README.md
@@ -13,7 +13,6 @@
***



# Setup

Create kind cluster
@@ -50,7 +49,7 @@ Deploy k8s:

```
kubectl create -f k8s/app-streamlit.yaml
-kubectl port-forward --address 0.0.0.0 svc/app-streamlit 8081:8080
+kubectl port-forward --address 0.0.0.0 svc/app-streamlit 8080:8080
```

# Fast API
@@ -65,7 +64,7 @@ Deploy k8s:

```
kubectl create -f k8s/app-fastapi.yaml
-kubectl port-forward --address 0.0.0.0 svc/app-fastapi 8081:8080
+kubectl port-forward --address 0.0.0.0 svc/app-fastapi 8080:8080
```
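
With the port-forward running, you can sanity-check the API from another terminal. A minimal probe, assuming FastAPI's default interactive docs route `/docs` is enabled:

```
curl -s http://localhost:8080/docs
```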


@@ -86,16 +85,6 @@ make run_pytriton
```


-# LLMs
-
-- https://github.com/vllm-project/vllm
-- https://github.com/huggingface/text-generation-inference
-- https://github.com/predibase/lorax
-- https://github.com/triton-inference-server/vllm_backend
-- https://github.com/ray-project/ray-llm

# KServe

Install KServe
@@ -115,3 +104,66 @@ Call API
```
curl -v -H "Host: custom-model.default.example.com" -H "Content-Type: application/json" "http://localhost:8080/v1/models/custom-model:predict" -d @data-samples/kserve-input.json
```
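
Before sending predictions, you can also confirm the model is ready. A sketch using the standard KServe V1 readiness route, with the same host header and model name as above:

```
curl -H "Host: custom-model.default.example.com" "http://localhost:8080/v1/models/custom-model"
```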


# Serving LLMs via vLLM


Run the server:

```
mkdir -p vllm-storage
export VLLM_ALLOW_RUNTIME_LORA_UPDATING=True
vllm serve microsoft/Phi-3-mini-4k-instruct --dtype auto --max-model-len 512 --enable-lora --gpu-memory-utilization 0.8 --download-dir ./vllm-storage
```
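
Before running the client, verify the server came up. vLLM's OpenAI-compatible server exposes a health probe and a model list, for example:

```
curl http://localhost:8000/health
curl http://localhost:8000/v1/models
```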


Run the client

Get the list of models:

```
python serving-llm/client.py list-of-models
```


Add a custom LoRA adapter:

```
python serving-llm/client.py load-from-registry truskovskiyk/ml-in-production-practice/modal_generative_example:latest sql-default-model
python serving-llm/client.py load-adapter sql-default-model ./sql-default-model
python serving-llm/client.py list-of-models
```
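
Because `VLLM_ALLOW_RUNTIME_LORA_UPDATING` is set, adapters can also be detached at runtime with the client's matching `unload-adapter` command:

```
python serving-llm/client.py unload-adapter sql-default-model
```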


Test the client:

```
python serving-llm/client.py test-client microsoft/Phi-3-mini-4k-instruct
python serving-llm/client.py test-client sql-default-model
```
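
The helper script just wraps the OpenAI chat completions API, so the same request can be made directly. A minimal sketch with curl and a placeholder prompt:

```
curl http://localhost:8000/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{"model": "sql-default-model", "messages": [{"role": "user", "content": "Write a SQL query that counts rows in the salesperson table."}]}'
```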


Deploy

Run Kubernetes with GPU support:

```
curl -LO https://storage.googleapis.com/minikube/releases/latest/minikube_latest_amd64.deb
sudo dpkg -i minikube_latest_amd64.deb
minikube start --driver docker --container-runtime docker --gpus all
```
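
Note that `--gpus all` assumes the host already has NVIDIA drivers and the NVIDIA Container Toolkit configured for Docker. Once the cluster is up, confirm the GPU is visible to Kubernetes before deploying:

```
kubectl describe nodes | grep nvidia.com/gpu
```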

Create the deployment:

```
kubectl create -f ./k8s/vllm-inference.yaml
kubectl port-forward --address 0.0.0.0 svc/app-vllm 8000:8000
kubectl logs <POD> -c model-loader
kubectl logs <POD> -c app-vllm
```
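
The `<POD>` placeholder can be resolved via the deployment's label selector:

```
kubectl get pods -l app=app-vllm
```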


## Updated design doc

[Google doc](https://docs.google.com/document/d/1ZCnnsnHHiDkc3FgK2XBVur9W7nkDA7SKoPd1pGa-irQ/edit?usp=sharing)
117 changes: 117 additions & 0 deletions module-5/k8s/vllm-inference.yaml
@@ -0,0 +1,117 @@
---
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: vllm-storage-pvc
spec:
  accessModes:
    - ReadWriteOnce
  resources:
    requests:
      storage: 50Gi
  storageClassName: standard
---
apiVersion: apps/v1
kind: Deployment
metadata:
  name: app-vllm
spec:
  replicas: 1
  selector:
    matchLabels:
      app: app-vllm
  template:
    metadata:
      labels:
        app: app-vllm
    spec:
      containers:
        - name: app-vllm
          image: vllm/vllm-openai:latest
          env:
            - name: VLLM_ALLOW_RUNTIME_LORA_UPDATING
              value: "True"
          command: ["vllm"]
          args:
            - "serve"
            - "microsoft/Phi-3-mini-4k-instruct"
            - "--dtype"
            - "auto"
            - "--max-model-len"
            - "512"
            - "--enable-lora"
            - "--gpu-memory-utilization"
            - "0.8"
            - "--download-dir"
            - "/vllm-storage"
          resources:
            limits:
              nvidia.com/gpu: 1
            requests:
              nvidia.com/gpu: 1
          volumeMounts:
            - name: vllm-storage
              mountPath: /vllm-storage

        - name: model-loader
          image: ghcr.io/kyryl-opens-ml/app-fastapi:latest
          env:
            - name: WANDB_API_KEY
              valueFrom:
                secretKeyRef:
                  name: wandb
                  key: WANDB_API_KEY
          command: ["/bin/sh", "-c"]
          args:
            - |
              echo "Model Loader: Waiting for vllm server to be available on port 8000..."
              while ! curl -s http://localhost:8000/health >/dev/null; do
                echo "Model Loader: vllm server not available yet. Retrying in 5 seconds..."
                sleep 5
              done
              echo "Model Loader: vllm server is now available. Starting model loading..."
              # Execute the required Python commands
              python serving-llm/client.py load-from-registry truskovskiyk/ml-in-production-practice/modal_generative_example:latest sql-default-model
              if [ $? -ne 0 ]; then
                echo "Model Loader: Failed to load model from registry."
                exit 1
              fi
              python serving-llm/client.py load-adapter sql-default-model ./sql-default-model
              if [ $? -ne 0 ]; then
                echo "Model Loader: Failed to load adapter."
                exit 1
              fi
              echo "Model Loader: Model loading completed successfully."
          volumeMounts:
            - name: vllm-storage
              mountPath: /vllm-storage
          resources:
            limits:
              cpu: "500m"
              memory: "512Mi"
            requests:
              cpu: "250m"
              memory: "256Mi"

      volumes:
        - name: vllm-storage
          persistentVolumeClaim:
            claimName: vllm-storage-pvc
---
apiVersion: v1
kind: Service
metadata:
  name: app-vllm
  labels:
    app: app-vllm
spec:
  type: ClusterIP
  ports:
    - port: 8000
      protocol: TCP
      targetPort: 8000
  selector:
    app: app-vllm
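
The manifest runs the server and a loader as two containers in one pod: app-vllm serves the model, while model-loader polls /health and, once the server is up, pulls the fine-tuned weights from the W&B registry and registers them as a LoRA adapter. Both containers mount the vllm-storage PVC, so the downloaded base model persists across pod restarts.
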
5 changes: 3 additions & 2 deletions module-5/requirements.txt
@@ -4,7 +4,8 @@ streamlit==1.38.0
uvicorn==0.21.1
fastapi==0.109.2
wandb==0.17.9
-kserve
+kserve==0.13.1
+torch==2.4.1
nvidia_pytriton==0.5.10
-ipython
openai==1.46.0
+ipython==8.27.0
Empty file.
61 changes: 61 additions & 0 deletions module-5/serving-llm/client.py
@@ -0,0 +1,61 @@
from pathlib import Path
import wandb
import requests
import json
import typer
from rich import print
from openai import OpenAI

DEFAULT_BASE_URL = "http://localhost:8000/v1"
EXAMPLE_CONTEXT = "CREATE TABLE salesperson (salesperson_id INT, name TEXT, region TEXT); INSERT INTO salesperson (salesperson_id, name, region) VALUES (1, 'John Doe', 'North'), (2, 'Jane Smith', 'South'); CREATE TABLE timber_sales (sales_id INT, salesperson_id INT, volume REAL, sale_date DATE); INSERT INTO timber_sales (sales_id, salesperson_id, volume, sale_date) VALUES (1, 1, 120, '2021-01-01'), (2, 1, 150, '2021-02-01'), (3, 2, 180, '2021-01-01');"
EXAMPLE_QUERY = "What is the total volume of timber sold by each salesperson, sorted by salesperson?"


def load_from_registry(model_name: str, model_path: Path):
    """Download a model artifact from the W&B registry into model_path."""
    with wandb.init() as run:
        artifact = run.use_artifact(model_name, type="model")
        artifact_dir = artifact.download(root=model_path)
        print(f"{artifact_dir}")


def list_of_models(url: str = DEFAULT_BASE_URL):
    """List all models (including loaded LoRA adapters) served by the vLLM server."""
    url = f"{url}/models"
    response = requests.get(url)
    models = response.json()
    print(json.dumps(models, indent=4))


def load_adapter(lora_name: str, lora_path: str, url: str = DEFAULT_BASE_URL):
    """Register a LoRA adapter with a running vLLM server at runtime."""
    url = f"{url}/load_lora_adapter"
    payload = {"lora_name": lora_name, "lora_path": lora_path}
    response = requests.post(url, json=payload)
    print(response)


def unload_adapter(lora_name: str, url: str = DEFAULT_BASE_URL):
    """Remove a previously registered LoRA adapter from the vLLM server."""
    url = f"{url}/unload_lora_adapter"
    payload = {"lora_name": lora_name}
    response = requests.post(url, json=payload)
    result = response.json()
    print(json.dumps(result, indent=4))


def test_client(model: str, context: str = EXAMPLE_CONTEXT, query: str = EXAMPLE_QUERY, url: str = DEFAULT_BASE_URL):
    """Send a text-to-SQL chat completion request through the OpenAI-compatible API."""
    client = OpenAI(base_url=url, api_key="any-api-key")
    messages = [{"content": f"{context}\n Input: {query}", "role": "user"}]
    completion = client.chat.completions.create(model=model, messages=messages)
    print(completion.choices[0].message.content)


def cli():
    app = typer.Typer()
    app.command()(load_from_registry)
    app.command()(list_of_models)
    app.command()(load_adapter)
    app.command()(unload_adapter)
    app.command()(test_client)
    app()


if __name__ == "__main__":
    cli()
