diff --git a/module-5/README.md b/module-5/README.md
index f2311c9..3265733 100644
--- a/module-5/README.md
+++ b/module-5/README.md
@@ -110,18 +110,55 @@ curl -v -H "Host: custom-model.default.example.com" -H "Content-Type: applicatio
 
 # Serving LLMs via vLLM
 
+## Run the server
+
 ```
+mkdir -p vllm-storage
 export VLLM_ALLOW_RUNTIME_LORA_UPDATING=True
-vllm serve microsoft/Phi-3-mini-4k-instruct --dtype auto --max-model-len 512 --enable-lora --gpu-memory-utilization 0.8
+vllm serve microsoft/Phi-3-mini-4k-instruct --dtype auto --max-model-len 512 --enable-lora --gpu-memory-utilization 0.8 --download-dir ./vllm-storage
+```
+
+## Run the client
+
+Get the list of served models:
+
+```
+python ml-in-production-practice/module-5/serving-llm/client.py list-of-models
+```
+
+Add a custom adapter:
+
+```
+python ml-in-production-practice/module-5/serving-llm/client.py load-from-registry truskovskiyk/ml-in-production-practice/modal_generative_example:latest sql-default-model
+python ml-in-production-practice/module-5/serving-llm/client.py load-adapter sql-default-model ./sql-default-model
+python ml-in-production-practice/module-5/serving-llm/client.py list-of-models
+```
 
-vllm serve microsoft/Phi-3-mini-4k-instruct --enable-lora \
-    --lora-modules sql-lora=$HOME/.cache/huggingface/hub/models--yard1--llama-2-7b-sql-lora-test/snapshots/0dfa347e8877a4d4ed19ee56c140fa518470028c/
+Test the client against both the base model and the adapter:
 
- ```
+```
+python ml-in-production-practice/module-5/serving-llm/client.py test-client microsoft/Phi-3-mini-4k-instruct
+python ml-in-production-practice/module-5/serving-llm/client.py test-client sql-default-model
+```
+
+## Deploy
+
+Start a local Kubernetes cluster with GPU support:
+
+```
+minikube start --driver docker --container-runtime docker --gpus all
+```
+
+Create the deployment:
+
+```
+kubectl create -f ml-in-production-practice/module-5/k8s/vllm-inference.yaml
+```
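+
+Smoke-test the deployment. A minimal sketch, assuming the `app-vllm` Service has been port-forwarded locally (e.g. `kubectl port-forward svc/app-vllm 8000:8080`) and is serving `microsoft/Phi-3-mini-4k-instruct`:
+
+```
+# Minimal check against the OpenAI-compatible vLLM endpoint.
+# Assumes the Service above is port-forwarded to localhost:8000.
+from openai import OpenAI
+
+# vLLM ignores the API key unless the server is started with --api-key
+client = OpenAI(base_url="http://localhost:8000/v1", api_key="any-api-key")
+completion = client.chat.completions.create(
+    model="microsoft/Phi-3-mini-4k-instruct",
+    messages=[{"role": "user", "content": "Say hello in one short sentence."}],
+    max_tokens=32,
+)
+print(completion.choices[0].message.content)
+```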
+
 
 ## Updated design doc
 
diff --git a/module-5/k8s/vllm-inference.yaml b/module-5/k8s/vllm-inference.yaml
new file mode 100644
index 0000000..81e2d4b
--- /dev/null
+++ b/module-5/k8s/vllm-inference.yaml
@@ -0,0 +1,40 @@
+---
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: app-vllm
+spec:
+  replicas: 2
+  selector:
+    matchLabels:
+      app: app-vllm
+  template:
+    metadata:
+      labels:
+        app: app-vllm
+    spec:
+      containers:
+      - name: app-vllm
+        image: vllm/vllm-openai:latest
+        # Serve the same model and flags as the local setup in the README
+        args: ["--model", "microsoft/Phi-3-mini-4k-instruct", "--max-model-len", "512", "--enable-lora", "--gpu-memory-utilization", "0.8"]
+        ports:
+        - containerPort: 8000  # vLLM's OpenAI-compatible server listens on 8000 by default
+        env:
+        - name: VLLM_ALLOW_RUNTIME_LORA_UPDATING
+          value: "True"
+        - name: WANDB_API_KEY
+          valueFrom:
+            secretKeyRef:
+              name: wandb
+              key: WANDB_API_KEY
+        resources:
+          limits:
+            nvidia.com/gpu: 1
+---
+apiVersion: v1
+kind: Service
+metadata:
+  name: app-vllm
+  labels:
+    app: app-vllm
+spec:
+  ports:
+  - port: 8080
+    targetPort: 8000
+    protocol: TCP
+  selector:
+    app: app-vllm
diff --git a/module-5/serving-llm/client.py b/module-5/serving-llm/client.py
index 5a0faf1..ae17715 100644
--- a/module-5/serving-llm/client.py
+++ b/module-5/serving-llm/client.py
@@ -2,9 +2,13 @@
 import wandb
 import requests
 import json
+import typer
+from rich import print
+from openai import OpenAI
 
-
-BASE_URL = "http://localhost:8000/v1"
+DEFAULT_BASE_URL = "http://localhost:8000/v1"
+EXAMPLE_CONTEXT = "CREATE TABLE salesperson (salesperson_id INT, name TEXT, region TEXT); INSERT INTO salesperson (salesperson_id, name, region) VALUES (1, 'John Doe', 'North'), (2, 'Jane Smith', 'South'); CREATE TABLE timber_sales (sales_id INT, salesperson_id INT, volume REAL, sale_date DATE); INSERT INTO timber_sales (sales_id, salesperson_id, volume, sale_date) VALUES (1, 1, 120, '2021-01-01'), (2, 1, 150, '2021-02-01'), (3, 2, 180, '2021-01-01');"
+EXAMPLE_QUERY = "What is the total volume of timber sold by each salesperson, sorted by salesperson?"
 
 
 def load_from_registry(model_name: str, model_path: Path):
     with wandb.init() as run:
@@ -13,18 +17,14 @@ def load_from_registry(model_name: str, model_path: Path):
     print(f"{artifact_dir}")
 
 
-def list_of_models():
-    url = f"{BASE_URL}/models"
+def list_of_models(url: str = DEFAULT_BASE_URL):
+    url = f"{url}/models"
     response = requests.get(url)
     models = response.json()
     print(json.dumps(models, indent=4))
 
 
-def load_adapter(lora_name: str, lora_path: str):
-
-    lora_name = "sql-test"
-    lora_path = "data/sql-adapter/"
-
-    url = f"{BASE_URL}/load_lora_adapter"
+def load_adapter(lora_name: str, lora_path: str, url: str = DEFAULT_BASE_URL):
+    url = f"{url}/load_lora_adapter"
     payload = {
         "lora_name": lora_name,
         "lora_path": lora_path
@@ -32,44 +32,20 @@ def load_adapter(lora_name: str, lora_path: str):
     response = requests.post(url, json=payload)
     print(response)
 
-def unload_adapter(lora_name: str):
-    url = f"{BASE_URL}/unload_lora_adapter"
+def unload_adapter(lora_name: str, url: str = DEFAULT_BASE_URL):
+    url = f"{url}/unload_lora_adapter"
     payload = {
         "lora_name": lora_name
     }
-    headers = {"Content-Type": "application/json"}
-    response = requests.post(url, headers=headers, json=payload)
+    response = requests.post(url, json=payload)
     result = response.json()
     print(json.dumps(result, indent=4))
 
-def test_client(model: str, prompt: str, max_tokens: int = 7, temperature: float = 0.0):
-    prompt = "test"
-    max_tokens: int = 7
-    temperature: float = 0.0
-    # model = "microsoft/Phi-3-mini-4k-instruct"
-    model = "sql-test"
-    url = f"{BASE_URL}/completions"
-    payload = {
-        "model": model,
-        "prompt": prompt,
-        "max_tokens": max_tokens,
-        "temperature": temperature
-    }
-    response = requests.post(url, json=payload)
-    completion = response.json()
-    print(json.dumps(completion, indent=4))
-
-def run_inference_on_json(json_file: Path):
-    url = f"{BASE_URL}/completions"
-    with open(json_file, 'r') as f:
-        payload = json.load(f)
-    headers = {"Content-Type": "application/json"}
-    response = requests.post(url, headers=headers, json=payload)
-    completion = response.json()
-    print(json.dumps(completion, indent=4))
-
-
-
+def test_client(model: str, context: str = EXAMPLE_CONTEXT, query: str = EXAMPLE_QUERY, url: str = DEFAULT_BASE_URL):
+    # vLLM ignores the API key unless the server is started with --api-key
+    client = OpenAI(base_url=url, api_key="any-api-key")
+    messages = [{"content": f"{context}\n Input: {query}", "role": "user"}]
+    completion = client.chat.completions.create(model=model, messages=messages)
+    print(completion.choices[0].message.content)
 
 def cli():
     app = typer.Typer()
@@ -78,9 +54,6 @@ def cli():
     app.command()(load_adapter)
     app.command()(unload_adapter)
     app.command()(test_client)
-    app.command()(upload_to_registry)
-    app.command()(run_inference_on_json)
-    app.command()(run_evaluate_on_json)
     app()
 
 if __name__ == "__main__":
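
For reference, the new `test_client` command is equivalent to this raw call against vLLM's OpenAI-compatible REST API (a sketch, assuming the server from the README is running on localhost:8000):

```
# Hypothetical standalone equivalent of test_client, using plain requests
# against the /v1/chat/completions route of a local vLLM server.
import json

import requests

payload = {
    "model": "sql-default-model",  # an adapter loaded via load-adapter, or the base model name
    "messages": [{"role": "user", "content": "What is the total volume of timber sold by each salesperson, sorted by salesperson?"}],
}
response = requests.post("http://localhost:8000/v1/chat/completions", json=payload)
print(json.dumps(response.json(), indent=4))
```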