Commit: Serving LLMs

truskovskiyk committed Sep 18, 2024
1 parent 27498e3 commit ce51186
Showing 3 changed files with 99 additions and 49 deletions.
45 changes: 41 additions & 4 deletions module-5/README.md
@@ -110,18 +110,55 @@ curl -v -H "Host: custom-model.default.example.com" -H "Content-Type: applicatio
# Serving LLMs via vLLM


Run the server:

```
mkdir -p vllm-storage
export VLLM_ALLOW_RUNTIME_LORA_UPDATING=True
vllm serve microsoft/Phi-3-mini-4k-instruct --dtype auto --max-model-len 512 --enable-lora --gpu-memory-utilization 0.8 --download-dir ./vllm-storage
```
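
Before moving on to the client, a quick sanity check that the server is up (this assumes vLLM's default port 8000); the OpenAI-compatible API exposes a models endpoint:

```
curl http://localhost:8000/v1/models
```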


Run the client

Get the list of models:

```
python ml-in-production-practice/module-5/serving-llm/client.py list-of-models
```


Add a custom adapter:

```
python ml-in-production-practice/module-5/serving-llm/client.py load-from-registry truskovskiyk/ml-in-production-practice/modal_generative_example:latest sql-default-model
python ml-in-production-practice/module-5/serving-llm/client.py load-adapter sql-default-model ./sql-default-model
python ml-in-production-practice/module-5/serving-llm/client.py list-of-models
```
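
Under the hood, `load-adapter` posts to vLLM's dynamic LoRA endpoint, which is only available because `VLLM_ALLOW_RUNTIME_LORA_UPDATING=True` was set above. A curl equivalent, assuming the adapter was downloaded to `./sql-default-model`:

```
curl -X POST http://localhost:8000/v1/load_lora_adapter \
  -H "Content-Type: application/json" \
  -d '{"lora_name": "sql-default-model", "lora_path": "./sql-default-model"}'
```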


Test the client:

```
python ml-in-production-practice/module-5/serving-llm/client.py test-client microsoft/Phi-3-mini-4k-instruct
python ml-in-production-practice/module-5/serving-llm/client.py test-client sql-default-model
```
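
The same check can be run without the client by calling the OpenAI-compatible chat endpoint directly; a minimal sketch with a toy prompt (vLLM accepts any API key by default):

```
curl http://localhost:8000/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{"model": "sql-default-model", "messages": [{"role": "user", "content": "What is the total volume of timber sold by each salesperson?"}]}'
```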


Deploy

Run Kubernetes with GPU support:

```
minikube start --driver docker --container-runtime docker --gpus all
```
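
Before deploying, it is worth verifying that the GPU is visible inside the cluster; the NVIDIA device plugin should advertise it as an allocatable resource:

```
kubectl describe nodes | grep nvidia.com/gpu
```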

Create the deployment:

```
kubectl create -f ./k8s/vllm-inference.yaml
```
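
The Service is cluster-internal, so for a quick test you can port-forward to it and point the client at the forwarded address. Note that the Service exposes port 8080 while the vLLM server listens on 8000 by default, so this sketch assumes the container is configured to serve on 8080; the `--url` option comes from typer exposing each command's `url` parameter:

```
kubectl port-forward svc/app-vllm 8080:8080
python ml-in-production-practice/module-5/serving-llm/client.py list-of-models --url http://localhost:8080/v1
```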


## Updated design doc

40 changes: 40 additions & 0 deletions module-5/k8s/vllm-inference.yaml
@@ -0,0 +1,40 @@
---
apiVersion: apps/v1
kind: Deployment
metadata:
  name: app-vllm
spec:
  replicas: 2
  selector:
    matchLabels:
      app: app-vllm
  template:
    metadata:
      labels:
        app: app-vllm
    spec:
      containers:
        - name: app-vllm
          image: vllm/vllm-openai:latest
          env:
            - name: WANDB_API_KEY
              valueFrom:
                secretKeyRef:
                  name: wandb
                  key: WANDB_API_KEY
          resources:
            limits:
              nvidia.com/gpu: 1
---
apiVersion: v1
kind: Service
metadata:
  name: app-vllm
  labels:
    app: app-vllm
spec:
  ports:
    - port: 8080
      protocol: TCP
  selector:
    app: app-vllm
63 changes: 18 additions & 45 deletions module-5/serving-llm/client.py
@@ -2,9 +2,13 @@
import wandb
import requests
import json
import typer
from rich import print
from openai import OpenAI


DEFAULT_BASE_URL = "http://localhost:8000/v1"
EXAMPLE_CONTEXT = "CREATE TABLE salesperson (salesperson_id INT, name TEXT, region TEXT); INSERT INTO salesperson (salesperson_id, name, region) VALUES (1, 'John Doe', 'North'), (2, 'Jane Smith', 'South'); CREATE TABLE timber_sales (sales_id INT, salesperson_id INT, volume REAL, sale_date DATE); INSERT INTO timber_sales (sales_id, salesperson_id, volume, sale_date) VALUES (1, 1, 120, '2021-01-01'), (2, 1, 150, '2021-02-01'), (3, 2, 180, '2021-01-01');"
EXAMPLE_QUERY = "What is the total volume of timber sold by each salesperson, sorted by salesperson?"

def load_from_registry(model_name: str, model_path: Path):
    with wandb.init() as run:
@@ -13,63 +13,35 @@ def load_from_registry(model_name: str, model_path: Path):
        print(f"{artifact_dir}")


def list_of_models(url: str = DEFAULT_BASE_URL):
    url = f"{url}/models"
    response = requests.get(url)
    models = response.json()
    print(json.dumps(models, indent=4))

def load_adapter(lora_name: str, lora_path: str, url: str = DEFAULT_BASE_URL):
    url = f"{url}/load_lora_adapter"
    payload = {
        "lora_name": lora_name,
        "lora_path": lora_path
    }
    response = requests.post(url, json=payload)
    print(response)

def unload_adapter(lora_name: str, url: str = DEFAULT_BASE_URL):
    url = f"{url}/unload_lora_adapter"
    payload = {
        "lora_name": lora_name
    }
    response = requests.post(url, json=payload)
    result = response.json()
    print(json.dumps(result, indent=4))

def test_client(model: str, context: str = EXAMPLE_CONTEXT, query: str = EXAMPLE_QUERY, url: str = DEFAULT_BASE_URL):
    client = OpenAI(base_url=url, api_key="any-api-key")
    messages = [{"content": f"{context}\n Input: {query}", "role": "user"}]
    completion = client.chat.completions.create(model=model, messages=messages)
    print(completion.choices[0].message.content)

def cli():
    app = typer.Typer()
@@ -78,9 +54,6 @@ def cli():
    app.command()(load_adapter)
    app.command()(unload_adapter)
    app.command()(test_client)
    app()

if __name__ == "__main__":
