Commit 78636c6 (1 parent: 89daf7b)
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Showing 9 changed files with 260 additions and 18 deletions.
@@ -162,4 +162,5 @@ taskfile.yml
TODO.md
README.p.md
.DS_Store
README.private.md
vllm
@@ -0,0 +1,117 @@
---
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: vllm-storage-pvc
spec:
  accessModes:
    - ReadWriteOnce
  resources:
    requests:
      storage: 50Gi
  storageClassName: standard
---
apiVersion: apps/v1
kind: Deployment
metadata:
  name: app-vllm
spec:
  replicas: 1
  selector:
    matchLabels:
      app: app-vllm
  template:
    metadata:
      labels:
        app: app-vllm
    spec:
      containers:
        - name: app-vllm
          image: vllm/vllm-openai:latest
          env:
            - name: VLLM_ALLOW_RUNTIME_LORA_UPDATING
              value: "True"
          command: ["vllm"]
          args:
            - "serve"
            - "microsoft/Phi-3-mini-4k-instruct"
            - "--dtype"
            - "auto"
            - "--max-model-len"
            - "512"
            - "--enable-lora"
            - "--gpu-memory-utilization"
            - "0.8"
            - "--download-dir"
            - "/vllm-storage"
          resources:
            limits:
              nvidia.com/gpu: 1
            requests:
              nvidia.com/gpu: 1
          volumeMounts:
            - name: vllm-storage
              mountPath: /vllm-storage

        - name: model-loader
          image: ghcr.io/kyryl-opens-ml/app-fastapi:latest
          env:
            - name: WANDB_API_KEY
              valueFrom:
                secretKeyRef:
                  name: wandb
                  key: WANDB_API_KEY
          command: ["/bin/sh", "-c"]
          args:
            - |
              echo "Model Loader: Waiting for vllm server to be available on port 8000..."
              while ! curl -s http://localhost:8000/health >/dev/null; do
                echo "Model Loader: vllm server not available yet. Retrying in 5 seconds..."
                sleep 5
              done
              echo "Model Loader: vllm server is now available. Starting model loading..."
              # Execute the required Python commands
              python serving-llm/client.py load-from-registry truskovskiyk/ml-in-production-practice/modal_generative_example:latest sql-default-model
              if [ $? -ne 0 ]; then
                echo "Model Loader: Failed to load model from registry."
                exit 1
              fi
              python serving-llm/client.py load-adapter sql-default-model ./sql-default-model
              if [ $? -ne 0 ]; then
                echo "Model Loader: Failed to load adapter."
                exit 1
              fi
              echo "Model Loader: Model loading completed successfully."
          volumeMounts:
            - name: vllm-storage
              mountPath: /vllm-storage
          resources:
            limits:
              cpu: "500m"
              memory: "512Mi"
            requests:
              cpu: "250m"
              memory: "256Mi"

      volumes:
        - name: vllm-storage
          persistentVolumeClaim:
            claimName: vllm-storage-pvc
---
apiVersion: v1
kind: Service
metadata:
  name: app-vllm
  labels:
    app: app-vllm
spec:
  type: ClusterIP
  ports:
    - port: 8000
      protocol: TCP
      targetPort: 8000
  selector:
    app: app-vllm
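Once this manifest is applied, the Deployment runs the vLLM server and the model-loader sidecar in a single pod, and the ClusterIP Service exposes the OpenAI-compatible API on port 8000. The sketch below mirrors the sidecar's curl-based readiness loop in Python; it assumes the service has been made reachable locally (for example with kubectl port-forward svc/app-vllm 8000:8000) and relies only on the /health and /v1/models endpoints of the vLLM OpenAI server.

import time

import requests

BASE_URL = "http://localhost:8000"  # assumes `kubectl port-forward svc/app-vllm 8000:8000` is running

# Poll /health until the vLLM server answers, mirroring the model-loader's shell loop.
while True:
    try:
        if requests.get(f"{BASE_URL}/health", timeout=2).ok:
            break
    except requests.exceptions.ConnectionError:
        pass
    print("vllm server not available yet. Retrying in 5 seconds...")
    time.sleep(5)

# List the served models; loaded LoRA adapters appear here alongside the base model.
print(requests.get(f"{BASE_URL}/v1/models").json())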
Empty file.
@@ -0,0 +1,61 @@
from pathlib import Path
import wandb
import requests
import json
import typer
from rich import print
from openai import OpenAI


DEFAULT_BASE_URL = "http://localhost:8000/v1"
EXAMPLE_CONTEXT = "CREATE TABLE salesperson (salesperson_id INT, name TEXT, region TEXT); INSERT INTO salesperson (salesperson_id, name, region) VALUES (1, 'John Doe', 'North'), (2, 'Jane Smith', 'South'); CREATE TABLE timber_sales (sales_id INT, salesperson_id INT, volume REAL, sale_date DATE); INSERT INTO timber_sales (sales_id, salesperson_id, volume, sale_date) VALUES (1, 1, 120, '2021-01-01'), (2, 1, 150, '2021-02-01'), (3, 2, 180, '2021-01-01');"
EXAMPLE_QUERY = "What is the total volume of timber sold by each salesperson, sorted by salesperson?"


def load_from_registry(model_name: str, model_path: Path):
    with wandb.init() as run:
        artifact = run.use_artifact(model_name, type="model")
        artifact_dir = artifact.download(root=model_path)
        print(f"{artifact_dir}")


def list_of_models(url: str = DEFAULT_BASE_URL):
    url = f"{url}/models"
    response = requests.get(url)
    models = response.json()
    print(json.dumps(models, indent=4))


def load_adapter(lora_name: str, lora_path: str, url: str = DEFAULT_BASE_URL):
    url = f"{url}/load_lora_adapter"
    payload = {
        "lora_name": lora_name,
        "lora_path": lora_path
    }
    response = requests.post(url, json=payload)
    print(response)


def unload_adapter(lora_name: str, url: str = DEFAULT_BASE_URL):
    url = f"{url}/unload_lora_adapter"
    payload = {
        "lora_name": lora_name
    }
    response = requests.post(url, json=payload)
    result = response.json()
    print(json.dumps(result, indent=4))


def test_client(model: str, context: str = EXAMPLE_CONTEXT, query: str = EXAMPLE_QUERY, url: str = DEFAULT_BASE_URL):
    client = OpenAI(base_url=url, api_key="any-api-key")
    messages = [{"content": f"{context}\n Input: {query}", "role": "user"}]
    completion = client.chat.completions.create(model=model, messages=messages)
    print(completion.choices[0].message.content)


def cli():
    app = typer.Typer()
    app.command()(load_from_registry)
    app.command()(list_of_models)
    app.command()(load_adapter)
    app.command()(unload_adapter)
    app.command()(test_client)
    app()


if __name__ == "__main__":
    cli()
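Because cli() registers each function as a typer subcommand, underscores become dashes on the command line, which matches the sidecar's invocations (load-from-registry and load-adapter). The snippet below is a minimal sketch of the same flow used programmatically; it assumes it is run from the serving-llm directory next to client.py, that WANDB_API_KEY is set for the artifact download, and that the server from the manifest above is reachable on localhost:8000. The artifact reference and adapter name are taken from the model-loader container's arguments.

from pathlib import Path

from client import load_from_registry, load_adapter, list_of_models, test_client

# Download the fine-tuned adapter artifact from the W&B registry
# (artifact name taken from the model-loader container's arguments).
load_from_registry(
    model_name="truskovskiyk/ml-in-production-practice/modal_generative_example:latest",
    model_path=Path("./sql-default-model"),
)

# Register the downloaded adapter with the running vLLM server, then verify it is listed.
# Note: lora_path is resolved by the vLLM server process, so it must point at a
# path the server container can see.
load_adapter(lora_name="sql-default-model", lora_path="./sql-default-model")
list_of_models()

# Send the example text-to-SQL prompt to the newly registered adapter.
test_client(model="sql-default-model")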