diff --git a/examples/production_monitoring/README.md b/examples/production_monitoring/README.md new file mode 100644 index 0000000000000..29b611caeda23 --- /dev/null +++ b/examples/production_monitoring/README.md @@ -0,0 +1,54 @@ +# vLLM + Prometheus/Grafana + +This is a simple example that shows you how to connect vLLM metric logging to the Prometheus/Grafana stack. For this example, we launch Prometheus and Grafana via Docker. You can checkout other methods through [Prometheus](https://prometheus.io/) and [Grafana](https://grafana.com/) websites. + +Install: +- [`docker`](https://docs.docker.com/engine/install/) +- [`docker compose`](https://docs.docker.com/compose/install/linux/#install-using-the-repository) + +### Launch + +Prometheus metric logging is enabled by default in the OpenAI-compatible server. Launch via the entrypoint: +```bash +python3 -m vllm.entrypoints.openai.api_server \ + --model mistralai/Mistral-7B-v0.1 \ + --max-model-len 2048 \ + --disable-log-requests +``` + +Launch Prometheus and Grafana servers with `docker compose`: +```bash +docker compose up +``` + +Submit some sample requests to the server: +```bash +wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json + +python3 ../../benchmarks/benchmark_serving.py \ + --model mistralai/Mistral-7B-v0.1 \ + --tokenizer mistralai/Mistral-7B-v0.1 \ + --endpoint /v1/completions \ + --dataset ShareGPT_V3_unfiltered_cleaned_split.json \ + --request-rate 3.0 +``` + +Navigating to [`http://localhost:8000/metrics`](http://localhost:8000/metrics) will show the raw Prometheus metrics being exposed by vLLM. + +### Grafana Dashboard + +Navigate to [`http://localhost:3000`](http://localhost:3000). Log in with the default username (`admin`) and password (`admin`). + +#### Add Prometheus Data Source + +Navigate to [`http://localhost:3000/connections/datasources/new`](http://localhost:3000/connections/datasources/new) and select Prometheus. + +On Prometheus configuration page, we need to add the `Prometheus Server URL` in `Connection`. For this setup, Grafana and Prometheus are running in separate containers, but Docker creates DNS name for each containers. You can just use `http://prometheus:9090`. + +Click `Save & Test`. You should get a green check saying "Successfully queried the Prometheus API.". + +#### Import Dashboard + +Navigate to [`http://localhost:3000/dashboard/import`](http://localhost:3000/dashboard/import), upload `grafana.json`, and select the `prometheus` datasource. You should see a screen that looks like the following: + +![Grafana Dashboard Image](https://i.imgur.com/R2vH9VW.png) diff --git a/examples/production_monitoring/docker-compose.yaml b/examples/production_monitoring/docker-compose.yaml new file mode 100644 index 0000000000000..13b987c120f7d --- /dev/null +++ b/examples/production_monitoring/docker-compose.yaml @@ -0,0 +1,19 @@ +# docker-compose.yaml +version: "3" + +services: + prometheus: + image: prom/prometheus:latest + extra_hosts: + - "host.docker.internal:host-gateway" # allow a direct connection from container to the local machine + ports: + - "9090:9090" # the default port used by Prometheus + volumes: + - ${PWD}/prometheus.yaml:/etc/prometheus/prometheus.yml # mount Prometheus config file + + grafana: + image: grafana/grafana:latest + depends_on: + - prometheus + ports: + - "3000:3000" # the default port used by Grafana diff --git a/examples/production_monitoring/grafana.json b/examples/production_monitoring/grafana.json new file mode 100644 index 0000000000000..f48b6314eb055 --- /dev/null +++ b/examples/production_monitoring/grafana.json @@ -0,0 +1,931 @@ +{ + "__inputs": [ + { + "name": "DS_PROMETHEUS", + "label": "prometheus", + "description": "", + "type": "datasource", + "pluginId": "prometheus", + "pluginName": "Prometheus" + } + ], + "__elements": {}, + "__requires": [ + { + "type": "grafana", + "id": "grafana", + "name": "Grafana", + "version": "10.2.3" + }, + { + "type": "datasource", + "id": "prometheus", + "name": "Prometheus", + "version": "1.0.0" + }, + { + "type": "panel", + "id": "timeseries", + "name": "Time series", + "version": "" + } + ], + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "grafana", + "uid": "-- Grafana --" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "description": "Monitoring vLLM Inference Server", + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "id": null, + "links": [], + "liveNow": false, + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "End to end request latency measured in seconds.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 0 + }, + "id": 9, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "disableTextWrap": false, + "editorMode": "builder", + "expr": "histogram_quantile(0.99, sum by(le) (rate(vllm:e2e_request_latency_seconds_bucket[$__rate_interval])))", + "fullMetaSearch": false, + "includeNullMetadata": false, + "instant": false, + "legendFormat": "P99", + "range": true, + "refId": "A", + "useBackend": false + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "disableTextWrap": false, + "editorMode": "builder", + "expr": "histogram_quantile(0.95, sum by(le) (rate(vllm:e2e_request_latency_seconds_bucket[$__rate_interval])))", + "fullMetaSearch": false, + "hide": false, + "includeNullMetadata": false, + "instant": false, + "legendFormat": "P95", + "range": true, + "refId": "B", + "useBackend": false + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "disableTextWrap": false, + "editorMode": "builder", + "expr": "histogram_quantile(0.9, sum by(le) (rate(vllm:e2e_request_latency_seconds_bucket[$__rate_interval])))", + "fullMetaSearch": false, + "hide": false, + "includeNullMetadata": false, + "instant": false, + "legendFormat": "P90", + "range": true, + "refId": "C", + "useBackend": false + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "disableTextWrap": false, + "editorMode": "builder", + "expr": "histogram_quantile(0.5, sum by(le) (rate(vllm:e2e_request_latency_seconds_bucket[$__rate_interval])))", + "fullMetaSearch": false, + "hide": false, + "includeNullMetadata": false, + "instant": false, + "legendFormat": "P50", + "range": true, + "refId": "D", + "useBackend": false + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "rate(vllm:e2e_request_latency_seconds_sum[$__rate_interval])\n/\nrate(vllm:e2e_request_latency_seconds_count[$__rate_interval])", + "hide": false, + "instant": false, + "legendFormat": "Average", + "range": true, + "refId": "E" + } + ], + "title": "E2E Request Latency", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "Number of tokens processed per second", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 0 + }, + "id": 8, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "disableTextWrap": false, + "editorMode": "builder", + "expr": "rate(vllm:prompt_tokens_total[$__rate_interval])", + "fullMetaSearch": false, + "includeNullMetadata": false, + "instant": false, + "legendFormat": "Prompt Tokens/Sec", + "range": true, + "refId": "A", + "useBackend": false + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "disableTextWrap": false, + "editorMode": "builder", + "expr": "rate(vllm:generation_tokens_total[$__rate_interval])", + "fullMetaSearch": false, + "hide": false, + "includeNullMetadata": false, + "instant": false, + "legendFormat": "Generation Tokens/Sec", + "range": true, + "refId": "B", + "useBackend": false + } + ], + "title": "Token Throughput", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "Inter token latency in seconds.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 8 + }, + "id": 10, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "disableTextWrap": false, + "editorMode": "builder", + "expr": "histogram_quantile(0.99, sum by(le) (rate(vllm:time_per_output_token_seconds_bucket[$__rate_interval])))", + "fullMetaSearch": false, + "includeNullMetadata": false, + "instant": false, + "legendFormat": "P99", + "range": true, + "refId": "A", + "useBackend": false + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "disableTextWrap": false, + "editorMode": "builder", + "expr": "histogram_quantile(0.95, sum by(le) (rate(vllm:time_per_output_token_seconds_bucket[$__rate_interval])))", + "fullMetaSearch": false, + "hide": false, + "includeNullMetadata": false, + "instant": false, + "legendFormat": "P95", + "range": true, + "refId": "B", + "useBackend": false + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "disableTextWrap": false, + "editorMode": "builder", + "expr": "histogram_quantile(0.9, sum by(le) (rate(vllm:time_per_output_token_seconds_bucket[$__rate_interval])))", + "fullMetaSearch": false, + "hide": false, + "includeNullMetadata": false, + "instant": false, + "legendFormat": "P90", + "range": true, + "refId": "C", + "useBackend": false + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "disableTextWrap": false, + "editorMode": "builder", + "expr": "histogram_quantile(0.5, sum by(le) (rate(vllm:time_per_output_token_seconds_bucket[$__rate_interval])))", + "fullMetaSearch": false, + "hide": false, + "includeNullMetadata": false, + "instant": false, + "legendFormat": "P50", + "range": true, + "refId": "D", + "useBackend": false + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "rate(vllm:time_per_output_token_seconds_sum[$__rate_interval])\n/\nrate(vllm:time_per_output_token_seconds_count[$__rate_interval])", + "hide": false, + "instant": false, + "legendFormat": "Mean", + "range": true, + "refId": "E" + } + ], + "title": "Time Per Output Token Latency", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "Number of requests in RUNNING, WAITING, and SWAPPED state", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 8 + }, + "id": 3, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "disableTextWrap": false, + "editorMode": "builder", + "expr": "vllm:num_requests_running", + "fullMetaSearch": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "Num Running", + "range": true, + "refId": "A", + "useBackend": false + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "disableTextWrap": false, + "editorMode": "builder", + "expr": "vllm:num_requests_swapped", + "fullMetaSearch": false, + "hide": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "Num Swapped", + "range": true, + "refId": "B", + "useBackend": false + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "disableTextWrap": false, + "editorMode": "builder", + "expr": "vllm:num_requests_waiting", + "fullMetaSearch": false, + "hide": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "Num Waiting", + "range": true, + "refId": "C", + "useBackend": false + } + ], + "title": "Scheduler State", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "P50, P90, P95, and P99 TTFT latency in seconds.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 16 + }, + "id": 5, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "disableTextWrap": false, + "editorMode": "builder", + "expr": "histogram_quantile(0.99, sum by(le) (rate(vllm:time_to_first_token_seconds_bucket[$__rate_interval])))", + "fullMetaSearch": false, + "hide": false, + "includeNullMetadata": false, + "instant": false, + "legendFormat": "P99", + "range": true, + "refId": "A", + "useBackend": false + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "disableTextWrap": false, + "editorMode": "builder", + "expr": "histogram_quantile(0.95, sum by(le) (rate(vllm:time_to_first_token_seconds_bucket[$__rate_interval])))", + "fullMetaSearch": false, + "includeNullMetadata": false, + "instant": false, + "legendFormat": "P95", + "range": true, + "refId": "B", + "useBackend": false + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "disableTextWrap": false, + "editorMode": "builder", + "expr": "histogram_quantile(0.9, sum by(le) (rate(vllm:time_to_first_token_seconds_bucket[$__rate_interval])))", + "fullMetaSearch": false, + "hide": false, + "includeNullMetadata": false, + "instant": false, + "legendFormat": "P90", + "range": true, + "refId": "C", + "useBackend": false + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "disableTextWrap": false, + "editorMode": "builder", + "expr": "histogram_quantile(0.5, sum by(le) (rate(vllm:time_to_first_token_seconds_bucket[$__rate_interval])))", + "fullMetaSearch": false, + "hide": false, + "includeNullMetadata": false, + "instant": false, + "legendFormat": "P50", + "range": true, + "refId": "D", + "useBackend": false + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "rate(vllm:time_to_first_token_seconds_sum[$__rate_interval])\n/\nrate(vllm:time_to_first_token_seconds_count[$__rate_interval])", + "hide": false, + "instant": false, + "legendFormat": "Average", + "range": true, + "refId": "E" + } + ], + "title": "Time To First Token Latency", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "Percentage of used cache blocks by vLLM.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "percentunit" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 16 + }, + "id": 4, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "vllm:gpu_cache_usage_perc", + "instant": false, + "legendFormat": "GPU Cache Usage", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "vllm:cpu_cache_usage_perc", + "hide": false, + "instant": false, + "legendFormat": "CPU Cache Usage", + "range": true, + "refId": "B" + } + ], + "title": "Cache Utilization", + "type": "timeseries" + } + ], + "refresh": "", + "schemaVersion": 39, + "tags": [], + "templating": { + "list": [] + }, + "time": { + "from": "now-5m", + "to": "now" + }, + "timepicker": {}, + "timezone": "", + "title": "vLLM", + "uid": "b281712d-8bff-41ef-9f3f-71ad43c05e9b", + "version": 2, + "weekStart": "" +} diff --git a/examples/production_monitoring/prometheus.yaml b/examples/production_monitoring/prometheus.yaml new file mode 100644 index 0000000000000..754533b9dfbd0 --- /dev/null +++ b/examples/production_monitoring/prometheus.yaml @@ -0,0 +1,10 @@ +# prometheus.yaml +global: + scrape_interval: 5s + evaluation_interval: 30s + +scrape_configs: + - job_name: vllm + static_configs: + - targets: + - 'host.docker.internal:8000' diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 6c916695a74e2..7e1af7c83607a 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -10,7 +10,7 @@ SchedulerConfig, LoRAConfig) from vllm.core.scheduler import Scheduler, SchedulerOutputs from vllm.engine.arg_utils import EngineArgs -from vllm.engine.metrics import record_metrics +from vllm.engine.metrics import StatLogger, Stats from vllm.engine.ray_utils import RayWorkerVllm, initialize_cluster, ray from vllm.logger import init_logger from vllm.outputs import RequestOutput @@ -28,8 +28,7 @@ from ray.util.placement_group import PlacementGroup logger = init_logger(__name__) - -_LOGGING_INTERVAL_SEC = 5 +_LOCAL_LOGGING_INTERVAL_SEC = 5 class LLMEngine: @@ -115,12 +114,10 @@ def __init__( # Create the scheduler. self.scheduler = Scheduler(scheduler_config, cache_config, lora_config) - # Logging. - self.last_logging_time = 0.0 - # List of (timestamp, num_tokens) - self.num_prompt_tokens: List[Tuple[float, int]] = [] - # List of (timestamp, num_tokens) - self.num_generation_tokens: List[Tuple[float, int]] = [] + # Metric Logging. + if self.log_stats: + self.stat_logger = StatLogger( + local_interval=_LOCAL_LOGGING_INTERVAL_SEC) def get_tokenizer_for_seq(self, sequence: Sequence): return self.tokenizer.get_lora_tokenizer(sequence.lora_request) @@ -532,6 +529,7 @@ def _check_beam_search_early_stopping( def _process_sequence_group_outputs(self, seq_group: SequenceGroup, outputs: SequenceGroupOutput) -> None: + # Process prompt logprobs prompt_logprobs = outputs.prompt_logprobs if prompt_logprobs is not None: @@ -727,10 +725,10 @@ def _process_model_outputs( and not seq_group.prefix.computed): seq_group.prefix.computed = True + # Log stats. if self.log_stats: - # Log the system stats. - self._log_system_stats(scheduler_outputs.prompt_run, - scheduler_outputs.num_batched_tokens) + self.stat_logger.log(self._get_stats(scheduler_outputs)) + return request_outputs def step(self) -> List[RequestOutput]: @@ -805,81 +803,73 @@ def step(self) -> List[RequestOutput]: return self._process_model_outputs(output, scheduler_outputs) def do_log_stats(self) -> None: - self._log_system_stats(False, 0) + """Forced log when no requests active.""" + if self.log_stats: + self.stat_logger.log(self._get_stats(scheduler_outputs=None)) - def _log_system_stats( - self, - prompt_run: bool, - num_batched_tokens: int, - ) -> None: + def _get_stats(self, + scheduler_outputs: Optional[SchedulerOutputs]) -> Stats: + """Get Stats to be Logged to Prometheus.""" now = time.monotonic() - # Log the number of batched input tokens. - if prompt_run: - self.num_prompt_tokens.append((now, num_batched_tokens)) - else: - self.num_generation_tokens.append((now, num_batched_tokens)) - should_log = now - self.last_logging_time >= _LOGGING_INTERVAL_SEC - if not should_log: - return + # KV Cache Usage in %. + num_total_gpu = self.cache_config.num_gpu_blocks + num_free_gpu = self.scheduler.block_manager.get_num_free_gpu_blocks() + gpu_cache_usage = 1.0 - (num_free_gpu / num_total_gpu) - # Discard the old stats. - self.num_prompt_tokens = [(t, n) for t, n in self.num_prompt_tokens - if now - t < _LOGGING_INTERVAL_SEC] - self.num_generation_tokens = [(t, n) - for t, n in self.num_generation_tokens - if now - t < _LOGGING_INTERVAL_SEC] - - if len(self.num_prompt_tokens) > 1: - total_num_tokens = sum(n for _, n in self.num_prompt_tokens[:-1]) - window = now - self.num_prompt_tokens[0][0] - avg_prompt_throughput = total_num_tokens / window - else: - avg_prompt_throughput = 0.0 - if len(self.num_generation_tokens) > 1: - total_num_tokens = sum(n - for _, n in self.num_generation_tokens[:-1]) - window = now - self.num_generation_tokens[0][0] - avg_generation_throughput = total_num_tokens / window - else: - avg_generation_throughput = 0.0 - - total_num_gpu_blocks = self.cache_config.num_gpu_blocks - num_free_gpu_blocks = ( - self.scheduler.block_manager.get_num_free_gpu_blocks()) - num_used_gpu_blocks = total_num_gpu_blocks - num_free_gpu_blocks - gpu_cache_usage = num_used_gpu_blocks / total_num_gpu_blocks - - total_num_cpu_blocks = self.cache_config.num_cpu_blocks - if total_num_cpu_blocks > 0: - num_free_cpu_blocks = ( - self.scheduler.block_manager.get_num_free_cpu_blocks()) - num_used_cpu_blocks = total_num_cpu_blocks - num_free_cpu_blocks - cpu_cache_usage = num_used_cpu_blocks / total_num_cpu_blocks - else: - cpu_cache_usage = 0.0 - - record_metrics( - avg_prompt_throughput=avg_prompt_throughput, - avg_generation_throughput=avg_generation_throughput, - scheduler_running=len(self.scheduler.running), - scheduler_swapped=len(self.scheduler.swapped), - scheduler_waiting=len(self.scheduler.waiting), + num_total_cpu = self.cache_config.num_cpu_blocks + cpu_cache_usage = 0. + if num_total_cpu > 0: + num_free_cpu = self.scheduler.block_manager.get_num_free_cpu_blocks( + ) + cpu_cache_usage = 1.0 - (num_free_cpu / num_total_cpu) + + # Scheduler State + num_running = len(self.scheduler.running) + num_swapped = len(self.scheduler.swapped) + num_waiting = len(self.scheduler.waiting) + + # Iteration stats if we have scheduler output. + num_prompt_tokens = 0 + num_generation_tokens = 0 + time_to_first_tokens = [] + time_per_output_tokens = [] + time_e2e_requests = [] + if scheduler_outputs is not None: + prompt_run = scheduler_outputs.prompt_run + + # Number of Tokens. + if prompt_run: + num_prompt_tokens = scheduler_outputs.num_batched_tokens + else: + num_generation_tokens = scheduler_outputs.num_batched_tokens + + # Latency Timings. + time_last_iters = [] + for seq_group in scheduler_outputs.scheduled_seq_groups: + # Time since last token. (n.b. updates seq_group.last_token_time) + time_last_iters.append(seq_group.get_last_latency(now)) + # Time since arrival for all finished requests. + if seq_group.is_finished(): + time_e2e_requests.append(now - seq_group.arrival_time) + + time_to_first_tokens = time_last_iters if prompt_run else [] + time_per_output_tokens = [] if prompt_run else time_last_iters + + return Stats( + now=now, + num_running=num_running, + num_swapped=num_swapped, + num_waiting=num_waiting, gpu_cache_usage=gpu_cache_usage, cpu_cache_usage=cpu_cache_usage, + num_prompt_tokens=num_prompt_tokens, + num_generation_tokens=num_generation_tokens, + time_to_first_tokens=time_to_first_tokens, + time_per_output_tokens=time_per_output_tokens, + time_e2e_requests=time_e2e_requests, ) - logger.info("Avg prompt throughput: " - f"{avg_prompt_throughput:.1f} tokens/s, " - "Avg generation throughput: " - f"{avg_generation_throughput:.1f} tokens/s, " - f"Running: {len(self.scheduler.running)} reqs, " - f"Swapped: {len(self.scheduler.swapped)} reqs, " - f"Pending: {len(self.scheduler.waiting)} reqs, " - f"GPU KV cache usage: {gpu_cache_usage * 100:.1f}%, " - f"CPU KV cache usage: {cpu_cache_usage * 100:.1f}%") - self.last_logging_time = now - def _decode_sequence(self, seq: Sequence, prms: SamplingParams) -> None: """Decodes the new token for a sequence.""" (new_tokens, new_output_text, prefix_offset, diff --git a/vllm/engine/metrics.py b/vllm/engine/metrics.py index c64071207f6a0..e613b9f551b2f 100644 --- a/vllm/engine/metrics.py +++ b/vllm/engine/metrics.py @@ -1,4 +1,19 @@ -from aioprometheus import Gauge +from vllm.logger import init_logger +from aioprometheus import Counter, Gauge, Histogram + +import time +import numpy as np +from typing import List +from dataclasses import dataclass + +logger = init_logger(__name__) + +labels = {} + + +def add_global_metrics_labels(**kwargs): + labels.update(kwargs) + # The begin-* and end* here are used by the documentation generator # to extract the metrics definitions. @@ -9,12 +24,16 @@ gauge_avg_generation_throughput = Gauge( "vllm:avg_generation_throughput_toks_per_s", "Average generation throughput in tokens/s.") +counter_prompt_tokens = Counter("vllm:prompt_tokens_total", + "Number of prefill tokens processed.") +counter_generation_tokens = Counter("vllm:generation_tokens_total", + "Number of generation tokens processed.") gauge_scheduler_running = Gauge( "vllm:num_requests_running", - "Number of requests that is currently running for inference.") + "Number of requests currently running on GPU.") gauge_scheduler_swapped = Gauge("vllm:num_requests_swapped", - "Number requests swapped to CPU.") + "Number of requests swapped to CPU.") gauge_scheduler_waiting = Gauge("vllm:num_requests_waiting", "Number of requests waiting to be processed.") @@ -24,28 +43,131 @@ gauge_cpu_cache_usage = Gauge( "vllm:cpu_cache_usage_perc", "CPU KV-cache usage. 1 means 100 percent usage.") + +histogram_time_to_first_token = Histogram( + "vllm:time_to_first_token_seconds", + "Histogram of time to first token in seconds.", + buckets=[ + 0.001, 0.005, 0.01, 0.02, 0.04, 0.06, 0.08, 0.1, 0.25, 0.5, 0.75, 1.0, + 2.5, 5.0, 7.5, 10.0 + ]) +histogram_time_per_output_tokens = Histogram( + "vllm:time_per_output_token_seconds", + "Histogram of time per output token in seconds.", + buckets=[ + 0.01, 0.025, 0.05, 0.075, 0.1, 0.15, 0.2, 0.3, 0.4, 0.5, 0.75, 1.0, 2.5 + ]) +histogram_e2e_request_latency = Histogram( + "vllm:e2e_request_latency_seconds", + "Histogram of end to end request latency in seconds.", + buckets=[1.0, 2.5, 5.0, 10.0, 15.0, 20.0, 30.0, 40.0, 50.0, 60.0]) # end-metrics-definitions -labels = {} +@dataclass +class Stats: + """Created by LLMEngine for use by StatLogger.""" + now: float -def add_global_metrics_labels(**kwargs): - labels.update(kwargs) + # System stats. + num_running: int + num_waiting: int + num_swapped: int + gpu_cache_usage: float + cpu_cache_usage: float + + # Raw stats from last model iteration. + num_prompt_tokens: int + num_generation_tokens: int + time_to_first_tokens: List[float] + time_per_output_tokens: List[float] + time_e2e_requests: List[float] + + +class StatLogger: + """StatLogger is used LLMEngine to log to Promethus and Stdout.""" + + def __init__(self, local_interval: float) -> None: + # Metadata for logging locally. + self.last_local_log = time.monotonic() + self.local_interval = local_interval + + # Tracked stats over current local logging interval. + self.num_prompt_tokens: List[int] = [] + self.num_generation_tokens: List[int] = [] + + def _get_throughput(self, tracked_stats: List[int], now: float) -> float: + return float(np.sum(tracked_stats) / (now - self.last_local_log)) + + def _local_interval_elapsed(self, now: float) -> bool: + elapsed_time = now - self.last_local_log + return elapsed_time > self.local_interval + + def _log_prometheus(self, stats: Stats) -> None: + # Set system stat gauges. + gauge_scheduler_running.set(labels, stats.num_running) + gauge_scheduler_swapped.set(labels, stats.num_swapped) + gauge_scheduler_waiting.set(labels, stats.num_waiting) + gauge_gpu_cache_usage.set(labels, stats.gpu_cache_usage) + gauge_cpu_cache_usage.set(labels, stats.cpu_cache_usage) + + # Add to token counters. + counter_prompt_tokens.add(labels, stats.num_prompt_tokens) + counter_generation_tokens.add(labels, stats.num_generation_tokens) + + # Observe request level latencies in histograms. + for ttft in stats.time_to_first_tokens: + histogram_time_to_first_token.observe(labels, ttft) + for tpot in stats.time_per_output_tokens: + histogram_time_per_output_tokens.observe(labels, tpot) + for e2e in stats.time_e2e_requests: + histogram_e2e_request_latency.observe(labels, e2e) + + def _log_prometheus_interval(self, prompt_throughput: float, + generation_throughput: float) -> None: + # Logs metrics to prometheus that are computed every logging_interval. + # Support legacy gauge metrics that make throughput calculations on the vLLM side. + # Moving forward, we should use counters like counter_prompt_tokens, counter_generation_tokens + # Which log raw data and calculate summaries using rate() on the grafana/prometheus side. + # See https://github.com/vllm-project/vllm/pull/2316#discussion_r1464204666 + gauge_avg_prompt_throughput.set(labels, prompt_throughput) + gauge_avg_generation_throughput.set(labels, generation_throughput) + + def log(self, stats: Stats) -> None: + """Called by LLMEngine. + Logs to prometheus and tracked stats every iteration. + Logs to Stdout every self.local_interval seconds.""" + + # Log to prometheus. + self._log_prometheus(stats) + + # Save tracked stats for token counters. + self.num_prompt_tokens.append(stats.num_prompt_tokens) + self.num_generation_tokens.append(stats.num_generation_tokens) + + # Log locally every local_interval seconds. + if self._local_interval_elapsed(stats.now): + + # Compute summary metrics for tracked stats (and log them to promethus if applicable). + prompt_throughput = self._get_throughput(self.num_prompt_tokens, + now=stats.now) + generation_throughput = self._get_throughput( + self.num_generation_tokens, now=stats.now) + self._log_prometheus_interval( + prompt_throughput=prompt_throughput, + generation_throughput=generation_throughput) + # Log to stdout. + logger.info( + f"Avg prompt throughput: {prompt_throughput:.1f} tokens/s, " + f"Avg generation throughput: {generation_throughput:.1f} tokens/s, " + f"Running: {stats.num_running} reqs, " + f"Swapped: {stats.num_swapped} reqs, " + f"Pending: {stats.num_waiting} reqs, " + f"GPU KV cache usage: {stats.gpu_cache_usage * 100:.1f}%, " + f"CPU KV cache usage: {stats.cpu_cache_usage * 100:.1f}%") -def record_metrics( - avg_prompt_throughput: float, - avg_generation_throughput: float, - scheduler_running: int, - scheduler_swapped: int, - scheduler_waiting: int, - gpu_cache_usage: float, - cpu_cache_usage: float, -): - gauge_avg_prompt_throughput.set(labels, avg_prompt_throughput) - gauge_avg_generation_throughput.set(labels, avg_generation_throughput) - gauge_scheduler_running.set(labels, scheduler_running) - gauge_scheduler_swapped.set(labels, scheduler_swapped) - gauge_scheduler_waiting.set(labels, scheduler_waiting) - gauge_gpu_cache_usage.set(labels, gpu_cache_usage) - gauge_cpu_cache_usage.set(labels, cpu_cache_usage) + # Reset tracked stats for next interval. + self.num_prompt_tokens = [] + self.num_generation_tokens = [] + self.last_local_log = stats.now diff --git a/vllm/engine/server_metrics.py b/vllm/engine/server_metrics.py new file mode 100644 index 0000000000000..3c5d1fa27bc69 --- /dev/null +++ b/vllm/engine/server_metrics.py @@ -0,0 +1,7 @@ +from aioprometheus import Counter, Histogram + +INFERENCE_REQUEST_SUCCESS_COUNTER = Counter('vllm:inference_request_success', 'Number of successful inference calls') +INFERENCE_REQUEST_ABORTED_COUNTER = Counter('vllm:inference_request_aborted', 'Number of aborted inference calls') + +INFERENCE_STATE_DURATION_HISTOGRAM = Histogram("vllm:inference_queue_duration", "Histogram of time in SequenceStatus") + \ No newline at end of file diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index deb0fddd643cc..7b62a8696eb9c 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -24,6 +24,8 @@ from vllm.entrypoints.openai.serving_chat import OpenAIServingChat from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion +from vllm.engine.server_metrics import INFERENCE_REQUEST_SUCCESS_COUNTER, INFERENCE_REQUEST_ABORTED_COUNTER + TIMEOUT_KEEP_ALIVE = 5 # seconds openai_serving_chat: OpenAIServingChat = None @@ -146,15 +148,23 @@ async def show_available_models(): @app.post("/v1/chat/completions") async def create_chat_completion(request: ChatCompletionRequest, raw_request: Request): + generator = await openai_serving_chat.create_chat_completion( request, raw_request) + label = {"endpoint": "/v1/chat/completions"} + if hasattr(request, "model"): + label["model"] = request.model + if isinstance(generator, ErrorResponse): + INFERENCE_REQUEST_ABORTED_COUNTER.inc(label) return JSONResponse(content=generator.model_dump(), status_code=generator.code) if request.stream: + INFERENCE_REQUEST_SUCCESS_COUNTER.inc(label) return StreamingResponse(content=generator, media_type="text/event-stream") else: + INFERENCE_REQUEST_SUCCESS_COUNTER.inc(label) return JSONResponse(content=generator.model_dump()) @@ -162,13 +172,20 @@ async def create_chat_completion(request: ChatCompletionRequest, async def create_completion(request: CompletionRequest, raw_request: Request): generator = await openai_serving_completion.create_completion( request, raw_request) + label = {"endpoint": "/v1/completions"} + if hasattr(request, "model"): + label["model"] = request.model if isinstance(generator, ErrorResponse): + INFERENCE_REQUEST_ABORTED_COUNTER.inc(label) + return JSONResponse(content=generator.model_dump(), status_code=generator.code) if request.stream: + INFERENCE_REQUEST_SUCCESS_COUNTER.inc(label) return StreamingResponse(content=generator, media_type="text/event-stream") else: + INFERENCE_REQUEST_SUCCESS_COUNTER.inc(label) return JSONResponse(content=generator.model_dump()) diff --git a/vllm/entrypoints/openai/serving_completion.py b/vllm/entrypoints/openai/serving_completion.py index 8c9a7ad309cea..e11d575ba42c3 100644 --- a/vllm/entrypoints/openai/serving_completion.py +++ b/vllm/entrypoints/openai/serving_completion.py @@ -299,6 +299,7 @@ async def create_completion(self, request: CompletionRequest, sampling_params, f"{request_id}-{i}", prompt_token_ids=input_ids)) +####################################################################################### except ValueError as e: return self.create_error_response(str(e)) diff --git a/vllm/sequence.py b/vllm/sequence.py index d28627f47498f..a42b0f7dc1968 100644 --- a/vllm/sequence.py +++ b/vllm/sequence.py @@ -7,6 +7,8 @@ from vllm.prefix import Prefix from vllm.sampling_params import SamplingParams from vllm.lora.request import LoRARequest +import time +# from vllm.engine.server_metrics import INFERENCE_STATE_DURATION_HISTOGRAM PromptLogprobs = List[Optional[Dict[int, float]]] SampleLogprobs = List[Dict[int, float]] @@ -52,7 +54,6 @@ def get_finished_reason(status: "SequenceStatus") -> Union[str, None]: class SequenceData: """Data associated with a sequence. - Args: prompt_token_ids: The token IDs of the prompt. @@ -130,17 +131,40 @@ def __init__( self.logical_token_blocks: List[LogicalTokenBlock] = [] # Initialize the logical token blocks with the prompt token ids. self._append_tokens_to_blocks(prompt_token_ids) - self.status = SequenceStatus.WAITING + self._status = SequenceStatus.WAITING # Used for incremental detokenization self.prefix_offset = 0 self.read_offset = 0 # Input + output tokens self.tokens: Optional[List[str]] = None + self.start_time = time.perf_counter() + self.status_dwell_time = {} + @property def lora_int_id(self) -> int: return self.lora_request.lora_int_id if self.lora_request else 0 + + @property + def status(self) -> SequenceStatus: + """Get the status of the sequence.""" + return self._status + + @status.setter + def status(self, new_status: SequenceStatus) -> None: + """Set the status of the sequence.""" + # self.status_dwell_time[self.status] = time.perf_counter() - self.start_time + dwell_time = time.perf_counter() - self.start_time + # INFERENCE_STATE_DURATION_HISTOGRAM.labels(state=self.status).observe(dwell_time) + # INFERENCE_STATE_DURATION_HISTOGRAM.observe(dwell_time, state=self.state) + + # if self.status == SequenceStatus.WAITING: + # INFERENCE_STATE_DURATION_HISTOGRAM(self.status_dwell_time[self.status]) + print() + print(new_status) + print() + self._status = new_status def _append_logical_block(self) -> None: block = LogicalTokenBlock( @@ -254,6 +278,7 @@ def __init__( self.seqs_dict = {seq.seq_id: seq for seq in seqs} self.sampling_params = sampling_params self.arrival_time = arrival_time + self.last_token_time = arrival_time self.lora_request = lora_request self.prefix: Optional[Prefix] = prefix self.prompt_logprobs: Optional[PromptLogprobs] = None @@ -274,6 +299,12 @@ def prompt_token_ids(self) -> List[int]: def lora_int_id(self) -> int: return self.lora_request.lora_int_id if self.lora_request else 0 + def get_last_latency(self, now: float) -> float: + """Gets last token latency for Request level timings.""" + latency = now - self.last_token_time + self.last_token_time = now + return latency + def get_max_num_running_seqs(self) -> int: """The maximum number of sequences running in parallel in the remaining lifetime of the request.""" @@ -299,7 +330,7 @@ def get_seqs( return list(self.seqs_dict.values()) else: return [ - seq for seq in self.seqs_dict.values() if seq.status == status + seq for seq in self.seqs_dict.values() if seq._status == status ] def get_unfinished_seqs(self) -> List[Sequence]: