Add more Prometheus metrics #2764

Merged Apr 28, 2024 · 50 commits
The file changes shown below are from 22 of the 50 commits.

Commits:
319bc37
Add vllm:request_max_tokens
ronensc Jan 31, 2024
9d4ce95
Remove trailing dots from comments that are not sentences
ronensc Jan 31, 2024
ae7eb6e
Add vllm:request_success
ronensc Jan 31, 2024
2188daa
Remove redundant space
ronensc Jan 31, 2024
e41c15f
Add vllm:request_n
ronensc Jan 31, 2024
71ec7c3
Add vllm:prompt_tokens
ronensc Feb 5, 2024
45bd839
Add vllm:generation_tokens
ronensc Feb 5, 2024
f237c50
Add comments
ronensc Feb 5, 2024
f17a966
Rename metrics
ronensc Feb 5, 2024
8e0d8c1
Make type hint compatible with python 3.8
ronensc Feb 7, 2024
9ed04ef
Rename metrics
ronensc Feb 12, 2024
6aebd80
Merge branch 'main' into more-metrics
ronensc Feb 19, 2024
de84dac
Merge branch 'main' into more-metrics
ronensc Feb 21, 2024
35944cc
Merge branch 'main' into more-metrics
ronensc Feb 26, 2024
76cd774
Consider the value of `max_model_len` when building buckets
ronensc Feb 26, 2024
93b0796
Merge branch 'main' into more-metrics
ronensc Mar 4, 2024
3643e0c
Merge branch 'main' into more-metrics
ronensc Mar 13, 2024
60f1049
Fix too long line warning
ronensc Mar 13, 2024
95daee7
Add HTTP metrics from prometheus-fastapi-instrumentator
ronensc Mar 26, 2024
cf4acef
Merge remote-tracking branch 'origin/main' into more-metrics
ronensc Mar 26, 2024
0f8dae9
Make ruff happy
ronensc Mar 26, 2024
bce096c
Remove vllm:request_params_max_tokens
ronensc Mar 28, 2024
e15f653
Move deprecated metrics to legacy section
ronensc Mar 29, 2024
7b05baa
Add metric vllm:request_params_best_of
ronensc Apr 1, 2024
0958259
Revert to exposing /metrics using make_asgi_app()
ronensc Apr 1, 2024
5e2c246
Register 'finished_reason' label name on metric creation
ronensc Apr 1, 2024
5cc7b64
Merge branch 'main' into more-metrics
ronensc Apr 1, 2024
1eeb31d
Fix merge issues
ronensc Apr 1, 2024
4c79cbe
Merge branch 'main' into more-metrics
ronensc Apr 17, 2024
4c41a89
Fix merge issues
ronensc Apr 17, 2024
ac8435b
Add 3 panels to Grafana dashboard
ronensc Apr 17, 2024
f22abf5
Change order of deprecated metrics and add comments
ronensc Apr 19, 2024
9352ce7
Rename LABEL_NAME_FINISHED_REASON and make it a class variable of Met…
ronensc Apr 19, 2024
b2c0445
Set minimum version to prometheus-fastapi-instrumentator
ronensc Apr 19, 2024
e147575
Change finished_reason from counter to list
ronensc Apr 19, 2024
f9bc64e
Compute deprecated metrics using the newer version
ronensc Apr 19, 2024
5ded719
Rename variables. Strip '_lst' suffix.
ronensc Apr 19, 2024
dd84d51
Update naming schema Stats to have the _suffix pattern
ronensc Apr 19, 2024
e127a4c
Fix the incorrect logic for chunked prefill
ronensc Apr 19, 2024
2d36609
Restore num_prompt_tokens_iter and num_generation_tokens_iter
ronensc Apr 19, 2024
e81d95a
Refactor metrics logging methods
ronensc Apr 19, 2024
ece2ec0
Reorder metrics definition to match Stats order
ronensc Apr 19, 2024
5a658c8
Rename metric variables to match suffix convention
ronensc Apr 19, 2024
717b559
Make mypy happy
ronensc Apr 20, 2024
61fad41
Merge branch 'main' into more-metrics
robertgshaw2-neuralmagic Apr 25, 2024
f103ad8
./format
robertgshaw2-neuralmagic Apr 25, 2024
bf1a0c4
Merge branch 'main' into more-metrics
robertgshaw2-neuralmagic Apr 28, 2024
cc0d5eb
fixed chunked prefill logic
robertgshaw2-neuralmagic Apr 28, 2024
d7f493b
make linter happy
robertgshaw2-neuralmagic Apr 28, 2024
54bf260
fixed issues with chunked prefill X metrics
robertgshaw2-neuralmagic Apr 28, 2024
1 change: 1 addition & 0 deletions requirements.txt
@@ -11,6 +11,7 @@ fastapi
uvicorn[standard]
pydantic >= 2.0 # Required for OpenAI server.
prometheus_client >= 0.18.0
prometheus-fastapi-instrumentator
pynvml == 11.5.0
triton >= 2.1.0
outlines == 0.0.34
2 changes: 1 addition & 1 deletion vllm/core/scheduler.py
@@ -139,7 +139,7 @@ def abort_seq_group(self, request_id: Union[str, Iterable[str]]) -> None:
for seq_group in state_queue:
if not request_ids:
# Using 'break' here may add two extra iterations,
# but is acceptable to reduce complexity .
# but is acceptable to reduce complexity.
break
if seq_group.request_id in request_ids:
# Appending aborted group into pending list.
43 changes: 38 additions & 5 deletions vllm/engine/llm_engine.py
@@ -1,4 +1,5 @@
import time
from collections import Counter as CollectionsCounter
from typing import Iterable, List, Optional, Tuple, Type, Union

from transformers import PreTrainedTokenizer
@@ -121,7 +122,8 @@ def __init__(
if self.log_stats:
self.stat_logger = StatLogger(
local_interval=_LOCAL_LOGGING_INTERVAL_SEC,
labels=dict(model_name=model_config.model))
labels=dict(model_name=model_config.model),
max_model_len=self.model_config.max_model_len)
self.stat_logger.info("cache_config", self.cache_config)

@classmethod
@@ -673,24 +675,42 @@ def _get_stats(self,
# Iteration stats if we have scheduler output.
num_prompt_tokens = 0
num_generation_tokens = 0
num_prompt_tokens_lst = []
num_generation_tokens_lst = []
request_n = []
time_to_first_tokens = []
time_per_output_tokens = []
time_e2e_requests = []
finished_reason_counter = CollectionsCounter()
Collaborator:

Please move this one to be just after the # Finished Requests comment

Collaborator:

Is there any reason this can't just be a list of finished reasons that we process on the stat_logger side?

Collaborator:

I don't think so, and it would make the stats logging code in the engine a bit cleaner

Contributor Author:

> Please move this one to be just after the # Finished Requests comment

If I move finished_reason_counter = CollectionsCounter() after # Finished Requests, it might be undefined when accessed later for the same reason as request_n and request_best_of. Please refer to that comment for details.

> Is there any reason this can't just be a list of finished reasons that we process on the stat_logger side?

Done.

Collaborator:

Ah, in the diff view I hadn't realised this was nested; it's OK to leave as is then.

if scheduler_outputs is not None:
prompt_run = scheduler_outputs.prompt_run

# Number of Tokens.
# Number of Tokens
if prompt_run:
num_prompt_tokens = sum(
num_prompt_tokens_lst = [
Collaborator:

This is actually an incorrect calculation.

This stat is used to compute the histogram of prompt lengths for each request. Because of chunked prefill, the prompt processing will occur over multiple steps of LLMEngine, so this will actually give you the distribution of chunk sizes as opposed to the distribution of prefill lengths.

So what I would suggest is that we log the lengths of the prompt and the generation once the seq_group is finished (e.g. in the loop where we iterate over the scheduled_seq_groups). This way we know that we are logging the full prefill length and that we are logging one item per request; see the hedged sketch below.
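
For illustration only, a minimal sketch of the suggested approach, reusing accessors that already appear in this diff (is_finished(), prompt_token_ids, get_finished_seqs(), get_output_len()); this is a hedged sketch, not necessarily the code the PR ends up with:

    # Record request-level stats only once, when the whole seq_group finishes,
    # so chunked prefill does not add one histogram entry per chunk.
    for seq_group in scheduler_outputs.scheduled_seq_groups:
        if seq_group.is_finished():
            # One observation per request: the full prompt length...
            num_prompt_tokens_lst.append(len(seq_group.prompt_token_ids))
            # ...and one generation length per finished sequence.
            num_generation_tokens_lst.extend(
                seq.get_output_len() for seq in seq_group.get_finished_seqs())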

len(seq_group.prompt_token_ids)
for seq_group in scheduler_outputs.scheduled_seq_groups)
for seq_group in scheduler_outputs.scheduled_seq_groups
]
num_prompt_tokens = sum(num_prompt_tokens_lst)
num_generation_tokens = sum(
seq_group.num_seqs()
for seq_group in scheduler_outputs.scheduled_seq_groups)
else:
num_generation_tokens = scheduler_outputs.num_batched_tokens
num_generation_tokens_lst = [
seq.get_output_len()
for seq_group in scheduler_outputs.scheduled_seq_groups
for seq in seq_group.get_finished_seqs()
]

# Sampling Params
if prompt_run:
Collaborator:

Same issue related to chunked prefill: if our prefill has N chunks, we will create N histogram entries.

request_n = [
seq_group.sampling_params.n
for seq_group in scheduler_outputs.scheduled_seq_groups
]

# Latency Timings.
# Latency Timings
time_last_iters = []
for seq_group in scheduler_outputs.scheduled_seq_groups:
# Time since last token.
@@ -704,15 +724,28 @@ def _get_stats(self,
time_to_first_tokens = time_last_iters if prompt_run else []
Collaborator:

These are also now incorrect because of chunked prefill.

time_per_output_tokens = [] if prompt_run else time_last_iters

# Finished Requests
Collaborator:

It's a bit hard to follow when we have 5-6 loops that all iterate over the same list of scheduled_seq_groups.

for seq_group in scheduler_outputs.scheduled_seq_groups:
if not seq_group.is_finished():
continue
finished_reason_counter += CollectionsCounter([
SequenceStatus.get_finished_reason(seq.status)
for seq in seq_group.get_finished_seqs()
])

return Stats(
now=now,
num_running=num_running,
num_swapped=num_swapped,
num_waiting=num_waiting,
gpu_cache_usage=gpu_cache_usage,
cpu_cache_usage=cpu_cache_usage,
finished_reason_counter=finished_reason_counter,
num_prompt_tokens=num_prompt_tokens,
num_generation_tokens=num_generation_tokens,
num_prompt_tokens_lst=num_prompt_tokens_lst,
num_generation_tokens_lst=num_generation_tokens_lst,
request_n=request_n,
time_to_first_tokens=time_to_first_tokens,
time_per_output_tokens=time_per_output_tokens,
time_e2e_requests=time_e2e_requests,
77 changes: 74 additions & 3 deletions vllm/engine/metrics.py
@@ -1,5 +1,6 @@
import time
from dataclasses import dataclass
from typing import Counter as CollectionsCounter
from typing import Dict, List

import numpy as np
@@ -19,7 +20,7 @@
# begin-metrics-definitions
class Metrics:

def __init__(self, labelnames: List[str]):
def __init__(self, labelnames: List[str], max_model_len: int):
# Unregister any existing vLLM collectors
for collector in list(REGISTRY._collector_to_names):
if hasattr(collector, "_name") and "vllm" in collector._name:
@@ -61,6 +62,22 @@ def __init__(self, labelnames: List[str]):
name="vllm:generation_tokens_total",
documentation="Number of generation tokens processed.",
labelnames=labelnames)
self.counter_request_success = Counter(
name="vllm:request_success",
documentation="Count of successfully processed requests.",
labelnames=labelnames)
Collaborator:

This isn't just counting successful responses, it's counting all finish reasons. If you could find an elegant way to implement the counters we lost when switching from aioprometheus to prometheus_client, that would be great!

Contributor Author:

A quick option to add HTTP-related metrics would be to use prometheus-fastapi-instrumentator.

This involves installing the package:
pip install prometheus-fastapi-instrumentator

Then adding the following 2 lines after the app creation:

app = fastapi.FastAPI(lifespan=lifespan)

from prometheus_fastapi_instrumentator import Instrumentator
Instrumentator().instrument(app).expose(app)

This will add the following metrics:

Metric Name | Type | Description
http_requests_total | counter | Total number of requests by method, status, and handler.
http_request_size_bytes | summary | Content length of incoming requests by handler. Only value of header is respected. Otherwise ignored.
http_response_size_bytes | summary | Content length of outgoing responses by handler. Only value of header is respected. Otherwise ignored.
http_request_duration_highr_seconds | histogram | Latency with many buckets but no API-specific labels. Made for more accurate percentile calculations.
http_request_duration_seconds | histogram | Latency with only a few buckets by handler. Made to be only used if aggregation by handler is important.

Should I add it to the PR?

Collaborator:

I like this solution, it saves us from reinventing the wheel in vLLM

Contributor Author:

done :)

Collaborator (@hmellor, Mar 28, 2024):

Nice, if we're going to be using prometheus-fastapi-instrumentator then this implementation should be removed.

Contributor Author:

Sorry, I'm not sure I'm following. Which implementation are you referring to that should be removed?
IIUC, prometheus-fastapi-instrumentator simply adds a middleware to the FastAPI app to collect the metrics specified in the table above. It uses prometheus_client under the hood, and adding other vLLM-related metrics should still be done with prometheus_client.

Collaborator:

The code highlighted in the original comment

        self.counter_request_success = Counter(
            name="vllm:request_success",
            documentation="Count of successfully processed requests.",
            labelnames=labelnames)

can be removed if we are getting these metrics from prometheus-fastapi-instrumentator instead.

Contributor Author:

Thank you for clarifying. I believe that vllm:request_success remains valuable. It includes a finished_reason label, which allows for counting requests based on their finished reason — either stop if the sequence ends with an EOS token, or length if the sequence length reaches either scheduler_config.max_model_len or sampling_params.max_tokens. I'm open to adjusting its name and description to make it more indicative.

Collaborator:

What do you think of the idea of renaming this to something like vllm:request_info and including n and best_of as labels too? This way we log a single metric from which the user can construct many different visualisations on Grafana by utilising the label filters?

Contributor Author:

While the idea of combining these metrics may seem appealing at first glance, I believe they should be kept separate for the following reasons:

  1. The metrics have different types: vllm:request_success is a Counter, while vllm:request_params_best_of and vllm:request_params_n are Histograms.

  2. Aggregating different labels lacks semantic meaning.

  3. Although merging n and best_of into the same histograms might make sense in this case, as they would share the same buckets, we may encounter scenarios where we need to introduce another metric with different bucket requirements.

  4. This situation differs from the Info metric type, where data is encoded in the label values.

self.histogram_request_prompt_tokens = Histogram(
name="vllm:request_prompt_tokens",
documentation="Number of prefill tokens processed.",
labelnames=labelnames,
buckets=build_1_2_5_buckets(max_model_len),
)
self.histogram_request_generation_tokens = Histogram(
name="vllm:request_generation_tokens",
documentation="Number of generation tokens processed.",
labelnames=labelnames,
buckets=build_1_2_5_buckets(max_model_len),
)
Collaborator:

These two could be constructed from vllm:prompt_tokens_total and vllm:generation_tokens_total using a Binary operation transform in Grafana.

It wouldn't be exactly the same, but it would prevent additional overhead in the server. I.e. if you calculate it in Grafana (and your scrape interval is 1 minute), then it'd be a histogram of how many tokens get processed/generated per minute rather than how many tokens get processed/generated per request.

Contributor Author:

Thanks for your feedback!
You are right. But wouldn't it also be beneficial to have histograms depicting the distribution of prompt length and generation length?

Collaborator:

Since this metric doesn't actually introduce any overhead (because the data from vllm:x_tokens_total is reused), these two are probably fine. It would be interesting to know how big the prompts that users provide are.

Contributor Author:

Exactly! I suggest deprecating the two vllm:x_tokens_total metrics, as they will be included as part of the histogram metrics this PR adds.

Collaborator:

I think we should keep these metrics, because a developer may not want to have to aggregate histogram data in order to get the same effect as vllm:x_tokens_total.

Contributor Author:

Prometheus histograms have this nice feature where in addition to the bucket counters, they include 2 additional counters
suffixed with _sum and _count.

_count is incremented by 1 on every observe, and _sum is incremented by the value of the observation.

Therefore, vllm:prompt_tokens_total is equivalent to vllm:request_prompt_tokens_sum,
and vllm:generation_tokens_total is equivalent to vllm:request_generation_tokens_sum.

Source:
https://www.robustperception.io/how-does-a-prometheus-histogram-work/
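
To make this concrete, here is a standalone sketch using plain prometheus_client (illustrative only; the metric name and values are made up, this is not vLLM code):

    from prometheus_client import CollectorRegistry, Histogram, generate_latest

    registry = CollectorRegistry()
    h = Histogram("request_prompt_tokens",
                  "Number of prefill tokens processed.",
                  buckets=[1, 2, 5, 10, 20, 50, 100],
                  registry=registry)

    for prompt_len in [3, 7, 42]:
        h.observe(prompt_len)

    # Besides the per-bucket counters, the exposition output now contains:
    #   request_prompt_tokens_count 3.0   (one increment per observe)
    #   request_prompt_tokens_sum 52.0    (3 + 7 + 42, i.e. the *_total equivalent)
    print(generate_latest(registry).decode())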

Collaborator (@hmellor, Mar 28, 2024):

Oh I see, thanks for explaining. In that case you could move the vllm:x_tokens_total metrics into the # Legacy metrics section.

Although I think there might be some objection to changing metrics that people are already using in dashboards.

cc @simon-mo @Yard1 @robertgshaw2-neuralmagic (not sure who to ping for metrics related things, so please tell me if I should stop)

Collaborator:

From my point of view, it's fine to duplicate metrics for backward-compatibility reasons.

Contributor Author:

Sure, I'll relocate these metrics to the legacy section. Perhaps in the future, when we're able to make breaking changes, we can consider removing them.

self.histogram_time_to_first_token = Histogram(
name="vllm:time_to_first_token_seconds",
documentation="Histogram of time to first token in seconds.",
@@ -82,6 +99,12 @@ def __init__(self, labelnames: List[str]):
documentation="Histogram of end to end request latency in seconds.",
labelnames=labelnames,
buckets=[1.0, 2.5, 5.0, 10.0, 15.0, 20.0, 30.0, 40.0, 50.0, 60.0])
self.histogram_request_n = Histogram(
name="vllm:request_params_n",
documentation="Histogram of the n request parameter.",
labelnames=labelnames,
buckets=[1, 2, 5, 10, 20],
)
Collaborator:

If we keep what's currently called vllm:request_success, could this be included as another label and the histogram could be constructed on the Grafana side?

Contributor Author:

Great suggestion! I'll incorporate the new metrics into the Grafana dashboard demo.

Collaborator:

This can be resolved now as you've added the Grafana examples


# Legacy metrics
self.gauge_avg_prompt_throughput = Gauge(
@@ -99,6 +122,28 @@ def __init__(self, labelnames: List[str]):
# end-metrics-definitions


def build_1_2_5_buckets(max_value: int):
"""
Builds a list of buckets with increasing powers of 10 multiplied by
mantissa values (1, 2, 5) until the value exceeds the specified maximum.

Example:
>>> build_1_2_5_buckets(100)
[1, 2, 5, 10, 20, 50, 100]
"""
mantissa_lst = [1, 2, 5]
exponent = 0
buckets = []
while True:
for m in mantissa_lst:
value = m * 10**exponent
if value <= max_value:
buckets.append(value)
else:
return buckets
exponent += 1
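
# Hedged illustration (editor's note, not part of the diff): with a
# hypothetical max_model_len of 4096, the helper above stops before 5000:
#   >>> build_1_2_5_buckets(4096)
#   [1, 2, 5, 10, 20, 50, 100, 200, 500, 1000, 2000]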


@dataclass
class Stats:
"""Created by LLMEngine for use by StatLogger."""
@@ -112,8 +157,12 @@ class Stats:
cpu_cache_usage: float

# Raw stats from last model iteration.
finished_reason_counter: CollectionsCounter[str]
num_prompt_tokens: int
num_generation_tokens: int
num_prompt_tokens_lst: List[int]
num_generation_tokens_lst: List[int]
request_n: List[int]
time_to_first_tokens: List[float]
time_per_output_tokens: List[float]
time_e2e_requests: List[float]
@@ -122,7 +171,8 @@
class StatLogger:
"""StatLogger is used LLMEngine to log to Promethus and Stdout."""

def __init__(self, local_interval: float, labels: Dict[str, str]) -> None:
def __init__(self, local_interval: float, labels: Dict[str, str],
max_model_len: int) -> None:
# Metadata for logging locally.
self.last_local_log = time.monotonic()
self.local_interval = local_interval
@@ -133,7 +183,8 @@ def __init__(self, local_interval: float, labels: Dict[str, str]) -> None:

# Prometheus metrics
self.labels = labels
self.metrics = Metrics(labelnames=list(labels.keys()))
self.metrics = Metrics(labelnames=list(labels.keys()),
max_model_len=max_model_len)

def info(self, type: str, obj: object) -> None:
if type == "cache_config":
@@ -165,6 +216,26 @@ def _log_prometheus(self, stats: Stats) -> None:
self.metrics.counter_generation_tokens.labels(**self.labels).inc(
stats.num_generation_tokens)

# Add to request counters.
for finished_reason, count in stats.finished_reason_counter.items():
self.metrics.counter_request_success.labels({
**self.labels,
"finished_reason":
finished_reason,
Collaborator:

Does this not raise an error because finished_reason was not specified when this metric was constructed?

Contributor Author:

Good catch! Although it doesn't raise an error, it behaves unexpectedly by appending the finished_reason to the model_name value.
I'll add a fix for this.

}).inc(count)
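
A hedged sketch of the fix the author mentions (the commit "Register 'finished_reason' label name on metric creation" later addresses this); the exact code here is illustrative:

    # In Metrics.__init__: register the extra label name up front.
    self.counter_request_success = Counter(
        name="vllm:request_success",
        documentation="Count of successfully processed requests.",
        labelnames=labelnames + ["finished_reason"])

    # In StatLogger._log_prometheus: pass every label as a keyword argument.
    self.metrics.counter_request_success.labels(
        **{**self.labels, "finished_reason": finished_reason}).inc(count)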

# Observe number of tokens in histograms.
for val in stats.num_prompt_tokens_lst:
self.metrics.histogram_request_prompt_tokens.labels(
**self.labels).observe(val)
for val in stats.num_generation_tokens_lst:
self.metrics.histogram_request_generation_tokens.labels(
**self.labels).observe(val)

# Observe sampling params in histograms.
for n in stats.request_n:
self.metrics.histogram_request_n.labels(**self.labels).observe(n)

# Observe request level latencies in histograms.
for ttft in stats.time_to_first_tokens:
self.metrics.histogram_time_to_first_token.labels(
9 changes: 3 additions & 6 deletions vllm/entrypoints/openai/api_server.py
@@ -11,7 +11,7 @@
from fastapi.exceptions import RequestValidationError
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import JSONResponse, Response, StreamingResponse
from prometheus_client import make_asgi_app
from prometheus_fastapi_instrumentator import Instrumentator

import vllm
from vllm.engine.arg_utils import AsyncEngineArgs
@@ -45,18 +45,15 @@ async def _force_log():


app = fastapi.FastAPI(lifespan=lifespan)
# Instrument the app with HTTP metrics and expose it on /metrics
Instrumentator().instrument(app).expose(app, endpoint="/metrics")


def parse_args():
parser = make_arg_parser()
return parser.parse_args()


# Add prometheus asgi middleware to route /metrics requests
metrics_app = make_asgi_app()
app.mount("/metrics", metrics_app)


Collaborator:

This is how the metrics defined in vllm/engine/metrics.py are exposed. It can't be removed.

Contributor Author:

I replaced it with the expose() method of prometheus-fastapi-instrumentator, which also exposes a /metrics endpoint.
https://github.com/vllm-project/vllm/pull/2764/files#diff-38318677b76349044192bf70161371c88fb2818b85279d8fc7f2c041d83a9544R48-R49

I noticed it also solves the /metrics/ redirection issue.
Which of the exposing methods should we use?

Collaborator:

> I replaced it with the expose() method of prometheus-fastapi-instrumentator, which also exposes a /metrics endpoint.

While this does expose a /metrics endpoint, none of the vLLM metrics will be in it because they come from make_asgi_app(), right?

Have you confirmed that /metrics still contains vLLM metrics with this code removed?

Contributor Author (@ronensc, Mar 29, 2024):

Yes, I've verified that both approaches expose all metrics. The only discrepancy I've noticed is that expose() from prometheus-fastapi-instrumentator exposes metrics on /metrics, whereas make_asgi_app() exposes them on /metrics/. However, I'll revert to using the make_asgi_app() approach. I find the other method somewhat hacky, as it involves the prometheus-fastapi-instrumentator middleware handling the metrics endpoint. This could look weird if multiple middlewares are in use.
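
For reference, a sketch of the approach being reverted to; it mirrors the lines removed in this diff (see the commit "Revert to exposing /metrics using make_asgi_app()"):

    from prometheus_client import make_asgi_app

    # Add prometheus asgi middleware to route /metrics requests
    metrics_app = make_asgi_app()
    app.mount("/metrics", metrics_app)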

@app.exception_handler(RequestValidationError)
async def validation_exception_handler(_, exc):
err = openai_serving_chat.create_error_response(message=str(exc))