Add more Prometheus metrics #2764
Changes from 22 commits
@@ -1,4 +1,5 @@
 import time
+from collections import Counter as CollectionsCounter
 from typing import Iterable, List, Optional, Tuple, Type, Union

 from transformers import PreTrainedTokenizer
@@ -121,7 +122,8 @@ def __init__(
         if self.log_stats:
             self.stat_logger = StatLogger(
                 local_interval=_LOCAL_LOGGING_INTERVAL_SEC,
-                labels=dict(model_name=model_config.model))
+                labels=dict(model_name=model_config.model),
+                max_model_len=self.model_config.max_model_len)
             self.stat_logger.info("cache_config", self.cache_config)

     @classmethod
@@ -673,24 +675,42 @@ def _get_stats(self,
         # Iteration stats if we have scheduler output.
         num_prompt_tokens = 0
         num_generation_tokens = 0
+        num_prompt_tokens_lst = []
+        num_generation_tokens_lst = []
+        request_n = []
         time_to_first_tokens = []
        time_per_output_tokens = []
        time_e2e_requests = []
+        finished_reason_counter = CollectionsCounter()
         if scheduler_outputs is not None:
             prompt_run = scheduler_outputs.prompt_run

-            # Number of Tokens.
+            # Number of Tokens
             if prompt_run:
-                num_prompt_tokens = sum(
+                num_prompt_tokens_lst = [
                     len(seq_group.prompt_token_ids)
-                    for seq_group in scheduler_outputs.scheduled_seq_groups)
+                    for seq_group in scheduler_outputs.scheduled_seq_groups
+                ]
+                num_prompt_tokens = sum(num_prompt_tokens_lst)

This is actually an incorrect calculation with chunked prefill. So what I would suggest is that we log the lengths of the generation and the prompt once the sequence group finishes. This way we know that we are logging the full prefill length and that we are logging one item per request instead of one per iteration.

                 num_generation_tokens = sum(
                     seq_group.num_seqs()
                     for seq_group in scheduler_outputs.scheduled_seq_groups)
             else:
                 num_generation_tokens = scheduler_outputs.num_batched_tokens
+                num_generation_tokens_lst = [
+                    seq.get_output_len()
+                    for seq_group in scheduler_outputs.scheduled_seq_groups
+                    for seq in seq_group.get_finished_seqs()
+                ]
+
+            # Sampling Params
+            if prompt_run:
+                request_n = [
+                    seq_group.sampling_params.n
+                    for seq_group in scheduler_outputs.scheduled_seq_groups
+                ]

Same issue related to chunked prefill: if our prefill has N chunks, we will create N histogram entries.

-            # Latency Timings.
+            # Latency Timings
             time_last_iters = []
             for seq_group in scheduler_outputs.scheduled_seq_groups:
                 # Time since last token.
@@ -704,15 +724,28 @@ def _get_stats(self,
             time_to_first_tokens = time_last_iters if prompt_run else []

These are also now incorrect because of chunked prefill.

             time_per_output_tokens = [] if prompt_run else time_last_iters

+            # Finished Requests

It's a bit hard to follow where we have 5-6 loops that loop through the same list of scheduled_seq_groups (see the single-pass sketch after this hunk).

+            for seq_group in scheduler_outputs.scheduled_seq_groups:
+                if not seq_group.is_finished():
+                    continue
+                finished_reason_counter += CollectionsCounter([
+                    SequenceStatus.get_finished_reason(seq.status)
+                    for seq in seq_group.get_finished_seqs()
+                ])
+
         return Stats(
             now=now,
             num_running=num_running,
             num_swapped=num_swapped,
             num_waiting=num_waiting,
             gpu_cache_usage=gpu_cache_usage,
             cpu_cache_usage=cpu_cache_usage,
+            finished_reason_counter=finished_reason_counter,
             num_prompt_tokens=num_prompt_tokens,
             num_generation_tokens=num_generation_tokens,
+            num_prompt_tokens_lst=num_prompt_tokens_lst,
+            num_generation_tokens_lst=num_generation_tokens_lst,
+            request_n=request_n,
             time_to_first_tokens=time_to_first_tokens,
             time_per_output_tokens=time_per_output_tokens,
             time_e2e_requests=time_e2e_requests,
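To illustrate the two review suggestions above (collapse the repeated loops over scheduled_seq_groups into a single pass, and record per-request token counts only once a sequence group has finished), here is a rough sketch. It is not the code that was merged in this PR: the helper name and the import path for SequenceStatus are assumptions, and the attribute accesses simply mirror the names visible in the diff.

```python
from collections import Counter as CollectionsCounter

from vllm.sequence import SequenceStatus  # import path assumed


def _collect_iteration_stats(scheduler_outputs, prompt_run: bool):
    """Gather the raw per-iteration stats in one pass over the scheduled
    sequence groups instead of several separate loops (hypothetical helper)."""
    finished_reason_counter = CollectionsCounter()
    num_prompt_tokens_lst, num_generation_tokens_lst, request_n = [], [], []

    for seq_group in scheduler_outputs.scheduled_seq_groups:
        if prompt_run:
            num_prompt_tokens_lst.append(len(seq_group.prompt_token_ids))
            request_n.append(seq_group.sampling_params.n)
        if seq_group.is_finished():
            finished = seq_group.get_finished_seqs()
            # One observation per finished request, covering the full
            # generation length, regardless of how the prefill was scheduled.
            num_generation_tokens_lst.extend(
                seq.get_output_len() for seq in finished)
            finished_reason_counter.update(
                SequenceStatus.get_finished_reason(seq.status)
                for seq in finished)

    return (finished_reason_counter, num_prompt_tokens_lst,
            num_generation_tokens_lst, request_n)
```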
@@ -1,5 +1,6 @@
 import time
 from dataclasses import dataclass
+from typing import Counter as CollectionsCounter
 from typing import Dict, List

 import numpy as np
@@ -19,7 +20,7 @@
 # begin-metrics-definitions
 class Metrics:

-    def __init__(self, labelnames: List[str]):
+    def __init__(self, labelnames: List[str], max_model_len: int):
         # Unregister any existing vLLM collectors
         for collector in list(REGISTRY._collector_to_names):
             if hasattr(collector, "_name") and "vllm" in collector._name:
@@ -61,6 +62,22 @@ def __init__(self, labelnames: List[str]):
             name="vllm:generation_tokens_total",
             documentation="Number of generation tokens processed.",
             labelnames=labelnames)
+        self.counter_request_success = Counter(
+            name="vllm:request_success",
+            documentation="Count of successfully processed requests.",
+            labelnames=labelnames)
This isn't just counting successful responses, it's counting all finish reasons. If you could find an elegant way to implement the counters we lost when switching from [...]

A quick option to add HTTP-related metrics would be to use prometheus-fastapi-instrumentator. This involves installing the package and then adding the following two lines after the app creation (see vllm/vllm/entrypoints/openai/api_server.py, lines 47 to 48 in 8af890a). This will add a set of standard HTTP metrics. Should I add it to the PR?

I like this solution, it saves us from reinventing the wheel in vLLM.

done :)

Nice, if we're going to be using prometheus-fastapi-instrumentator, then I think this implementation can be removed.

Sorry, I'm not sure I'm following. Which implementation are you referring to that should be removed?

The code highlighted in the original comment, i.e. the counter_request_success = Counter(name="vllm:request_success", documentation="Count of successfully processed requests.", labelnames=labelnames) block above, can be removed if we are getting these metrics from the instrumentator.

Thank you for clarifying. I believe that [...]

What do you think of the idea of renaming this to something like [...]?

While the idea of combining these metrics may seem appealing at first glance, I believe they should be kept separate for the following reasons: [...]
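For reference, here is a minimal, self-contained sketch of the approach proposed above, reconstructed from the api_server.py diff further down. The bare FastAPI app is a stand-in for vLLM's actual server; only the two Instrumentator lines come from the PR itself.

```python
# pip install prometheus-fastapi-instrumentator
import fastapi
from prometheus_fastapi_instrumentator import Instrumentator

app = fastapi.FastAPI()  # stand-in for the real vLLM app

# Instrument the app with HTTP-level metrics (request counts, latencies,
# payload sizes) and expose them on /metrics together with whatever is
# already registered in prometheus_client's default registry.
Instrumentator().instrument(app).expose(app, endpoint="/metrics")
```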
+        self.histogram_request_prompt_tokens = Histogram(
+            name="vllm:request_prompt_tokens",
+            documentation="Number of prefill tokens processed.",
+            labelnames=labelnames,
+            buckets=build_1_2_5_buckets(max_model_len),
+        )
+        self.histogram_request_generation_tokens = Histogram(
+            name="vllm:request_generation_tokens",
+            documentation="Number of generation tokens processed.",
+            labelnames=labelnames,
+            buckets=build_1_2_5_buckets(max_model_len),
+        )
These two could be constructed using [the existing token counters in Grafana]. It wouldn't be exactly the same, but it would prevent additional overhead in the server. I.e. if you calculate it on Grafana (and your scrape interval is 1 minute), then it'd be a histogram of how many tokens get processed/generated per minute rather than how many tokens get processed/generated per request.

Thanks for your feedback! [...]

Since this metric doesn't actually introduce any overhead (because the data from [...] is already being collected), [...]

Exactly! I suggest we deprecate the two [existing token counters].

I think we should keep these metrics, because a developer may not want to have to aggregate histogram data in order to get the same effect as [the counters].

Prometheus histograms have this nice feature where, in addition to the bucket counters, they include two additional counters, _count and _sum. Therefore, [the separate counters carry no extra information]. Source: [...]

Oh I see, thanks for explaining. In that case you could move the [counters to the legacy section]. Although I think there might be some objection to changing metrics that people are already using in dashboards. cc @simon-mo @Yard1 @robertgshaw2-neuralmagic (not sure who to ping for metrics related things, so please tell me if I should stop)

From my point of view, it's fine to duplicate metrics for backward compatibility reasons.

Sure, I'll relocate these metrics to the legacy section. Perhaps in the future, when we're able to make breaking changes, we can consider removing them.
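To make the _count/_sum point concrete, here is a small self-contained sketch using prometheus_client directly; the metric name and observed values are invented for illustration. Every histogram exports a _sum and a _count series alongside its buckets, so a query such as rate(vllm:request_prompt_tokens_sum[1m]) would give a tokens-per-second rate without a dedicated counter.

```python
from prometheus_client import Histogram, generate_latest

# Hypothetical stand-in for vllm:request_prompt_tokens.
request_prompt_tokens = Histogram(
    name="demo_request_prompt_tokens",
    documentation="Prefill tokens per request (demo only).",
    buckets=[1, 2, 5, 10, 20, 50, 100],
)

for tokens in (7, 42, 90):  # three pretend requests
    request_prompt_tokens.observe(tokens)

# Besides the per-bucket series, the exposition now contains:
#   demo_request_prompt_tokens_count 3.0   (number of observations/requests)
#   demo_request_prompt_tokens_sum 139.0   (total prompt tokens)
# which is the same information a separate prompt-token Counter would carry.
print(generate_latest().decode())
```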
         self.histogram_time_to_first_token = Histogram(
             name="vllm:time_to_first_token_seconds",
             documentation="Histogram of time to first token in seconds.",

@@ -82,6 +99,12 @@ def __init__(self, labelnames: List[str]):
             documentation="Histogram of end to end request latency in seconds.",
             labelnames=labelnames,
             buckets=[1.0, 2.5, 5.0, 10.0, 15.0, 20.0, 30.0, 40.0, 50.0, 60.0])
+        self.histogram_request_n = Histogram(
+            name="vllm:request_params_n",
+            documentation="Histogram of the n request parameter.",
+            labelnames=labelnames,
+            buckets=[1, 2, 5, 10, 20],
+        )

If we keep what's currently called [...], it would be good to show the new metrics in the Grafana dashboard as well.

Great suggestion! I'll incorporate the new metrics into the Grafana dashboard demo.

This can be resolved now as you've added the Grafana examples.

         # Legacy metrics
         self.gauge_avg_prompt_throughput = Gauge(
@@ -99,6 +122,28 @@ def __init__(self, labelnames: List[str]):
 # end-metrics-definitions


+def build_1_2_5_buckets(max_value: int):
+    """
+    Builds a list of buckets with increasing powers of 10 multiplied by
+    mantissa values (1, 2, 5) until the value exceeds the specified maximum.
+
+    Example:
+    >>> build_1_2_5_buckets(100)
+    [1, 2, 5, 10, 20, 50, 100]
+    """
+    mantissa_lst = [1, 2, 5]
+    exponent = 0
+    buckets = []
+    while True:
+        for m in mantissa_lst:
+            value = m * 10**exponent
+            if value <= max_value:
+                buckets.append(value)
+            else:
+                return buckets
+        exponent += 1
+
+
 @dataclass
 class Stats:
     """Created by LLMEngine for use by StatLogger."""

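As a quick illustration of how the request-size histograms above are bucketed, this is what the helper returns for a hypothetical max_model_len of 4096 (the value is just an example, and the import path is an assumption):

```python
from vllm.engine.metrics import build_1_2_5_buckets  # module path assumed

print(build_1_2_5_buckets(4096))
# [1, 2, 5, 10, 20, 50, 100, 200, 500, 1000, 2000]
# These are the buckets vllm:request_prompt_tokens and
# vllm:request_generation_tokens would be created with for that model length.
```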
@@ -112,8 +157,12 @@ class Stats:
     cpu_cache_usage: float

     # Raw stats from last model iteration.
+    finished_reason_counter: CollectionsCounter[str]
     num_prompt_tokens: int
     num_generation_tokens: int
+    num_prompt_tokens_lst: List[int]
+    num_generation_tokens_lst: List[int]
+    request_n: List[int]
     time_to_first_tokens: List[float]
     time_per_output_tokens: List[float]
     time_e2e_requests: List[float]
@@ -122,7 +171,8 @@ class Stats:
 class StatLogger:
     """StatLogger is used LLMEngine to log to Promethus and Stdout."""

-    def __init__(self, local_interval: float, labels: Dict[str, str]) -> None:
+    def __init__(self, local_interval: float, labels: Dict[str, str],
+                 max_model_len: int) -> None:
         # Metadata for logging locally.
         self.last_local_log = time.monotonic()
         self.local_interval = local_interval
@@ -133,7 +183,8 @@ def __init__(self, local_interval: float, labels: Dict[str, str]) -> None:

         # Prometheus metrics
         self.labels = labels
-        self.metrics = Metrics(labelnames=list(labels.keys()))
+        self.metrics = Metrics(labelnames=list(labels.keys()),
+                               max_model_len=max_model_len)

     def info(self, type: str, obj: object) -> None:
         if type == "cache_config":
@@ -165,6 +216,26 @@ def _log_prometheus(self, stats: Stats) -> None:
         self.metrics.counter_generation_tokens.labels(**self.labels).inc(
             stats.num_generation_tokens)

+        # Add to request counters.
+        for finished_reason, count in stats.finished_reason_counter.items():
+            self.metrics.counter_request_success.labels({
+                **self.labels,
+                "finished_reason":
+                finished_reason,
+            }).inc(count)

Does this not raise an error, because [...]?

Good catch! Although it doesn't raise an error, it behaves unexpectedly by appending the [whole dict as a single label value].
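A minimal sketch of one way to get the intended behaviour, assuming finished_reason is added to the metric's labelnames (which is not what the highlighted lines above do): prometheus_client's labels() expects the values as positional or keyword arguments, so the dict has to be unpacked rather than passed as a single argument. The model name used here is made up.

```python
from prometheus_client import Counter

counter_request_success = Counter(
    name="vllm:request_success",
    documentation="Count of successfully processed requests.",
    # "finished_reason" must be declared here for labels() to accept it.
    labelnames=["model_name", "finished_reason"],
)

labels = {"model_name": "example-model"}            # hypothetical base labels
finished_reason_counter = {"stop": 3, "length": 1}  # example finish reasons

for finished_reason, count in finished_reason_counter.items():
    # Unpack into keyword arguments instead of passing one dict positionally.
    counter_request_success.labels(
        **labels, finished_reason=finished_reason).inc(count)
```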
+
+        # Observe number of tokens in histograms.
+        for val in stats.num_prompt_tokens_lst:
+            self.metrics.histogram_request_prompt_tokens.labels(
+                **self.labels).observe(val)
+        for val in stats.num_generation_tokens_lst:
+            self.metrics.histogram_request_generation_tokens.labels(
+                **self.labels).observe(val)
+
+        # Observe sampling params in histograms.
+        for n in stats.request_n:
+            self.metrics.histogram_request_n.labels(**self.labels).observe(n)
+
         # Observe request level latencies in histograms.
         for ttft in stats.time_to_first_tokens:
             self.metrics.histogram_time_to_first_token.labels(
@@ -11,7 +11,7 @@
 from fastapi.exceptions import RequestValidationError
 from fastapi.middleware.cors import CORSMiddleware
 from fastapi.responses import JSONResponse, Response, StreamingResponse
-from prometheus_client import make_asgi_app
+from prometheus_fastapi_instrumentator import Instrumentator

 import vllm
 from vllm.engine.arg_utils import AsyncEngineArgs
@@ -45,18 +45,15 @@ async def _force_log():


 app = fastapi.FastAPI(lifespan=lifespan)
+# Instrument the app with HTTP metrics and expose it on /metrics
+Instrumentator().instrument(app).expose(app, endpoint="/metrics")


 def parse_args():
     parser = make_arg_parser()
     return parser.parse_args()


-# Add prometheus asgi middleware to route /metrics requests
-metrics_app = make_asgi_app()
-app.mount("/metrics", metrics_app)
-
-

This is how the metrics defined in [...] get exposed, [...]

I replaced it with the [Instrumentator's expose()]. I noticed it also solves the [...]

While this does expose a [/metrics endpoint], have you confirmed that [...]?

Yes, I've verified that both approaches expose all metrics. The only discrepancy I've noticed is that [...]

 @app.exception_handler(RequestValidationError)
 async def validation_exception_handler(_, exc):
     err = openai_serving_chat.create_error_response(message=str(exc))
Please move this one to be just after the `# Finished Requests` comment.

Is there any reason this can't just be a list of finished reasons that we process on the `stat_logger` side?

I don't think so, and it would make the stats logging code in the engine a bit cleaner.

If I move `finished_reason_counter = CollectionsCounter()` after `# Finished Requests`, it might be undefined when accessed later, for the same reason as `request_n` and `request_best_of`. Please refer to that comment for details. Done.

Ah, in the diff view I hadn't realised this was nested, it's ok to leave as is then.
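If the list-of-finished-reasons route mentioned above had been taken, the split could look roughly like the sketch below: the engine only gathers raw strings, and the aggregation happens in the stat logger. This is only an illustration of the idea, not code from the PR; the function names are invented and the SequenceStatus import path is assumed.

```python
from collections import Counter as CollectionsCounter
from typing import List

from vllm.sequence import SequenceStatus  # import path assumed


def collect_finished_reasons(scheduler_outputs) -> List[str]:
    """Engine side: gather the raw finish reasons for this iteration."""
    return [
        SequenceStatus.get_finished_reason(seq.status)
        for seq_group in scheduler_outputs.scheduled_seq_groups
        if seq_group.is_finished()
        for seq in seq_group.get_finished_seqs()
    ]


def log_finished_reasons(metrics, labels, finished_reasons: List[str]) -> None:
    """StatLogger side: aggregate and bump the Prometheus counter."""
    for reason, count in CollectionsCounter(finished_reasons).items():
        metrics.counter_request_success.labels(
            **labels, finished_reason=reason).inc(count)
```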