Skip to content

Commit

Permalink
doc: Add metrics documentation and add a 'Reference' section
Browse files Browse the repository at this point in the history
  • Loading branch information
Hugoch committed Jul 15, 2024
1 parent dbb23fb commit 50d6467
Show file tree
Hide file tree
Showing 6 changed files with 157 additions and 9 deletions.
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ repos:
- id: check-yaml
- id: end-of-file-fixer
- id: trailing-whitespace
exclude: docs/source/basic_tutorials/launcher.md
exclude: docs/source/reference/launcher.md
- repo: https://github.com/psf/black
rev: 24.2.0
hooks:
Expand Down
18 changes: 11 additions & 7 deletions docs/source/_toctree.yml
Original file line number Diff line number Diff line change
Expand Up @@ -17,10 +17,6 @@
title: Installation from source
- local: supported_models
title: Supported Models and Hardware
- local: messages_api
title: Messages API
- local: architecture
title: Internal Architecture
title: Getting started
- sections:
- local: basic_tutorials/consuming_tgi
Expand All @@ -31,8 +27,6 @@
title: Serving Private & Gated Models
- local: basic_tutorials/using_cli
title: Using TGI CLI
- local: basic_tutorials/launcher
title: All TGI CLI options
- local: basic_tutorials/non_core_models
title: Non-core Model Serving
- local: basic_tutorials/safety
Expand All @@ -46,6 +40,16 @@
- local: basic_tutorials/train_medusa
title: Train Medusa
title: Tutorials
- sections:
- local: architecture
title: Internal Architecture
- local: reference/launcher
title: All TGI CLI options
- local: messages_api
title: Messages API
- local: reference/metrics
title: Exported Metrics
title: Reference
- sections:
- local: conceptual/streaming
title: Streaming
Expand All @@ -62,7 +66,7 @@
- local: conceptual/speculation
title: Speculation (Medusa, ngram)
- local: conceptual/guidance
title: How Guidance Works (via outlines
title: How Guidance Works (via outlines)
- local: conceptual/lora
title: LoRA (Low-Rank Adaptation)

Expand Down
File renamed without changes.
30 changes: 30 additions & 0 deletions docs/source/reference/metrics.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
# Metrics

TGI exposes multiple metrics that can be collected via the `/metrics` Prometheus endpoint.
These metrics can be used to monitor the performance of TGI, autoscale deployment and to help identify bottlenecks.

The following metrics are exposed:

| Metric Name | Description | Type | Unit |
|--------------------------------------------|------------------------------------------------------------------------------------------|-----------|---------|
| `tgi_batch_current_max_tokens` | Maximum tokens for the current batch | Gauge | Count |
| `tgi_batch_current_size` | Current batch size | Gauge | Count |
| `tgi_batch_decode_duration` | Time spent decoding a batch per method (prefill or decode) | Histogram | Seconds |
| `tgi_batch_filter_duration` | Time spent filtering batches and sending generated tokens per method (prefill or decode) | Histogram | Seconds |
| `tgi_batch_forward_duration` | Batch forward duration per method (prefill or decode) | Histogram | Seconds |
| `tgi_batch_inference_count` | Inference calls per method (prefill or decode) | Counter | Count |
| `tgi_batch_inference_duration` | Batch inference duration | Histogram | Seconds |
| `tgi_batch_inference_success` | Number of successful inference calls per method (prefill or decode) | Counter | Count |
| `tgi_batch_next_size` | Batch size of the next batch | Histogram | Count |
| `tgi_queue_size` | Current queue size | Gauge | Count |
| `tgi_request_count` | Total number of requests | Counter | Count |
| `tgi_request_duration` | Total time spent processing the request (e2e latency) | Histogram | Seconds |
| `tgi_request_generated_tokens` | Generated tokens per request | Histogram | Count |
| `tgi_request_inference_duration` | Request inference duration | Histogram | Seconds |
| `tgi_request_input_length` | Input token length per request | Histogram | Count |
| `tgi_request_max_new_tokens` | Maximum new tokens per request | Histogram | Count |
| `tgi_request_mean_time_per_token_duration` | Mean time per token per request (inter-token latency) | Histogram | Seconds |
| `tgi_request_queue_duration` | Time spent in the queue per request | Histogram | Seconds |
| `tgi_request_skipped_tokens` | Speculated tokens per request | Histogram | Count |
| `tgi_request_success` | Number of successful requests | Counter | |
| `tgi_request_validation_duration` | Time spent validating the request | Histogram | Seconds |
114 changes: 114 additions & 0 deletions router/src/server.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1727,6 +1727,120 @@ pub async fn run(
.install_recorder()
.expect("failed to install metrics recorder");

// Metrics descriptions
metrics::describe_counter!("tgi_request_success", "Number of successful requests");
metrics::describe_histogram!(
"tgi_request_duration",
metrics::Unit::Seconds,
"Request duration"
);
metrics::describe_histogram!(
"tgi_request_validation_duration",
metrics::Unit::Seconds,
"Request validation duration"
);
metrics::describe_histogram!(
"tgi_request_queue_duration",
metrics::Unit::Seconds,
"Request queue duration"
);
metrics::describe_histogram!(
"tgi_request_inference_duration",
metrics::Unit::Seconds,
"Request inference duration"
);
metrics::describe_histogram!(
"tgi_request_mean_time_per_token_duration",
metrics::Unit::Seconds,
"Mean time per token per request"
);
metrics::describe_histogram!(
"tgi_request_generated_tokens",
metrics::Unit::Count,
"Generated tokens per request"
);
metrics::describe_counter!(
"tgi_batch_inference_count",
metrics::Unit::Count,
"Inference calls per method (prefill or decode)"
);
metrics::describe_counter!(
"tgi_request_count",
metrics::Unit::Count,
"Total number of requests"
);
metrics::describe_counter!(
"tgi_batch_inference_success",
metrics::Unit::Count,
"Number of successful inference calls per method (prefill or decode)"
);
metrics::describe_gauge!(
"tgi_batch_current_size",
metrics::Unit::Count,
"Current batch size"
);
metrics::describe_gauge!("tgi_queue_size", metrics::Unit::Count, "Current queue size");
metrics::describe_gauge!(
"tgi_batch_current_max_tokens",
metrics::Unit::Count,
"Maximum tokens for the current batch"
);
metrics::describe_histogram!(
"tgi_request_max_new_tokens",
metrics::Unit::Count,
"Maximum new tokens per request"
);
metrics::describe_histogram!(
"tgi_batch_inference_duration",
metrics::Unit::Seconds,
"Batch inference duration"
);
metrics::describe_histogram!(
"tgi_batch_forward_duration",
metrics::Unit::Seconds,
"Batch forward duration per method (prefill or decode)"
);
metrics::describe_histogram!(
"tgi_request_skipped_tokens",
metrics::Unit::Count,
"Speculated tokens per request"
);
metrics::describe_histogram!(
"tgi_batch_filter_duration",
metrics::Unit::Seconds,
"Time spent filtering batches and sending generated tokens per method (prefill or decode)"
);
metrics::describe_histogram!(
"tgi_request_queue_duration",
metrics::Unit::Seconds,
"Time spent in the queue per request"
);
metrics::describe_histogram!(
"tgi_request_validation_duration",
metrics::Unit::Seconds,
"Time spent validating the request"
);
metrics::describe_histogram!(
"tgi_request_duration",
metrics::Unit::Seconds,
"Total time spent processing the request"
);
metrics::describe_histogram!(
"tgi_batch_decode_duration",
metrics::Unit::Seconds,
"Time spent decoding a batch per method (prefill or decode)"
);
metrics::describe_histogram!(
"tgi_request_input_length",
metrics::Unit::Count,
"Input token length per request"
);
metrics::describe_histogram!(
"tgi_batch_next_size",
metrics::Unit::Count,
"Batch size of the next batch"
);

// CORS layer
let allow_origin = allow_origin.unwrap_or(AllowOrigin::any());
let cors_layer = CorsLayer::new()
Expand Down
2 changes: 1 addition & 1 deletion update_doc.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@ def check_cli(check: bool):
final_doc += f"## {header}\n```shell\n{rendered_block}\n```\n"
block = []

filename = "docs/source/basic_tutorials/launcher.md"
filename = "docs/source/reference/launcher.md"
if check:
with open(filename, "r") as f:
doc = f.read()
Expand Down

0 comments on commit 50d6467

Please sign in to comment.