doc: Add metrics documentation and add a 'Reference' section

huggingface · Jul 15, 2024 · 50d6467 · 50d6467
1 parent dbb23fb
commit 50d6467
Show file tree

Hide file tree

Showing 6 changed files with 157 additions and 9 deletions.
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -5,7 +5,7 @@ repos:
     -   id: check-yaml
     -   id: end-of-file-fixer
     -   id: trailing-whitespace
-        exclude: docs/source/basic_tutorials/launcher.md
+        exclude: docs/source/reference/launcher.md
 -   repo: https://github.com/psf/black
     rev: 24.2.0
     hooks:

diff --git a/docs/source/_toctree.yml b/docs/source/_toctree.yml
@@ -17,10 +17,6 @@
     title: Installation from source
   - local: supported_models
     title: Supported Models and Hardware
-  - local: messages_api
-    title: Messages API
-  - local: architecture
-    title: Internal Architecture
   title: Getting started
 - sections:
   - local: basic_tutorials/consuming_tgi
@@ -31,8 +27,6 @@
     title: Serving Private & Gated Models
   - local: basic_tutorials/using_cli
     title: Using TGI CLI
-  - local: basic_tutorials/launcher
-    title: All TGI CLI options
   - local: basic_tutorials/non_core_models
     title: Non-core Model Serving
   - local: basic_tutorials/safety
@@ -46,6 +40,16 @@
   - local: basic_tutorials/train_medusa
     title: Train Medusa
   title: Tutorials
+- sections:
+  - local: architecture
+    title: Internal Architecture
+  - local: reference/launcher
+    title: All TGI CLI options
+  - local: messages_api
+    title: Messages API
+  - local: reference/metrics
+    title: Exported Metrics
+  title: Reference
 - sections:
   - local: conceptual/streaming
     title: Streaming
@@ -62,7 +66,7 @@
   - local: conceptual/speculation
     title: Speculation (Medusa, ngram)
   - local: conceptual/guidance
-    title: How Guidance Works (via outlines
+    title: How Guidance Works (via outlines)
   - local: conceptual/lora
     title: LoRA (Low-Rank Adaptation)
 

diff --git a/docs/source/basic_tutorials/launcher.md → docs/source/reference/launcher.md b/docs/source/basic_tutorials/launcher.md → docs/source/reference/launcher.md
diff --git a/docs/source/reference/metrics.md b/docs/source/reference/metrics.md
@@ -0,0 +1,30 @@
+# Metrics
+
+TGI exposes multiple metrics that can be collected via the `/metrics` Prometheus endpoint.
+These metrics can be used to monitor the performance of TGI, autoscale deployments, and help identify bottlenecks.
+
+The following metrics are exposed:
+
+| Metric Name                                | Description                                                                              | Type      | Unit    |
+|--------------------------------------------|------------------------------------------------------------------------------------------|-----------|---------|
+| `tgi_batch_current_max_tokens`             | Maximum tokens for the current batch                                                     | Gauge     | Count   |
+| `tgi_batch_current_size`                   | Current batch size                                                                       | Gauge     | Count   |
+| `tgi_batch_decode_duration`                | Time spent decoding a batch per method (prefill or decode)                               | Histogram | Seconds |
+| `tgi_batch_filter_duration`                | Time spent filtering batches and sending generated tokens per method (prefill or decode) | Histogram | Seconds |
+| `tgi_batch_forward_duration`               | Batch forward duration per method (prefill or decode)                                    | Histogram | Seconds |
+| `tgi_batch_inference_count`                | Inference calls per method (prefill or decode)                                           | Counter   | Count   |
+| `tgi_batch_inference_duration`             | Batch inference duration                                                                 | Histogram | Seconds |
+| `tgi_batch_inference_success`              | Number of successful inference calls per method (prefill or decode)                      | Counter   | Count   |
+| `tgi_batch_next_size`                      | Batch size of the next batch                                                             | Histogram | Count   |
+| `tgi_queue_size`                           | Current queue size                                                                       | Gauge     | Count   |
+| `tgi_request_count`                        | Total number of requests                                                                 | Counter   | Count   |
+| `tgi_request_duration`                     | Total time spent processing the request (e2e latency)                                    | Histogram | Seconds |
+| `tgi_request_generated_tokens`             | Generated tokens per request                                                             | Histogram | Count   |
+| `tgi_request_inference_duration`           | Request inference duration                                                               | Histogram | Seconds |
+| `tgi_request_input_length`                 | Input token length per request                                                           | Histogram | Count   |
+| `tgi_request_max_new_tokens`               | Maximum new tokens per request                                                           | Histogram | Count   |
+| `tgi_request_mean_time_per_token_duration` | Mean time per token per request (inter-token latency)                                    | Histogram | Seconds |
+| `tgi_request_queue_duration`               | Time spent in the queue per request                                                      | Histogram | Seconds |
+| `tgi_request_skipped_tokens`               | Speculated tokens per request                                                            | Histogram | Count   |
+| `tgi_request_success`                      | Number of successful requests                                                            | Counter   | Count   |
+| `tgi_request_validation_duration`          | Time spent validating the request                                                        | Histogram | Seconds |
diff --git a/router/src/server.rs b/router/src/server.rs
@@ -1727,6 +1727,120 @@ pub async fn run(
         .install_recorder()
         .expect("failed to install metrics recorder");
 
+    // Metrics descriptions
+    metrics::describe_counter!("tgi_request_success", "Number of successful requests");
+    metrics::describe_histogram!(
+        "tgi_request_duration",
+        metrics::Unit::Seconds,
+        "Request duration"
+    );
+    metrics::describe_histogram!(
+        "tgi_request_validation_duration",
+        metrics::Unit::Seconds,
+        "Request validation duration"
+    );
+    metrics::describe_histogram!(
+        "tgi_request_queue_duration",
+        metrics::Unit::Seconds,
+        "Request queue duration"
+    );
+    metrics::describe_histogram!(
+        "tgi_request_inference_duration",
+        metrics::Unit::Seconds,
+        "Request inference duration"
+    );
+    metrics::describe_histogram!(
+        "tgi_request_mean_time_per_token_duration",
+        metrics::Unit::Seconds,
+        "Mean time per token per request"
+    );
+    metrics::describe_histogram!(
+        "tgi_request_generated_tokens",
+        metrics::Unit::Count,
+        "Generated tokens per request"
+    );
+    metrics::describe_counter!(
+        "tgi_batch_inference_count",
+        metrics::Unit::Count,
+        "Inference calls per method (prefill or decode)"
+    );
+    metrics::describe_counter!(
+        "tgi_request_count",
+        metrics::Unit::Count,
+        "Total number of requests"
+    );
+    metrics::describe_counter!(
+        "tgi_batch_inference_success",
+        metrics::Unit::Count,
+        "Number of successful inference calls per method (prefill or decode)"
+    );
+    metrics::describe_gauge!(
+        "tgi_batch_current_size",
+        metrics::Unit::Count,
+        "Current batch size"
+    );
+    metrics::describe_gauge!("tgi_queue_size", metrics::Unit::Count, "Current queue size");
+    metrics::describe_gauge!(
+        "tgi_batch_current_max_tokens",
+        metrics::Unit::Count,
+        "Maximum tokens for the current batch"
+    );
+    metrics::describe_histogram!(
+        "tgi_request_max_new_tokens",
+        metrics::Unit::Count,
+        "Maximum new tokens per request"
+    );
+    metrics::describe_histogram!(
+        "tgi_batch_inference_duration",
+        metrics::Unit::Seconds,
+        "Batch inference duration"
+    );
+    metrics::describe_histogram!(
+        "tgi_batch_forward_duration",
+        metrics::Unit::Seconds,
+        "Batch forward duration per method (prefill or decode)"
+    );
+    metrics::describe_histogram!(
+        "tgi_request_skipped_tokens",
+        metrics::Unit::Count,
+        "Speculated tokens per request"
+    );
+    metrics::describe_histogram!(
+        "tgi_batch_filter_duration",
+        metrics::Unit::Seconds,
+        "Time spent filtering batches and sending generated tokens per method (prefill or decode)"
+    );
+    metrics::describe_histogram!(
+        "tgi_request_queue_duration",
+        metrics::Unit::Seconds,
+        "Time spent in the queue per request"
+    );
+    metrics::describe_histogram!(
+        "tgi_request_validation_duration",
+        metrics::Unit::Seconds,
+        "Time spent validating the request"
+    );
+    metrics::describe_histogram!(
+        "tgi_request_duration",
+        metrics::Unit::Seconds,
+        "Total time spent processing the request"
+    );
+    metrics::describe_histogram!(
+        "tgi_batch_decode_duration",
+        metrics::Unit::Seconds,
+        "Time spent decoding a batch per method (prefill or decode)"
+    );
+    metrics::describe_histogram!(
+        "tgi_request_input_length",
+        metrics::Unit::Count,
+        "Input token length per request"
+    );
+    metrics::describe_histogram!(
+        "tgi_batch_next_size",
+        metrics::Unit::Count,
+        "Batch size of the next batch"
+    );
+
     // CORS layer
     let allow_origin = allow_origin.unwrap_or(AllowOrigin::any());
     let cors_layer = CorsLayer::new()

diff --git a/update_doc.py b/update_doc.py
@@ -63,7 +63,7 @@ def check_cli(check: bool):
     final_doc += f"## {header}\n```shell\n{rendered_block}\n```\n"
     block = []
 
-    filename = "docs/source/basic_tutorials/launcher.md"
+    filename = "docs/source/reference/launcher.md"
     if check:
         with open(filename, "r") as f:
             doc = f.read()