diff --git a/.github/workflows/build_pr_documentation.yaml b/.github/workflows/build_pr_documentation.yaml index bf03bfdf362..a5ce39a5f5e 100644 --- a/.github/workflows/build_pr_documentation.yaml +++ b/.github/workflows/build_pr_documentation.yaml @@ -11,7 +11,7 @@ concurrency: jobs: build: - uses: huggingface/doc-builder/.github/workflows/build_pr_documentation.yaml@main + uses: huggingface/doc-builder/.github/workflows/build_pr_documentation.yml@main with: commit_sha: ${{ github.event.pull_request.head.sha }} pr_number: ${{ github.event.number }} diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 6f5e685ea8e..0c8b6885483 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -5,7 +5,7 @@ repos: - id: check-yaml - id: end-of-file-fixer - id: trailing-whitespace - exclude: docs/source/basic_tutorials/launcher.md + exclude: docs/source/reference/launcher.md - repo: https://github.com/psf/black rev: 24.2.0 hooks: diff --git a/docs/source/_toctree.yml b/docs/source/_toctree.yml index e97c00aa260..f52fa2ec2a5 100644 --- a/docs/source/_toctree.yml +++ b/docs/source/_toctree.yml @@ -17,8 +17,6 @@ title: Installation from source - local: supported_models title: Supported Models and Hardware - - local: messages_api - title: Messages API - local: architecture title: Internal Architecture - local: usage_statistics @@ -33,8 +31,6 @@ title: Serving Private & Gated Models - local: basic_tutorials/using_cli title: Using TGI CLI - - local: basic_tutorials/launcher - title: All TGI CLI options - local: basic_tutorials/non_core_models title: Non-core Model Serving - local: basic_tutorials/safety @@ -48,6 +44,14 @@ - local: basic_tutorials/train_medusa title: Train Medusa title: Tutorials +- sections: + - local: reference/launcher + title: All TGI CLI options + - local: reference/metrics + title: Exported Metrics + - local: reference/api_reference + title: API Reference + title: Reference - sections: - local: conceptual/streaming title: Streaming @@ 
-64,7 +68,7 @@ - local: conceptual/speculation title: Speculation (Medusa, ngram) - local: conceptual/guidance - title: How Guidance Works (via outlines + title: How Guidance Works (via outlines) - local: conceptual/lora title: LoRA (Low-Rank Adaptation) diff --git a/docs/source/messages_api.md b/docs/source/reference/api_reference.md similarity index 84% rename from docs/source/messages_api.md rename to docs/source/reference/api_reference.md index 250aaae2249..52043c80f8a 100644 --- a/docs/source/messages_api.md +++ b/docs/source/reference/api_reference.md @@ -1,17 +1,30 @@ -# Messages API +# HTTP API Reference -Text Generation Inference (TGI) now supports the Messages API, which is fully compatible with the OpenAI Chat Completion API. This feature is available starting from version 1.4.0. You can use OpenAI's client libraries or third-party libraries expecting OpenAI schema to interact with TGI's Messages API. Below are some examples of how to utilize this compatibility. +#### Table of Contents -> **Note:** The Messages API is supported from TGI version 1.4.0 and above. Ensure you are using a compatible version to access this feature. +- [Text Generation Inference custom API](#text-generation-inference-custom-api) +- [OpenAI Messages API](#openai-messages-api) + - [Making a Request](#making-a-request) + - [Streaming](#streaming) + - [Synchronous](#synchronous) + - [Hugging Face Inference Endpoints](#hugging-face-inference-endpoints) + - [Cloud Providers](#cloud-providers) + - [Amazon SageMaker](#amazon-sagemaker) -#### Table of Contents +The HTTP API is a RESTful API that allows you to interact with the text-generation-inference component. 
Two endpoints are available: +* Text Generation Inference [custom API](https://huggingface.github.io/text-generation-inference/) +* OpenAI's [Messages API](#openai-messages-api) + + +## Text Generation Inference custom API -- [Making a Request](#making-a-request) -- [Streaming](#streaming) -- [Synchronous](#synchronous) -- [Hugging Face Inference Endpoints](#hugging-face-inference-endpoints) -- [Cloud Providers](#cloud-providers) - - [Amazon SageMaker](#amazon-sagemaker) +Check the [API documentation](https://huggingface.github.io/text-generation-inference/) for more information on how to interact with the Text Generation Inference API. + +## OpenAI Messages API + +Text Generation Inference (TGI) now supports the Messages API, which is fully compatible with the OpenAI Chat Completion API. This feature is available starting from version 1.4.0. You can use OpenAI's client libraries or third-party libraries expecting OpenAI schema to interact with TGI's Messages API. Below are some examples of how to utilize this compatibility. + +> **Note:** The Messages API is supported from TGI version 1.4.0 and above. Ensure you are using a compatible version to access this feature. ## Making a Request diff --git a/docs/source/basic_tutorials/launcher.md b/docs/source/reference/launcher.md similarity index 100% rename from docs/source/basic_tutorials/launcher.md rename to docs/source/reference/launcher.md diff --git a/docs/source/reference/metrics.md b/docs/source/reference/metrics.md new file mode 100644 index 00000000000..d34d38eab7d --- /dev/null +++ b/docs/source/reference/metrics.md @@ -0,0 +1,30 @@ +# Metrics + +TGI exposes multiple metrics that can be collected via the `/metrics` Prometheus endpoint. +These metrics can be used to monitor the performance of TGI, autoscale deployments, and help identify bottlenecks. 
+ +The following metrics are exposed: + +| Metric Name | Description | Type | Unit | +|--------------------------------------------|------------------------------------------------------------------------------------------|-----------|---------| +| `tgi_batch_current_max_tokens` | Maximum tokens for the current batch | Gauge | Count | +| `tgi_batch_current_size` | Current batch size | Gauge | Count | +| `tgi_batch_decode_duration` | Time spent decoding a batch per method (prefill or decode) | Histogram | Seconds | +| `tgi_batch_filter_duration` | Time spent filtering batches and sending generated tokens per method (prefill or decode) | Histogram | Seconds | +| `tgi_batch_forward_duration` | Batch forward duration per method (prefill or decode) | Histogram | Seconds | +| `tgi_batch_inference_count` | Inference calls per method (prefill or decode) | Counter | Count | +| `tgi_batch_inference_duration` | Batch inference duration | Histogram | Seconds | +| `tgi_batch_inference_success` | Number of successful inference calls per method (prefill or decode) | Counter | Count | +| `tgi_batch_next_size` | Batch size of the next batch | Histogram | Count | +| `tgi_queue_size` | Current queue size | Gauge | Count | +| `tgi_request_count` | Total number of requests | Counter | Count | +| `tgi_request_duration` | Total time spent processing the request (e2e latency) | Histogram | Seconds | +| `tgi_request_generated_tokens` | Generated tokens per request | Histogram | Count | +| `tgi_request_inference_duration` | Request inference duration | Histogram | Seconds | +| `tgi_request_input_length` | Input token length per request | Histogram | Count | +| `tgi_request_max_new_tokens` | Maximum new tokens per request | Histogram | Count | +| `tgi_request_mean_time_per_token_duration` | Mean time per token per request (inter-token latency) | Histogram | Seconds | +| `tgi_request_queue_duration` | Time spent in the queue per request | Histogram | Seconds | +| `tgi_request_skipped_tokens` 
| Speculated tokens per request | Histogram | Count | +| `tgi_request_success` | Number of successful requests | Counter | | +| `tgi_request_validation_duration` | Time spent validating the request | Histogram | Seconds | diff --git a/router/src/server.rs b/router/src/server.rs index ab268efa2f7..8ec7a8716ed 100644 --- a/router/src/server.rs +++ b/router/src/server.rs @@ -2003,6 +2003,120 @@ async fn start( .install_recorder() .expect("failed to install metrics recorder"); + // Metrics descriptions + metrics::describe_counter!("tgi_request_success", "Number of successful requests"); + metrics::describe_histogram!( + "tgi_request_duration", + metrics::Unit::Seconds, + "Request duration" + ); + metrics::describe_histogram!( + "tgi_request_validation_duration", + metrics::Unit::Seconds, + "Request validation duration" + ); + metrics::describe_histogram!( + "tgi_request_queue_duration", + metrics::Unit::Seconds, + "Request queue duration" + ); + metrics::describe_histogram!( + "tgi_request_inference_duration", + metrics::Unit::Seconds, + "Request inference duration" + ); + metrics::describe_histogram!( + "tgi_request_mean_time_per_token_duration", + metrics::Unit::Seconds, + "Mean time per token per request" + ); + metrics::describe_histogram!( + "tgi_request_generated_tokens", + metrics::Unit::Count, + "Generated tokens per request" + ); + metrics::describe_counter!( + "tgi_batch_inference_count", + metrics::Unit::Count, + "Inference calls per method (prefill or decode)" + ); + metrics::describe_counter!( + "tgi_request_count", + metrics::Unit::Count, + "Total number of requests" + ); + metrics::describe_counter!( + "tgi_batch_inference_success", + metrics::Unit::Count, + "Number of successful inference calls per method (prefill or decode)" + ); + metrics::describe_gauge!( + "tgi_batch_current_size", + metrics::Unit::Count, + "Current batch size" + ); + metrics::describe_gauge!("tgi_queue_size", metrics::Unit::Count, "Current queue size"); + metrics::describe_gauge!( 
+ "tgi_batch_current_max_tokens", + metrics::Unit::Count, + "Maximum tokens for the current batch" + ); + metrics::describe_histogram!( + "tgi_request_max_new_tokens", + metrics::Unit::Count, + "Maximum new tokens per request" + ); + metrics::describe_histogram!( + "tgi_batch_inference_duration", + metrics::Unit::Seconds, + "Batch inference duration" + ); + metrics::describe_histogram!( + "tgi_batch_forward_duration", + metrics::Unit::Seconds, + "Batch forward duration per method (prefill or decode)" + ); + metrics::describe_histogram!( + "tgi_request_skipped_tokens", + metrics::Unit::Count, + "Speculated tokens per request" + ); + metrics::describe_histogram!( + "tgi_batch_filter_duration", + metrics::Unit::Seconds, + "Time spent filtering batches and sending generated tokens per method (prefill or decode)" + ); + metrics::describe_histogram!( + "tgi_request_queue_duration", + metrics::Unit::Seconds, + "Time spent in the queue per request" + ); + metrics::describe_histogram!( + "tgi_request_validation_duration", + metrics::Unit::Seconds, + "Time spent validating the request" + ); + metrics::describe_histogram!( + "tgi_request_duration", + metrics::Unit::Seconds, + "Total time spent processing the request" + ); + metrics::describe_histogram!( + "tgi_batch_decode_duration", + metrics::Unit::Seconds, + "Time spent decoding a batch per method (prefill or decode)" + ); + metrics::describe_histogram!( + "tgi_request_input_length", + metrics::Unit::Count, + "Input token length per request" + ); + metrics::describe_histogram!( + "tgi_batch_next_size", + metrics::Unit::Count, + "Batch size of the next batch" + ); + // CORS layer let allow_origin = allow_origin.unwrap_or(AllowOrigin::any()); let cors_layer = CorsLayer::new() diff --git a/update_doc.py b/update_doc.py index e887e1c6dc0..3fb0d314305 100644 --- a/update_doc.py +++ b/update_doc.py @@ -63,7 +63,7 @@ def check_cli(check: bool): final_doc += f"## {header}\n```shell\n{rendered_block}\n```\n" block = [] - filename = 
"docs/source/basic_tutorials/launcher.md" + filename = "docs/source/reference/launcher.md" if check: with open(filename, "r") as f: doc = f.read()