Pr 2076 ci run #2094

Closed · wants to merge 8 commits
README.md (3 changes: 2 additions & 1 deletion)
@@ -153,7 +153,8 @@ this will impact performance.
### Distributed Tracing

`text-generation-inference` is instrumented with distributed tracing using OpenTelemetry. You can use this feature
by setting the address to an OTLP collector with the `--otlp-endpoint` argument.
by setting the address to an OTLP collector with the `--otlp-endpoint` argument. The default service name can be
overridden with the `--otlp-service-name` argument.
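
For example, a launch command along these lines would enable tracing and override the reported service name (an illustrative sketch, not part of this change; the model id, collector address, and service name are placeholders):

```shell
# Sketch: send traces to a local OTLP collector and rename the reported service.
text-generation-launcher \
    --model-id <model-id> \
    --otlp-endpoint http://localhost:4317 \
    --otlp-service-name my-tgi-deployment
```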

### Architecture

docs/source/architecture.md (4 changes: 4 additions & 0 deletions)
@@ -70,6 +70,8 @@ Options:
[env: JSON_OUTPUT=]
--otlp-endpoint <OTLP_ENDPOINT>
[env: OTLP_ENDPOINT=]
--otlp-service-name <OTLP_SERVICE_NAME>
[env: OTLP_SERVICE_NAME=]
--cors-allow-origin <CORS_ALLOW_ORIGIN>
[env: CORS_ALLOW_ORIGIN=]
--ngrok
@@ -138,6 +140,8 @@ Serve's command line parameters on the TGI repository are these:
│ --logger-level TEXT [default: INFO] │
│ --json-output --no-json-output [default: no-json-output] │
│ --otlp-endpoint TEXT [default: None] │
│ --otlp-service-name TEXT [default: │
│ text-generation-inference...│
│ --help Show this message and exit. │
╰──────────────────────────────────────────────────────────────────────────────────────────────────────────╯
```
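
As an illustrative sketch (not part of this change), the server-level `serve` command accepts the same pair of options; the exact invocation shape, model id, collector address, and service name below are placeholders:

```shell
# Sketch: the Python server CLI exposes the same tracing options, with its own
# default service name for the shard processes.
text-generation-server serve <model-id> \
    --otlp-endpoint http://localhost:4317 \
    --otlp-service-name my-tgi-shard
```
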
docs/source/basic_tutorials/launcher.md (7 changes: 7 additions & 0 deletions)
@@ -336,6 +336,13 @@ Options:
--otlp-endpoint <OTLP_ENDPOINT>
[env: OTLP_ENDPOINT=]

```
## OTLP_SERVICE_NAME
```shell
--otlp-service-name <OTLP_SERVICE_NAME>
[env: OTLP_SERVICE_NAME=]
[default: text-generation-inference.router]

```
## CORS_ALLOW_ORIGIN
```shell
launcher/src/main.rs (20 changes: 19 additions & 1 deletion)
@@ -413,6 +413,9 @@ struct Args {
#[clap(long, env)]
otlp_endpoint: Option<String>,

#[clap(default_value = "text-generation-inference.router", long, env)]
otlp_service_name: Option<String>,

#[clap(long, env)]
cors_allow_origin: Vec<String>,
#[clap(long, env)]
@@ -483,6 +486,7 @@ fn shard_manager(
max_batch_size: Option<usize>,
max_input_tokens: usize,
otlp_endpoint: Option<String>,
otlp_service_name: Option<String>,
log_level: LevelFilter,
status_sender: mpsc::Sender<ShardStatus>,
shutdown: Arc<AtomicBool>,
@@ -548,12 +552,18 @@ fn shard_manager(
(None, Some(factor)) => Some((RopeScaling::Linear, factor)),
};

// OpenTelemetry
// OpenTelemetry Endpoint
if let Some(otlp_endpoint) = otlp_endpoint {
shard_args.push("--otlp-endpoint".to_string());
shard_args.push(otlp_endpoint);
}

// OpenTelemetry Service Name
if let Some(otlp_service_name) = otlp_service_name {
shard_args.push("--otlp-service-name".to_string());
shard_args.push(otlp_service_name);
}

// In case we use sliding window, we may ignore the sliding in flash for some backends depending on the parameter.
shard_args.push("--max-input-tokens".to_string());
shard_args.push(max_input_tokens.to_string());
@@ -1035,6 +1045,7 @@ fn spawn_shards(
let shutdown = shutdown.clone();
let shutdown_sender = shutdown_sender.clone();
let otlp_endpoint = args.otlp_endpoint.clone();
let otlp_service_name = args.otlp_service_name.clone();
let quantize = args.quantize;
let speculate = args.speculate;
let dtype = args.dtype;
@@ -1074,6 +1085,7 @@
max_batch_size,
max_input_tokens,
otlp_endpoint,
otlp_service_name,
max_log_level,
status_sender,
shutdown,
@@ -1207,6 +1219,12 @@ fn spawn_webserver(
router_args.push(otlp_endpoint);
}

    // OpenTelemetry Service Name
if let Some(otlp_service_name) = args.otlp_service_name {
router_args.push("--otlp-service-name".to_string());
router_args.push(otlp_service_name);
}

// CORS origins
for origin in args.cors_allow_origin.into_iter() {
router_args.push("--cors-allow-origin".to_string());
router/src/main.rs (10 changes: 7 additions & 3 deletions)
@@ -65,6 +65,8 @@ struct Args {
json_output: bool,
#[clap(long, env)]
otlp_endpoint: Option<String>,
#[clap(default_value = "text-generation-inference.router", long, env)]
otlp_service_name: String,
#[clap(long, env)]
cors_allow_origin: Option<Vec<String>>,
#[clap(long, env)]
@@ -107,6 +109,7 @@ async fn main() -> Result<(), RouterError> {
validation_workers,
json_output,
otlp_endpoint,
otlp_service_name,
cors_allow_origin,
ngrok,
ngrok_authtoken,
@@ -117,7 +120,7 @@
} = args;

// Launch Tokio runtime
init_logging(otlp_endpoint, json_output);
init_logging(otlp_endpoint, otlp_service_name, json_output);

// Validate args
if max_input_tokens >= max_total_tokens {
@@ -367,10 +370,11 @@ async fn main() -> Result<(), RouterError> {

/// Init logging using env variables LOG_LEVEL and LOG_FORMAT:
/// - otlp_endpoint is an optional URL to an Open Telemetry collector
/// - otlp_service_name is the service name to appear in APM
/// - LOG_LEVEL may be TRACE, DEBUG, INFO, WARN or ERROR (default to INFO)
/// - LOG_FORMAT may be TEXT or JSON (default to TEXT)
/// - LOG_COLORIZE may be "false" or "true" (default to "true" or ansi supported platforms)
fn init_logging(otlp_endpoint: Option<String>, json_output: bool) {
fn init_logging(otlp_endpoint: Option<String>, otlp_service_name: String, json_output: bool) {
let mut layers = Vec::new();

// STDOUT/STDERR layer
@@ -401,7 +405,7 @@ fn init_logging(otlp_endpoint: Option<String>, json_output: bool) {
trace::config()
.with_resource(Resource::new(vec![KeyValue::new(
"service.name",
"text-generation-inference.router",
otlp_service_name,
)]))
.with_sampler(Sampler::AlwaysOn),
)
server/text_generation_server/cli.py (3 changes: 2 additions & 1 deletion)
@@ -42,6 +42,7 @@ def serve(
logger_level: str = "INFO",
json_output: bool = False,
otlp_endpoint: Optional[str] = None,
otlp_service_name: str = "text-generation-inference.server",
max_input_tokens: Optional[int] = None,
):
if sharded:
@@ -76,7 +77,7 @@

# Setup OpenTelemetry distributed tracing
if otlp_endpoint is not None:
setup_tracing(shard=os.getenv("RANK", 0), otlp_endpoint=otlp_endpoint)
setup_tracing(otlp_service_name=otlp_service_name, otlp_endpoint=otlp_endpoint)

# Downgrade enum into str for easier management later on
quantize = None if quantize is None else quantize.value
server/text_generation_server/tracing.py (6 changes: 2 additions & 4 deletions)
@@ -54,10 +54,8 @@ def _start_span(self, handler_call_details, context, set_status_on_exception=Fal
)


def setup_tracing(shard: int, otlp_endpoint: str):
resource = Resource.create(
attributes={"service.name": f"text-generation-inference.server-{shard}"}
)
def setup_tracing(otlp_service_name: str, otlp_endpoint: str):
resource = Resource.create(attributes={"service.name": otlp_service_name})
span_exporter = OTLPSpanExporter(endpoint=otlp_endpoint, insecure=True)
span_processor = BatchSpanProcessor(span_exporter)
