From 029010efe52c3b3aeda2c5def25f2037415489f2 Mon Sep 17 00:00:00 2001 From: Asmir Avdicevic Date: Mon, 22 Jul 2024 17:36:24 +0200 Subject: [PATCH] feat: metrics dumps & extended metrics (#2519) ## Description Extends several pieces: - introduces a "metric dumper" which is just a way of saying you can sample the internal metrics and write them to a CSV file (which should come in handy for 3 pieces that should come down the line; 1) CI using these to validate behavior, 2) debug dumps from 3rd parties, 3) local debugging) - along with the dumper the `doctor plot` has been extended to be able to read those dumps and also just generally improved some rough edges so it's less error prone. - node counts are here for relays. You now have a derived metric which simply counts unique daily node connections. Sample usage for the metrics dumper: `cargo run --bin iroh --all-features -- --metrics-dump-path test.metrics.csv start` Sample of the plotter: `cargo run --bin iroh --all-features -- doctor plot --timeframe 30 --interval 10 --file metrics.dump.csv magicsock_actor_tick_main_total,magicsock_actor_tick_msg_total,magicsock_actor_tick_endpoint_heartbeat_total,magicsock_actor_tick_endpoints_update_receiver_total,magicsock_actor_tick_re_stun_total` ## Breaking Changes ## Notes & open questions This is merging into https://github.com/n0-computer/iroh/pull/2464 as part of the larger metrics refactor. ## Change checklist - [ ] Self-review. - [ ] Documentation updates following the [style guide](https://rust-lang.github.io/rfcs/1574-more-api-documentation-conventions.html#appendix-a-full-conventions-text), if relevant. - [ ] Tests if relevant. - [ ] All breaking changes documented. --- Cargo.lock | 116 ++++++++++++++-------------- iroh-cli/Cargo.toml | 2 +- iroh-cli/src/commands.rs | 4 + iroh-cli/src/commands/doctor.rs | 131 +++++++++++++++++++++++++++----- iroh-cli/src/commands/start.rs | 20 +++++ iroh-cli/src/config.rs | 3 + iroh-metrics/Cargo.toml | 4 +- iroh-metrics/src/core.rs | 17 +++++ iroh-metrics/src/lib.rs | 8 ++ iroh-metrics/src/metrics.rs | 9 +++ iroh-metrics/src/service.rs | 72 ++++++++++++++++++ iroh-net/src/relay/metrics.rs | 5 ++ iroh-net/src/relay/server.rs | 95 ++++++++++++++++------- 13 files changed, 379 insertions(+), 107 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 63778db1c5..18c031120b 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -296,7 +296,7 @@ dependencies = [ "bytes", "futures-util", "http 1.1.0", - "http-body 1.0.0", + "http-body 1.0.1", "http-body-util", "hyper 1.4.1", "hyper-util", @@ -329,7 +329,7 @@ dependencies = [ "bytes", "futures-util", "http 1.1.0", - "http-body 1.0.0", + "http-body 1.0.1", "http-body-util", "mime", "pin-project-lite", @@ -362,7 +362,7 @@ dependencies = [ "bytes", "futures-util", "http 1.1.0", - "http-body 1.0.0", + "http-body 1.0.1", "http-body-util", "hyper 1.4.1", "hyper-util", @@ -586,9 +586,9 @@ checksum = "37b2a672a2cb129a2e41c10b1224bb368f9f37a2b16b612598138befd7b37eb5" [[package]] name = "castaway" -version = "0.2.2" +version = "0.2.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8a17ed5635fc8536268e5d4de1e22e81ac34419e5f052d4d51f4e01dcc263fcc" +checksum = "0abae9be0aaf9ea96a3b1b8b1b55c602ca751eba1b1500220cea4ecbafe7c0d5" dependencies = [ "rustversion", ] @@ -1052,9 +1052,9 @@ dependencies = [ [[package]] name = "darling" -version = "0.20.9" +version = "0.20.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "83b2eb4d90d12bdda5ed17de686c2acb4c57914f8f921b8da7e112b5a36f3fe1" +checksum = "6f63b86c8a8826a49b8c21f08a2d07338eec8d900540f8630dc76284be802989" dependencies = [ "darling_core", "darling_macro", @@ -1062,9 +1062,9 @@ dependencies = [ [[package]] name = "darling_core" -version = "0.20.9" +version = "0.20.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "622687fe0bac72a04e5599029151f5796111b90f1baaa9b544d807a5e31cd120" +checksum = "95133861a8032aaea082871032f5815eb9e98cef03fa916ab4500513994df9e5" dependencies = [ "fnv", "ident_case", @@ -1076,9 +1076,9 @@ dependencies = [ [[package]] name = "darling_macro" -version = "0.20.9" +version = "0.20.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "733cabb43482b1a1b53eee8583c2b9e8684d592215ea83efd305dd31bc2f0178" +checksum = "d336a2a514f6ccccaa3e09b02d41d35330c07ddf03a62165fcec10bb561c7806" dependencies = [ "darling_core", "quote", @@ -1145,9 +1145,9 @@ dependencies = [ [[package]] name = "der_derive" -version = "0.7.2" +version = "0.7.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5fe87ce4529967e0ba1dcf8450bab64d97dfd5010a6256187ffe2e43e6f0e049" +checksum = "8034092389675178f570469e6c3b0465d3d30b4505c294a6550db47f3c17ad18" dependencies = [ "proc-macro2", "quote", @@ -1280,9 +1280,9 @@ dependencies = [ [[package]] name = "document-features" -version = "0.2.8" +version = "0.2.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ef5282ad69563b5fc40319526ba27e0e7363d552a896f0297d54f767717f9b95" +checksum = "cb6969eaabd2421f8a2775cfd2471a2b634372b4a25d41e3bd647b79912850a0" dependencies = [ "litrs", ] @@ -2142,9 +2142,9 @@ dependencies = [ [[package]] name = "http-body" -version = "1.0.0" +version = "1.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1cac85db508abc24a2e48553ba12a996e87244a0395ce011e62b37158745d643" +checksum = "1efedce1fb8e6913f23e0c92de8e62cd5b772a67e7b3946df930a62566c93184" dependencies = [ "bytes", "http 1.1.0", @@ -2159,7 +2159,7 @@ dependencies = [ "bytes", "futures-util", "http 1.1.0", - "http-body 1.0.0", + "http-body 1.0.1", "pin-project-lite", ] @@ -2196,9 +2196,9 @@ dependencies = [ [[package]] name = "hyper" -version = "0.14.29" +version = "0.14.30" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f361cde2f109281a220d4307746cdfd5ee3f410da58a70377762396775634b33" +checksum = "a152ddd61dfaec7273fe8419ab357f33aee0d914c5f4efbf0d96fa749eea5ec9" dependencies = [ "bytes", "futures-channel", @@ -2229,7 +2229,7 @@ dependencies = [ "futures-util", "h2 0.4.5", "http 1.1.0", - "http-body 1.0.0", + "http-body 1.0.1", "httparse", "httpdate", "itoa", @@ -2247,7 +2247,7 @@ checksum = "ec3efd23720e2049821a693cbc7e65ea87c72f1c58ff2f9522ff332b1491e590" dependencies = [ "futures-util", "http 0.2.12", - "hyper 0.14.29", + "hyper 0.14.30", "rustls 0.21.12", "tokio", "tokio-rustls 0.24.1", @@ -2281,7 +2281,7 @@ dependencies = [ "futures-channel", "futures-util", "http 1.1.0", - "http-body 1.0.0", + "http-body 1.0.1", "hyper 1.4.1", "pin-project-lite", "socket2", @@ -2351,7 +2351,7 @@ dependencies = [ "bytes", "futures", "http 0.2.12", - "hyper 0.14.29", + "hyper 0.14.30", "log", "rand", "tokio", @@ -2996,9 +2996,9 @@ dependencies = [ [[package]] name = "itertools" -version = "0.12.1" +version = "0.13.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ba291022dbbd398a455acf126c1e341954079855bc60dfdda641363bd6922569" +checksum = "413ee7dfc52ee1a4949ceeb7dbc8a33f2d6c088194d9f922fb8318faf1f01186" dependencies = [ "either", ] @@ -3657,7 +3657,7 @@ checksum = "1e401f977ab385c9e4e3ab30627d6f26d00e2c73eef317493c4ec6d468726cf8" dependencies = [ "cfg-if", "libc", - "redox_syscall 0.5.2", + "redox_syscall 0.5.3", "smallvec", "windows-targets 0.52.6", ] @@ -4073,9 +4073,9 @@ dependencies = [ [[package]] name = "prometheus-client" -version = "0.22.2" +version = "0.22.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c1ca959da22a332509f2a73ae9e5f23f9dcfc31fd3a54d71f159495bd5909baa" +checksum = "504ee9ff529add891127c4827eb481bd69dc0ebc72e9a682e187db4caa60c3ca" dependencies = [ "dtoa", "itoa", @@ -4354,19 +4354,20 @@ dependencies = [ [[package]] name = "ratatui" -version = "0.26.3" +version = "0.27.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f44c9e68fd46eda15c646fbb85e1040b657a58cdc8c98db1d97a55930d991eef" +checksum = "d16546c5b5962abf8ce6e2881e722b4e0ae3b6f1a08a26ae3573c55853ca68d3" dependencies = [ "bitflags 2.6.0", "cassowary", "compact_str", "crossterm", - "itertools 0.12.1", + "itertools 0.13.0", "lru", "paste", "stability", "strum 0.26.3", + "strum_macros 0.26.4", "unicode-segmentation", "unicode-truncate", "unicode-width", @@ -4374,9 +4375,9 @@ dependencies = [ [[package]] name = "raw-cpuid" -version = "11.0.2" +version = "11.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e29830cbb1290e404f24c73af91c5d8d631ce7e128691e9477556b540cd01ecd" +checksum = "cb9ee317cfe3fbd54b36a511efc1edd42e216903c9cd575e686dd68a2ba90d8d" dependencies = [ "bitflags 2.6.0", ] @@ -4454,9 +4455,9 @@ dependencies = [ [[package]] name = "redox_syscall" -version = "0.5.2" +version = "0.5.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c82cf8cff14456045f55ec4241383baeff27af886adb72ffb2162f99911de0fd" +checksum = "2a908a6e00f1fdd0dfd9c0eb08ce85126f6d8bbda50017e74bc4a4b7d4a926a4" dependencies = [ "bitflags 2.6.0", ] @@ -4567,7 +4568,7 @@ dependencies = [ "h2 0.3.26", "http 0.2.12", "http-body 0.4.6", - "hyper 0.14.29", + "hyper 0.14.30", "hyper-rustls 0.24.2", "ipnet", "js-sys", @@ -4605,7 +4606,7 @@ dependencies = [ "futures-core", "futures-util", "http 1.1.0", - "http-body 1.0.0", + "http-body 1.0.1", "http-body-util", "hyper 1.4.1", "hyper-rustls 0.27.2", @@ -4959,9 +4960,9 @@ dependencies = [ [[package]] name = "security-framework" -version = "2.11.0" +version = "2.11.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c627723fd09706bacdb5cf41499e95098555af3c3c29d014dc3c458ef6be11c0" +checksum = "897b2245f0b511c87893af39b033e5ca9cce68824c4d7e7630b5a1d339658d02" dependencies = [ "bitflags 2.6.0", "core-foundation", @@ -4972,9 +4973,9 @@ dependencies = [ [[package]] name = "security-framework-sys" -version = "2.11.0" +version = "2.11.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "317936bbbd05227752583946b9e66d7ce3b489f84e11a94a510b4437fef407d7" +checksum = "75da29fe9b9b08fe9d6b22b5b4bcbc75d8db3aa31e639aa56bb62e9d46bfceaf" dependencies = [ "core-foundation-sys", "libc", @@ -5722,18 +5723,18 @@ checksum = "d72e255c0541f86589b0287139b70bd941a197ea4cea8fd8f87afe9c965a99e4" [[package]] name = "thiserror" -version = "1.0.61" +version = "1.0.63" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c546c80d6be4bc6a00c0f01730c08df82eaa7a7a61f11d656526506112cc1709" +checksum = "c0342370b38b6a11b6cc11d6a805569958d54cfa061a29969c3b5ce2ea405724" dependencies = [ "thiserror-impl", ] [[package]] name = "thiserror-impl" -version = "1.0.61" +version = "1.0.63" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "46c3384250002a6d5af4d114f2845d37b57521033f30d5c3f46c4d70e1197533" +checksum = "a4558b58466b9ad7ca0f102865eccc95938dca1a74a856f2b57b6629050da261" dependencies = [ "proc-macro2", "quote", @@ -5810,9 +5811,9 @@ checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20" [[package]] name = "tokio" -version = "1.38.0" +version = "1.38.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ba4f4a02a7a80d6f274636f0aa95c7e383b912d41fe721a31f29e29698585a4a" +checksum = "eb2caba9f80616f438e09748d5acda951967e1ea58508ef53d9c6402485a46df" dependencies = [ "backtrace", "bytes", @@ -5961,15 +5962,15 @@ dependencies = [ [[package]] name = "toml" -version = "0.8.14" +version = "0.8.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6f49eb2ab21d2f26bd6db7bf383edc527a7ebaee412d17af4d40fdccd442f335" +checksum = "ac2caab0bf757388c6c0ae23b3293fdb463fee59434529014f85e3263b995c28" dependencies = [ "indexmap 2.2.6", "serde", "serde_spanned", "toml_datetime", - "toml_edit 0.22.14", + "toml_edit 0.22.16", ] [[package]] @@ -5994,9 +5995,9 @@ dependencies = [ [[package]] name = "toml_edit" -version = "0.22.14" +version = "0.22.16" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f21c7aaf97f1bd9ca9d4f9e73b0a6c74bd5afef56f2bc931943a6e1c37e04e38" +checksum = "278f3d518e152219c994ce877758516bca5e118eaed6996192a774fb9fbf0788" dependencies = [ "indexmap 2.2.6", "serde", @@ -6030,7 +6031,7 @@ dependencies = [ "bitflags 2.6.0", "bytes", "http 1.1.0", - "http-body 1.0.0", + "http-body 1.0.1", "http-body-util", "pin-project-lite", "tower-layer", @@ -6241,11 +6242,12 @@ checksum = "d4c87d22b6e3f4a18d4d40ef354e97c90fcb14dd91d7dc0aa9d8a1172ebf7202" [[package]] name = "unicode-truncate" -version = "1.0.0" +version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5a5fbabedabe362c618c714dbefda9927b5afc8e2a8102f47f081089a9019226" +checksum = "b3644627a5af5fa321c95b9b235a72fd24cd29c648c2c379431e6628655627bf" dependencies = [ - "itertools 0.12.1", + "itertools 0.13.0", + "unicode-segmentation", "unicode-width", ] diff --git a/iroh-cli/Cargo.toml b/iroh-cli/Cargo.toml index a8c944be5a..0b1e46c3dd 100644 --- a/iroh-cli/Cargo.toml +++ b/iroh-cli/Cargo.toml @@ -49,7 +49,7 @@ portable-atomic = "1" postcard = "1.0.8" quic-rpc = { version = "0.11", features = ["flume-transport", "quinn-transport"] } rand = "0.8.5" -ratatui = "0.26.2" +ratatui = "0.27" reqwest = { version = "0.12.4", default-features = false, features = ["json", "rustls-tls"] } rustyline = "12.0.0" serde = { version = "1.0.197", features = ["derive"] } diff --git a/iroh-cli/src/commands.rs b/iroh-cli/src/commands.rs index 5c35727153..cf97d62fe8 100644 --- a/iroh-cli/src/commands.rs +++ b/iroh-cli/src/commands.rs @@ -46,6 +46,10 @@ pub(crate) struct Cli { /// Address to serve RPC on. #[clap(long)] pub(crate) rpc_addr: Option, + + /// If set, metrics will be dumped in CSV format to the specified path at regular intervals (100ms). + #[clap(long)] + pub(crate) metrics_dump_path: Option, } #[derive(Debug, Clone)] diff --git a/iroh-cli/src/commands/doctor.rs b/iroh-cli/src/commands/doctor.rs index 1d8ccd195e..a9886c827b 100644 --- a/iroh-cli/src/commands/doctor.rs +++ b/iroh-cli/src/commands/doctor.rs @@ -55,6 +55,7 @@ use crossterm::{ }; use rand::Rng; use ratatui::{prelude::*, widgets::*}; +use tracing::warn; #[derive(Debug, Clone, derive_more::Display)] pub enum SecretKeyOption { @@ -232,6 +233,9 @@ pub enum Commands { /// Endpoint to scrape for prometheus metrics #[clap(long, default_value = "http://localhost:9090")] scrape_url: String, + /// File to read the metrics from. Takes precedence over scrape_url. + #[clap(long)] + file: Option, }, } @@ -1172,6 +1176,7 @@ pub async fn run(command: Commands, config: &NodeConfig) -> anyhow::Result<()> { metrics, timeframe, scrape_url, + file, } => { let metrics: Vec = metrics.split(',').map(|s| s.to_string()).collect(); let interval = Duration::from_millis(interval); @@ -1182,7 +1187,7 @@ pub async fn run(command: Commands, config: &NodeConfig) -> anyhow::Result<()> { let backend = CrosstermBackend::new(stdout); let mut terminal = Terminal::new(backend)?; - let app = PlotterApp::new(metrics, timeframe, scrape_url); + let app = PlotterApp::new(metrics, timeframe, scrape_url, file); let res = run_plotter(&mut terminal, app, interval).await; disable_raw_mode()?; execute!( @@ -1214,7 +1219,7 @@ async fn run_plotter( loop { terminal.draw(|f| plotter_draw(f, &mut app))?; - if crossterm::event::poll(Duration::from_millis(100))? { + if crossterm::event::poll(Duration::from_millis(10))? { if let Event::Key(key) = event::read()? { if key.kind == KeyEventKind::Press { if let KeyCode::Char(c) = key.code { @@ -1283,8 +1288,16 @@ fn plot_chart(frame: &mut Frame, area: Rect, app: &PlotterApp, metric: &str) { let y_start = data_y_range.0; let y_end = data_y_range.1; + let last_val = data.last(); + let name = match last_val { + Some(val) => { + let val_y = val.1; + format!("{metric}: {val_y:.0}") + } + None => metric.to_string(), + }; let datasets = vec![Dataset::default() - .name(metric) + .name(name) .marker(symbols::Marker::Dot) .graph_type(GraphType::Line) .style(Style::default().fg(Color::Cyan)) @@ -1304,19 +1317,19 @@ fn plot_chart(frame: &mut Frame, area: Rect, app: &PlotterApp, metric: &str) { ]; let mut y_labels = vec![Span::styled( - format!("{:.1}", y_start), + format!("{:.0}", y_start), Style::default().add_modifier(Modifier::BOLD), )]; for i in 1..=10 { y_labels.push(Span::raw(format!( - "{:.1}", + "{:.0}", y_start + (y_end - y_start) / 10.0 * i as f64 ))); } y_labels.push(Span::styled( - format!("{:.1}", y_end), + format!("{:.0}", y_end), Style::default().add_modifier(Modifier::BOLD), )); @@ -1355,23 +1368,64 @@ struct PlotterApp { freeze: bool, internal_ts: Duration, scrape_url: String, + file_data: Vec, + file_header: Vec, } impl PlotterApp { - fn new(metrics: Vec, timeframe: usize, scrape_url: String) -> Self { + fn new( + metrics: Vec, + timeframe: usize, + scrape_url: String, + file: Option, + ) -> Self { let data = metrics.iter().map(|m| (m.clone(), vec![])).collect(); let data_y_range = metrics.iter().map(|m| (m.clone(), (0.0, 0.0))).collect(); + let mut file_data: Vec = file + .map(|f| std::fs::read_to_string(f).unwrap()) + .unwrap_or_default() + .split('\n') + .map(|s| s.to_string()) + .collect(); + let mut file_header = vec![]; + let mut timeframe = timeframe; + if !file_data.is_empty() { + file_header = file_data[0].split(',').map(|s| s.to_string()).collect(); + file_data.remove(0); + + while file_data.last().unwrap().is_empty() { + file_data.pop(); + } + + let first_line: Vec = file_data[0].split(',').map(|s| s.to_string()).collect(); + let last_line: Vec = file_data + .last() + .unwrap() + .split(',') + .map(|s| s.to_string()) + .collect(); + + let start_time: usize = first_line.first().unwrap().parse().unwrap(); + let end_time: usize = last_line.first().unwrap().parse().unwrap(); + + timeframe = (end_time - start_time) / 1000; + } + timeframe = timeframe.clamp(30, 90); + + file_data.reverse(); Self { should_quit: false, metrics, start_ts: Instant::now(), data, data_y_range, - timeframe: timeframe - 25, + timeframe, rng: rand::thread_rng(), freeze: false, internal_ts: Duration::default(), scrape_url, + file_data, + file_header, } } @@ -1392,16 +1446,34 @@ impl PlotterApp { return; } - let req = reqwest::Client::new().get(&self.scrape_url).send().await; - if req.is_err() { - return; - } - let data = req.unwrap().text().await.unwrap(); - let metrics_response = iroh_metrics::parse_prometheus_metrics(&data); - if metrics_response.is_err() { - return; - } - let metrics_response = metrics_response.unwrap(); + let metrics_response = match self.file_data.is_empty() { + true => { + let req = reqwest::Client::new().get(&self.scrape_url).send().await; + if req.is_err() { + return; + } + let data = req.unwrap().text().await.unwrap(); + let metrics_response = iroh_metrics::parse_prometheus_metrics(&data); + if metrics_response.is_err() { + return; + } + metrics_response.unwrap() + } + false => { + if self.file_data.len() == 1 { + self.freeze = true; + return; + } + let data = self.file_data.pop().unwrap(); + let r = parse_csv_metrics(&self.file_header, &data); + if let Ok(mr) = r { + mr + } else { + warn!("Failed to parse csv metrics: {:?}", r.err()); + HashMap::new() + } + } + }; self.internal_ts = self.start_ts.elapsed(); for metric in &self.metrics { let val = if metric.eq("random") { @@ -1412,7 +1484,12 @@ impl PlotterApp { 0.0 }; let e = self.data.entry(metric.clone()).or_default(); - e.push((self.internal_ts.as_secs_f64(), val)); + let mut ts = self.internal_ts.as_secs_f64(); + if metrics_response.contains_key("time") { + ts = *metrics_response.get("time").unwrap() / 1000.0; + } + self.internal_ts = Duration::from_secs_f64(ts); + e.push((ts, val)); let yr = self.data_y_range.get_mut(metric).unwrap(); if val * 1.1 < yr.0 { yr.0 = val * 1.2; @@ -1423,3 +1500,19 @@ impl PlotterApp { } } } + +fn parse_csv_metrics(header: &[String], data: &str) -> anyhow::Result> { + let mut metrics = HashMap::new(); + let data = data.split(',').collect::>(); + for (i, h) in header.iter().enumerate() { + let val = match h.as_str() { + "time" => { + let ts = data[i].parse::()?; + ts as f64 + } + _ => data[i].parse::()?, + }; + metrics.insert(h.clone(), val); + } + Ok(metrics) +} diff --git a/iroh-cli/src/commands/start.rs b/iroh-cli/src/commands/start.rs index c6c91c49f8..39449fd15c 100644 --- a/iroh-cli/src/commands/start.rs +++ b/iroh-cli/src/commands/start.rs @@ -1,3 +1,4 @@ +use std::path::PathBuf; use std::{future::Future, net::SocketAddr, path::Path, time::Duration}; use crate::config::NodeConfig; @@ -39,12 +40,17 @@ where { let _guard = crate::logging::init_terminal_and_file_logging(&config.file_logs, iroh_data_root)?; let metrics_fut = start_metrics_server(config.metrics_addr); + let metrics_dumper_fut = + start_metrics_dumper(config.metrics_dump_path.clone(), Duration::from_millis(100)); let res = run_with_command_inner(config, iroh_data_root, rpc_addr, run_type, command).await; if let Some(metrics_fut) = metrics_fut { metrics_fut.abort(); } + if let Some(metrics_dumper_fut) = metrics_dumper_fut { + metrics_dumper_fut.abort(); + } let (clear_rpc, res) = match res { Ok(()) => (true, res), @@ -186,6 +192,20 @@ pub fn start_metrics_server( None } +pub fn start_metrics_dumper( + path: Option, + interval: Duration, +) -> Option> { + // doesn't start the dumper if the address is None + Some(tokio::task::spawn(async move { + if let Some(path) = path { + if let Err(e) = iroh_metrics::metrics::start_metrics_dumper(path, interval).await { + eprintln!("Failed to start metrics dumper: {e}"); + } + } + })) +} + #[cfg(test)] mod tests { use super::*; diff --git a/iroh-cli/src/config.rs b/iroh-cli/src/config.rs index 3183d588b8..d47250d862 100644 --- a/iroh-cli/src/config.rs +++ b/iroh-cli/src/config.rs @@ -58,6 +58,8 @@ pub(crate) struct NodeConfig { /// Bind address on which to serve Prometheus metrics pub(crate) metrics_addr: Option, pub(crate) file_logs: super::logging::FileLogging, + /// Path to dump metrics to in CSV format. + pub(crate) metrics_dump_path: Option, } impl Default for NodeConfig { @@ -83,6 +85,7 @@ impl Default for NodeConfig { gc_policy: GcPolicy::Disabled, metrics_addr: Some(([127, 0, 0, 1], 9090).into()), file_logs: Default::default(), + metrics_dump_path: None, } } } diff --git a/iroh-metrics/Cargo.toml b/iroh-metrics/Cargo.toml index 19e6bbb9bb..b1568765ef 100644 --- a/iroh-metrics/Cargo.toml +++ b/iroh-metrics/Cargo.toml @@ -21,12 +21,12 @@ http-body-util = "0.1.0" hyper = { version = "1", features = ["server", "http1"] } hyper-util = { version = "0.1.1", features = ["tokio"] } once_cell = "1.17.0" -prometheus-client = { version = "0.22.0", optional = true } +prometheus-client = { version = "0.22", optional = true } reqwest = { version = "0.12.4", default-features = false, features = ["json", "rustls-tls"] } serde = { version = "1.0", features = ["derive"] } struct_iterable = "0.1" time = { version = "0.3.21", features = ["serde-well-known"] } -tokio = { version = "1", features = ["rt", "net"]} +tokio = { version = "1", features = ["rt", "net", "fs"]} tracing = "0.1" [dev-dependencies] diff --git a/iroh-metrics/src/core.rs b/iroh-metrics/src/core.rs index b5d3bab370..c1218a7af1 100644 --- a/iroh-metrics/src/core.rs +++ b/iroh-metrics/src/core.rs @@ -60,6 +60,23 @@ impl Counter { self.counter.inc_by(v) } + /// Set the [`Counter`] value. + /// Warning: this is not default behavior for a counter that should always be monotonically increasing. + #[cfg(feature = "metrics")] + pub fn set(&self, v: u64) -> u64 { + self.counter + .inner() + .store(v, std::sync::atomic::Ordering::Relaxed); + v + } + + /// Set the [`Counter`] value. + /// Warning: this is not default behavior for a counter that should always be monotonically increasing. + #[cfg(not(feature = "metrics"))] + pub fn set(&self, _v: u64) -> u64 { + 0 + } + /// Increase the [`Counter`] by `u64`, returning the previous value. #[cfg(not(feature = "metrics"))] pub fn inc_by(&self, _v: u64) -> u64 { diff --git a/iroh-metrics/src/lib.rs b/iroh-metrics/src/lib.rs index 68dbd2acbc..d7fa5beaf3 100644 --- a/iroh-metrics/src/lib.rs +++ b/iroh-metrics/src/lib.rs @@ -32,6 +32,14 @@ macro_rules! inc_by { }; } +/// Set the given counter to `n`. +#[macro_export] +macro_rules! set { + ($m:ty, $f:ident, $n:expr) => { + <$m as $crate::core::Metric>::with_metric(|m| m.$f.set($n)); + }; +} + /// Report usage statistics to the configured endpoint. #[allow(unused_variables)] pub async fn report_usage_stats(report: &UsageStatsReport) { diff --git a/iroh-metrics/src/metrics.rs b/iroh-metrics/src/metrics.rs index b8d57e5d23..49f413454b 100644 --- a/iroh-metrics/src/metrics.rs +++ b/iroh-metrics/src/metrics.rs @@ -53,3 +53,12 @@ use std::net::SocketAddr; pub async fn start_metrics_server(addr: SocketAddr) -> anyhow::Result<()> { crate::service::run(addr).await } + +/// Start a metrics dumper service. +#[cfg(feature = "metrics")] +pub async fn start_metrics_dumper( + path: std::path::PathBuf, + interval: std::time::Duration, +) -> anyhow::Result<()> { + crate::service::dumper(&path, interval).await +} diff --git a/iroh-metrics/src/service.rs b/iroh-metrics/src/service.rs index c114a60ff4..6b873d5b98 100644 --- a/iroh-metrics/src/service.rs +++ b/iroh-metrics/src/service.rs @@ -1,12 +1,16 @@ use std::net::SocketAddr; +use std::path::PathBuf; +use std::time::{Duration, Instant}; use anyhow::{anyhow, Context, Result}; use hyper::service::service_fn; use hyper::{Request, Response}; +use tokio::io::AsyncWriteExt as _; use tokio::net::TcpListener; use tracing::{error, info}; use crate::core::Core; +use crate::parse_prometheus_metrics; type BytesBody = http_body_util::Full; @@ -45,3 +49,71 @@ async fn handler(_req: Request) -> Result) -> BytesBody { http_body_util::Full::new(content.into()) } + +/// Start a metrics dumper loop to write metrics to an output file. +pub async fn dumper(path: &PathBuf, interval_ms: Duration) -> Result<()> { + info!(file = %path.display(), ?interval_ms, "running metrics dumper"); + let _ = Core::get().ok_or_else(|| anyhow!("metrics disabled"))?; + + let start = Instant::now(); + + let file = tokio::fs::OpenOptions::new() + .create(true) + .write(true) + .truncate(true) + .open(&path) + .await?; + + let mut file = tokio::io::BufWriter::new(file); + + // Dump metrics once with a header + dump_metrics(&mut file, &start, true).await?; + loop { + dump_metrics(&mut file, &start, false).await?; + tokio::time::sleep(interval_ms).await; + } +} + +/// Dump metrics to a file. +async fn dump_metrics( + file: &mut tokio::io::BufWriter, + start: &Instant, + write_header: bool, +) -> Result<()> { + let core = Core::get().ok_or_else(|| anyhow!("metrics disabled"))?; + let m = core.encode(); + match m { + Err(e) => error!("Failed to encode metrics: {e:#}"), + Ok(m) => { + let m = parse_prometheus_metrics(&m)?; + let time_since_start = start.elapsed().as_millis() as f64; + + // take the keys from m and sort them + let mut keys: Vec<&String> = m.keys().collect(); + keys.sort(); + + let mut metrics = String::new(); + if write_header { + metrics.push_str("time"); + for key in keys.iter() { + metrics.push(','); + metrics.push_str(key); + } + metrics.push('\n'); + } + + metrics.push_str(&format!("{}", time_since_start)); + for key in keys.iter() { + let value = m[*key]; + let formatted_value = format!("{:.3}", value); + metrics.push(','); + metrics.push_str(&formatted_value); + } + metrics.push('\n'); + + file.write_all(metrics.as_bytes()).await?; + file.flush().await?; + } + } + Ok(()) +} diff --git a/iroh-net/src/relay/metrics.rs b/iroh-net/src/relay/metrics.rs index de803d217a..486922516a 100644 --- a/iroh-net/src/relay/metrics.rs +++ b/iroh-net/src/relay/metrics.rs @@ -51,6 +51,9 @@ pub struct Metrics { /// Number of connections we have removed because of an error pub disconnects: Counter, + /// Number of unique client keys per day + pub unique_client_keys: Counter, + /// Number of accepted websocket connections pub websocket_accepts: Counter, /// Number of accepted 'iroh derp http' connection upgrades @@ -96,6 +99,8 @@ impl Default for Metrics { accepts: Counter::new("Number of times this server has accepted a connection."), disconnects: Counter::new("Number of clients that have then disconnected."), + unique_client_keys: Counter::new("Number of unique client keys per day."), + websocket_accepts: Counter::new("Number of accepted websocket connections"), derp_accepts: Counter::new("Number of accepted 'iroh derp http' connection upgrades"), // TODO: enable when we can have multiple connections for one node id diff --git a/iroh-net/src/relay/server.rs b/iroh-net/src/relay/server.rs index 8240b0fb2c..4ce80a3e6f 100644 --- a/iroh-net/src/relay/server.rs +++ b/iroh-net/src/relay/server.rs @@ -1,4 +1,5 @@ //! based on tailscale/derp/derp_server.go +use std::collections::HashMap; use std::pin::Pin; use std::sync::atomic::{AtomicUsize, Ordering}; use std::sync::Arc; @@ -10,7 +11,8 @@ use futures_lite::Stream; use futures_sink::Sink; use hyper::HeaderMap; use iroh_metrics::core::UsageStatsReport; -use iroh_metrics::{inc, report_usage_stats}; +use iroh_metrics::{inc, inc_by, report_usage_stats}; +use time::{Date, OffsetDateTime}; use tokio::io::{AsyncRead, AsyncWrite}; use tokio::sync::mpsc; use tokio::task::JoinHandle; @@ -231,6 +233,7 @@ pub(crate) struct ServerActor { receiver: mpsc::Receiver, /// All clients connected to this server clients: Clients, + client_counter: ClientCounter, } impl ServerActor { @@ -239,6 +242,7 @@ impl ServerActor { key, receiver, clients: Clients::new(), + client_counter: ClientCounter::default(), } } @@ -290,43 +294,45 @@ impl ServerActor { tracing::warn!("send disco packet: no way to reach client {key:?}, dropped packet"); inc!(Metrics, disco_packets_dropped); } - } - ServerMessage::CreateClient(client_builder) => { - inc!(Metrics, accepts); + } + ServerMessage::CreateClient(client_builder) => { + inc!(Metrics, accepts); - tracing::trace!("create client: {:?}", client_builder.key); - let key = client_builder.key; + tracing::trace!("create client: {:?}", client_builder.key); + let key = client_builder.key; - report_usage_stats(&UsageStatsReport::new( + report_usage_stats(&UsageStatsReport::new( "relay_accepts".to_string(), self.key.to_string(), 1, None, // TODO(arqu): attribute to user id; possibly with the re-introduction of request tokens or other auth Some(key.to_string()), )).await; + let nc = self.client_counter.update(key); + inc_by!(Metrics, unique_client_keys, nc); + + // build and register client, starting up read & write loops for the + // client connection + self.clients.register(client_builder); - // build and register client, starting up read & write loops for the - // client connection - self.clients.register(client_builder); - - } - ServerMessage::RemoveClient((key, conn_num)) => { - inc!(Metrics, disconnects); - tracing::trace!("remove client: {:?}", key); - // ensure we still have the client in question - if self.clients.has_client(&key, conn_num) { - // remove the client from the map of clients, & notify any peers that it - // has sent messages that it has left the network - self.clients.unregister(&key); + } + ServerMessage::RemoveClient((key, conn_num)) => { + inc!(Metrics, disconnects); + tracing::trace!("remove client: {:?}", key); + // ensure we still have the client in question + if self.clients.has_client(&key, conn_num) { + // remove the client from the map of clients, & notify any peers that it + // has sent messages that it has left the network + self.clients.unregister(&key); } - } - ServerMessage::Shutdown => { - tracing::info!("server gracefully shutting down..."); - // close all client connections and client read/write loops - self.clients.shutdown().await; - return Ok(()); - } - } + } + ServerMessage::Shutdown => { + tracing::info!("server gracefully shutting down..."); + // close all client connections and client read/write loops + self.clients.shutdown().await; + return Ok(()); + } + } } } } @@ -510,6 +516,39 @@ impl AsyncWrite for MaybeTlsStream { } } +struct ClientCounter { + clients: HashMap, + last_clear_date: Date, +} + +impl Default for ClientCounter { + fn default() -> Self { + Self { + clients: HashMap::new(), + last_clear_date: OffsetDateTime::now_utc().date(), + } + } +} + +impl ClientCounter { + fn check_and_clear(&mut self) { + let today = OffsetDateTime::now_utc().date(); + if today != self.last_clear_date { + self.clients.clear(); + self.last_clear_date = today; + } + } + + /// Updates the client counter. + pub fn update(&mut self, client: PublicKey) -> u64 { + self.check_and_clear(); + let new_conn = !self.clients.contains_key(&client); + let counter = self.clients.entry(client).or_insert(0); + *counter += 1; + new_conn as u64 + } +} + #[cfg(test)] mod tests { use super::*;