From eee61a673d93297715f292b53f201a17388b3564 Mon Sep 17 00:00:00 2001 From: hopeyen Date: Fri, 15 Mar 2024 15:11:43 -0500 Subject: [PATCH] feat: basic metrics, separate from indexer-rs metrics --- Cargo.lock | 233 ++++++++++++++++++++++++-- docs/server_guide.md | 4 + file-service/Cargo.toml | 2 + file-service/src/config.rs | 9 +- file-service/src/file_server/files.rs | 100 ----------- file-service/src/file_server/mod.rs | 4 + file-service/src/file_server/range.rs | 8 +- file-service/src/lib.rs | 1 + file-service/src/main.rs | 3 +- file-service/src/metrics.rs | 91 ++++++++++ 10 files changed, 335 insertions(+), 120 deletions(-) delete mode 100644 file-service/src/file_server/files.rs create mode 100644 file-service/src/metrics.rs diff --git a/Cargo.lock b/Cargo.lock index ce9a049..09bb18c 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -647,26 +647,54 @@ version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa" +[[package]] +name = "autometrics" +version = "0.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "52c47a2283b04388a392a7bff8a0f68db615d967e4d177068f573773fcefa91b" +dependencies = [ + "autometrics-macros 0.3.3", + "const_format", + "metrics-exporter-prometheus 0.11.0", + "once_cell", + "opentelemetry-prometheus 0.11.0", + "opentelemetry_api 0.18.0", + "opentelemetry_sdk 0.18.0", + "prometheus", +] + [[package]] name = "autometrics" version = "0.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "95cef5eb1e18adfb843202bf71587174e480ed67c0ca3e976bf40e82d9adce86" dependencies = [ - "autometrics-macros", + "autometrics-macros 0.6.0", "cfg_aliases", "http 0.2.11", "linkme", - "metrics-exporter-prometheus", + "metrics-exporter-prometheus 0.12.2", "once_cell", - "opentelemetry-prometheus", - "opentelemetry_sdk", + "opentelemetry-prometheus 0.13.0", + "opentelemetry_sdk 0.20.0", "prometheus", "prometheus-client", "spez", "thiserror", ] +[[package]] +name = "autometrics-macros" +version = "0.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "214b577a372c973db7ade265f56e1f84c3fdda7fe4f813466f81fce8c2755e6f" +dependencies = [ + "percent-encoding", + "proc-macro2", + "quote", + "syn 1.0.109", +] + [[package]] name = "autometrics-macros" version = "0.6.0" @@ -1382,6 +1410,26 @@ version = "0.9.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c2459377285ad874054d797f3ccebf984978aa39129f6eafde5cdc8315b612f8" +[[package]] +name = "const_format" +version = "0.2.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e3a214c7af3d04997541b18d432afaff4c455e79e2029079647e72fc2bd27673" +dependencies = [ + "const_format_proc_macros", +] + +[[package]] +name = "const_format_proc_macros" +version = "0.2.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c7f6ff08fd20f4f299298a28e2dfa8a8ba1036e6cd2460ac1de7b425d76f2500" +dependencies = [ + "proc-macro2", + "quote", + "unicode-xid", +] + [[package]] name = "constant_time_eq" version = "0.1.5" @@ -2425,6 +2473,7 @@ dependencies = [ "anyhow", "async-graphql 6.0.11", "async-graphql-axum", + "autometrics 0.3.3", "axum", "axum-macros", "base64 0.21.7", @@ -2452,6 +2501,7 @@ dependencies = [ "ipfs-api-prelude", "merkle-cbt", "object_store", + "once_cell", "prometheus", "rand 0.8.5", "reqwest", @@ -2809,7 +2859,7 @@ dependencies = [ "no-std-compat", "nonzero_ext", "parking_lot", - "quanta", + "quanta 0.11.1", "rand 0.8.5", "smallvec", ] @@ -3310,7 +3360,7 @@ dependencies = [ "anyhow", "arc-swap", "async-trait", - "autometrics", + "autometrics 0.6.0", "axum", "bigdecimal 0.4.2", "build-info", @@ -3706,6 +3756,15 @@ dependencies = [ "hashbrown 0.14.3", ] +[[package]] +name = "mach" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b823e83b2affd8f40a9ee8c29dbc56404c1e34cd2710921f2801e2cf29527afa" +dependencies = [ + "libc", +] + [[package]] name = "mach2" version = "0.4.2" @@ -3755,6 +3814,17 @@ dependencies = [ "cfg-if", ] +[[package]] +name = "metrics" +version = "0.20.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7b9b8653cec6897f73b519a43fba5ee3d50f62fe9af80b428accdcc093b4a849" +dependencies = [ + "ahash 0.7.7", + "metrics-macros 0.6.0", + "portable-atomic 0.3.20", +] + [[package]] name = "metrics" version = "0.21.1" @@ -3762,8 +3832,23 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "fde3af1a009ed76a778cb84fdef9e7dbbdf5775ae3e4cc1f434a6a307f6f76c5" dependencies = [ "ahash 0.8.7", - "metrics-macros", - "portable-atomic", + "metrics-macros 0.7.1", + "portable-atomic 1.6.0", +] + +[[package]] +name = "metrics-exporter-prometheus" +version = "0.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8603921e1f54ef386189335f288441af761e0fc61bcb552168d9cedfe63ebc70" +dependencies = [ + "indexmap 1.9.3", + "metrics 0.20.1", + "metrics-util 0.14.0", + "parking_lot", + "portable-atomic 0.3.20", + "quanta 0.10.1", + "thiserror", ] [[package]] @@ -3774,12 +3859,23 @@ checksum = "1d4fa7ce7c4862db464a37b0b31d89bca874562f034bd7993895572783d02950" dependencies = [ "base64 0.21.7", "indexmap 1.9.3", - "metrics", - "metrics-util", - "quanta", + "metrics 0.21.1", + "metrics-util 0.15.1", + "quanta 0.11.1", "thiserror", ] +[[package]] +name = "metrics-macros" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "731f8ecebd9f3a4aa847dfe75455e4757a45da40a7793d2f0b1f9b6ed18b23f3" +dependencies = [ + "proc-macro2", + "quote", + "syn 1.0.109", +] + [[package]] name = "metrics-macros" version = "0.7.1" @@ -3791,6 +3887,23 @@ dependencies = [ "syn 2.0.48", ] +[[package]] +name = "metrics-util" +version = "0.14.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f7d24dc2dbae22bff6f1f9326ffce828c9f07ef9cc1e8002e5279f845432a30a" +dependencies = [ + "crossbeam-epoch", + "crossbeam-utils", + "hashbrown 0.12.3", + "metrics 0.20.1", + "num_cpus", + "parking_lot", + "portable-atomic 0.3.20", + "quanta 0.10.1", + "sketches-ddsketch", +] + [[package]] name = "metrics-util" version = "0.15.1" @@ -3800,9 +3913,9 @@ dependencies = [ "crossbeam-epoch", "crossbeam-utils", "hashbrown 0.13.1", - "metrics", + "metrics 0.21.1", "num_cpus", - "quanta", + "quanta 0.11.1", "sketches-ddsketch", ] @@ -4234,6 +4347,27 @@ dependencies = [ "vcpkg", ] +[[package]] +name = "opentelemetry" +version = "0.18.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "69d6c3d7288a106c0a363e4b0e8d308058d56902adefb16f4936f417ffef086e" +dependencies = [ + "opentelemetry_api 0.18.0", + "opentelemetry_sdk 0.18.0", +] + +[[package]] +name = "opentelemetry-prometheus" +version = "0.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "06c3d833835a53cf91331d2cfb27e9121f5a95261f31f08a1f79ab31688b8da8" +dependencies = [ + "opentelemetry", + "prometheus", + "protobuf", +] + [[package]] name = "opentelemetry-prometheus" version = "0.13.0" @@ -4241,12 +4375,28 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c7d81bc254e2d572120363a2b16cdb0d715d301b5789be0cfc26ad87e4e10e53" dependencies = [ "once_cell", - "opentelemetry_api", - "opentelemetry_sdk", + "opentelemetry_api 0.20.0", + "opentelemetry_sdk 0.20.0", "prometheus", "protobuf", ] +[[package]] +name = "opentelemetry_api" +version = "0.18.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c24f96e21e7acc813c7a8394ee94978929db2bcc46cf6b5014fc612bf7760c22" +dependencies = [ + "fnv", + "futures-channel", + "futures-util", + "indexmap 1.9.3", + "js-sys", + "once_cell", + "pin-project-lite", + "thiserror", +] + [[package]] name = "opentelemetry_api" version = "0.20.0" @@ -4263,6 +4413,26 @@ dependencies = [ "urlencoding", ] +[[package]] +name = "opentelemetry_sdk" +version = "0.18.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1ca41c4933371b61c2a2f214bf16931499af4ec90543604ec828f7a625c09113" +dependencies = [ + "async-trait", + "crossbeam-channel", + "dashmap", + "fnv", + "futures-channel", + "futures-executor", + "futures-util", + "once_cell", + "opentelemetry_api 0.18.0", + "percent-encoding", + "rand 0.8.5", + "thiserror", +] + [[package]] name = "opentelemetry_sdk" version = "0.20.0" @@ -4274,7 +4444,7 @@ dependencies = [ "futures-executor", "futures-util", "once_cell", - "opentelemetry_api", + "opentelemetry_api 0.20.0", "ordered-float", "regex", "thiserror", @@ -4651,6 +4821,15 @@ dependencies = [ "plotters-backend", ] +[[package]] +name = "portable-atomic" +version = "0.3.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e30165d31df606f5726b090ec7592c308a0eaf61721ff64c9a3018e344a8753e" +dependencies = [ + "portable-atomic 1.6.0", +] + [[package]] name = "portable-atomic" version = "1.6.0" @@ -4899,6 +5078,22 @@ dependencies = [ "syn 1.0.109", ] +[[package]] +name = "quanta" +version = "0.10.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b7e31331286705f455e56cca62e0e717158474ff02b7936c1fa596d983f4ae27" +dependencies = [ + "crossbeam-utils", + "libc", + "mach", + "once_cell", + "raw-cpuid", + "wasi 0.10.2+wasi-snapshot-preview1", + "web-sys", + "winapi", +] + [[package]] name = "quanta" version = "0.11.1" @@ -7255,6 +7450,12 @@ version = "0.9.0+wasi-snapshot-preview1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "cccddf32554fecc6acb585f82a32a72e28b48f8c4c1883ddfeeeaa96f7d8e519" +[[package]] +name = "wasi" +version = "0.10.2+wasi-snapshot-preview1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fd6fbd9a79829dd1ad0cc20627bf1ed606756a7f77edff7b66b7064f9cb327c6" + [[package]] name = "wasi" version = "0.11.0+wasi-snapshot-preview1" diff --git a/docs/server_guide.md b/docs/server_guide.md index 9c1d072..ab0a38c 100644 --- a/docs/server_guide.md +++ b/docs/server_guide.md @@ -122,6 +122,10 @@ with configuration You are open for business! +### Performance and Monitoring + +Basic service metrics are hosted at the address configued by `common.server.metrics_host_and_port`, default at "0.0.0.0:7601". Optionally separate metrics are tracked specifically for file service performances at `server.metrics_host_and_port`. The metrics are minimal and please submit feedback for additional specific measurements. + ### Security Considerations The server enforces various security measures to protect user data and system integrity. These measures include: diff --git a/file-service/Cargo.toml b/file-service/Cargo.toml index 425f19d..053c0f3 100644 --- a/file-service/Cargo.toml +++ b/file-service/Cargo.toml @@ -19,6 +19,7 @@ thegraph = { git = "https://github.com/edgeandnode/toolshed", tag = "thegraph-v0 anyhow = "1.0" async-graphql = "6.0.11" async-graphql-axum = "6.0.11" +autometrics = { version = "0.3.3", features = ["prometheus-exporter"] } axum = "0.6.20" sha3 = "0.10.6" base64 = "0.21" @@ -40,6 +41,7 @@ ipfs-api-backend-hyper = "0.6" ipfs-api-prelude = "0.6" merkle-cbt = "0.3.2" object_store = {version = "0.9", features = [ "http", "aws", "gcp", "azure" ]} +once_cell = "1.17" prometheus = "0.13.3" rand = "0.8.4" reqwest = { version = "0.11", features = ["json", "stream", "multipart"] } diff --git a/file-service/src/config.rs b/file-service/src/config.rs index 261ee09..33411ef 100644 --- a/file-service/src/config.rs +++ b/file-service/src/config.rs @@ -45,7 +45,6 @@ pub struct ServerArgs { help = "Admin Auth token for server management" )] pub admin_auth_token: Option, - //TODO: More complex price management #[arg( long, value_name = "admin-addr", @@ -54,6 +53,14 @@ pub struct ServerArgs { help = "Expost Admin service at address with both host and port" )] pub admin_host_and_port: SocketAddr, + #[arg( + long, + value_name = "metric-addr", + default_value = "0.0.0.0/5000", + env = "FILE_METRICS_HOST_AND_PORT", + help = "Expost Metrics service at address with both host and port" + )] + pub metrics_host_and_port: Option, #[arg( long, value_name = "ipfs-gateway-url", diff --git a/file-service/src/file_server/files.rs b/file-service/src/file_server/files.rs deleted file mode 100644 index 7891198..0000000 --- a/file-service/src/file_server/files.rs +++ /dev/null @@ -1,100 +0,0 @@ -// #![cfg(feature = "acceptor")] -use http::header::CONTENT_RANGE; - -use file_exchange::errors::{Error, ServerError}; - -use crate::file_server::util::{Health, Operator}; -// #![cfg(feature = "acceptor")] -// use hyper_rustls::TlsAcceptor; -use hyper::{Body, Request, Response, StatusCode}; - -use super::{ - range::{parse_range_header, serve_file, serve_file_range}, - ServerContext, -}; - -// Serve file requests -pub async fn file_service( - id: DeploymentId, - req: &Request, - context: &ServerContext, -) -> Result, Error> { - tracing::debug!("Received file range request"); - let context_ref = context.lock().await; - tracing::debug!( - bundles = tracing::field::debug(&context_ref), - id, - "Received file range request" - ); - - // Validate the auth token - let auth_token = req - .headers() - .get(http::header::AUTHORIZATION) - .and_then(|t| t.to_str().ok()); - - let free = context_ref.free_query_auth_token.is_none() - || (auth_token.is_some() - && context_ref.free_query_auth_token.is_some() - && auth_token.unwrap() == context_ref.free_query_auth_token.as_deref().unwrap()); - - if !free { - tracing::warn!("Respond with unauthorized query"); - return Ok(Response::builder() - .status(StatusCode::UNAUTHORIZED) - .body("Paid service is not implemented, need free query authentication".into()) - .unwrap()); - } - - let requested_bundle = match context_ref.bundles.get(id) { - Some(s) => s.clone(), - None => { - tracing::debug!( - server_context = tracing::field::debug(&context_ref), - id, - "Requested bundle is not served locally" - ); - return Ok(Response::builder() - .status(StatusCode::NOT_FOUND) - .body("Bundle not found".into()) - .unwrap()); - } - }; - - match req.headers().get("file_hash") { - Some(hash) if hash.to_str().is_ok() => { - let mut file_path = requested_bundle.local_path.clone(); - let file_manifest = match requested_bundle - .file_manifests - .iter() - .find(|file| file.meta_info.hash == hash.to_str().unwrap()) - { - Some(c) => c, - None => { - return Ok(Response::builder() - .status(StatusCode::NOT_FOUND) - .body("File manifest not found".into()) - .unwrap()) - } - }; - file_path.push(file_manifest.meta_info.name.clone()); - // Parse the range header to get the start and end bytes - match req.headers().get(CONTENT_RANGE) { - Some(r) => { - tracing::debug!("Parse content range header"); - let range = parse_range_header(r)?; - //TODO: validate receipt - serve_file_range(&file_path, range).await - } - None => { - tracing::info!("Serve file"); - serve_file(&file_path).await - } - } - } - _ => Ok(Response::builder() - .status(StatusCode::NOT_ACCEPTABLE) - .body("Missing required file_manifest_hash header".into()) - .unwrap()), - } -} diff --git a/file-service/src/file_server/mod.rs b/file-service/src/file_server/mod.rs index c9ff663..d43adfc 100644 --- a/file-service/src/file_server/mod.rs +++ b/file-service/src/file_server/mod.rs @@ -71,9 +71,13 @@ impl IndexerServiceImpl for ServerContext { // path if path.starts_with("/bundles/id/") => { // } tracing::trace!("Process file service {deployment:?}"); + let query_duration_timer = crate::metrics::RESPONSE_TIME + .with_label_values(&[&deployment.to_string()]) + .start_timer(); let body = service::file_service(deployment, &request, self) .await .map_err(FileServiceError::QueryForwardingError)?; + query_duration_timer.observe_duration(); let response = FileServiceResponse { inner: body }; Ok((request, response)) } diff --git a/file-service/src/file_server/range.rs b/file-service/src/file_server/range.rs index 0669bdf..3ef7ece 100644 --- a/file-service/src/file_server/range.rs +++ b/file-service/src/file_server/range.rs @@ -1,4 +1,3 @@ -use file_exchange::manifest::store::Store; // #![cfg(feature = "acceptor")] use hyper::header::{CONTENT_LENGTH, CONTENT_RANGE}; use hyper::{Body, Response, StatusCode}; @@ -8,7 +7,10 @@ use std::io::Read; use serde_json::Value; -use file_exchange::errors::{Error, ServerError}; +use file_exchange::{ + errors::{Error, ServerError}, + manifest::store::Store, +}; // Function to parse the Range header and return the start and end bytes pub fn parse_range_header(range_header: &Value) -> Result<(usize, usize), Error> { @@ -89,6 +91,8 @@ pub async fn serve_file_range( }; let content = store.range_read(file_name, &range).await?; + let transferred_bytes = crate::metrics::TRANSFERRED_BYTES.with_label_values(&[file_name]); + transferred_bytes.set(length.try_into().unwrap()); Response::builder() .status(StatusCode::PARTIAL_CONTENT) .header(CONTENT_RANGE, format!("bytes {}-{}/{}", start, end, length)) diff --git a/file-service/src/lib.rs b/file-service/src/lib.rs index 6b841ba..96ba980 100644 --- a/file-service/src/lib.rs +++ b/file-service/src/lib.rs @@ -2,3 +2,4 @@ pub mod admin; pub mod config; pub mod database; pub mod file_server; +pub mod metrics; diff --git a/file-service/src/main.rs b/file-service/src/main.rs index bafca1b..d72abe6 100644 --- a/file-service/src/main.rs +++ b/file-service/src/main.rs @@ -4,7 +4,7 @@ use clap::Parser; use file_service::file_server::{ cost::cost, initialize_server_context, status::status, util::graphql_playground, }; -use file_service::{admin, config}; +use file_service::{admin, config, metrics}; use indexer_common::indexer_service::http::{ IndexerService, IndexerServiceOptions, IndexerServiceRelease, }; @@ -39,6 +39,7 @@ async fn main() -> Result<(), Error> { .await .expect("Failed to initiate bundle server"); admin::serve_admin(state.clone()); + metrics::serve_metrics(&config.server); IndexerService::run(IndexerServiceOptions { release, diff --git a/file-service/src/metrics.rs b/file-service/src/metrics.rs new file mode 100644 index 0000000..0454bb7 --- /dev/null +++ b/file-service/src/metrics.rs @@ -0,0 +1,91 @@ +use autometrics::encode_global_metrics; +use axum::http::StatusCode; +use axum::routing::get; +use axum::Router; +use axum::Server; +use once_cell::sync::Lazy; +use prometheus::{core::Collector, Registry}; +use prometheus::{HistogramOpts, HistogramVec, IntGaugeVec, Opts}; + +use crate::config::ServerArgs; + +use tracing::debug; + +// Response Time: response time for file requests. +#[allow(dead_code)] +pub static RESPONSE_TIME: Lazy = Lazy::new(|| { + let m = HistogramVec::new( + HistogramOpts::new( + "response_time", + "Response time for file requests in microseconds", + ) + .namespace("file_service"), + &["file_id"], + ) + .expect("Failed to create response_time timer"); + prometheus::register(Box::new(m.clone())).expect("Failed to register response_time timer"); + m +}); + +// Number of bytes transferred per request per file +#[allow(dead_code)] +pub static TRANSFERRED_BYTES: Lazy = Lazy::new(|| { + let m = IntGaugeVec::new( + Opts::new( + "transferred_bytes", + "Number of bytes transferred per request", + ) + .namespace("file_service"), + &["file_id"], + ) + .expect("Failed to create transferred_bytes gauge"); + prometheus::register(Box::new(m.clone())).expect("Failed to register transferred_byes gauge"); + m +}); + +#[allow(dead_code)] +pub static REGISTRY: Lazy = Lazy::new(prometheus::Registry::new); + +#[allow(dead_code)] +pub fn register_metrics(registry: &Registry, metrics: Vec>) { + for metric in metrics { + registry.register(metric).expect("Cannot register metrics"); + debug!("registered metric"); + } +} + +#[allow(dead_code)] +pub fn start_metrics() { + register_metrics( + ®ISTRY, + vec![ + Box::new(RESPONSE_TIME.clone()), + Box::new(TRANSFERRED_BYTES.clone()), + ], + ); +} + +/// This handler serializes the metrics into a string for Prometheus to scrape +#[allow(dead_code)] +pub async fn get_metrics() -> (StatusCode, String) { + match encode_global_metrics() { + Ok(metrics) => (StatusCode::OK, metrics), + Err(err) => (StatusCode::INTERNAL_SERVER_ERROR, format!("{err:?}")), + } +} + +pub fn serve_metrics(config: &ServerArgs) { + start_metrics(); + if config.metrics_host_and_port.is_none() { + return; + }; + // Set up the exporter to collect metrics + let app = Router::new().route("/metrics", get(get_metrics)); + let metrics_addr = config.metrics_host_and_port.unwrap(); + tokio::spawn(async move { + Server::bind(&metrics_addr) + .serve(app.into_make_service()) + .await + .expect("Failed to initialize admin server") + }); +}