From 5cdf5c88aecf73840bb04c21b7a9e658cb5bf928 Mon Sep 17 00:00:00 2001 From: axiomatic-aardvark Date: Wed, 13 Dec 2023 18:50:41 +0200 Subject: [PATCH] feat: more metrics --- grafana.json | 474 +++++++++++++++++++-- subgraph-radio/src/metrics/mod.rs | 68 ++- subgraph-radio/src/operator/attestation.rs | 28 +- 3 files changed, 540 insertions(+), 30 deletions(-) diff --git a/grafana.json b/grafana.json index 960aea8..744d43d 100644 --- a/grafana.json +++ b/grafana.json @@ -520,7 +520,11 @@ "type": "table" }, { - "datasource": "${DS_PROMETHEUS}", + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "", "fieldConfig": { "defaults": { "color": { @@ -577,7 +581,98 @@ "x": 12, "y": 8 }, - "id": 123129, + "id": 123142, + "options": { + "legend": { + "calcs": ["last", "min", "max"], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "sortBy": "Last", + "sortDesc": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": "${DS_PROMETHEUS}", + "editorMode": "code", + "exemplar": false, + "expr": "rate(cached_ppoi_messages[ $__rate_interval ])", + "interval": "", + "legendFormat": "", + "range": true, + "refId": "A" + } + ], + "title": "Average Message Frequency", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + 
"mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 16 + }, + "id": 123133, "options": { "legend": { "calcs": [], @@ -593,32 +688,250 @@ "targets": [ { "datasource": "${DS_PROMETHEUS}", + "editorMode": "code", "exemplar": true, - "expr": "function_calls_concurrent", + "expr": "graphcast_subgraph_radio_diverging_subgraphs", "interval": "", "legendFormat": "", + "range": true, "refId": "A" + } + ], + "title": "Number of diverged subgraphs", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } }, + "overrides": [ + { + "__systemRef": "hideSeriesFrom", + "matcher": { + "id": "byNames", + "options": { + "mode": "exclude", + "names": [ + "graphcast_subgraph_radio_gossip_peers{instance=\"subgraph-radio:2384\", job=\"subgraph-radio\"}" + ], + "prefix": "All except:", + "readOnly": true + } + }, + "properties": [ + { + "id": "custom.hideFrom", + "value": { + "legend": false, + "tooltip": false, + "viz": true + } + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 16 + }, + "id": 123143, + "options": { + 
"legend": { + "calcs": ["last", "min", "max"], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "sortBy": "Last", + "sortDesc": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ { "datasource": "${DS_PROMETHEUS}", - "exemplar": true, - "expr": "histogram_quantile(0.95, sum(rate(function_calls_duration_bucket[5m])) by (le))", - "hide": false, + "editorMode": "code", + "exemplar": false, + "expr": "graphcast_subgraph_radio_average_processing_time", "interval": "", "legendFormat": "", - "refId": "B" + "range": true, + "refId": "A" + } + ], + "title": "Average Processing Time", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [ + { + "__systemRef": "hideSeriesFrom", + "matcher": { + "id": "byNames", + "options": { + "mode": "exclude", + "names": [ + "graphcast_subgraph_radio_gossip_peers{instance=\"subgraph-radio:2384\", job=\"subgraph-radio\"}" + ], + "prefix": "All except:", + "readOnly": true + } + }, + "properties": [ + { + "id": "custom.hideFrom", + "value": { + "legend": false, + "tooltip": false, + "viz": true + } + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + 
"y": 24 + }, + "id": 123144, + "options": { + "legend": { + "calcs": ["last", "min", "max"], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "sortBy": "Last", + "sortDesc": true }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ { "datasource": "${DS_PROMETHEUS}", - "exemplar": true, - "expr": "rate(function_calls_count[5m])", - "hide": false, + "editorMode": "code", + "exemplar": false, + "expr": "topk(10, sum(rate(frequent_senders_counter[$__interval])) by (indexer_address))", "interval": "", "legendFormat": "", - "refId": "C" + "range": true, + "refId": "A" } ], - "title": "Function Call Stats", + "title": "Frequent Senders", "type": "timeseries" }, { @@ -626,6 +939,7 @@ "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "description": "", "fieldConfig": { "defaults": { "color": { @@ -680,15 +994,17 @@ "h": 8, "w": 12, "x": 0, - "y": 16 + "y": 32 }, - "id": 123133, + "id": 123146, "options": { "legend": { - "calcs": [], - "displayMode": "list", + "calcs": ["last", "min", "max"], + "displayMode": "table", "placement": "bottom", - "showLegend": true + "showLegend": true, + "sortBy": "Last", + "sortDesc": true }, "tooltip": { "mode": "single", @@ -699,15 +1015,15 @@ { "datasource": "${DS_PROMETHEUS}", "editorMode": "code", - "exemplar": true, - "expr": "graphcast_subgraph_radio_diverging_subgraphs", + "exemplar": false, + "expr": "graphcast_subgraph_radio_latest_message_timestamp", "interval": "", "legendFormat": "", "range": true, "refId": "A" } ], - "title": "Number of diverged subgraphs", + "title": "Latest Message Timestamps", "type": "timeseries" }, { @@ -769,12 +1085,12 @@ "h": 8, "w": 12, "x": 12, - "y": 16 + "y": 32 }, - "id": 123135, + "id": 123147, "options": { "legend": { - "calcs": ["last", "min", "max", "sum"], + "calcs": ["last", "min", "max"], "displayMode": "table", "placement": "bottom", "showLegend": true, @@ -790,15 +1106,117 @@ { "datasource": "${DS_PROMETHEUS}", "editorMode": "code", - 
"exemplar": true, - "expr": "graphcast_subgraph_radio_local_ppois_to_compare", + "exemplar": false, + "expr": "graphcast_subgraph_radio_max_stake_poi", "interval": "", - "legendFormat": "{{deployment}}", + "legendFormat": "", "range": true, "refId": "A" } ], - "title": "Locally tracked Public POIs", + "title": "Max Stake POI", + "type": "timeseries" + }, + { + "datasource": "${DS_PROMETHEUS}", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 40 + }, + "id": 123129, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": "${DS_PROMETHEUS}", + "exemplar": true, + "expr": "function_calls_concurrent", + "interval": "", + "legendFormat": "", + "refId": "A" + }, + { + "datasource": "${DS_PROMETHEUS}", + "exemplar": true, + "expr": "histogram_quantile(0.95, sum(rate(function_calls_duration_bucket[5m])) by (le))", + "hide": false, + "interval": "", + "legendFormat": "", + "refId": "B" + }, + { + "datasource": "${DS_PROMETHEUS}", + "exemplar": true, + "expr": "rate(function_calls_count[5m])", + "hide": false, + "interval": "", + "legendFormat": "", + "refId": "C" 
+ } + ], + "title": "Function Call Stats", "type": "timeseries" } ], @@ -834,6 +1252,6 @@ "timezone": "browser", "title": "Graphcast Subgraph Radio", "uid": "graphcast-subgraph-radio", - "version": 17, + "version": 20, "weekStart": "" } diff --git a/subgraph-radio/src/metrics/mod.rs b/subgraph-radio/src/metrics/mod.rs index 7ace88f..fa0123e 100644 --- a/subgraph-radio/src/metrics/mod.rs +++ b/subgraph-radio/src/metrics/mod.rs @@ -5,7 +5,7 @@ use axum::Router; use axum_server::Handle; use once_cell::sync::Lazy; use prometheus::{core::Collector, Registry}; -use prometheus::{IntCounter, IntCounterVec, IntGauge, IntGaugeVec, Opts}; +use prometheus::{Gauge, GaugeVec, IntCounter, IntCounterVec, IntGauge, IntGaugeVec, Opts}; use std::{net::SocketAddr, str::FromStr}; use tracing::{debug, info}; @@ -113,6 +113,68 @@ pub static RECEIVED_MESSAGES: Lazy = Lazy::new(|| { m }); +#[allow(dead_code)] +pub static AVERAGE_PROCESSING_TIME: Lazy<Gauge> = Lazy::new(|| { + let m = Gauge::with_opts( + Opts::new( + "average_processing_time", + "Average time taken to process each POI message", + ) + .namespace("graphcast") + .subsystem("subgraph_radio"), + ) + .expect("Failed to create average_processing_time gauge"); + prometheus::register(Box::new(m.clone())) + .expect("Failed to register average_processing_time gauge"); + m +}); + +#[allow(dead_code)] +pub static FREQUENT_SENDERS_COUNTER: Lazy<IntCounterVec> = Lazy::new(|| { + let m = IntCounterVec::new( + Opts::new( + "frequent_senders", + "Count of messages received from each indexer", + ) + .namespace("graphcast") + .subsystem("subgraph_radio"), + &["indexer_address"], + ) + .expect("Failed to create frequent_senders counter"); + prometheus::register(Box::new(m.clone())).expect("Failed to register frequent_senders counter"); + m +}); + +#[allow(dead_code)] +pub static LATEST_MESSAGE_TIMESTAMP: Lazy<GaugeVec> = Lazy::new(|| { + let m = GaugeVec::new( + Opts::new( + "latest_message_timestamp", + "Timestamp of the last received public POI message", + ) + 
.namespace("graphcast") + .subsystem("subgraph_radio"), + &["deployment_hash"], + ) + .expect("Failed to create latest_message_timestamp gauge"); + prometheus::register(Box::new(m.clone())) + .expect("Failed to register latest_message_timestamp gauge"); + m +}); + +#[allow(dead_code)] +pub static MAX_STAKE_POI: Lazy<GaugeVec> = Lazy::new(|| { + let m = GaugeVec::new( + Opts::new("max_stake_poi", "Highest stake-backed POI") + .namespace("graphcast") + .subsystem("subgraph_radio"), + &["deployment_hash"], + ) + .expect("Failed to create max_stake_poi gauge"); + prometheus::register(Box::new(m.clone())).expect("Failed to register max_poi_stake gauge"); + m +}); + #[allow(dead_code)] pub static REGISTRY: Lazy<Registry> = Lazy::new(prometheus::Registry::new); @@ -136,6 +198,10 @@ pub fn start_metrics() { Box::new(CONNECTED_PEERS.clone()), Box::new(GOSSIP_PEERS.clone()), Box::new(RECEIVED_MESSAGES.clone()), + Box::new(AVERAGE_PROCESSING_TIME.clone()), + Box::new(FREQUENT_SENDERS_COUNTER.clone()), + Box::new(LATEST_MESSAGE_TIMESTAMP.clone()), + Box::new(MAX_STAKE_POI.clone()), ], ); } diff --git a/subgraph-radio/src/operator/attestation.rs b/subgraph-radio/src/operator/attestation.rs index 8e14d40..a7e4317 100644 --- a/subgraph-radio/src/operator/attestation.rs +++ b/subgraph-radio/src/operator/attestation.rs @@ -2,6 +2,9 @@ use crate::database::{ clear_all_notifications, create_notification, get_comparison_results_by_deployment, get_notifications, get_remote_ppoi_messages_by_identifier, save_comparison_result, }; +use crate::metrics::{ + AVERAGE_PROCESSING_TIME, FREQUENT_SENDERS_COUNTER, LATEST_MESSAGE_TIMESTAMP, MAX_STAKE_POI, +}; use crate::operator::notifier::NotificationMode; use crate::DatabaseError; use async_graphql::{Enum, Error as AsyncGraphqlError, ErrorExtensions, SimpleObject}; @@ -13,6 +16,7 @@ use sqlx::SqlitePool; use std::error::Error; use std::fmt; use std::str::FromStr; +use std::time::Instant; use std::{ collections::{HashMap, HashSet}, fmt::Display, @@ -109,6 +113,8 @@ 
pub async fn process_ppoi_message( messages: Vec>, callbook: &CallBook, ) -> Result { + let start_time = Instant::now(); + let mut remote_attestations: RemoteAttestationsMap = HashMap::new(); // Check if there are existing attestations for the block let first_message = messages.first(); @@ -128,6 +134,14 @@ pub async fn process_ppoi_message( .map_err(|e| AttestationError::BuildError(MessageError::FieldDerivations(e)))? as u64; + FREQUENT_SENDERS_COUNTER + .with_label_values(&[&radio_msg.graph_account]) + .inc(); + + LATEST_MESSAGE_TIMESTAMP + .with_label_values(&[&msg.identifier]) + .set(radio_msg.nonce as f64); + let blocks = remote_attestations .entry(msg.identifier.to_string()) .or_default(); @@ -174,6 +188,10 @@ pub async fn process_ppoi_message( let senders = combine_senders(blocks.entry(first_msg.payload.block_number).or_default()); active_indexers.set(senders.len().try_into().unwrap()); + let duration = start_time.elapsed(); + let average_time = duration.as_secs_f64() / messages.len() as f64; + AVERAGE_PROCESSING_TIME.set(average_time); + Ok(remote_attestations) } @@ -468,9 +486,17 @@ pub fn compare_attestations( .unwrap_or(std::cmp::Ordering::Equal) }); + let most_attested_poi = sorted_remote_attestations.last().cloned(); + + if let Some(attestation) = &most_attested_poi { + MAX_STAKE_POI + .with_label_values(&[ipfs_hash]) + .set(attestation.stake_weight as f64); + } + // Determine the comparison result based on the top attested remote PPOI let result_type = if let Some(local_att) = &local_attestation { - if let Some(most_attested) = sorted_remote_attestations.last() { + if let Some(most_attested) = &most_attested_poi { if most_attested.ppoi == local_att.ppoi { ComparisonResultType::Match } else {