From 641480bd80cd3c366f8a9f639d1402d18d999b8a Mon Sep 17 00:00:00 2001 From: Michael McGee Date: Thu, 30 Nov 2023 17:28:59 +0000 Subject: [PATCH] metrics: add prometheus endpoint A metrics tile is added and wired up to the metrics shared memory area, with a simple socket based HTTP server that serves the values in Prometheus text format. --- ffi/rust/firedancer-sys/src/tango/cnc.rs | 2 - ffi/rust/firedancer-sys/src/tango/fseq.rs | 7 - solana | 2 +- src/app/fdctl/Local.mk | 7 +- src/app/fdctl/config.c | 6 + src/app/fdctl/config.h | 3 + src/app/fdctl/config/default.toml | 12 +- src/app/fdctl/monitor/monitor.c | 48 +- src/app/fdctl/run/run1.c | 9 +- src/app/fdctl/run/tiles/fd_metric.c | 487 ++++++++++++++++++ src/app/fdctl/run/tiles/fd_quic.c | 31 +- .../run/tiles/generated/metric_seccomp.h | 74 +++ src/app/fdctl/run/tiles/metric.seccomppolicy | 32 ++ src/app/fdctl/run/tiles/tiles.c | 1 + src/app/fdctl/run/tiles/tiles.h | 2 + src/app/fdctl/topology.c | 10 +- src/app/fdctl/topology.h | 20 + src/disco/metrics/fd_metrics.c | 3 +- src/disco/metrics/fd_metrics.h | 165 +++++- src/disco/metrics/fd_metrics_base.h | 96 ++++ src/disco/metrics/gen_metrics.py | 192 +++++-- src/disco/metrics/generated/Local.mk | 2 + src/disco/metrics/generated/fd_metrics_all.c | 29 ++ src/disco/metrics/generated/fd_metrics_all.h | 169 +++++- src/disco/metrics/generated/fd_metrics_quic.c | 7 + src/disco/metrics/generated/fd_metrics_quic.h | 18 +- src/disco/metrics/metrics.xml | 90 ++-- src/disco/mux/fd_mux.c | 87 ++-- src/disco/mux/test_mux.c | 11 + src/disco/quic/test_quic_tile.c | 3 - src/disco/replay/fd_replay.c | 11 + src/tango/bench_frag_tx.c | 11 + src/tango/cnc/fd_cnc.h | 24 - src/tango/cnc/test_cnc.c | 3 - src/tango/fseq/fd_fseq.h | 38 -- src/tango/fseq/test_fseq.c | 8 - src/tango/test_frag_rx.c | 8 + src/tango/test_frag_tx.c | 11 + src/tango/test_meta_rx.c | 8 + src/tango/test_meta_tx.c | 11 + src/util/hist/fd_histf.h | 1 + 41 files changed, 1449 insertions(+), 310 deletions(-) create mode 100644 src/app/fdctl/run/tiles/fd_metric.c create mode 100644 src/app/fdctl/run/tiles/generated/metric_seccomp.h create mode 100644 src/app/fdctl/run/tiles/metric.seccomppolicy create mode 100644 src/disco/metrics/fd_metrics_base.h create mode 100644 src/disco/metrics/generated/Local.mk create mode 100644 src/disco/metrics/generated/fd_metrics_all.c create mode 100644 src/disco/metrics/generated/fd_metrics_quic.c diff --git a/ffi/rust/firedancer-sys/src/tango/cnc.rs b/ffi/rust/firedancer-sys/src/tango/cnc.rs index de7e5398ec..5359216cbf 100644 --- a/ffi/rust/firedancer-sys/src/tango/cnc.rs +++ b/ffi/rust/firedancer-sys/src/tango/cnc.rs @@ -23,8 +23,6 @@ pub use crate::gentango::{ fd_cstr_to_cnc_signal, FD_CNC_ALIGN, FD_CNC_APP_ALIGN, - FD_CNC_DIAG_BACKP_CNT, - FD_CNC_DIAG_IN_BACKP, FD_CNC_ERR_AGAIN, FD_CNC_ERR_FAIL, FD_CNC_ERR_INVAL, diff --git a/ffi/rust/firedancer-sys/src/tango/fseq.rs b/ffi/rust/firedancer-sys/src/tango/fseq.rs index 4a01ecaa28..24ae886961 100644 --- a/ffi/rust/firedancer-sys/src/tango/fseq.rs +++ b/ffi/rust/firedancer-sys/src/tango/fseq.rs @@ -13,12 +13,5 @@ pub use crate::gentango::{ FD_FSEQ_ALIGN, FD_FSEQ_APP_ALIGN, FD_FSEQ_APP_FOOTPRINT, - FD_FSEQ_DIAG_FILT_CNT, - FD_FSEQ_DIAG_FILT_SZ, - FD_FSEQ_DIAG_OVRNP_CNT, - FD_FSEQ_DIAG_OVRNR_CNT, - FD_FSEQ_DIAG_PUB_CNT, - FD_FSEQ_DIAG_PUB_SZ, - FD_FSEQ_DIAG_SLOW_CNT, FD_FSEQ_FOOTPRINT, }; diff --git a/solana b/solana index 66ee1f4969..b3900619d0 160000 --- a/solana +++ b/solana @@ -1 +1 @@ -Subproject commit 66ee1f4969cc7fd84793e3101701302f84634fb1 +Subproject commit b3900619d05574914dd8b5700b92e7d38120a984 diff --git a/src/app/fdctl/Local.mk b/src/app/fdctl/Local.mk index 12db0a89de..8661a8cc97 100644 --- a/src/app/fdctl/Local.mk +++ b/src/app/fdctl/Local.mk @@ -3,9 +3,9 @@ ifdef FD_HAS_ALLOCA ifdef FD_HAS_X86 ifdef FD_HAS_DOUBLE -.PHONY: fdctl cargo +.PHONY: fdctl cargo rust solana -$(call add-objs,main1 config caps utility topology keys ready mem spy help run/run run/tiles/tiles run/run1 run/run_solana run/tiles/tiles run/tiles/fd_net run/tiles/fd_netmux run/tiles/fd_dedup run/tiles/fd_pack run/tiles/fd_quic run/tiles/fd_verify run/tiles/fd_bank run/tiles/fd_shred run/tiles/fd_store monitor/monitor monitor/helper configure/configure configure/large_pages configure/sysctl configure/shmem configure/xdp configure/xdp_leftover configure/ethtool configure/workspace_leftover configure/workspace,fd_fdctl) +$(call add-objs,main1 config caps utility topology keys ready mem spy help run/run run/tiles/tiles run/run1 run/run_solana run/tiles/tiles run/tiles/fd_net run/tiles/fd_metric run/tiles/fd_netmux run/tiles/fd_dedup run/tiles/fd_pack run/tiles/fd_quic run/tiles/fd_verify run/tiles/fd_bank run/tiles/fd_shred run/tiles/fd_store monitor/monitor monitor/helper configure/configure configure/large_pages configure/sysctl configure/shmem configure/xdp configure/xdp_leftover configure/ethtool configure/workspace_leftover configure/workspace,fd_fdctl) $(call make-bin-rust,fdctl,main,fd_fdctl fd_disco fd_flamenco fd_ip fd_reedsol fd_ballet fd_tango fd_util fd_quic solana_validator) $(OBJDIR)/obj/app/fdctl/configure/xdp.o: src/tango/xdp/fd_xdp_redirect_prog.o $(OBJDIR)/obj/app/fdctl/config.o: src/app/fdctl/config/default.toml @@ -18,6 +18,7 @@ $(OBJDIR)/obj/app/fdctl/run/tiles/fd_pack.o: src/app/fdctl/run/tiles/generated/p $(OBJDIR)/obj/app/fdctl/run/tiles/fd_quic.o: src/app/fdctl/run/tiles/generated/quic_seccomp.h $(OBJDIR)/obj/app/fdctl/run/tiles/fd_shred.o: src/app/fdctl/run/tiles/generated/shred_seccomp.h $(OBJDIR)/obj/app/fdctl/run/tiles/fd_verify.o: src/app/fdctl/run/tiles/generated/verify_seccomp.h +$(OBJDIR)/obj/app/fdctl/run/tiles/fd_metric.o: src/app/fdctl/run/tiles/generated/metric_seccomp.h # Phony target to always rerun cargo build ... it will detect if anything # changed on the library side. @@ -56,6 +57,8 @@ $(OBJDIR)/bin/solana: solana/target/$(RUST_PROFILE)/solana rust: $(OBJDIR)/bin/solana +solana: $(OBJDIR)/bin/solana + endif endif endif diff --git a/src/app/fdctl/config.c b/src/app/fdctl/config.c index bce3024d79..b4bf1074ca 100644 --- a/src/app/fdctl/config.c +++ b/src/app/fdctl/config.c @@ -234,6 +234,8 @@ static int parse_key_value( config_t * config, ENTRY_UINT ( ., tiles.shred, max_pending_shred_sets ); ENTRY_USHORT( ., tiles.shred, shred_listen_port ); + ENTRY_USHORT( ., tiles.metric, prometheus_listen_port ); + ENTRY_BOOL ( ., development, sandbox ); ENTRY_BOOL ( ., development, no_solana_labs ); @@ -554,6 +556,7 @@ topo_initialize( config_t * config ) { TILE( config->layout.bank_tile_count, FD_TOPO_TILE_KIND_BANK, FD_TOPO_WKSP_KIND_BANK, ULONG_MAX ); TILE( 1, FD_TOPO_TILE_KIND_SHRED, FD_TOPO_WKSP_KIND_SHRED, fd_topo_find_link( topo, FD_TOPO_LINK_KIND_SHRED_TO_STORE, i ) ); TILE( 1, FD_TOPO_TILE_KIND_STORE, FD_TOPO_WKSP_KIND_STORE, ULONG_MAX ); + TILE( 1, FD_TOPO_TILE_KIND_METRIC, FD_TOPO_WKSP_KIND_METRIC, ULONG_MAX ); topo->tile_cnt = tile_cnt; @@ -982,6 +985,9 @@ config_parse( int * pargc, break; case FD_TOPO_TILE_KIND_STORE: break; + case FD_TOPO_TILE_KIND_METRIC: + tile->metric.prometheus_listen_port = config->tiles.metric.prometheus_listen_port; + break; default: FD_LOG_ERR(( "unknown tile kind %lu", tile->kind )); } diff --git a/src/app/fdctl/config.h b/src/app/fdctl/config.h index b4ca6bf1a6..cf9f2e1901 100644 --- a/src/app/fdctl/config.h +++ b/src/app/fdctl/config.h @@ -188,6 +188,9 @@ typedef struct { ushort shred_listen_port; } shred; + struct { + ushort prometheus_listen_port; + } metric; } tiles; } config_t; diff --git a/src/app/fdctl/config/default.toml b/src/app/fdctl/config/default.toml index b1550d701d..5a9e69f2e2 100644 --- a/src/app/fdctl/config/default.toml +++ b/src/app/fdctl/config/default.toml @@ -477,7 +477,7 @@ dynamic_port_range = "8900-9000" # # It is suggested to use all available CPU cores for Firedancer, so # that the Solana network can run as fast as possible. - affinity = "0-15" + affinity = "0-16" # How many net tiles to run. Each networking tile will service # exactly one queue from a network device being listened to. If @@ -696,6 +696,7 @@ dynamic_port_range = "8900-9000" # the client nor the server has sent any packet to the other for # a period of time. Once this timeout is reached the connection # will be terminated. + # # An idle connection will be terminated if it remains idle longer than # some threshold. "idle_timeout_millis" represents this threshold in # milliseconds @@ -784,6 +785,15 @@ dynamic_port_range = "8900-9000" # this one. shred_listen_port = 8003 + # The metric tile receives metrics updates published from the rest + # of the tiles and serves them via. a Prometheus compatible HTTP + # endpoint. + [tiles.metric] + # The port to listen on for HTTP request for Prometheus metrics. + # Firedancer serves metrics at a URI like + # 127.0.0.1:7999/metrics + prometheus_listen_port = 7999 + # These options can be useful for development, but should not be used # when connecting to a live cluster, as they may cause the validator to # be unstable or have degraded performance or security. The program diff --git a/src/app/fdctl/monitor/monitor.c b/src/app/fdctl/monitor/monitor.c index 109d53003c..34e53be1e7 100644 --- a/src/app/fdctl/monitor/monitor.c +++ b/src/app/fdctl/monitor/monitor.c @@ -100,15 +100,15 @@ tile_snap( tile_snap_t * snap_cur, /* Snapshot for each tile, indexed [0,til FD_COMPILER_MFENCE(); snap->pid = FD_MGAUGE_GET( TILE, PID ); snap->in_backp = FD_MGAUGE_GET( STEM, IN_BACKPRESSURE ); - snap->backp_cnt = FD_MCNT_GET( STEM, BACKPRESSURE ); - snap->housekeeping_ticks = FD_MHIST_SUM( STEM, LOOP_DURATION_HOUSEKEEPING ); - snap->backpressure_ticks = FD_MHIST_SUM( STEM, LOOP_DURATION_BACKPRESSURE ); - snap->caught_up_ticks = FD_MHIST_SUM( STEM, LOOP_DURATION_CAUGHT_UP ); - snap->overrun_polling_ticks = FD_MHIST_SUM( STEM, LOOP_DURATION_OVERRUN_POLLING ); - snap->overrun_reading_ticks = FD_MHIST_SUM( STEM, LOOP_DURATION_OVERRUN_READING ); - snap->filter_before_frag_ticks = FD_MHIST_SUM( STEM, LOOP_DURATION_FILTER_BEFORE_FRAGMENT ); - snap->filter_after_frag_ticks = FD_MHIST_SUM( STEM, LOOP_DURATION_FILTER_AFTER_FRAGMENT ); - snap->finish_ticks = FD_MHIST_SUM( STEM, LOOP_DURATION_FINISH ); + snap->backp_cnt = FD_MCNT_GET( STEM, BACKPRESSURE_COUNT ); + snap->housekeeping_ticks = FD_MHIST_SUM( STEM, LOOP_HOUSEKEEPING_DURATION_SECONDS ); + snap->backpressure_ticks = FD_MHIST_SUM( STEM, LOOP_BACKPRESSURE_DURATION_SECONDS ); + snap->caught_up_ticks = FD_MHIST_SUM( STEM, LOOP_CAUGHT_UP_DURATION_SECONDS ); + snap->overrun_polling_ticks = FD_MHIST_SUM( STEM, LOOP_OVERRUN_POLLING_DURATION_SECONDS ); + snap->overrun_reading_ticks = FD_MHIST_SUM( STEM, LOOP_OVERRUN_READING_DURATION_SECONDS ); + snap->filter_before_frag_ticks = FD_MHIST_SUM( STEM, LOOP_FILTER_BEFORE_FRAGMENT_DURATION_SECONDS ); + snap->filter_after_frag_ticks = FD_MHIST_SUM( STEM, LOOP_FILTER_AFTER_FRAGMENT_DURATION_SECONDS ); + snap->finish_ticks = FD_MHIST_SUM( STEM, LOOP_FINISH_DURATION_SECONDS ); FD_COMPILER_MFENCE(); } } @@ -126,15 +126,29 @@ link_snap( link_snap_t * snap_cur, ulong const * fseq = topo->tiles[ tile_idx ].in_link_fseq[ in_idx ]; snap->fseq_seq = fd_fseq_query( fseq ); - ulong const * fseq_diag = (ulong const *)fd_fseq_app_laddr_const( fseq ); + + ulong const * in_metrics = (ulong const *)fd_metrics_link_in( topo->tiles[ tile_idx ].metrics, in_idx ); + + fd_topo_link_t * link = &topo->links[ topo->tiles[ tile_idx ].in_link_id[ in_idx ] ]; + ulong producer_id = fd_topo_find_link_producer( topo, link ); + ulong const * out_metrics = NULL; + if( FD_LIKELY( producer_id!=ULONG_MAX && topo->tiles[ tile_idx ].in_link_reliable[ in_idx ] ) ) { + fd_topo_tile_t * producer = &topo->tiles[ producer_id ]; + ulong out_idx; + for( out_idx=0UL; out_idxout_cnt; out_idx++ ) { + if( producer->out_link_id[ out_idx ]==link->id ) break; + } + out_metrics = fd_metrics_link_out( producer->metrics, out_idx ); + } FD_COMPILER_MFENCE(); - snap->fseq_diag_tot_cnt = fseq_diag[ FD_FSEQ_DIAG_PUB_CNT ]; - snap->fseq_diag_tot_sz = fseq_diag[ FD_FSEQ_DIAG_PUB_SZ ]; - snap->fseq_diag_filt_cnt = fseq_diag[ FD_FSEQ_DIAG_FILT_CNT ]; - snap->fseq_diag_filt_sz = fseq_diag[ FD_FSEQ_DIAG_FILT_SZ ]; - snap->fseq_diag_ovrnp_cnt = fseq_diag[ FD_FSEQ_DIAG_OVRNP_CNT ]; - snap->fseq_diag_ovrnr_cnt = fseq_diag[ FD_FSEQ_DIAG_OVRNR_CNT ]; - snap->fseq_diag_slow_cnt = fseq_diag[ FD_FSEQ_DIAG_SLOW_CNT ]; + snap->fseq_diag_tot_cnt = in_metrics[ FD_METRICS_COUNTER_LINK_PUBLISHED_COUNT_OFF ]; + snap->fseq_diag_tot_sz = in_metrics[ FD_METRICS_COUNTER_LINK_PUBLISHED_SIZE_BYTES_OFF ]; + snap->fseq_diag_filt_cnt = in_metrics[ FD_METRICS_COUNTER_LINK_FILTERED_COUNT_OFF ]; + snap->fseq_diag_filt_sz = in_metrics[ FD_METRICS_COUNTER_LINK_FILTERED_SIZE_BYTES_OFF ]; + snap->fseq_diag_ovrnp_cnt = in_metrics[ FD_METRICS_COUNTER_LINK_OVERRUN_POLLING_COUNT_OFF ]; + snap->fseq_diag_ovrnr_cnt = in_metrics[ FD_METRICS_COUNTER_LINK_OVERRUN_READING_COUNT_OFF ]; + + snap->fseq_diag_slow_cnt = out_metrics[ FD_METRICS_COUNTER_LINK_SLOW_COUNT_OFF ]; FD_COMPILER_MFENCE(); snap->fseq_diag_tot_cnt += snap->fseq_diag_filt_cnt; snap->fseq_diag_tot_sz += snap->fseq_diag_filt_sz; diff --git a/src/app/fdctl/run/run1.c b/src/app/fdctl/run/run1.c index 192f5bf3c5..abeecf5e8c 100644 --- a/src/app/fdctl/run/run1.c +++ b/src/app/fdctl/run/run1.c @@ -56,11 +56,6 @@ tile_main( void * _args ) { &fd_tile_private_stack0, &fd_tile_private_stack1 ); FD_LOG_NOTICE(( "booting tile %s:%lu pid:%lu", fd_topo_tile_kind_str( tile->kind ), tile->kind_id, fd_log_group_id() )); - /* calling fd_tempo_tick_per_ns requires nanosleep, it is cached with - a FD_ONCE. We do this for all tiles before sandboxing so that we - don't need to allow the nanosleep syscall. */ - fd_tempo_tick_per_ns( NULL ); - /* preload shared memory before sandboxing, so it is already mapped */ fd_topo_join_tile_workspaces( args->config->name, &args->config->topo, @@ -96,7 +91,7 @@ tile_main( void * _args ) { fd_sandbox( args->config->development.sandbox, args->config->uid, args->config->gid, - 0UL, + config->rlimit_file_cnt, allow_fds_cnt+allow_fds_offset, allow_fds, seccomp_filter_cnt, @@ -170,7 +165,7 @@ tile_main( void * _args ) { fd_alloca( FD_MUX_TILE_SCRATCH_ALIGN, FD_MUX_TILE_SCRATCH_FOOTPRINT( tile->in_cnt, out_cnt_reliable ) ), ctx, &callbacks ); - + FD_LOG_ERR(( "tile run loop returned" )); return 0; } diff --git a/src/app/fdctl/run/tiles/fd_metric.c b/src/app/fdctl/run/tiles/fd_metric.c new file mode 100644 index 0000000000..013cd1a9d2 --- /dev/null +++ b/src/app/fdctl/run/tiles/fd_metric.c @@ -0,0 +1,487 @@ +#include "tiles.h" + +#include "generated/metric_seccomp.h" + +#include "../../../../ballet/http/picohttpparser.h" + +#include +#include +#include +#include +#include +#include +#include + +#define MAX_CONNS 128 + +/* The metric tile reads metrics updates from other tiles, maybe + presents them on a local HTTP endpoint, and maybe uploads them to + a server InfluxDB endpoint. */ + +typedef struct { + ulong bytes_read; + char input[ 1024 ]; + + ulong output_len; + char output[ 1048576 ]; + ulong bytes_written; +} fd_metric_connection_t; + +typedef struct { + fd_topo_t * topo; + + int socket_fd; + + fd_metric_connection_t conns[ MAX_CONNS ]; + struct pollfd fds[ MAX_CONNS+1 ]; + + ulong conn_id; +} fd_metric_ctx_t; + +FD_FN_CONST static inline ulong +scratch_align( void ) { + return 128UL; +} + +FD_FN_PURE static inline ulong +scratch_footprint( fd_topo_tile_t * tile ) { + (void)tile; + ulong l = FD_LAYOUT_INIT; + l = FD_LAYOUT_APPEND( l, alignof( fd_metric_ctx_t ), sizeof( fd_metric_ctx_t ) ); + return FD_LAYOUT_FINI( l, scratch_align() ); +} + +FD_FN_CONST static inline void * +mux_ctx( void * scratch ) { + return (void*)fd_ulong_align_up( (ulong)scratch, alignof( fd_metric_ctx_t ) ); +} + +static void +close_conn( fd_metric_ctx_t * ctx, + ulong idx ) { + if( FD_UNLIKELY( -1==close( ctx->fds[ idx ].fd ) ) ) FD_LOG_ERR(( "close failed (%i-%s)", errno, strerror( errno ) )); + ctx->fds[ idx ].fd = -1; +} + +static void +accept_conns( fd_metric_ctx_t * ctx ) { + for(;;) { + int fd = accept( ctx->socket_fd, NULL, NULL ); + + if( FD_UNLIKELY( -1==fd ) ) { + if( FD_LIKELY( EAGAIN==errno ) ) break; + else if( FD_LIKELY( ENETDOWN==errno || EPROTO==errno || ENOPROTOOPT==errno || EHOSTDOWN==errno || + ENONET==errno || EHOSTUNREACH==errno || EOPNOTSUPP==errno || ENETUNREACH==errno ) ) continue; + else FD_LOG_ERR(( "accept failed (%i-%s)", errno, strerror( errno ) )); + } + + /* Just evict oldest connection if it's still alive, they were too slow. */ + if( FD_UNLIKELY( -1!=ctx->fds[ ctx->conn_id ].fd ) ) close_conn( ctx, ctx->conn_id ); + + ctx->fds[ ctx->conn_id ].fd = fd; + ctx->conns[ ctx->conn_id ] = (fd_metric_connection_t){ + .bytes_read = 0UL, + .bytes_written = 0UL, + .output_len = 0UL, + }; + ctx->conn_id = (ctx->conn_id + 1) % MAX_CONNS; + } +} + +#define PRINT( ... ) (__extension__({ \ + int n = snprintf( *out, *out_len, __VA_ARGS__ ); \ + if( FD_UNLIKELY( n<0 ) ) return -1; \ + if( FD_UNLIKELY( (ulong)n>=*out_len ) ) return -1; \ + *out += n; *out_len -= (ulong)n; \ + n; \ + })) + +#define PRINT_LINK_IN (0) +#define PRINT_LINK_OUT (1) +#define PRINT_TILE (2) + +static ulong +find_producer_out_idx( fd_topo_t * topo, + fd_topo_tile_t * producer, + fd_topo_tile_t * consumer, + ulong consumer_in_idx ) { + /* This finds all reliable consumers of the producers primary output, + and then returns the position of the consumer (specified by tile + and index of the in of that tile) in that list. The list ordering + is not important, except that it matches the ordering of fseqs + provided to the mux tile, so that metrics written for each link + index are retrieved at the same index here. + + This is why we only count reliable links, because the mux tile only + looks at and writes producer side diagnostics (is the link slow) + for reliable links. */ + + ulong count = 0UL; + for( ulong i=0; itile_cnt; i++ ) { + fd_topo_tile_t * consumer_tile = &topo->tiles[ i ]; + for( ulong j=0; jin_cnt; j++ ) { + if( FD_UNLIKELY( consumer_tile->in_link_id[ j ] == producer->out_link_id_primary && consumer_tile->in_link_reliable[ j ] ) ) { + if( FD_UNLIKELY( consumer==consumer_tile && consumer_in_idx==j ) ) return count; + count++; + } + } + } + return ULONG_MAX; +} + +static long +prometheus_print1( fd_topo_t * topo, + char ** out, + ulong * out_len, + ulong tile_kind, + ulong metrics_cnt, + fd_metrics_meta_t const * metrics, + int print_mode ) { + for( ulong i=0; iname, metric->desc, metric->name, fd_metrics_meta_type_str( metric ) ); + + for( ulong j=0; jtile_cnt; j++ ) { + fd_topo_tile_t * tile = &topo->tiles[ j ]; + if( FD_LIKELY( tile_kind!=ULONG_MAX && tile_kind!=tile->kind ) ) continue; + + if( FD_LIKELY( metric->type==FD_METRICS_TYPE_COUNTER || metric->type==FD_METRICS_TYPE_GAUGE ) ) { + if( FD_LIKELY( print_mode==PRINT_TILE ) ) { + ulong value = *(fd_metrics_tile( tile->metrics ) + metric->offset); + PRINT( "%s{kind=\"%s\",kind_id=\"%lu\"} %lu\n", metric->name, fd_topo_tile_kind_str( tile->kind ), tile->kind_id, value ); + } else { + if( FD_LIKELY( print_mode==PRINT_LINK_IN ) ) { + for( ulong k=0; kin_cnt; k++ ) { + fd_topo_link_t * link = &topo->links[ tile->in_link_id[ k ] ]; + ulong value = *(fd_metrics_link_in( tile->metrics, k ) + metric->offset ); + PRINT( "%s{kind=\"%s\",kind_id=\"%lu\",link_kind=\"%s\",link_kind_id=\"%lu\"} %lu\n", metric->name, fd_topo_tile_kind_str( tile->kind ), tile->kind_id, fd_topo_link_kind_str( link->kind ), link->kind_id, value ); + } + } else if( FD_LIKELY( print_mode==PRINT_LINK_OUT ) ) { + for( ulong k=0; kin_cnt; k++ ) { + fd_topo_link_t * link = &topo->links[ tile->in_link_id[ k ] ]; + if( FD_UNLIKELY( !tile->in_link_reliable[ k ] ) ) continue; + + ulong producer_idx = fd_topo_find_link_producer( topo, link ); + if( FD_UNLIKELY( producer_idx==ULONG_MAX ) ) continue; + + fd_topo_tile_t * producer = &topo->tiles[ producer_idx ]; + if( FD_UNLIKELY( producer->out_link_id_primary!=link->id ) ) continue; + + /* This index needs to line up with what the mux tile thinks the index is + of that tile in its consumer list. */ + ulong producer_out_idx = find_producer_out_idx( topo, producer, tile, k ); + ulong value = *(fd_metrics_link_out( producer->metrics, producer_out_idx ) + metric->offset ); + + PRINT( "%s{kind=\"%s\",kind_id=\"%lu\",link_kind=\"%s\",link_kind_id=\"%lu\"} %lu\n", metric->name, fd_topo_tile_kind_str( tile->kind ), tile->kind_id, fd_topo_link_kind_str( link->kind ), link->kind_id, value ); + } + } + } + } else if( FD_LIKELY( metric->type==FD_METRICS_TYPE_HISTOGRAM ) ) { + fd_histf_t hist[1]; + if( FD_LIKELY( metric->histogram.converter==FD_METRICS_CONVERTER_SECONDS ) ) + FD_TEST( fd_histf_new( hist, fd_metrics_convert_seconds_to_ticks( metric->histogram.seconds.min ), fd_metrics_convert_seconds_to_ticks ( metric->histogram.seconds.max ) ) ); + else if( FD_LIKELY( metric->histogram.converter==FD_METRICS_CONVERTER_NONE ) ) + FD_TEST( fd_histf_new( hist, metric->histogram.none.min, metric->histogram.none.max ) ); + else FD_LOG_ERR(( "unknown histogram converter %i", metric->histogram.converter )); + + ulong value = 0; + char value_str[ 64 ]; + for( ulong k=0; kmetrics ) + metric->offset + k); + + char * le; + char le_str[ 64 ]; + if( FD_UNLIKELY( k==FD_HISTF_BUCKET_CNT-1UL ) ) le = "+Inf"; + else { + ulong edge = fd_histf_right( hist, k ); + if( FD_LIKELY( metric->histogram.converter==FD_METRICS_CONVERTER_SECONDS ) ) { + double edgef = fd_metrics_convert_ticks_to_seconds( edge-1 ); + snprintf1( le_str, sizeof( le_str ), "%.17g", edgef ); + } else { + snprintf1( le_str, sizeof( le_str ), "%lu", edge-1 ); + } + le = le_str; + } + + snprintf1( value_str, sizeof( value_str ), "%lu", value ); + PRINT( "%s_bucket{kind=\"%s\",kind_id=\"%lu\",le=\"%s\"} %s\n", metric->name, fd_topo_tile_kind_str( tile->kind ), tile->kind_id, le, value_str ); + } + + char sum_str[ 64 ]; + if( FD_LIKELY( metric->histogram.converter==FD_METRICS_CONVERTER_SECONDS ) ) { + double sumf = fd_metrics_convert_ticks_to_seconds( *(fd_metrics_tile( tile->metrics ) + metric->offset + FD_HISTF_BUCKET_CNT) ); + snprintf1( sum_str, sizeof( sum_str ), "%.17g", sumf ); + } else { + snprintf1( sum_str, sizeof( sum_str ), "%lu", *(fd_metrics_tile( tile->metrics ) + metric->offset + FD_HISTF_BUCKET_CNT) ); + } + + PRINT( "%s_sum{kind=\"%s\",kind_id=\"%lu\"} %s\n", metric->name, fd_topo_tile_kind_str( tile->kind ), tile->kind_id, sum_str ); + PRINT( "%s_count{kind=\"%s\",kind_id=\"%lu\"} %s\n", metric->name, fd_topo_tile_kind_str( tile->kind ), tile->kind_id, value_str ); + } + } + + if( FD_LIKELY( i!=metrics_cnt-1 ) ) PRINT( "\n" ); + } + + return 0; +} + +static long +prometheus_print( fd_topo_t * topo, + char ** out, + ulong * out_len ) { + ulong start_len = *out_len; + + PRINT( "HTTP/1.1 200 OK\r\nContent-Length: " ); + char * content_len = *out; + + /* Stuff a bunch of whitespace so we can replace with the real Content-Length later. + Enough whitespace to print ULONG_MAX. */ + PRINT( " \r\nContent-Type: text/plain; version=0.0.4\r\n\r\n" ); + ulong content_start = (ulong)(start_len - *out_len); + + long result = prometheus_print1( topo, out, out_len, ULONG_MAX, FD_METRICS_ALL_TOTAL, FD_METRICS_ALL, PRINT_TILE ); + if( FD_UNLIKELY( result<0 ) ) return result; + PRINT( "\r\n" ); + result = prometheus_print1( topo, out, out_len, ULONG_MAX, FD_METRICS_ALL_LINK_IN_TOTAL, FD_METRICS_ALL_LINK_IN, PRINT_LINK_IN ); + if( FD_UNLIKELY( result<0 ) ) return result; + PRINT( "\r\n" ); + result = prometheus_print1( topo, out, out_len, ULONG_MAX, FD_METRICS_ALL_LINK_OUT_TOTAL, FD_METRICS_ALL_LINK_OUT, PRINT_LINK_OUT ); + if( FD_UNLIKELY( result<0 ) ) return result; + PRINT( "\r\n" ); + result = prometheus_print1( topo, out, out_len, FD_TOPO_TILE_KIND_QUIC, FD_METRICS_QUIC_TOTAL, FD_METRICS_QUIC, PRINT_TILE ); + if( FD_UNLIKELY( result<0 ) ) return result; + + /* Now backfill Content-Length */ + int printed = snprintf( content_len, 21, "%lu", start_len - *out_len - content_start ); + if( FD_UNLIKELY( printed<0 ) ) return -1; + if( FD_UNLIKELY( (ulong)printed>=21 ) ) return -1; + content_len[ printed ] = ' '; /* Clear NUL terminator */ + + return (long)(start_len - *out_len); +} + +static long +http_404_print( char ** out, + ulong * out_len ) { + ulong start_len = *out_len; + PRINT( "HTTP/1.1 404 Not Found\r\nContent-Length: 0\r\n\r\n" ); + return (long)(start_len - *out_len); +} + +static long +http_400_print( char ** out, + ulong * out_len ) { + ulong start_len = *out_len; + PRINT( "HTTP/1.1 400 Internal Server Error\r\nContent-Length: 0\r\n\r\n" ); + return (long)(start_len - *out_len); +} + + +static void +read_conn( fd_metric_ctx_t * ctx, + ulong idx ) { + fd_metric_connection_t * conn = &ctx->conns[ idx ]; + if( FD_UNLIKELY( conn->bytes_read==ULONG_MAX ) ) return; /* Connection now in write mode, no need to read more. */ + + long sz = read( ctx->fds[ idx ].fd, conn->input + conn->bytes_read, sizeof( conn->input ) - conn->bytes_read ); + if( FD_UNLIKELY( -1==sz && errno==EAGAIN ) ) return; /* No data to read, continue. */ + else if( FD_UNLIKELY( -1==sz ) ) FD_LOG_ERR(( "read failed (%i-%s)", errno, strerror( errno ) )); /* Unexpected programmer error, abort */ + else if( FD_UNLIKELY( !sz ) ) { + close_conn( ctx, idx ); /* EOF, peer closed connection */ + return; + } + + /* New data was read... process it */ + conn->bytes_read += (ulong)sz; + if( FD_UNLIKELY( conn->bytes_read == sizeof( conn->input ) ) ) { + close_conn( ctx, idx ); /* Input buffer full, request too long, terminate connection */ + return; + } + + char const * method; + ulong method_len; + char const * path; + ulong path_len; + int minor_version; + struct phr_header headers[ 32 ]; + ulong num_headers = 32UL; + int result = phr_parse_request( conn->input, + conn->bytes_read, + &method, &method_len, + &path, &path_len, + &minor_version, + headers, &num_headers, + conn->bytes_read - (ulong)sz ); + if( FD_UNLIKELY( -2==result ) ) return; /* Request still partial, wait for more data */ + else if( FD_UNLIKELY( -1==result ) ) { + /* Invalid request, terminate connection */ + close_conn( ctx, idx ); /* Malformed HTTP request, terminate connection */ + return; + } + + char * out = conn->output; + ulong out_len = sizeof( conn->output ); + + /* Well formed request, process it */ + int valid = method_len==3 && !strncmp( method, "GET", method_len ) && path_len==8 && !strncmp( path, "/metrics", path_len ); + long printed = 0; + if( FD_UNLIKELY( !valid ) ) printed = http_404_print( &out, &out_len ); + else { + printed = prometheus_print( ctx->topo, &out, &out_len ); + if( FD_UNLIKELY( -1==printed ) ) { + FD_LOG_WARNING(( "unable to print metrics to HTTP endpoint" )); + printed = http_400_print( &out, &out_len ); + } + } + + if( FD_UNLIKELY( -1==printed ) ) { + FD_LOG_WARNING(( "internal server error" )); + close_conn( ctx, idx ); + return; + } + + conn->bytes_read = ULONG_MAX; /* Mark connection as ready to write, no longer readable. */ + conn->output_len = (ulong)printed; + conn->bytes_written = 0UL; +} + +static void +write_conn( fd_metric_ctx_t * ctx, + ulong idx ) { + fd_metric_connection_t * conn = &ctx->conns[ idx ]; + if( FD_UNLIKELY( conn->bytes_read!=ULONG_MAX ) ) return; /* No data staged for write yet. */ + + long sz = write( ctx->fds[ idx ].fd, conn->output + conn->bytes_written, conn->output_len - conn->bytes_written ); + if( FD_UNLIKELY( -1==sz && (errno==EAGAIN || errno==EINTR) ) ) return; /* No data to write, continue. */ + if( FD_UNLIKELY( -1==sz && errno==EPIPE ) ) { + close_conn( ctx, idx ); /* Peer closed connection */ + return; + } + if( FD_UNLIKELY( -1==sz ) ) FD_LOG_ERR(( "write failed (%i-%s)", errno, strerror( errno ) )); /* Unexpected programmer error, abort */ + + conn->bytes_written += (ulong)sz; + if( FD_UNLIKELY( conn->bytes_written==conn->output_len ) ) { + close_conn( ctx, idx ); /* All data written, close connection gracefully. */ + } +} + +static void +before_credit( void * _ctx, + fd_mux_context_t * mux ) { + (void)mux; + + fd_metric_ctx_t * ctx = (fd_metric_ctx_t *)_ctx; + + int nfds = poll( ctx->fds, MAX_CONNS+1, 0 ); + if( FD_UNLIKELY( 0==nfds ) ) return; + else if( FD_UNLIKELY( -1==nfds && errno==EINTR ) ) return; + else if( FD_UNLIKELY( -1==nfds ) ) FD_LOG_ERR(( "poll failed (%i-%s)", errno, strerror( errno ) )); + + /* Poll existing connections for new data. */ + for( ulong i=0; ifds[ i ].fd ) ) continue; + if( FD_UNLIKELY( i==MAX_CONNS ) ) { + accept_conns( ctx ); + } else { + if( FD_LIKELY( ctx->fds[ i ].revents & POLLIN ) ) read_conn( ctx, i ); + if( FD_LIKELY( ctx->fds[ i ].revents & POLLOUT ) ) write_conn( ctx, i ); + /* No need to handle POLLHUP, read() will return 0 soon enough. */ + } + } +} + +static void +privileged_init( fd_topo_t * topo, + fd_topo_tile_t * tile, + void * scratch ) { + (void)topo; + + FD_SCRATCH_ALLOC_INIT( l, scratch ); + fd_metric_ctx_t * ctx = FD_SCRATCH_ALLOC_APPEND( l, alignof( fd_metric_ctx_t ), sizeof( fd_metric_ctx_t ) ); + + int sockfd = socket( AF_INET, SOCK_STREAM | SOCK_NONBLOCK, 0 ); + if( FD_UNLIKELY( -1==sockfd ) ) FD_LOG_ERR(( "socket failed (%i-%s)", errno, strerror( errno ) )); + + int optval = 1; + if( FD_UNLIKELY( -1==setsockopt( sockfd, SOL_SOCKET, SO_REUSEADDR, &optval, sizeof( optval ) ) ) ) + FD_LOG_ERR(( "setsockopt failed (%i-%s)", errno, strerror( errno ) )); + + struct sockaddr_in addr = { + .sin_family = AF_INET, + .sin_port = fd_ushort_bswap( tile->metric.prometheus_listen_port ), + .sin_addr.s_addr = INADDR_ANY, + }; + + if( FD_UNLIKELY( -1==bind( sockfd, fd_type_pun( &addr ), sizeof( addr ) ) ) ) FD_LOG_ERR(( "bind failed (%i-%s)", errno, strerror( errno ) )); + if( FD_UNLIKELY( -1==listen( sockfd, 128 ) ) ) FD_LOG_ERR(( "listen failed (%i-%s)", errno, strerror( errno ) )); + + ctx->socket_fd = sockfd; +} + +static void +unprivileged_init( fd_topo_t * topo, + fd_topo_tile_t * tile, + void * scratch ) { + FD_SCRATCH_ALLOC_INIT( l, scratch ); + fd_metric_ctx_t * ctx = FD_SCRATCH_ALLOC_APPEND( l, alignof( fd_metric_ctx_t ), sizeof( fd_metric_ctx_t ) ); + + ctx->topo = topo; + + ctx->conn_id = 0; + for( ulong i=0; ifds[ i ].fd = -1; + ctx->fds[ i ].events = POLLIN | POLLOUT; + } + + ctx->fds[ MAX_CONNS ].fd = ctx->socket_fd; + ctx->fds[ MAX_CONNS ].events = POLLIN | POLLOUT; + + ulong scratch_top = FD_SCRATCH_ALLOC_FINI( l, 1UL ); + if( FD_UNLIKELY( scratch_top > (ulong)scratch + scratch_footprint( tile ) ) ) + FD_LOG_ERR(( "scratch overflow %lu %lu %lu", scratch_top - (ulong)scratch - scratch_footprint( tile ), scratch_top, (ulong)scratch + scratch_footprint( tile ) )); + + FD_LOG_NOTICE(( "Prometheus metrics endpoint listening on port %u", tile->metric.prometheus_listen_port )); +} + +static ulong +populate_allowed_seccomp( void * scratch, + ulong out_cnt, + struct sock_filter * out ) { + FD_SCRATCH_ALLOC_INIT( l, scratch ); + fd_metric_ctx_t * ctx = FD_SCRATCH_ALLOC_APPEND( l, alignof( fd_metric_ctx_t ), sizeof( fd_metric_ctx_t ) ); + + populate_sock_filter_policy_metric( out_cnt, out, (uint)fd_log_private_logfile_fd(), (uint)ctx->socket_fd ); + return sock_filter_policy_metric_instr_cnt; +} + +static ulong +populate_allowed_fds( void * scratch, + ulong out_fds_cnt, + int * out_fds ) { + FD_SCRATCH_ALLOC_INIT( l, scratch ); + fd_metric_ctx_t * ctx = FD_SCRATCH_ALLOC_APPEND( l, alignof( fd_metric_ctx_t ), sizeof( fd_metric_ctx_t ) ); + + if( FD_UNLIKELY( out_fds_cnt<3 ) ) FD_LOG_ERR(( "out_fds_cnt %lu", out_fds_cnt )); + + ulong out_cnt = 0; + out_fds[ out_cnt++ ] = 2; /* stderr */ + if( FD_LIKELY( -1!=fd_log_private_logfile_fd() ) ) + out_fds[ out_cnt++ ] = fd_log_private_logfile_fd(); /* logfile */ + out_fds[ out_cnt++ ] = ctx->socket_fd; /* listen socket */ + return out_cnt; +} + +fd_tile_config_t fd_tile_metric = { + .mux_flags = FD_MUX_FLAG_MANUAL_PUBLISH | FD_MUX_FLAG_COPY, + .burst = 1UL, + .rlimit_file_cnt = MAX_CONNS+1, + .mux_ctx = mux_ctx, + .mux_before_credit = before_credit, + .populate_allowed_seccomp = populate_allowed_seccomp, + .populate_allowed_fds = populate_allowed_fds, + .scratch_align = scratch_align, + .scratch_footprint = scratch_footprint, + .privileged_init = privileged_init, + .unprivileged_init = unprivileged_init, +}; diff --git a/src/app/fdctl/run/tiles/fd_quic.c b/src/app/fdctl/run/tiles/fd_quic.c index c79e9e833d..b70e196463 100644 --- a/src/app/fdctl/run/tiles/fd_quic.c +++ b/src/app/fdctl/run/tiles/fd_quic.c @@ -32,24 +32,7 @@ packets being received by net tiles and forwarded on via. a mux (multiplexer). An arbitrary number of QUIC tiles can be run, and these will round-robin packets from the networking queues based on - the source IP address. - - An fd_quic_tile will use the cnc application region to accumulate the - following tile specific counters: - - TPU_CONN_LIVE_CNT is the number of currently open QUIC conns - - TPU_CONN_SEQ is the sequence number of the last QUIC conn - opened - - As such, the cnc app region must be at least 64B in size. - - Except for IN_BACKP, none of the diagnostics are cleared at tile - startup (as such that they can be accumulated over multiple runs). - Clearing is up to monitoring scripts. */ - -#define FD_QUIC_CNC_DIAG_TPU_CONN_LIVE_CNT (6UL) -#define FD_QUIC_CNC_DIAG_TPU_CONN_SEQ (7UL) + the source IP address. */ typedef struct { fd_tpu_reasm_t * reasm; @@ -280,8 +263,8 @@ static inline void metrics_write( void * _ctx ) { fd_quic_ctx_t * ctx = (fd_quic_ctx_t *)_ctx; - FD_MGAUGE_SET( QUIC, ACTIVE_CONNECTIONS, ctx->conn_cnt ); - FD_MGAUGE_SET( QUIC, TOTAL_CONNECTIONS, ctx->conn_seq ); + FD_MGAUGE_SET( QUIC, CONNECTIONS_ACTIVE_COUNT, ctx->conn_cnt ); + FD_MCNT_SET( QUIC, CONNECTIONS_TOTAL_COUNT, ctx->conn_seq ); } static void @@ -299,7 +282,7 @@ before_frag( void * _ctx, ulong src_ip_addr = fd_disco_netmux_sig_ip_addr( sig ); ushort src_tile = fd_disco_netmux_sig_src_tile( sig ); - if( FD_UNLIKELY( src_tile != SRC_TILE_NET ) ) { + if( FD_UNLIKELY( src_tile!=SRC_TILE_NET ) ) { *opt_filter = 1; return; } @@ -357,12 +340,12 @@ after_frag( void * _ctx, ushort dst_port = fd_disco_netmux_sig_port( *opt_sig ); - if( FD_LIKELY( dst_port == ctx->quic->config.net.listen_udp_port ) ) { + if( FD_LIKELY( dst_port==ctx->quic->config.net.listen_udp_port ) ) { fd_aio_pkt_info_t pkt = { .buf = ctx->buffer, .buf_sz = (ushort)*opt_sz }; fd_aio_send( ctx->quic_rx_aio, &pkt, 1, NULL, 1 ); - } else if( FD_LIKELY( dst_port == ctx->legacy_transaction_port ) ) { + } else if( FD_LIKELY( dst_port==ctx->legacy_transaction_port ) ) { ulong network_hdr_sz = fd_disco_netmux_sig_hdr_sz( *opt_sig ); - if( FD_UNLIKELY( *opt_sz < network_hdr_sz ) ) { + if( FD_UNLIKELY( *opt_sz +#include +#include +#include +#include +#include +#include +#include + +#if defined(__i386__) +# define ARCH_NR AUDIT_ARCH_I386 +#elif defined(__x86_64__) +# define ARCH_NR AUDIT_ARCH_X86_64 +#elif defined(__aarch64__) +# define ARCH_NR AUDIT_ARCH_AARCH64 +#else +# error "Target architecture is unsupported by seccomp." +#endif +static const unsigned int sock_filter_policy_metric_instr_cnt = 20; + +static void populate_sock_filter_policy_metric( ulong out_cnt, struct sock_filter * out, unsigned int logfile_fd, unsigned int socket_fd) { + FD_TEST( out_cnt >= 20 ); + struct sock_filter filter[20] = { + /* Check: Jump to RET_KILL_PROCESS if the script's arch != the runtime arch */ + BPF_STMT( BPF_LD | BPF_W | BPF_ABS, ( offsetof( struct seccomp_data, arch ) ) ), + BPF_JUMP( BPF_JMP | BPF_JEQ | BPF_K, ARCH_NR, 0, /* RET_KILL_PROCESS */ 16 ), + /* loading syscall number in accumulator */ + BPF_STMT( BPF_LD | BPF_W | BPF_ABS, ( offsetof( struct seccomp_data, nr ) ) ), + /* allow fsync based on expression */ + BPF_JUMP( BPF_JMP | BPF_JEQ | BPF_K, __NR_fsync, /* check_fsync */ 6, 0 ), + /* allow accept based on expression */ + BPF_JUMP( BPF_JMP | BPF_JEQ | BPF_K, __NR_accept, /* check_accept */ 7, 0 ), + /* simply allow read */ + BPF_JUMP( BPF_JMP | BPF_JEQ | BPF_K, __NR_read, /* RET_ALLOW */ 13, 0 ), + /* simply allow write */ + BPF_JUMP( BPF_JMP | BPF_JEQ | BPF_K, __NR_write, /* RET_ALLOW */ 12, 0 ), + /* simply allow close */ + BPF_JUMP( BPF_JMP | BPF_JEQ | BPF_K, __NR_close, /* RET_ALLOW */ 11, 0 ), + /* simply allow poll */ + BPF_JUMP( BPF_JMP | BPF_JEQ | BPF_K, __NR_poll, /* RET_ALLOW */ 10, 0 ), + /* none of the syscalls matched */ + { BPF_JMP | BPF_JA, 0, 0, /* RET_KILL_PROCESS */ 8 }, +// check_fsync: + /* load syscall argument 0 in accumulator */ + BPF_STMT( BPF_LD | BPF_W | BPF_ABS, offsetof(struct seccomp_data, args[0])), + BPF_JUMP( BPF_JMP | BPF_JEQ | BPF_K, logfile_fd, /* RET_ALLOW */ 7, /* RET_KILL_PROCESS */ 6 ), +// check_accept: + /* load syscall argument 0 in accumulator */ + BPF_STMT( BPF_LD | BPF_W | BPF_ABS, offsetof(struct seccomp_data, args[0])), + BPF_JUMP( BPF_JMP | BPF_JEQ | BPF_K, socket_fd, /* lbl_1 */ 0, /* RET_KILL_PROCESS */ 4 ), +// lbl_1: + /* load syscall argument 1 in accumulator */ + BPF_STMT( BPF_LD | BPF_W | BPF_ABS, offsetof(struct seccomp_data, args[1])), + BPF_JUMP( BPF_JMP | BPF_JEQ | BPF_K, 0, /* lbl_2 */ 0, /* RET_KILL_PROCESS */ 2 ), +// lbl_2: + /* load syscall argument 2 in accumulator */ + BPF_STMT( BPF_LD | BPF_W | BPF_ABS, offsetof(struct seccomp_data, args[2])), + BPF_JUMP( BPF_JMP | BPF_JEQ | BPF_K, 0, /* RET_ALLOW */ 1, /* RET_KILL_PROCESS */ 0 ), +// RET_KILL_PROCESS: + /* KILL_PROCESS is placed before ALLOW since it's the fallthrough case. */ + BPF_STMT( BPF_RET | BPF_K, SECCOMP_RET_KILL_PROCESS ), +// RET_ALLOW: + /* ALLOW has to be reached by jumping */ + BPF_STMT( BPF_RET | BPF_K, SECCOMP_RET_ALLOW ), + }; + fd_memcpy( out, filter, sizeof( filter ) ); +} + +#endif diff --git a/src/app/fdctl/run/tiles/metric.seccomppolicy b/src/app/fdctl/run/tiles/metric.seccomppolicy new file mode 100644 index 0000000000..07269a7b34 --- /dev/null +++ b/src/app/fdctl/run/tiles/metric.seccomppolicy @@ -0,0 +1,32 @@ +# logfile_fd: It can be disabled by configuration, but typically tiles +# will open a log file on boot and write all messages there. +# +# socket_fd: The metric tile servers a prometheus string HTTP endpoint, +# which is over TCP and does not use our XDP program. It +# uses regular kernel sockets, so this is the socket file +# descriptor. +unsigned int logfile_fd, unsigned int socket_fd + +# logging: 'WARNING' and above fsync the logfile to disk immediately +# +# arg 0 is the file descriptor to fsync. +fsync: (eq (arg 0) logfile_fd) + +# server: serving metric values over HTTP requires accepting connections +# +# arg 0 is the listen socket file descriptor to accept connections on +accept: (and (eq (arg 0) socket_fd) + (eq (arg 1) 0) + (eq (arg 2) 0)) + +# server: serving metric values over HTTP requires reading from conns +read + +# server: serving metric values over HTTP requires writing to conns +write + +# server: serving metric values over HTTP requires closing conns +close + +# server: serving metric values over HTTP requires polling conns +poll diff --git a/src/app/fdctl/run/tiles/tiles.c b/src/app/fdctl/run/tiles/tiles.c index 608897dc28..5087aa9b34 100644 --- a/src/app/fdctl/run/tiles/tiles.c +++ b/src/app/fdctl/run/tiles/tiles.c @@ -15,6 +15,7 @@ fd_topo_tile_to_config( fd_topo_tile_t * tile ) { case FD_TOPO_TILE_KIND_BANK: return &fd_tile_bank; case FD_TOPO_TILE_KIND_SHRED: return &fd_tile_shred; case FD_TOPO_TILE_KIND_STORE: return &fd_tile_store; + case FD_TOPO_TILE_KIND_METRIC: return &fd_tile_metric; default: FD_LOG_ERR(( "unknown tile kind %lu", tile->kind )); } } diff --git a/src/app/fdctl/run/tiles/tiles.h b/src/app/fdctl/run/tiles/tiles.h index 87f83e623d..7edf0e1350 100644 --- a/src/app/fdctl/run/tiles/tiles.h +++ b/src/app/fdctl/run/tiles/tiles.h @@ -10,6 +10,7 @@ typedef struct { ulong mux_flags; ulong burst; + ulong rlimit_file_cnt; void * (*mux_ctx )( void * scratch ); fd_mux_during_housekeeping_fn * mux_during_housekeeping; @@ -41,6 +42,7 @@ extern fd_tile_config_t fd_tile_pack; extern fd_tile_config_t fd_tile_bank; extern fd_tile_config_t fd_tile_shred; extern fd_tile_config_t fd_tile_store; +extern fd_tile_config_t fd_tile_metric; void * fd_wksp_pod_map1( uchar const * pod, diff --git a/src/app/fdctl/topology.c b/src/app/fdctl/topology.c index 4129e8e349..4caafa83c7 100644 --- a/src/app/fdctl/topology.c +++ b/src/app/fdctl/topology.c @@ -261,10 +261,16 @@ fd_topo_workspace_fill( fd_topo_t * topo, } if( FD_UNLIKELY( wksp->kind==FD_TOPO_WKSP_KIND_METRIC_IN ) ) { - ulong * metrics = SCRATCH_ALLOC( FD_METRICS_ALIGN, FD_METRICS_FOOTPRINT() ); + ulong out_reliable_consumer_cnt = 0UL; + if( FD_LIKELY( tile->out_link_id_primary!=ULONG_MAX ) ) { + fd_topo_link_t * link = &topo->links[ tile->out_link_id_primary ]; + out_reliable_consumer_cnt = fd_topo_link_reliable_consumer_cnt( topo, link ); + } + + ulong * metrics = SCRATCH_ALLOC( FD_METRICS_ALIGN, FD_METRICS_FOOTPRINT(tile->in_cnt, out_reliable_consumer_cnt) ); if( FD_LIKELY( mode==FD_TOPO_FILL_MODE_NEW ) ) { snprintf1( path, sizeof(path), "metrics_%s_%lu", fd_topo_tile_kind_str( tile->kind ), tile->kind_id ); - INSERT_POD( path, fd_metrics_new( metrics ) ); + INSERT_POD( path, fd_metrics_new( metrics, tile->in_cnt, out_reliable_consumer_cnt ) ); } else if( FD_LIKELY( mode==FD_TOPO_FILL_MODE_JOIN ) ) { tile->metrics = fd_metrics_join( metrics ); if( FD_UNLIKELY( !tile->metrics ) ) FD_LOG_ERR(( "fd_metrics_join failed" )); diff --git a/src/app/fdctl/topology.h b/src/app/fdctl/topology.h index 53becd8c87..94333ebe0c 100644 --- a/src/app/fdctl/topology.h +++ b/src/app/fdctl/topology.h @@ -225,6 +225,10 @@ typedef struct { ushort shred_listen_port; ulong expected_shred_version; } shred; + + struct { + ushort prometheus_listen_port; + } metric; } fd_topo_tile_t; /* An fd_topo_t represents the overall structure of a Firedancer @@ -419,6 +423,22 @@ fd_topo_link_consumer_cnt( fd_topo_t * topo, return cnt; } +/* Given a link, count the number of reliable consumers of that link + among all the tiles in the topology. */ +FD_FN_PURE static inline ulong +fd_topo_link_reliable_consumer_cnt( fd_topo_t * topo, + fd_topo_link_t * link ) { + ulong cnt = 0; + for( ulong i=0; itile_cnt; i++ ) { + fd_topo_tile_t * tile = &topo->tiles[ i ]; + for( ulong j=0; jin_cnt; j++ ) { + if( FD_UNLIKELY( tile->in_link_id[ j ] == link->id && tile->in_link_reliable[ j ] ) ) cnt++; + } + } + + return cnt; +} + /* Join (map into the process) all shared memory (huge/gigantic pages) needed by the tile, in the given topology. All memory associated with the tile (aka. used by links that the tile either produces to or diff --git a/src/disco/metrics/fd_metrics.c b/src/disco/metrics/fd_metrics.c index 1f9d701fb9..822989a4a5 100644 --- a/src/disco/metrics/fd_metrics.c +++ b/src/disco/metrics/fd_metrics.c @@ -1,3 +1,4 @@ #include "fd_metrics.h" -ulong * fd_metrics_tl; +FD_TL ulong * fd_metrics_base_tl; +FD_TL ulong * fd_metrics_tl; diff --git a/src/disco/metrics/fd_metrics.h b/src/disco/metrics/fd_metrics.h index cd4ca257ef..4581c58227 100644 --- a/src/disco/metrics/fd_metrics.h +++ b/src/disco/metrics/fd_metrics.h @@ -1,57 +1,172 @@ #ifndef HEADER_fd_src_disco_metrics_fd_metrics_h #define HEADER_fd_src_disco_metrics_fd_metrics_h -#include "../../util/fd_util.h" -#include "generated/fd_metrics_all.h" +#include "fd_metrics_base.h" -extern ulong * fd_metrics_tl; +#include "generated/fd_metrics_all.h" +#include "generated/fd_metrics_quic.h" + +#include "../../tango/tempo/fd_tempo.h" + +/* fd_metrics mostly defines way of laying out metrics in shared + memory so that a producer and consumer can agree on where they + are, and can read and wite them quickly and with little to no + boilerplate. + + At initialization time, a thread can call fd_metrics_register + which saves a thread local base pointer. Then, macros are provided + which given a macro "name", maps it to an offset from that base + pointer and does a write of the corresponding ulong. + + For low-frequency metrics like incrementing rarely hit error + counters, it is OK to use the macros inline. For high frequency + metrics in core loops, it may be preferable to accumulate local + metric values in the tile and drain them to the metrics shared + memory periodically, eg, via. a housekeeping step. + + The metrics area is minimal and contains no metadata itself. For + example, histograms in the metrics shared memory are just the bucket + values, and there is no metadata about the edges. The consumer will + determine the edges by looking at the statically compiled metadata. + + This is to reduce cache traffic and keep the metrics area small, so + it can be copied to produce a snapshot quickly. When updating + metrics, the producer should do atomic writes so that these snapshots + will see consistent values. In particular, the producer should not + do a memcpy into the metrics region. */ + +/* The metrics region is laid out like + + [ in_link_N ulong ] + [ out_link_N ulong] + [ in_link_0_metrics ... in_link_N_metrics ] + [ out_link_0_metrics ... out_link_N_metrics ] + [ tile_metrics ] + + where every value is a ulong. Tile metrics come after link metrics, + so this base pointer points at the very start of the layout. You + shouldn't need to use this directly, instead it's used by the mux + tile when it's computing the metrics for specific links. */ +extern FD_TL ulong * fd_metrics_base_tl; + +/* All metrics in the application are ulongs, and are laid out + sequentially, so this thread local is a pointer to the first tile + specific metric in the layout, or the "tile_metrics" start as defined + above. All tile metrics are defined as an offset from this metrics + pointer. You shouldn't need to use this directly, instead it is used + by the macros below like FD_MCNT_SET etc. The thread local should be + set by calling fd_metrics_register. */ +extern FD_TL ulong * fd_metrics_tl; #define FD_METRICS_ALIGN (128UL) -#define FD_METRICS_FOOTPRINT() \ - FD_LAYOUT_FINI( FD_LAYOUT_APPEND( FD_LAYOUT_INIT, \ - 128UL, FD_METRICS_TOTAL_SZ ), \ +#define FD_METRICS_FOOTPRINT(in_link_cnt, out_link_reliable_consumer_cnt) \ + FD_LAYOUT_FINI( FD_LAYOUT_APPEND( FD_LAYOUT_APPEND( FD_LAYOUT_APPEND ( FD_LAYOUT_APPEND ( FD_LAYOUT_INIT, \ + 8UL, 16UL ), \ + 8UL, (in_link_cnt)*FD_METRICS_ALL_LINK_IN_TOTAL*sizeof(ulong) ), \ + 8UL, (out_link_reliable_consumer_cnt)*FD_METRICS_ALL_LINK_OUT_TOTAL*sizeof(ulong) ), \ + 8UL, FD_METRICS_TOTAL_SZ ), \ FD_METRICS_ALIGN ) -#define F( group, measurement ) (FD_METRICS_GROUP_##group##_OFF + FD_METRICS_##group##_##measurement##_OFF) +/* The following macros are convenience helpers for updating tile metric + values in shared memory, and can be used like + + FD_MGAUGE_SET( QUIC, CONNECTIONS_CREATED_COUNT, conn_cnt ); + + This compiles to a single write to an offset of the metrics pointer + above. */ -#define FD_MGAUGE_SET( group, measurement, value ) do { \ - fd_metrics_tl[ F(group, measurement) ] = (value); \ +#define FD_MGAUGE_SET( group, measurement, value ) do { \ + fd_metrics_tl[ MIDX(GAUGE, group, measurement) ] = (value); \ } while(0) -#define FD_MGAUGE_GET( group, measurement ) (fd_metrics_tl[ F(group, measurement) ]) -#define FD_MCNT_GET( group, measurement ) (fd_metrics_tl[ F(group, measurement) ]) +#define FD_MGAUGE_GET( group, measurement ) (fd_metrics_tl[ MIDX(GAUGE, group, measurement) ]) + +#define FD_MCNT_GET( group, measurement ) (fd_metrics_tl[ MIDX(COUNTER, group, measurement) ]) + +#define FD_MCNT_SET( group, measurement, value ) do { \ + fd_metrics_tl[ MIDX(COUNTER, group, measurement) ] = (value); \ + } while(0) -#define FD_MCNT_INC( group, measurement, value ) do { \ - fd_metrics_tl[ F(group, measurement) ] += (value); \ +#define FD_MCNT_INC( group, measurement, value ) do { \ + fd_metrics_tl[ MIDX(COUNTER, group, measurement) ] += (value); \ } while(0) -#define FD_MHIST_COPY( group, measurement, hist ) do { \ - ulong __fd_metrics_off = F(group, measurement); \ - for( ulong i=0; icounts[ i ]; \ - } \ - fd_metrics_tl[ __fd_metrics_off + FD_HISTF_BUCKET_CNT ] = \ - hist->sum; \ +#define FD_MHIST_MIN( group, measurement ) (FD_METRICS_HISTOGRAM_##group##_##measurement##_MIN) +#define FD_MHIST_SECONDS_MIN( group, measurement ) (fd_metrics_convert_seconds_to_ticks(FD_METRICS_HISTOGRAM_##group##_##measurement##_MIN)) +#define FD_MHIST_MAX( group, measurement ) (FD_METRICS_HISTOGRAM_##group##_##measurement##_MIN) +#define FD_MHIST_SECONDS_MAX( group, measurement ) (fd_metrics_convert_seconds_to_ticks(FD_METRICS_HISTOGRAM_##group##_##measurement##_MAX)) + +#define FD_MHIST_COPY( group, measurement, hist ) do { \ + ulong __fd_metrics_off = MIDX(HISTOGRAM, group, measurement); \ + for( ulong i=0; icounts[ i ]; \ + } \ + fd_metrics_tl[ __fd_metrics_off + FD_HISTF_BUCKET_CNT ] = hist->sum; \ } while(0) -#define FD_MHIST_SUM( group, measurement ) (fd_metrics_tl[ F(group, measurement) + FD_HISTF_BUCKET_CNT ]) +#define FD_MHIST_SUM( group, measurement ) (fd_metrics_tl[ MIDX(HISTOGRAM, group, measurement) + FD_HISTF_BUCKET_CNT ]) FD_PROTOTYPES_BEGIN +/* fd_metrics_tile returns a pointer to the tile-specific metrics area + for the given metrics object. */ +static inline ulong * +fd_metrics_tile( ulong * metrics ) { return metrics + 2UL + FD_METRICS_ALL_LINK_IN_TOTAL*metrics[ 0 ] + FD_METRICS_ALL_LINK_OUT_TOTAL*metrics[ 1 ]; } + +/* fd_metrics_link_in returns a pointer the in-link metrics area for the + given in link index of this metrics object. */ +static inline ulong * +fd_metrics_link_in( ulong * metrics, ulong in_idx ) { return metrics + 2UL + FD_METRICS_ALL_LINK_IN_TOTAL*in_idx; } + +/* fd_metrics_link_in returns a pointer the in-link metrics area for the + given out link index of this metrics object. */ +static inline ulong * +fd_metrics_link_out( ulong * metrics, ulong out_idx ) { return metrics + 2UL + FD_METRICS_ALL_LINK_IN_TOTAL*metrics[0] + FD_METRICS_ALL_LINK_OUT_TOTAL*out_idx; } + +/* fd_metrics_new formats an unused memory region for use as a metrics. + Assumes shmem is a non-NULL pointer to this region in the local + address space with the required footprint and alignment. All of the + mtrics will be initialized to zero. Returns shmem (and the memory + region it points to will be formatted as a metrics, caller is not + joined). */ + static inline void * -fd_metrics_new( void * mem ) { - fd_memset( mem, 0, FD_METRICS_FOOTPRINT() ); - return mem; +fd_metrics_new( void * shmem, + ulong in_link_cnt, + ulong out_link_consumer_cnt ) { + fd_memset( shmem, 0, FD_METRICS_FOOTPRINT(in_link_cnt, out_link_consumer_cnt) ); + ulong * metrics = shmem; + metrics[0] = in_link_cnt; + metrics[1] = out_link_consumer_cnt; + return shmem; } +/* fd_metrics_register sets the thread local values used by the macros + like FD_MCNT_SET to point to the provided metrics object. */ static inline ulong * fd_metrics_register( ulong * metrics ) { if( FD_UNLIKELY( !metrics ) ) FD_LOG_ERR(( "NULL metrics" )); - fd_metrics_tl = metrics; + fd_metrics_base_tl = metrics; + fd_metrics_tl = fd_metrics_tile( metrics ); return metrics; } +static inline ulong +fd_metrics_convert_seconds_to_ticks( double seconds ) { + /* The tick_per_ns() value needs to be the same across the tile doing + the sampling and the tile doing the reporting so that they compute + the same bucket edges for histograms. */ + double tick_per_ns = fd_tempo_tick_per_ns( NULL ); + return (ulong)(seconds * tick_per_ns * 1e9); +} + +static inline double +fd_metrics_convert_ticks_to_seconds( ulong ticks ) { + double tick_per_ns = fd_tempo_tick_per_ns( NULL ); + return (double)ticks / (tick_per_ns * 1e9); +} + static inline ulong * fd_metrics_join ( void * mem ) { return mem; } static inline void * fd_metrics_leave ( void * mem ) { return (void *)mem; } static inline void * fd_metrics_delete( void * mem ) { return (void *)mem; } diff --git a/src/disco/metrics/fd_metrics_base.h b/src/disco/metrics/fd_metrics_base.h new file mode 100644 index 0000000000..123ee1fac8 --- /dev/null +++ b/src/disco/metrics/fd_metrics_base.h @@ -0,0 +1,96 @@ +#ifndef HEADER_fd_src_disco_metrics_fd_metrics_base_h +#define HEADER_fd_src_disco_metrics_fd_metrics_base_h + +#include "../../util/fd_util.h" + +#define FD_METRICS_TYPE_GAUGE (0UL) +#define FD_METRICS_TYPE_COUNTER (1UL) +#define FD_METRICS_TYPE_HISTOGRAM (2UL) + +#define FD_METRICS_CONVERTER_NONE (0UL) +#define FD_METRICS_CONVERTER_SECONDS (1UL) + +#define MIDX( type, group, measurement ) (FD_METRICS_##type##_##group##_##measurement##_OFF) + +#define DECLARE_METRIC_GAUGE( GROUP, MEASUREMENT ) { \ + .name = FD_METRICS_GAUGE_##GROUP##_##MEASUREMENT##_NAME, \ + .type = FD_METRICS_TYPE_GAUGE, \ + .desc = FD_METRICS_GAUGE_##GROUP##_##MEASUREMENT##_DESC, \ + .offset = FD_METRICS_GAUGE_##GROUP##_##MEASUREMENT##_OFF, \ + } + +#define DECLARE_METRIC_COUNTER( GROUP, MEASUREMENT ) { \ + .name = FD_METRICS_COUNTER_##GROUP##_##MEASUREMENT##_NAME, \ + .type = FD_METRICS_TYPE_COUNTER, \ + .desc = FD_METRICS_COUNTER_##GROUP##_##MEASUREMENT##_DESC, \ + .offset = FD_METRICS_COUNTER_##GROUP##_##MEASUREMENT##_OFF, \ + } + +#define DECLARE_METRIC_HISTOGRAM_NONE( GROUP, MEASUREMENT ) { \ + .name = FD_METRICS_HISTOGRAM_##GROUP##_##MEASUREMENT##_NAME, \ + .type = FD_METRICS_TYPE_HISTOGRAM, \ + .desc = FD_METRICS_HISTOGRAM_##GROUP##_##MEASUREMENT##_DESC, \ + .offset = FD_METRICS_HISTOGRAM_##GROUP##_##MEASUREMENT##_OFF, \ + .histogram = { \ + .converter = FD_METRICS_CONVERTER_NONE, \ + .none = { \ + .min = FD_METRICS_HISTOGRAM_##GROUP##_##MEASUREMENT##_MIN, \ + .max = FD_METRICS_HISTOGRAM_##GROUP##_##MEASUREMENT##_MAX, \ + }, \ + }, \ + } + +#define DECLARE_METRIC_HISTOGRAM_SECONDS( GROUP, MEASUREMENT ) { \ + .name = FD_METRICS_HISTOGRAM_##GROUP##_##MEASUREMENT##_NAME, \ + .type = FD_METRICS_TYPE_HISTOGRAM, \ + .desc = FD_METRICS_HISTOGRAM_##GROUP##_##MEASUREMENT##_DESC, \ + .offset = FD_METRICS_HISTOGRAM_##GROUP##_##MEASUREMENT##_OFF, \ + .histogram = { \ + .converter = FD_METRICS_CONVERTER_SECONDS, \ + .seconds = { \ + .min = FD_METRICS_HISTOGRAM_##GROUP##_##MEASUREMENT##_MIN, \ + .max = FD_METRICS_HISTOGRAM_##GROUP##_##MEASUREMENT##_MAX, \ + }, \ + }, \ + } + +typedef struct { + char const * name; + int type; + char const * desc; + ulong offset; + + union { + struct { + int converter; + + union { + struct { + ulong min; + ulong max; + } none; + + struct { + double min; + double max; + } seconds; + }; + } histogram; + }; +} fd_metrics_meta_t; + +FD_PROTOTYPES_BEGIN + +FD_FN_PURE static inline char * +fd_metrics_meta_type_str( fd_metrics_meta_t const * metric ) { + switch( metric->type ) { + case FD_METRICS_TYPE_GAUGE: return "gauge"; + case FD_METRICS_TYPE_COUNTER: return "counter"; + case FD_METRICS_TYPE_HISTOGRAM: return "histogram"; + default: return "unknown"; + } +} + +FD_PROTOTYPES_END + +#endif /* HEADER_fd_src_disco_metrics_fd_metrics_base_h */ diff --git a/src/disco/metrics/gen_metrics.py b/src/disco/metrics/gen_metrics.py index 0b2202a14d..0d17885821 100644 --- a/src/disco/metrics/gen_metrics.py +++ b/src/disco/metrics/gen_metrics.py @@ -3,36 +3,75 @@ import re import os +# define metric object as python type +class Metric: + def __init__(self, _type, link, linkside, group_id, group_name, tile, name, summary, min, max, converter): + self.type = _type + self.link = link + self.linkside = linkside + self.group_name = group_name.upper() + self.group_id = group_id + self.tile = tile + self.name = re.sub(r'(? + + + + + + + + + + + + + - + - - + + - + + - The time it took to turn one core loop of the tile. The loop types - are mutually exclusive and exhaustive, so the sum of time across all - of them should give the total running time of the total, less some - measurement error. + Duration of one iteration of the run loop which did + housekeeping. The various loop durations are mutually + exclusive and exhaustive, so the sum of time across all of + them is roughly the total running time of the tile. Loop + durations are per iteration of the run loop and + non-blocking, so for example each 'caught up' sample does + not represent the time we waited for new input data, but + rather how long each iteration of the spin loop waiting for + the data took. - - - - - - - - - - - - - - - The size of the fragments that are being processed by the tile. - - - - - + + + Duration of one iteration of the run loop which terminated because we were backpressured by a consumer. + + + Duration of one iteration of the run loop which terminated because there was no new data to process. + + + Duration of one iteration of the run loop which terminated because we were overrun while polling. + + + Duration of one iteration of the run loop which terminated because we were overrun while reading. + + + Duration of one iteration of the run loop which terminated because we filtered the fragment before reading it. + + + Duration of one iteration of the run loop which terminated because we filtered the fragment after reading it. + + + Duration of one iteration of the run loop which received, did not filter, and processed the fragment. + + + + Size of each fragment that was filtered and not processed by the tile. + + + Size of each fragment that was processed (not filtered) by the tile. - - + + diff --git a/src/disco/mux/fd_mux.c b/src/disco/mux/fd_mux.c index ef6b5a87a2..e7bed5e19e 100644 --- a/src/disco/mux/fd_mux.c +++ b/src/disco/mux/fd_mux.c @@ -52,13 +52,14 @@ fd_mux_tile_in_update( fd_mux_tile_in_t * in, ulong seq = fd_seq_dec( in->seq, exposed_cnt ); if( FD_LIKELY( fd_seq_gt( seq, fd_fseq_query( in_fseq ) ) ) ) fd_fseq_update( in_fseq, seq ); - ulong * diag = (ulong *)fd_fseq_app_laddr( in_fseq ); + ulong * metrics = fd_metrics_link_in( fd_metrics_base_tl, in->idx ); + uint * accum = in->accum; ulong a0 = (ulong)accum[0]; ulong a1 = (ulong)accum[1]; ulong a2 = (ulong)accum[2]; ulong a3 = (ulong)accum[3]; ulong a4 = (ulong)accum[4]; ulong a5 = (ulong)accum[5]; FD_COMPILER_MFENCE(); - diag[0] += a0; diag[1] += a1; diag[2] += a2; - diag[3] += a3; diag[4] += a4; diag[5] += a5; + metrics[0] += a0; metrics[1] += a1; metrics[2] += a2; + metrics[3] += a3; metrics[4] += a4; metrics[5] += a5; FD_COMPILER_MFENCE(); accum[0] = 0U; accum[1] = 0U; accum[2] = 0U; accum[3] = 0U; accum[4] = 0U; accum[5] = 0U; @@ -109,6 +110,7 @@ fd_mux_tile( fd_cnc_t * cnc, /* out frag stream state */ ulong depth; /* ==fd_mcache_depth( mcache ), depth of the mcache / positive integer power of 2 */ + ulong _sync; /* local sync for mcache if mcache is NULL */ ulong * sync; /* ==fd_mcache_seq_laddr( mcache ), local addr where mux mcache sync info is published */ ulong seq; /* next mux frag sequence number to publish */ @@ -204,12 +206,17 @@ fd_mux_tile( fd_cnc_t * cnc, /* out frag stream init */ - if( FD_UNLIKELY( !mcache ) ) { FD_LOG_WARNING(( "NULL mcache" )); return 1; } - - depth = fd_mcache_depth ( mcache ); - sync = fd_mcache_seq_laddr( mcache ); + if( FD_LIKELY( mcache ) ) { + depth = fd_mcache_depth ( mcache ); + sync = fd_mcache_seq_laddr( mcache ); - seq = fd_mcache_seq_query( sync ); /* FIXME: ALLOW OPTION FOR MANUAL SPECIFICATION */ + seq = fd_mcache_seq_query( sync ); /* FIXME: ALLOW OPTION FOR MANUAL SPECIFICATION */ + } else { + depth = 128UL; + _sync = 0UL; + sync = &_sync; + seq = 0UL; + } /* out flow control init */ @@ -328,7 +335,7 @@ fd_mux_tile( fd_cnc_t * cnc, for( ulong out_idx=0UL; out_idxmetrics_write ) ) callbacks->metrics_write( ctx ); FD_COMPILER_MFENCE(); metric_backp_cnt = 0UL; @@ -465,7 +469,7 @@ fd_mux_tile( fd_cnc_t * cnc, /* See notes above about use of quasi-atomic diagnostic accum */ if( FD_LIKELY( slowest_out!=ULONG_MAX ) ) { FD_COMPILER_MFENCE(); - out_slow[ slowest_out ]++; + (*out_slow[ slowest_out ])++; FD_COMPILER_MFENCE(); } @@ -568,7 +572,7 @@ fd_mux_tile( fd_cnc_t * cnc, if( FD_UNLIKELY( diff<0L ) ) { /* Overrun (impossible if in is honoring our flow control) */ this_in->seq = seq_found; /* Resume from here (probably reasonably current, could query in mcache sync directly instead) */ hist = hist_ovrnp_ticks; - this_in->accum[ FD_FSEQ_DIAG_OVRNP_CNT ]++; + this_in->accum[ FD_METRICS_COUNTER_LINK_OVERRUN_POLLING_COUNT_OFF ]++; } /* Don't bother with spin as polling multiple locations */ long next = fd_tickcount(); @@ -615,7 +619,7 @@ fd_mux_tile( fd_cnc_t * cnc, if( FD_UNLIKELY( fd_seq_ne( seq_test, seq_found ) ) ) { /* Overrun while reading (impossible if this_in honoring our fctl) */ this_in->seq = seq_test; /* Resume from here (probably reasonably current, could query in mcache sync instead) */ - this_in->accum[ FD_FSEQ_DIAG_OVRNR_CNT ]++; + this_in->accum[ FD_METRICS_COUNTER_LINK_OVERRUN_READING_COUNT_OFF ]++; /* Don't bother with spin as polling multiple locations */ long next = fd_tickcount(); fd_histf_sample( hist_ovrnr_ticks, (ulong)(next - now) ); @@ -623,11 +627,12 @@ fd_mux_tile( fd_cnc_t * cnc, continue; } + ulong out_sz = sz; if( FD_LIKELY( !filter ) ) { /* We have successfully loaded the metadata. Decide whether it is interesting downstream and publish or filter accordingly. */ - if( FD_LIKELY( callbacks->after_frag ) ) callbacks->after_frag( ctx, (ulong)this_in->idx, &sig, &chunk, &sz, &filter, &mux ); + if( FD_LIKELY( callbacks->after_frag ) ) callbacks->after_frag( ctx, (ulong)this_in->idx, &sig, &chunk, &out_sz, &filter, &mux ); } long next = fd_tickcount(); @@ -645,7 +650,7 @@ fd_mux_tile( fd_cnc_t * cnc, if( FD_UNLIKELY( !(flags & FD_MUX_FLAG_COPY) ) ) cr_filt += (ulong)(cr_availseq = this_in_seq; this_in->mline = this_in->mcache + fd_mcache_line_idx( this_in_seq, this_in->depth ); - ulong diag_idx = FD_FSEQ_DIAG_PUB_CNT + 2UL*(ulong)filter; + ulong diag_idx = FD_METRICS_COUNTER_LINK_PUBLISHED_COUNT_OFF + 2UL*(ulong)filter; this_in->accum[ diag_idx ]++; this_in->accum[ diag_idx+1UL ] += (uint)sz; fd_histf_t * hist_ticks = fd_ptr_if( filter, (fd_histf_t*)hist_filter2_ticks, (fd_histf_t*)hist_fin_ticks ); fd_histf_t * hist_sz = fd_ptr_if( filter, (fd_histf_t*)hist_filter2_frag_sz, (fd_histf_t*)hist_fin_frag_sz ); fd_histf_sample( hist_ticks, (ulong)(next - now) ); - fd_histf_sample( hist_sz, (ulong)sz ); + fd_histf_sample( hist_sz, sz ); now = next; } diff --git a/src/disco/mux/test_mux.c b/src/disco/mux/test_mux.c index bf7c977275..47e24cd831 100644 --- a/src/disco/mux/test_mux.c +++ b/src/disco/mux/test_mux.c @@ -11,6 +11,17 @@ FD_STATIC_ASSERT( FD_MUX_TILE_OUT_MAX==8192UL, unit_test ); FD_STATIC_ASSERT( FD_MUX_TILE_SCRATCH_ALIGN==128UL, unit_test ); +#define FD_CNC_DIAG_IN_BACKP (0UL) +#define FD_CNC_DIAG_BACKP_CNT (1UL) + +#define FD_FSEQ_DIAG_PUB_CNT (0UL) +#define FD_FSEQ_DIAG_PUB_SZ (1UL) +#define FD_FSEQ_DIAG_FILT_CNT (2UL) +#define FD_FSEQ_DIAG_FILT_SZ (3UL) +#define FD_FSEQ_DIAG_OVRNP_CNT (4UL) +#define FD_FSEQ_DIAG_OVRNR_CNT (5UL) +#define FD_FSEQ_DIAG_SLOW_CNT (6UL) + struct test_cfg { fd_wksp_t * wksp; diff --git a/src/disco/quic/test_quic_tile.c b/src/disco/quic/test_quic_tile.c index 517fad4338..a836d50105 100644 --- a/src/disco/quic/test_quic_tile.c +++ b/src/disco/quic/test_quic_tile.c @@ -8,9 +8,6 @@ #include "../../util/net/fd_ip4.h" #include "../../ballet/base58/fd_base58.h" -FD_STATIC_ASSERT( FD_QUIC_CNC_DIAG_TPU_CONN_LIVE_CNT==6UL, uint_test ); -FD_STATIC_ASSERT( FD_QUIC_CNC_DIAG_TPU_CONN_SEQ ==7UL, unit_test ); - FD_STATIC_ASSERT( FD_QUIC_TILE_SCRATCH_ALIGN==128UL, unit_test ); struct test_cfg { diff --git a/src/disco/replay/fd_replay.c b/src/disco/replay/fd_replay.c index 5157bb51ea..97d245afa1 100644 --- a/src/disco/replay/fd_replay.c +++ b/src/disco/replay/fd_replay.c @@ -6,6 +6,17 @@ FD_STATIC_ASSERT( FD_FCTL_ALIGN<=FD_REPLAY_TILE_SCRATCH_ALIGN, packing ); +#define FD_CNC_DIAG_IN_BACKP (0UL) +#define FD_CNC_DIAG_BACKP_CNT (1UL) + +#define FD_FSEQ_DIAG_PUB_CNT (0UL) +#define FD_FSEQ_DIAG_PUB_SZ (1UL) +#define FD_FSEQ_DIAG_FILT_CNT (2UL) +#define FD_FSEQ_DIAG_FILT_SZ (3UL) +#define FD_FSEQ_DIAG_OVRNP_CNT (4UL) +#define FD_FSEQ_DIAG_OVRNR_CNT (5UL) +#define FD_FSEQ_DIAG_SLOW_CNT (6UL) + ulong fd_replay_tile_scratch_align( void ) { return FD_REPLAY_TILE_SCRATCH_ALIGN; diff --git a/src/tango/bench_frag_tx.c b/src/tango/bench_frag_tx.c index 853bbd17fb..603256f7e1 100644 --- a/src/tango/bench_frag_tx.c +++ b/src/tango/bench_frag_tx.c @@ -13,6 +13,17 @@ FD_STATIC_ASSERT( FD_CHUNK_SZ==64UL, unit_test ); static uchar fctl_mem[ FD_FCTL_FOOTPRINT( RX_MAX ) ] __attribute__((aligned(FD_FCTL_ALIGN))); static char * _fseq[ RX_MAX ]; +#define FD_CNC_DIAG_IN_BACKP (0UL) +#define FD_CNC_DIAG_BACKP_CNT (1UL) + +#define FD_FSEQ_DIAG_PUB_CNT (0UL) +#define FD_FSEQ_DIAG_PUB_SZ (1UL) +#define FD_FSEQ_DIAG_FILT_CNT (2UL) +#define FD_FSEQ_DIAG_FILT_SZ (3UL) +#define FD_FSEQ_DIAG_OVRNP_CNT (4UL) +#define FD_FSEQ_DIAG_OVRNR_CNT (5UL) +#define FD_FSEQ_DIAG_SLOW_CNT (6UL) + int main( int argc, char ** argv ) { diff --git a/src/tango/cnc/fd_cnc.h b/src/tango/cnc/fd_cnc.h index 219f14b722..d9f3a8219e 100644 --- a/src/tango/cnc/fd_cnc.h +++ b/src/tango/cnc/fd_cnc.h @@ -116,30 +116,6 @@ #define FD_CNC_ERR_AGAIN (-3) /* potentially transient failure */ #define FD_CNC_ERR_FAIL (-4) /* permanent failure */ -/* FD_CNC_DIAG_* specify standard locations in a producers cnc's - application region that can be used across a wide variety - communicating producers and consumers for accumulating flow control - diagnostics in a standard remote monitoring friendly way. This will - be on the first cache line of the application region, require the cnc - application region size to be at least 16 bytes and reserve the first - 16 bytes for these. Treating the application region as an array of - ulongs: - - IN_BACKP is 1 if the producer is currently waiting for one or more - consumers to catch up enough enough to resume publishing. - - BACKP_CNT is the number of times the producer had to wait for one - or more consumers to catch up to resume publishing. - - If the application wants to accumulate other counters in the cnc app - region (and, practically, because of alignment constraints, there is - always an implied app region of size 64 bytes currently), it is - recommended that the counters 2:7 be either rarely updated or - producer updated. */ - -#define FD_CNC_DIAG_IN_BACKP (0UL) /* updated by the producer, ideally never */ -#define FD_CNC_DIAG_BACKP_CNT (1UL) /* updated by the producer, ideally never */ - /* fd_cnc_t is an opaque handle of a command-and-control object. Details are exposed here to facilitate inlining of many cnc operations in performance critical app thread paths. */ diff --git a/src/tango/cnc/test_cnc.c b/src/tango/cnc/test_cnc.c index 37a25a7b8c..fa4d4183c5 100644 --- a/src/tango/cnc/test_cnc.c +++ b/src/tango/cnc/test_cnc.c @@ -20,9 +20,6 @@ FD_STATIC_ASSERT( FD_CNC_ERR_INVAL==-2, unit_test ); FD_STATIC_ASSERT( FD_CNC_ERR_AGAIN==-3, unit_test ); FD_STATIC_ASSERT( FD_CNC_ERR_FAIL ==-4, unit_test ); -FD_STATIC_ASSERT( FD_CNC_DIAG_IN_BACKP ==0UL, unit_test ); -FD_STATIC_ASSERT( FD_CNC_DIAG_BACKP_CNT==1UL, unit_test ); - #define APP_MIN (32UL) #define APP_MAX (192UL) uchar __attribute__((aligned(FD_CNC_ALIGN))) shmem[ FD_CNC_FOOTPRINT( APP_MAX ) ]; diff --git a/src/tango/fseq/fd_fseq.h b/src/tango/fseq/fd_fseq.h index a484adabd9..1e6b83a962 100644 --- a/src/tango/fseq/fd_fseq.h +++ b/src/tango/fseq/fd_fseq.h @@ -24,44 +24,6 @@ #define FD_FSEQ_APP_ALIGN (32UL) #define FD_FSEQ_APP_FOOTPRINT (96UL) -/* FD_FSEQ_DIAG_* specify standard locations in the fseq's application - region that can be used across a wide variety communicating producers - and consumers for accumulating flow control diagnostics in a standard - remote monitoring friendly way. Treating the application region as - an array of ulongs: - - PUB_CNT is the number of received fragments processed/forwarded by the consumer - PUB_SZ is the number of received fragment payload bytes processed/forward by the consumer - FILT_CNT is the number of received fragments skipped/ignored/filtered by the consumer - FILT_SZ is the number of received fragment payload bytes skipped/ignored/filtered by the consumer - OVRNP_CNT is the number of input overruns detected while polling for metadata by the consumer - OVRNR_CNT is the number of input overruns detected while reading metadata by the consumer - SLOW_CNT is the number of times the consumer was detected as rate limiting consumer by the producer - - It is worth noting that, given properly configured flow control: - - OVRNP_CNT==OVRNR_CNT==0 - PUB_CNT+FILT_CNT==RX_CNT==TX_CNT - PUB_SZ +FILT_SZ ==RX_SZ ==TX_SZ - - so there isn't much utility adding additional counters for RX_CNT, - RX_SZ, TX_CNT and/or TX_SZ as these can be strictly derived from the - counters that are already there under normal operating conditions and - abnormal operating conditions can be detected. - - Note that application that use these counters, counters 7:11 remain - available for application specific usage. To avoid cache line ping - pong, it recommend that any use of these be for rare events and/or - for events counted by the producer. */ - -#define FD_FSEQ_DIAG_PUB_CNT (0UL) /* On the 1st fseq cache line, updated by the consumer frequently */ -#define FD_FSEQ_DIAG_PUB_SZ (1UL) /* " */ -#define FD_FSEQ_DIAG_FILT_CNT (2UL) /* " */ -#define FD_FSEQ_DIAG_FILT_SZ (3UL) /* " */ -#define FD_FSEQ_DIAG_OVRNP_CNT (4UL) /* On the 2nd fseq cache line, updated by the consumer, ideally never */ -#define FD_FSEQ_DIAG_OVRNR_CNT (5UL) /* " */ -#define FD_FSEQ_DIAG_SLOW_CNT (6UL) /* ", updated by the producer, rarely */ - FD_PROTOTYPES_BEGIN /* fd_fseq_{align,footprint} return the required alignment and footprint diff --git a/src/tango/fseq/test_fseq.c b/src/tango/fseq/test_fseq.c index cf7acfe58e..9bdce1891c 100644 --- a/src/tango/fseq/test_fseq.c +++ b/src/tango/fseq/test_fseq.c @@ -6,14 +6,6 @@ FD_STATIC_ASSERT( FD_FSEQ_FOOTPRINT==128UL, unit_test ); FD_STATIC_ASSERT( FD_FSEQ_APP_ALIGN ==32UL, unit_test ); FD_STATIC_ASSERT( FD_FSEQ_APP_FOOTPRINT==96UL, unit_test ); -FD_STATIC_ASSERT( FD_FSEQ_DIAG_PUB_CNT ==0UL, unit_test ); -FD_STATIC_ASSERT( FD_FSEQ_DIAG_PUB_SZ ==1UL, unit_test ); -FD_STATIC_ASSERT( FD_FSEQ_DIAG_FILT_CNT ==2UL, unit_test ); -FD_STATIC_ASSERT( FD_FSEQ_DIAG_FILT_SZ ==3UL, unit_test ); -FD_STATIC_ASSERT( FD_FSEQ_DIAG_OVRNP_CNT==4UL, unit_test ); -FD_STATIC_ASSERT( FD_FSEQ_DIAG_OVRNR_CNT==5UL, unit_test ); -FD_STATIC_ASSERT( FD_FSEQ_DIAG_SLOW_CNT ==6UL, unit_test ); - static uchar shmem[ FD_FSEQ_FOOTPRINT ] __attribute__((aligned(FD_FSEQ_ALIGN))); int diff --git a/src/tango/test_frag_rx.c b/src/tango/test_frag_rx.c index 3af9497747..a773d3c160 100644 --- a/src/tango/test_frag_rx.c +++ b/src/tango/test_frag_rx.c @@ -6,6 +6,14 @@ FD_STATIC_ASSERT( FD_CHUNK_SZ==64UL, unit_test ); static uchar fseq_mem[ FD_FSEQ_FOOTPRINT ] __attribute__((aligned(FD_FSEQ_ALIGN))); +#define FD_FSEQ_DIAG_PUB_CNT (0UL) +#define FD_FSEQ_DIAG_PUB_SZ (1UL) +#define FD_FSEQ_DIAG_FILT_CNT (2UL) +#define FD_FSEQ_DIAG_FILT_SZ (3UL) +#define FD_FSEQ_DIAG_OVRNP_CNT (4UL) +#define FD_FSEQ_DIAG_OVRNR_CNT (5UL) +#define FD_FSEQ_DIAG_SLOW_CNT (6UL) + int main( int argc, char ** argv ) { diff --git a/src/tango/test_frag_tx.c b/src/tango/test_frag_tx.c index c075d0c385..965b375b3f 100644 --- a/src/tango/test_frag_tx.c +++ b/src/tango/test_frag_tx.c @@ -11,6 +11,17 @@ FD_STATIC_ASSERT( FD_CHUNK_SZ==64UL, unit_test ); static uchar fctl_mem[ FD_FCTL_FOOTPRINT( RX_MAX ) ] __attribute__((aligned(FD_FCTL_ALIGN))); static char * _fseq[ RX_MAX ]; +#define FD_CNC_DIAG_IN_BACKP (0UL) +#define FD_CNC_DIAG_BACKP_CNT (1UL) + +#define FD_FSEQ_DIAG_PUB_CNT (0UL) +#define FD_FSEQ_DIAG_PUB_SZ (1UL) +#define FD_FSEQ_DIAG_FILT_CNT (2UL) +#define FD_FSEQ_DIAG_FILT_SZ (3UL) +#define FD_FSEQ_DIAG_OVRNP_CNT (4UL) +#define FD_FSEQ_DIAG_OVRNR_CNT (5UL) +#define FD_FSEQ_DIAG_SLOW_CNT (6UL) + int main( int argc, char ** argv ) { diff --git a/src/tango/test_meta_rx.c b/src/tango/test_meta_rx.c index 4870b9cd6e..a6345a5253 100644 --- a/src/tango/test_meta_rx.c +++ b/src/tango/test_meta_rx.c @@ -4,6 +4,14 @@ static uchar fseq_mem[ FD_FSEQ_FOOTPRINT ] __attribute__((aligned(FD_FSEQ_ALIGN))); +#define FD_FSEQ_DIAG_PUB_CNT (0UL) +#define FD_FSEQ_DIAG_PUB_SZ (1UL) +#define FD_FSEQ_DIAG_FILT_CNT (2UL) +#define FD_FSEQ_DIAG_FILT_SZ (3UL) +#define FD_FSEQ_DIAG_OVRNP_CNT (4UL) +#define FD_FSEQ_DIAG_OVRNR_CNT (5UL) +#define FD_FSEQ_DIAG_SLOW_CNT (6UL) + int main( int argc, char ** argv ) { diff --git a/src/tango/test_meta_tx.c b/src/tango/test_meta_tx.c index 204b8f0b9c..134550e9e9 100644 --- a/src/tango/test_meta_tx.c +++ b/src/tango/test_meta_tx.c @@ -7,6 +7,17 @@ static uchar fctl_mem[ FD_FCTL_FOOTPRINT( RX_MAX ) ] __attribute__((aligned(FD_FCTL_ALIGN))); static char * _fseq[ RX_MAX ]; +#define FD_CNC_DIAG_IN_BACKP (0UL) +#define FD_CNC_DIAG_BACKP_CNT (1UL) + +#define FD_FSEQ_DIAG_PUB_CNT (0UL) +#define FD_FSEQ_DIAG_PUB_SZ (1UL) +#define FD_FSEQ_DIAG_FILT_CNT (2UL) +#define FD_FSEQ_DIAG_FILT_SZ (3UL) +#define FD_FSEQ_DIAG_OVRNP_CNT (4UL) +#define FD_FSEQ_DIAG_OVRNR_CNT (5UL) +#define FD_FSEQ_DIAG_SLOW_CNT (6UL) + int main( int argc, char ** argv ) { diff --git a/src/util/hist/fd_histf.h b/src/util/hist/fd_histf.h index bb44800ba6..43b1ec3e21 100644 --- a/src/util/hist/fd_histf.h +++ b/src/util/hist/fd_histf.h @@ -92,6 +92,7 @@ fd_histf_new( void * mem, ulong max_value ) { if( FD_UNLIKELY( max_value<=min_value ) ) return NULL; + min_value = fd_ulong_max( min_value, 1UL ); max_value = fd_ulong_max( max_value, min_value + FD_HISTF_BUCKET_CNT - 2UL ); fd_histf_t * hist = (fd_histf_t*)mem;