Skip to content

Commit

Permalink
metrics: add prometheus endpoint
Browse files Browse the repository at this point in the history
A metrics tile is added and wired up to the metrics shared memory area,
with a simple socket-based HTTP server that serves the values in
Prometheus text format.
  • Loading branch information
mmcgee-jump committed Dec 4, 2023
1 parent a14ccef commit 641480b
Show file tree
Hide file tree
Showing 41 changed files with 1,449 additions and 310 deletions.
2 changes: 0 additions & 2 deletions ffi/rust/firedancer-sys/src/tango/cnc.rs
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,6 @@ pub use crate::gentango::{
fd_cstr_to_cnc_signal,
FD_CNC_ALIGN,
FD_CNC_APP_ALIGN,
FD_CNC_DIAG_BACKP_CNT,
FD_CNC_DIAG_IN_BACKP,
FD_CNC_ERR_AGAIN,
FD_CNC_ERR_FAIL,
FD_CNC_ERR_INVAL,
Expand Down
7 changes: 0 additions & 7 deletions ffi/rust/firedancer-sys/src/tango/fseq.rs
Original file line number Diff line number Diff line change
Expand Up @@ -13,12 +13,5 @@ pub use crate::gentango::{
FD_FSEQ_ALIGN,
FD_FSEQ_APP_ALIGN,
FD_FSEQ_APP_FOOTPRINT,
FD_FSEQ_DIAG_FILT_CNT,
FD_FSEQ_DIAG_FILT_SZ,
FD_FSEQ_DIAG_OVRNP_CNT,
FD_FSEQ_DIAG_OVRNR_CNT,
FD_FSEQ_DIAG_PUB_CNT,
FD_FSEQ_DIAG_PUB_SZ,
FD_FSEQ_DIAG_SLOW_CNT,
FD_FSEQ_FOOTPRINT,
};
2 changes: 1 addition & 1 deletion solana
7 changes: 5 additions & 2 deletions src/app/fdctl/Local.mk
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,9 @@ ifdef FD_HAS_ALLOCA
ifdef FD_HAS_X86
ifdef FD_HAS_DOUBLE

.PHONY: fdctl cargo
.PHONY: fdctl cargo rust solana

$(call add-objs,main1 config caps utility topology keys ready mem spy help run/run run/tiles/tiles run/run1 run/run_solana run/tiles/tiles run/tiles/fd_net run/tiles/fd_netmux run/tiles/fd_dedup run/tiles/fd_pack run/tiles/fd_quic run/tiles/fd_verify run/tiles/fd_bank run/tiles/fd_shred run/tiles/fd_store monitor/monitor monitor/helper configure/configure configure/large_pages configure/sysctl configure/shmem configure/xdp configure/xdp_leftover configure/ethtool configure/workspace_leftover configure/workspace,fd_fdctl)
$(call add-objs,main1 config caps utility topology keys ready mem spy help run/run run/tiles/tiles run/run1 run/run_solana run/tiles/tiles run/tiles/fd_net run/tiles/fd_metric run/tiles/fd_netmux run/tiles/fd_dedup run/tiles/fd_pack run/tiles/fd_quic run/tiles/fd_verify run/tiles/fd_bank run/tiles/fd_shred run/tiles/fd_store monitor/monitor monitor/helper configure/configure configure/large_pages configure/sysctl configure/shmem configure/xdp configure/xdp_leftover configure/ethtool configure/workspace_leftover configure/workspace,fd_fdctl)
$(call make-bin-rust,fdctl,main,fd_fdctl fd_disco fd_flamenco fd_ip fd_reedsol fd_ballet fd_tango fd_util fd_quic solana_validator)
$(OBJDIR)/obj/app/fdctl/configure/xdp.o: src/tango/xdp/fd_xdp_redirect_prog.o
$(OBJDIR)/obj/app/fdctl/config.o: src/app/fdctl/config/default.toml
Expand All @@ -18,6 +18,7 @@ $(OBJDIR)/obj/app/fdctl/run/tiles/fd_pack.o: src/app/fdctl/run/tiles/generated/p
$(OBJDIR)/obj/app/fdctl/run/tiles/fd_quic.o: src/app/fdctl/run/tiles/generated/quic_seccomp.h
$(OBJDIR)/obj/app/fdctl/run/tiles/fd_shred.o: src/app/fdctl/run/tiles/generated/shred_seccomp.h
$(OBJDIR)/obj/app/fdctl/run/tiles/fd_verify.o: src/app/fdctl/run/tiles/generated/verify_seccomp.h
$(OBJDIR)/obj/app/fdctl/run/tiles/fd_metric.o: src/app/fdctl/run/tiles/generated/metric_seccomp.h

# Phony target to always rerun cargo build ... it will detect if anything
# changed on the library side.
Expand Down Expand Up @@ -56,6 +57,8 @@ $(OBJDIR)/bin/solana: solana/target/$(RUST_PROFILE)/solana

rust: $(OBJDIR)/bin/solana

solana: $(OBJDIR)/bin/solana

endif
endif
endif
Expand Down
6 changes: 6 additions & 0 deletions src/app/fdctl/config.c
Original file line number Diff line number Diff line change
Expand Up @@ -234,6 +234,8 @@ static int parse_key_value( config_t * config,
ENTRY_UINT ( ., tiles.shred, max_pending_shred_sets );
ENTRY_USHORT( ., tiles.shred, shred_listen_port );

ENTRY_USHORT( ., tiles.metric, prometheus_listen_port );

ENTRY_BOOL ( ., development, sandbox );
ENTRY_BOOL ( ., development, no_solana_labs );

Expand Down Expand Up @@ -554,6 +556,7 @@ topo_initialize( config_t * config ) {
TILE( config->layout.bank_tile_count, FD_TOPO_TILE_KIND_BANK, FD_TOPO_WKSP_KIND_BANK, ULONG_MAX );
TILE( 1, FD_TOPO_TILE_KIND_SHRED, FD_TOPO_WKSP_KIND_SHRED, fd_topo_find_link( topo, FD_TOPO_LINK_KIND_SHRED_TO_STORE, i ) );
TILE( 1, FD_TOPO_TILE_KIND_STORE, FD_TOPO_WKSP_KIND_STORE, ULONG_MAX );
TILE( 1, FD_TOPO_TILE_KIND_METRIC, FD_TOPO_WKSP_KIND_METRIC, ULONG_MAX );

topo->tile_cnt = tile_cnt;

Expand Down Expand Up @@ -982,6 +985,9 @@ config_parse( int * pargc,
break;
case FD_TOPO_TILE_KIND_STORE:
break;
case FD_TOPO_TILE_KIND_METRIC:
tile->metric.prometheus_listen_port = config->tiles.metric.prometheus_listen_port;
break;
default:
FD_LOG_ERR(( "unknown tile kind %lu", tile->kind ));
}
Expand Down
3 changes: 3 additions & 0 deletions src/app/fdctl/config.h
Original file line number Diff line number Diff line change
Expand Up @@ -188,6 +188,9 @@ typedef struct {
ushort shred_listen_port;
} shred;

struct {
ushort prometheus_listen_port;
} metric;
} tiles;
} config_t;

Expand Down
12 changes: 11 additions & 1 deletion src/app/fdctl/config/default.toml
Original file line number Diff line number Diff line change
Expand Up @@ -477,7 +477,7 @@ dynamic_port_range = "8900-9000"
#
# It is suggested to use all available CPU cores for Firedancer, so
# that the Solana network can run as fast as possible.
affinity = "0-15"
affinity = "0-16"

# How many net tiles to run. Each networking tile will service
# exactly one queue from a network device being listened to. If
Expand Down Expand Up @@ -696,6 +696,7 @@ dynamic_port_range = "8900-9000"
# the client nor the server has sent any packet to the other for
# a period of time. Once this timeout is reached the connection
# will be terminated.
#
# An idle connection will be terminated if it remains idle longer than
# some threshold. "idle_timeout_millis" represents this threshold in
# milliseconds
Expand Down Expand Up @@ -784,6 +785,15 @@ dynamic_port_range = "8900-9000"
# this one.
shred_listen_port = 8003

# The metric tile receives metrics updates published from the rest
# of the tiles and serves them via a Prometheus-compatible HTTP
# endpoint.
[tiles.metric]
# The port to listen on for HTTP requests for Prometheus metrics.
# Firedancer serves metrics at a URI like
# 127.0.0.1:7999/metrics
prometheus_listen_port = 7999

# These options can be useful for development, but should not be used
# when connecting to a live cluster, as they may cause the validator to
# be unstable or have degraded performance or security. The program
Expand Down
48 changes: 31 additions & 17 deletions src/app/fdctl/monitor/monitor.c
Original file line number Diff line number Diff line change
Expand Up @@ -100,15 +100,15 @@ tile_snap( tile_snap_t * snap_cur, /* Snapshot for each tile, indexed [0,til
FD_COMPILER_MFENCE();
snap->pid = FD_MGAUGE_GET( TILE, PID );
snap->in_backp = FD_MGAUGE_GET( STEM, IN_BACKPRESSURE );
snap->backp_cnt = FD_MCNT_GET( STEM, BACKPRESSURE );
snap->housekeeping_ticks = FD_MHIST_SUM( STEM, LOOP_DURATION_HOUSEKEEPING );
snap->backpressure_ticks = FD_MHIST_SUM( STEM, LOOP_DURATION_BACKPRESSURE );
snap->caught_up_ticks = FD_MHIST_SUM( STEM, LOOP_DURATION_CAUGHT_UP );
snap->overrun_polling_ticks = FD_MHIST_SUM( STEM, LOOP_DURATION_OVERRUN_POLLING );
snap->overrun_reading_ticks = FD_MHIST_SUM( STEM, LOOP_DURATION_OVERRUN_READING );
snap->filter_before_frag_ticks = FD_MHIST_SUM( STEM, LOOP_DURATION_FILTER_BEFORE_FRAGMENT );
snap->filter_after_frag_ticks = FD_MHIST_SUM( STEM, LOOP_DURATION_FILTER_AFTER_FRAGMENT );
snap->finish_ticks = FD_MHIST_SUM( STEM, LOOP_DURATION_FINISH );
snap->backp_cnt = FD_MCNT_GET( STEM, BACKPRESSURE_COUNT );
snap->housekeeping_ticks = FD_MHIST_SUM( STEM, LOOP_HOUSEKEEPING_DURATION_SECONDS );
snap->backpressure_ticks = FD_MHIST_SUM( STEM, LOOP_BACKPRESSURE_DURATION_SECONDS );
snap->caught_up_ticks = FD_MHIST_SUM( STEM, LOOP_CAUGHT_UP_DURATION_SECONDS );
snap->overrun_polling_ticks = FD_MHIST_SUM( STEM, LOOP_OVERRUN_POLLING_DURATION_SECONDS );
snap->overrun_reading_ticks = FD_MHIST_SUM( STEM, LOOP_OVERRUN_READING_DURATION_SECONDS );
snap->filter_before_frag_ticks = FD_MHIST_SUM( STEM, LOOP_FILTER_BEFORE_FRAGMENT_DURATION_SECONDS );
snap->filter_after_frag_ticks = FD_MHIST_SUM( STEM, LOOP_FILTER_AFTER_FRAGMENT_DURATION_SECONDS );
snap->finish_ticks = FD_MHIST_SUM( STEM, LOOP_FINISH_DURATION_SECONDS );
FD_COMPILER_MFENCE();
}
}
Expand All @@ -126,15 +126,29 @@ link_snap( link_snap_t * snap_cur,

ulong const * fseq = topo->tiles[ tile_idx ].in_link_fseq[ in_idx ];
snap->fseq_seq = fd_fseq_query( fseq );
ulong const * fseq_diag = (ulong const *)fd_fseq_app_laddr_const( fseq );

ulong const * in_metrics = (ulong const *)fd_metrics_link_in( topo->tiles[ tile_idx ].metrics, in_idx );

fd_topo_link_t * link = &topo->links[ topo->tiles[ tile_idx ].in_link_id[ in_idx ] ];
ulong producer_id = fd_topo_find_link_producer( topo, link );
ulong const * out_metrics = NULL;
if( FD_LIKELY( producer_id!=ULONG_MAX && topo->tiles[ tile_idx ].in_link_reliable[ in_idx ] ) ) {
fd_topo_tile_t * producer = &topo->tiles[ producer_id ];
ulong out_idx;
for( out_idx=0UL; out_idx<producer->out_cnt; out_idx++ ) {
if( producer->out_link_id[ out_idx ]==link->id ) break;
}
out_metrics = fd_metrics_link_out( producer->metrics, out_idx );
}
FD_COMPILER_MFENCE();
snap->fseq_diag_tot_cnt = fseq_diag[ FD_FSEQ_DIAG_PUB_CNT ];
snap->fseq_diag_tot_sz = fseq_diag[ FD_FSEQ_DIAG_PUB_SZ ];
snap->fseq_diag_filt_cnt = fseq_diag[ FD_FSEQ_DIAG_FILT_CNT ];
snap->fseq_diag_filt_sz = fseq_diag[ FD_FSEQ_DIAG_FILT_SZ ];
snap->fseq_diag_ovrnp_cnt = fseq_diag[ FD_FSEQ_DIAG_OVRNP_CNT ];
snap->fseq_diag_ovrnr_cnt = fseq_diag[ FD_FSEQ_DIAG_OVRNR_CNT ];
snap->fseq_diag_slow_cnt = fseq_diag[ FD_FSEQ_DIAG_SLOW_CNT ];
snap->fseq_diag_tot_cnt = in_metrics[ FD_METRICS_COUNTER_LINK_PUBLISHED_COUNT_OFF ];
snap->fseq_diag_tot_sz = in_metrics[ FD_METRICS_COUNTER_LINK_PUBLISHED_SIZE_BYTES_OFF ];
snap->fseq_diag_filt_cnt = in_metrics[ FD_METRICS_COUNTER_LINK_FILTERED_COUNT_OFF ];
snap->fseq_diag_filt_sz = in_metrics[ FD_METRICS_COUNTER_LINK_FILTERED_SIZE_BYTES_OFF ];
snap->fseq_diag_ovrnp_cnt = in_metrics[ FD_METRICS_COUNTER_LINK_OVERRUN_POLLING_COUNT_OFF ];
snap->fseq_diag_ovrnr_cnt = in_metrics[ FD_METRICS_COUNTER_LINK_OVERRUN_READING_COUNT_OFF ];

snap->fseq_diag_slow_cnt = out_metrics[ FD_METRICS_COUNTER_LINK_SLOW_COUNT_OFF ];
FD_COMPILER_MFENCE();
snap->fseq_diag_tot_cnt += snap->fseq_diag_filt_cnt;
snap->fseq_diag_tot_sz += snap->fseq_diag_filt_sz;
Expand Down
9 changes: 2 additions & 7 deletions src/app/fdctl/run/run1.c
Original file line number Diff line number Diff line change
Expand Up @@ -56,11 +56,6 @@ tile_main( void * _args ) {
&fd_tile_private_stack0, &fd_tile_private_stack1 );
FD_LOG_NOTICE(( "booting tile %s:%lu pid:%lu", fd_topo_tile_kind_str( tile->kind ), tile->kind_id, fd_log_group_id() ));

/* calling fd_tempo_tick_per_ns requires nanosleep, it is cached with
a FD_ONCE. We do this for all tiles before sandboxing so that we
don't need to allow the nanosleep syscall. */
fd_tempo_tick_per_ns( NULL );

/* preload shared memory before sandboxing, so it is already mapped */
fd_topo_join_tile_workspaces( args->config->name,
&args->config->topo,
Expand Down Expand Up @@ -96,7 +91,7 @@ tile_main( void * _args ) {
fd_sandbox( args->config->development.sandbox,
args->config->uid,
args->config->gid,
0UL,
config->rlimit_file_cnt,
allow_fds_cnt+allow_fds_offset,
allow_fds,
seccomp_filter_cnt,
Expand Down Expand Up @@ -170,7 +165,7 @@ tile_main( void * _args ) {
fd_alloca( FD_MUX_TILE_SCRATCH_ALIGN, FD_MUX_TILE_SCRATCH_FOOTPRINT( tile->in_cnt, out_cnt_reliable ) ),
ctx,
&callbacks );

FD_LOG_ERR(( "tile run loop returned" ));
return 0;
}

Expand Down
Loading

0 comments on commit 641480b

Please sign in to comment.