Skip to content

Commit

Permalink
Clean up logging (#2265)
Browse files Browse the repository at this point in the history
Demotes some logging and cleans up some instrumentation to avoid
overly noisy output that isn't cause for concern.

* In catchup, demote retryable failures from WARN to INFO. We now
  only WARN if the overall operation fails (failed to fetch from
  any provider) or if we fail to fetch from a single provider in a
  way that indicates that provider is malicious (e.g. invalid Merkle
  proof) instead of just unreachable (e.g. HTTP error)
* Add instance state to `skip` list for `instrument`, since it generates
  a very large debug output with each log
* Remove `Debug` requirement from `StateCatchup` trait. This was generating
  a huge dump of useless data for the SQL provider in particular. Instead,
  we have a function `name() -> String` which is meant to be a short, readable
  identifier of the provider. We now use this when logging information about a
  specific catchup provider
* Update `derive_more`, so we can customize the `Debug` impl of `NodeState` to
  use the `name()` function of the nested `Box<dyn StateCatchup>`, instead of
  requiring a derived `Debug` impl for the trait object. This update was a
  breaking change and thus required minor tweaks to a lot of other uses of
  `derive_more`.
* Demote `timed out fetching proposal` from WARN to INFO. This happens normally
  when we are fetching an old proposal that the rest of the network has already
  GCed, and the task will eventually just exit on its own
* Demote `background task exited` from INFO to DEBUG for short-lived tasks
  (proposal fetching) only
* Demote L1 logging that happens every L1 block from INFO to DEBUG
  • Loading branch information
jbearer authored Nov 8, 2024
1 parent 092ad42 commit 8beed63
Show file tree
Hide file tree
Showing 15 changed files with 142 additions and 58 deletions.
8 changes: 5 additions & 3 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ blake3 = "1.5"
circular-buffer = "0.1.7"
clap = { version = "4.4", features = ["derive", "env", "string"] }
cld = "0.5"
derive_more = "0.99.17"
derive_more = { version = "1.0", features = ["full"] }
es-version = { git = "https://github.com/EspressoSystems/es-version.git", branch = "main" }
dotenvy = "0.15"
dyn-clone = "1.0"
Expand Down
4 changes: 2 additions & 2 deletions sequencer/src/api.rs
Original file line number Diff line number Diff line change
Expand Up @@ -293,7 +293,7 @@ impl<
Ok(tree)
}

#[tracing::instrument(skip(self))]
#[tracing::instrument(skip(self, instance))]
async fn get_frontier(
&self,
instance: &NodeState,
Expand Down Expand Up @@ -383,7 +383,7 @@ impl<N: ConnectedNetwork<PubKey>, V: Versions, P: SequencerPersistence> CatchupD
retain_accounts(&state.fee_merkle_tree, accounts.iter().copied())
}

#[tracing::instrument(skip(self))]
#[tracing::instrument(skip(self, _instance))]
async fn get_frontier(
&self,
_instance: &NodeState,
Expand Down
6 changes: 3 additions & 3 deletions sequencer/src/bin/keygen.rs
Original file line number Diff line number Diff line change
Expand Up @@ -19,11 +19,11 @@ use tracing::info_span;
#[derive(Clone, Copy, Debug, Display, Default, ValueEnum)]
enum Scheme {
#[default]
#[display(fmt = "all")]
#[display("all")]
All,
#[display(fmt = "bls")]
#[display("bls")]
Bls,
#[display(fmt = "schnorr")]
#[display("schnorr")]
Schnorr,
}

Expand Down
6 changes: 3 additions & 3 deletions sequencer/src/bin/utils/keygen.rs
Original file line number Diff line number Diff line change
Expand Up @@ -18,11 +18,11 @@ use tracing::info_span;
#[derive(Clone, Copy, Debug, Display, Default, ValueEnum)]
enum Scheme {
#[default]
#[display(fmt = "all")]
#[display("all")]
All,
#[display(fmt = "bls")]
#[display("bls")]
Bls,
#[display(fmt = "schnorr")]
#[display("schnorr")]
Schnorr,
}

Expand Down
39 changes: 29 additions & 10 deletions sequencer/src/catchup.rs
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ use hotshot_types::{
data::ViewNumber, network::NetworkConfig, traits::node_implementation::ConsensusTime as _,
ValidatorConfig,
};
use itertools::Itertools;
use jf_merkle_tree::{prelude::MerkleNode, ForgetableMerkleTreeScheme, MerkleTreeScheme};
use serde::de::DeserializeOwned;
use std::collections::HashMap;
Expand Down Expand Up @@ -141,19 +142,19 @@ impl<ApiVer: StaticVersionType> StateCatchup for StatePeers<ApiVer> {
let snapshot = match req.send().await {
Ok(res) => res,
Err(err) => {
tracing::warn!("Error fetching accounts from peer: {err:#}");
tracing::info!(peer = %client.url, "error fetching accounts from peer: {err:#}");
continue;
}
};

// Verify proofs.
for account in accounts {
let Some((proof, _)) = FeeAccountProof::prove(&snapshot, (*account).into()) else {
tracing::warn!("response from peer missing account {account}");
tracing::warn!(peer = %client.url, "response from peer missing account {account}");
continue;
};
if let Err(err) = proof.verify(&fee_merkle_tree_root) {
tracing::warn!("peer gave invalid proof for account {account}: {err:#}");
tracing::warn!(peer = %client.url, "peer gave invalid proof for account {account}: {err:#}");
continue;
}
}
Expand All @@ -163,7 +164,7 @@ impl<ApiVer: StaticVersionType> StateCatchup for StatePeers<ApiVer> {
bail!("Could not fetch account from any peer");
}

#[tracing::instrument(skip(self, mt), height = mt.num_leaves())]
#[tracing::instrument(skip(self, _instance, mt))]
async fn try_remember_blocks_merkle_tree(
&self,
_instance: &NodeState,
Expand All @@ -172,27 +173,27 @@ impl<ApiVer: StaticVersionType> StateCatchup for StatePeers<ApiVer> {
mt: &mut BlockMerkleTree,
) -> anyhow::Result<()> {
for client in self.clients.iter() {
tracing::info!("Fetching frontier from {}", client.url);
tracing::debug!(peer = %client.url, "fetching frontier from peer");
match client
.get::<BlocksFrontier>(&format!("catchup/{height}/{}/blocks", view.u64()))
.send()
.await
{
Ok(frontier) => {
let Some(elem) = frontier.elem() else {
tracing::warn!("Provided frontier is missing leaf element");
tracing::warn!(peer = %client.url, "Provided frontier is missing leaf element");
continue;
};
match mt.remember(mt.num_leaves() - 1, *elem, &frontier) {
Ok(_) => return Ok(()),
Err(err) => {
tracing::warn!("Error verifying block proof: {}", err);
tracing::warn!(peer = %client.url, "Error verifying block proof: {err:#}");
continue;
}
}
}
Err(err) => {
tracing::warn!("Error fetching blocks from peer: {}", err);
tracing::info!(peer = %client.url, "error fetching blocks from peer: {err:#}");
}
}
}
Expand Down Expand Up @@ -233,6 +234,16 @@ impl<ApiVer: StaticVersionType> StateCatchup for StatePeers<ApiVer> {
fn backoff(&self) -> &BackoffParams {
&self.backoff
}

fn name(&self) -> String {
    // Short, readable identifier for logging: lists every peer URL so a log
    // line can be traced back to the specific set of catchup providers.
    let urls: Vec<String> = self
        .clients
        .iter()
        .map(|client| client.url.to_string())
        .collect();
    format!("StatePeers({})", urls.join(","))
}
}

pub(crate) trait CatchupStorage: Sync {
Expand Down Expand Up @@ -344,7 +355,7 @@ impl<T> SqlStateCatchup<T> {
#[async_trait]
impl<T> StateCatchup for SqlStateCatchup<T>
where
T: CatchupStorage + std::fmt::Debug + Send + Sync,
T: CatchupStorage + Send + Sync,
{
// TODO: add a test for the account proof validation
// issue # 2102 (https://github.com/EspressoSystems/espresso-sequencer/issues/2102)
Expand All @@ -364,7 +375,7 @@ where
.0)
}

#[tracing::instrument(skip(self))]
#[tracing::instrument(skip(self, instance, mt))]
async fn try_remember_blocks_merkle_tree(
&self,
instance: &NodeState,
Expand Down Expand Up @@ -410,6 +421,10 @@ where
fn backoff(&self) -> &BackoffParams {
&self.backoff
}

fn name(&self) -> String {
    // Fixed identifier used when logging about this provider; intentionally
    // short instead of dumping the underlying storage (see #2265).
    String::from("SqlStateCatchup")
}
}

/// Disable catchup entirely.
Expand Down Expand Up @@ -479,4 +494,8 @@ impl StateCatchup for NullStateCatchup {
fn backoff(&self) -> &BackoffParams {
&self.backoff
}

fn name(&self) -> String {
    // Fixed, human-readable identifier for this catchup-disabled provider.
    "NullStateCatchup".to_string()
}
}
62 changes: 48 additions & 14 deletions sequencer/src/context.rs
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ use hotshot_types::{
PeerConfig,
};
use std::time::Duration;
use tracing::{Instrument, Level};
use url::Url;

use crate::{
Expand Down Expand Up @@ -319,6 +320,17 @@ impl<N: ConnectedNetwork<PubKey>, P: SequencerPersistence, V: Versions> Sequence
self.tasks.spawn(name, task);
}

/// Spawn a short-lived background task attached to this context.
///
/// When this context is dropped or [`shut_down`](Self::shut_down), background tasks will be
/// cancelled in the reverse order that they were spawned.
///
/// The only difference between a short-lived background task and a [long-lived](Self::spawn)
/// one is how urgently logging related to the task is treated.
pub fn spawn_short_lived(&mut self, name: impl Display, task: impl Future + Send + 'static) {
    // Delegates to the task list, which registers the task with quieter
    // (DEBUG-level) lifecycle logging than `spawn` uses.
    self.tasks.spawn_short_lived(name, task);
}

/// Stop participating in consensus.
pub async fn shut_down(&mut self) {
tracing::info!("shutting down SequencerContext");
Expand Down Expand Up @@ -427,7 +439,7 @@ async fn fetch_proposals<N, P, V>(
// to the anchor. This allows state replay from the decided state.
let parent_view = proposal.data.justify_qc.view_number;
let parent_leaf = proposal.data.justify_qc.data.leaf_commit;
tasks.spawn(
tasks.spawn_short_lived(
format!("fetch proposal {parent_view:?},{parent_leaf}"),
fetch_proposal_chain(
consensus.clone(),
Expand Down Expand Up @@ -473,20 +485,20 @@ async fn fetch_proposal_chain<N, P, V>(
{
Ok(future) => future,
Err(err) => {
tracing::warn!(?view, %leaf, "failed to request proposal: {err:#}");
tracing::info!(?view, %leaf, "failed to request proposal: {err:#}");
sleep(Duration::from_secs(1)).await;
continue;
}
};
let proposal = match async_timeout(Duration::from_secs(30), future).await {
Ok(Ok(proposal)) => proposal,
Ok(Err(err)) => {
tracing::warn!("error fetching proposal: {err:#}");
tracing::info!("error fetching proposal: {err:#}");
sleep(Duration::from_secs(1)).await;
continue;
}
Err(_) => {
tracing::warn!("timed out fetching proposal");
tracing::info!("timed out fetching proposal");
sleep(Duration::from_secs(1)).await;
continue;
}
Expand Down Expand Up @@ -523,7 +535,7 @@ async fn fetch_proposal_chain<N, P, V>(
consensus
.update_saved_leaves(leaf, &handle.hotshot.upgrade_lock)
.await;
tracing::info!(
tracing::debug!(
?view,
"added view to validated state map view proposal fetcher"
);
Expand All @@ -550,21 +562,43 @@ async fn load_anchor_view(persistence: &impl SequencerPersistence) -> ViewNumber
#[derive(Debug, Default)]
pub(crate) struct TaskList(Vec<(String, JoinHandle<()>)>);

/// Spawn a named background task onto `$this` (a [`TaskList`]), emitting
/// lifecycle events ("spawning" / "exited") at the caller-chosen `tracing`
/// level `$lvl`, so long-lived and short-lived tasks can log with different
/// urgency.
macro_rules! spawn_with_log_level {
    ($this:expr, $lvl:expr, $name:expr, $task: expr) => {
        // Owned copy of the name: one clone goes into the tracing span, the
        // original is stored alongside the join handle below.
        let name = $name.to_string();
        let task = {
            let name = name.clone();
            // Every event emitted inside the task is wrapped in this span,
            // so logs can be attributed to the named background task.
            let span = tracing::span!($lvl, "background task", name);
            spawn(
                async move {
                    tracing::event!($lvl, "spawning background task");
                    $task.await;
                    // Only reached if the task completes on its own (not
                    // when it is cancelled via its join handle).
                    tracing::event!($lvl, "background task exited");
                }
                .instrument(span),
            )
        };
        // Record (name, handle) so the TaskList can cancel tasks later.
        $this.0.push((name, task));
    };
}

impl TaskList {
/// Spawn a background task attached to this [`TaskList`].
///
/// When this [`TaskList`] is dropped or [`shut_down`](Self::shut_down), background tasks will
/// be cancelled in the reverse order that they were spawned.
pub fn spawn(&mut self, name: impl Display, task: impl Future + Send + 'static) {
let name = name.to_string();
let task = {
let name = name.clone();
spawn(async move {
task.await;
tracing::info!(name, "background task exited");
})
};
self.0.push((name, task));
spawn_with_log_level!(self, Level::INFO, name, task);
}

/// Spawn a short-lived background task attached to this [`TaskList`].
///
/// When this [`TaskList`] is dropped or [`shut_down`](Self::shut_down), background tasks will
/// be cancelled in the reverse order that they were spawned.
///
/// The only difference between a short-lived background task and a [long-lived](Self::spawn)
/// one is how urgently logging related to the task is treated.
pub fn spawn_short_lived(&mut self, name: impl Display, task: impl Future + Send + 'static) {
    // DEBUG (not INFO) lifecycle logging: short-lived tasks exiting is
    // routine and would otherwise be noisy.
    spawn_with_log_level!(self, Level::DEBUG, name, task);
}

/// Stop all background tasks.
Expand Down
7 changes: 6 additions & 1 deletion types/src/v0/impls/instance_state.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12,11 +12,12 @@ use super::state::ValidatedState;
/// Represents the immutable state of a node.
///
/// For mutable state, use `ValidatedState`.
#[derive(Debug, Clone)]
#[derive(derive_more::Debug, Clone)]
pub struct NodeState {
pub node_id: u64,
pub chain_config: crate::v0_3::ChainConfig,
pub l1_client: L1Client,
#[debug("{}", peers.name())]
pub peers: Arc<dyn StateCatchup>,
pub genesis_header: GenesisHeader,
pub genesis_state: ValidatedState,
Expand Down Expand Up @@ -254,5 +255,9 @@ pub mod mock {
fn backoff(&self) -> &BackoffParams {
&self.backoff
}

fn name(&self) -> String {
    // Fixed identifier used when logging about this mock provider.
    String::from("MockStateCatchup")
}
}
}
4 changes: 2 additions & 2 deletions types/src/v0/impls/l1.rs
Original file line number Diff line number Diff line change
Expand Up @@ -303,7 +303,7 @@ impl L1Client {
// Update the state snapshot;
let mut state = state.lock().await;
if head > state.snapshot.head {
tracing::info!(head, old_head = state.snapshot.head, "L1 head updated");
tracing::debug!(head, old_head = state.snapshot.head, "L1 head updated");
state.snapshot.head = head;
// Emit an event about the new L1 head. Ignore send errors; it just means no
// one is listening to events right now.
Expand All @@ -326,7 +326,7 @@ impl L1Client {
.ok();
}
}
tracing::info!("updated L1 snapshot to {:?}", state.snapshot);
tracing::debug!("updated L1 snapshot to {:?}", state.snapshot);
}

tracing::error!("L1 block stream ended unexpectedly, trying to re-establish");
Expand Down
Loading

0 comments on commit 8beed63

Please sign in to comment.