From 73de21b35d6b83def03f51caca06c1931ea8ee77 Mon Sep 17 00:00:00 2001 From: Divma <26765164+divagant-martian@users.noreply.github.com> Date: Thu, 25 Jul 2024 12:53:43 -0500 Subject: [PATCH 01/45] chore: fix clippy warnings (#2550) ## Description Fixes clippy warnings: - [needless borrows for generic args](https://rust-lang.github.io/rust-clippy/master/index.html#needless_borrows_for_generic_args) - [md lazy continuation](https://rust-lang.github.io/rust-clippy/master/index.html#/doc_lazy_continuation) which simply means clippy wants us to use consistent indentation for lists so that it can be sure how to format docs. ## Breaking Changes n/a ## Notes & open questions n/a ## Change checklist - [x] Self-review. - [x] Documentation updates following the [style guide](https://rust-lang.github.io/rfcs/1574-more-api-documentation-conventions.html#appendix-a-full-conventions-text), if relevant. - [ ] ~~tests if relevant.~~ - [ ] ~~All breaking changes documented.~~ --- iroh-base/src/ticket.rs | 2 +- iroh-blobs/src/get/db.rs | 2 +- iroh-blobs/src/protocol.rs | 50 ++++++++++++++---------------- iroh-blobs/src/store/fs.rs | 9 +++--- iroh-docs/src/lib.rs | 4 +-- iroh-docs/src/ranger.rs | 2 ++ iroh-gossip/src/proto/hyparview.rs | 8 ++--- iroh-gossip/src/proto/plumtree.rs | 20 ++++++------ iroh-net/src/endpoint.rs | 2 +- iroh-net/src/relay/client_conn.rs | 4 +-- iroh-net/src/relay/codec.rs | 3 +- 11 files changed, 53 insertions(+), 53 deletions(-) diff --git a/iroh-base/src/ticket.rs b/iroh-base/src/ticket.rs index e7b3f3bb87..04e7cbace9 100644 --- a/iroh-base/src/ticket.rs +++ b/iroh-base/src/ticket.rs @@ -36,7 +36,7 @@ pub trait Ticket: Sized { /// Serialize to string. fn serialize(&self) -> String { let mut out = Self::KIND.to_string(); - base32::fmt_append(&self.to_bytes(), &mut out); + base32::fmt_append(self.to_bytes(), &mut out); out } diff --git a/iroh-blobs/src/get/db.rs b/iroh-blobs/src/get/db.rs index fdfeba7d80..08ef2f82c7 100644 --- a/iroh-blobs/src/get/db.rs +++ b/iroh-blobs/src/get/db.rs @@ -346,7 +346,7 @@ async fn get_hash_seq< child: BlobId::from_offset((i as u64) + 1), hash: children[i], size, - valid_ranges: RangeSpec::new(&info.valid_ranges()), + valid_ranges: RangeSpec::new(info.valid_ranges()), }) .await?; } diff --git a/iroh-blobs/src/protocol.rs b/iroh-blobs/src/protocol.rs index 0a5e2a7ca2..9f24b72177 100644 --- a/iroh-blobs/src/protocol.rs +++ b/iroh-blobs/src/protocol.rs @@ -12,31 +12,29 @@ //! //! - Be paranoid about data integrity. //! -//! Data integrity is considered more important than performance. Data will be -//! validated both on the provider and getter side. A well behaved provider will -//! never send invalid data. Responses to range requests contain sufficient -//! information to validate the data. +//! Data integrity is considered more important than performance. Data will be validated both on +//! the provider and getter side. A well behaved provider will never send invalid data. Responses +//! to range requests contain sufficient information to validate the data. //! -//! Note: Validation using blake3 is extremely fast, so in almost all scenarios the -//! validation will not be the bottleneck even if we validate both on the provider -//! and getter side. +//! Note: Validation using blake3 is extremely fast, so in almost all scenarios the validation +//! will not be the bottleneck even if we validate both on the provider and getter side. //! //! - Do not limit the size of blobs or collections. //! -//! Blobs can be of arbitrary size, up to terabytes. 
Likewise, collections -//! can contain an arbitrary number of links. A well behaved implementation will -//! not require the entire blob or collection to be in memory at once. +//! Blobs can be of arbitrary size, up to terabytes. Likewise, collections can contain an +//! arbitrary number of links. A well behaved implementation will not require the entire blob or +//! collection to be in memory at once. //! //! - Be efficient when transferring large blobs, including range requests. //! -//! It is possible to request entire blobs or ranges of blobs, where the -//! minimum granularity is a chunk group of 16KiB or 16 blake3 chunks. The worst -//! case overhead when doing range requests is about two chunk groups per range. +//! It is possible to request entire blobs or ranges of blobs, where the minimum granularity is a +//! chunk group of 16KiB or 16 blake3 chunks. The worst case overhead when doing range requests +//! is about two chunk groups per range. //! //! - Be efficient when transferring multiple tiny blobs. //! -//! For tiny blobs the overhead of sending the blob hashes and the round-trip time -//! for each blob would be prohibitive. +//! For tiny blobs the overhead of sending the blob hashes and the round-trip time for each blob +//! would be prohibitive. //! //! To avoid roundtrips, the protocol allows grouping multiple blobs into *collections*. //! The semantic meaning of a collection is up to the application. For the purpose @@ -46,21 +44,21 @@ //! //! - Do not attempt to be generic in terms of the used hash function. //! -//! The protocol makes extensive use of the [blake3](https://crates.io/crates/blake3) -//! hash function and it's special properties such as blake3 verified streaming. +//! The protocol makes extensive use of the [blake3](https://crates.io/crates/blake3) hash +//! function and it's special properties such as blake3 verified streaming. //! //! - Do not support graph traversal. //! -//! The protocol only supports collections that directly contain blobs. If you have -//! deeply nested graph data, you will need to either do multiple requests or flatten -//! the graph into a single temporary collection. +//! The protocol only supports collections that directly contain blobs. If you have deeply nested +//! graph data, you will need to either do multiple requests or flatten the graph into a single +//! temporary collection. //! //! - Do not support discovery. //! -//! The protocol does not yet have a discovery mechanism for asking the provider -//! what ranges are available for a given blob. Currently you have to have some -//! out-of-band knowledge about what node has data for a given hash, or you can -//! just try to retrieve the data and see if it is available. +//! The protocol does not yet have a discovery mechanism for asking the provider what ranges are +//! available for a given blob. Currently you have to have some out-of-band knowledge about what +//! node has data for a given hash, or you can just try to retrieve the data and see if it is +//! available. //! //! A discovery protocol is planned in the future though. //! @@ -314,10 +312,10 @@ //! Reasons for not retrieving a complete response are two-fold: //! //! - the connection to the provider was interrupted, or the provider encountered -//! an internal error. In this case the provider will close the entire quinn connection. +//! an internal error. In this case the provider will close the entire quinn connection. //! //! - the provider does not have the requested data, or discovered on send that the -//! 
requested data is not valid. +//! requested data is not valid. //! //! In this case the provider will close just the stream used to send the response. //! The exact location of the missing data can be retrieved from the error. diff --git a/iroh-blobs/src/store/fs.rs b/iroh-blobs/src/store/fs.rs index e9e113a603..3ad704848c 100644 --- a/iroh-blobs/src/store/fs.rs +++ b/iroh-blobs/src/store/fs.rs @@ -36,12 +36,11 @@ //! //! Data can get out of the store in two ways: //! -//! 1. the data and outboard of both partial and complete entries can be read -//! at any time and shared over the network. Only data that is complete will -//! be shared, everything else will lead to validation errors. +//! 1. the data and outboard of both partial and complete entries can be read at any time and +//! shared over the network. Only data that is complete will be shared, everything else will +//! lead to validation errors. //! -//! 2. entries can be exported to the file system. This currently only works -//! for complete entries. +//! 2. entries can be exported to the file system. This currently only works for complete entries. //! //! Tables: //! diff --git a/iroh-docs/src/lib.rs b/iroh-docs/src/lib.rs index 0e84ea61f6..c9013ed62f 100644 --- a/iroh-docs/src/lib.rs +++ b/iroh-docs/src/lib.rs @@ -18,8 +18,8 @@ //! Aljoscha Meyer: //! //! > Range-based set reconciliation is a simple approach to efficiently compute the union of two -//! sets over a network, based on recursively partitioning the sets and comparing fingerprints of -//! the partitions to probabilistically detect whether a partition requires further work. +//! > sets over a network, based on recursively partitioning the sets and comparing fingerprints of +//! > the partitions to probabilistically detect whether a partition requires further work. //! //! The crate exposes a [generic storage interface](store::Store). There is an implementation //! of this interface, [store::fs::Store], that can be used either diff --git a/iroh-docs/src/ranger.rs b/iroh-docs/src/ranger.rs index 400e295a6a..2a341a1421 100644 --- a/iroh-docs/src/ranger.rs +++ b/iroh-docs/src/ranger.rs @@ -56,9 +56,11 @@ pub trait RangeValue: Sized + Debug + Ord + PartialEq + Clone + 'static {} /// Stores a range. /// /// There are three possibilities +/// /// - x, x: All elements in a set, denoted with /// - [x, y): x < y: Includes x, but not y /// - S \ [y, x) y < x: Includes x, but not y. +/// /// This means that ranges are "wrap around" conceptually. #[derive(Debug, Clone, Copy, PartialEq, Serialize, Deserialize, Default)] pub struct Range { diff --git a/iroh-gossip/src/proto/hyparview.rs b/iroh-gossip/src/proto/hyparview.rs index f6e74e8e1f..6558e2417b 100644 --- a/iroh-gossip/src/proto/hyparview.rs +++ b/iroh-gossip/src/proto/hyparview.rs @@ -469,10 +469,10 @@ where /// Handle a [`Message::Shuffle`] /// /// > A node q that receives a Shuffle request will first decrease its time to live. If the time - /// to live of the message is greater than zero and the number of nodes in q’s active view is - /// greater than 1, the node will select a random node from its active view, different from the - /// one he received this shuffle message from, and simply forwards the Shuffle request. 
- /// Otherwise, node q accepts the Shuffle request and send back (p.8) + /// > to live of the message is greater than zero and the number of nodes in q’s active view is + /// > greater than 1, the node will select a random node from its active view, different from the + /// > one he received this shuffle message from, and simply forwards the Shuffle request. + /// > Otherwise, node q accepts the Shuffle request and send back (p.8) fn on_shuffle(&mut self, from: PI, shuffle: Shuffle, io: &mut impl IO) { if shuffle.ttl.expired() || self.active_view.len() <= 1 { let len = shuffle.nodes.len(); diff --git a/iroh-gossip/src/proto/plumtree.rs b/iroh-gossip/src/proto/plumtree.rs index 365630cff9..f5b66f039e 100644 --- a/iroh-gossip/src/proto/plumtree.rs +++ b/iroh-gossip/src/proto/plumtree.rs @@ -234,8 +234,8 @@ pub struct Config { /// /// The plumtree paper notes: /// > The timeout value is a protocol parameter that should be configured considering the - /// diameter of the overlay and a target maximum recovery latency, defined by the application - /// requirements. (p.8) + /// > diameter of the overlay and a target maximum recovery latency, defined by the application + /// > requirements. (p.8) pub graft_timeout_1: Duration, /// This timeout is registered when sending a [`Graft`] message. If a reply has not been /// received once the timeout expires, we send another [`Graft`] message to the next peer that @@ -243,7 +243,7 @@ pub struct Config { /// /// The plumtree paper notes: /// > This second timeout value should be smaller that the first, in the order of an average - /// round trip time to a neighbor. + /// > round trip time to a neighbor. pub graft_timeout_2: Duration, /// Timeout after which [`IHave`] messages are pushed to peers. pub dispatch_timeout: Duration, @@ -561,11 +561,11 @@ impl State { /// Handle receiving a [`Message::IHave`]. /// /// > When a node receives a IHAVE message, it simply marks the corresponding message as - /// missing It then starts a timer, with a predefined timeout value, and waits for the missing - /// message to be received via eager push before the timer expires. The timeout value is a - /// protocol parameter that should be configured considering the diameter of the overlay and a - /// target maximum recovery latency, defined by the application requirements. This is a - /// parameter that should be statically configured at deployment time. (p8) + /// > missing It then starts a timer, with a predefined timeout value, and waits for the missing + /// > message to be received via eager push before the timer expires. The timeout value is a + /// > protocol parameter that should be configured considering the diameter of the overlay and a + /// > target maximum recovery latency, defined by the application requirements. This is a + /// > parameter that should be statically configured at deployment time. (p8) fn on_ihave(&mut self, sender: PI, ihaves: Vec, io: &mut impl IO) { for ihave in ihaves { if !self.received_messages.contains_key(&ihave.id) { @@ -636,8 +636,8 @@ impl State { /// Handle a [`InEvent::NeighborDown`] when a peer leaves the topic. /// > When a neighbor is detected to leave the overlay, it is simple removed from the - /// membership. Furthermore, the record of IHAVE messages sent from failed members is deleted - /// from the missing history. (p9) + /// > membership. Furthermore, the record of IHAVE messages sent from failed members is deleted + /// > from the missing history. 
(p9)
     fn on_neighbor_down(&mut self, peer: PI) {
         self.missing_messages.retain(|_message_id, ihaves| {
             ihaves.retain(|(ihave_peer, _round)| *ihave_peer != peer);
diff --git a/iroh-net/src/endpoint.rs b/iroh-net/src/endpoint.rs
index 90f7f0a868..4db39618fc 100644
--- a/iroh-net/src/endpoint.rs
+++ b/iroh-net/src/endpoint.rs
@@ -841,7 +841,7 @@ impl Endpoint {
     /// This will launch discovery in all cases except if:
     /// 1) we do not have discovery enabled
     /// 2) we have discovery enabled, but already have at least one verified, unexpired
-    /// addresses for this `node_id`
+    ///    addresses for this `node_id`
     ///
     /// # Errors
     ///
diff --git a/iroh-net/src/relay/client_conn.rs b/iroh-net/src/relay/client_conn.rs
index 05171937d1..3d5de5cae4 100644
--- a/iroh-net/src/relay/client_conn.rs
+++ b/iroh-net/src/relay/client_conn.rs
@@ -55,7 +55,7 @@ pub(crate) struct ClientConnManager {
 /// Channels that the [`ClientConnManager`] uses to communicate with the
 /// [`ClientConnIo`] to forward the client:
 /// - information about a peer leaving the network (This should only happen for peers that this
-/// client was previously communciating with)
+///   client was previously communicating with)
 /// - packets sent to this client from another client in the network
 #[derive(Debug)]
 pub(crate) struct ClientChannels {
@@ -192,7 +192,7 @@ impl ClientConnManager {
 /// On the "write" side, the [`ClientConnIo`] can send the client:
 /// - a KEEP_ALIVE frame
 /// - a PEER_GONE frame to inform the client that a peer they have previously sent messages to
-/// is gone from the network
+///   is gone from the network
 /// - packets from other peers
 ///
 /// On the "read" side, it can:
diff --git a/iroh-net/src/relay/codec.rs b/iroh-net/src/relay/codec.rs
index f7b34defdc..66186ed6ad 100644
--- a/iroh-net/src/relay/codec.rs
+++ b/iroh-net/src/relay/codec.rs
@@ -30,7 +30,8 @@ pub(super) const PER_CLIENT_READ_QUEUE_DEPTH: usize = 512;
 /// ProtocolVersion is bumped whenever there's a wire-incompatible change.
 /// - version 1 (zero on wire): consistent box headers, in use by employee dev nodes a bit
-/// - version 2: received packets have src addrs in FrameType::RecvPacket at beginning
+/// - version 2: received packets have src addrs in FrameType::RecvPacket at beginning.
+///
 /// NOTE: we are technically running a modified version of the protocol.
 /// `FrameType::PeerPresent`, `FrameType::WatchConn`, `FrameType::ClosePeer`, have been removed.
 /// The server will error on that connection if a client sends one of these frames.

From f97c1c0858161a8c0e0f64b862aaceea0035d371 Mon Sep 17 00:00:00 2001
From: Divma <26765164+divagant-martian@users.noreply.github.com>
Date: Thu, 25 Jul 2024 14:14:13 -0500
Subject: [PATCH 02/45] tests(iroh-cli): update to new api (#2549)

## Description

We recently updated the API from `blob` to `blobs` and `tag` to `tags`. This adjusts the tests accordingly.

## Breaking Changes

n/a

## Notes & open questions

n/a

## Change checklist

- [x] Self-review.
- [ ] ~~Documentation updates following the [style guide](https://rust-lang.github.io/rfcs/1574-more-api-documentation-conventions.html#appendix-a-full-conventions-text), if relevant.~~ - [ ] ~~Tests if relevant.~~ - [ ] ~~All breaking changes documented.~~ --- iroh-cli/tests/cli.rs | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/iroh-cli/tests/cli.rs b/iroh-cli/tests/cli.rs index 9d74b2ab37..9da3996da6 100644 --- a/iroh-cli/tests/cli.rs +++ b/iroh-cli/tests/cli.rs @@ -388,18 +388,18 @@ fn cli_bao_store_migration() -> anyhow::Result<()> { ); println!("iroh started up."); - let tags_output = run_cli(&iroh_data_dir, ["tag", "list"])?; + let tags_output = run_cli(&iroh_data_dir, ["tags", "list"])?; let expected = r#""complete": 2vfkw5gcrtbybfsczoxq4mae47svtgcgsniwcvoz7xf36nz45yfa (Raw) "partial": 4yny3v7anmzzsajv2amm3nxpqd2owfw4dqnjwq6anv7nj2djmt2q (Raw) "#; assert_eq!(tags_output, expected); - let blob_output = run_cli(&iroh_data_dir, ["blob", "list", "blobs"])?; + let blob_output = run_cli(&iroh_data_dir, ["blobs", "list", "blobs"])?; let expected = r#" 2vfkw5gcrtbybfsczoxq4mae47svtgcgsniwcvoz7xf36nz45yfa (8 B) "#; assert_eq!(blob_output, expected); - let incomplete_blob_output = run_cli(iroh_data_dir, ["blob", "list", "incomplete-blobs"])?; + let incomplete_blob_output = run_cli(iroh_data_dir, ["blobs", "list", "incomplete-blobs"])?; let expected = r#"4yny3v7anmzzsajv2amm3nxpqd2owfw4dqnjwq6anv7nj2djmt2q (0 B) "#; assert_eq!(incomplete_blob_output, expected); @@ -709,7 +709,7 @@ fn make_get_cmd(iroh_data_dir: &Path, ticket: &str, out: Option) -> duc "--metrics-port", "disabled", "--start", - "blob", + "blobs", "get", ticket, "--out", @@ -830,7 +830,7 @@ fn test_provide_get_loop_single(input: Input, output: Output, hash: Hash) -> Res "--metrics-port", "disabled", "--start", - "blob", + "blobs", "get", "--node", &node, From 0fc37942be3d68399fbe45401ba7d67be43a83a6 Mon Sep 17 00:00:00 2001 From: Floris Bruynooghe Date: Thu, 25 Jul 2024 21:19:40 +0200 Subject: [PATCH 03/45] fix(iroh-cli)!: Improve cli and configuration file (#2532) ## Description The configuration file behaviour is improved: - Unknown fields will now cause an error rather than be silently ignored. - Make it possible to express GcPolicy in the TOML config file. This was not possible before. It was possible to disable it before, but it was disabled by default so that was rather moot. The --help output is improved: - Update header of iroh. - Directly point to config file docs. - Use consistent style. ## Breaking Changes - Unknown fields in the configuration file will now cause an error. - Configuring the GC Policy in the configuration file has changed. ## Notes & open questions ## Change checklist - [ ] Self-review. - [x] Documentation updates following the [style guide](https://rust-lang.github.io/rfcs/1574-more-api-documentation-conventions.html#appendix-a-full-conventions-text), if relevant. - [x] Tests if relevant. - [x] All breaking changes documented. 
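For reviewers, here is a minimal, self-contained sketch of how the new `[gc_policy]` TOML section maps onto `GcPolicy`. The types are trimmed-down stand-ins for the ones in this diff (the real `NodeConfig` carries more fields, and the real fallback is `GcPolicy::default()` rather than the hard-coded interval used here):

```rust
use std::time::Duration;

use serde::Deserialize;

// Stand-in for the `GcPolicyConfig` added in this PR.
#[derive(Debug, Default, Deserialize)]
#[serde(default, deny_unknown_fields)]
struct GcPolicyConfig {
    enabled: bool,
    interval: Option<u64>, // seconds
}

#[derive(Debug, PartialEq)]
enum GcPolicy {
    Interval(Duration),
    Disabled,
}

fn main() -> anyhow::Result<()> {
    // Unknown fields now fail loudly instead of being silently ignored.
    assert!(toml::from_str::<GcPolicyConfig>("not_a_field = true").is_err());

    // `enabled = true` plus an interval yields an interval-based policy.
    let cfg: GcPolicyConfig = toml::from_str("enabled = true\ninterval = 1234")?;
    let policy = if cfg.enabled {
        match cfg.interval {
            Some(secs) => GcPolicy::Interval(Duration::from_secs(secs)),
            // Assumed placeholder; the real code falls back to GcPolicy::default().
            None => GcPolicy::Interval(Duration::from_secs(300)),
        }
    } else {
        GcPolicy::Disabled
    };
    assert_eq!(policy, GcPolicy::Interval(Duration::from_secs(1234)));
    Ok(())
}
```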
--- Cargo.lock | 1 + iroh-cli/Cargo.toml | 1 + iroh-cli/src/commands.rs | 6 +- iroh-cli/src/commands/start.rs | 3 +- iroh-cli/src/config.rs | 178 +++++++++++++++++++++++++++++++-- iroh-cli/src/logging.rs | 5 +- iroh-net/src/relay/map.rs | 2 + iroh/src/node/builder.rs | 2 + 8 files changed, 182 insertions(+), 16 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index f591bb74ad..b5ac47386d 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2675,6 +2675,7 @@ dependencies = [ "tracing", "tracing-appender", "tracing-subscriber", + "url", "walkdir", ] diff --git a/iroh-cli/Cargo.toml b/iroh-cli/Cargo.toml index fe60b28514..6d4f0e5cbc 100644 --- a/iroh-cli/Cargo.toml +++ b/iroh-cli/Cargo.toml @@ -72,6 +72,7 @@ nix = { version = "0.27", features = ["signal", "process"] } rand_xorshift = "0.3.0" regex = "1.10.3" testdir = "0.9.1" +url = "2.5.0" walkdir = "2" [features] diff --git a/iroh-cli/src/commands.rs b/iroh-cli/src/commands.rs index cf97d62fe8..59529cd671 100644 --- a/iroh-cli/src/commands.rs +++ b/iroh-cli/src/commands.rs @@ -23,7 +23,7 @@ pub(crate) mod rpc; pub(crate) mod start; pub(crate) mod tags; -/// iroh is a tool for syncing bytes +/// iroh is a tool for building distributed apps /// https://iroh.computer/docs #[derive(Parser, Debug, Clone)] #[clap(version, verbatim_doc_comment)] @@ -31,7 +31,7 @@ pub(crate) struct Cli { #[clap(subcommand)] pub(crate) command: Commands, - /// Path to the configuration file. + /// Path to the configuration file, see https://iroh.computer/docs/reference/config. #[clap(long)] pub(crate) config: Option, @@ -47,7 +47,7 @@ pub(crate) struct Cli { #[clap(long)] pub(crate) rpc_addr: Option, - /// If set, metrics will be dumped in CSV format to the specified path at regular intervals (100ms). + /// Write metrics in CSV format at 100ms intervals. Disabled by default. #[clap(long)] pub(crate) metrics_dump_path: Option, } diff --git a/iroh-cli/src/commands/start.rs b/iroh-cli/src/commands/start.rs index 39449fd15c..c1d872a6f4 100644 --- a/iroh-cli/src/commands/start.rs +++ b/iroh-cli/src/commands/start.rs @@ -10,7 +10,7 @@ use iroh::{ net::relay::{RelayMap, RelayMode}, node::RpcStatus, }; -use tracing::{info_span, Instrument}; +use tracing::{info_span, trace, Instrument}; /// Whether to stop the node after running a command or run forever until stopped. #[derive(Debug, Copy, Clone, Eq, PartialEq)] @@ -79,6 +79,7 @@ where F: FnOnce(iroh::client::Iroh) -> T + Send + 'static, T: Future> + 'static, { + trace!(?config, "using config"); let relay_map = config.relay_map()?; let spinner = create_spinner("Iroh booting..."); diff --git a/iroh-cli/src/config.rs b/iroh-cli/src/config.rs index d47250d862..f99f05d371 100644 --- a/iroh-cli/src/config.rs +++ b/iroh-cli/src/config.rs @@ -6,6 +6,7 @@ use std::{ path::{Path, PathBuf}, str::FromStr, sync::Arc, + time::Duration, }; use anyhow::{anyhow, bail, Context, Result}; @@ -48,15 +49,18 @@ impl ConsolePaths { } /// The configuration for an iroh node. -#[derive(PartialEq, Eq, Debug, Deserialize, Serialize, Clone)] -#[serde(default)] +// Please note that this is documented in the `iroh.computer` repository under +// `src/app/docs/reference/config/page.mdx`. Any changes to this need to be updated there. +#[derive(PartialEq, Eq, Debug, Deserialize, Clone)] +#[serde(default, deny_unknown_fields)] pub(crate) struct NodeConfig { /// The nodes for relay to use. pub(crate) relay_nodes: Vec, /// How often to run garbage collection. 
- pub(crate) gc_policy: GcPolicy, + pub(crate) gc_policy: GcPolicyConfig, /// Bind address on which to serve Prometheus metrics pub(crate) metrics_addr: Option, + /// Configuration for the logfile. pub(crate) file_logs: super::logging::FileLogging, /// Path to dump metrics to in CSV format. pub(crate) metrics_dump_path: Option, @@ -82,7 +86,7 @@ impl Default for NodeConfig { }; Self { relay_nodes: relay_nodes.into(), - gc_policy: GcPolicy::Disabled, + gc_policy: GcPolicyConfig::default(), metrics_addr: Some(([127, 0, 0, 1], 9090).into()), file_logs: Default::default(), metrics_dump_path: None, @@ -91,8 +95,12 @@ impl Default for NodeConfig { } impl NodeConfig { - /// Create a config using defaults, and the passed in config file. - pub async fn load(file: Option<&Path>) -> Result { + /// Creates a config from default config file. + /// + /// If the *file* is `Some` the configuration will be read from it. Otherwise the + /// default config file will be loaded. If that is not present the default config will + /// be used. + pub(crate) async fn load(file: Option<&Path>) -> Result { let default_config = iroh_config_path(CONFIG_FILE_NAME)?; let config_file = match file { @@ -107,7 +115,7 @@ impl NodeConfig { }; let mut config = if let Some(file) = config_file { let config = tokio::fs::read_to_string(file).await?; - toml::from_str(&config)? + Self::load_toml(&config)? } else { Self::default() }; @@ -119,6 +127,11 @@ impl NodeConfig { Ok(config) } + fn load_toml(s: &str) -> Result { + let config = toml::from_str(s)?; + Ok(config) + } + /// Constructs a `RelayMap` based on the current configuration. pub(crate) fn relay_map(&self) -> Result> { if self.relay_nodes.is_empty() { @@ -128,6 +141,29 @@ impl NodeConfig { } } +/// Serde-compatible configuration for [`GcPolicy`]. +/// +/// The [`GcPolicy`] struct is not amenable to TOML serialisation, this covers this gap. +#[derive(PartialEq, Eq, Debug, Default, Deserialize, Clone)] +#[serde(default, deny_unknown_fields, rename = "gc_policy")] +pub(crate) struct GcPolicyConfig { + enabled: bool, + interval: Option, +} + +impl From for GcPolicy { + fn from(source: GcPolicyConfig) -> Self { + if source.enabled { + match source.interval { + Some(interval) => Self::Interval(Duration::from_secs(interval)), + None => Self::default(), + } + } else { + Self::Disabled + } + } +} + /// Environment for CLI and REPL /// /// This is cheaply cloneable and has interior mutability. If not running in the console @@ -415,12 +451,132 @@ pub(crate) fn iroh_cache_path(file_name: &Path) -> Result { #[cfg(test)] mod tests { + use std::net::{Ipv4Addr, Ipv6Addr}; + + use url::Url; + + use crate::logging::{EnvFilter, Rotation}; + use super::*; - #[tokio::test] - async fn test_default_settings() { - let config = NodeConfig::load(None).await.unwrap(); + #[test] + fn test_toml_invalid_field() { + let source = r#" + not_a_field = true + "#; + let res = NodeConfig::load_toml(source); + assert!(res.is_err()); + } + + #[test] + fn test_toml_relay_nodes() { + let source = r#" + [[relay_nodes]] + url = "https://example.org." 
+ stun_only = false + stun_port = 123 + "#; + let config = NodeConfig::load_toml(source).unwrap(); + + let expected = RelayNode { + url: Url::parse("https://example.org./").unwrap().into(), + stun_only: false, + stun_port: 123, + }; + assert_eq!(config.relay_nodes, vec![expected]); + } + + #[test] + fn test_toml_gc_policy() { + let source = r#" + [gc_policy] + enabled = false + "#; + let config = NodeConfig::load_toml(source).unwrap(); + assert_eq!(GcPolicy::from(config.gc_policy), GcPolicy::Disabled); + + // Default interval should be used. + let source = r#" + [gc_policy] + enabled = true + "#; + let config = NodeConfig::load_toml(source).unwrap(); + let gc_policy = GcPolicy::from(config.gc_policy); + assert!(matches!(gc_policy, GcPolicy::Interval(_))); + assert_eq!(gc_policy, GcPolicy::default()); + + let source = r#" + [gc_policy] + enabled = true + interval = 1234 + "#; + let config = NodeConfig::load_toml(source).unwrap(); + assert_eq!( + GcPolicy::from(config.gc_policy), + GcPolicy::Interval(Duration::from_secs(1234)) + ); + + let source = r#" + [gc_policy] + not_a_field = true + "#; + let res = NodeConfig::load_toml(source); + assert!(res.is_err()); + } + + #[test] + fn test_toml_metrics_addr() { + let source = r#" + metrics_addr = "1.2.3.4:1234" + "#; + let config = NodeConfig::load_toml(source).unwrap(); + assert_eq!( + config.metrics_addr, + Some(SocketAddr::new(Ipv4Addr::new(1, 2, 3, 4).into(), 1234)), + ); + + let source = r#" + metrics_addr = "[123:456::789:abc]:1234" + "#; + let config = NodeConfig::load_toml(source).unwrap(); + assert_eq!( + config.metrics_addr, + Some(SocketAddr::new( + Ipv6Addr::new(0x123, 0x456, 0, 0, 0, 0, 0x789, 0xabc).into(), + 1234 + )), + ); + } - assert_eq!(config.relay_nodes.len(), 2); + #[test] + fn test_toml_file_logs() { + let source = r#" + [file_logs] + rust_log = "iroh_net=trace" + max_files = 123 + rotation = "daily" + dir = "/var/log/iroh" + "#; + let config = NodeConfig::load_toml(source).unwrap(); + assert_eq!( + config.file_logs.rust_log, + EnvFilter::from_str("iroh_net=trace").unwrap() + ); + assert_eq!(config.file_logs.max_files, 123); + assert_eq!(config.file_logs.rotation, Rotation::Daily); + assert_eq!(config.file_logs.dir, Some(PathBuf::from("/var/log/iroh"))); + + let source = r#" + [file_logs] + rust_log = "info" + "#; + let config = NodeConfig::load_toml(source).unwrap(); + assert_eq!( + config.file_logs.rust_log, + EnvFilter::from_str("info").unwrap() + ); + assert_eq!(config.file_logs.max_files, 4); + assert_eq!(config.file_logs.rotation, Rotation::Hourly); + assert_eq!(config.file_logs.dir, None); } } diff --git a/iroh-cli/src/logging.rs b/iroh-cli/src/logging.rs index 2f2539174c..561a8f95b4 100644 --- a/iroh-cli/src/logging.rs +++ b/iroh-cli/src/logging.rs @@ -97,8 +97,11 @@ pub(crate) fn init_terminal_logging() -> anyhow::Result<()> { Ok(()) } +/// Configuration for the logfiles. +// Please note that this is documented in the `iroh.computer` repository under +// `src/app/docs/reference/config/page.mdx`. Any changes to this need to be updated there. #[derive(Debug, Clone, PartialEq, Eq, Deserialize, Serialize)] -#[serde(default)] +#[serde(default, deny_unknown_fields)] pub(crate) struct FileLogging { /// RUST_LOG directive to filter file logs. 
pub(crate) rust_log: EnvFilter,
diff --git a/iroh-net/src/relay/map.rs b/iroh-net/src/relay/map.rs
index 3e3d8a8434..ebd49f6d69 100644
--- a/iroh-net/src/relay/map.rs
+++ b/iroh-net/src/relay/map.rs
@@ -116,6 +116,8 @@ impl fmt::Display for RelayMap {
 /// Information on a specific relay server.
 ///
 /// Includes the Url where it can be dialed.
+// Please note that this is documented in the `iroh.computer` repository under
+// `src/app/docs/reference/config/page.mdx`. Any changes to this need to be updated there.
 #[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize, PartialOrd, Ord)]
 pub struct RelayNode {
     /// The [`RelayUrl`] where this relay server can be dialed.
diff --git a/iroh/src/node/builder.rs b/iroh/src/node/builder.rs
index 4623261724..a208cdaa94 100644
--- a/iroh/src/node/builder.rs
+++ b/iroh/src/node/builder.rs
@@ -802,6 +802,8 @@ impl ProtocolBuilder {
 }
 
 /// Policy for garbage collection.
+// Please note that this is documented in the `iroh.computer` repository under
+// `src/app/docs/reference/config/page.mdx`. Any changes to this need to be updated there.
 #[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
 pub enum GcPolicy {
     /// Garbage collection is disabled.

From 081233357d4dbe0cabe890009d674839d9de18be Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Philipp=20Kr=C3=BCger?=
Date: Fri, 26 Jul 2024 15:09:18 +0200
Subject: [PATCH 04/45] ci: Generate docs for each PR (#2547)

## Description

Added a `docs.yaml` workflow that makes the GitHub Actions bot reply to each PR with a link to the built docs. It only does so once, and then just updates the comment (although it doesn't strictly need to, since the link is always the same, it's still nice in case someone changes the workflow).
This is using the nightly toolchain, because we're using [`#[feature(doc_cfg)]`](https://doc.rust-lang.org/unstable-book/language-features/doc-cfg.html).

## Motivation

I personally wanted this.
Every now and then we get a PR with the description saying "the best way to review this is to start by looking at the generated docs". But then I need to check out the PR, build the docs and open them. Having a link directly to the docs would be *amazing* IMO.
I *also* think that having easy access to docs on every PR will make people check out the rendered docs on PRs more often. My hope here is that doc quality will thus improve.

## Breaking Changes

None of course. This is only CI.

## Notes & open questions

Wdyt?

## Change checklist

- [x] Self-review.
- ~~[ ] Documentation updates following the [style guide](https://rust-lang.github.io/rfcs/1574-more-api-documentation-conventions.html#appendix-a-full-conventions-text), if relevant.~~ - ~~[ ] Tests if relevant.~~ - ~~[ ] All breaking changes documented.~~ --- .github/workflows/docs.yaml | 56 +++++++++++++++++++++++++++++++++++++ 1 file changed, 56 insertions(+) create mode 100644 .github/workflows/docs.yaml diff --git a/.github/workflows/docs.yaml b/.github/workflows/docs.yaml new file mode 100644 index 0000000000..d9b51ec44f --- /dev/null +++ b/.github/workflows/docs.yaml @@ -0,0 +1,56 @@ +name: Docs Preview + +on: + pull_request: + +jobs: + preview_docs: + timeout-minutes: 30 + name: Docs preview + if: "github.event_name == 'pull_request'" + runs-on: ubuntu-latest + env: + RUSTC_WRAPPER: "sccache" + SCCACHE_GHA_ENABLED: "on" + SCCACHE_CACHE_SIZE: "50G" + PREVIEW_PATH: pr/${{ github.event.pull_request.number }}/docs + + steps: + - uses: actions/checkout@v4 + - uses: dtolnay/rust-toolchain@master + with: + toolchain: nightly-2024-05-02 + - name: Install sccache + uses: mozilla-actions/sccache-action@v0.0.5 + + - name: Generate Docs + run: cargo doc --workspace --all-features --no-deps + env: + RUSTDOCFLAGS: --cfg docsrs + + - name: Deploy Docs to Preview Branch + uses: peaceiris/actions-gh-pages@v4 + with: + github_token: ${{ secrets.GITHUB_TOKEN }} + publish_dir: ./target/doc/ + destination_dir: ${{ env.PREVIEW_PATH }} + publish_branch: generated-docs-preview + + - name: Find Docs Comment + uses: peter-evans/find-comment@v3 + id: fc + with: + issue-number: ${{ github.event.pull_request.number }} + comment-author: 'github-actions[bot]' + body-includes: Documentation for this PR has been generated + + - name: Create or Update Docs Comment + uses: peter-evans/create-or-update-comment@v4 + with: + issue-number: ${{ github.event.pull_request.number }} + comment-id: ${{ steps.fc.outputs.comment-id }} + body: | + Documentation for this PR has been generated and is available at: https://${{ github.repository_owner }}.github.io/${{ github.event.repository.name }}/${{ env.PREVIEW_PATH }}/iroh/ + + Last updated: ${{ github.event.pull_request.updated_at }} + edit-mode: replace From ed4420b5df75d4cfe3623c3e722f33a8a19449ce Mon Sep 17 00:00:00 2001 From: Asmir Avdicevic Date: Sun, 28 Jul 2024 23:48:26 +0200 Subject: [PATCH 05/45] feat: override to staging relays (#2551) ## Description While I'm not a huge fan of this piece of code, I don't really see much of an alternative as we want to be able to force the override to use staging relays even in "prod" environments or when it's deeply integrated such as the FFI library cases. This should allow us to override the default relay maps for all of our CI setups in all of our repos. ## Breaking Changes ## Notes & open questions ## Change checklist - [ ] Self-review. - [ ] Documentation updates following the [style guide](https://rust-lang.github.io/rfcs/1574-more-api-documentation-conventions.html#appendix-a-full-conventions-text), if relevant. - [ ] Tests if relevant. - [ ] All breaking changes documented. 
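For CI setups in other repos, opting in is then just a matter of exporting the variable. A small sketch of the intended effect, assuming the `default_relay_mode` helper and `RelayMode::relay_map` added in this diff (note that builds with `cfg(test)` or the `test-utils` feature always pick staging, regardless of the variable):

```rust
use iroh_net::endpoint::default_relay_mode;

fn main() {
    // Must be set before the first call that reads it; in CI you would
    // export IROH_FORCE_STAGING_RELAYS=1 in the job environment instead.
    std::env::set_var("IROH_FORCE_STAGING_RELAYS", "1");

    // Resolves to RelayMode::Staging because of the override above.
    let mode = default_relay_mode();
    let map = mode.relay_map();
    for node in map.nodes() {
        // RelayNode derives Debug, so this prints the staging relay config.
        println!("using relay: {node:?}");
    }
}
```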
--- iroh-cli/src/config.rs | 28 ++++++---------------- iroh-net/src/endpoint.rs | 49 +++++++++++++++++++++++---------------- iroh-net/src/relay/map.rs | 12 ++++++++++ 3 files changed, 48 insertions(+), 41 deletions(-) diff --git a/iroh-cli/src/config.rs b/iroh-cli/src/config.rs index f99f05d371..3b23c711c7 100644 --- a/iroh-cli/src/config.rs +++ b/iroh-cli/src/config.rs @@ -10,10 +10,7 @@ use std::{ }; use anyhow::{anyhow, bail, Context, Result}; -use iroh::net::{ - defaults, - relay::{RelayMap, RelayNode}, -}; +use iroh::net::relay::{RelayMap, RelayNode}; use iroh::node::GcPolicy; use iroh::{ client::Iroh, @@ -68,24 +65,13 @@ pub(crate) struct NodeConfig { impl Default for NodeConfig { fn default() -> Self { - #[cfg(not(test))] - let relay_nodes = { - use defaults::prod::{ - default_ap_relay_node, default_eu_relay_node, default_na_relay_node, - }; - [ - default_na_relay_node(), - default_eu_relay_node(), - default_ap_relay_node(), - ] - }; - #[cfg(test)] - let relay_nodes = { - use defaults::staging::{default_eu_relay_node, default_na_relay_node}; - [default_na_relay_node(), default_eu_relay_node()] - }; + let relay_map = iroh::net::endpoint::default_relay_mode().relay_map(); + let relay_nodes = relay_map + .nodes() + .map(|v| Arc::unwrap_or_clone(v.clone())) + .collect(); Self { - relay_nodes: relay_nodes.into(), + relay_nodes, gc_policy: GcPolicyConfig::default(), metrics_addr: Some(([127, 0, 0, 1], 9090).into()), file_logs: Default::default(), diff --git a/iroh-net/src/endpoint.rs b/iroh-net/src/endpoint.rs index 4db39618fc..69019ad148 100644 --- a/iroh-net/src/endpoint.rs +++ b/iroh-net/src/endpoint.rs @@ -19,7 +19,7 @@ use std::sync::Arc; use std::task::Poll; use std::time::Duration; -use anyhow::{anyhow, bail, ensure, Context, Result}; +use anyhow::{anyhow, bail, Context, Result}; use derive_more::Debug; use futures_lite::{Stream, StreamExt}; use tokio_util::sync::{CancellationToken, WaitForCancellationFuture}; @@ -27,12 +27,11 @@ use tracing::{debug, info_span, trace, warn}; use url::Url; use crate::{ - defaults, discovery::{Discovery, DiscoveryTask}, dns::{default_resolver, DnsResolver}, key::{PublicKey, SecretKey}, magicsock::{self, Handle}, - relay::{RelayMap, RelayMode, RelayUrl}, + relay::{RelayMode, RelayUrl}, tls, NodeId, }; @@ -59,6 +58,10 @@ pub use iroh_base::node_addr::{AddrInfo, NodeAddr}; /// is still no connection the configured [`Discovery`] will be used however. const DISCOVERY_WAIT_PERIOD: Duration = Duration::from_millis(500); +/// Environment variable to force the use of staging relays. +#[cfg(not(any(test, feature = "test-utils")))] +const ENV_FORCE_STAGING_RELAYS: &str = "IROH_FORCE_STAGING_RELAYS"; + /// Builder for [`Endpoint`]. /// /// By default the endpoint will generate a new random [`SecretKey`], which will result in a @@ -84,15 +87,9 @@ pub struct Builder { impl Default for Builder { fn default() -> Self { - // Use staging in testing - #[cfg(not(any(test, feature = "test-utils")))] - let relay_mode = RelayMode::Default; - #[cfg(any(test, feature = "test-utils"))] - let relay_mode = RelayMode::Staging; - Self { secret_key: Default::default(), - relay_mode, + relay_mode: default_relay_mode(), alpn_protocols: Default::default(), transport_config: Default::default(), concurrent_connections: Default::default(), @@ -121,15 +118,7 @@ impl Builder { /// /// NOTE: This will be improved soon to add support for binding on specific addresses. 
pub async fn bind(self, bind_port: u16) -> Result {
-        let relay_map = match self.relay_mode {
-            RelayMode::Disabled => RelayMap::empty(),
-            RelayMode::Default => defaults::prod::default_relay_map(),
-            RelayMode::Staging => defaults::staging::default_relay_map(),
-            RelayMode::Custom(relay_map) => {
-                ensure!(!relay_map.is_empty(), "Empty custom relay server map",);
-                relay_map
-            }
-        };
+        let relay_map = self.relay_mode.relay_map();
         let secret_key = self.secret_key.unwrap_or_else(SecretKey::generate);
         let static_config = StaticConfig {
             transport_config: Arc::new(self.transport_config.unwrap_or_default()),
@@ -190,7 +179,7 @@ impl Builder {
     /// By default the Number0 relay servers are used.
     ///
     /// When using [RelayMode::Custom], the provided `relay_map` must contain at least one
-    /// configured relay node. If an invalid [`RelayMap`] is provided [`bind`]
+    /// configured relay node. If an invalid RelayMap is provided [`bind`]
     /// will result in an error.
     ///
     /// [`bind`]: Builder::bind
@@ -1083,6 +1072,26 @@ fn proxy_url_from_env() -> Option {
     None
 }
 
+/// Returns the default relay mode.
+///
+/// If the `IROH_FORCE_STAGING_RELAYS` environment variable is set to `1`, it will return `RelayMode::Staging`.
+/// Otherwise, it will return `RelayMode::Default`.
+pub fn default_relay_mode() -> RelayMode {
+    // Use staging in testing
+    #[cfg(not(any(test, feature = "test-utils")))]
+    let force_staging_relays = match std::env::var(ENV_FORCE_STAGING_RELAYS) {
+        Ok(value) => value == "1",
+        Err(_) => false,
+    };
+    #[cfg(any(test, feature = "test-utils"))]
+    let force_staging_relays = true;
+
+    match force_staging_relays {
+        true => RelayMode::Staging,
+        false => RelayMode::Default,
+    }
+}
+
 /// Check if we are being executed in a CGI context.
 ///
 /// If so, a malicious client can send the `Proxy:` header, and it will
diff --git a/iroh-net/src/relay/map.rs b/iroh-net/src/relay/map.rs
index ebd49f6d69..d6313892cd 100644
--- a/iroh-net/src/relay/map.rs
+++ b/iroh-net/src/relay/map.rs
@@ -22,6 +22,18 @@ pub enum RelayMode {
     Custom(RelayMap),
 }
 
+impl RelayMode {
+    /// Returns the relay map for this mode.
+    pub fn relay_map(&self) -> RelayMap {
+        match self {
+            RelayMode::Disabled => RelayMap::empty(),
+            RelayMode::Default => crate::defaults::prod::default_relay_map(),
+            RelayMode::Staging => crate::defaults::staging::default_relay_map(),
+            RelayMode::Custom(relay_map) => relay_map.clone(),
+        }
+    }
+}
+
 /// Configuration of all the relay servers that can be used.
 #[derive(Debug, Clone, PartialEq, Eq)]
 pub struct RelayMap {

From e7a7552191b71b476cab0a75544f129e657d8dfe Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?R=C3=BCdiger=20Klaehn?=
Date: Mon, 29 Jul 2024 10:23:19 +0300
Subject: [PATCH 06/45] refactor(iroh-docs): Replace flume with async_channel in docs (#2540)

## Description

This is mostly a 1:1 replacement, except for the fact that the `same_channel` API is missing from async_channel. So I replaced it with some ugly code that uses the fact that an async_channel Sender or Receiver is just an Arc.

To be removed if/when https://github.com/smol-rs/async-channel/pull/98 is merged, but until then I think it is fine.

## Breaking Changes

None

## Notes & open questions

Note: we cannot use tokio::sync::mpsc::Channel for the actor because we can't control from which thread Drop is called.

Note: some streams were Unpin before, but it was not explicit. Now I added Unpin explicitly (and boxed the stream to make it true). Not sure if the version check would catch this; pretty sure it would not.
But taking away Unpin would have been a breaking change. ## Change checklist - [ ] Self-review. - [ ] Documentation updates following the [style guide](https://rust-lang.github.io/rfcs/1574-more-api-documentation-conventions.html#appendix-a-full-conventions-text), if relevant. - [ ] Tests if relevant. - [ ] All breaking changes documented. --- Cargo.lock | 3 +- iroh-docs/Cargo.toml | 2 +- iroh-docs/src/actor.rs | 54 ++++++++++++++++++------------------ iroh-docs/src/engine.rs | 10 +++---- iroh-docs/src/engine/live.rs | 22 +++++++-------- iroh-docs/src/sync.rs | 49 ++++++++++++++++++++++++-------- iroh/Cargo.toml | 1 + iroh/src/node/rpc/docs.rs | 30 ++++++++++---------- 8 files changed, 100 insertions(+), 71 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index b5ac47386d..a7afd8edd7 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2478,6 +2478,7 @@ name = "iroh" version = "0.21.0" dependencies = [ "anyhow", + "async-channel", "bao-tree", "bytes", "clap", @@ -2733,10 +2734,10 @@ name = "iroh-docs" version = "0.21.0" dependencies = [ "anyhow", + "async-channel", "bytes", "derive_more", "ed25519-dalek", - "flume", "futures-buffered", "futures-lite 2.3.0", "futures-util", diff --git a/iroh-docs/Cargo.toml b/iroh-docs/Cargo.toml index 60a3fdb494..180898d423 100644 --- a/iroh-docs/Cargo.toml +++ b/iroh-docs/Cargo.toml @@ -16,11 +16,11 @@ workspace = true [dependencies] anyhow = "1" +async-channel = "2.3.1" blake3 = { package = "iroh-blake3", version = "1.4.5"} bytes = { version = "1.4", features = ["serde"] } derive_more = { version = "1.0.0-beta.6", features = ["debug", "deref", "display", "from", "try_into", "into", "as_ref"] } ed25519-dalek = { version = "2.0.0", features = ["serde", "rand_core"] } -flume = "0.11" futures-buffered = "0.2.4" futures-lite = "2.3.0" futures-util = { version = "0.3.25" } diff --git a/iroh-docs/src/actor.rs b/iroh-docs/src/actor.rs index 769f14a482..79bd34ad28 100644 --- a/iroh-docs/src/actor.rs +++ b/iroh-docs/src/actor.rs @@ -61,12 +61,12 @@ enum Action { #[display("ListAuthors")] ListAuthors { #[debug("reply")] - reply: flume::Sender>, + reply: async_channel::Sender>, }, #[display("ListReplicas")] ListReplicas { #[debug("reply")] - reply: flume::Sender>, + reply: async_channel::Sender>, }, #[display("ContentHashes")] ContentHashes { @@ -108,12 +108,12 @@ enum ReplicaAction { reply: oneshot::Sender>, }, Subscribe { - sender: flume::Sender, + sender: async_channel::Sender, #[debug("reply")] reply: oneshot::Sender>, }, Unsubscribe { - sender: flume::Sender, + sender: async_channel::Sender, #[debug("reply")] reply: oneshot::Sender>, }, @@ -166,7 +166,7 @@ enum ReplicaAction { }, GetMany { query: Query, - reply: flume::Sender>, + reply: async_channel::Sender>, }, DropReplica { reply: oneshot::Sender>, @@ -222,7 +222,7 @@ struct OpenReplica { /// [`SyncHandle::drop`] will not block. #[derive(Debug, Clone)] pub struct SyncHandle { - tx: flume::Sender, + tx: async_channel::Sender, join_handle: Arc>>, } @@ -232,7 +232,7 @@ pub struct OpenOpts { /// Set to true to set sync state to true. pub sync: bool, /// Optionally subscribe to replica events. - pub subscribe: Option>, + pub subscribe: Option>, } impl OpenOpts { /// Set sync state to true. @@ -241,7 +241,7 @@ impl OpenOpts { self } /// Subscribe to replica events. 
- pub fn subscribe(mut self, subscribe: flume::Sender) -> Self { + pub fn subscribe(mut self, subscribe: async_channel::Sender) -> Self { self.subscribe = Some(subscribe); self } @@ -255,7 +255,7 @@ impl SyncHandle { content_status_callback: Option, me: String, ) -> SyncHandle { - let (action_tx, action_rx) = flume::bounded(ACTION_CAP); + let (action_tx, action_rx) = async_channel::bounded(ACTION_CAP); let actor = Actor { store, states: Default::default(), @@ -298,7 +298,7 @@ impl SyncHandle { pub async fn subscribe( &self, namespace: NamespaceId, - sender: flume::Sender, + sender: async_channel::Sender, ) -> Result<()> { let (reply, rx) = oneshot::channel(); self.send_replica(namespace, ReplicaAction::Subscribe { sender, reply }) @@ -309,7 +309,7 @@ impl SyncHandle { pub async fn unsubscribe( &self, namespace: NamespaceId, - sender: flume::Sender, + sender: async_channel::Sender, ) -> Result<()> { let (reply, rx) = oneshot::channel(); self.send_replica(namespace, ReplicaAction::Unsubscribe { sender, reply }) @@ -435,7 +435,7 @@ impl SyncHandle { &self, namespace: NamespaceId, query: Query, - reply: flume::Sender>, + reply: async_channel::Sender>, ) -> Result<()> { let action = ReplicaAction::GetMany { query, reply }; self.send_replica(namespace, action).await?; @@ -489,13 +489,13 @@ impl SyncHandle { Ok(store) } - pub async fn list_authors(&self, reply: flume::Sender>) -> Result<()> { + pub async fn list_authors(&self, reply: async_channel::Sender>) -> Result<()> { self.send(Action::ListAuthors { reply }).await } pub async fn list_replicas( &self, - reply: flume::Sender>, + reply: async_channel::Sender>, ) -> Result<()> { self.send(Action::ListReplicas { reply }).await } @@ -566,7 +566,7 @@ impl SyncHandle { async fn send(&self, action: Action) -> Result<()> { self.tx - .send_async(action) + .send(action) .await .context("sending to iroh_docs actor failed")?; Ok(()) @@ -581,7 +581,10 @@ impl Drop for SyncHandle { fn drop(&mut self) { // this means we're dropping the last reference if let Some(handle) = Arc::get_mut(&mut self.join_handle) { - self.tx.send(Action::Shutdown { reply: None }).ok(); + // this call is the reason tx can not be a tokio mpsc channel. + // we have no control about where drop is called, yet tokio send_blocking panics + // when called from inside a tokio runtime. 
+ self.tx.send_blocking(Action::Shutdown { reply: None }).ok(); let handle = handle.take().expect("this can only run once"); if let Err(err) = handle.join() { warn!(?err, "Failed to join sync actor"); @@ -593,7 +596,7 @@ impl Drop for SyncHandle { struct Actor { store: Store, states: OpenReplicas, - action_rx: flume::Receiver, + action_rx: async_channel::Receiver, content_status_callback: Option, tasks: JoinSet<()>, } @@ -619,10 +622,10 @@ impl Actor { } continue; } - action = self.action_rx.recv_async() => { + action = self.action_rx.recv() => { match action { Ok(action) => action, - Err(flume::RecvError::Disconnected) => { + Err(async_channel::RecvError) => { debug!("action channel disconnected"); break None; } @@ -979,17 +982,14 @@ impl OpenReplicas { } async fn iter_to_channel_async( - channel: flume::Sender>, + channel: async_channel::Sender>, iter: Result>>, ) -> Result<(), SendReplyError> { match iter { - Err(err) => channel - .send_async(Err(err)) - .await - .map_err(send_reply_error)?, + Err(err) => channel.send(Err(err)).await.map_err(send_reply_error)?, Ok(iter) => { for item in iter { - channel.send_async(item).await.map_err(send_reply_error)?; + channel.send(item).await.map_err(send_reply_error)?; } } } @@ -1032,10 +1032,10 @@ mod tests { let id = namespace.id(); sync.import_namespace(namespace.into()).await?; sync.open(id, Default::default()).await?; - let (tx, rx) = flume::bounded(10); + let (tx, rx) = async_channel::bounded(10); sync.subscribe(id, tx).await?; sync.close(id).await?; - assert!(rx.recv_async().await.is_err()); + assert!(rx.recv().await.is_err()); Ok(()) } } diff --git a/iroh-docs/src/engine.rs b/iroh-docs/src/engine.rs index 78190ae921..f6a2ae73aa 100644 --- a/iroh-docs/src/engine.rs +++ b/iroh-docs/src/engine.rs @@ -170,15 +170,15 @@ impl Engine { // Subscribe to insert events from the replica. let a = { - let (s, r) = flume::bounded(SUBSCRIBE_CHANNEL_CAP); + let (s, r) = async_channel::bounded(SUBSCRIBE_CHANNEL_CAP); this.sync.subscribe(namespace, s).await?; - r.into_stream() - .map(move |ev| LiveEvent::from_replica_event(ev, &content_status_cb)) + Box::pin(r).map(move |ev| LiveEvent::from_replica_event(ev, &content_status_cb)) }; // Subscribe to events from the [`live::Actor`]. let b = { - let (s, r) = flume::bounded(SUBSCRIBE_CHANNEL_CAP); + let (s, r) = async_channel::bounded(SUBSCRIBE_CHANNEL_CAP); + let r = Box::pin(r); let (reply, reply_rx) = oneshot::channel(); this.to_live_actor .send(ToLiveActor::Subscribe { @@ -188,7 +188,7 @@ impl Engine { }) .await?; reply_rx.await??; - r.into_stream().map(|event| Ok(LiveEvent::from(event))) + r.map(|event| Ok(LiveEvent::from(event))) }; Ok(a.or(b)) diff --git a/iroh-docs/src/engine/live.rs b/iroh-docs/src/engine/live.rs index f41744ac73..6e49536baa 100644 --- a/iroh-docs/src/engine/live.rs +++ b/iroh-docs/src/engine/live.rs @@ -78,7 +78,7 @@ pub enum ToLiveActor { Subscribe { namespace: NamespaceId, #[debug("sender")] - sender: flume::Sender, + sender: async_channel::Sender, #[debug("oneshot::Sender")] reply: sync::oneshot::Sender>, }, @@ -153,8 +153,8 @@ pub struct LiveActor { gossip: Gossip, bao_store: B, downloader: Downloader, - replica_events_tx: flume::Sender, - replica_events_rx: flume::Receiver, + replica_events_tx: async_channel::Sender, + replica_events_rx: async_channel::Receiver, /// Send messages to self. /// Note: Must not be used in methods called from `Self::run` directly to prevent deadlocks. 
@@ -192,7 +192,7 @@ impl LiveActor { sync_actor_tx: mpsc::Sender, gossip_actor_tx: mpsc::Sender, ) -> Self { - let (replica_events_tx, replica_events_rx) = flume::bounded(1024); + let (replica_events_tx, replica_events_rx) = async_channel::bounded(1024); Self { inbox, sync, @@ -262,7 +262,7 @@ impl LiveActor { } } } - event = self.replica_events_rx.recv_async() => { + event = self.replica_events_rx.recv() => { trace!(?i, "tick: replica_event"); inc!(Metrics, doc_live_tick_replica_event); let event = event.context("replica_events closed")?; @@ -865,7 +865,7 @@ impl From<&SyncFinished> for SyncDetails { struct SubscribersMap(HashMap); impl SubscribersMap { - fn subscribe(&mut self, namespace: NamespaceId, sender: flume::Sender) { + fn subscribe(&mut self, namespace: NamespaceId, sender: async_channel::Sender) { self.0.entry(namespace).or_default().subscribe(sender); } @@ -930,15 +930,15 @@ impl QueuedHashes { } #[derive(Debug, Default)] -struct Subscribers(Vec>); +struct Subscribers(Vec>); impl Subscribers { - fn subscribe(&mut self, sender: flume::Sender) { + fn subscribe(&mut self, sender: async_channel::Sender) { self.0.push(sender) } async fn send(&mut self, event: Event) -> bool { - let futs = self.0.iter().map(|sender| sender.send_async(event.clone())); + let futs = self.0.iter().map(|sender| sender.send(event.clone())); let res = futures_buffered::join_all(futs).await; // reverse the order so removing does not shift remaining indices for (i, res) in res.into_iter().enumerate().rev() { @@ -977,8 +977,8 @@ mod tests { #[tokio::test] async fn test_sync_remove() { let pk = PublicKey::from_bytes(&[1; 32]).unwrap(); - let (a_tx, a_rx) = flume::unbounded(); - let (b_tx, b_rx) = flume::unbounded(); + let (a_tx, a_rx) = async_channel::unbounded(); + let (b_tx, b_rx) = async_channel::unbounded(); let mut subscribers = Subscribers::default(); subscribers.subscribe(a_tx); subscribers.subscribe(b_tx); diff --git a/iroh-docs/src/sync.rs b/iroh-docs/src/sync.rs index 5d3896f4bc..773f497e69 100644 --- a/iroh-docs/src/sync.rs +++ b/iroh-docs/src/sync.rs @@ -108,17 +108,34 @@ pub struct SyncOutcome { pub num_sent: usize, } +fn get_as_ptr(value: &T) -> Option { + use std::mem; + if mem::size_of::() == std::mem::size_of::() + && mem::align_of::() == mem::align_of::() + { + // Safe only if size and alignment requirements are met + unsafe { Some(mem::transmute_copy(value)) } + } else { + None + } +} + +fn same_channel(a: &async_channel::Sender, b: &async_channel::Sender) -> bool { + get_as_ptr(a).unwrap() == get_as_ptr(b).unwrap() +} + #[derive(Debug, Default)] -struct Subscribers(Vec>); +struct Subscribers(Vec>); impl Subscribers { - pub fn subscribe(&mut self, sender: flume::Sender) { + pub fn subscribe(&mut self, sender: async_channel::Sender) { self.0.push(sender) } - pub fn unsubscribe(&mut self, sender: &flume::Sender) { - self.0.retain(|s| !s.same_channel(sender)); + pub fn unsubscribe(&mut self, sender: &async_channel::Sender) { + self.0.retain(|s| !same_channel(s, sender)); } pub fn send(&mut self, event: Event) { - self.0.retain(|sender| sender.send(event.clone()).is_ok()) + self.0 + .retain(|sender| sender.send_blocking(event.clone()).is_ok()) } pub fn len(&self) -> usize { self.0.len() @@ -263,10 +280,10 @@ impl ReplicaInfo { /// Subscribe to insert events. /// - /// When subscribing to a replica, you must ensure that the corresponding [`flume::Receiver`] is + /// When subscribing to a replica, you must ensure that the corresponding [`async_channel::Receiver`] is /// received from in a loop. 
If not receiving, local and remote inserts will hang waiting for /// the receiver to be received from. - pub fn subscribe(&mut self, sender: flume::Sender) { + pub fn subscribe(&mut self, sender: async_channel::Sender) { self.subscribers.subscribe(sender) } @@ -275,7 +292,7 @@ impl ReplicaInfo { /// Simply dropping the receiver is fine too. If you cloned a single sender to subscribe to /// multiple replicas, you can use this method to explicitly unsubscribe the sender from /// this replica without having to drop the receiver. - pub fn unsubscribe(&mut self, sender: &flume::Sender) { + pub fn unsubscribe(&mut self, sender: &async_channel::Sender) { self.subscribers.unsubscribe(sender) } @@ -2156,6 +2173,14 @@ mod tests { Ok(()) } + fn drain(events: async_channel::Receiver) -> Vec { + let mut res = vec![]; + while let Ok(ev) = events.try_recv() { + res.push(ev); + } + res + } + /// This tests that no events are emitted for entries received during sync which are obsolete /// (too old) by the time they are actually inserted in the store. #[test] @@ -2173,8 +2198,8 @@ mod tests { let mut replica1 = store1.new_replica(namespace.clone())?; let mut replica2 = store2.new_replica(namespace.clone())?; - let (events1_sender, events1) = flume::bounded(32); - let (events2_sender, events2) = flume::bounded(32); + let (events1_sender, events1) = async_channel::bounded(32); + let (events2_sender, events2) = async_channel::bounded(32); replica1.info.subscribe(events1_sender); replica2.info.subscribe(events2_sender); @@ -2198,8 +2223,8 @@ mod tests { .sync_process_message(from1, peer1, &mut state2) .unwrap(); assert!(from2.is_none()); - let events1 = events1.drain().collect::>(); - let events2 = events2.drain().collect::>(); + let events1 = drain(events1); + let events2 = drain(events2); assert_eq!(events1.len(), 1); assert_eq!(events2.len(), 1); assert!(matches!(events1[0], Event::LocalInsert { .. })); diff --git a/iroh/Cargo.toml b/iroh/Cargo.toml index 4ec069fe30..d31990e63f 100644 --- a/iroh/Cargo.toml +++ b/iroh/Cargo.toml @@ -17,6 +17,7 @@ workspace = true [dependencies] anyhow = { version = "1" } +async-channel = "2.3.1" bao-tree = { version = "0.13", features = ["tokio_fsm"], default-features = false } bytes = "1" derive_more = { version = "1.0.0-beta.6", features = ["debug", "display", "from", "try_into", "from_str"] } diff --git a/iroh/src/node/rpc/docs.rs b/iroh/src/node/rpc/docs.rs index 3fc35bc597..e974d68733 100644 --- a/iroh/src/node/rpc/docs.rs +++ b/iroh/src/node/rpc/docs.rs @@ -1,11 +1,10 @@ //! This module contains an impl block on [`DocsEngine`] with handlers for RPC requests use anyhow::anyhow; -use futures_lite::Stream; +use futures_lite::{Stream, StreamExt}; use iroh_base::rpc::RpcResult; use iroh_blobs::{store::Store as BaoStore, BlobFormat}; use iroh_docs::{Author, DocTicket, NamespaceSecret}; -use tokio_stream::StreamExt; use crate::client::docs::ShareMode; use crate::node::DocsEngine; @@ -60,18 +59,18 @@ impl DocsEngine { pub fn author_list( &self, _req: AuthorListRequest, - ) -> impl Stream> { - let (tx, rx) = flume::bounded(ITER_CHANNEL_CAP); + ) -> impl Stream> + Unpin { + let (tx, rx) = async_channel::bounded(ITER_CHANNEL_CAP); let sync = self.sync.clone(); // we need to spawn a task to send our request to the sync handle, because the method // itself must be sync. 
tokio::task::spawn(async move { let tx2 = tx.clone(); if let Err(err) = sync.list_authors(tx).await { - tx2.send_async(Err(err)).await.ok(); + tx2.send(Err(err)).await.ok(); } }); - rx.into_stream().map(|r| { + rx.boxed().map(|r| { r.map(|author_id| AuthorListResponse { author_id }) .map_err(Into::into) }) @@ -111,18 +110,21 @@ impl DocsEngine { Ok(DropResponse {}) } - pub fn doc_list(&self, _req: DocListRequest) -> impl Stream> { - let (tx, rx) = flume::bounded(ITER_CHANNEL_CAP); + pub fn doc_list( + &self, + _req: DocListRequest, + ) -> impl Stream> + Unpin { + let (tx, rx) = async_channel::bounded(ITER_CHANNEL_CAP); let sync = self.sync.clone(); // we need to spawn a task to send our request to the sync handle, because the method // itself must be sync. tokio::task::spawn(async move { let tx2 = tx.clone(); if let Err(err) = sync.list_replicas(tx).await { - tx2.send_async(Err(err)).await.ok(); + tx2.send(Err(err)).await.ok(); } }); - rx.into_stream().map(|r| { + rx.boxed().map(|r| { r.map(|(id, capability)| DocListResponse { id, capability }) .map_err(Into::into) }) @@ -249,19 +251,19 @@ impl DocsEngine { pub fn doc_get_many( &self, req: GetManyRequest, - ) -> impl Stream> { + ) -> impl Stream> + Unpin { let GetManyRequest { doc_id, query } = req; - let (tx, rx) = flume::bounded(ITER_CHANNEL_CAP); + let (tx, rx) = async_channel::bounded(ITER_CHANNEL_CAP); let sync = self.sync.clone(); // we need to spawn a task to send our request to the sync handle, because the method // itself must be sync. tokio::task::spawn(async move { let tx2 = tx.clone(); if let Err(err) = sync.get_many(doc_id, query, tx).await { - tx2.send_async(Err(err)).await.ok(); + tx2.send(Err(err)).await.ok(); } }); - rx.into_stream() + rx.boxed() .map(|r| r.map(|entry| GetManyResponse { entry }).map_err(Into::into)) } From d937234621791338a65338678badc35345784296 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Philipp=20Kr=C3=BCger?= Date: Mon, 29 Jul 2024 09:46:23 +0200 Subject: [PATCH 07/45] feat(iroh): Improve documentation and canonicalize docs in `iroh::client` (#2553) ## Description - More pointers between individual modules to help people understand how they go from `A` to `B` (e.g. `Node` -> `Iroh`, `Iroh` -> `blobs::Client`, etc.) - Documentation on how to initialize `client::Iroh` - Generally make methods adhere to our documentation style. - Expose the type alias `iroh::client::RpcClient`, so it doesn't show up as `RpcClient>` everywhere it's used in the docs. ## Breaking Changes None, but noteworthy: - Expose the type alias `iroh::client::RpcClient`. ## Notes & open questions ## Change checklist - [X] Self-review. - [X] Documentation updates following the [style guide](https://rust-lang.github.io/rfcs/1574-more-api-documentation-conventions.html#appendix-a-full-conventions-text), if relevant. - ~~[ ] Tests if relevant.~~ - [X] All breaking changes documented. --------- Co-authored-by: Floris Bruynooghe --- iroh/src/client.rs | 37 +++++++++++++---- iroh/src/client/authors.rs | 20 +++++---- iroh/src/client/blobs.rs | 2 + iroh/src/client/docs.rs | 85 +++++++++++++++++++++----------------- iroh/src/client/gossip.rs | 8 ++-- iroh/src/client/node.rs | 46 ++++++++++++++------- iroh/src/client/quic.rs | 4 +- iroh/src/client/tags.rs | 21 ++++++---- iroh/src/lib.rs | 8 ++-- 9 files changed, 144 insertions(+), 87 deletions(-) diff --git a/iroh/src/client.rs b/iroh/src/client.rs index 3f6ae5055a..c33745f49c 100644 --- a/iroh/src/client.rs +++ b/iroh/src/client.rs @@ -1,4 +1,6 @@ //! Client to an Iroh node. +//! +//! 
See the documentation for [`Iroh`] for more information. use futures_lite::{Stream, StreamExt}; use ref_cast::RefCast; @@ -21,11 +23,23 @@ pub mod gossip; pub mod node; pub mod tags; +// Keep this type exposed, otherwise every occurrence of `RpcClient` in the API +// will show up as `RpcClient>` in the docs. /// Iroh rpc client - boxed so that we can have a concrete type. -pub(crate) type RpcClient = +pub type RpcClient = quic_rpc::RpcClient>; -/// Iroh client. +/// An iroh client. +/// +/// There are three ways to obtain this client, depending on which context +/// you're running in relative to the main [`Node`](crate::node::Node): +/// +/// 1. If you just spawned the client in rust the same process and have a reference to it: +/// Use [`Node::client()`](crate::node::Node::client). +/// 2. If the main node wasn't spawned in the same process, but on the same machine: +/// Use [`Iroh::connect_path`]. +/// 3. If the main node was spawned somewhere else and has been made accessible via IP: +/// Use [`Iroh::connect_addr`]. #[derive(Debug, Clone)] pub struct Iroh { rpc: RpcClient, @@ -40,37 +54,42 @@ impl Deref for Iroh { } impl Iroh { - /// Create a new high-level client to a Iroh node from the low-level RPC client. + /// Creates a new high-level client to a Iroh node from the low-level RPC client. + /// + /// Prefer using [`Node::client()`](crate::node::Node::client), [`Iroh::connect_path`] + /// or [`Iroh::connect_addr`] instead of calling this function. + /// + /// See also the [`Iroh`] struct documentation. pub fn new(rpc: RpcClient) -> Self { Self { rpc } } - /// Blobs client + /// Returns the blobs client. pub fn blobs(&self) -> &blobs::Client { blobs::Client::ref_cast(&self.rpc) } - /// Docs client + /// Returns the docs client. pub fn docs(&self) -> &docs::Client { docs::Client::ref_cast(&self.rpc) } - /// Authors client + /// Returns the authors client. pub fn authors(&self) -> &authors::Client { authors::Client::ref_cast(&self.rpc) } - /// Tags client + /// Returns the tags client. pub fn tags(&self) -> &tags::Client { tags::Client::ref_cast(&self.rpc) } - /// Gossip client + /// Returns the gossip client. pub fn gossip(&self) -> &gossip::Client { gossip::Client::ref_cast(&self.rpc) } - /// Node client + /// Returns the node client. pub fn node(&self) -> &node::Client { node::Client::ref_cast(&self.rpc) } diff --git a/iroh/src/client/authors.rs b/iroh/src/client/authors.rs index dbfe60ceef..f1d56ace87 100644 --- a/iroh/src/client/authors.rs +++ b/iroh/src/client/authors.rs @@ -1,4 +1,8 @@ //! API for author management. +//! +//! The main entry point is the [`Client`]. +//! +//! You obtain a [`Client`] via [`Iroh::authors()`](crate::client::Iroh::authors). use anyhow::Result; use futures_lite::{stream::StreamExt, Stream}; @@ -20,7 +24,7 @@ pub struct Client { } impl Client { - /// Create a new document author. + /// Creates a new document author. /// /// You likely want to save the returned [`AuthorId`] somewhere so that you can use this author /// again. @@ -42,7 +46,7 @@ impl Client { Ok(res.author_id) } - /// Set the node-wide default author. + /// Sets the node-wide default author. /// /// If the author does not exist, an error is returned. /// @@ -53,23 +57,25 @@ impl Client { Ok(()) } - /// List document authors for which we have a secret key. + /// Lists document authors for which we have a secret key. + /// + /// It's only possible to create writes from authors that we have the secret key of. 
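+    ///
+    /// A hedged usage sketch (`try_next` comes from `futures_lite::StreamExt`; the
+    /// demo function is illustrative):
+    ///
+    /// ```no_run
+    /// # async fn demo(client: &iroh::client::Iroh) -> anyhow::Result<()> {
+    /// use futures_lite::StreamExt;
+    /// let authors = client.authors().list().await?;
+    /// let mut authors = std::pin::pin!(authors);
+    /// while let Some(author_id) = authors.try_next().await? {
+    ///     println!("can write as {author_id}");
+    /// }
+    /// # Ok(())
+    /// # }
+    /// ```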
pub async fn list(&self) -> Result>> { let stream = self.rpc.server_streaming(ListRequest {}).await?; Ok(flatten(stream).map(|res| res.map(|res| res.author_id))) } - /// Export the given author. + /// Exports the given author. /// - /// Warning: This contains sensitive data. + /// Warning: The [`Author`] struct contains sensitive data. pub async fn export(&self, author: AuthorId) -> Result> { let res = self.rpc.rpc(ExportRequest { author }).await??; Ok(res.author) } - /// Import the given author. + /// Imports the given author. /// - /// Warning: This contains sensitive data. + /// Warning: The [`Author`] struct contains sensitive data. pub async fn import(&self, author: Author) -> Result<()> { self.rpc.rpc(ImportRequest { author }).await??; Ok(()) diff --git a/iroh/src/client/blobs.rs b/iroh/src/client/blobs.rs index a9a48539b3..3151c3fb1f 100644 --- a/iroh/src/client/blobs.rs +++ b/iroh/src/client/blobs.rs @@ -2,6 +2,8 @@ //! //! The main entry point is the [`Client`]. //! +//! You obtain a [`Client`] via [`Iroh::blobs()`](crate::client::Iroh::blobs). +//! //! ## Interacting with the local blob store //! //! ### Importing data diff --git a/iroh/src/client/docs.rs b/iroh/src/client/docs.rs index 3da6529eb9..2b6946fcd8 100644 --- a/iroh/src/client/docs.rs +++ b/iroh/src/client/docs.rs @@ -1,4 +1,8 @@ //! API for document management. +//! +//! The main entry point is the [`Client`]. +//! +//! You obtain a [`Client`] via [`Iroh::docs()`](crate::client::Iroh::docs). use std::{ path::{Path, PathBuf}, @@ -47,14 +51,14 @@ pub struct Client { } impl Client { - /// Create a new document. + /// Creates a new document. pub async fn create(&self) -> Result { let res = self.rpc.rpc(CreateRequest {}).await??; let doc = Doc::new(self.rpc.clone(), res.id); Ok(doc) } - /// Delete a document from the local node. + /// Deletes a document from the local node. /// /// This is a destructive operation. Both the document secret key and all entries in the /// document will be permanently deleted from the node's storage. Content blobs will be deleted @@ -64,7 +68,7 @@ impl Client { Ok(()) } - /// Import a document from a namespace capability. + /// Imports a document from a namespace capability. /// /// This does not start sync automatically. Use [`Doc::start_sync`] to start sync. pub async fn import_namespace(&self, capability: Capability) -> Result { @@ -73,7 +77,7 @@ impl Client { Ok(doc) } - /// Import a document from a ticket and join all peers in the ticket. + /// Imports a document from a ticket and joins all peers in the ticket. pub async fn import(&self, ticket: DocTicket) -> Result { let DocTicket { capability, nodes } = ticket; let doc = self.import_namespace(capability).await?; @@ -81,9 +85,9 @@ impl Client { Ok(doc) } - /// Import a document from a ticket, create a subscription stream and join all peers in the ticket. + /// Imports a document from a ticket, creates a subscription stream and joins all peers in the ticket. /// - /// Returns the [`Doc`] and a [`Stream`] of [`LiveEvent`]s + /// Returns the [`Doc`] and a [`Stream`] of [`LiveEvent`]s. /// /// The subscription stream is created before the sync is started, so the first call to this /// method after starting the node is guaranteed to not miss any sync events. @@ -99,13 +103,15 @@ impl Client { Ok((doc, events)) } - /// List all documents. + /// Lists all documents. 
pub async fn list(&self) -> Result>> { let stream = self.rpc.server_streaming(DocListRequest {}).await?; Ok(flatten(stream).map(|res| res.map(|res| (res.id, res.capability)))) } - /// Get a [`Doc`] client for a single document. Return None if the document cannot be found. + /// Returns a [`Doc`] client for a single document. + /// + /// Returns None if the document cannot be found. pub async fn open(&self, id: NamespaceId) -> Result> { self.rpc.rpc(OpenRequest { doc_id: id }).await??; let doc = Doc::new(self.rpc.clone(), id); @@ -163,12 +169,12 @@ impl Doc { Ok(res) } - /// Get the document id of this doc. + /// Returns the document id of this doc. pub fn id(&self) -> NamespaceId { self.0.id } - /// Close the document. + /// Closes the document. pub async fn close(&self) -> Result<()> { if !self.0.closed.swap(true, Ordering::Relaxed) { self.rpc(CloseRequest { doc_id: self.id() }).await??; @@ -184,7 +190,7 @@ impl Doc { } } - /// Set the content of a key to a byte array. + /// Sets the content of a key to a byte array. pub async fn set_bytes( &self, author_id: AuthorId, @@ -203,7 +209,7 @@ impl Doc { Ok(res.entry.content_hash()) } - /// Set an entries on the doc via its key, hash, and size. + /// Sets an entries on the doc via its key, hash, and size. pub async fn set_hash( &self, author_id: AuthorId, @@ -223,7 +229,7 @@ impl Doc { Ok(()) } - /// Add an entry from an absolute file path + /// Adds an entry from an absolute file path pub async fn import_file( &self, author: AuthorId, @@ -246,7 +252,7 @@ impl Doc { Ok(ImportFileProgress::new(stream)) } - /// Export an entry as a file to a given absolute path. + /// Exports an entry as a file to a given absolute path. pub async fn export_file( &self, entry: Entry, @@ -266,7 +272,7 @@ impl Doc { Ok(ExportFileProgress::new(stream)) } - /// Delete entries that match the given `author` and key `prefix`. + /// Deletes entries that match the given `author` and key `prefix`. /// /// This inserts an empty entry with the key set to `prefix`, effectively clearing all other /// entries whose key starts with or is equal to the given `prefix`. @@ -285,9 +291,9 @@ impl Doc { Ok(removed) } - /// Get an entry for a key and author. + /// Returns an entry for a key and author. /// - /// Optionally also get the entry if it is empty (i.e. a deletion marker). + /// Optionally also returns the entry unless it is empty (i.e. a deletion marker). pub async fn get_exact( &self, author: AuthorId, @@ -306,7 +312,7 @@ impl Doc { Ok(res.entry.map(|entry| entry.into())) } - /// Get entries. + /// Returns all entries matching the query. pub async fn get_many( &self, query: impl Into, @@ -323,12 +329,12 @@ impl Doc { Ok(flatten(stream).map(|res| res.map(|res| res.entry.into()))) } - /// Get a single entry. + /// Returns a single entry. pub async fn get_one(&self, query: impl Into) -> Result> { self.get_many(query).await?.next().await.transpose() } - /// Share this document with peers over a ticket. + /// Shares this document with peers over a ticket. pub async fn share( &self, mode: ShareMode, @@ -345,7 +351,7 @@ impl Doc { Ok(res.0) } - /// Start to sync this document with a list of peers. + /// Starts to sync this document with a list of peers. pub async fn start_sync(&self, peers: Vec) -> Result<()> { self.ensure_open()?; let _res = self @@ -357,14 +363,14 @@ impl Doc { Ok(()) } - /// Stop the live sync for this document. + /// Stops the live sync for this document. 
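+    ///
+    /// A hedged sketch of pairing this with [`Doc::start_sync`] (the peer list is
+    /// left empty for brevity):
+    ///
+    /// ```no_run
+    /// # async fn demo(doc: &iroh::client::docs::Doc) -> anyhow::Result<()> {
+    /// doc.start_sync(vec![]).await?;
+    /// // ... exchange data with peers ...
+    /// doc.leave().await?;
+    /// # Ok(())
+    /// # }
+    /// ```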
pub async fn leave(&self) -> Result<()> { self.ensure_open()?; let _res = self.rpc(LeaveRequest { doc_id: self.id() }).await??; Ok(()) } - /// Subscribe to events for this document. + /// Subscribes to events for this document. pub async fn subscribe(&self) -> anyhow::Result>> { self.ensure_open()?; let stream = self @@ -378,14 +384,14 @@ impl Doc { })) } - /// Get status info for this document + /// Returns status info for this document pub async fn status(&self) -> anyhow::Result { self.ensure_open()?; let res = self.rpc(StatusRequest { doc_id: self.id() }).await??; Ok(res.status) } - /// Set the download policy for this document + /// Sets the download policy for this document pub async fn set_download_policy(&self, policy: DownloadPolicy) -> Result<()> { self.rpc(SetDownloadPolicyRequest { doc_id: self.id(), @@ -395,7 +401,7 @@ impl Doc { Ok(()) } - /// Get the download policy for this document + /// Returns the download policy for this document pub async fn get_download_policy(&self) -> Result { let res = self .rpc(GetDownloadPolicyRequest { doc_id: self.id() }) @@ -403,7 +409,7 @@ impl Doc { Ok(res.policy) } - /// Get sync peers for this document + /// Returns sync peers for this document pub async fn get_sync_peers(&self) -> Result>> { let res = self .rpc(GetSyncPeersRequest { doc_id: self.id() }) @@ -435,44 +441,44 @@ impl From for Entry { } impl Entry { - /// Get the [`RecordIdentifier`] for this entry. + /// Returns the [`RecordIdentifier`] for this entry. pub fn id(&self) -> &RecordIdentifier { self.0.id() } - /// Get the [`AuthorId`] of this entry. + /// Returns the [`AuthorId`] of this entry. pub fn author(&self) -> AuthorId { self.0.author() } - /// Get the [`struct@Hash`] of the content data of this record. + /// Returns the [`struct@Hash`] of the content data of this record. pub fn content_hash(&self) -> Hash { self.0.content_hash() } - /// Get the length of the data addressed by this record's content hash. + /// Returns the length of the data addressed by this record's content hash. pub fn content_len(&self) -> u64 { self.0.content_len() } - /// Get the key of this entry. + /// Returns the key of this entry. pub fn key(&self) -> &[u8] { self.0.key() } - /// Get the timestamp of this entry. + /// Returns the timestamp of this entry. pub fn timestamp(&self) -> u64 { self.0.timestamp() } - /// Read the content of an [`Entry`] as a streaming [`blobs::Reader`]. + /// Reads the content of an [`Entry`] as a streaming [`blobs::Reader`]. /// /// You can pass either a [`Doc`] or the `Iroh` client by reference as `client`. pub async fn content_reader(&self, client: impl Into<&RpcClient>) -> Result { blobs::Reader::from_rpc_read(client.into(), self.content_hash()).await } - /// Read all content of an [`Entry`] into a buffer. + /// Reads all content of an [`Entry`] into a buffer. /// /// You can pass either a [`Doc`] or the `Iroh` client by reference as `client`. pub async fn content_bytes(&self, client: impl Into<&RpcClient>) -> Result { @@ -490,7 +496,7 @@ impl Entry { /// file as an entry in the doc. #[derive(Debug, Serialize, Deserialize)] pub enum ImportProgress { - /// An item was found with name `name`, from now on referred to via `id` + /// An item was found with name `name`, from now on referred to via `id`. Found { /// A new unique id for this entry. id: u64, @@ -513,7 +519,7 @@ pub enum ImportProgress { /// The hash of the entry. hash: Hash, }, - /// We are done setting the entry to the doc + /// We are done setting the entry to the doc. 
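+    /// (A hedged note: this is the terminal event of an import progress stream;
+    /// helpers like [`ImportFileProgress::finish`] wait for it before returning.)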
AllDone { /// The key of the entry key: Bytes, @@ -621,7 +627,7 @@ impl ImportFileProgress { } } - /// Finish writing the stream, ignoring all intermediate progress events. + /// Finishes writing the stream, ignoring all intermediate progress events. /// /// Returns a [`ImportFileOutcome`] which contains a tag, key, and hash and the size of the /// content. @@ -693,8 +699,9 @@ impl ExportFileProgress { stream: Box::pin(stream), } } - /// Iterate through the export progress stream, returning when the stream has completed. + /// Iterates through the export progress stream, returning when the stream has completed. + /// /// Returns a [`ExportFileOutcome`] which contains a file path the data was written to and the size of the content. pub async fn finish(mut self) -> Result { let mut total_size = 0; diff --git a/iroh/src/client/gossip.rs b/iroh/src/client/gossip.rs index 7d80059ae5..9f24736365 100644 --- a/iroh/src/client/gossip.rs +++ b/iroh/src/client/gossip.rs @@ -4,6 +4,8 @@ //! //! The main entry point is the [`Client`]. //! +//! You obtain a [`Client`] via [`Iroh::gossip()`](crate::client::Iroh::gossip). +//! //! The gossip API is extremely simple. You use [`subscribe`](Client::subscribe) //! to subscribe to a topic. This returns a sink to send updates to the topic //! and a stream of responses. @@ -49,7 +51,7 @@ impl Default for SubscribeOpts { } impl Client { - /// Subscribe to a gossip topic. + /// Subscribes to a gossip topic. /// /// Returns a sink to send updates to the topic and a stream of responses. /// @@ -66,7 +68,7 @@ impl Client { /// immediate neighbors of the node. /// /// A Lagged event indicates that the gossip stream has not been consumed quickly enough. - /// You can adjust the buffer size with the [] option. + /// You can adjust the buffer size with the [`SubscribeOpts::subscription_capacity`] option. pub async fn subscribe_with_opts( &self, topic: TopicId, @@ -88,7 +90,7 @@ impl Client { Ok((sink, stream)) } - /// Subscribe to a gossip topic with default options. + /// Subscribes to a gossip topic with default options. pub async fn subscribe( &self, topic: impl Into, diff --git a/iroh/src/client/node.rs b/iroh/src/client/node.rs index 48868ea1bd..265d3c2a00 100644 --- a/iroh/src/client/node.rs +++ b/iroh/src/client/node.rs @@ -1,6 +1,10 @@ //! API to manage the iroh node itself. //! -//! The main entry point is the [Client]. +//! The main entry point is the [`Client`]. +//! +//! You obtain a [`Client`] via [`Iroh::node()`](crate::client::Iroh::node), +//! or just use [`Iroh`](crate::client::Iroh) directly, +//! as it has a `Deref` implementation for this [`Client`]. //! //! The client can be used to get information about the node, such as the //! [status](Client::status), [node id](Client::node_id) or @@ -14,7 +18,6 @@ use std::{collections::BTreeMap, net::SocketAddr}; use anyhow::Result; use futures_lite::{Stream, StreamExt}; -use iroh_base::key::PublicKey; use iroh_net::{endpoint::ConnectionInfo, relay::RelayUrl, NodeAddr, NodeId}; use ref_cast::RefCast; use serde::{Deserialize, Serialize}; @@ -34,59 +37,74 @@ pub struct Client { } impl Client { - /// Get statistics of the running node. + /// Fetches statistics of the running node. pub async fn stats(&self) -> Result> { let res = self.rpc.rpc(StatsRequest {}).await??; Ok(res.stats) } - /// Get information about the different connections we have made + /// Fetches information about currently known connections. + /// + /// This streams a *current snapshot*. 
It does not keep the stream open after finishing + /// transferring the snapshot. + /// + /// See also [`Endpoint::connection_infos`](crate::net::Endpoint::connection_infos). pub async fn connections(&self) -> Result>> { let stream = self.rpc.server_streaming(ConnectionsRequest {}).await?; Ok(flatten(stream).map(|res| res.map(|res| res.conn_info))) } - /// Get connection information about a node - pub async fn connection_info(&self, node_id: PublicKey) -> Result> { + /// Fetches connection information about a connection to another node identified by its [`NodeId`]. + /// + /// See also [`Endpoint::connection_info`](crate::net::Endpoint::connection_info). + pub async fn connection_info(&self, node_id: NodeId) -> Result> { let ConnectionInfoResponse { conn_info } = self.rpc.rpc(ConnectionInfoRequest { node_id }).await??; Ok(conn_info) } - /// Get status information about a node. + /// Fetches status information about this node. pub async fn status(&self) -> Result { let response = self.rpc.rpc(StatusRequest).await??; Ok(response) } - /// Get the id of this node. + /// Fetches the node id of this node. + /// + /// See also [`Endpoint::node_id`](crate::net::Endpoint::node_id). pub async fn node_id(&self) -> Result { let id = self.rpc.rpc(IdRequest).await??; Ok(id) } - /// Return the [`NodeAddr`] for this node. + /// Fetches the [`NodeAddr`] for this node. + /// + /// See also [`Endpoint::node_addr`](crate::net::Endpoint::node_addr). pub async fn node_addr(&self) -> Result { let addr = self.rpc.rpc(AddrRequest).await??; Ok(addr) } - /// Add a known node address to the node. + /// Adds a known node address to this node. + /// + /// See also [`Endpoint::add_node_addr`](crate::net::Endpoint::add_node_addr). pub async fn add_node_addr(&self, addr: NodeAddr) -> Result<()> { self.rpc.rpc(AddAddrRequest { addr }).await??; Ok(()) } - /// Get the relay server we are connected to. + /// Returns the relay server we are connected to. + /// + /// See also [`Endpoint::home_relay`](crate::net::Endpoint::home_relay). pub async fn home_relay(&self) -> Result> { let relay = self.rpc.rpc(RelayRequest).await??; Ok(relay) } - /// Shutdown the node. + /// Shuts down the node. /// - /// If `force` is true, the node will be killed instantly without waiting for things to - /// shutdown gracefully. + /// If `force` is true, the node will be shut down instantly without + /// waiting for things to stop gracefully. pub async fn shutdown(&self, force: bool) -> Result<()> { self.rpc.rpc(ShutdownRequest { force }).await?; Ok(()) diff --git a/iroh/src/client/quic.rs b/iroh/src/client/quic.rs index e0fd40a0ad..a9af6917d9 100644 --- a/iroh/src/client/quic.rs +++ b/iroh/src/client/quic.rs @@ -21,7 +21,7 @@ use crate::{ pub(crate) const RPC_ALPN: [u8; 17] = *b"n0/provider-rpc/1"; impl Iroh { - /// Connect to an iroh node running on the same computer, but in a different process. + /// Connects to an iroh node running on the same computer, but in a different process. pub async fn connect_path(root: impl AsRef) -> anyhow::Result { let rpc_status = RpcStatus::load(root).await?; match rpc_status { @@ -32,7 +32,7 @@ impl Iroh { } } - /// Connect to an iroh node at the given RPC address. + /// Connects to an iroh node at the given RPC address. 
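+    ///
+    /// A hedged usage sketch (the socket address is illustrative):
+    ///
+    /// ```no_run
+    /// # async fn demo() -> anyhow::Result<()> {
+    /// let addr: std::net::SocketAddr = "127.0.0.1:4919".parse()?;
+    /// let client = iroh::client::Iroh::connect_addr(addr).await?;
+    /// # drop(client);
+    /// # Ok(())
+    /// # }
+    /// ```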
pub async fn connect_addr(addr: SocketAddr) -> anyhow::Result { let client = connect_raw(addr).await?; Ok(Iroh::new(client)) diff --git a/iroh/src/client/tags.rs b/iroh/src/client/tags.rs index 0e663fb117..1b0accf03a 100644 --- a/iroh/src/client/tags.rs +++ b/iroh/src/client/tags.rs @@ -1,14 +1,17 @@ //! API for tag management. //! -//! The purpose of tags is to mark information as important. Currently this is -//! used for blobs. +//! The purpose of tags is to mark information as important to prevent it +//! from being garbage-collected (if the garbage collector is turned on). +//! Currently this is used for blobs. //! -//! The main entry point is the [Client]. +//! The main entry point is the [`Client`]. //! -//! [Client::list] can be used to list all tags. -//! [Client::list_hash_seq] can be used to list all tags with a hash_seq format. +//! You obtain a [`Client`] via [`Iroh::tags()`](crate::client::Iroh::tags). //! -//! [Client::delete] can be used to delete a tag. +//! [`Client::list`] can be used to list all tags. +//! [`Client::list_hash_seq`] can be used to list all tags with a hash_seq format. +//! +//! [`Client::delete`] can be used to delete a tag. use anyhow::Result; use futures_lite::{Stream, StreamExt}; use iroh_blobs::{BlobFormat, Hash, Tag}; @@ -26,19 +29,19 @@ pub struct Client { } impl Client { - /// List all tags. + /// Lists all tags. pub async fn list(&self) -> Result>> { let stream = self.rpc.server_streaming(ListRequest::all()).await?; Ok(stream.map(|res| res.map_err(anyhow::Error::from))) } - /// List all tags with a hash_seq format. + /// Lists all tags with a hash_seq format. pub async fn list_hash_seq(&self) -> Result>> { let stream = self.rpc.server_streaming(ListRequest::hash_seq()).await?; Ok(stream.map(|res| res.map_err(anyhow::Error::from))) } - /// Delete a tag. + /// Deletes a tag. pub async fn delete(&self, name: Tag) -> Result<()> { self.rpc.rpc(DeleteRequest { name }).await??; Ok(()) diff --git a/iroh/src/lib.rs b/iroh/src/lib.rs index d5140f1849..3e0a5c3ad6 100644 --- a/iroh/src/lib.rs +++ b/iroh/src/lib.rs @@ -77,10 +77,10 @@ //! ## Reexports //! //! The iroh crate re-exports the following crates: -//! - [iroh_base](iroh_base) as [`base`] -//! - [iroh_blobs](iroh_blobs) as [`blobs`] -//! - [iroh_docs](iroh_docs) as [`docs`] -//! - [iroh_net](iroh_net) as [`net`] +//! - [iroh_base] as [`base`] +//! - [iroh_blobs] as [`blobs`] +//! - [iroh_docs] as [`docs`] +//! - [iroh_net] as [`net`] //! //! ## Feature Flags //! From 741b42fa4260c94b4e80b633bffdf5add6ee24aa Mon Sep 17 00:00:00 2001 From: Asmir Avdicevic Date: Mon, 29 Jul 2024 10:22:28 +0200 Subject: [PATCH 08/45] fix: disable docs preview on forks (#2558) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Description We disable docs previews on forks because otherwise external contributors cant get a ✅ on CI. The reason for it being is that those forks don't get write permissions to the branch where docs are hosted and the alternative would be to manually juggle a lot of tokens for each user. ## Breaking Changes ## Notes & open questions ## Change checklist - [x] Self-review. - [x] Documentation updates following the [style guide](https://rust-lang.github.io/rfcs/1574-more-api-documentation-conventions.html#appendix-a-full-conventions-text), if relevant. - [x] Tests if relevant. - [x] All breaking changes documented. 
--- .github/workflows/docs.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/docs.yaml b/.github/workflows/docs.yaml index d9b51ec44f..0977e1fdbd 100644 --- a/.github/workflows/docs.yaml +++ b/.github/workflows/docs.yaml @@ -7,7 +7,7 @@ jobs: preview_docs: timeout-minutes: 30 name: Docs preview - if: "github.event_name == 'pull_request'" + if: "github.event_name == 'pull_request' && !github.event.pull_request.head.repo.fork" runs-on: ubuntu-latest env: RUSTC_WRAPPER: "sccache" From ffeb1a901387a56a1544ef058a86843f500eb84a Mon Sep 17 00:00:00 2001 From: Asmir Avdicevic Date: Mon, 29 Jul 2024 12:08:52 +0200 Subject: [PATCH 09/45] fix: force CI to use staging relays (#2560) ## Description Now that https://github.com/n0-computer/iroh/pull/2551 is landed, time to force all our repos to set this env var. ## Breaking Changes ## Notes & open questions ## Change checklist - [x] Self-review. - [ ] Documentation updates following the [style guide](https://rust-lang.github.io/rfcs/1574-more-api-documentation-conventions.html#appendix-a-full-conventions-text), if relevant. - [ ] Tests if relevant. - [ ] All breaking changes documented. --- .github/workflows/beta.yaml | 3 +++ .github/workflows/ci.yml | 1 + .github/workflows/commit.yml | 3 +++ .github/workflows/docker.yaml | 3 +++ .github/workflows/docs.yaml | 3 +++ .github/workflows/flaky.yaml | 3 +++ .github/workflows/netsim.yml | 1 + .github/workflows/release.yml | 1 + .github/workflows/test_relay_server.yml | 1 + .github/workflows/tests.yaml | 1 + 10 files changed, 20 insertions(+) diff --git a/.github/workflows/beta.yaml b/.github/workflows/beta.yaml index cc8988c853..aeb7fd494a 100644 --- a/.github/workflows/beta.yaml +++ b/.github/workflows/beta.yaml @@ -12,6 +12,9 @@ concurrency: group: beta-${{ github.workflow }}-${{ github.ref }} cancel-in-progress: true +env: + IROH_FORCE_STAGING_RELAYS: "1" + jobs: tests: uses: './.github/workflows/tests.yaml' diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 293c867ec1..d1bbff6ef1 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -18,6 +18,7 @@ env: RUSTDOCFLAGS: -Dwarnings MSRV: "1.76" SCCACHE_CACHE_SIZE: "50G" + IROH_FORCE_STAGING_RELAYS: "1" jobs: tests: diff --git a/.github/workflows/commit.yml b/.github/workflows/commit.yml index a798d22f07..1b5c6d238c 100644 --- a/.github/workflows/commit.yml +++ b/.github/workflows/commit.yml @@ -5,6 +5,9 @@ on: branches: [main] types: [opened, edited, synchronize] +env: + IROH_FORCE_STAGING_RELAYS: "1" + jobs: check-for-cc: runs-on: ubuntu-latest diff --git a/.github/workflows/docker.yaml b/.github/workflows/docker.yaml index 7dee450cef..c98b0e2a95 100644 --- a/.github/workflows/docker.yaml +++ b/.github/workflows/docker.yaml @@ -36,6 +36,9 @@ on: type: boolean default: false +env: + IROH_FORCE_STAGING_RELAYS: "1" + jobs: build_and_publish: timeout-minutes: 30 diff --git a/.github/workflows/docs.yaml b/.github/workflows/docs.yaml index 0977e1fdbd..c1dad386e9 100644 --- a/.github/workflows/docs.yaml +++ b/.github/workflows/docs.yaml @@ -3,6 +3,9 @@ name: Docs Preview on: pull_request: +env: + IROH_FORCE_STAGING_RELAYS: "1" + jobs: preview_docs: timeout-minutes: 30 diff --git a/.github/workflows/flaky.yaml b/.github/workflows/flaky.yaml index a870007a05..93836b9a8d 100644 --- a/.github/workflows/flaky.yaml +++ b/.github/workflows/flaky.yaml @@ -37,6 +37,9 @@ concurrency: group: flaky-${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} cancel-in-progress: true 
+env: + IROH_FORCE_STAGING_RELAYS: "1" + jobs: tests: if: "contains(github.event.pull_request.labels.*.name, 'flaky-test') || github.event_name == 'workflow_dispatch' || github.event_name == 'schedule'" diff --git a/.github/workflows/netsim.yml b/.github/workflows/netsim.yml index 1c662644b9..49235d0a86 100644 --- a/.github/workflows/netsim.yml +++ b/.github/workflows/netsim.yml @@ -13,6 +13,7 @@ env: MSRV: "1.66" SCCACHE_GHA_ENABLED: "true" RUSTC_WRAPPER: "sccache" + IROH_FORCE_STAGING_RELAYS: "1" jobs: netsim: diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index e998a27e73..e70cd8c2be 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -31,6 +31,7 @@ env: SCCACHE_CACHE_SIZE: "50G" BIN_NAMES: "iroh,iroh-relay,iroh-dns-server" RELEASE_VERSION: ${{ github.event.inputs.release_version }} + IROH_FORCE_STAGING_RELAYS: "1" jobs: create-release: diff --git a/.github/workflows/test_relay_server.yml b/.github/workflows/test_relay_server.yml index d7a0a50b2d..ce3692a0a7 100644 --- a/.github/workflows/test_relay_server.yml +++ b/.github/workflows/test_relay_server.yml @@ -15,6 +15,7 @@ env: RUSTDOCFLAGS: -Dwarnings MSRV: "1.76" SCCACHE_CACHE_SIZE: "50G" + IROH_FORCE_STAGING_RELAYS: "1" jobs: build_relay_server: diff --git a/.github/workflows/tests.yaml b/.github/workflows/tests.yaml index 4f018451a8..94b07f652b 100644 --- a/.github/workflows/tests.yaml +++ b/.github/workflows/tests.yaml @@ -24,6 +24,7 @@ env: RUSTDOCFLAGS: -Dwarnings SCCACHE_CACHE_SIZE: "50G" CRATES_LIST: "iroh,iroh-blobs,iroh-gossip,iroh-metrics,iroh-net,iroh-net-bench,iroh-docs,iroh-test,iroh-cli,iroh-dns-server" + IROH_FORCE_STAGING_RELAYS: "1" jobs: build_and_test_nix: From 14fcceed53e9633402ba1b978f2002901b615ba8 Mon Sep 17 00:00:00 2001 From: Divma <26765164+divagant-martian@users.noreply.github.com> Date: Mon, 29 Jul 2024 10:49:43 -0500 Subject: [PATCH 10/45] tests(iroh-cli): remove flaky mark from 5 tests and improve logs (#2562) ## Description I have not seen these tests fail for at least the past two weeks. I think they might no longer be flaky. I suspect much of the flakiness came from quinn's logging errors on shutdown, which no longer happens. ## Breaking Changes n/a ## Notes & open questions n/a ## Change checklist - [x] Self-review. 
- [ ] ~~Documentation updates following the [style guide](https://rust-lang.github.io/rfcs/1574-more-api-documentation-conventions.html#appendix-a-full-conventions-text), if relevant.~~ - [ ] ~~Tests if relevant.~~ - [ ] ~~All breaking changes documented.~~ --- iroh-cli/tests/cli.rs | 74 +++++++++++++++++++------------------------ 1 file changed, 33 insertions(+), 41 deletions(-) diff --git a/iroh-cli/tests/cli.rs b/iroh-cli/tests/cli.rs index 9da3996da6..999d1be3a8 100644 --- a/iroh-cli/tests/cli.rs +++ b/iroh-cli/tests/cli.rs @@ -41,7 +41,6 @@ fn make_rand_file(size: usize, path: &Path) -> Result { } #[test] -#[ignore = "flaky"] fn cli_provide_one_file_basic() -> Result<()> { let dir = testdir!(); let path = dir.join("foo"); @@ -51,7 +50,6 @@ fn cli_provide_one_file_basic() -> Result<()> { } #[test] -#[ignore = "flaky"] fn cli_provide_one_file_large() -> Result<()> { let dir = testdir!(); let path = dir.join("foo"); @@ -62,7 +60,6 @@ fn cli_provide_one_file_large() -> Result<()> { /// Test single file download to a path #[test] -#[ignore = "flaky"] fn cli_provide_one_file_single_path() -> Result<()> { let dir = testdir!(); let path = dir.join("foo"); @@ -74,7 +71,6 @@ fn cli_provide_one_file_single_path() -> Result<()> { /// test single file download to stdout #[test] -#[ignore = "flaky"] fn cli_provide_one_file_single_stdout() -> Result<()> { let dir = testdir!(); let path = dir.join("foo"); @@ -86,7 +82,6 @@ fn cli_provide_one_file_single_stdout() -> Result<()> { } #[test] -#[ignore = "flaky"] fn cli_provide_folder() -> Result<()> { let path = testdir!().join("src"); std::fs::create_dir(&path)?; @@ -99,7 +94,6 @@ fn cli_provide_folder() -> Result<()> { } #[test] -#[ignore = "flaky"] fn cli_provide_tree() -> Result<()> { let path = testdir!().join("src"); std::fs::create_dir(&path)?; @@ -155,9 +149,7 @@ fn cli_provide_tree_resume() -> Result<()> { { println!("first test - empty work dir"); let get_iroh_data_dir = tmp.join("get_iroh_data_dir_01"); - let get = make_get_cmd(&get_iroh_data_dir, &ticket, Some(tgt.clone())); - let get_output = get.unchecked().run()?; - assert!(get_output.status.success()); + let get_output = run_get_cmd(&get_iroh_data_dir, &ticket, Some(tgt.clone()))?; let matches = explicit_matches(match_get_stderr(get_output.stderr)?); assert_eq!(matches, vec!["112.89 KiB"]); compare_files(&src, &tgt)?; @@ -169,9 +161,7 @@ fn cli_provide_tree_resume() -> Result<()> { println!("second test - full work dir"); let get_iroh_data_dir = tmp.join("get_iroh_data_dir_02"); copy_blob_dirs(&src_iroh_data_dir, &get_iroh_data_dir)?; - let get = make_get_cmd(&get_iroh_data_dir, &ticket, Some(tgt.clone())); - let get_output = get.unchecked().run()?; - assert!(get_output.status.success()); + let get_output = run_get_cmd(&get_iroh_data_dir, &ticket, Some(tgt.clone()))?; let matches = explicit_matches(match_get_stderr(get_output.stderr)?); assert_eq!(matches, vec!["0 B"]); compare_files(&src, &tgt)?; @@ -190,9 +180,7 @@ fn cli_provide_tree_resume() -> Result<()> { MakePartialResult::Retain } })?; - let get = make_get_cmd(&get_iroh_data_dir, &ticket, Some(tgt.clone())); - let get_output = get.unchecked().run()?; - assert!(get_output.status.success()); + let get_output = run_get_cmd(&get_iroh_data_dir, &ticket, Some(tgt.clone()))?; let matches = explicit_matches(match_get_stderr(get_output.stderr)?); assert_eq!(matches, vec!["98.04 KiB"]); compare_files(&src, &tgt)?; @@ -211,9 +199,7 @@ fn cli_provide_tree_resume() -> Result<()> { MakePartialResult::Retain } })?; - let get = 
make_get_cmd(&get_iroh_data_dir, &ticket, Some(tgt.clone())); - let get_output = get.unchecked().run()?; - assert!(get_output.status.success()); + let get_output = run_get_cmd(&get_iroh_data_dir, &ticket, Some(tgt.clone()))?; let matches = explicit_matches(match_get_stderr(get_output.stderr)?); assert_eq!(matches, vec!["65.98 KiB"]); compare_files(&src, &tgt)?; @@ -250,9 +236,7 @@ fn cli_provide_file_resume() -> Result<()> { { println!("first test - empty work dir"); let get_iroh_data_dir = tmp.join("get_iroh_data_dir_01"); - let get = make_get_cmd(&get_iroh_data_dir, &ticket, Some(tgt.clone())); - let get_output = get.unchecked().run()?; - assert!(get_output.status.success()); + let get_output = run_get_cmd(&get_iroh_data_dir, &ticket, Some(tgt.clone()))?; let matches = explicit_matches(match_get_stderr(get_output.stderr)?); assert_eq!(matches, vec!["98.04 KiB"]); assert_eq!(Hash::new(std::fs::read(&tgt)?), hash); @@ -265,9 +249,7 @@ fn cli_provide_file_resume() -> Result<()> { println!("second test - full work dir"); let get_iroh_data_dir = tmp.join("get_iroh_data_dir_02"); copy_blob_dirs(&src_iroh_data_dir, &get_iroh_data_dir)?; - let get = make_get_cmd(&get_iroh_data_dir, &ticket, Some(tgt.clone())); - let get_output = get.unchecked().run()?; - assert!(get_output.status.success()); + let get_output = run_get_cmd(&get_iroh_data_dir, &ticket, Some(tgt.clone()))?; let matches = explicit_matches(match_get_stderr(get_output.stderr)?); assert_eq!(matches, vec!["0 B"]); assert_eq!(Hash::new(std::fs::read(&tgt)?), hash); @@ -282,9 +264,7 @@ fn cli_provide_file_resume() -> Result<()> { make_partial(&get_iroh_data_dir, |_hash, _size| { MakePartialResult::Truncate(1024 * 32) })?; - let get = make_get_cmd(&get_iroh_data_dir, &ticket, Some(tgt.clone())); - let get_output = get.unchecked().run()?; - assert!(get_output.status.success()); + let get_output = run_get_cmd(&get_iroh_data_dir, &ticket, Some(tgt.clone()))?; let matches = explicit_matches(match_get_stderr(get_output.stderr)?); assert_eq!(matches, vec!["65.98 KiB"]); assert_eq!(Hash::new(std::fs::read(&tgt)?), hash); @@ -295,7 +275,6 @@ fn cli_provide_file_resume() -> Result<()> { } #[test] -#[ignore = "flaky"] fn cli_provide_from_stdin_to_stdout() -> Result<()> { let dir = testdir!(); let path = dir.join("foo"); @@ -408,7 +387,6 @@ fn cli_bao_store_migration() -> anyhow::Result<()> { #[cfg(unix)] #[tokio::test] -#[ignore = "flaky"] async fn cli_provide_persistence() -> anyhow::Result<()> { use iroh::blobs::store::ReadableStore; use nix::{ @@ -481,8 +459,8 @@ async fn cli_provide_persistence() -> anyhow::Result<()> { Ok(()) } -#[test] #[ignore = "flaky"] +#[test] fn cli_provide_addresses() -> Result<()> { let dir = testdir!(); let path = dir.join("foo"); @@ -521,7 +499,6 @@ fn cli_provide_addresses() -> Result<()> { } #[test] -#[ignore = "flaky"] fn cli_rpc_lock_restart() -> Result<()> { let dir = testdir!(); let iroh_data_dir = dir.join("data-dir"); @@ -699,8 +676,16 @@ fn to_out_dir(output: Output) -> Option { } } -/// Create a get command given a ticket and an output mode -fn make_get_cmd(iroh_data_dir: &Path, ticket: &str, out: Option) -> duct::Expression { +/// Create a get command given a ticket and an output mode and run it. +/// +/// The commands STDOUT and STDERR are printed, and the command's result code is checked for +/// success. 
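+///
+/// (A hedged note: `#[track_caller]` makes a panic inside this helper be reported
+/// at the test call site rather than inside the helper itself.)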
+#[track_caller] +fn run_get_cmd( + iroh_data_dir: &Path, + ticket: &str, + out: Option, +) -> Result { // create a `get-ticket` cmd & optionally provide out path let out = out .map(|ref o| o.to_str().unwrap().to_string()) @@ -722,11 +707,24 @@ fn make_get_cmd(iroh_data_dir: &Path, ticket: &str, out: Option) -> duc iroh_data_dir.display() ); - cmd(iroh_bin(), &args) + let output = cmd(iroh_bin(), &args) .env_remove("RUST_LOG") .env("IROH_DATA_DIR", iroh_data_dir) .stdout_capture() .stderr_capture() + .unchecked() + .run()?; + + // checking the output first, so you can still view any logging + println!("STDOUT: {}", String::from_utf8_lossy(&output.stdout)); + println!("STDERR: {}", String::from_utf8_lossy(&output.stderr)); + + ensure!( + output.status.success(), + "iroh command failed. See STDERR output above." + ); + + Ok(output) } /// Test the provide and get loop for success, stderr output, and file contents. @@ -750,16 +748,10 @@ fn test_provide_get_loop(input: Input, output: Output) -> Result<()> { let ticket = match_provide_output(&mut provider, num_blobs, input.is_blob_or_collection())?; let out_dir = to_out_dir(output); let get_iroh_data_dir = dir.join("get-iroh-data-dir"); - let get_cmd = make_get_cmd(&get_iroh_data_dir, &ticket, out_dir.clone()); + let get_output = run_get_cmd(&get_iroh_data_dir, &ticket, out_dir.clone())?; - // test get stderr output - let get_output = get_cmd.unchecked().run()?; drop(provider); - // checking the output first, so you can still view any logging - println!("STDOUT: {}", String::from_utf8_lossy(&get_output.stdout)); - println!("STDERR: {}", String::from_utf8_lossy(&get_output.stderr)); - match_get_stderr(get_output.stderr)?; assert!(get_output.status.success()); From f085e633c82531b7d24a70703ae48a2562eccfdd Mon Sep 17 00:00:00 2001 From: Divma <26765164+divagant-martian@users.noreply.github.com> Date: Mon, 29 Jul 2024 14:21:53 -0500 Subject: [PATCH 11/45] test(iroh-cli): reduce flakyness of cli_provide_file_resume (#2563) ## Description In windows there is no way to copy a file being accessed by another process in an accessible way. This is what makes this test fail since it attempt to copy the blobs.db folder a couple of times while the iroh instance that handles is running. The change is simple: shutdown the provider, copy the files, re-start the provider. From my perspective, this does not affect what the test is attempting to assert. Now, since this includes re-starting the iroh instance that provides the files several times, and we match on output, sometimes we can get weird logs related to shutdown. One of those (and the only one I have seen so far) is for a netcheck report and didn't finish on time. Instead of logging this in the reportgen actor, the error is bubbled up to be handled by the netcheck actor (which will have shutdown by then) thus reducing noise and allowing for better error handling in the future ## Breaking Changes n/a ## Notes & open questions n/a ## Change checklist - [x] Self-review. - [ ] ~~Documentation updates following the [style guide](https://rust-lang.github.io/rfcs/1574-more-api-documentation-conventions.html#appendix-a-full-conventions-text), if relevant.~~ - [x] Tests if relevant. 
- [ ] ~~All breaking changes documented.~~ --- iroh-cli/tests/cli.rs | 15 +++++++++++---- iroh-net/src/netcheck.rs | 10 +++++----- iroh-net/src/netcheck/reportgen.rs | 3 +-- 3 files changed, 17 insertions(+), 11 deletions(-) diff --git a/iroh-cli/tests/cli.rs b/iroh-cli/tests/cli.rs index 999d1be3a8..5b48ed2a02 100644 --- a/iroh-cli/tests/cli.rs +++ b/iroh-cli/tests/cli.rs @@ -209,8 +209,8 @@ fn cli_provide_tree_resume() -> Result<()> { Ok(()) } -#[test] #[ignore = "flaky"] +#[test] fn cli_provide_file_resume() -> Result<()> { use iroh::blobs::store::fs::test_support::{make_partial, MakePartialResult}; @@ -229,11 +229,14 @@ fn cli_provide_file_resume() -> Result<()> { let src_iroh_data_dir = tmp.join("src_iroh_data_dir"); let file = src.join("file"); let hash = make_rand_file(100000, &file)?; - // leave the provider running for the entire test + // import the files into an ephemeral iroh to use the generated blobs db in tests let provider = make_provider_in(&src_iroh_data_dir, Input::Path(file.clone()), false)?; let count = count_input_files(&src); let ticket = match_provide_output(&provider, count, BlobOrCollection::Blob)?; + drop(provider); + { + let provider = make_provider_in(&src_iroh_data_dir, Input::Path(file.clone()), false)?; println!("first test - empty work dir"); let get_iroh_data_dir = tmp.join("get_iroh_data_dir_01"); let get_output = run_get_cmd(&get_iroh_data_dir, &ticket, Some(tgt.clone()))?; @@ -242,6 +245,7 @@ fn cli_provide_file_resume() -> Result<()> { assert_eq!(Hash::new(std::fs::read(&tgt)?), hash); // compare_files(&src, &tgt)?; std::fs::remove_file(&tgt)?; + drop(provider); } // second test - full work dir @@ -249,11 +253,13 @@ fn cli_provide_file_resume() -> Result<()> { println!("second test - full work dir"); let get_iroh_data_dir = tmp.join("get_iroh_data_dir_02"); copy_blob_dirs(&src_iroh_data_dir, &get_iroh_data_dir)?; + let provider = make_provider_in(&src_iroh_data_dir, Input::Path(file.clone()), false)?; let get_output = run_get_cmd(&get_iroh_data_dir, &ticket, Some(tgt.clone()))?; let matches = explicit_matches(match_get_stderr(get_output.stderr)?); assert_eq!(matches, vec!["0 B"]); assert_eq!(Hash::new(std::fs::read(&tgt)?), hash); std::fs::remove_file(&tgt)?; + drop(provider); } // third test - partial work dir - truncate some large files @@ -261,6 +267,7 @@ fn cli_provide_file_resume() -> Result<()> { println!("fourth test - partial work dir - truncate some large files"); let get_iroh_data_dir = tmp.join("get_iroh_data_dir_04"); copy_blob_dirs(&src_iroh_data_dir, &get_iroh_data_dir)?; + let provider = make_provider_in(&src_iroh_data_dir, Input::Path(file.clone()), false)?; make_partial(&get_iroh_data_dir, |_hash, _size| { MakePartialResult::Truncate(1024 * 32) })?; @@ -269,8 +276,8 @@ fn cli_provide_file_resume() -> Result<()> { assert_eq!(matches, vec!["65.98 KiB"]); assert_eq!(Hash::new(std::fs::read(&tgt)?), hash); std::fs::remove_file(&tgt)?; + drop(provider); } - drop(provider); Ok(()) } @@ -619,7 +626,7 @@ fn iroh_bin() -> &'static str { env!("CARGO_BIN_EXE_iroh") } -/// Makes a provider process with it's home directory in `iroh_data_dir`. +/// Makes a provider process with its home directory in `iroh_data_dir`. 
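+///
+/// (A hedged note: the resume tests above now create and drop this provider around
+/// each `copy_blob_dirs` call, since Windows cannot copy files that another
+/// process still holds open.)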
fn make_provider_in(iroh_data_dir: &Path, input: Input, wrap: bool) -> Result { let mut args = vec!["--metrics-port", "disabled", "start"]; if wrap { diff --git a/iroh-net/src/netcheck.rs b/iroh-net/src/netcheck.rs index 2d42fbd97f..e8e731f0a0 100644 --- a/iroh-net/src/netcheck.rs +++ b/iroh-net/src/netcheck.rs @@ -327,7 +327,7 @@ pub(crate) enum Message { /// A report produced by the [`reportgen`] actor. ReportReady { report: Box }, /// The [`reportgen`] actor failed to produce a report. - ReportAborted, + ReportAborted { err: anyhow::Error }, /// An incoming STUN packet to parse. StunPacket { /// The raw UDP payload. @@ -458,8 +458,8 @@ impl Actor { Message::ReportReady { report } => { self.handle_report_ready(report); } - Message::ReportAborted => { - self.handle_report_aborted(); + Message::ReportAborted { err } => { + self.handle_report_aborted(err); } Message::StunPacket { payload, from_addr } => { self.handle_stun_packet(&payload, from_addr); @@ -547,10 +547,10 @@ impl Actor { } } - fn handle_report_aborted(&mut self) { + fn handle_report_aborted(&mut self, err: anyhow::Error) { self.in_flight_stun_requests.clear(); if let Some(ReportRun { report_tx, .. }) = self.current_report_run.take() { - report_tx.send(Err(anyhow!("report aborted"))).ok(); + report_tx.send(Err(err.context("report aborted"))).ok(); } } diff --git a/iroh-net/src/netcheck/reportgen.rs b/iroh-net/src/netcheck/reportgen.rs index 81e48d3dd7..0bdd77ee43 100644 --- a/iroh-net/src/netcheck/reportgen.rs +++ b/iroh-net/src/netcheck/reportgen.rs @@ -210,9 +210,8 @@ impl Actor { match self.run_inner().await { Ok(_) => debug!("reportgen actor finished"), Err(err) => { - error!("reportgen actor failed: {err:#}"); self.netcheck - .send(netcheck::Message::ReportAborted) + .send(netcheck::Message::ReportAborted { err }) .await .ok(); } From 1dda2f7ab706cf794d2c8f4e6b47b24caf2f1c78 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Philipp=20Kr=C3=BCger?= Date: Wed, 31 Jul 2024 10:00:49 +0200 Subject: [PATCH 12/45] refactor(iroh-net): Move more server code behind `iroh-relay` feature flag (#2566) ## Description This moves more relay server code behind the `iroh-relay` feature flag. This is a continuation of #2516 Unfortunately I didn't quite catch "all server things". I'm fairly confident I do have all covered now, because this is extracted from a branch where server-side code wouldn't compile (to Wasm). Also: - Makes use of `MaybeTlsStream` instead of the `MaybeTlsStreamServer` alias. Using the original definition instead of the alias confuses rust-analyzer a lot less. And the aliasing back-and-forth was silly. ## Breaking Changes None ## Notes & open questions ## Change checklist - [x] Self-review. - [x] Documentation updates following the [style guide](https://rust-lang.github.io/rfcs/1574-more-api-documentation-conventions.html#appendix-a-full-conventions-text), if relevant. - ~~[ ] Tests if relevant.~~ - [x] All breaking changes documented. 
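A minimal sketch of the gating pattern this patch applies throughout, using only item names visible in the diff below (the re-export list is abbreviated):

```rust
// iroh-net/src/relay.rs: server-side modules and re-exports compile only when
// the `iroh-relay` feature is enabled; client code stays available everywhere.
#[cfg(feature = "iroh-relay")]
pub(crate) mod server;
pub(crate) mod client;

#[cfg(feature = "iroh-relay")]
pub use self::server::{ClientConnHandler, MaybeTlsStream as MaybeTlsStreamServer, Server};
```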
--- iroh-net/src/magicsock/relay_actor.rs | 16 ++++++---------- iroh-net/src/relay.rs | 4 ++++ iroh-net/src/relay/client.rs | 16 ++-------------- iroh-net/src/relay/client_conn.rs | 2 +- iroh-net/src/relay/clients.rs | 3 +-- iroh-net/src/relay/codec.rs | 8 +++++++- iroh-net/src/relay/http.rs | 2 +- iroh-net/src/relay/http/client.rs | 4 ++-- iroh-net/src/relay/http/server.rs | 8 +++----- iroh-net/src/relay/iroh_relay.rs | 4 ++-- iroh-net/src/relay/server.rs | 3 +-- iroh-net/src/relay/types.rs | 5 +++++ 12 files changed, 35 insertions(+), 40 deletions(-) diff --git a/iroh-net/src/magicsock/relay_actor.rs b/iroh-net/src/magicsock/relay_actor.rs index ecae432cf3..93029de0ce 100644 --- a/iroh-net/src/magicsock/relay_actor.rs +++ b/iroh-net/src/magicsock/relay_actor.rs @@ -20,7 +20,7 @@ use tracing::{debug, info, info_span, trace, warn, Instrument}; use crate::{ key::{PublicKey, PUBLIC_KEY_LENGTH}, - relay::{self, http::ClientError, ReceivedMessage, RelayUrl, MAX_PACKET_SIZE}, + relay::{self, client::ReceivedMessage, http::ClientError, RelayUrl, MAX_PACKET_SIZE}, }; use super::{ActorMessage, MagicSock}; @@ -200,7 +200,7 @@ impl ActiveRelay { None => ReadResult::Break, } } - Ok((msg, conn_gen)) => { + Ok((msg, _conn_gen)) => { // reset self.backoff.reset(); let now = Instant::now(); @@ -214,11 +214,7 @@ impl ActiveRelay { } match msg { - relay::ReceivedMessage::ServerInfo { .. } => { - info!(%conn_gen, "connected"); - ReadResult::Continue - } - relay::ReceivedMessage::ReceivedPacket { source, data } => { + relay::client::ReceivedMessage::ReceivedPacket { source, data } => { trace!(len=%data.len(), "received msg"); // If this is a new sender we hadn't seen before, remember it and // register a route for this peer. @@ -248,7 +244,7 @@ impl ActiveRelay { ReadResult::Continue } - relay::ReceivedMessage::Ping(data) => { + relay::client::ReceivedMessage::Ping(data) => { // Best effort reply to the ping. let dc = self.relay_client.clone(); tokio::task::spawn(async move { @@ -258,8 +254,8 @@ impl ActiveRelay { }); ReadResult::Continue } - relay::ReceivedMessage::Health { .. } => ReadResult::Continue, - relay::ReceivedMessage::PeerGone(key) => { + relay::client::ReceivedMessage::Health { .. 
} => ReadResult::Continue, + relay::client::ReceivedMessage::PeerGone(key) => { self.relay_routes.retain(|peer| peer != &key); ReadResult::Continue } diff --git a/iroh-net/src/relay.rs b/iroh-net/src/relay.rs index 9c0d696efc..b9d9f1fb06 100644 --- a/iroh-net/src/relay.rs +++ b/iroh-net/src/relay.rs @@ -11,7 +11,9 @@ #![deny(missing_docs, rustdoc::broken_intra_doc_links)] pub(crate) mod client; +#[cfg(feature = "iroh-relay")] pub(crate) mod client_conn; +#[cfg(feature = "iroh-relay")] pub(crate) mod clients; pub(crate) mod codec; pub mod http; @@ -19,6 +21,7 @@ pub mod http; pub mod iroh_relay; mod map; mod metrics; +#[cfg(feature = "iroh-relay")] pub(crate) mod server; pub(crate) mod types; @@ -27,5 +30,6 @@ pub use self::codec::MAX_PACKET_SIZE; pub use self::http::Client as HttpClient; pub use self::map::{RelayMap, RelayMode, RelayNode}; pub use self::metrics::Metrics; +#[cfg(feature = "iroh-relay")] pub use self::server::{ClientConnHandler, MaybeTlsStream as MaybeTlsStreamServer, Server}; pub use iroh_base::node_addr::RelayUrl; diff --git a/iroh-net/src/relay/client.rs b/iroh-net/src/relay/client.rs index 173919d906..1fa34b1efd 100644 --- a/iroh-net/src/relay/client.rs +++ b/iroh-net/src/relay/client.rs @@ -439,7 +439,7 @@ impl ClientBuilder { } #[derive(derive_more::Debug, Clone)] -/// The type of message received by the [`Client`] from the [`super::server::Server`]. +/// The type of message received by the [`Client`] from a relay server. pub enum ReceivedMessage { /// Represents an incoming packet. ReceivedPacket { @@ -452,18 +452,6 @@ pub enum ReceivedMessage { /// Indicates that the client identified by the underlying public key had previously sent you a /// packet but has now disconnected from the server. PeerGone(PublicKey), - /// Sent by the server upon first connect. - ServerInfo { - /// How many bytes per second the server says it will accept, including all framing bytes. - /// - /// Zero means unspecified. There might be a limit, but the client need not try to respect it. - token_bucket_bytes_per_second: usize, - /// How many bytes the server will allow in one burst, temporarily violating - /// `token_bucket_bytes_per_second`. - /// - /// Zero means unspecified. There might be a limit, but the [`Client`] need not try to respect it. - token_bucket_bytes_burst: usize, - }, /// Request from a client or server to reply to the /// other side with a [`ReceivedMessage::Pong`] with the given payload. Ping([u8; 8]), @@ -471,7 +459,7 @@ pub enum ReceivedMessage { /// with the payload sent previously in the ping. Pong([u8; 8]), /// A one-way empty message from server to client, just to - /// keep the connection alive. It's like a [ReceivedMessage::Ping], but doesn't solicit + /// keep the connection alive. It's like a [`ReceivedMessage::Ping`], but doesn't solicit /// a reply from the client. KeepAlive, /// A one-way message from server to client, declaring the connection health state. 
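A hedged sketch of consuming these messages now that the enum lives in `relay::client` (crate-internal paths as used in the diff; the handler function and wildcard arm are illustrative):

```rust
use crate::relay::client::ReceivedMessage;

fn handle(msg: ReceivedMessage) {
    match msg {
        // A packet from a remote peer, attributed to its public key.
        ReceivedMessage::ReceivedPacket { source, data } => drop((source, data)),
        // A well behaved client replies with a Pong carrying the same payload.
        ReceivedMessage::Ping(payload) => drop(payload),
        // The remote peer disconnected from the relay; forget routes to it.
        ReceivedMessage::PeerGone(_key) => {}
        // Liveness traffic needs no action.
        ReceivedMessage::Pong(_) | ReceivedMessage::KeepAlive => {}
        ReceivedMessage::Health { .. } => {}
        // Any remaining variants are ignored in this sketch.
        _ => {}
    }
}
```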
diff --git a/iroh-net/src/relay/client_conn.rs b/iroh-net/src/relay/client_conn.rs index 3d5de5cae4..c81b92c967 100644 --- a/iroh-net/src/relay/client_conn.rs +++ b/iroh-net/src/relay/client_conn.rs @@ -453,7 +453,7 @@ impl ClientConnIo { mod tests { use crate::key::SecretKey; use crate::relay::codec::{recv_frame, DerpCodec, FrameType}; - use crate::relay::MaybeTlsStreamServer as MaybeTlsStream; + use crate::relay::server::MaybeTlsStream; use super::*; diff --git a/iroh-net/src/relay/clients.rs b/iroh-net/src/relay/clients.rs index 32c76a01cf..9f77418332 100644 --- a/iroh-net/src/relay/clients.rs +++ b/iroh-net/src/relay/clients.rs @@ -262,8 +262,7 @@ mod tests { key::SecretKey, relay::{ codec::{recv_frame, DerpCodec, Frame, FrameType}, - server::RelayIo, - MaybeTlsStreamServer as MaybeTlsStream, + server::{MaybeTlsStream, RelayIo}, }, }; diff --git a/iroh-net/src/relay/codec.rs b/iroh-net/src/relay/codec.rs index 66186ed6ad..a35293f60a 100644 --- a/iroh-net/src/relay/codec.rs +++ b/iroh-net/src/relay/codec.rs @@ -1,7 +1,8 @@ use std::time::Duration; -use anyhow::{bail, ensure, Context}; +use anyhow::{bail, ensure}; use bytes::{Buf, BufMut, Bytes, BytesMut}; +#[cfg(feature = "iroh-relay")] use futures_lite::{Stream, StreamExt}; use futures_sink::Sink; use futures_util::SinkExt; @@ -21,8 +22,10 @@ const MAX_FRAME_SIZE: usize = 1024 * 1024; /// The Relay magic number, sent in the FrameType::ClientInfo frame upon initial connection. const MAGIC: &str = "RELAY🔑"; +#[cfg(feature = "iroh-relay")] pub(super) const KEEP_ALIVE: Duration = Duration::from_secs(60); // TODO: what should this be? +#[cfg(feature = "iroh-relay")] pub(super) const SERVER_CHANNEL_SIZE: usize = 1024 * 100; /// The number of packets buffered for sending per client pub(super) const PER_CLIENT_SEND_QUEUE_DEPTH: usize = 512; //32; @@ -155,9 +158,11 @@ pub(crate) async fn send_client_key + Unp /// Reads the `FrameType::ClientInfo` frame from the client (its proof of identity) /// upon it's initial connection. +#[cfg(feature = "iroh-relay")] pub(super) async fn recv_client_key> + Unpin>( stream: S, ) -> anyhow::Result<(PublicKey, ClientInfo)> { + use anyhow::Context; // the client is untrusted at this point, limit the input size even smaller than our usual // maximum frame size, and give a timeout @@ -523,6 +528,7 @@ impl Encoder for DerpCodec { /// Receives the next frame and matches the frame type. If the correct type is found returns the content, /// otherwise an error. 
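/// A hedged call sketch (the framed reader is elided; `FrameType::ClientInfo` is
/// just one possible expected type):
///
/// ```ignore
/// let frame = recv_frame(FrameType::ClientInfo, &mut reader).await?;
/// ```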
+#[cfg(feature = "iroh-relay")] pub(super) async fn recv_frame> + Unpin>( frame_type: FrameType, mut stream: S, diff --git a/iroh-net/src/relay/http.rs b/iroh-net/src/relay/http.rs index ac3daf1de0..f7e4365f1b 100644 --- a/iroh-net/src/relay/http.rs +++ b/iroh-net/src/relay/http.rs @@ -75,7 +75,7 @@ mod tests { use tracing_subscriber::{prelude::*, EnvFilter}; use crate::key::{PublicKey, SecretKey}; - use crate::relay::ReceivedMessage; + use crate::relay::client::ReceivedMessage; pub(crate) fn make_tls_config() -> TlsConfig { let subject_alt_names = vec!["localhost".to_string()]; diff --git a/iroh-net/src/relay/http/client.rs b/iroh-net/src/relay/http/client.rs index aee444c633..97d3303184 100644 --- a/iroh-net/src/relay/http/client.rs +++ b/iroh-net/src/relay/http/client.rs @@ -35,7 +35,7 @@ use crate::relay::http::RELAY_PATH; use crate::relay::RelayUrl; use crate::relay::{ client::Client as RelayClient, client::ClientBuilder as RelayClientBuilder, - client::ClientReceiver as RelayClientReceiver, ReceivedMessage, + client::ClientReceiver as RelayClientReceiver, client::ReceivedMessage, }; use crate::util::chain; use crate::util::AbortingJoinHandle; @@ -367,7 +367,7 @@ impl ClientBuilder { ) } - /// The expected [`PublicKey`] of the [`super::server::Server`] we are connecting to. + /// The expected [`PublicKey`] of the relay server we are connecting to. pub fn server_public_key(mut self, server_public_key: PublicKey) -> Self { self.server_public_key = Some(server_public_key); self diff --git a/iroh-net/src/relay/http/server.rs b/iroh-net/src/relay/http/server.rs index d94fa59ac1..b0d6af86c6 100644 --- a/iroh-net/src/relay/http/server.rs +++ b/iroh-net/src/relay/http/server.rs @@ -25,7 +25,6 @@ use tungstenite::handshake::derive_accept_key; use crate::key::SecretKey; use crate::relay::http::SUPPORTED_WEBSOCKET_VERSION; use crate::relay::server::{ClientConnHandler, MaybeTlsStream}; -use crate::relay::MaybeTlsStreamServer; use super::{Protocol, LEGACY_RELAY_PATH, RELAY_PATH}; @@ -609,8 +608,7 @@ impl RelayService { Some(tls_config) => self.tls_serve_connection(stream, tls_config).await, None => { debug!("HTTP: serve connection"); - self.serve_connection(MaybeTlsStreamServer::Plain(stream)) - .await + self.serve_connection(MaybeTlsStream::Plain(stream)).await } } } @@ -629,7 +627,7 @@ impl RelayService { .into_stream(config) .await .context("TLS[acme] handshake")?; - self.serve_connection(MaybeTlsStreamServer::Tls(tls_stream)) + self.serve_connection(MaybeTlsStream::Tls(tls_stream)) .await .context("TLS[acme] serve connection")?; } @@ -637,7 +635,7 @@ impl RelayService { TlsAcceptor::Manual(a) => { debug!("TLS[manual]: accept"); let tls_stream = a.accept(stream).await.context("TLS[manual] accept")?; - self.serve_connection(MaybeTlsStreamServer::Tls(tls_stream)) + self.serve_connection(MaybeTlsStream::Tls(tls_stream)) .await .context("TLS[manual] serve connection")?; } diff --git a/iroh-net/src/relay/iroh_relay.rs b/iroh-net/src/relay/iroh_relay.rs index f7f0f3d8b1..4ada7965a0 100644 --- a/iroh-net/src/relay/iroh_relay.rs +++ b/iroh-net/src/relay/iroh_relay.rs @@ -608,7 +608,7 @@ async fn run_captive_portal_service(http_listener: TcpListener) -> Result<()> { let handler = CaptivePortalService; tasks.spawn(async move { - let stream = relay::MaybeTlsStreamServer::Plain(stream); + let stream = relay::server::MaybeTlsStream::Plain(stream); let stream = hyper_util::rt::TokioIo::new(stream); if let Err(err) = hyper::server::conn::http1::Builder::new() .serve_connection(stream, handler) @@ -714,7 
+714,7 @@ mod tests { use crate::relay::http::{ClientBuilder, Protocol, HTTP_UPGRADE_PROTOCOL}; - use self::relay::ReceivedMessage; + use self::relay::client::ReceivedMessage; use super::*; diff --git a/iroh-net/src/relay/server.rs b/iroh-net/src/relay/server.rs index 4ce80a3e6f..865a82a2c0 100644 --- a/iroh-net/src/relay/server.rs +++ b/iroh-net/src/relay/server.rs @@ -554,11 +554,10 @@ mod tests { use super::*; use crate::relay::{ - client::{ClientBuilder, ConnReader, ConnWriter}, + client::{ClientBuilder, ConnReader, ConnWriter, ReceivedMessage}, codec::{recv_frame, FrameType}, http::streams::{MaybeTlsStreamReader, MaybeTlsStreamWriter}, types::ClientInfo, - ReceivedMessage, }; use tokio_util::codec::{FramedRead, FramedWrite}; use tracing_subscriber::{prelude::*, EnvFilter}; diff --git a/iroh-net/src/relay/types.rs b/iroh-net/src/relay/types.rs index b332182d50..27445202df 100644 --- a/iroh-net/src/relay/types.rs +++ b/iroh-net/src/relay/types.rs @@ -1,11 +1,14 @@ use std::num::NonZeroU32; use anyhow::{bail, Context, Result}; +#[cfg(feature = "iroh-relay")] use bytes::Bytes; use postcard::experimental::max_size::MaxSize; use serde::{Deserialize, Serialize}; +#[cfg(feature = "iroh-relay")] use super::client_conn::ClientConnBuilder; +#[cfg(feature = "iroh-relay")] use crate::key::PublicKey; pub(crate) struct RateLimiter { @@ -44,6 +47,7 @@ impl RateLimiter { /// A request to write a dataframe to a Client #[derive(Debug, Clone)] +#[cfg(feature = "iroh-relay")] pub(crate) struct Packet { /// The sender of the packet pub(crate) src: PublicKey, @@ -57,6 +61,7 @@ pub(crate) struct ClientInfo { pub(crate) version: usize, } +#[cfg(feature = "iroh-relay")] #[derive(derive_more::Debug)] pub(crate) enum ServerMessage { SendPacket((PublicKey, Packet)), From 29d2e82a577ebc8cb4029c0df0138fe662031d5c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Philipp=20Kr=C3=BCger?= Date: Wed, 31 Jul 2024 11:31:29 +0200 Subject: [PATCH 13/45] refactor(iroh-net)!: Improve server modules structure & rename structs (#2568) ## Description This is moving around the relay modules a bit. Mostly so that all server-specific code is under `iroh-net/src/relay/server/`. This means there's only a handful of `#[cfg(feature = "iroh-relay")]`s in `relay.rs` and a bunch of other places due to unused warnings otherwise. I've also renamed what was previously `iroh::relay::client::{Client, ClientBuilder, ClientReceiver, ...}` to `iroh::relay::client::conn::{Conn, ConnBuilder, ConnReceiver, ...}`, because otherwise the `Client*` names were duplicated in two modules that were in a parent-child relationship, which is super confusing. And really, the inner `Client`s are more about individual connections. I also resolved a TODO around moving to an `AbortingJoinHandle` in favor of a custom `RelayHttpServerGuard`. ## Breaking Changes - Refactored the module structure for users of the `iroh-relay` feature in iroh-net. ## Notes & open questions I realize the diff here is quite substantial. Unfortunately the diff is mostly just bad because I ended up "swapping places" for modules in two cases, which ended up creating really bad diffs. I know the diff is quite sad, so it's fair if we say "this is unreviewable" and I scrap this piece of work. I'm not blocked on this merging, and I only fairly quickly knocked this out today. As Mark Twain famously said > I apologize for such a long PR - I didn't have time to write a short one. Or sth like that. ## Change checklist - [X] Self-review. 
- [X] Documentation updates following the [style guide](https://rust-lang.github.io/rfcs/1574-more-api-documentation-conventions.html#appendix-a-full-conventions-text), if relevant. - ~~[ ] Tests if relevant.~~ - [X] All breaking changes documented. Yeah? Kinda? --- iroh-cli/src/commands/doctor.rs | 2 +- iroh-net/src/bin/iroh-relay.rs | 4 +- iroh-net/src/magicsock/relay_actor.rs | 28 +- iroh-net/src/metrics.rs | 3 +- iroh-net/src/relay.rs | 20 +- iroh-net/src/relay/client.rs | 1403 ++++++++++---- iroh-net/src/relay/client/conn.rs | 539 ++++++ .../src/relay/{http => client}/streams.rs | 0 iroh-net/src/relay/codec.rs | 9 +- iroh-net/src/relay/http.rs | 243 +-- iroh-net/src/relay/http/client.rs | 1112 ----------- iroh-net/src/relay/iroh_relay.rs | 1051 ---------- iroh-net/src/relay/server.rs | 1692 +++++++++-------- iroh-net/src/relay/server/actor.rs | 761 ++++++++ .../src/relay/{ => server}/client_conn.rs | 29 +- iroh-net/src/relay/{ => server}/clients.rs | 10 +- .../{http/server.rs => server/http_server.rs} | 265 ++- iroh-net/src/relay/{ => server}/metrics.rs | 0 iroh-net/src/relay/server/streams.rs | 163 ++ iroh-net/src/relay/server/types.rs | 25 + iroh-net/src/relay/types.rs | 73 - iroh-net/src/test_utils.rs | 13 +- 22 files changed, 3711 insertions(+), 3734 deletions(-) create mode 100644 iroh-net/src/relay/client/conn.rs rename iroh-net/src/relay/{http => client}/streams.rs (100%) delete mode 100644 iroh-net/src/relay/http/client.rs delete mode 100644 iroh-net/src/relay/iroh_relay.rs create mode 100644 iroh-net/src/relay/server/actor.rs rename iroh-net/src/relay/{ => server}/client_conn.rs (96%) rename iroh-net/src/relay/{ => server}/clients.rs (98%) rename iroh-net/src/relay/{http/server.rs => server/http_server.rs} (72%) rename iroh-net/src/relay/{ => server}/metrics.rs (100%) create mode 100644 iroh-net/src/relay/server/streams.rs create mode 100644 iroh-net/src/relay/server/types.rs delete mode 100644 iroh-net/src/relay/types.rs diff --git a/iroh-cli/src/commands/doctor.rs b/iroh-cli/src/commands/doctor.rs index a9886c827b..93e3221b6d 100644 --- a/iroh-cli/src/commands/doctor.rs +++ b/iroh-cli/src/commands/doctor.rs @@ -866,7 +866,7 @@ async fn relay_urls(count: usize, config: NodeConfig) -> anyhow::Result<()> { let mut clients = HashMap::new(); for node in &config.relay_nodes { let secret_key = key.clone(); - let client = iroh::net::relay::http::ClientBuilder::new(node.url.clone()) + let client = iroh::net::relay::HttpClientBuilder::new(node.url.clone()) .build(secret_key, dns_resolver.clone()); clients.insert(node.url.clone(), client); diff --git a/iroh-net/src/bin/iroh-relay.rs b/iroh-net/src/bin/iroh-relay.rs index 9641537050..b220399488 100644 --- a/iroh-net/src/bin/iroh-relay.rs +++ b/iroh-net/src/bin/iroh-relay.rs @@ -1,7 +1,7 @@ //! A simple relay server for iroh-net. //! //! This handles only the CLI and config file loading, the server implementation lives in -//! [`iroh_net::relay::iroh_relay`]. +//! [`iroh_net::relay::server`]. 
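To make the module moves concrete, here is a sketch of how downstream imports shift, based on the re-exports visible in the `relay.rs` hunk later in this patch (illustrative, not exhaustive):

```rust
// Client-side items are re-exported at the top of `iroh_net::relay`:
use iroh_net::relay::{HttpClient, HttpClientBuilder, RelayConn, ReceivedMessage};

// All server-side code now lives in a single feature-gated module:
#[cfg(feature = "iroh-relay")]
use iroh_net::relay::server; // formerly `iroh_net::relay::iroh_relay`
```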
use std::net::{Ipv6Addr, SocketAddr}; use std::path::{Path, PathBuf}; @@ -12,7 +12,7 @@ use iroh_net::defaults::{ DEFAULT_HTTPS_PORT, DEFAULT_HTTP_PORT, DEFAULT_METRICS_PORT, DEFAULT_STUN_PORT, }; use iroh_net::key::SecretKey; -use iroh_net::relay::iroh_relay; +use iroh_net::relay::server as iroh_relay; use serde::{Deserialize, Serialize}; use serde_with::{serde_as, DisplayFromStr}; use tokio_rustls_acme::{caches::DirCache, AcmeConfig}; diff --git a/iroh-net/src/magicsock/relay_actor.rs b/iroh-net/src/magicsock/relay_actor.rs index 93029de0ce..0a7b6c8fda 100644 --- a/iroh-net/src/magicsock/relay_actor.rs +++ b/iroh-net/src/magicsock/relay_actor.rs @@ -20,7 +20,7 @@ use tracing::{debug, info, info_span, trace, warn, Instrument}; use crate::{ key::{PublicKey, PUBLIC_KEY_LENGTH}, - relay::{self, client::ReceivedMessage, http::ClientError, RelayUrl, MAX_PACKET_SIZE}, + relay::{self, client::conn::ReceivedMessage, client::ClientError, RelayUrl, MAX_PACKET_SIZE}, }; use super::{ActorMessage, MagicSock}; @@ -58,8 +58,8 @@ struct ActiveRelay { /// use instead of creating a new relay connection back to their home. relay_routes: Vec, url: RelayUrl, - relay_client: relay::http::Client, - relay_client_receiver: relay::http::ClientReceiver, + relay_client: relay::client::Client, + relay_client_receiver: relay::client::ClientReceiver, /// The set of senders we know are present on this connection, based on /// messages we've received from the server. peer_present: HashSet, @@ -74,8 +74,8 @@ enum ActiveRelayMessage { GetLastWrite(oneshot::Sender), Ping(oneshot::Sender>), GetLocalAddr(oneshot::Sender>), - GetPeerRoute(PublicKey, oneshot::Sender>), - GetClient(oneshot::Sender), + GetPeerRoute(PublicKey, oneshot::Sender>), + GetClient(oneshot::Sender), NotePreferred(bool), Shutdown, } @@ -83,8 +83,8 @@ enum ActiveRelayMessage { impl ActiveRelay { fn new( url: RelayUrl, - relay_client: relay::http::Client, - relay_client_receiver: relay::http::ClientReceiver, + relay_client: relay::client::Client, + relay_client_receiver: relay::client::ClientReceiver, msg_sender: mpsc::Sender, ) -> Self { ActiveRelay { @@ -179,7 +179,7 @@ impl ActiveRelay { if matches!( err, - relay::http::ClientError::Closed | relay::http::ClientError::IPDisabled + relay::client::ClientError::Closed | relay::client::ClientError::IPDisabled ) { // drop client return ReadResult::Break; @@ -214,7 +214,7 @@ impl ActiveRelay { } match msg { - relay::client::ReceivedMessage::ReceivedPacket { source, data } => { + relay::client::conn::ReceivedMessage::ReceivedPacket { source, data } => { trace!(len=%data.len(), "received msg"); // If this is a new sender we hadn't seen before, remember it and // register a route for this peer. @@ -244,7 +244,7 @@ impl ActiveRelay { ReadResult::Continue } - relay::client::ReceivedMessage::Ping(data) => { + relay::client::conn::ReceivedMessage::Ping(data) => { // Best effort reply to the ping. let dc = self.relay_client.clone(); tokio::task::spawn(async move { @@ -254,8 +254,8 @@ impl ActiveRelay { }); ReadResult::Continue } - relay::client::ReceivedMessage::Health { .. } => ReadResult::Continue, - relay::client::ReceivedMessage::PeerGone(key) => { + relay::client::conn::ReceivedMessage::Health { .. 
} => ReadResult::Continue, + relay::client::conn::ReceivedMessage::PeerGone(key) => { self.relay_routes.retain(|peer| peer != &key); ReadResult::Continue } @@ -418,7 +418,7 @@ impl RelayActor { &mut self, url: &RelayUrl, peer: Option<&PublicKey>, - ) -> relay::http::Client { + ) -> relay::client::Client { debug!("connect relay {} for peer {:?}", url, peer); // See if we have a connection open to that relay node ID first. If so, might as // well use it. (It's a little arbitrary whether we use this one vs. the reverse route @@ -475,7 +475,7 @@ impl RelayActor { let url1 = url.clone(); // building a client dials the relay - let mut builder = relay::http::ClientBuilder::new(url1.clone()); + let mut builder = relay::client::ClientBuilder::new(url1.clone()); if let Some(url) = self.msock.proxy_url() { builder = builder.proxy_url(url.clone()); } diff --git a/iroh-net/src/metrics.rs b/iroh-net/src/metrics.rs index 8449a33572..1bd5d09480 100644 --- a/iroh-net/src/metrics.rs +++ b/iroh-net/src/metrics.rs @@ -2,4 +2,5 @@ pub use crate::magicsock::Metrics as MagicsockMetrics; pub use crate::netcheck::Metrics as NetcheckMetrics; pub use crate::portmapper::Metrics as PortmapMetrics; -pub use crate::relay::Metrics as RelayMetrics; +#[cfg(feature = "iroh-relay")] +pub use crate::relay::server::Metrics as RelayMetrics; diff --git a/iroh-net/src/relay.rs b/iroh-net/src/relay.rs index b9d9f1fb06..4c7f5c6dc0 100644 --- a/iroh-net/src/relay.rs +++ b/iroh-net/src/relay.rs @@ -11,25 +11,17 @@ #![deny(missing_docs, rustdoc::broken_intra_doc_links)] pub(crate) mod client; -#[cfg(feature = "iroh-relay")] -pub(crate) mod client_conn; -#[cfg(feature = "iroh-relay")] -pub(crate) mod clients; pub(crate) mod codec; pub mod http; -#[cfg(feature = "iroh-relay")] -pub mod iroh_relay; mod map; -mod metrics; #[cfg(feature = "iroh-relay")] -pub(crate) mod server; -pub(crate) mod types; +pub mod server; -pub use self::client::{Client as RelayClient, ReceivedMessage}; +pub use self::client::conn::{Conn as RelayConn, ReceivedMessage}; +pub use self::client::{ + Client as HttpClient, ClientBuilder as HttpClientBuilder, ClientError as HttpClientError, + ClientReceiver as HttpClientReceiver, +}; pub use self::codec::MAX_PACKET_SIZE; -pub use self::http::Client as HttpClient; pub use self::map::{RelayMap, RelayMode, RelayNode}; -pub use self::metrics::Metrics; -#[cfg(feature = "iroh-relay")] -pub use self::server::{ClientConnHandler, MaybeTlsStream as MaybeTlsStreamServer, Server}; pub use iroh_base::node_addr::RelayUrl; diff --git a/iroh-net/src/relay/client.rs b/iroh-net/src/relay/client.rs index 1fa34b1efd..35a4e4af98 100644 --- a/iroh-net/src/relay/client.rs +++ b/iroh-net/src/relay/client.rs @@ -1,511 +1,1112 @@ -//! based on tailscale/derp/derp_client.go -use std::net::SocketAddr; -use std::pin::Pin; +//! 
Based on tailscale/derp/derphttp/derphttp_client.go + +use std::collections::HashMap; +use std::net::{IpAddr, SocketAddr}; use std::sync::Arc; -use std::task::{Context, Poll}; use std::time::Duration; -use anyhow::{anyhow, bail, ensure, Result}; +use base64::{engine::general_purpose::URL_SAFE, Engine as _}; use bytes::Bytes; -use futures_lite::Stream; -use futures_sink::Sink; -use futures_util::stream::{SplitSink, SplitStream, StreamExt}; -use futures_util::SinkExt; -use tokio::sync::mpsc; -use tokio_tungstenite_wasm::WebSocketStream; +use futures_lite::future::Boxed as BoxFuture; +use futures_util::StreamExt; +use http_body_util::Empty; +use hyper::body::Incoming; +use hyper::header::UPGRADE; +use hyper::upgrade::Parts; +use hyper::Request; +use hyper_util::rt::TokioIo; +use rand::Rng; +use rustls::client::Resumption; +use tokio::io::{AsyncRead, AsyncWrite}; +use tokio::net::TcpStream; +use tokio::sync::{mpsc, oneshot}; +use tokio::task::JoinSet; +use tokio::time::Instant; use tokio_util::codec::{FramedRead, FramedWrite}; -use tracing::{debug, info_span, trace, Instrument}; - -use super::codec::PER_CLIENT_READ_QUEUE_DEPTH; -use super::http::streams::{MaybeTlsStreamReader, MaybeTlsStreamWriter}; -use super::{ - codec::{ - write_frame, DerpCodec, Frame, MAX_PACKET_SIZE, PER_CLIENT_SEND_QUEUE_DEPTH, - PROTOCOL_VERSION, - }, - types::{ClientInfo, RateLimiter}, +use tracing::{debug, error, event, info_span, trace, warn, Instrument, Level}; +use url::Url; + +use conn::{ + Conn as RelayClient, ConnBuilder as RelayClientBuilder, ConnReader, + ConnReceiver as RelayClientReceiver, ConnWriter, ReceivedMessage, }; +use streams::{downcast_upgrade, MaybeTlsStream, ProxyStream}; +use crate::dns::{DnsResolver, ResolverExt}; use crate::key::{PublicKey, SecretKey}; +use crate::relay::codec::DerpCodec; +use crate::relay::http::{Protocol, RELAY_PATH}; +use crate::relay::RelayUrl; +use crate::util::chain; use crate::util::AbortingJoinHandle; -const CLIENT_RECV_TIMEOUT: Duration = Duration::from_secs(120); +pub(crate) mod conn; +pub(crate) mod streams; -impl PartialEq for Client { - fn eq(&self, other: &Self) -> bool { - Arc::ptr_eq(&self.inner, &other.inner) - } -} +const DIAL_NODE_TIMEOUT: Duration = Duration::from_millis(1500); +const PING_TIMEOUT: Duration = Duration::from_secs(5); +const CONNECT_TIMEOUT: Duration = Duration::from_secs(10); +const DNS_TIMEOUT: Duration = Duration::from_secs(1); -impl Eq for Client {} +/// Possible connection errors on the [`Client`] +#[derive(Debug, thiserror::Error)] +pub enum ClientError { + /// The client is closed + #[error("client is closed")] + Closed, + /// There no underlying relay [`super::client::Client`] client exists for this http relay [`Client`] + #[error("no relay client")] + NoClient, + /// There was an error sending a packet + #[error("error sending a packet")] + Send, + /// There was an error receiving a packet + #[error("error receiving a packet: {0:?}")] + Receive(anyhow::Error), + /// There was a connection timeout error + #[error("connect timeout")] + ConnectTimeout, + /// No relay nodes are available + #[error("Relay node is not available")] + RelayNodeNotAvail, + /// No relay nodes are available with that name + #[error("no nodes available for {0}")] + NoNodeForTarget(String), + /// The relay node specified only allows STUN requests + #[error("no relay nodes found for {0}, only are stun_only nodes")] + StunOnlyNodesFound(String), + /// There was an error dialing + #[error("dial error")] + DialIO(#[from] std::io::Error), + /// There was an error from 
the task doing the dialing + #[error("dial error")] + DialTask(#[from] tokio::task::JoinError), + /// Both IPv4 and IPv6 are disabled for this relay node + #[error("both IPv4 and IPv6 are explicitly disabled for this node")] + IPDisabled, + /// No local addresses exist + #[error("no local addr: {0}")] + NoLocalAddr(String), + /// There was http server [`hyper::Error`] + #[error("http connection error")] + Hyper(#[from] hyper::Error), + /// There was an http error [`http::Error`]. + #[error("http error")] + Http(#[from] http::Error), + /// There was an unexpected status code + #[error("unexpected status code: expected {0}, got {1}")] + UnexpectedStatusCode(hyper::StatusCode, hyper::StatusCode), + /// The connection failed to upgrade + #[error("failed to upgrade connection: {0}")] + Upgrade(String), + /// The connection failed to proxy + #[error("failed to proxy connection: {0}")] + Proxy(String), + /// The relay [`super::client::Client`] failed to build + #[error("failed to build relay client: {0}")] + Build(String), + /// The ping request timed out + #[error("ping timeout")] + PingTimeout, + /// The ping request was aborted + #[error("ping aborted")] + PingAborted, + /// This [`Client`] cannot acknowledge pings + #[error("cannot acknowledge pings")] + CannotAckPings, + /// The given [`Url`] is invalid + #[error("invalid url: {0}")] + InvalidUrl(String), + /// There was an error with DNS resolution + #[error("dns: {0:?}")] + Dns(Option), + /// There was a timeout resolving DNS. + #[error("dns timeout")] + DnsTimeout, + /// The inner actor is gone, likely means things are shutdown. + #[error("actor gone")] + ActorGone, + /// An error related to websockets, either errors with parsing ws messages or the handshake + #[error("websocket error: {0}")] + WebsocketError(#[from] tokio_tungstenite_wasm::Error), +} -/// A relay Client. +/// An HTTP Relay client. +/// /// Cheaply clonable. -/// Call `close` to shutdown the write loop and read functionality. -#[derive(Debug, Clone)] +#[derive(Clone, Debug)] pub struct Client { - inner: Arc, + inner: mpsc::Sender, + public_key: PublicKey, + #[allow(dead_code)] + recv_loop: Arc>, +} + +#[derive(Debug)] +enum ActorMessage { + Connect(oneshot::Sender>), + NotePreferred(bool), + LocalAddr(oneshot::Sender, ClientError>>), + Ping(oneshot::Sender>), + Pong([u8; 8], oneshot::Sender>), + Send(PublicKey, Bytes, oneshot::Sender>), + Close(oneshot::Sender>), + CloseForReconnect(oneshot::Sender>), + IsConnected(oneshot::Sender>), } +/// Receiving end of a [`Client`]. #[derive(Debug)] pub struct ClientReceiver { - /// The reader channel, receiving incoming messages. - reader_channel: mpsc::Receiver>, + msg_receiver: mpsc::Receiver>, } -impl ClientReceiver { - /// Reads a messages from a relay server. 
+#[derive(derive_more::Debug)]
+struct Actor {
+    secret_key: SecretKey,
+    can_ack_pings: bool,
+    is_preferred: bool,
+    relay_client: Option<(RelayClient, RelayClientReceiver)>,
+    is_closed: bool,
+    #[debug("address family selector callback")]
+    address_family_selector: Option<Box<dyn Fn() -> BoxFuture<bool> + Send + Sync + 'static>>,
+    conn_gen: usize,
+    url: RelayUrl,
+    protocol: Protocol,
+    #[debug("TlsConnector")]
+    tls_connector: tokio_rustls::TlsConnector,
+    pings: PingTracker,
+    ping_tasks: JoinSet<()>,
+    dns_resolver: DnsResolver,
+    proxy_url: Option<Url>,
+}
+
+#[derive(Default, Debug)]
+struct PingTracker(HashMap<[u8; 8], oneshot::Sender<()>>);
+
+impl PingTracker {
+    /// Note that we have sent a ping, and store the [`oneshot::Sender`] we
+    /// must notify when the pong returns
+    fn register(&mut self) -> ([u8; 8], oneshot::Receiver<()>) {
+        let data = rand::thread_rng().gen::<[u8; 8]>();
+        let (send, recv) = oneshot::channel();
+        self.0.insert(data, send);
+        (data, recv)
+    }
+
+    /// Remove the associated [`oneshot::Sender`] for `data` & return it.
     ///
-    /// Once it returns an error, the [`Client`] is dead forever.
-    pub async fn recv(&mut self) -> Result<ReceivedMessage> {
-        let msg = self
-            .reader_channel
-            .recv()
-            .await
-            .ok_or(anyhow!("shut down"))??;
-        Ok(msg)
+    /// If there is no [`oneshot::Sender`] in the tracker, return `None`.
+    fn unregister(&mut self, data: [u8; 8], why: &'static str) -> Option<oneshot::Sender<()>> {
+        trace!("removing ping {}: {}", hex::encode(data), why);
+        self.0.remove(&data)
     }
 }
 
+/// Build a Client.
 #[derive(derive_more::Debug)]
-pub struct InnerClient {
-    /// Our local address, if known.
+pub struct ClientBuilder {
+    /// Default is false
+    can_ack_pings: bool,
+    /// Default is false
+    is_preferred: bool,
+    /// Default is None
+    #[debug("address family selector callback")]
+    address_family_selector: Option<Box<dyn Fn() -> BoxFuture<bool> + Send + Sync + 'static>>,
+    /// Default is false
+    is_prober: bool,
+    /// Expected PublicKey of the server
+    server_public_key: Option<PublicKey>,
+    /// Server url.
+    url: RelayUrl,
+    /// Relay protocol
+    protocol: Protocol,
+    /// Allow self-signed certificates from relay servers
+    #[cfg(any(test, feature = "test-utils"))]
+    insecure_skip_cert_verify: bool,
+    /// HTTP Proxy
+    proxy_url: Option<Url>,
+}
+
+impl ClientBuilder {
+    /// Create a new [`ClientBuilder`]
+    pub fn new(url: impl Into<RelayUrl>) -> Self {
+        ClientBuilder {
+            can_ack_pings: false,
+            is_preferred: false,
+            address_family_selector: None,
+            is_prober: false,
+            server_public_key: None,
+            url: url.into(),
+            protocol: Protocol::Relay,
+            #[cfg(any(test, feature = "test-utils"))]
+            insecure_skip_cert_verify: false,
+            proxy_url: None,
+        }
+    }
+
+    /// Sets the server url
+    pub fn server_url(mut self, url: impl Into<RelayUrl>) -> Self {
+        self.url = url.into();
+        self
+    }
+
+    /// Sets whether to connect to the relay via websockets or not.
+    /// Set to use non-websocket, normal relaying by default.
+    pub fn protocol(mut self, protocol: Protocol) -> Self {
+        self.protocol = protocol;
+        self
+    }
+
+    /// Returns whether we should prefer IPv6.
+    ///
+    /// It replaces the relayhttp.AddressFamilySelector we pass. It provides a
+    /// hint as to whether, in an IPv4-vs-IPv6 race, IPv4 should be held back a
+    /// bit to give IPv6 a better-than-50/50 chance of winning. We only return
+    /// true when we believe IPv6 will work anyway, so we don't artificially
+    /// delay the connection speed.
+ pub fn address_family_selector(mut self, selector: S) -> Self + where + S: Fn() -> BoxFuture + Send + Sync + 'static, + { + self.address_family_selector = Some(Box::new(selector)); + self + } + + /// Enable this [`Client`] to acknowledge pings. + pub fn can_ack_pings(mut self, can: bool) -> Self { + self.can_ack_pings = can; + self + } + + /// Indicate this client is the preferred way to communicate + /// to the peer with this client's [`PublicKey`] + pub fn is_preferred(mut self, is: bool) -> Self { + self.is_preferred = is; + self + } + + /// Indicates this client is a prober + pub fn is_prober(mut self, is: bool) -> Self { + self.is_prober = is; + self + } + + /// Skip the verification of the relay server's SSL certificates. /// - /// Is `None` in tests or when using websockets (because we don't control connection establishment in browsers). - local_addr: Option, - /// Channel on which to communicate to the server. The associated [`mpsc::Receiver`] will close - /// if there is ever an error writing to the server. - writer_channel: mpsc::Sender, - /// JoinHandle for the [`ClientWriter`] task - writer_task: AbortingJoinHandle>, - reader_task: AbortingJoinHandle<()>, + /// May only be used in tests. + #[cfg(any(test, feature = "test-utils"))] + pub fn insecure_skip_cert_verify(mut self, skip: bool) -> Self { + self.insecure_skip_cert_verify = skip; + self + } + + /// Set an explicit proxy url to proxy all HTTP(S) traffic through. + pub fn proxy_url(mut self, url: Url) -> Self { + self.proxy_url.replace(url); + self + } + + /// Build the [`Client`] + pub fn build(self, key: SecretKey, dns_resolver: DnsResolver) -> (Client, ClientReceiver) { + // TODO: review TLS config + let mut roots = rustls::RootCertStore::empty(); + roots.add_trust_anchors(webpki_roots::TLS_SERVER_ROOTS.iter().map(|ta| { + rustls::OwnedTrustAnchor::from_subject_spki_name_constraints( + ta.subject, + ta.spki, + ta.name_constraints, + ) + })); + let mut config = rustls::client::ClientConfig::builder() + .with_safe_defaults() + .with_root_certificates(roots) + .with_no_client_auth(); + #[cfg(any(test, feature = "test-utils"))] + if self.insecure_skip_cert_verify { + warn!("Insecure config: SSL certificates from relay servers will be trusted without verification"); + config + .dangerous() + .set_certificate_verifier(Arc::new(NoCertVerifier)); + } + + config.resumption = Resumption::default(); + + let tls_connector: tokio_rustls::TlsConnector = Arc::new(config).into(); + let public_key = key.public(); + + let inner = Actor { + secret_key: key, + can_ack_pings: self.can_ack_pings, + is_preferred: self.is_preferred, + relay_client: None, + is_closed: false, + address_family_selector: self.address_family_selector, + conn_gen: 0, + pings: PingTracker::default(), + ping_tasks: Default::default(), + url: self.url, + protocol: self.protocol, + tls_connector, + dns_resolver, + proxy_url: self.proxy_url, + }; + + let (msg_sender, inbox) = mpsc::channel(64); + let (s, r) = mpsc::channel(64); + let recv_loop = tokio::task::spawn( + async move { inner.run(inbox, s).await }.instrument(info_span!("client")), + ); + + ( + Client { + public_key, + inner: msg_sender, + recv_loop: Arc::new(recv_loop.into()), + }, + ClientReceiver { msg_receiver: r }, + ) + } + + /// The expected [`PublicKey`] of the relay server we are connecting to. + pub fn server_public_key(mut self, server_public_key: PublicKey) -> Self { + self.server_public_key = Some(server_public_key); + self + } +} + +impl ClientReceiver { + /// Reads a message from the server. 
Returns the message and the `conn_gen`, i.e. the number of
+    /// re-connections this Client has ever made
+    pub async fn recv(&mut self) -> Option<Result<(ReceivedMessage, usize), ClientError>> {
+        self.msg_receiver.recv().await
+    }
 }
 
 impl Client {
-    /// Sends a packet to the node identified by `dstkey`
-    ///
-    /// Errors if the packet is larger than [`super::MAX_PACKET_SIZE`]
-    pub async fn send(&self, dstkey: PublicKey, packet: Bytes) -> Result<()> {
-        trace!(%dstkey, len = packet.len(), "[RELAY] send");
+    /// The public key for this client
+    pub fn public_key(&self) -> PublicKey {
+        self.public_key
+    }
 
-        self.inner
-            .writer_channel
-            .send(ClientWriterMessage::Packet((dstkey, packet)))
-            .await?;
-        Ok(())
+    async fn send_actor<F, T>(&self, msg_create: F) -> Result<T, ClientError>
+    where
+        F: FnOnce(oneshot::Sender<Result<T, ClientError>>) -> ActorMessage,
+    {
+        let (s, r) = oneshot::channel();
+        let msg = msg_create(s);
+        match self.inner.send(msg).await {
+            Ok(_) => {
+                let res = r.await.map_err(|_| ClientError::ActorGone)??;
+                Ok(res)
+            }
+            Err(_) => Err(ClientError::ActorGone),
+        }
     }
 
-    /// Send a ping with 8 bytes of random data.
-    pub async fn send_ping(&self, data: [u8; 8]) -> Result<()> {
-        self.inner
-            .writer_channel
-            .send(ClientWriterMessage::Ping(data))
-            .await?;
-        Ok(())
+    /// Connects to a relay Server and returns the underlying relay Client.
+    ///
+    /// Returns [`ClientError::Closed`] if the [`Client`] is closed.
+    ///
+    /// If there is already an active relay connection, returns the already
+    /// connected [`crate::relay::RelayConn`].
+    pub async fn connect(&self) -> Result<(RelayClient, usize), ClientError> {
+        self.send_actor(ActorMessage::Connect).await
     }
 
-    /// Respond to a ping request. The `data` field should be filled
-    /// by the 8 bytes of random data send by the ping.
-    pub async fn send_pong(&self, data: [u8; 8]) -> Result<()> {
+    /// Let the server know that this client is the preferred client
+    pub async fn note_preferred(&self, is_preferred: bool) {
         self.inner
-            .writer_channel
-            .send(ClientWriterMessage::Pong(data))
-            .await?;
-        Ok(())
+            .send(ActorMessage::NotePreferred(is_preferred))
+            .await
+            .ok();
     }
 
-    /// Sends a packet that tells the server whether this
-    /// client is the user's preferred server. This is only
-    /// used in the server for stats.
-    pub async fn note_preferred(&self, preferred: bool) -> Result<()> {
-        self.inner
-            .writer_channel
-            .send(ClientWriterMessage::NotePreferred(preferred))
-            .await?;
-        Ok(())
+    /// Get the local addr of the connection. If there is no current underlying relay connection
+    /// or the [`Client`] is closed, returns `None`.
+    pub async fn local_addr(&self) -> Option<SocketAddr> {
+        self.send_actor(ActorMessage::LocalAddr)
+            .await
+            .ok()
+            .flatten()
     }
 
-    /// The local address that the [`Client`] is listening on.
+    /// Send a ping to the server. Return once we get an expected pong.
     ///
-    /// `None`, when run in a testing environment or when using websockets.
-    pub fn local_addr(&self) -> Option<SocketAddr> {
-        self.inner.local_addr
+    /// There must be a task polling `recv_detail` to process the `pong` response.
+    pub async fn ping(&self) -> Result<Duration, ClientError> {
+        self.send_actor(ActorMessage::Ping).await
     }
 
-    /// Whether or not this [`Client`] is closed.
+    /// Send a pong back to the server.
     ///
-    /// The [`Client`] is considered closed if the write side of the client is no longer running.
-    pub fn is_closed(&self) -> bool {
-        self.inner.writer_task.is_finished()
+    /// If there is no underlying active relay connection, it creates one before attempting to
+    /// send the pong message.
+ /// + /// If there is an error sending pong, it closes the underlying relay connection before + /// returning. + pub async fn send_pong(&self, data: [u8; 8]) -> Result<(), ClientError> { + self.send_actor(|s| ActorMessage::Pong(data, s)).await } - /// Close the client + /// Send a packet to the server. /// - /// Shuts down the write loop directly and marks the client as closed. The [`Client`] will - /// check if the client is closed before attempting to read from it. - pub async fn close(&self) { - if self.inner.writer_task.is_finished() && self.inner.reader_task.is_finished() { - return; - } + /// If there is no underlying active relay connection, it creates one before attempting to + /// send the message. + /// + /// If there is an error sending the packet, it closes the underlying relay connection before + /// returning. + pub async fn send(&self, dst_key: PublicKey, b: Bytes) -> Result<(), ClientError> { + self.send_actor(|s| ActorMessage::Send(dst_key, b, s)).await + } - self.inner - .writer_channel - .send(ClientWriterMessage::Shutdown) - .await - .ok(); - self.inner.reader_task.abort(); + /// Close the http relay connection. + pub async fn close(self) -> Result<(), ClientError> { + self.send_actor(ActorMessage::Close).await } -} -fn process_incoming_frame(frame: Frame) -> Result { - match frame { - Frame::KeepAlive => { - // A one-way keep-alive message that doesn't require an ack. - // This predated FrameType::Ping/FrameType::Pong. - Ok(ReceivedMessage::KeepAlive) - } - Frame::PeerGone { peer } => Ok(ReceivedMessage::PeerGone(peer)), - Frame::RecvPacket { src_key, content } => { - let packet = ReceivedMessage::ReceivedPacket { - source: src_key, - data: content, - }; - Ok(packet) - } - Frame::Ping { data } => Ok(ReceivedMessage::Ping(data)), - Frame::Pong { data } => Ok(ReceivedMessage::Pong(data)), - Frame::Health { problem } => { - let problem = std::str::from_utf8(&problem)?.to_owned(); - let problem = Some(problem); - Ok(ReceivedMessage::Health { problem }) - } - Frame::Restarting { - reconnect_in, - try_for, - } => { - let reconnect_in = Duration::from_millis(reconnect_in as u64); - let try_for = Duration::from_millis(try_for as u64); - Ok(ReceivedMessage::ServerRestarting { - reconnect_in, - try_for, - }) - } - _ => bail!("unexpected packet: {:?}", frame.typ()), + /// Disconnect the http relay connection. + pub async fn close_for_reconnect(&self) -> Result<(), ClientError> { + self.send_actor(ActorMessage::CloseForReconnect).await } -} -/// The kinds of messages we can send to the [`super::server::Server`] -#[derive(Debug)] -enum ClientWriterMessage { - /// Send a packet (addressed to the [`PublicKey`]) to the server - Packet((PublicKey, Bytes)), - /// Send a pong to the server - Pong([u8; 8]), - /// Send a ping to the server - Ping([u8; 8]), - /// Tell the server whether or not this client is the user's preferred client - NotePreferred(bool), - /// Shutdown the writer - Shutdown, + /// Returns `true` if the underlying relay connection is established. + pub async fn is_connected(&self) -> Result { + self.send_actor(ActorMessage::IsConnected).await + } } -/// Call [`ClientWriter::run`] to listen for messages to send to the client. -/// Should be used by the [`Client`] -/// -/// Shutsdown when you send a [`ClientWriterMessage::Shutdown`], or if there is an error writing to -/// the server. 
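The new `Client` above funnels every public method through `send_actor`: package a `oneshot::Sender` into an `ActorMessage`, post it to the actor's inbox, and await the reply. A standalone, hedged sketch of that request/response pattern (names here are illustrative, not the crate's):

```rust
use tokio::sync::{mpsc, oneshot};

#[derive(Debug)]
enum Msg {
    // Each request variant carries the oneshot sender used to answer it.
    IsConnected(oneshot::Sender<bool>),
}

/// Caller side: create the reply channel, send the request, await the answer.
async fn request_is_connected(inbox: &mpsc::Sender<Msg>) -> Option<bool> {
    let (tx, rx) = oneshot::channel();
    // A failure on either channel means the actor is gone, which is what
    // `ClientError::ActorGone` models above.
    inbox.send(Msg::IsConnected(tx)).await.ok()?;
    rx.await.ok()
}

/// Actor side: answer each request on its embedded oneshot.
async fn actor_loop(mut inbox: mpsc::Receiver<Msg>, connected: bool) {
    while let Some(msg) = inbox.recv().await {
        match msg {
            Msg::IsConnected(reply) => {
                reply.send(connected).ok();
            }
        }
    }
}
```

Note how the double `??` in `send_actor` above collapses two failure layers: the oneshot being dropped, and the `ClientError` carried inside a successful reply.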
-struct ClientWriter { - recv_msgs: mpsc::Receiver, - writer: ConnWriter, - rate_limiter: Option, -} +impl Actor { + async fn run( + mut self, + mut inbox: mpsc::Receiver, + msg_sender: mpsc::Sender>, + ) { + // Add an initial connection attempt. + if let Err(err) = self.connect("initial connect").await { + msg_sender.send(Err(err)).await.ok(); + } -impl ClientWriter { - async fn run(mut self) -> Result<()> { - while let Some(msg) = self.recv_msgs.recv().await { - match msg { - ClientWriterMessage::Packet((key, bytes)) => { - send_packet(&mut self.writer, &self.rate_limiter, key, bytes).await?; - } - ClientWriterMessage::Pong(data) => { - write_frame(&mut self.writer, Frame::Pong { data }, None).await?; - self.writer.flush().await?; - } - ClientWriterMessage::Ping(data) => { - write_frame(&mut self.writer, Frame::Ping { data }, None).await?; - self.writer.flush().await?; + loop { + tokio::select! { + res = self.recv_detail() => { + if let Ok((ReceivedMessage::Pong(ping), _)) = res { + match self.pings.unregister(ping, "pong") { + Some(chan) => { + if chan.send(()).is_err() { + warn!("pong received for ping {ping:?}, but the receiving channel was closed"); + } + } + None => { + warn!("pong received for ping {ping:?}, but not registered"); + } + } + continue; + } + msg_sender.send(res).await.ok(); } - ClientWriterMessage::NotePreferred(preferred) => { - write_frame(&mut self.writer, Frame::NotePreferred { preferred }, None).await?; - self.writer.flush().await?; + Some(msg) = inbox.recv() => { + match msg { + ActorMessage::Connect(s) => { + let res = self.connect("actor msg").await.map(|(client, _, count)| (client, count)); + s.send(res).ok(); + }, + ActorMessage::NotePreferred(is_preferred) => { + self.note_preferred(is_preferred).await; + }, + ActorMessage::LocalAddr(s) => { + let res = self.local_addr(); + s.send(Ok(res)).ok(); + }, + ActorMessage::Ping(s) => { + self.ping(s).await; + }, + ActorMessage::Pong(data, s) => { + let res = self.send_pong(data).await; + s.send(res).ok(); + }, + ActorMessage::Send(key, data, s) => { + let res = self.send(key, data).await; + s.send(res).ok(); + }, + ActorMessage::Close(s) => { + let res = self.close().await; + s.send(Ok(res)).ok(); + // shutting down + break; + }, + ActorMessage::CloseForReconnect(s) => { + let res = self.close_for_reconnect().await; + s.send(Ok(res)).ok(); + }, + ActorMessage::IsConnected(s) => { + let res = self.is_connected(); + s.send(Ok(res)).ok(); + }, + } } - ClientWriterMessage::Shutdown => { - return Ok(()); + else => { + // Shutting down + self.close().await; + break; } } } + } + + async fn connect( + &mut self, + why: &'static str, + ) -> Result<(RelayClient, &'_ mut RelayClientReceiver, usize), ClientError> { + debug!( + "connect: {}, current client {}", + why, + self.relay_client.is_some() + ); + + if self.is_closed { + return Err(ClientError::Closed); + } + async move { + if self.relay_client.is_none() { + trace!("no connection, trying to connect"); + let (relay_client, receiver) = + tokio::time::timeout(CONNECT_TIMEOUT, self.connect_0()) + .await + .map_err(|_| ClientError::ConnectTimeout)??; + + self.relay_client = Some((relay_client.clone(), receiver)); + self.next_conn(); + } else { + trace!("already had connection"); + } + let count = self.current_conn(); + let (relay_client, receiver) = self + .relay_client + .as_mut() + .map(|(c, r)| (c.clone(), r)) + .expect("just checked"); - bail!("channel unexpectedly closed"); + Ok((relay_client, receiver, count)) + } + .instrument(info_span!("connect")) + .await } -} -/// 
The Builder returns a [`Client`] and a started [`ClientWriter`] run task. -pub struct ClientBuilder { - secret_key: SecretKey, - reader: ConnReader, - writer: ConnWriter, - local_addr: Option, -} + async fn connect_0(&self) -> Result<(RelayClient, RelayClientReceiver), ClientError> { + let (reader, writer, local_addr) = match self.protocol { + Protocol::Websocket => { + let (reader, writer) = self.connect_ws().await?; + let local_addr = None; + (reader, writer, local_addr) + } + Protocol::Relay => { + let (reader, writer, local_addr) = self.connect_derp().await?; + (reader, writer, Some(local_addr)) + } + }; -pub(crate) enum ConnReader { - Derp(FramedRead), - Ws(SplitStream), -} + let (relay_client, receiver) = + RelayClientBuilder::new(self.secret_key.clone(), local_addr, reader, writer) + .build() + .await + .map_err(|e| ClientError::Build(e.to_string()))?; -pub(crate) enum ConnWriter { - Derp(FramedWrite), - Ws(SplitSink), -} + if self.is_preferred && relay_client.note_preferred(true).await.is_err() { + relay_client.close().await; + return Err(ClientError::Send); + } -fn tung_wasm_to_io_err(e: tokio_tungstenite_wasm::Error) -> std::io::Error { - match e { - tokio_tungstenite_wasm::Error::Io(io_err) => io_err, - _ => std::io::Error::new(std::io::ErrorKind::Other, e.to_string()), + event!( + target: "events.net.relay.connected", + Level::DEBUG, + home = self.is_preferred, + url = %self.url, + ); + + trace!("connect_0 done"); + Ok((relay_client, receiver)) } -} -impl Stream for ConnReader { - type Item = Result; + async fn connect_ws(&self) -> Result<(ConnReader, ConnWriter), ClientError> { + let mut dial_url = (*self.url).clone(); + dial_url.set_path(RELAY_PATH); + // The relay URL is exchanged with the http(s) scheme in tickets and similar. + // We need to use the ws:// or wss:// schemes when connecting with websockets, though. + dial_url + .set_scheme(if self.use_tls() { "wss" } else { "ws" }) + .map_err(|()| ClientError::InvalidUrl(self.url.to_string()))?; - fn poll_next(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { - match *self { - Self::Derp(ref mut ws) => Pin::new(ws).poll_next(cx), - Self::Ws(ref mut ws) => match Pin::new(ws).poll_next(cx) { - Poll::Ready(Some(Ok(tokio_tungstenite_wasm::Message::Binary(vec)))) => { - Poll::Ready(Some(Frame::decode_from_ws_msg(vec))) - } - Poll::Ready(Some(Ok(msg))) => { - tracing::warn!(?msg, "Got websocket message of unsupported type, skipping."); - Poll::Pending + debug!(%dial_url, "Dialing relay by websocket"); + + let (writer, reader) = tokio_tungstenite_wasm::connect(dial_url).await?.split(); + + let reader = ConnReader::Ws(reader); + let writer = ConnWriter::Ws(writer); + + Ok((reader, writer)) + } + + async fn connect_derp(&self) -> Result<(ConnReader, ConnWriter, SocketAddr), ClientError> { + let tcp_stream = self.dial_url().await?; + + let local_addr = tcp_stream + .local_addr() + .map_err(|e| ClientError::NoLocalAddr(e.to_string()))?; + + debug!(server_addr = ?tcp_stream.peer_addr(), %local_addr, "TCP stream connected"); + + let response = if self.use_tls() { + debug!("Starting TLS handshake"); + let hostname = self + .tls_servername() + .ok_or_else(|| ClientError::InvalidUrl("No tls servername".into()))?; + let tls_stream = self.tls_connector.connect(hostname, tcp_stream).await?; + debug!("tls_connector connect success"); + Self::start_upgrade(tls_stream).await? + } else { + debug!("Starting handshake"); + Self::start_upgrade(tcp_stream).await? 
+ }; + + if response.status() != hyper::StatusCode::SWITCHING_PROTOCOLS { + error!( + "expected status 101 SWITCHING_PROTOCOLS, got: {}", + response.status() + ); + return Err(ClientError::UnexpectedStatusCode( + hyper::StatusCode::SWITCHING_PROTOCOLS, + response.status(), + )); + } + + debug!("starting upgrade"); + let upgraded = match hyper::upgrade::on(response).await { + Ok(upgraded) => upgraded, + Err(err) => { + warn!("upgrade failed: {:#}", err); + return Err(ClientError::Hyper(err)); + } + }; + + debug!("connection upgraded"); + let (reader, writer) = + downcast_upgrade(upgraded).map_err(|e| ClientError::Upgrade(e.to_string()))?; + + let reader = ConnReader::Derp(FramedRead::new(reader, DerpCodec)); + let writer = ConnWriter::Derp(FramedWrite::new(writer, DerpCodec)); + + Ok((reader, writer, local_addr)) + } + + /// Sends the HTTP upgrade request to the relay server. + async fn start_upgrade(io: T) -> Result, ClientError> + where + T: AsyncRead + AsyncWrite + Send + Unpin + 'static, + { + let io = hyper_util::rt::TokioIo::new(io); + let (mut request_sender, connection) = hyper::client::conn::http1::Builder::new() + .handshake(io) + .await?; + tokio::spawn( + // This task drives the HTTP exchange, completes once connection is upgraded. + async move { + debug!("HTTP upgrade driver started"); + if let Err(err) = connection.with_upgrades().await { + error!("HTTP upgrade error: {err:#}"); } - Poll::Ready(Some(Err(e))) => Poll::Ready(Some(Err(e.into()))), - Poll::Ready(None) => Poll::Ready(None), - Poll::Pending => Poll::Pending, - }, + debug!("HTTP upgrade driver finished"); + } + .instrument(info_span!("http-driver")), + ); + debug!("Sending upgrade request"); + let req = Request::builder() + .uri(RELAY_PATH) + .header(UPGRADE, Protocol::Relay.upgrade_header()) + .body(http_body_util::Empty::::new())?; + request_sender.send_request(req).await.map_err(From::from) + } + + async fn note_preferred(&mut self, is_preferred: bool) { + let old = &mut self.is_preferred; + if *old == is_preferred { + return; + } + *old = is_preferred; + + // only send the preference if we already have a connection + let res = { + if let Some((ref client, _)) = self.relay_client { + client.note_preferred(is_preferred).await + } else { + return; + } + }; + // need to do this outside the above closure because they rely on the same lock + // if there was an error sending, close the underlying relay connection + if res.is_err() { + self.close_for_reconnect().await; } } -} -impl Sink for ConnWriter { - type Error = std::io::Error; + fn local_addr(&self) -> Option { + if self.is_closed { + return None; + } + if let Some((ref client, _)) = self.relay_client { + client.local_addr() + } else { + None + } + } + + async fn ping(&mut self, s: oneshot::Sender>) { + let connect_res = self.connect("ping").await.map(|(c, _, _)| c); + let (ping, recv) = self.pings.register(); + trace!("ping: {}", hex::encode(ping)); + + self.ping_tasks.spawn(async move { + let res = match connect_res { + Ok(client) => { + let start = Instant::now(); + if let Err(err) = client.send_ping(ping).await { + warn!("failed to send ping: {:?}", err); + Err(ClientError::Send) + } else { + match tokio::time::timeout(PING_TIMEOUT, recv).await { + Ok(Ok(())) => Ok(start.elapsed()), + Err(_) => Err(ClientError::PingTimeout), + Ok(Err(_)) => Err(ClientError::PingAborted), + } + } + } + Err(err) => Err(err), + }; + s.send(res).ok(); + }); + } - fn poll_ready(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { - match *self { - Self::Derp(ref mut ws) => 
Pin::new(ws).poll_ready(cx), - Self::Ws(ref mut ws) => Pin::new(ws).poll_ready(cx).map_err(tung_wasm_to_io_err), + async fn send(&mut self, dst_key: PublicKey, b: Bytes) -> Result<(), ClientError> { + trace!(dst = %dst_key.fmt_short(), len = b.len(), "send"); + let (client, _, _) = self.connect("send").await?; + if client.send(dst_key, b).await.is_err() { + self.close_for_reconnect().await; + return Err(ClientError::Send); } + Ok(()) } - fn start_send(mut self: Pin<&mut Self>, item: Frame) -> Result<(), Self::Error> { - match *self { - Self::Derp(ref mut ws) => Pin::new(ws).start_send(item), - Self::Ws(ref mut ws) => Pin::new(ws) - .start_send(tokio_tungstenite_wasm::Message::binary( - item.encode_for_ws_msg(), - )) - .map_err(tung_wasm_to_io_err), + async fn send_pong(&mut self, data: [u8; 8]) -> Result<(), ClientError> { + debug!("send_pong"); + if self.can_ack_pings { + let (client, _, _) = self.connect("send_pong").await?; + if client.send_pong(data).await.is_err() { + self.close_for_reconnect().await; + return Err(ClientError::Send); + } + Ok(()) + } else { + Err(ClientError::CannotAckPings) } } - fn poll_flush(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { - match *self { - Self::Derp(ref mut ws) => Pin::new(ws).poll_flush(cx), - Self::Ws(ref mut ws) => Pin::new(ws).poll_flush(cx).map_err(tung_wasm_to_io_err), + async fn close(mut self) { + if !self.is_closed { + self.is_closed = true; + self.close_for_reconnect().await; } } - fn poll_close(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { - match *self { - Self::Derp(ref mut ws) => Pin::new(ws).poll_close(cx), - Self::Ws(ref mut ws) => Pin::new(ws).poll_close(cx).map_err(tung_wasm_to_io_err), + fn is_connected(&self) -> bool { + if self.is_closed { + return false; } + self.relay_client.is_some() } -} -impl ClientBuilder { - pub fn new( - secret_key: SecretKey, - local_addr: Option, - reader: ConnReader, - writer: ConnWriter, - ) -> Self { - Self { - secret_key, - reader, - writer, - local_addr, + fn current_conn(&self) -> usize { + self.conn_gen + } + + fn next_conn(&mut self) -> usize { + self.conn_gen = self.conn_gen.wrapping_add(1); + self.conn_gen + } + + fn tls_servername(&self) -> Option { + self.url + .host_str() + .and_then(|s| rustls::ServerName::try_from(s).ok()) + } + + fn use_tls(&self) -> bool { + // only disable tls if we are explicitly dialing a http url + #[allow(clippy::match_like_matches_macro)] + match self.url.scheme() { + "http" => false, + "ws" => false, + _ => true, } } - async fn server_handshake(&mut self) -> Result> { - debug!("server_handshake: started"); - let client_info = ClientInfo { - version: PROTOCOL_VERSION, - }; - debug!("server_handshake: sending client_key: {:?}", &client_info); - crate::relay::codec::send_client_key(&mut self.writer, &self.secret_key, &client_info) - .await?; + async fn dial_url(&self) -> Result { + if let Some(ref proxy) = self.proxy_url { + let stream = self.dial_url_proxy(proxy.clone()).await?; + Ok(ProxyStream::Proxied(stream)) + } else { + let stream = self.dial_url_direct().await?; + Ok(ProxyStream::Raw(stream)) + } + } + + async fn dial_url_direct(&self) -> Result { + debug!(%self.url, "dial url"); + let prefer_ipv6 = self.prefer_ipv6().await; + let dst_ip = resolve_host(&self.dns_resolver, &self.url, prefer_ipv6).await?; - // TODO: add some actual configuration - let rate_limiter = RateLimiter::new(0, 0)?; + let port = url_port(&self.url) + .ok_or_else(|| ClientError::InvalidUrl("missing url port".into()))?; + let addr = SocketAddr::new(dst_ip, 
port); - debug!("server_handshake: done"); - Ok(rate_limiter) + debug!("connecting to {}", addr); + let tcp_stream = + tokio::time::timeout( + DIAL_NODE_TIMEOUT, + async move { TcpStream::connect(addr).await }, + ) + .await + .map_err(|_| ClientError::ConnectTimeout)? + .map_err(ClientError::DialIO)?; + + tcp_stream.set_nodelay(true)?; + + Ok(tcp_stream) } - pub async fn build(mut self) -> Result<(Client, ClientReceiver)> { - // exchange information with the server - let rate_limiter = self.server_handshake().await?; + async fn dial_url_proxy( + &self, + proxy_url: Url, + ) -> Result, MaybeTlsStream>, ClientError> { + debug!(%self.url, %proxy_url, "dial url via proxy"); - // create task to handle writing to the server - let (writer_sender, writer_recv) = mpsc::channel(PER_CLIENT_SEND_QUEUE_DEPTH); - let writer_task = tokio::task::spawn( - async move { - let client_writer = ClientWriter { - rate_limiter, - writer: self.writer, - recv_msgs: writer_recv, - }; - client_writer.run().await?; - Ok(()) + // Resolve proxy DNS + let prefer_ipv6 = self.prefer_ipv6().await; + let proxy_ip = resolve_host(&self.dns_resolver, &proxy_url, prefer_ipv6).await?; + + let proxy_port = url_port(&proxy_url) + .ok_or_else(|| ClientError::Proxy("missing proxy url port".into()))?; + let proxy_addr = SocketAddr::new(proxy_ip, proxy_port); + + debug!(%proxy_addr, "connecting to proxy"); + + let tcp_stream = tokio::time::timeout(DIAL_NODE_TIMEOUT, async move { + TcpStream::connect(proxy_addr).await + }) + .await + .map_err(|_| ClientError::ConnectTimeout)? + .map_err(ClientError::DialIO)?; + + tcp_stream.set_nodelay(true)?; + + // Setup TLS if necessary + let io = if proxy_url.scheme() == "http" { + MaybeTlsStream::Raw(tcp_stream) + } else { + let hostname = proxy_url + .host_str() + .and_then(|s| rustls::ServerName::try_from(s).ok()) + .ok_or_else(|| ClientError::InvalidUrl("No tls servername for proxy url".into()))?; + let tls_stream = self.tls_connector.connect(hostname, tcp_stream).await?; + MaybeTlsStream::Tls(tls_stream) + }; + let io = TokioIo::new(io); + + let target_host = self + .url + .host_str() + .ok_or_else(|| ClientError::Proxy("missing proxy host".into()))?; + + let port = + url_port(&self.url).ok_or_else(|| ClientError::Proxy("invalid target port".into()))?; + + // Establish Proxy Tunnel + let mut req_builder = Request::builder() + .uri(format!("{}:{}", target_host, port)) + .method("CONNECT") + .header("Host", target_host) + .header("Proxy-Connection", "Keep-Alive"); + if !proxy_url.username().is_empty() { + // Passthrough authorization + // https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Proxy-Authorization + debug!( + "setting proxy-authorization: username={}", + proxy_url.username() + ); + let to_encode = format!( + "{}:{}", + proxy_url.username(), + proxy_url.password().unwrap_or_default() + ); + let encoded = URL_SAFE.encode(to_encode); + req_builder = req_builder.header("Proxy-Authorization", format!("Basic {}", encoded)); + } + let req = req_builder.body(Empty::::new())?; + + debug!("Sending proxy request: {:?}", req); + + let (mut sender, conn) = hyper::client::conn::http1::handshake(io).await?; + tokio::task::spawn(async move { + if let Err(err) = conn.with_upgrades().await { + error!("Proxy connection failed: {:?}", err); } - .instrument(info_span!("client.writer")), - ); + }); - let (reader_sender, reader_recv) = mpsc::channel(PER_CLIENT_READ_QUEUE_DEPTH); - let writer_sender2 = writer_sender.clone(); - let reader_task = tokio::task::spawn(async move { - loop { - let frame = 
tokio::time::timeout(CLIENT_RECV_TIMEOUT, self.reader.next()).await; - let res = match frame { - Ok(Some(Ok(frame))) => process_incoming_frame(frame), - Ok(Some(Err(err))) => { - // Error processing incoming messages - Err(err) - } - Ok(None) => { - // EOF - Err(anyhow::anyhow!("EOF: reader stream ended")) - } - Err(err) => { - // Timeout - Err(err.into()) - } - }; - if res.is_err() { - // shutdown - writer_sender2 - .send(ClientWriterMessage::Shutdown) - .await - .ok(); - break; + let res = sender.send_request(req).await?; + if !res.status().is_success() { + return Err(ClientError::Proxy(format!( + "failed to connect to proxy: {}", + res.status(), + ))); + } + + let upgraded = hyper::upgrade::on(res).await?; + let Ok(Parts { io, read_buf, .. }) = upgraded.downcast::>() else { + return Err(ClientError::Proxy("invalid upgrade".to_string())); + }; + + let res = chain::chain(std::io::Cursor::new(read_buf), io.into_inner()); + + Ok(res) + } + + /// Reports whether IPv4 dials should be slightly + /// delayed to give IPv6 a better chance of winning dial races. + /// Implementations should only return true if IPv6 is expected + /// to succeed. (otherwise delaying IPv4 will delay the connection + /// overall) + async fn prefer_ipv6(&self) -> bool { + match self.address_family_selector { + Some(ref selector) => selector().await, + None => false, + } + } + + async fn recv_detail(&mut self) -> Result<(ReceivedMessage, usize), ClientError> { + if let Some((_client, client_receiver)) = self.relay_client.as_mut() { + trace!("recv_detail tick"); + match client_receiver.recv().await { + Ok(msg) => { + let current_gen = self.current_conn(); + return Ok((msg, current_gen)); } - if reader_sender.send(res).await.is_err() { - // shutdown, as the reader is gone - writer_sender2 - .send(ClientWriterMessage::Shutdown) - .await - .ok(); - break; + Err(e) => { + self.close_for_reconnect().await; + if self.is_closed { + return Err(ClientError::Closed); + } + // TODO(ramfox): more specific error? + return Err(ClientError::Receive(e)); } } - }); + } + std::future::pending().await + } - let client = Client { - inner: Arc::new(InnerClient { - local_addr: self.local_addr, - writer_channel: writer_sender, - writer_task: writer_task.into(), - reader_task: reader_task.into(), - }), - }; + /// Close the underlying relay connection. The next time the client takes some action that + /// requires a connection, it will call `connect`. + async fn close_for_reconnect(&mut self) { + debug!("close for reconnect"); + if let Some((client, _)) = self.relay_client.take() { + client.close().await + } + } +} - let client_receiver = ClientReceiver { - reader_channel: reader_recv, - }; +async fn resolve_host( + resolver: &DnsResolver, + url: &Url, + prefer_ipv6: bool, +) -> Result { + let host = url + .host() + .ok_or_else(|| ClientError::InvalidUrl("missing host".into()))?; + match host { + url::Host::Domain(domain) => { + // Need to do a DNS lookup + let mut addrs = resolver + .lookup_ipv4_ipv6(domain, DNS_TIMEOUT) + .await + .map_err(|e| ClientError::Dns(Some(e)))? 
+
+async fn resolve_host(
+    resolver: &DnsResolver,
+    url: &Url,
+    prefer_ipv6: bool,
+) -> Result<IpAddr, ClientError> {
+    let host = url
+        .host()
+        .ok_or_else(|| ClientError::InvalidUrl("missing host".into()))?;
+    match host {
+        url::Host::Domain(domain) => {
+            // Need to do a DNS lookup
+            let mut addrs = resolver
+                .lookup_ipv4_ipv6(domain, DNS_TIMEOUT)
+                .await
+                .map_err(|e| ClientError::Dns(Some(e)))?
+                .peekable();
+
+            let found = if prefer_ipv6 {
+                let first = addrs.peek().copied();
+                addrs.find(IpAddr::is_ipv6).or(first)
+            } else {
+                addrs.next()
+            };
+
+            found.ok_or_else(|| ClientError::Dns(None))
+        }
+        url::Host::Ipv4(ip) => Ok(IpAddr::V4(ip)),
+        url::Host::Ipv6(ip) => Ok(IpAddr::V6(ip)),
+    }
+}
+
+/// Used to allow self signed certificates in tests
+#[cfg(any(test, feature = "test-utils"))]
+struct NoCertVerifier;

-        Ok((client, client_receiver))
+#[cfg(any(test, feature = "test-utils"))]
+impl rustls::client::ServerCertVerifier for NoCertVerifier {
+    fn verify_server_cert(
+        &self,
+        _end_entity: &rustls::Certificate,
+        _intermediates: &[rustls::Certificate],
+        _server_name: &rustls::ServerName,
+        _scts: &mut dyn Iterator<Item = &[u8]>,
+        _ocsp_response: &[u8],
+        _now: std::time::SystemTime,
+    ) -> Result<rustls::client::ServerCertVerified, rustls::Error> {
+        Ok(rustls::client::ServerCertVerified::assertion())
+    }
 }
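The IPv6-preference logic in `resolve_host` is easy to check in isolation; a self-contained sketch of the same peek-then-find pattern:

```rust
use std::net::{IpAddr, Ipv4Addr, Ipv6Addr};

fn pick(addrs: Vec<IpAddr>, prefer_ipv6: bool) -> Option<IpAddr> {
    let mut addrs = addrs.into_iter().peekable();
    if prefer_ipv6 {
        // Remember the first answer, scan for an IPv6 one, and fall
        // back to the first answer if no IPv6 address was returned.
        let first = addrs.peek().copied();
        addrs.find(IpAddr::is_ipv6).or(first)
    } else {
        addrs.next()
    }
}

fn main() {
    let addrs = vec![
        IpAddr::V4(Ipv4Addr::new(192, 0, 2, 1)),
        IpAddr::V6(Ipv6Addr::LOCALHOST),
    ];
    assert_eq!(pick(addrs.clone(), true), Some(IpAddr::V6(Ipv6Addr::LOCALHOST)));
    assert_eq!(pick(addrs, false), Some(IpAddr::V4(Ipv4Addr::new(192, 0, 2, 1))));
}
```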
-#[derive(derive_more::Debug, Clone)]
-/// The type of message received by the [`Client`] from a relay server.
-pub enum ReceivedMessage {
-    /// Represents an incoming packet.
-    ReceivedPacket {
-        /// The [`PublicKey`] of the packet sender.
-        source: PublicKey,
-        /// The received packet bytes.
-        #[debug(skip)]
-        data: Bytes, // TODO: ref
-    },
-    /// Indicates that the client identified by the underlying public key had previously sent you a
-    /// packet but has now disconnected from the server.
-    PeerGone(PublicKey),
-    /// Request from a client or server to reply to the
-    /// other side with a [`ReceivedMessage::Pong`] with the given payload.
-    Ping([u8; 8]),
-    /// Reply to a [`ReceivedMessage::Ping`] from a client or server
-    /// with the payload sent previously in the ping.
-    Pong([u8; 8]),
-    /// A one-way empty message from server to client, just to
-    /// keep the connection alive. It's like a [`ReceivedMessage::Ping`], but doesn't solicit
-    /// a reply from the client.
-    KeepAlive,
-    /// A one-way message from server to client, declaring the connection health state.
-    Health {
-        /// If set, is a description of why the connection is unhealthy.
-        ///
-        /// If `None` means the connection is healthy again.
-        ///
-        /// The default condition is healthy, so the server doesn't broadcast a [`ReceivedMessage::Health`]
-        /// until a problem exists.
-        problem: Option<String>,
-    },
-    /// A one-way message from server to client, advertising that the server is restarting.
-    ServerRestarting {
-        /// An advisory duration that the client should wait before attempting to reconnect.
-        /// It might be zero. It exists for the server to smear out the reconnects.
-        reconnect_in: Duration,
-        /// An advisory duration for how long the client should attempt to reconnect
-        /// before giving up and proceeding with its normal connection failure logic. The interval
-        /// between retries is undefined for now. A server should not send a TryFor duration more
-        /// than a few seconds.
-        try_for: Duration,
-    },
+fn url_port(url: &Url) -> Option<u16> {
+    if let Some(port) = url.port() {
+        return Some(port);
+    }
+
+    match url.scheme() {
+        "http" => Some(80),
+        "https" => Some(443),
+        _ => None,
+    }
 }

-pub(crate) async fn send_packet<S: Sink<Frame, Error = std::io::Error> + Unpin>(
-    mut writer: S,
-    rate_limiter: &Option<RateLimiter>,
-    dst_key: PublicKey,
-    packet: Bytes,
-) -> Result<()> {
-    ensure!(
-        packet.len() <= MAX_PACKET_SIZE,
-        "packet too big: {}",
-        packet.len()
-    );
-
-    let frame = Frame::SendPacket { dst_key, packet };
-    if let Some(rate_limiter) = rate_limiter {
-        if rate_limiter.check_n(frame.len()).is_err() {
-            tracing::warn!("dropping send: rate limit reached");
-            return Ok(());
+#[cfg(test)]
+mod tests {
+    use anyhow::{bail, Result};
+
+    use crate::dns::default_resolver;
+
+    use super::*;
+
+    #[tokio::test]
+    async fn test_recv_detail_connect_error() -> Result<()> {
+        let _guard = iroh_test::logging::setup();
+
+        let key = SecretKey::generate();
+        let bad_url: Url = "https://bad.url".parse().unwrap();
+        let dns_resolver = default_resolver();
+
+        let (_client, mut client_receiver) =
+            ClientBuilder::new(bad_url).build(key.clone(), dns_resolver.clone());
+
+        // ensure that the client will bubble up any connection error & not
+        // just loop ad infinitum attempting to connect
+        if client_receiver.recv().await.and_then(|s| s.ok()).is_some() {
+            bail!("expected client with bad relay node detail to return with an error");
         }
+        Ok(())
     }
-    writer.send(frame).await?;
-    writer.flush().await?;
-
-    Ok(())
 }
diff --git a/iroh-net/src/relay/client/conn.rs b/iroh-net/src/relay/client/conn.rs
new file mode 100644
index 0000000000..bebc29640d
--- /dev/null
+++ b/iroh-net/src/relay/client/conn.rs
@@ -0,0 +1,539 @@
+//! Manages client-side connections to the relay server.
+//!
+//! Based on tailscale/derp/derp_client.go
+
+use std::net::SocketAddr;
+use std::num::NonZeroU32;
+use std::pin::Pin;
+use std::sync::Arc;
+use std::task::{Context, Poll};
+use std::time::Duration;
+
+use anyhow::{anyhow, bail, ensure, Context as _, Result};
+use bytes::Bytes;
+use futures_lite::Stream;
+use futures_sink::Sink;
+use futures_util::stream::{SplitSink, SplitStream, StreamExt};
+use futures_util::SinkExt;
+use tokio::sync::mpsc;
+use tokio_tungstenite_wasm::WebSocketStream;
+use tokio_util::codec::{FramedRead, FramedWrite};
+use tracing::{debug, info_span, trace, Instrument};
+
+use crate::key::{PublicKey, SecretKey};
+use crate::relay::client::streams::{MaybeTlsStreamReader, MaybeTlsStreamWriter};
+use crate::relay::codec::{
+    write_frame, DerpCodec, Frame, MAX_PACKET_SIZE, PER_CLIENT_SEND_QUEUE_DEPTH, PROTOCOL_VERSION,
+};
+use crate::relay::codec::{ClientInfo, PER_CLIENT_READ_QUEUE_DEPTH};
+use crate::util::AbortingJoinHandle;
+
+const CLIENT_RECV_TIMEOUT: Duration = Duration::from_secs(120);
+
+impl PartialEq for Conn {
+    fn eq(&self, other: &Self) -> bool {
+        Arc::ptr_eq(&self.inner, &other.inner)
+    }
+}
+
+impl Eq for Conn {}
+
+/// A connection to a relay server.
+///
+/// Cheaply clonable.
+/// Call `close` to shut down the write loop and read functionality.
+#[derive(Debug, Clone)]
+pub struct Conn {
+    inner: Arc<ConnTasks>,
+}
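A rough usage sketch for the pair (assuming a `Conn`/`ConnReceiver` built by `ConnBuilder::build` further down, and a peer's `PublicKey`):

```rust
async fn demo(conn: Conn, mut receiver: ConnReceiver, peer: PublicKey) -> Result<()> {
    // Clones of `conn` share the same underlying connection.
    conn.send(peer, Bytes::from_static(b"hello")).await?;
    while let Ok(msg) = receiver.recv().await {
        // ...handle the ReceivedMessage...
        let _ = msg;
    }
    // Once `recv` errors the connection is dead; shut the writer down too.
    conn.close().await;
    Ok(())
}
```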
+#[derive(Debug)]
+pub struct ConnReceiver {
+    /// The reader channel, receiving incoming messages.
+    reader_channel: mpsc::Receiver<Result<ReceivedMessage>>,
+}
+
+impl ConnReceiver {
+    /// Reads a message from a relay server.
+    ///
+    /// Once it returns an error, the [`Conn`] is dead forever.
+    pub async fn recv(&mut self) -> Result<ReceivedMessage> {
+        let msg = self
+            .reader_channel
+            .recv()
+            .await
+            .ok_or(anyhow!("shut down"))??;
+        Ok(msg)
+    }
+}
+
+#[derive(derive_more::Debug)]
+pub struct ConnTasks {
+    /// Our local address, if known.
+    ///
+    /// Is `None` in tests or when using websockets (because we don't control connection establishment in browsers).
+    local_addr: Option<SocketAddr>,
+    /// Channel on which to communicate to the server. The associated [`mpsc::Receiver`] will close
+    /// if there is ever an error writing to the server.
+    writer_channel: mpsc::Sender<ConnWriterMessage>,
+    /// JoinHandle for the [`ConnWriterTasks`] task
+    writer_task: AbortingJoinHandle<Result<()>>,
+    reader_task: AbortingJoinHandle<()>,
+}
+
+impl Conn {
+    /// Sends a packet to the node identified by `dstkey`.
+    ///
+    /// Errors if the packet is larger than [`MAX_PACKET_SIZE`].
+    pub async fn send(&self, dstkey: PublicKey, packet: Bytes) -> Result<()> {
+        trace!(%dstkey, len = packet.len(), "[RELAY] send");
+
+        self.inner
+            .writer_channel
+            .send(ConnWriterMessage::Packet((dstkey, packet)))
+            .await?;
+        Ok(())
+    }
+
+    /// Send a ping with 8 bytes of random data.
+    pub async fn send_ping(&self, data: [u8; 8]) -> Result<()> {
+        self.inner
+            .writer_channel
+            .send(ConnWriterMessage::Ping(data))
+            .await?;
+        Ok(())
+    }
+
+    /// Respond to a ping request. The `data` field should be filled
+    /// by the 8 bytes of random data sent by the ping.
+    pub async fn send_pong(&self, data: [u8; 8]) -> Result<()> {
+        self.inner
+            .writer_channel
+            .send(ConnWriterMessage::Pong(data))
+            .await?;
+        Ok(())
+    }
+
+    /// Sends a packet that tells the server whether this
+    /// connection is to the user's preferred server. This is only
+    /// used in the server for stats.
+    pub async fn note_preferred(&self, preferred: bool) -> Result<()> {
+        self.inner
+            .writer_channel
+            .send(ConnWriterMessage::NotePreferred(preferred))
+            .await?;
+        Ok(())
+    }
+
+    /// The local address that the [`Conn`] is listening on.
+    ///
+    /// `None`, when run in a testing environment or when using websockets.
+    pub fn local_addr(&self) -> Option<SocketAddr> {
+        self.inner.local_addr
+    }
+
+    /// Whether or not this [`Conn`] is closed.
+    ///
+    /// The [`Conn`] is considered closed if the write side of the connection is no longer running.
+    pub fn is_closed(&self) -> bool {
+        self.inner.writer_task.is_finished()
+    }
+
+    /// Close the connection.
+    ///
+    /// Shuts down the write loop directly and marks the connection as closed. The [`Conn`] will
+    /// check if it is closed before attempting to read from it.
+    pub async fn close(&self) {
+        if self.inner.writer_task.is_finished() && self.inner.reader_task.is_finished() {
+            return;
+        }
+
+        self.inner
+            .writer_channel
+            .send(ConnWriterMessage::Shutdown)
+            .await
+            .ok();
+        self.inner.reader_task.abort();
+    }
+}
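For example, a receive loop would typically answer pings so the server keeps considering the connection live (a sketch using the types above):

```rust
async fn recv_loop(conn: Conn, mut receiver: ConnReceiver) -> Result<()> {
    loop {
        match receiver.recv().await? {
            // Echo the 8-byte payload back as a pong.
            ReceivedMessage::Ping(data) => conn.send_pong(data).await?,
            ReceivedMessage::ReceivedPacket { source, data } => {
                trace!(?source, len = data.len(), "got packet");
            }
            _ => {}
        }
    }
}
```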
+fn process_incoming_frame(frame: Frame) -> Result<ReceivedMessage> {
+    match frame {
+        Frame::KeepAlive => {
+            // A one-way keep-alive message that doesn't require an ack.
+            // This predated FrameType::Ping/FrameType::Pong.
+            Ok(ReceivedMessage::KeepAlive)
+        }
+        Frame::PeerGone { peer } => Ok(ReceivedMessage::PeerGone(peer)),
+        Frame::RecvPacket { src_key, content } => {
+            let packet = ReceivedMessage::ReceivedPacket {
+                source: src_key,
+                data: content,
+            };
+            Ok(packet)
+        }
+        Frame::Ping { data } => Ok(ReceivedMessage::Ping(data)),
+        Frame::Pong { data } => Ok(ReceivedMessage::Pong(data)),
+        Frame::Health { problem } => {
+            let problem = std::str::from_utf8(&problem)?.to_owned();
+            let problem = Some(problem);
+            Ok(ReceivedMessage::Health { problem })
+        }
+        Frame::Restarting {
+            reconnect_in,
+            try_for,
+        } => {
+            let reconnect_in = Duration::from_millis(reconnect_in as u64);
+            let try_for = Duration::from_millis(try_for as u64);
+            Ok(ReceivedMessage::ServerRestarting {
+                reconnect_in,
+                try_for,
+            })
+        }
+        _ => bail!("unexpected packet: {:?}", frame.typ()),
+    }
+}
+
+/// The kinds of messages we can send to the [`Server`](crate::relay::server::Server)
+#[derive(Debug)]
+enum ConnWriterMessage {
+    /// Send a packet (addressed to the [`PublicKey`]) to the server
+    Packet((PublicKey, Bytes)),
+    /// Send a pong to the server
+    Pong([u8; 8]),
+    /// Send a ping to the server
+    Ping([u8; 8]),
+    /// Tell the server whether or not this client is the user's preferred client
+    NotePreferred(bool),
+    /// Shutdown the writer
+    Shutdown,
+}
+
+/// Call [`ConnWriterTasks::run`] to listen for messages to send to the connection.
+/// Should be used by the [`Conn`].
+///
+/// Shuts down when you send a [`ConnWriterMessage::Shutdown`], or if there is an error writing to
+/// the server.
+struct ConnWriterTasks {
+    recv_msgs: mpsc::Receiver<ConnWriterMessage>,
+    writer: ConnWriter,
+    rate_limiter: Option<RateLimiter>,
+}
+
+impl ConnWriterTasks {
+    async fn run(mut self) -> Result<()> {
+        while let Some(msg) = self.recv_msgs.recv().await {
+            match msg {
+                ConnWriterMessage::Packet((key, bytes)) => {
+                    send_packet(&mut self.writer, &self.rate_limiter, key, bytes).await?;
+                }
+                ConnWriterMessage::Pong(data) => {
+                    write_frame(&mut self.writer, Frame::Pong { data }, None).await?;
+                    self.writer.flush().await?;
+                }
+                ConnWriterMessage::Ping(data) => {
+                    write_frame(&mut self.writer, Frame::Ping { data }, None).await?;
+                    self.writer.flush().await?;
+                }
+                ConnWriterMessage::NotePreferred(preferred) => {
+                    write_frame(&mut self.writer, Frame::NotePreferred { preferred }, None).await?;
+                    self.writer.flush().await?;
+                }
+                ConnWriterMessage::Shutdown => {
+                    return Ok(());
+                }
+            }
+        }
+
+        bail!("channel unexpectedly closed");
+    }
+}
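Because the queue is drained strictly in order, a graceful shutdown can enqueue any final messages ahead of the `Shutdown` marker (a sketch, assuming a `writer_channel` clone as held by `ConnTasks`):

```rust
async fn graceful_shutdown(writer_channel: mpsc::Sender<ConnWriterMessage>) -> Result<()> {
    // This frame is still written and flushed before the task exits...
    writer_channel.send(ConnWriterMessage::NotePreferred(false)).await?;
    // ...because `run` only returns Ok(()) once it dequeues Shutdown.
    writer_channel.send(ConnWriterMessage::Shutdown).await?;
    Ok(())
}
```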
+/// The Builder returns a [`Conn`] and a [`ConnReceiver`] and
+/// runs a [`ConnWriterTasks`] in the background.
+pub struct ConnBuilder {
+    secret_key: SecretKey,
+    reader: ConnReader,
+    writer: ConnWriter,
+    local_addr: Option<SocketAddr>,
+}
+
+pub(crate) enum ConnReader {
+    Derp(FramedRead<MaybeTlsStreamReader, DerpCodec>),
+    Ws(SplitStream<WebSocketStream>),
+}
+
+pub(crate) enum ConnWriter {
+    Derp(FramedWrite<MaybeTlsStreamWriter, DerpCodec>),
+    Ws(SplitSink<WebSocketStream, tokio_tungstenite_wasm::Message>),
+}
+
+fn tung_wasm_to_io_err(e: tokio_tungstenite_wasm::Error) -> std::io::Error {
+    match e {
+        tokio_tungstenite_wasm::Error::Io(io_err) => io_err,
+        _ => std::io::Error::new(std::io::ErrorKind::Other, e.to_string()),
+    }
+}
+
+impl Stream for ConnReader {
+    type Item = Result<Frame>;
+
+    fn poll_next(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Option<Self::Item>> {
+        match *self {
+            Self::Derp(ref mut ws) => Pin::new(ws).poll_next(cx),
+            Self::Ws(ref mut ws) => match Pin::new(ws).poll_next(cx) {
+                Poll::Ready(Some(Ok(tokio_tungstenite_wasm::Message::Binary(vec)))) => {
+                    Poll::Ready(Some(Frame::decode_from_ws_msg(vec)))
+                }
+                Poll::Ready(Some(Ok(msg))) => {
+                    tracing::warn!(?msg, "Got websocket message of unsupported type, skipping.");
+                    Poll::Pending
+                }
+                Poll::Ready(Some(Err(e))) => Poll::Ready(Some(Err(e.into()))),
+                Poll::Ready(None) => Poll::Ready(None),
+                Poll::Pending => Poll::Pending,
+            },
+        }
+    }
+}
+
+impl Sink<Frame> for ConnWriter {
+    type Error = std::io::Error;
+
+    fn poll_ready(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Result<(), Self::Error>> {
+        match *self {
+            Self::Derp(ref mut ws) => Pin::new(ws).poll_ready(cx),
+            Self::Ws(ref mut ws) => Pin::new(ws).poll_ready(cx).map_err(tung_wasm_to_io_err),
+        }
+    }
+
+    fn start_send(mut self: Pin<&mut Self>, item: Frame) -> Result<(), Self::Error> {
+        match *self {
+            Self::Derp(ref mut ws) => Pin::new(ws).start_send(item),
+            Self::Ws(ref mut ws) => Pin::new(ws)
+                .start_send(tokio_tungstenite_wasm::Message::binary(
+                    item.encode_for_ws_msg(),
+                ))
+                .map_err(tung_wasm_to_io_err),
+        }
+    }
+
+    fn poll_flush(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Result<(), Self::Error>> {
+        match *self {
+            Self::Derp(ref mut ws) => Pin::new(ws).poll_flush(cx),
+            Self::Ws(ref mut ws) => Pin::new(ws).poll_flush(cx).map_err(tung_wasm_to_io_err),
+        }
+    }
+
+    fn poll_close(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Result<(), Self::Error>> {
+        match *self {
+            Self::Derp(ref mut ws) => Pin::new(ws).poll_close(cx),
+            Self::Ws(ref mut ws) => Pin::new(ws).poll_close(cx).map_err(tung_wasm_to_io_err),
+        }
+    }
+}
+
+impl ConnBuilder {
+    pub fn new(
+        secret_key: SecretKey,
+        local_addr: Option<SocketAddr>,
+        reader: ConnReader,
+        writer: ConnWriter,
+    ) -> Self {
+        Self {
+            secret_key,
+            reader,
+            writer,
+            local_addr,
+        }
+    }
+
+    async fn server_handshake(&mut self) -> Result<Option<RateLimiter>> {
+        debug!("server_handshake: started");
+        let client_info = ClientInfo {
+            version: PROTOCOL_VERSION,
+        };
+        debug!("server_handshake: sending client_key: {:?}", &client_info);
+        crate::relay::codec::send_client_key(&mut self.writer, &self.secret_key, &client_info)
+            .await?;
+
+        // TODO: add some actual configuration
+        let rate_limiter = RateLimiter::new(0, 0)?;
+
+        debug!("server_handshake: done");
+        Ok(rate_limiter)
+    }
+
+    pub async fn build(mut self) -> Result<(Conn, ConnReceiver)> {
+        // exchange information with the server
+        let rate_limiter = self.server_handshake().await?;
+
+        // create task to handle writing to the server
+        let (writer_sender, writer_recv) = mpsc::channel(PER_CLIENT_SEND_QUEUE_DEPTH);
+        let writer_task = tokio::task::spawn(
+            ConnWriterTasks {
+                rate_limiter,
+                writer: self.writer,
+                recv_msgs: writer_recv,
+            }
+            .run()
+            .instrument(info_span!("client.writer")),
+        );
+
+        let (reader_sender, reader_recv) = mpsc::channel(PER_CLIENT_READ_QUEUE_DEPTH);
+        let reader_task = tokio::task::spawn({
+            let writer_sender = writer_sender.clone();
+            async move {
+                loop {
+                    let frame = tokio::time::timeout(CLIENT_RECV_TIMEOUT, self.reader.next()).await;
+                    let res = match frame {
+                        Ok(Some(Ok(frame))) => process_incoming_frame(frame),
+                        Ok(Some(Err(err))) => {
+                            // Error processing incoming messages
+                            Err(err)
+                        }
+                        Ok(None) => {
+                            // EOF
+                            Err(anyhow::anyhow!("EOF: reader stream ended"))
+                        }
+                        Err(err) => {
+                            // Timeout
+                            Err(err.into())
+                        }
+                    };
+                    if res.is_err() {
+                        // shutdown
+                        writer_sender.send(ConnWriterMessage::Shutdown).await.ok();
+                        break;
+                    }
+                    if reader_sender.send(res).await.is_err() {
+                        // shutdown, as the reader is gone
+                        writer_sender.send(ConnWriterMessage::Shutdown).await.ok();
+                        break;
+                    }
+                }
+            }
+        });
+
+        let conn = Conn {
+            inner: Arc::new(ConnTasks {
+                local_addr: self.local_addr,
+                writer_channel: writer_sender,
+                writer_task: writer_task.into(),
+                reader_task: reader_task.into(),
+            }),
+        };
+
+        let conn_receiver = ConnReceiver {
+            reader_channel: reader_recv,
+        };
+
+        Ok((conn, conn_receiver))
+    }
+}
+
+#[derive(derive_more::Debug, Clone)]
+/// The type of message received by the [`Conn`] from a relay server.
+pub enum ReceivedMessage {
+    /// Represents an incoming packet.
+    ReceivedPacket {
+        /// The [`PublicKey`] of the packet sender.
+        source: PublicKey,
+        /// The received packet bytes.
+        #[debug(skip)]
+        data: Bytes, // TODO: ref
+    },
+    /// Indicates that the client identified by the underlying public key had previously sent you a
+    /// packet but has now disconnected from the server.
+    PeerGone(PublicKey),
+    /// Request from a client or server to reply to the
+    /// other side with a [`ReceivedMessage::Pong`] with the given payload.
+    Ping([u8; 8]),
+    /// Reply to a [`ReceivedMessage::Ping`] from a client or server
+    /// with the payload sent previously in the ping.
+    Pong([u8; 8]),
+    /// A one-way empty message from server to client, just to
+    /// keep the connection alive. It's like a [`ReceivedMessage::Ping`], but doesn't solicit
+    /// a reply from the client.
+    KeepAlive,
+    /// A one-way message from server to client, declaring the connection health state.
+    Health {
+        /// If set, is a description of why the connection is unhealthy.
+        ///
+        /// If `None`, the connection is healthy again.
+        ///
+        /// The default condition is healthy, so the server doesn't broadcast a [`ReceivedMessage::Health`]
+        /// until a problem exists.
+        problem: Option<String>,
+    },
+    /// A one-way message from server to client, advertising that the server is restarting.
+    ServerRestarting {
+        /// An advisory duration that the client should wait before attempting to reconnect.
+        /// It might be zero. It exists for the server to smear out the reconnects.
+        reconnect_in: Duration,
+        /// An advisory duration for how long the client should attempt to reconnect
+        /// before giving up and proceeding with its normal connection failure logic. The interval
+        /// between retries is undefined for now. A server should not send a TryFor duration more
+        /// than a few seconds.
+        try_for: Duration,
+    },
+}
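The send path below drops frames rather than blocking when the configured rate is exceeded. A construction sketch for the governor-backed limiter it consults (the numbers are illustrative only; this diff itself passes `(0, 0)`, which disables limiting):

```rust
fn example_limiter() -> Result<()> {
    // 1 MiB/s sustained with a 256 KiB burst allowance.
    let limiter = RateLimiter::new(1024 * 1024, 256 * 1024)?
        .expect("non-zero rate and burst yield a limiter");
    // Admit a 4 KiB frame, or learn that it cannot go through right now.
    if limiter.check_n(4096).is_err() {
        tracing::warn!("dropping send: rate limit reached");
    }
    Ok(())
}
```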
+
+pub(crate) async fn send_packet<S: Sink<Frame, Error = std::io::Error> + Unpin>(
+    mut writer: S,
+    rate_limiter: &Option<RateLimiter>,
+    dst_key: PublicKey,
+    packet: Bytes,
+) -> Result<()> {
+    ensure!(
+        packet.len() <= MAX_PACKET_SIZE,
+        "packet too big: {}",
+        packet.len()
+    );
+
+    let frame = Frame::SendPacket { dst_key, packet };
+    if let Some(rate_limiter) = rate_limiter {
+        if rate_limiter.check_n(frame.len()).is_err() {
+            tracing::warn!("dropping send: rate limit reached");
+            return Ok(());
+        }
+    }
+    writer.send(frame).await?;
+    writer.flush().await?;
+
+    Ok(())
+}
+
+pub(crate) struct RateLimiter {
+    inner: governor::RateLimiter<
+        governor::state::direct::NotKeyed,
+        governor::state::InMemoryState,
+        governor::clock::DefaultClock,
+        governor::middleware::NoOpMiddleware,
+    >,
+}
+
+impl RateLimiter {
+    pub(crate) fn new(bytes_per_second: usize, bytes_burst: usize) -> Result<Option<Self>> {
+        if bytes_per_second == 0 || bytes_burst == 0 {
+            return Ok(None);
+        }
+        let bytes_per_second = NonZeroU32::new(u32::try_from(bytes_per_second)?)
+            .context("bytes_per_second not non-zero")?;
+        let bytes_burst =
+            NonZeroU32::new(u32::try_from(bytes_burst)?).context("bytes_burst not non-zero")?;
+        Ok(Some(Self {
+            inner: governor::RateLimiter::direct(
+                governor::Quota::per_second(bytes_per_second).allow_burst(bytes_burst),
+            ),
+        }))
+    }
+
+    pub(crate) fn check_n(&self, n: usize) -> Result<()> {
+        let n = NonZeroU32::new(u32::try_from(n)?).context("n not non-zero")?;
+        match self.inner.check_n(n) {
+            Ok(_) => Ok(()),
+            Err(_) => bail!("batch cannot go through"),
+        }
+    }
+}
diff --git a/iroh-net/src/relay/http/streams.rs b/iroh-net/src/relay/client/streams.rs
similarity index 100%
rename from iroh-net/src/relay/http/streams.rs
rename to iroh-net/src/relay/client/streams.rs
diff --git a/iroh-net/src/relay/codec.rs b/iroh-net/src/relay/codec.rs
index a35293f60a..a83debd3ea 100644
--- a/iroh-net/src/relay/codec.rs
+++ b/iroh-net/src/relay/codec.rs
@@ -7,9 +7,10 @@ use futures_lite::{Stream, StreamExt};
 use futures_sink::Sink;
 use futures_util::SinkExt;
 use iroh_base::key::{Signature, PUBLIC_KEY_LENGTH};
+use postcard::experimental::max_size::MaxSize;
+use serde::{Deserialize, Serialize};
 use tokio_util::codec::{Decoder, Encoder};

-use super::types::ClientInfo;
 use crate::key::{PublicKey, SecretKey};

 /// The maximum size of a packet sent over relay.
@@ -115,6 +116,12 @@ impl std::fmt::Display for FrameType {
     }
 }

+#[derive(Debug, Serialize, Deserialize, MaxSize, PartialEq, Eq)]
+pub(crate) struct ClientInfo {
+    /// The relay protocol version that the client was built with.
+    pub(crate) version: usize,
+}
+
 /// Writes complete frame, errors if it is unable to write within the given `timeout`.
 /// Ignores the timeout if `None`
 ///
diff --git a/iroh-net/src/relay/http.rs b/iroh-net/src/relay/http.rs
index f7e4365f1b..83199befeb 100644
--- a/iroh-net/src/relay/http.rs
+++ b/iroh-net/src/relay/http.rs
@@ -1,14 +1,4 @@
-//! An http specific relay Client and relay Server. Allows for using tls or non tls connection
-//! upgrades.
-//!
-mod client;
-#[cfg(feature = "iroh-relay")]
-mod server;
-pub(crate) mod streams;
-
-pub use self::client::{Client, ClientBuilder, ClientError, ClientReceiver};
-#[cfg(feature = "iroh-relay")]
-pub use self::server::{Server, ServerBuilder, ServerHandle, TlsAcceptor, TlsConfig};
+//! HTTP-specific constants for the relay server and client.
pub(crate) const HTTP_UPGRADE_PROTOCOL: &str = "iroh derp http"; pub(crate) const WEBSOCKET_UPGRADE_PROTOCOL: &str = "websocket"; @@ -61,234 +51,3 @@ impl Protocol { } } } - -#[cfg(test)] -mod tests { - use super::*; - - use anyhow::Result; - use bytes::Bytes; - use reqwest::Url; - use tokio::sync::mpsc; - use tokio::task::JoinHandle; - use tracing::{info, info_span, Instrument}; - use tracing_subscriber::{prelude::*, EnvFilter}; - - use crate::key::{PublicKey, SecretKey}; - use crate::relay::client::ReceivedMessage; - - pub(crate) fn make_tls_config() -> TlsConfig { - let subject_alt_names = vec!["localhost".to_string()]; - - let cert = rcgen::generate_simple_self_signed(subject_alt_names).unwrap(); - let rustls_certificate = rustls::Certificate(cert.serialize_der().unwrap()); - let rustls_key = rustls::PrivateKey(cert.get_key_pair().serialize_der()); - let config = rustls::ServerConfig::builder() - .with_safe_defaults() - .with_no_client_auth() - .with_single_cert(vec![(rustls_certificate)], rustls_key) - .unwrap(); - - let config = std::sync::Arc::new(config); - let acceptor = tokio_rustls::TlsAcceptor::from(config.clone()); - - TlsConfig { - config, - acceptor: TlsAcceptor::Manual(acceptor), - } - } - - #[tokio::test] - async fn test_http_clients_and_server() -> Result<()> { - let _guard = iroh_test::logging::setup(); - - let server_key = SecretKey::generate(); - let a_key = SecretKey::generate(); - let b_key = SecretKey::generate(); - - // start server - let server = ServerBuilder::new("127.0.0.1:0".parse().unwrap()) - .secret_key(Some(server_key)) - .spawn() - .await?; - - let addr = server.addr(); - - // get dial info - let port = addr.port(); - let addr = { - if let std::net::IpAddr::V4(ipv4_addr) = addr.ip() { - ipv4_addr - } else { - anyhow::bail!("cannot get ipv4 addr from socket addr {addr:?}"); - } - }; - info!("addr: {addr}:{port}"); - let relay_addr: Url = format!("http://{addr}:{port}").parse().unwrap(); - - // create clients - let (a_key, mut a_recv, client_a_task, client_a) = { - let span = info_span!("client-a"); - let _guard = span.enter(); - create_test_client(a_key, relay_addr.clone()) - }; - info!("created client {a_key:?}"); - let (b_key, mut b_recv, client_b_task, client_b) = { - let span = info_span!("client-b"); - let _guard = span.enter(); - create_test_client(b_key, relay_addr) - }; - info!("created client {b_key:?}"); - - info!("ping a"); - client_a.ping().await?; - - info!("ping b"); - client_b.ping().await?; - - info!("sending message from a to b"); - let msg = Bytes::from_static(b"hi there, client b!"); - client_a.send(b_key, msg.clone()).await?; - info!("waiting for message from a on b"); - let (got_key, got_msg) = b_recv.recv().await.expect("expected message from client_a"); - assert_eq!(a_key, got_key); - assert_eq!(msg, got_msg); - - info!("sending message from b to a"); - let msg = Bytes::from_static(b"right back at ya, client b!"); - client_b.send(a_key, msg.clone()).await?; - info!("waiting for message b on a"); - let (got_key, got_msg) = a_recv.recv().await.expect("expected message from client_b"); - assert_eq!(b_key, got_key); - assert_eq!(msg, got_msg); - - client_a.close().await?; - client_a_task.abort(); - client_b.close().await?; - client_b_task.abort(); - server.shutdown(); - - Ok(()) - } - - fn create_test_client( - key: SecretKey, - server_url: Url, - ) -> ( - PublicKey, - mpsc::Receiver<(PublicKey, Bytes)>, - JoinHandle<()>, - Client, - ) { - let client = ClientBuilder::new(server_url).insecure_skip_cert_verify(true); - let dns_resolver = 
crate::dns::default_resolver(); - let (client, mut client_reader) = client.build(key.clone(), dns_resolver.clone()); - let public_key = key.public(); - let (received_msg_s, received_msg_r) = tokio::sync::mpsc::channel(10); - let client_reader_task = tokio::spawn( - async move { - loop { - info!("waiting for message on {:?}", key.public()); - match client_reader.recv().await { - None => { - info!("client received nothing"); - return; - } - Some(Err(e)) => { - info!("client {:?} `recv` error {e}", key.public()); - return; - } - Some(Ok((msg, _))) => { - info!("got message on {:?}: {msg:?}", key.public()); - if let ReceivedMessage::ReceivedPacket { source, data } = msg { - received_msg_s - .send((source, data)) - .await - .unwrap_or_else(|err| { - panic!( - "client {:?}, error sending message over channel: {:?}", - key.public(), - err - ) - }); - } - } - } - } - } - .instrument(info_span!("test-client-reader")), - ); - (public_key, received_msg_r, client_reader_task, client) - } - - #[tokio::test] - async fn test_https_clients_and_server() -> Result<()> { - tracing_subscriber::registry() - .with(tracing_subscriber::fmt::layer().with_writer(std::io::stderr)) - .with(EnvFilter::from_default_env()) - .try_init() - .ok(); - - let server_key = SecretKey::generate(); - let a_key = SecretKey::generate(); - let b_key = SecretKey::generate(); - - // create tls_config - let tls_config = make_tls_config(); - - // start server - let mut server = ServerBuilder::new("127.0.0.1:0".parse().unwrap()) - .secret_key(Some(server_key)) - .tls_config(Some(tls_config)) - .spawn() - .await?; - - let addr = server.addr(); - - // get dial info - let port = addr.port(); - let addr = { - if let std::net::IpAddr::V4(ipv4_addr) = addr.ip() { - ipv4_addr - } else { - anyhow::bail!("cannot get ipv4 addr from socket addr {addr:?}"); - } - }; - info!("Relay listening on: {addr}:{port}"); - - let url: Url = format!("https://localhost:{port}").parse().unwrap(); - - // create clients - let (a_key, mut a_recv, client_a_task, client_a) = create_test_client(a_key, url.clone()); - info!("created client {a_key:?}"); - let (b_key, mut b_recv, client_b_task, client_b) = create_test_client(b_key, url); - info!("created client {b_key:?}"); - - client_a.ping().await?; - client_b.ping().await?; - - info!("sending message from a to b"); - let msg = Bytes::from_static(b"hi there, client b!"); - client_a.send(b_key, msg.clone()).await?; - info!("waiting for message from a on b"); - let (got_key, got_msg) = b_recv.recv().await.expect("expected message from client_a"); - assert_eq!(a_key, got_key); - assert_eq!(msg, got_msg); - - info!("sending message from b to a"); - let msg = Bytes::from_static(b"right back at ya, client b!"); - client_b.send(a_key, msg.clone()).await?; - info!("waiting for message b on a"); - let (got_key, got_msg) = a_recv.recv().await.expect("expected message from client_b"); - assert_eq!(b_key, got_key); - assert_eq!(msg, got_msg); - - server.shutdown(); - server.task_handle().await?; - client_a.close().await?; - client_a_task.abort(); - client_b.close().await?; - client_b_task.abort(); - Ok(()) - } -} diff --git a/iroh-net/src/relay/http/client.rs b/iroh-net/src/relay/http/client.rs deleted file mode 100644 index 97d3303184..0000000000 --- a/iroh-net/src/relay/http/client.rs +++ /dev/null @@ -1,1112 +0,0 @@ -//! 
Based on tailscale/derp/derphttp/derphttp_client.go - -use std::collections::HashMap; -use std::net::{IpAddr, SocketAddr}; -use std::sync::Arc; -use std::time::Duration; - -use base64::{engine::general_purpose::URL_SAFE, Engine as _}; -use bytes::Bytes; -use futures_lite::future::Boxed as BoxFuture; -use futures_util::StreamExt; -use http_body_util::Empty; -use hyper::body::Incoming; -use hyper::header::UPGRADE; -use hyper::upgrade::Parts; -use hyper::Request; -use hyper_util::rt::TokioIo; -use rand::Rng; -use rustls::client::Resumption; -use tokio::io::{AsyncRead, AsyncWrite}; -use tokio::net::TcpStream; -use tokio::sync::{mpsc, oneshot}; -use tokio::task::JoinSet; -use tokio::time::Instant; -use tokio_util::codec::{FramedRead, FramedWrite}; -use tracing::{debug, error, event, info_span, trace, warn, Instrument, Level}; -use url::Url; - -use crate::dns::{DnsResolver, ResolverExt}; -use crate::key::{PublicKey, SecretKey}; -use crate::relay::client::{ConnReader, ConnWriter}; -use crate::relay::codec::DerpCodec; -use crate::relay::http::streams::{downcast_upgrade, MaybeTlsStream}; -use crate::relay::http::RELAY_PATH; -use crate::relay::RelayUrl; -use crate::relay::{ - client::Client as RelayClient, client::ClientBuilder as RelayClientBuilder, - client::ClientReceiver as RelayClientReceiver, client::ReceivedMessage, -}; -use crate::util::chain; -use crate::util::AbortingJoinHandle; - -use super::streams::ProxyStream; -use super::Protocol; - -const DIAL_NODE_TIMEOUT: Duration = Duration::from_millis(1500); -const PING_TIMEOUT: Duration = Duration::from_secs(5); -const CONNECT_TIMEOUT: Duration = Duration::from_secs(10); -const DNS_TIMEOUT: Duration = Duration::from_secs(1); - -/// Possible connection errors on the [`Client`] -#[derive(Debug, thiserror::Error)] -pub enum ClientError { - /// The client is closed - #[error("client is closed")] - Closed, - /// There no underlying relay [`super::client::Client`] client exists for this http relay [`Client`] - #[error("no relay client")] - NoClient, - /// There was an error sending a packet - #[error("error sending a packet")] - Send, - /// There was an error receiving a packet - #[error("error receiving a packet: {0:?}")] - Receive(anyhow::Error), - /// There was a connection timeout error - #[error("connect timeout")] - ConnectTimeout, - /// No relay nodes are available - #[error("Relay node is not available")] - RelayNodeNotAvail, - /// No relay nodes are available with that name - #[error("no nodes available for {0}")] - NoNodeForTarget(String), - /// The relay node specified only allows STUN requests - #[error("no relay nodes found for {0}, only are stun_only nodes")] - StunOnlyNodesFound(String), - /// There was an error dialing - #[error("dial error")] - DialIO(#[from] std::io::Error), - /// There was an error from the task doing the dialing - #[error("dial error")] - DialTask(#[from] tokio::task::JoinError), - /// Both IPv4 and IPv6 are disabled for this relay node - #[error("both IPv4 and IPv6 are explicitly disabled for this node")] - IPDisabled, - /// No local addresses exist - #[error("no local addr: {0}")] - NoLocalAddr(String), - /// There was http server [`hyper::Error`] - #[error("http connection error")] - Hyper(#[from] hyper::Error), - /// There was an http error [`http::Error`]. 
- #[error("http error")] - Http(#[from] http::Error), - /// There was an unexpected status code - #[error("unexpected status code: expected {0}, got {1}")] - UnexpectedStatusCode(hyper::StatusCode, hyper::StatusCode), - /// The connection failed to upgrade - #[error("failed to upgrade connection: {0}")] - Upgrade(String), - /// The connection failed to proxy - #[error("failed to proxy connection: {0}")] - Proxy(String), - /// The relay [`super::client::Client`] failed to build - #[error("failed to build relay client: {0}")] - Build(String), - /// The ping request timed out - #[error("ping timeout")] - PingTimeout, - /// The ping request was aborted - #[error("ping aborted")] - PingAborted, - /// This [`Client`] cannot acknowledge pings - #[error("cannot acknowledge pings")] - CannotAckPings, - /// The given [`Url`] is invalid - #[error("invalid url: {0}")] - InvalidUrl(String), - /// There was an error with DNS resolution - #[error("dns: {0:?}")] - Dns(Option), - /// There was a timeout resolving DNS. - #[error("dns timeout")] - DnsTimeout, - /// The inner actor is gone, likely means things are shutdown. - #[error("actor gone")] - ActorGone, - /// An error related to websockets, either errors with parsing ws messages or the handshake - #[error("websocket error: {0}")] - WebsocketError(#[from] tokio_tungstenite_wasm::Error), -} - -/// An HTTP Relay client. -/// -/// Cheaply clonable. -#[derive(Clone, Debug)] -pub struct Client { - inner: mpsc::Sender, - public_key: PublicKey, - #[allow(dead_code)] - recv_loop: Arc>, -} - -#[derive(Debug)] -enum ActorMessage { - Connect(oneshot::Sender>), - NotePreferred(bool), - LocalAddr(oneshot::Sender, ClientError>>), - Ping(oneshot::Sender>), - Pong([u8; 8], oneshot::Sender>), - Send(PublicKey, Bytes, oneshot::Sender>), - Close(oneshot::Sender>), - CloseForReconnect(oneshot::Sender>), - IsConnected(oneshot::Sender>), -} - -/// Receiving end of a [`Client`]. -#[derive(Debug)] -pub struct ClientReceiver { - msg_receiver: mpsc::Receiver>, -} - -#[derive(derive_more::Debug)] -struct Actor { - secret_key: SecretKey, - can_ack_pings: bool, - is_preferred: bool, - relay_client: Option<(RelayClient, RelayClientReceiver)>, - is_closed: bool, - #[debug("address family selector callback")] - address_family_selector: Option BoxFuture + Send + Sync + 'static>>, - conn_gen: usize, - url: RelayUrl, - protocol: Protocol, - #[debug("TlsConnector")] - tls_connector: tokio_rustls::TlsConnector, - pings: PingTracker, - ping_tasks: JoinSet<()>, - dns_resolver: DnsResolver, - proxy_url: Option, -} - -#[derive(Default, Debug)] -struct PingTracker(HashMap<[u8; 8], oneshot::Sender<()>>); - -impl PingTracker { - /// Note that we have sent a ping, and store the [`oneshot::Sender`] we - /// must notify when the pong returns - fn register(&mut self) -> ([u8; 8], oneshot::Receiver<()>) { - let data = rand::thread_rng().gen::<[u8; 8]>(); - let (send, recv) = oneshot::channel(); - self.0.insert(data, send); - (data, recv) - } - - /// Remove the associated [`oneshot::Sender`] for `data` & return it. - /// - /// If there is no [`oneshot::Sender`] in the tracker, return `None`. - fn unregister(&mut self, data: [u8; 8], why: &'static str) -> Option> { - trace!("removing ping {}: {}", hex::encode(data), why); - self.0.remove(&data) - } -} - -/// Build a Client. 
-#[derive(derive_more::Debug)] -pub struct ClientBuilder { - /// Default is false - can_ack_pings: bool, - /// Default is false - is_preferred: bool, - /// Default is None - #[debug("address family selector callback")] - address_family_selector: Option BoxFuture + Send + Sync + 'static>>, - /// Default is false - is_prober: bool, - /// Expected PublicKey of the server - server_public_key: Option, - /// Server url. - url: RelayUrl, - /// Relay protocol - protocol: Protocol, - /// Allow self-signed certificates from relay servers - #[cfg(any(test, feature = "test-utils"))] - insecure_skip_cert_verify: bool, - /// HTTP Proxy - proxy_url: Option, -} - -impl ClientBuilder { - /// Create a new [`ClientBuilder`] - pub fn new(url: impl Into) -> Self { - ClientBuilder { - can_ack_pings: false, - is_preferred: false, - address_family_selector: None, - is_prober: false, - server_public_key: None, - url: url.into(), - protocol: Protocol::Relay, - #[cfg(any(test, feature = "test-utils"))] - insecure_skip_cert_verify: false, - proxy_url: None, - } - } - - /// Sets the server url - pub fn server_url(mut self, url: impl Into) -> Self { - self.url = url.into(); - self - } - - /// Sets whether to connect to the relay via websockets or not. - /// Set to use non-websocket, normal relaying by default. - pub fn protocol(mut self, protocol: Protocol) -> Self { - self.protocol = protocol; - self - } - - /// Returns if we should prefer ipv6 - /// it replaces the relayhttp.AddressFamilySelector we pass - /// It provides the hint as to whether in an IPv4-vs-IPv6 race that - /// IPv4 should be held back a bit to give IPv6 a better-than-50/50 - /// chance of winning. We only return true when we believe IPv6 will - /// work anyway, so we don't artificially delay the connection speed. - pub fn address_family_selector(mut self, selector: S) -> Self - where - S: Fn() -> BoxFuture + Send + Sync + 'static, - { - self.address_family_selector = Some(Box::new(selector)); - self - } - - /// Enable this [`Client`] to acknowledge pings. - pub fn can_ack_pings(mut self, can: bool) -> Self { - self.can_ack_pings = can; - self - } - - /// Indicate this client is the preferred way to communicate - /// to the peer with this client's [`PublicKey`] - pub fn is_preferred(mut self, is: bool) -> Self { - self.is_preferred = is; - self - } - - /// Indicates this client is a prober - pub fn is_prober(mut self, is: bool) -> Self { - self.is_prober = is; - self - } - - /// Skip the verification of the relay server's SSL certificates. - /// - /// May only be used in tests. - #[cfg(any(test, feature = "test-utils"))] - pub fn insecure_skip_cert_verify(mut self, skip: bool) -> Self { - self.insecure_skip_cert_verify = skip; - self - } - - /// Set an explicit proxy url to proxy all HTTP(S) traffic through. 
- pub fn proxy_url(mut self, url: Url) -> Self { - self.proxy_url.replace(url); - self - } - - /// Build the [`Client`] - pub fn build(self, key: SecretKey, dns_resolver: DnsResolver) -> (Client, ClientReceiver) { - // TODO: review TLS config - let mut roots = rustls::RootCertStore::empty(); - roots.add_trust_anchors(webpki_roots::TLS_SERVER_ROOTS.iter().map(|ta| { - rustls::OwnedTrustAnchor::from_subject_spki_name_constraints( - ta.subject, - ta.spki, - ta.name_constraints, - ) - })); - let mut config = rustls::client::ClientConfig::builder() - .with_safe_defaults() - .with_root_certificates(roots) - .with_no_client_auth(); - #[cfg(any(test, feature = "test-utils"))] - if self.insecure_skip_cert_verify { - warn!("Insecure config: SSL certificates from relay servers will be trusted without verification"); - config - .dangerous() - .set_certificate_verifier(Arc::new(NoCertVerifier)); - } - - config.resumption = Resumption::default(); - - let tls_connector: tokio_rustls::TlsConnector = Arc::new(config).into(); - let public_key = key.public(); - - let inner = Actor { - secret_key: key, - can_ack_pings: self.can_ack_pings, - is_preferred: self.is_preferred, - relay_client: None, - is_closed: false, - address_family_selector: self.address_family_selector, - conn_gen: 0, - pings: PingTracker::default(), - ping_tasks: Default::default(), - url: self.url, - protocol: self.protocol, - tls_connector, - dns_resolver, - proxy_url: self.proxy_url, - }; - - let (msg_sender, inbox) = mpsc::channel(64); - let (s, r) = mpsc::channel(64); - let recv_loop = tokio::task::spawn( - async move { inner.run(inbox, s).await }.instrument(info_span!("client")), - ); - - ( - Client { - public_key, - inner: msg_sender, - recv_loop: Arc::new(recv_loop.into()), - }, - ClientReceiver { msg_receiver: r }, - ) - } - - /// The expected [`PublicKey`] of the relay server we are connecting to. - pub fn server_public_key(mut self, server_public_key: PublicKey) -> Self { - self.server_public_key = Some(server_public_key); - self - } -} - -impl ClientReceiver { - /// Reads a message from the server. Returns the message and the `conn_get`, or the number of - /// re-connections this Client has ever made - pub async fn recv(&mut self) -> Option> { - self.msg_receiver.recv().await - } -} - -impl Client { - /// The public key for this client - pub fn public_key(&self) -> PublicKey { - self.public_key - } - - async fn send_actor(&self, msg_create: F) -> Result - where - F: FnOnce(oneshot::Sender>) -> ActorMessage, - { - let (s, r) = oneshot::channel(); - let msg = msg_create(s); - match self.inner.send(msg).await { - Ok(_) => { - let res = r.await.map_err(|_| ClientError::ActorGone)??; - Ok(res) - } - Err(_) => Err(ClientError::ActorGone), - } - } - - /// Connect to a relay Server and returns the underlying relay Client. - /// - /// Returns [`ClientError::Closed`] if the [`Client`] is closed. - /// - /// If there is already an active relay connection, returns the already - /// connected [`crate::relay::client::Client`]. - pub async fn connect(&self) -> Result<(RelayClient, usize), ClientError> { - self.send_actor(ActorMessage::Connect).await - } - - /// Let the server know that this client is the preferred client - pub async fn note_preferred(&self, is_preferred: bool) { - self.inner - .send(ActorMessage::NotePreferred(is_preferred)) - .await - .ok(); - } - - /// Get the local addr of the connection. If there is no current underlying relay connection - /// or the [`Client`] is closed, returns `None`. 
- pub async fn local_addr(&self) -> Option { - self.send_actor(ActorMessage::LocalAddr) - .await - .ok() - .flatten() - } - - /// Send a ping to the server. Return once we get an expected pong. - /// - /// There must be a task polling `recv_detail` to process the `pong` response. - pub async fn ping(&self) -> Result { - self.send_actor(ActorMessage::Ping).await - } - - /// Send a pong back to the server. - /// - /// If there is no underlying active relay connection, it creates one before attempting to - /// send the pong message. - /// - /// If there is an error sending pong, it closes the underlying relay connection before - /// returning. - pub async fn send_pong(&self, data: [u8; 8]) -> Result<(), ClientError> { - self.send_actor(|s| ActorMessage::Pong(data, s)).await - } - - /// Send a packet to the server. - /// - /// If there is no underlying active relay connection, it creates one before attempting to - /// send the message. - /// - /// If there is an error sending the packet, it closes the underlying relay connection before - /// returning. - pub async fn send(&self, dst_key: PublicKey, b: Bytes) -> Result<(), ClientError> { - self.send_actor(|s| ActorMessage::Send(dst_key, b, s)).await - } - - /// Close the http relay connection. - pub async fn close(self) -> Result<(), ClientError> { - self.send_actor(ActorMessage::Close).await - } - - /// Disconnect the http relay connection. - pub async fn close_for_reconnect(&self) -> Result<(), ClientError> { - self.send_actor(ActorMessage::CloseForReconnect).await - } - - /// Returns `true` if the underlying relay connection is established. - pub async fn is_connected(&self) -> Result { - self.send_actor(ActorMessage::IsConnected).await - } -} - -impl Actor { - async fn run( - mut self, - mut inbox: mpsc::Receiver, - msg_sender: mpsc::Sender>, - ) { - // Add an initial connection attempt. - if let Err(err) = self.connect("initial connect").await { - msg_sender.send(Err(err)).await.ok(); - } - - loop { - tokio::select! 
{ - res = self.recv_detail() => { - if let Ok((ReceivedMessage::Pong(ping), _)) = res { - match self.pings.unregister(ping, "pong") { - Some(chan) => { - if chan.send(()).is_err() { - warn!("pong received for ping {ping:?}, but the receiving channel was closed"); - } - } - None => { - warn!("pong received for ping {ping:?}, but not registered"); - } - } - continue; - } - msg_sender.send(res).await.ok(); - } - Some(msg) = inbox.recv() => { - match msg { - ActorMessage::Connect(s) => { - let res = self.connect("actor msg").await.map(|(client, _, count)| (client, count)); - s.send(res).ok(); - }, - ActorMessage::NotePreferred(is_preferred) => { - self.note_preferred(is_preferred).await; - }, - ActorMessage::LocalAddr(s) => { - let res = self.local_addr(); - s.send(Ok(res)).ok(); - }, - ActorMessage::Ping(s) => { - self.ping(s).await; - }, - ActorMessage::Pong(data, s) => { - let res = self.send_pong(data).await; - s.send(res).ok(); - }, - ActorMessage::Send(key, data, s) => { - let res = self.send(key, data).await; - s.send(res).ok(); - }, - ActorMessage::Close(s) => { - let res = self.close().await; - s.send(Ok(res)).ok(); - // shutting down - break; - }, - ActorMessage::CloseForReconnect(s) => { - let res = self.close_for_reconnect().await; - s.send(Ok(res)).ok(); - }, - ActorMessage::IsConnected(s) => { - let res = self.is_connected(); - s.send(Ok(res)).ok(); - }, - } - } - else => { - // Shutting down - self.close().await; - break; - } - } - } - } - - async fn connect( - &mut self, - why: &'static str, - ) -> Result<(RelayClient, &'_ mut RelayClientReceiver, usize), ClientError> { - debug!( - "connect: {}, current client {}", - why, - self.relay_client.is_some() - ); - - if self.is_closed { - return Err(ClientError::Closed); - } - async move { - if self.relay_client.is_none() { - trace!("no connection, trying to connect"); - let (relay_client, receiver) = - tokio::time::timeout(CONNECT_TIMEOUT, self.connect_0()) - .await - .map_err(|_| ClientError::ConnectTimeout)??; - - self.relay_client = Some((relay_client.clone(), receiver)); - self.next_conn(); - } else { - trace!("already had connection"); - } - let count = self.current_conn(); - let (relay_client, receiver) = self - .relay_client - .as_mut() - .map(|(c, r)| (c.clone(), r)) - .expect("just checked"); - - Ok((relay_client, receiver, count)) - } - .instrument(info_span!("connect")) - .await - } - - async fn connect_0(&self) -> Result<(RelayClient, RelayClientReceiver), ClientError> { - let (reader, writer, local_addr) = match self.protocol { - Protocol::Websocket => { - let (reader, writer) = self.connect_ws().await?; - let local_addr = None; - (reader, writer, local_addr) - } - Protocol::Relay => { - let (reader, writer, local_addr) = self.connect_derp().await?; - (reader, writer, Some(local_addr)) - } - }; - - let (relay_client, receiver) = - RelayClientBuilder::new(self.secret_key.clone(), local_addr, reader, writer) - .build() - .await - .map_err(|e| ClientError::Build(e.to_string()))?; - - if self.is_preferred && relay_client.note_preferred(true).await.is_err() { - relay_client.close().await; - return Err(ClientError::Send); - } - - event!( - target: "events.net.relay.connected", - Level::DEBUG, - home = self.is_preferred, - url = %self.url, - ); - - trace!("connect_0 done"); - Ok((relay_client, receiver)) - } - - async fn connect_ws(&self) -> Result<(ConnReader, ConnWriter), ClientError> { - let mut dial_url = (*self.url).clone(); - dial_url.set_path(RELAY_PATH); - // The relay URL is exchanged with the http(s) scheme in tickets 
and similar. - // We need to use the ws:// or wss:// schemes when connecting with websockets, though. - dial_url - .set_scheme(if self.use_tls() { "wss" } else { "ws" }) - .map_err(|()| ClientError::InvalidUrl(self.url.to_string()))?; - - debug!(%dial_url, "Dialing relay by websocket"); - - let (writer, reader) = tokio_tungstenite_wasm::connect(dial_url).await?.split(); - - let reader = ConnReader::Ws(reader); - let writer = ConnWriter::Ws(writer); - - Ok((reader, writer)) - } - - async fn connect_derp(&self) -> Result<(ConnReader, ConnWriter, SocketAddr), ClientError> { - let tcp_stream = self.dial_url().await?; - - let local_addr = tcp_stream - .local_addr() - .map_err(|e| ClientError::NoLocalAddr(e.to_string()))?; - - debug!(server_addr = ?tcp_stream.peer_addr(), %local_addr, "TCP stream connected"); - - let response = if self.use_tls() { - debug!("Starting TLS handshake"); - let hostname = self - .tls_servername() - .ok_or_else(|| ClientError::InvalidUrl("No tls servername".into()))?; - let tls_stream = self.tls_connector.connect(hostname, tcp_stream).await?; - debug!("tls_connector connect success"); - Self::start_upgrade(tls_stream).await? - } else { - debug!("Starting handshake"); - Self::start_upgrade(tcp_stream).await? - }; - - if response.status() != hyper::StatusCode::SWITCHING_PROTOCOLS { - error!( - "expected status 101 SWITCHING_PROTOCOLS, got: {}", - response.status() - ); - return Err(ClientError::UnexpectedStatusCode( - hyper::StatusCode::SWITCHING_PROTOCOLS, - response.status(), - )); - } - - debug!("starting upgrade"); - let upgraded = match hyper::upgrade::on(response).await { - Ok(upgraded) => upgraded, - Err(err) => { - warn!("upgrade failed: {:#}", err); - return Err(ClientError::Hyper(err)); - } - }; - - debug!("connection upgraded"); - let (reader, writer) = - downcast_upgrade(upgraded).map_err(|e| ClientError::Upgrade(e.to_string()))?; - - let reader = ConnReader::Derp(FramedRead::new(reader, DerpCodec)); - let writer = ConnWriter::Derp(FramedWrite::new(writer, DerpCodec)); - - Ok((reader, writer, local_addr)) - } - - /// Sends the HTTP upgrade request to the relay server. - async fn start_upgrade(io: T) -> Result, ClientError> - where - T: AsyncRead + AsyncWrite + Send + Unpin + 'static, - { - let io = hyper_util::rt::TokioIo::new(io); - let (mut request_sender, connection) = hyper::client::conn::http1::Builder::new() - .handshake(io) - .await?; - tokio::spawn( - // This task drives the HTTP exchange, completes once connection is upgraded. 
- async move { - debug!("HTTP upgrade driver started"); - if let Err(err) = connection.with_upgrades().await { - error!("HTTP upgrade error: {err:#}"); - } - debug!("HTTP upgrade driver finished"); - } - .instrument(info_span!("http-driver")), - ); - debug!("Sending upgrade request"); - let req = Request::builder() - .uri(RELAY_PATH) - .header(UPGRADE, Protocol::Relay.upgrade_header()) - .body(http_body_util::Empty::::new())?; - request_sender.send_request(req).await.map_err(From::from) - } - - async fn note_preferred(&mut self, is_preferred: bool) { - let old = &mut self.is_preferred; - if *old == is_preferred { - return; - } - *old = is_preferred; - - // only send the preference if we already have a connection - let res = { - if let Some((ref client, _)) = self.relay_client { - client.note_preferred(is_preferred).await - } else { - return; - } - }; - // need to do this outside the above closure because they rely on the same lock - // if there was an error sending, close the underlying relay connection - if res.is_err() { - self.close_for_reconnect().await; - } - } - - fn local_addr(&self) -> Option { - if self.is_closed { - return None; - } - if let Some((ref client, _)) = self.relay_client { - client.local_addr() - } else { - None - } - } - - async fn ping(&mut self, s: oneshot::Sender>) { - let connect_res = self.connect("ping").await.map(|(c, _, _)| c); - let (ping, recv) = self.pings.register(); - trace!("ping: {}", hex::encode(ping)); - - self.ping_tasks.spawn(async move { - let res = match connect_res { - Ok(client) => { - let start = Instant::now(); - if let Err(err) = client.send_ping(ping).await { - warn!("failed to send ping: {:?}", err); - Err(ClientError::Send) - } else { - match tokio::time::timeout(PING_TIMEOUT, recv).await { - Ok(Ok(())) => Ok(start.elapsed()), - Err(_) => Err(ClientError::PingTimeout), - Ok(Err(_)) => Err(ClientError::PingAborted), - } - } - } - Err(err) => Err(err), - }; - s.send(res).ok(); - }); - } - - async fn send(&mut self, dst_key: PublicKey, b: Bytes) -> Result<(), ClientError> { - trace!(dst = %dst_key.fmt_short(), len = b.len(), "send"); - let (client, _, _) = self.connect("send").await?; - if client.send(dst_key, b).await.is_err() { - self.close_for_reconnect().await; - return Err(ClientError::Send); - } - Ok(()) - } - - async fn send_pong(&mut self, data: [u8; 8]) -> Result<(), ClientError> { - debug!("send_pong"); - if self.can_ack_pings { - let (client, _, _) = self.connect("send_pong").await?; - if client.send_pong(data).await.is_err() { - self.close_for_reconnect().await; - return Err(ClientError::Send); - } - Ok(()) - } else { - Err(ClientError::CannotAckPings) - } - } - - async fn close(mut self) { - if !self.is_closed { - self.is_closed = true; - self.close_for_reconnect().await; - } - } - - fn is_connected(&self) -> bool { - if self.is_closed { - return false; - } - self.relay_client.is_some() - } - - fn current_conn(&self) -> usize { - self.conn_gen - } - - fn next_conn(&mut self) -> usize { - self.conn_gen = self.conn_gen.wrapping_add(1); - self.conn_gen - } - - fn tls_servername(&self) -> Option { - self.url - .host_str() - .and_then(|s| rustls::ServerName::try_from(s).ok()) - } - - fn use_tls(&self) -> bool { - // only disable tls if we are explicitly dialing a http url - #[allow(clippy::match_like_matches_macro)] - match self.url.scheme() { - "http" => false, - "ws" => false, - _ => true, - } - } - - async fn dial_url(&self) -> Result { - if let Some(ref proxy) = self.proxy_url { - let stream = 
self.dial_url_proxy(proxy.clone()).await?; - Ok(ProxyStream::Proxied(stream)) - } else { - let stream = self.dial_url_direct().await?; - Ok(ProxyStream::Raw(stream)) - } - } - - async fn dial_url_direct(&self) -> Result { - debug!(%self.url, "dial url"); - let prefer_ipv6 = self.prefer_ipv6().await; - let dst_ip = resolve_host(&self.dns_resolver, &self.url, prefer_ipv6).await?; - - let port = url_port(&self.url) - .ok_or_else(|| ClientError::InvalidUrl("missing url port".into()))?; - let addr = SocketAddr::new(dst_ip, port); - - debug!("connecting to {}", addr); - let tcp_stream = - tokio::time::timeout( - DIAL_NODE_TIMEOUT, - async move { TcpStream::connect(addr).await }, - ) - .await - .map_err(|_| ClientError::ConnectTimeout)? - .map_err(ClientError::DialIO)?; - - tcp_stream.set_nodelay(true)?; - - Ok(tcp_stream) - } - - async fn dial_url_proxy( - &self, - proxy_url: Url, - ) -> Result, MaybeTlsStream>, ClientError> { - debug!(%self.url, %proxy_url, "dial url via proxy"); - - // Resolve proxy DNS - let prefer_ipv6 = self.prefer_ipv6().await; - let proxy_ip = resolve_host(&self.dns_resolver, &proxy_url, prefer_ipv6).await?; - - let proxy_port = url_port(&proxy_url) - .ok_or_else(|| ClientError::Proxy("missing proxy url port".into()))?; - let proxy_addr = SocketAddr::new(proxy_ip, proxy_port); - - debug!(%proxy_addr, "connecting to proxy"); - - let tcp_stream = tokio::time::timeout(DIAL_NODE_TIMEOUT, async move { - TcpStream::connect(proxy_addr).await - }) - .await - .map_err(|_| ClientError::ConnectTimeout)? - .map_err(ClientError::DialIO)?; - - tcp_stream.set_nodelay(true)?; - - // Setup TLS if necessary - let io = if proxy_url.scheme() == "http" { - MaybeTlsStream::Raw(tcp_stream) - } else { - let hostname = proxy_url - .host_str() - .and_then(|s| rustls::ServerName::try_from(s).ok()) - .ok_or_else(|| ClientError::InvalidUrl("No tls servername for proxy url".into()))?; - let tls_stream = self.tls_connector.connect(hostname, tcp_stream).await?; - MaybeTlsStream::Tls(tls_stream) - }; - let io = TokioIo::new(io); - - let target_host = self - .url - .host_str() - .ok_or_else(|| ClientError::Proxy("missing proxy host".into()))?; - - let port = - url_port(&self.url).ok_or_else(|| ClientError::Proxy("invalid target port".into()))?; - - // Establish Proxy Tunnel - let mut req_builder = Request::builder() - .uri(format!("{}:{}", target_host, port)) - .method("CONNECT") - .header("Host", target_host) - .header("Proxy-Connection", "Keep-Alive"); - if !proxy_url.username().is_empty() { - // Passthrough authorization - // https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Proxy-Authorization - debug!( - "setting proxy-authorization: username={}", - proxy_url.username() - ); - let to_encode = format!( - "{}:{}", - proxy_url.username(), - proxy_url.password().unwrap_or_default() - ); - let encoded = URL_SAFE.encode(to_encode); - req_builder = req_builder.header("Proxy-Authorization", format!("Basic {}", encoded)); - } - let req = req_builder.body(Empty::::new())?; - - debug!("Sending proxy request: {:?}", req); - - let (mut sender, conn) = hyper::client::conn::http1::handshake(io).await?; - tokio::task::spawn(async move { - if let Err(err) = conn.with_upgrades().await { - error!("Proxy connection failed: {:?}", err); - } - }); - - let res = sender.send_request(req).await?; - if !res.status().is_success() { - return Err(ClientError::Proxy(format!( - "failed to connect to proxy: {}", - res.status(), - ))); - } - - let upgraded = hyper::upgrade::on(res).await?; - let Ok(Parts { io, read_buf, .. 
}) = upgraded.downcast::>() else { - return Err(ClientError::Proxy("invalid upgrade".to_string())); - }; - - let res = chain::chain(std::io::Cursor::new(read_buf), io.into_inner()); - - Ok(res) - } - - /// Reports whether IPv4 dials should be slightly - /// delayed to give IPv6 a better chance of winning dial races. - /// Implementations should only return true if IPv6 is expected - /// to succeed. (otherwise delaying IPv4 will delay the connection - /// overall) - async fn prefer_ipv6(&self) -> bool { - match self.address_family_selector { - Some(ref selector) => selector().await, - None => false, - } - } - - async fn recv_detail(&mut self) -> Result<(ReceivedMessage, usize), ClientError> { - if let Some((_client, client_receiver)) = self.relay_client.as_mut() { - trace!("recv_detail tick"); - match client_receiver.recv().await { - Ok(msg) => { - let current_gen = self.current_conn(); - return Ok((msg, current_gen)); - } - Err(e) => { - self.close_for_reconnect().await; - if self.is_closed { - return Err(ClientError::Closed); - } - // TODO(ramfox): more specific error? - return Err(ClientError::Receive(e)); - } - } - } - std::future::pending().await - } - - /// Close the underlying relay connection. The next time the client takes some action that - /// requires a connection, it will call `connect`. - async fn close_for_reconnect(&mut self) { - debug!("close for reconnect"); - if let Some((client, _)) = self.relay_client.take() { - client.close().await - } - } -} - -async fn resolve_host( - resolver: &DnsResolver, - url: &Url, - prefer_ipv6: bool, -) -> Result { - let host = url - .host() - .ok_or_else(|| ClientError::InvalidUrl("missing host".into()))?; - match host { - url::Host::Domain(domain) => { - // Need to do a DNS lookup - let mut addrs = resolver - .lookup_ipv4_ipv6(domain, DNS_TIMEOUT) - .await - .map_err(|e| ClientError::Dns(Some(e)))? 
- .peekable(); - - let found = if prefer_ipv6 { - let first = addrs.peek().copied(); - addrs.find(IpAddr::is_ipv6).or(first) - } else { - addrs.next() - }; - - found.ok_or_else(|| ClientError::Dns(None)) - } - url::Host::Ipv4(ip) => Ok(IpAddr::V4(ip)), - url::Host::Ipv6(ip) => Ok(IpAddr::V6(ip)), - } -} - -/// Used to allow self signed certificates in tests -#[cfg(any(test, feature = "test-utils"))] -struct NoCertVerifier; - -#[cfg(any(test, feature = "test-utils"))] -impl rustls::client::ServerCertVerifier for NoCertVerifier { - fn verify_server_cert( - &self, - _end_entity: &rustls::Certificate, - _intermediates: &[rustls::Certificate], - _server_name: &rustls::ServerName, - _scts: &mut dyn Iterator, - _ocsp_response: &[u8], - _now: std::time::SystemTime, - ) -> Result { - Ok(rustls::client::ServerCertVerified::assertion()) - } -} - -fn url_port(url: &Url) -> Option { - if let Some(port) = url.port() { - return Some(port); - } - - match url.scheme() { - "http" => Some(80), - "https" => Some(443), - _ => None, - } -} - -#[cfg(test)] -mod tests { - use anyhow::{bail, Result}; - - use crate::dns::default_resolver; - - use super::*; - - #[tokio::test] - async fn test_recv_detail_connect_error() -> Result<()> { - let _guard = iroh_test::logging::setup(); - - let key = SecretKey::generate(); - let bad_url: Url = "https://bad.url".parse().unwrap(); - let dns_resolver = default_resolver(); - - let (_client, mut client_receiver) = - ClientBuilder::new(bad_url).build(key.clone(), dns_resolver.clone()); - - // ensure that the client will bubble up any connection error & not - // just loop ad infinitum attempting to connect - if client_receiver.recv().await.and_then(|s| s.ok()).is_some() { - bail!("expected client with bad relay node detail to return with an error"); - } - Ok(()) - } -} diff --git a/iroh-net/src/relay/iroh_relay.rs b/iroh-net/src/relay/iroh_relay.rs deleted file mode 100644 index 4ada7965a0..0000000000 --- a/iroh-net/src/relay/iroh_relay.rs +++ /dev/null @@ -1,1051 +0,0 @@ -//! A full-fledged iroh-relay server. -//! -//! This module provides an API to run a full fledged iroh-relay server. It is primarily -//! used by the `iroh-relay` binary in this crate. It can be used to run a relay server in -//! other locations however. -//! -//! This code is fully written in a form of structured-concurrency: every spawned task is -//! always attached to a handle and when the handle is dropped the tasks abort. So tasks -//! can not outlive their handle. It is also always possible to await for completion of a -//! task. Some tasks additionally have a method to do graceful shutdown. - -use std::fmt; -use std::future::Future; -use std::net::SocketAddr; -use std::pin::Pin; -use std::sync::Arc; - -use anyhow::{anyhow, bail, Context, Result}; -use futures_lite::StreamExt; -use http::response::Builder as ResponseBuilder; -use http::{HeaderMap, Method, Request, Response, StatusCode}; -use hyper::body::Incoming; -use iroh_metrics::inc; -use tokio::net::{TcpListener, UdpSocket}; -use tokio::task::JoinSet; -use tracing::{debug, error, info, info_span, instrument, trace, warn, Instrument}; - -use crate::key::SecretKey; -use crate::relay; -use crate::relay::http::{ - ServerBuilder as RelayServerBuilder, TlsAcceptor, LEGACY_RELAY_PROBE_PATH, RELAY_PROBE_PATH, -}; -use crate::stun; -use crate::util::AbortingJoinHandle; - -// Module defined in this file. 
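The dialer above packs two easy-to-miss details: the resolver output is scanned for an IPv6 address only when `prefer_ipv6()` says so (falling back to the first address otherwise), and proxy credentials are passed through as a `Proxy-Authorization: Basic …` header on the CONNECT request. A minimal, self-contained sketch of both, using only `std` and the `base64` crate; `select_addr` and `proxy_basic_auth` are hypothetical helper names, not part of the patch:

```rust
use std::net::IpAddr;

use base64::{engine::general_purpose::URL_SAFE, Engine as _};

/// Picks an address the way `resolve_host` does above: the first IPv6
/// address when preferred and present, otherwise the first address.
fn select_addr(candidates: Vec<IpAddr>, prefer_ipv6: bool) -> Option<IpAddr> {
    let mut addrs = candidates.into_iter().peekable();
    if prefer_ipv6 {
        // Remember the first address as a fallback before scanning.
        let first = addrs.peek().copied();
        addrs.find(IpAddr::is_ipv6).or(first)
    } else {
        addrs.next()
    }
}

/// Builds the `Proxy-Authorization` value sent on the CONNECT request.
/// Note the URL-safe base64 alphabet, mirroring the client code above.
fn proxy_basic_auth(username: &str, password: Option<&str>) -> String {
    let to_encode = format!("{}:{}", username, password.unwrap_or_default());
    format!("Basic {}", URL_SAFE.encode(to_encode))
}

fn main() {
    let addrs: Vec<IpAddr> = vec!["10.0.0.1".parse().unwrap(), "::1".parse().unwrap()];
    assert_eq!(select_addr(addrs.clone(), true), "::1".parse().ok());
    assert_eq!(select_addr(addrs, false), "10.0.0.1".parse().ok());
    assert_eq!(proxy_basic_auth("u", Some("p")), format!("Basic {}", URL_SAFE.encode("u:p")));
}
```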
-use metrics::StunMetrics; - -const NO_CONTENT_CHALLENGE_HEADER: &str = "X-Tailscale-Challenge"; -const NO_CONTENT_RESPONSE_HEADER: &str = "X-Tailscale-Response"; -const NOTFOUND: &[u8] = b"Not Found"; -const RELAY_DISABLED: &[u8] = b"relay server disabled"; -const ROBOTS_TXT: &[u8] = b"User-agent: *\nDisallow: /\n"; -const INDEX: &[u8] = br#" -
<html><body>
-<h1>Iroh Relay</h1>
-<p>
-  This is an <a href="https://iroh.computer/">Iroh Relay</a> server.
-</p>
-"#; -const TLS_HEADERS: [(&str, &str); 2] = [ - ("Strict-Transport-Security", "max-age=63072000; includeSubDomains"), - ("Content-Security-Policy", "default-src 'none'; frame-ancestors 'none'; form-action 'none'; base-uri 'self'; block-all-mixed-content; plugin-types 'none'") -]; - -type BytesBody = http_body_util::Full; -type HyperError = Box; -type HyperResult = std::result::Result; - -/// Creates a new [`BytesBody`] with no content. -fn body_empty() -> BytesBody { - http_body_util::Full::new(hyper::body::Bytes::new()) -} - -/// Configuration for the full Relay & STUN server. -/// -/// Be aware the generic parameters are for when using the Let's Encrypt TLS configuration. -/// If not used dummy ones need to be provided, e.g. `ServerConfig::<(), ()>::default()`. -#[derive(Debug, Default)] -pub struct ServerConfig { - /// Configuration for the Relay server, disabled if `None`. - pub relay: Option>, - /// Configuration for the STUN server, disabled if `None`. - pub stun: Option, - /// Socket to serve metrics on. - #[cfg(feature = "metrics")] - pub metrics_addr: Option, -} - -/// Configuration for the Relay HTTP and HTTPS server. -/// -/// This includes the HTTP services hosted by the Relay server, the Relay `/relay` HTTP -/// endpoint is only one of the services served. -#[derive(Debug)] -pub struct RelayConfig { - /// The iroh secret key of the Relay server. - pub secret_key: SecretKey, - /// The socket address on which the Relay HTTP server should bind. - /// - /// Normally you'd choose port `80`. The bind address for the HTTPS server is - /// configured in [`RelayConfig::tls`]. - /// - /// If [`RelayConfig::tls`] is `None` then this serves all the HTTP services without - /// TLS. - pub http_bind_addr: SocketAddr, - /// TLS configuration for the HTTPS server. - /// - /// If *None* all the HTTP services that would be served here are served from - /// [`RelayConfig::http_bind_addr`]. - pub tls: Option>, - /// Rate limits. - pub limits: Limits, -} - -/// Configuration for the STUN server. -#[derive(Debug)] -pub struct StunConfig { - /// The socket address on which the STUN server should bind. - /// - /// Normally you'd chose port `3478`, see [`crate::defaults::DEFAULT_STUN_PORT`]. - pub bind_addr: SocketAddr, -} - -/// TLS configuration for Relay server. -/// -/// Normally the Relay server accepts connections on both HTTPS and HTTP. -#[derive(Debug)] -pub struct TlsConfig { - /// The socket address on which to serve the HTTPS server. - /// - /// Since the captive portal probe has to run over plain text HTTP and TLS is used for - /// the main relay server this has to be on a different port. When TLS is not enabled - /// this is served on the [`RelayConfig::http_bind_addr`] socket address. - /// - /// Normally you'd choose port `80`. - pub https_bind_addr: SocketAddr, - /// Mode for getting a cert. - pub cert: CertConfig, -} - -/// Rate limits. -#[derive(Debug, Default)] -pub struct Limits { - /// Rate limit for accepting new connection. Unlimited if not set. - pub accept_conn_limit: Option, - /// Burst limit for accepting new connection. Unlimited if not set. - pub accept_conn_burst: Option, -} - -/// TLS certificate configuration. -#[derive(derive_more::Debug)] -pub enum CertConfig { - /// Use Let's Encrypt. - LetsEncrypt { - /// Configuration for Let's Encrypt certificates. - #[debug("AcmeConfig")] - config: tokio_rustls_acme::AcmeConfig, - }, - /// Use a static TLS key and certificate chain. - Manual { - /// The TLS private key. 
- private_key: rustls::PrivateKey, - /// The TLS certificate chain. - certs: Vec, - }, -} - -/// A running Relay + STUN server. -/// -/// This is a full Relay server, including STUN, Relay and various associated HTTP services. -/// -/// Dropping this will stop the server. -#[derive(Debug)] -pub struct Server { - /// The address of the HTTP server, if configured. - http_addr: Option, - /// The address of the STUN server, if configured. - stun_addr: Option, - /// The address of the HTTPS server, if the relay server is using TLS. - /// - /// If the Relay server is not using TLS then it is served from the - /// [`Server::http_addr`]. - https_addr: Option, - /// Handle to the relay server. - relay_handle: Option, - /// The main task running the server. - supervisor: AbortingJoinHandle>, -} - -impl Server { - /// Starts the server. - pub async fn spawn(config: ServerConfig) -> Result - where - EC: fmt::Debug + 'static, - EA: fmt::Debug + 'static, - { - let mut tasks = JoinSet::new(); - - #[cfg(feature = "metrics")] - if let Some(addr) = config.metrics_addr { - debug!("Starting metrics server"); - use iroh_metrics::core::Metric; - - iroh_metrics::core::Core::init(|reg, metrics| { - metrics.insert(crate::metrics::RelayMetrics::new(reg)); - metrics.insert(StunMetrics::new(reg)); - }); - tasks.spawn( - iroh_metrics::metrics::start_metrics_server(addr) - .instrument(info_span!("metrics-server")), - ); - } - - // Start the STUN server. - let stun_addr = match config.stun { - Some(stun) => { - debug!("Starting STUN server"); - match UdpSocket::bind(stun.bind_addr).await { - Ok(sock) => { - let addr = sock.local_addr()?; - info!("STUN server bound on {addr}"); - tasks.spawn( - server_stun_listener(sock).instrument(info_span!("stun-server", %addr)), - ); - Some(addr) - } - Err(err) => bail!("failed to bind STUN listener: {err:#?}"), - } - } - None => None, - }; - - // Start the Relay server. 
- let (relay_server, http_addr) = match config.relay { - Some(relay_config) => { - debug!("Starting Relay server"); - let mut headers = HeaderMap::new(); - for (name, value) in TLS_HEADERS.iter() { - headers.insert(*name, value.parse()?); - } - let relay_bind_addr = match relay_config.tls { - Some(ref tls) => tls.https_bind_addr, - None => relay_config.http_bind_addr, - }; - let mut builder = RelayServerBuilder::new(relay_bind_addr) - .secret_key(Some(relay_config.secret_key)) - .headers(headers) - .relay_override(Box::new(relay_disabled_handler)) - .request_handler(Method::GET, "/", Box::new(root_handler)) - .request_handler(Method::GET, "/index.html", Box::new(root_handler)) - .request_handler( - Method::GET, - LEGACY_RELAY_PROBE_PATH, - Box::new(probe_handler), - ) // backwards compat - .request_handler(Method::GET, RELAY_PROBE_PATH, Box::new(probe_handler)) - .request_handler(Method::GET, "/robots.txt", Box::new(robots_handler)); - let http_addr = match relay_config.tls { - Some(tls_config) => { - let server_config = rustls::ServerConfig::builder() - .with_safe_defaults() - .with_no_client_auth(); - let server_tls_config = match tls_config.cert { - CertConfig::LetsEncrypt { config } => { - let mut state = config.state(); - let server_config = - server_config.with_cert_resolver(state.resolver()); - let acceptor = TlsAcceptor::LetsEncrypt(state.acceptor()); - tasks.spawn( - async move { - while let Some(event) = state.next().await { - match event { - Ok(ok) => debug!("acme event: {ok:?}"), - Err(err) => error!("error: {err:?}"), - } - } - Err(anyhow!("acme event stream finished")) - } - .instrument(info_span!("acme")), - ); - Some(relay::http::TlsConfig { - config: Arc::new(server_config), - acceptor, - }) - } - CertConfig::Manual { private_key, certs } => { - let server_config = server_config - .with_single_cert(certs.clone(), private_key.clone())?; - let server_config = Arc::new(server_config); - let acceptor = - tokio_rustls::TlsAcceptor::from(server_config.clone()); - let acceptor = TlsAcceptor::Manual(acceptor); - Some(relay::http::TlsConfig { - config: server_config, - acceptor, - }) - } - }; - builder = builder.tls_config(server_tls_config); - - // Some services always need to be served over HTTP without TLS. Run - // these standalone. - let http_listener = TcpListener::bind(&relay_config.http_bind_addr) - .await - .context("failed to bind http")?; - let http_addr = http_listener.local_addr()?; - tasks.spawn( - run_captive_portal_service(http_listener) - .instrument(info_span!("http-service", addr = %http_addr)), - ); - Some(http_addr) - } - None => { - // If running Relay without TLS add the plain HTTP server directly - // to the Relay server. - builder = builder.request_handler( - Method::GET, - "/generate_204", - Box::new(serve_no_content_handler), - ); - None - } - }; - let relay_server = builder.spawn().await?; - (Some(relay_server), http_addr) - } - None => (None, None), - }; - // If http_addr is Some then relay_server is serving HTTPS. If http_addr is None - // relay_server is serving HTTP, including the /generate_204 service. 
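The `Option` combinators just below (`http_addr.or(relay_addr)` / `http_addr.and(relay_addr)`) are compact enough to misread. A tiny standalone illustration of both cases, with port numbers standing in for the socket addresses:

```rust
fn main() {
    // TLS enabled: the standalone listener provides the HTTP address
    // (`http_addr` is `Some`) and the relay itself serves HTTPS.
    let (http_addr, relay_addr) = (Some(80u16), Some(443u16));
    assert_eq!(http_addr.or(relay_addr), Some(80)); // served over HTTP
    assert_eq!(http_addr.and(relay_addr), Some(443)); // served over HTTPS

    // No TLS: the relay serves plain HTTP itself, so `http_addr` is `None`.
    let (http_addr, relay_addr) = (None::<u16>, Some(80u16));
    assert_eq!(http_addr.or(relay_addr), Some(80)); // served over HTTP
    assert_eq!(http_addr.and(relay_addr), None); // no HTTPS listener
}
```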
- let relay_addr = relay_server.as_ref().map(|srv| srv.addr()); - let relay_handle = relay_server.as_ref().map(|srv| srv.handle()); - let relay_server = relay_server.map(RelayHttpServerGuard); - let task = tokio::spawn(relay_supervisor(tasks, relay_server)); - Ok(Self { - http_addr: http_addr.or(relay_addr), - stun_addr, - https_addr: http_addr.and(relay_addr), - relay_handle, - supervisor: AbortingJoinHandle::from(task), - }) - } - - /// Requests graceful shutdown. - /// - /// Returns once all server tasks have stopped. - pub async fn shutdown(self) -> Result<()> { - // Only the Relay server needs shutting down, the supervisor will abort the tasks in - // the JoinSet when the server terminates. - if let Some(handle) = self.relay_handle { - handle.shutdown(); - } - self.supervisor.await? - } - - /// Returns the handle for the task. - /// - /// This allows waiting for the server's supervisor task to finish. Can be useful in - /// case there is an error in the server before it is shut down. - pub fn task_handle(&mut self) -> &mut AbortingJoinHandle> { - &mut self.supervisor - } - - /// The socket address the HTTPS server is listening on. - pub fn https_addr(&self) -> Option { - self.https_addr - } - - /// The socket address the HTTP server is listening on. - pub fn http_addr(&self) -> Option { - self.http_addr - } - - /// The socket address the STUN server is listening on. - pub fn stun_addr(&self) -> Option { - self.stun_addr - } -} - -/// Horrible hack to make [`relay::http::Server`] behave somewhat. -/// -/// We need this server to abort on drop to achieve structured concurrency. -// TODO: could consider building this directly into the relay::http::Server -#[derive(Debug)] -struct RelayHttpServerGuard(relay::http::Server); - -impl Drop for RelayHttpServerGuard { - fn drop(&mut self) { - self.0.task_handle().abort(); - } -} - -/// Supervisor for the relay server tasks. -/// -/// As soon as one of the tasks exits, all other tasks are stopped and the server stops. -/// The supervisor finishes once all tasks are finished. -#[instrument(skip_all)] -async fn relay_supervisor( - mut tasks: JoinSet>, - mut relay_http_server: Option, -) -> Result<()> { - let res = match (relay_http_server.as_mut(), tasks.len()) { - (None, _) => tasks - .join_next() - .await - .unwrap_or_else(|| Ok(Err(anyhow!("Nothing to supervise")))), - (Some(relay), 0) => relay.0.task_handle().await.map(anyhow::Ok), - (Some(relay), _) => { - tokio::select! { - biased; - Some(ret) = tasks.join_next() => ret, - ret = relay.0.task_handle() => ret.map(anyhow::Ok), - else => Ok(Err(anyhow!("Empty JoinSet (unreachable)"))), - } - } - }; - let ret = match res { - Ok(Ok(())) => { - debug!("Task exited"); - Ok(()) - } - Ok(Err(err)) => { - error!(%err, "Task failed"); - Err(err.context("task failed")) - } - Err(err) => { - if let Ok(panic) = err.try_into_panic() { - error!("Task panicked"); - std::panic::resume_unwind(panic); - } - debug!("Task cancelled"); - Err(anyhow!("task cancelled")) - } - }; - - // Ensure the HTTP server terminated, there is no harm in calling this after it is - // already shut down. The JoinSet is aborted on drop. - if let Some(server) = relay_http_server { - server.0.shutdown(); - } - - tasks.shutdown().await; - - ret -} - -/// Runs a STUN server. -/// -/// When the future is dropped, the server stops. 
-async fn server_stun_listener(sock: UdpSocket) -> Result<()> { - info!(addr = ?sock.local_addr().ok(), "running STUN server"); - let sock = Arc::new(sock); - let mut buffer = vec![0u8; 64 << 10]; - let mut tasks = JoinSet::new(); - loop { - tokio::select! { - biased; - _ = tasks.join_next(), if !tasks.is_empty() => (), - res = sock.recv_from(&mut buffer) => { - match res { - Ok((n, src_addr)) => { - inc!(StunMetrics, requests); - let pkt = &buffer[..n]; - if !stun::is(pkt) { - debug!(%src_addr, "STUN: ignoring non stun packet"); - inc!(StunMetrics, bad_requests); - continue; - } - let pkt = pkt.to_vec(); - tasks.spawn(handle_stun_request(src_addr, pkt, sock.clone())); - } - Err(err) => { - inc!(StunMetrics, failures); - warn!("failed to recv: {err:#}"); - } - } - } - } - } -} - -/// Handles a single STUN request, doing all logging required. -async fn handle_stun_request(src_addr: SocketAddr, pkt: Vec, sock: Arc) { - let handle = AbortingJoinHandle::from(tokio::task::spawn_blocking(move || { - match stun::parse_binding_request(&pkt) { - Ok(txid) => { - debug!(%src_addr, %txid, "STUN: received binding request"); - Some((txid, stun::response(txid, src_addr))) - } - Err(err) => { - inc!(StunMetrics, bad_requests); - warn!(%src_addr, "STUN: invalid binding request: {:?}", err); - None - } - } - })); - let (txid, response) = match handle.await { - Ok(Some(val)) => val, - Ok(None) => return, - Err(err) => { - error!("{err:#}"); - return; - } - }; - match sock.send_to(&response, src_addr).await { - Ok(len) => { - if len != response.len() { - warn!( - %src_addr, - %txid, - "failed to write response, {len}/{} bytes sent", - response.len() - ); - } else { - match src_addr { - SocketAddr::V4(_) => inc!(StunMetrics, ipv4_success), - SocketAddr::V6(_) => inc!(StunMetrics, ipv6_success), - } - } - trace!(%src_addr, %txid, "sent {len} bytes"); - } - Err(err) => { - inc!(StunMetrics, failures); - warn!(%src_addr, %txid, "failed to write response: {err:#}"); - } - } -} - -fn relay_disabled_handler( - _r: Request, - response: ResponseBuilder, -) -> HyperResult> { - response - .status(StatusCode::NOT_FOUND) - .body(RELAY_DISABLED.into()) - .map_err(|err| Box::new(err) as HyperError) -} - -fn root_handler( - _r: Request, - response: ResponseBuilder, -) -> HyperResult> { - response - .status(StatusCode::OK) - .header("Content-Type", "text/html; charset=utf-8") - .body(INDEX.into()) - .map_err(|err| Box::new(err) as HyperError) -} - -/// HTTP latency queries -fn probe_handler( - _r: Request, - response: ResponseBuilder, -) -> HyperResult> { - response - .status(StatusCode::OK) - .header("Access-Control-Allow-Origin", "*") - .body(body_empty()) - .map_err(|err| Box::new(err) as HyperError) -} - -fn robots_handler( - _r: Request, - response: ResponseBuilder, -) -> HyperResult> { - response - .status(StatusCode::OK) - .body(ROBOTS_TXT.into()) - .map_err(|err| Box::new(err) as HyperError) -} - -/// For captive portal detection. 
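The captive-portal handler that follows implements a small challenge/response: when the probe sends an `X-Tailscale-Challenge` header made of valid characters, the `204 No Content` reply echoes it back in `X-Tailscale-Response`. Before the handler itself, here is the client's side of that exchange as a hedged sketch; it mirrors `test_captive_portal_service` later in this patch, and the base URL is a placeholder:

```rust
use anyhow::Result;

#[tokio::main]
async fn main() -> Result<()> {
    let base = "http://127.0.0.1:3340"; // placeholder relay HTTP address
    let resp = reqwest::Client::new()
        .get(format!("{base}/generate_204"))
        .header("X-Tailscale-Challenge", "123az__.")
        .send()
        .await?;
    // The server answers 204 and echoes the challenge back.
    assert_eq!(resp.status(), reqwest::StatusCode::NO_CONTENT);
    let echoed = resp
        .headers()
        .get("X-Tailscale-Response")
        .and_then(|v| v.to_str().ok());
    assert_eq!(echoed, Some("response 123az__."));
    Ok(())
}
```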
-fn serve_no_content_handler( - r: Request, - mut response: ResponseBuilder, -) -> HyperResult> { - if let Some(challenge) = r.headers().get(NO_CONTENT_CHALLENGE_HEADER) { - if !challenge.is_empty() - && challenge.len() < 64 - && challenge - .as_bytes() - .iter() - .all(|c| is_challenge_char(*c as char)) - { - response = response.header( - NO_CONTENT_RESPONSE_HEADER, - format!("response {}", challenge.to_str()?), - ); - } - } - - response - .status(StatusCode::NO_CONTENT) - .body(body_empty()) - .map_err(|err| Box::new(err) as HyperError) -} - -fn is_challenge_char(c: char) -> bool { - // Semi-randomly chosen as a limited set of valid characters - c.is_ascii_lowercase() - || c.is_ascii_uppercase() - || c.is_ascii_digit() - || c == '.' - || c == '-' - || c == '_' -} - -/// This is a future that never returns, drop it to cancel/abort. -async fn run_captive_portal_service(http_listener: TcpListener) -> Result<()> { - info!("serving"); - - // If this future is cancelled, this is dropped and all tasks are aborted. - let mut tasks = JoinSet::new(); - - loop { - match http_listener.accept().await { - Ok((stream, peer_addr)) => { - debug!(%peer_addr, "Connection opened",); - let handler = CaptivePortalService; - - tasks.spawn(async move { - let stream = relay::server::MaybeTlsStream::Plain(stream); - let stream = hyper_util::rt::TokioIo::new(stream); - if let Err(err) = hyper::server::conn::http1::Builder::new() - .serve_connection(stream, handler) - .with_upgrades() - .await - { - error!("Failed to serve connection: {err:?}"); - } - }); - } - Err(err) => { - error!( - "[CaptivePortalService] failed to accept connection: {:#?}", - err - ); - } - } - } -} - -#[derive(Clone)] -struct CaptivePortalService; - -impl hyper::service::Service> for CaptivePortalService { - type Response = Response; - type Error = HyperError; - type Future = Pin> + Send>>; - - fn call(&self, req: Request) -> Self::Future { - match (req.method(), req.uri().path()) { - // Captive Portal checker - (&Method::GET, "/generate_204") => { - Box::pin(async move { serve_no_content_handler(req, Response::builder()) }) - } - _ => { - // Return 404 not found response. 
- let r = Response::builder() - .status(StatusCode::NOT_FOUND) - .body(NOTFOUND.into()) - .map_err(|err| Box::new(err) as HyperError); - Box::pin(async move { r }) - } - } - } -} - -mod metrics { - use iroh_metrics::{ - core::{Counter, Metric}, - struct_iterable::Iterable, - }; - - /// StunMetrics tracked for the DERPER - #[allow(missing_docs)] - #[derive(Debug, Clone, Iterable)] - pub struct StunMetrics { - /* - * Metrics about STUN requests over ipv6 - */ - /// Number of stun requests made - pub requests: Counter, - /// Number of successful requests over ipv4 - pub ipv4_success: Counter, - /// Number of successful requests over ipv6 - pub ipv6_success: Counter, - - /// Number of bad requests, either non-stun packets or incorrect binding request - pub bad_requests: Counter, - /// Number of failures - pub failures: Counter, - } - - impl Default for StunMetrics { - fn default() -> Self { - Self { - /* - * Metrics about STUN requests - */ - requests: Counter::new("Number of STUN requests made to the server."), - ipv4_success: Counter::new("Number of successful ipv4 STUN requests served."), - ipv6_success: Counter::new("Number of successful ipv6 STUN requests served."), - bad_requests: Counter::new("Number of bad requests made to the STUN endpoint."), - failures: Counter::new("Number of STUN requests that end in failure."), - } - } - } - - impl Metric for StunMetrics { - fn name() -> &'static str { - "stun" - } - } -} - -#[cfg(test)] -mod tests { - use std::net::Ipv4Addr; - use std::time::Duration; - - use bytes::Bytes; - use http::header::UPGRADE; - use iroh_base::node_addr::RelayUrl; - - use crate::relay::http::{ClientBuilder, Protocol, HTTP_UPGRADE_PROTOCOL}; - - use self::relay::client::ReceivedMessage; - - use super::*; - - async fn spawn_local_relay() -> Result { - Server::spawn(ServerConfig::<(), ()> { - relay: Some(RelayConfig { - secret_key: SecretKey::generate(), - http_bind_addr: (Ipv4Addr::LOCALHOST, 0).into(), - tls: None, - limits: Default::default(), - }), - stun: None, - metrics_addr: None, - }) - .await - } - - #[tokio::test] - async fn test_no_services() { - let _guard = iroh_test::logging::setup(); - let mut server = Server::spawn(ServerConfig::<(), ()>::default()) - .await - .unwrap(); - let res = tokio::time::timeout(Duration::from_secs(5), server.task_handle()) - .await - .expect("timeout, server not finished") - .expect("server task JoinError"); - assert!(res.is_err()); - } - - #[tokio::test] - async fn test_conflicting_bind() { - let _guard = iroh_test::logging::setup(); - let mut server = Server::spawn(ServerConfig::<(), ()> { - relay: Some(RelayConfig { - secret_key: SecretKey::generate(), - http_bind_addr: (Ipv4Addr::LOCALHOST, 1234).into(), - tls: None, - limits: Default::default(), - }), - stun: None, - metrics_addr: Some((Ipv4Addr::LOCALHOST, 1234).into()), - }) - .await - .unwrap(); - let res = tokio::time::timeout(Duration::from_secs(5), server.task_handle()) - .await - .expect("timeout, server not finished") - .expect("server task JoinError"); - assert!(res.is_err()); // AddrInUse - } - - #[tokio::test] - async fn test_root_handler() { - let _guard = iroh_test::logging::setup(); - let server = spawn_local_relay().await.unwrap(); - let url = format!("http://{}", server.http_addr().unwrap()); - - let response = reqwest::get(&url).await.unwrap(); - assert_eq!(response.status(), 200); - let body = response.text().await.unwrap(); - assert!(body.contains("iroh.computer")); - } - - #[tokio::test] - async fn test_captive_portal_service() { - let _guard = 
iroh_test::logging::setup(); - let server = spawn_local_relay().await.unwrap(); - let url = format!("http://{}/generate_204", server.http_addr().unwrap()); - let challenge = "123az__."; - - let client = reqwest::Client::new(); - let response = client - .get(&url) - .header(NO_CONTENT_CHALLENGE_HEADER, challenge) - .send() - .await - .unwrap(); - assert_eq!(response.status(), StatusCode::NO_CONTENT); - let header = response.headers().get(NO_CONTENT_RESPONSE_HEADER).unwrap(); - assert_eq!(header.to_str().unwrap(), format!("response {challenge}")); - let body = response.text().await.unwrap(); - assert!(body.is_empty()); - } - - #[tokio::test] - async fn test_relay_client_legacy_route() { - let _guard = iroh_test::logging::setup(); - let server = spawn_local_relay().await.unwrap(); - // We're testing the legacy endpoint at `/derp` - let endpoint_url = format!("http://{}/derp", server.http_addr().unwrap()); - - let client = reqwest::Client::new(); - let result = client - .get(endpoint_url) - .header(UPGRADE, HTTP_UPGRADE_PROTOCOL) - .send() - .await - .unwrap(); - - assert_eq!(result.status(), StatusCode::SWITCHING_PROTOCOLS); - } - - #[tokio::test] - async fn test_relay_clients_both_derp() { - let _guard = iroh_test::logging::setup(); - let server = spawn_local_relay().await.unwrap(); - let relay_url = format!("http://{}", server.http_addr().unwrap()); - let relay_url: RelayUrl = relay_url.parse().unwrap(); - - // set up client a - let a_secret_key = SecretKey::generate(); - let a_key = a_secret_key.public(); - let resolver = crate::dns::default_resolver().clone(); - let (client_a, mut client_a_receiver) = - ClientBuilder::new(relay_url.clone()).build(a_secret_key, resolver); - let connect_client = client_a.clone(); - - // give the relay server some time to accept connections - if let Err(err) = tokio::time::timeout(Duration::from_secs(10), async move { - loop { - match connect_client.connect().await { - Ok(_) => break, - Err(err) => { - warn!("client unable to connect to relay server: {err:#}"); - tokio::time::sleep(Duration::from_millis(100)).await; - } - } - } - }) - .await - { - panic!("error connecting to relay server: {err:#}"); - } - - // set up client b - let b_secret_key = SecretKey::generate(); - let b_key = b_secret_key.public(); - let resolver = crate::dns::default_resolver().clone(); - let (client_b, mut client_b_receiver) = - ClientBuilder::new(relay_url.clone()).build(b_secret_key, resolver); - client_b.connect().await.unwrap(); - - // send message from a to b - let msg = Bytes::from("hello, b"); - client_a.send(b_key, msg.clone()).await.unwrap(); - - let (res, _) = client_b_receiver.recv().await.unwrap().unwrap(); - if let ReceivedMessage::ReceivedPacket { source, data } = res { - assert_eq!(a_key, source); - assert_eq!(msg, data); - } else { - panic!("client_b received unexpected message {res:?}"); - } - - // send message from b to a - let msg = Bytes::from("howdy, a"); - client_b.send(a_key, msg.clone()).await.unwrap(); - - let (res, _) = client_a_receiver.recv().await.unwrap().unwrap(); - if let ReceivedMessage::ReceivedPacket { source, data } = res { - assert_eq!(b_key, source); - assert_eq!(msg, data); - } else { - panic!("client_a received unexpected message {res:?}"); - } - } - - #[tokio::test] - async fn test_relay_clients_both_websockets() { - let _guard = iroh_test::logging::setup(); - let server = spawn_local_relay().await.unwrap(); - - let relay_url = format!("http://{}", server.http_addr().unwrap()); - let relay_url: RelayUrl = relay_url.parse().unwrap(); - - // 
set up client a - let a_secret_key = SecretKey::generate(); - let a_key = a_secret_key.public(); - let resolver = crate::dns::default_resolver().clone(); - let (client_a, mut client_a_receiver) = ClientBuilder::new(relay_url.clone()) - .protocol(Protocol::Websocket) - .build(a_secret_key, resolver); - let connect_client = client_a.clone(); - - // give the relay server some time to accept connections - if let Err(err) = tokio::time::timeout(Duration::from_secs(10), async move { - loop { - match connect_client.connect().await { - Ok(_) => break, - Err(err) => { - warn!("client unable to connect to relay server: {err:#}"); - tokio::time::sleep(Duration::from_millis(100)).await; - } - } - } - }) - .await - { - panic!("error connecting to relay server: {err:#}"); - } - - // set up client b - let b_secret_key = SecretKey::generate(); - let b_key = b_secret_key.public(); - let resolver = crate::dns::default_resolver().clone(); - let (client_b, mut client_b_receiver) = ClientBuilder::new(relay_url.clone()) - .protocol(Protocol::Websocket) // another websocket client - .build(b_secret_key, resolver); - client_b.connect().await.unwrap(); - - // send message from a to b - let msg = Bytes::from("hello, b"); - client_a.send(b_key, msg.clone()).await.unwrap(); - - let (res, _) = client_b_receiver.recv().await.unwrap().unwrap(); - if let ReceivedMessage::ReceivedPacket { source, data } = res { - assert_eq!(a_key, source); - assert_eq!(msg, data); - } else { - panic!("client_b received unexpected message {res:?}"); - } - - // send message from b to a - let msg = Bytes::from("howdy, a"); - client_b.send(a_key, msg.clone()).await.unwrap(); - - let (res, _) = client_a_receiver.recv().await.unwrap().unwrap(); - if let ReceivedMessage::ReceivedPacket { source, data } = res { - assert_eq!(b_key, source); - assert_eq!(msg, data); - } else { - panic!("client_a received unexpected message {res:?}"); - } - } - - #[tokio::test] - async fn test_relay_clients_websocket_and_derp() { - let _guard = iroh_test::logging::setup(); - let server = spawn_local_relay().await.unwrap(); - - let relay_url = format!("http://{}", server.http_addr().unwrap()); - let relay_url: RelayUrl = relay_url.parse().unwrap(); - - // set up client a - let a_secret_key = SecretKey::generate(); - let a_key = a_secret_key.public(); - let resolver = crate::dns::default_resolver().clone(); - let (client_a, mut client_a_receiver) = - ClientBuilder::new(relay_url.clone()).build(a_secret_key, resolver); - let connect_client = client_a.clone(); - - // give the relay server some time to accept connections - if let Err(err) = tokio::time::timeout(Duration::from_secs(10), async move { - loop { - match connect_client.connect().await { - Ok(_) => break, - Err(err) => { - warn!("client unable to connect to relay server: {err:#}"); - tokio::time::sleep(Duration::from_millis(100)).await; - } - } - } - }) - .await - { - panic!("error connecting to relay server: {err:#}"); - } - - // set up client b - let b_secret_key = SecretKey::generate(); - let b_key = b_secret_key.public(); - let resolver = crate::dns::default_resolver().clone(); - let (client_b, mut client_b_receiver) = ClientBuilder::new(relay_url.clone()) - .protocol(Protocol::Websocket) // Use websockets - .build(b_secret_key, resolver); - client_b.connect().await.unwrap(); - - // send message from a to b - let msg = Bytes::from("hello, b"); - client_a.send(b_key, msg.clone()).await.unwrap(); - - let (res, _) = client_b_receiver.recv().await.unwrap().unwrap(); - if let ReceivedMessage::ReceivedPacket { 
source, data } = res { - assert_eq!(a_key, source); - assert_eq!(msg, data); - } else { - panic!("client_b received unexpected message {res:?}"); - } - - // send message from b to a - let msg = Bytes::from("howdy, a"); - client_b.send(a_key, msg.clone()).await.unwrap(); - - let (res, _) = client_a_receiver.recv().await.unwrap().unwrap(); - if let ReceivedMessage::ReceivedPacket { source, data } = res { - assert_eq!(b_key, source); - assert_eq!(msg, data); - } else { - panic!("client_a received unexpected message {res:?}"); - } - } - - #[tokio::test] - async fn test_stun() { - let _guard = iroh_test::logging::setup(); - let server = Server::spawn(ServerConfig::<(), ()> { - relay: None, - stun: Some(StunConfig { - bind_addr: (Ipv4Addr::LOCALHOST, 0).into(), - }), - metrics_addr: None, - }) - .await - .unwrap(); - - let txid = stun::TransactionId::default(); - let req = stun::request(txid); - let socket = UdpSocket::bind("127.0.0.1:0").await.unwrap(); - socket - .send_to(&req, server.stun_addr().unwrap()) - .await - .unwrap(); - - // get response - let mut buf = vec![0u8; 64000]; - let (len, addr) = socket.recv_from(&mut buf).await.unwrap(); - assert_eq!(addr, server.stun_addr().unwrap()); - buf.truncate(len); - let (txid_back, response_addr) = stun::parse_response(&buf).unwrap(); - assert_eq!(txid, txid_back); - assert_eq!(response_addr, socket.local_addr().unwrap()); - } -} diff --git a/iroh-net/src/relay/server.rs b/iroh-net/src/relay/server.rs index 865a82a2c0..adc4674416 100644 --- a/iroh-net/src/relay/server.rs +++ b/iroh-net/src/relay/server.rs @@ -1,337 +1,462 @@ -//! based on tailscale/derp/derp_server.go -use std::collections::HashMap; +//! A fully-fledged iroh-relay server over HTTP or HTTPS. +//! +//! This module provides an API to run a full fledged iroh-relay server. It is primarily +//! used by the `iroh-relay` binary in this crate. It can be used to run a relay server in +//! other locations however. +//! +//! This code is fully written in a form of structured-concurrency: every spawned task is +//! always attached to a handle and when the handle is dropped the tasks abort. So tasks +//! can not outlive their handle. It is also always possible to await for completion of a +//! task. Some tasks additionally have a method to do graceful shutdown. 
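For orientation, here is a hedged sketch of standing up the server this module exposes, modeled on the `spawn_local_relay` test helper further down in this file. The import paths are assumed rather than confirmed by the patch, and `metrics_addr` is only present with the `metrics` feature enabled:

```rust
use std::net::Ipv4Addr;

use anyhow::Result;
use iroh_net::key::SecretKey; // paths assumed for illustration
use iroh_net::relay::server::{RelayConfig, Server, ServerConfig, StunConfig};

#[tokio::main]
async fn main() -> Result<()> {
    // Relay over plain HTTP on an ephemeral port, plus a STUN listener.
    // The dummy `<(), ()>` parameters stand in for the Let's Encrypt types.
    let server = Server::spawn(ServerConfig::<(), ()> {
        relay: Some(RelayConfig {
            secret_key: SecretKey::generate(),
            http_bind_addr: (Ipv4Addr::LOCALHOST, 0).into(),
            tls: None,
            limits: Default::default(),
        }),
        stun: Some(StunConfig {
            bind_addr: (Ipv4Addr::LOCALHOST, 0).into(),
        }),
        metrics_addr: None, // requires the `metrics` feature
    })
    .await?;

    println!("relay on {:?}, stun on {:?}", server.http_addr(), server.stun_addr());
    // Graceful shutdown; dropping the server would abort it instead.
    server.shutdown().await
}
```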
+ +use std::fmt; +use std::future::Future; +use std::net::SocketAddr; use std::pin::Pin; -use std::sync::atomic::{AtomicUsize, Ordering}; use std::sync::Arc; -use std::task::{Context, Poll}; -use std::time::Duration; - -use anyhow::{bail, Context as _, Result}; -use futures_lite::Stream; -use futures_sink::Sink; -use hyper::HeaderMap; -use iroh_metrics::core::UsageStatsReport; -use iroh_metrics::{inc, inc_by, report_usage_stats}; -use time::{Date, OffsetDateTime}; -use tokio::io::{AsyncRead, AsyncWrite}; -use tokio::sync::mpsc; -use tokio::task::JoinHandle; -use tokio_tungstenite::WebSocketStream; -use tokio_util::codec::Framed; -use tokio_util::sync::CancellationToken; -use tracing::{info_span, trace, Instrument}; -use tungstenite::protocol::Role; - -use crate::key::{PublicKey, SecretKey}; - -use super::codec::Frame; -use super::http::Protocol; -use super::{ - client_conn::ClientConnBuilder, - clients::Clients, - codec::{ - recv_client_key, DerpCodec, PER_CLIENT_SEND_QUEUE_DEPTH, PROTOCOL_VERSION, - SERVER_CHANNEL_SIZE, - }, - metrics::Metrics, - types::ServerMessage, -}; -// TODO: skipping `verboseDropKeys` for now +use anyhow::{anyhow, bail, Context, Result}; +use futures_lite::StreamExt; +use http::response::Builder as ResponseBuilder; +use http::{HeaderMap, Method, Request, Response, StatusCode}; +use hyper::body::Incoming; +use iroh_metrics::inc; +use tokio::net::{TcpListener, UdpSocket}; +use tokio::task::JoinSet; +use tracing::{debug, error, info, info_span, instrument, trace, warn, Instrument}; + +use crate::key::SecretKey; +use crate::relay::http::{LEGACY_RELAY_PROBE_PATH, RELAY_PROBE_PATH}; +use crate::stun; +use crate::util::AbortingJoinHandle; + +// Module defined in this file. +use stun_metrics::StunMetrics; + +pub(crate) mod actor; +pub(crate) mod client_conn; +mod clients; +mod http_server; +mod metrics; +pub(crate) mod streams; +pub(crate) mod types; + +pub use self::actor::{ClientConnHandler, ServerActorTask}; +pub use self::metrics::Metrics; +pub use self::streams::MaybeTlsStream as MaybeTlsStreamServer; + +const NO_CONTENT_CHALLENGE_HEADER: &str = "X-Tailscale-Challenge"; +const NO_CONTENT_RESPONSE_HEADER: &str = "X-Tailscale-Response"; +const NOTFOUND: &[u8] = b"Not Found"; +const RELAY_DISABLED: &[u8] = b"relay server disabled"; +const ROBOTS_TXT: &[u8] = b"User-agent: *\nDisallow: /\n"; +const INDEX: &[u8] = br#" +
<html><body>
+<h1>Iroh Relay</h1>
+<p>
+  This is an <a href="https://iroh.computer/">Iroh Relay</a> server.
+</p>
+"#; +const TLS_HEADERS: [(&str, &str); 2] = [ + ("Strict-Transport-Security", "max-age=63072000; includeSubDomains"), + ("Content-Security-Policy", "default-src 'none'; frame-ancestors 'none'; form-action 'none'; base-uri 'self'; block-all-mixed-content; plugin-types 'none'") +]; + +type BytesBody = http_body_util::Full; +type HyperError = Box; +type HyperResult = std::result::Result; + +/// Creates a new [`BytesBody`] with no content. +fn body_empty() -> BytesBody { + http_body_util::Full::new(hyper::body::Bytes::new()) +} + +/// Configuration for the full Relay & STUN server. +/// +/// Be aware the generic parameters are for when using the Let's Encrypt TLS configuration. +/// If not used dummy ones need to be provided, e.g. `ServerConfig::<(), ()>::default()`. +#[derive(Debug, Default)] +pub struct ServerConfig { + /// Configuration for the Relay server, disabled if `None`. + pub relay: Option>, + /// Configuration for the STUN server, disabled if `None`. + pub stun: Option, + /// Socket to serve metrics on. + #[cfg(feature = "metrics")] + pub metrics_addr: Option, +} + +/// Configuration for the Relay HTTP and HTTPS server. +/// +/// This includes the HTTP services hosted by the Relay server, the Relay `/relay` HTTP +/// endpoint is only one of the services served. +#[derive(Debug)] +pub struct RelayConfig { + /// The iroh secret key of the Relay server. + pub secret_key: SecretKey, + /// The socket address on which the Relay HTTP server should bind. + /// + /// Normally you'd choose port `80`. The bind address for the HTTPS server is + /// configured in [`RelayConfig::tls`]. + /// + /// If [`RelayConfig::tls`] is `None` then this serves all the HTTP services without + /// TLS. + pub http_bind_addr: SocketAddr, + /// TLS configuration for the HTTPS server. + /// + /// If *None* all the HTTP services that would be served here are served from + /// [`RelayConfig::http_bind_addr`]. + pub tls: Option>, + /// Rate limits. + pub limits: Limits, +} + +/// Configuration for the STUN server. +#[derive(Debug)] +pub struct StunConfig { + /// The socket address on which the STUN server should bind. + /// + /// Normally you'd chose port `3478`, see [`crate::defaults::DEFAULT_STUN_PORT`]. + pub bind_addr: SocketAddr, +} -static CONN_NUM: AtomicUsize = AtomicUsize::new(1); -fn new_conn_num() -> usize { - CONN_NUM.fetch_add(1, Ordering::Relaxed) +/// TLS configuration for Relay server. +/// +/// Normally the Relay server accepts connections on both HTTPS and HTTP. +#[derive(Debug)] +pub struct TlsConfig { + /// The socket address on which to serve the HTTPS server. + /// + /// Since the captive portal probe has to run over plain text HTTP and TLS is used for + /// the main relay server this has to be on a different port. When TLS is not enabled + /// this is served on the [`RelayConfig::http_bind_addr`] socket address. + /// + /// Normally you'd choose port `80`. + pub https_bind_addr: SocketAddr, + /// Mode for getting a cert. + pub cert: CertConfig, } -pub(crate) const WRITE_TIMEOUT: Duration = Duration::from_secs(2); +/// Rate limits. +#[derive(Debug, Default)] +pub struct Limits { + /// Rate limit for accepting new connection. Unlimited if not set. + pub accept_conn_limit: Option, + /// Burst limit for accepting new connection. Unlimited if not set. + pub accept_conn_burst: Option, +} + +/// TLS certificate configuration. +#[derive(derive_more::Debug)] +pub enum CertConfig { + /// Use Let's Encrypt. + LetsEncrypt { + /// Configuration for Let's Encrypt certificates. 
+ #[debug("AcmeConfig")] + config: tokio_rustls_acme::AcmeConfig, + }, + /// Use a static TLS key and certificate chain. + Manual { + /// The TLS private key. + private_key: rustls::PrivateKey, + /// The TLS certificate chain. + certs: Vec, + }, +} -/// A relay server. +/// A running Relay + STUN server. +/// +/// This is a full Relay server, including STUN, Relay and various associated HTTP services. /// -/// Responsible for managing connections to relay [`super::client::Client`]s, sending packets from one client to another. +/// Dropping this will stop the server. #[derive(Debug)] pub struct Server { - /// Optionally specifies how long to wait before failing when writing - /// to a client - write_timeout: Option, - /// secret_key of the client - secret_key: SecretKey, - /// The DER encoded x509 cert to send after `LetsEncrypt` cert+intermediate. - meta_cert: Vec, - /// Channel on which to communicate to the [`ServerActor`] - server_channel: mpsc::Sender, - /// When true, the server has been shutdown. - closed: bool, - /// Server loop handler - loop_handler: JoinHandle>, - /// Done token, forces a hard shutdown. To gracefully shutdown, use [`Server::close`] - cancel: CancellationToken, - // TODO: stats collection + /// The address of the HTTP server, if configured. + http_addr: Option, + /// The address of the STUN server, if configured. + stun_addr: Option, + /// The address of the HTTPS server, if the relay server is using TLS. + /// + /// If the Relay server is not using TLS then it is served from the + /// [`Server::http_addr`]. + https_addr: Option, + /// Handle to the relay server. + relay_handle: Option, + /// The main task running the server. + supervisor: AbortingJoinHandle>, } impl Server { - /// TODO: replace with builder - pub fn new(key: SecretKey) -> Self { - let (server_channel_s, server_channel_r) = mpsc::channel(SERVER_CHANNEL_SIZE); - let server_actor = ServerActor::new(key.public(), server_channel_r); - let cancel_token = CancellationToken::new(); - let done = cancel_token.clone(); - let server_task = tokio::spawn( - async move { server_actor.run(done).await } - .instrument(info_span!("relay.server", me = %key.public().fmt_short())), - ); - let meta_cert = init_meta_cert(&key.public()); - Self { - write_timeout: Some(WRITE_TIMEOUT), - secret_key: key, - meta_cert, - server_channel: server_channel_s, - closed: false, - loop_handler: server_task, - cancel: cancel_token, + /// Starts the server. + pub async fn spawn(config: ServerConfig) -> Result + where + EC: fmt::Debug + 'static, + EA: fmt::Debug + 'static, + { + let mut tasks = JoinSet::new(); + + #[cfg(feature = "metrics")] + if let Some(addr) = config.metrics_addr { + debug!("Starting metrics server"); + use iroh_metrics::core::Metric; + + iroh_metrics::core::Core::init(|reg, metrics| { + metrics.insert(crate::metrics::RelayMetrics::new(reg)); + metrics.insert(StunMetrics::new(reg)); + }); + tasks.spawn( + iroh_metrics::metrics::start_metrics_server(addr) + .instrument(info_span!("metrics-server")), + ); } - } - /// Returns the server's secret key. - pub fn secret_key(&self) -> &SecretKey { - &self.secret_key - } + // Start the STUN server. 
+ let stun_addr = match config.stun { + Some(stun) => { + debug!("Starting STUN server"); + match UdpSocket::bind(stun.bind_addr).await { + Ok(sock) => { + let addr = sock.local_addr()?; + info!("STUN server bound on {addr}"); + tasks.spawn( + server_stun_listener(sock).instrument(info_span!("stun-server", %addr)), + ); + Some(addr) + } + Err(err) => bail!("failed to bind STUN listener: {err:#?}"), + } + } + None => None, + }; - /// Returns the server's public key. - pub fn public_key(&self) -> PublicKey { - self.secret_key.public() + // Start the Relay server. + let (relay_server, http_addr) = match config.relay { + Some(relay_config) => { + debug!("Starting Relay server"); + let mut headers = HeaderMap::new(); + for (name, value) in TLS_HEADERS.iter() { + headers.insert(*name, value.parse()?); + } + let relay_bind_addr = match relay_config.tls { + Some(ref tls) => tls.https_bind_addr, + None => relay_config.http_bind_addr, + }; + let mut builder = http_server::ServerBuilder::new(relay_bind_addr) + .secret_key(Some(relay_config.secret_key)) + .headers(headers) + .relay_override(Box::new(relay_disabled_handler)) + .request_handler(Method::GET, "/", Box::new(root_handler)) + .request_handler(Method::GET, "/index.html", Box::new(root_handler)) + .request_handler( + Method::GET, + LEGACY_RELAY_PROBE_PATH, + Box::new(probe_handler), + ) // backwards compat + .request_handler(Method::GET, RELAY_PROBE_PATH, Box::new(probe_handler)) + .request_handler(Method::GET, "/robots.txt", Box::new(robots_handler)); + let http_addr = match relay_config.tls { + Some(tls_config) => { + let server_config = rustls::ServerConfig::builder() + .with_safe_defaults() + .with_no_client_auth(); + let server_tls_config = match tls_config.cert { + CertConfig::LetsEncrypt { config } => { + let mut state = config.state(); + let server_config = + server_config.with_cert_resolver(state.resolver()); + let acceptor = + http_server::TlsAcceptor::LetsEncrypt(state.acceptor()); + tasks.spawn( + async move { + while let Some(event) = state.next().await { + match event { + Ok(ok) => debug!("acme event: {ok:?}"), + Err(err) => error!("error: {err:?}"), + } + } + Err(anyhow!("acme event stream finished")) + } + .instrument(info_span!("acme")), + ); + Some(http_server::TlsConfig { + config: Arc::new(server_config), + acceptor, + }) + } + CertConfig::Manual { private_key, certs } => { + let server_config = server_config + .with_single_cert(certs.clone(), private_key.clone())?; + let server_config = Arc::new(server_config); + let acceptor = + tokio_rustls::TlsAcceptor::from(server_config.clone()); + let acceptor = http_server::TlsAcceptor::Manual(acceptor); + Some(http_server::TlsConfig { + config: server_config, + acceptor, + }) + } + }; + builder = builder.tls_config(server_tls_config); + + // Some services always need to be served over HTTP without TLS. Run + // these standalone. + let http_listener = TcpListener::bind(&relay_config.http_bind_addr) + .await + .context("failed to bind http")?; + let http_addr = http_listener.local_addr()?; + tasks.spawn( + run_captive_portal_service(http_listener) + .instrument(info_span!("http-service", addr = %http_addr)), + ); + Some(http_addr) + } + None => { + // If running Relay without TLS add the plain HTTP server directly + // to the Relay server. 
+ builder = builder.request_handler( + Method::GET, + "/generate_204", + Box::new(serve_no_content_handler), + ); + None + } + }; + let relay_server = builder.spawn().await?; + (Some(relay_server), http_addr) + } + None => (None, None), + }; + // If http_addr is Some then relay_server is serving HTTPS. If http_addr is None + // relay_server is serving HTTP, including the /generate_204 service. + let relay_addr = relay_server.as_ref().map(|srv| srv.addr()); + let relay_handle = relay_server.as_ref().map(|srv| srv.handle()); + let task = tokio::spawn(relay_supervisor(tasks, relay_server)); + Ok(Self { + http_addr: http_addr.or(relay_addr), + stun_addr, + https_addr: http_addr.and(relay_addr), + relay_handle, + supervisor: AbortingJoinHandle::from(task), + }) } - /// Closes the server and waits for the connections to disconnect. - pub async fn close(mut self) { - if !self.closed { - if let Err(err) = self.server_channel.send(ServerMessage::Shutdown).await { - tracing::warn!( - "could not shutdown the server gracefully, doing a forced shutdown: {:?}", - err - ); - self.cancel.cancel(); - } - match self.loop_handler.await { - Ok(Ok(())) => {} - Ok(Err(e)) => tracing::warn!("error shutting down server: {e:?}"), - Err(e) => tracing::warn!("error waiting for the server process to close: {e:?}"), - } - self.closed = true; + /// Requests graceful shutdown. + /// + /// Returns once all server tasks have stopped. + pub async fn shutdown(self) -> Result<()> { + // Only the Relay server needs shutting down, the supervisor will abort the tasks in + // the JoinSet when the server terminates. + if let Some(handle) = self.relay_handle { + handle.shutdown(); } + self.supervisor.await? } - /// Aborts the server. + /// Returns the handle for the task. /// - /// You should prefer to use [`Server::close`] for a graceful shutdown. - pub fn abort(&self) { - self.cancel.cancel(); + /// This allows waiting for the server's supervisor task to finish. Can be useful in + /// case there is an error in the server before it is shut down. + pub fn task_handle(&mut self) -> &mut AbortingJoinHandle> { + &mut self.supervisor } - /// Whether or not the relay [Server] is closed. - pub fn is_closed(&self) -> bool { - self.closed + /// The socket address the HTTPS server is listening on. + pub fn https_addr(&self) -> Option { + self.https_addr } - /// Create a [`ClientConnHandler`], which can verify connections and add them to the - /// [`Server`]. - pub fn client_conn_handler(&self, default_headers: HeaderMap) -> ClientConnHandler { - ClientConnHandler { - server_channel: self.server_channel.clone(), - secret_key: self.secret_key.clone(), - write_timeout: self.write_timeout, - default_headers: Arc::new(default_headers), - } + /// The socket address the HTTP server is listening on. + pub fn http_addr(&self) -> Option { + self.http_addr } - /// Returns the server metadata cert that can be sent by the TLS server to - /// let the client skip a round trip during start-up. - pub fn meta_cert(&self) -> &[u8] { - &self.meta_cert + /// The socket address the STUN server is listening on. + pub fn stun_addr(&self) -> Option { + self.stun_addr } } -/// Handle incoming connections to the Server. -/// -/// Created by the [`Server`] by calling [`Server::client_conn_handler`]. +/// Supervisor for the relay server tasks. /// -/// Can be cheaply cloned. 
-#[derive(Debug)] -pub struct ClientConnHandler { - server_channel: mpsc::Sender, - secret_key: SecretKey, - write_timeout: Option, - pub(super) default_headers: Arc, -} - -impl Clone for ClientConnHandler { - fn clone(&self) -> Self { - Self { - server_channel: self.server_channel.clone(), - secret_key: self.secret_key.clone(), - write_timeout: self.write_timeout, - default_headers: Arc::clone(&self.default_headers), - } - } -} - -impl ClientConnHandler { - /// Adds a new connection to the server and serves it. - /// - /// Will error if it takes too long (10 sec) to write or read to the connection, if there is - /// some read or write error to the connection, if the server is meant to verify clients, - /// and is unable to verify this one, or if there is some issue communicating with the server. - /// - /// The provided [`AsyncRead`] and [`AsyncWrite`] must be already connected to the connection. - pub async fn accept(&self, protocol: Protocol, io: MaybeTlsStream) -> Result<()> { - trace!(?protocol, "accept: start"); - let mut io = match protocol { - Protocol::Relay => { - inc!(Metrics, derp_accepts); - RelayIo::Derp(Framed::new(io, DerpCodec)) +/// As soon as one of the tasks exits, all other tasks are stopped and the server stops. +/// The supervisor finishes once all tasks are finished. +#[instrument(skip_all)] +async fn relay_supervisor( + mut tasks: JoinSet>, + mut relay_http_server: Option, +) -> Result<()> { + let res = match (relay_http_server.as_mut(), tasks.len()) { + (None, _) => tasks + .join_next() + .await + .unwrap_or_else(|| Ok(Err(anyhow!("Nothing to supervise")))), + (Some(relay), 0) => relay.task_handle().await.map(anyhow::Ok), + (Some(relay), _) => { + tokio::select! { + biased; + Some(ret) = tasks.join_next() => ret, + ret = relay.task_handle() => ret.map(anyhow::Ok), + else => Ok(Err(anyhow!("Empty JoinSet (unreachable)"))), } - Protocol::Websocket => { - inc!(Metrics, websocket_accepts); - RelayIo::Ws(WebSocketStream::from_raw_socket(io, Role::Server, None).await) + } + }; + let ret = match res { + Ok(Ok(())) => { + debug!("Task exited"); + Ok(()) + } + Ok(Err(err)) => { + error!(%err, "Task failed"); + Err(err.context("task failed")) + } + Err(err) => { + if let Ok(panic) = err.try_into_panic() { + error!("Task panicked"); + std::panic::resume_unwind(panic); } - }; - trace!("accept: recv client key"); - let (client_key, info) = recv_client_key(&mut io) - .await - .context("unable to receive client information")?; - - if info.version != PROTOCOL_VERSION { - bail!( - "unexpected client version {}, expected {}", - info.version, - PROTOCOL_VERSION - ); + debug!("Task cancelled"); + Err(anyhow!("task cancelled")) } + }; - trace!("accept: build client conn"); - let client_conn_builder = ClientConnBuilder { - key: client_key, - conn_num: new_conn_num(), - io, - write_timeout: self.write_timeout, - channel_capacity: PER_CLIENT_SEND_QUEUE_DEPTH, - server_channel: self.server_channel.clone(), - }; - trace!("accept: create client"); - self.server_channel - .send(ServerMessage::CreateClient(client_conn_builder)) - .await - .map_err(|_| { - anyhow::anyhow!("server channel closed, the server is probably shutdown") - })?; - Ok(()) + // Ensure the HTTP server terminated, there is no harm in calling this after it is + // already shut down. The JoinSet is aborted on drop. 
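The contract of `relay_supervisor` (the first task to finish brings the whole server down, and the `JoinSet` aborts everything left on drop) can be seen in isolation. A minimal sketch of the same pattern, not taken from the patch:

```rust
use anyhow::{anyhow, Result};
use tokio::task::JoinSet;

#[tokio::main]
async fn main() -> Result<()> {
    let mut tasks: JoinSet<Result<()>> = JoinSet::new();
    tasks.spawn(async { Err(anyhow!("task one failed")) });
    tasks.spawn(async { std::future::pending().await }); // never finishes

    // The first completed task decides the outcome, as in `relay_supervisor`;
    // here that is the failing task, so `main` exits with its error.
    let first = tasks.join_next().await.expect("set not empty")?;
    tasks.shutdown().await; // abort and await the remaining tasks
    first
}
```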
+ if let Some(server) = relay_http_server { + server.shutdown(); } -} - -pub(crate) struct ServerActor { - key: PublicKey, - receiver: mpsc::Receiver, - /// All clients connected to this server - clients: Clients, - client_counter: ClientCounter, -} -impl ServerActor { - pub(crate) fn new(key: PublicKey, receiver: mpsc::Receiver) -> Self { - Self { - key, - receiver, - clients: Clients::new(), - client_counter: ClientCounter::default(), - } - } + tasks.shutdown().await; - pub(crate) async fn run(mut self, done: CancellationToken) -> Result<()> { - loop { - tokio::select! { - biased; - _ = done.cancelled() => { - tracing::warn!("server actor loop cancelled, closing loop"); - // TODO: stats: drain channel & count dropped packets etc - // close all client connections and client read/write loops - self.clients.shutdown().await; - return Ok(()); - } - msg = self.receiver.recv() => { - let msg = match msg { - Some(m) => m, - None => { - tracing::warn!("server channel sender closed unexpectedly, shutting down server loop"); - self.clients.shutdown().await; - anyhow::bail!("server channel sender closed unexpectedly, closed client connections, and shutting down server loop"); - } - }; - match msg { - ServerMessage::SendPacket((key, packet)) => { - tracing::trace!("send packet from: {:?} to: {:?} ({}b)", packet.src, key, packet.bytes.len()); - let src = packet.src; - if self.clients.contains_key(&key) { - // if this client is in our local network, just try to send the - // packet - if self.clients.send_packet(&key, packet).is_ok() { - self.clients.record_send(&src, key); - } - } else { - tracing::warn!("send packet: no way to reach client {key:?}, dropped packet"); - inc!(Metrics, send_packets_dropped); - } - } - ServerMessage::SendDiscoPacket((key, packet)) => { - tracing::trace!("send disco packet from: {:?} to: {:?} ({}b)", packet.src, key, packet.bytes.len()); - let src = packet.src; - if self.clients.contains_key(&key) { - // if this client is in our local network, just try to send the - // packet - if self.clients.send_disco_packet(&key, packet).is_ok() { - self.clients.record_send(&src, key); - } - } else { - tracing::warn!("send disco packet: no way to reach client {key:?}, dropped packet"); - inc!(Metrics, disco_packets_dropped); - } - } - ServerMessage::CreateClient(client_builder) => { - inc!(Metrics, accepts); - - tracing::trace!("create client: {:?}", client_builder.key); - let key = client_builder.key; - - report_usage_stats(&UsageStatsReport::new( - "relay_accepts".to_string(), - self.key.to_string(), - 1, - None, // TODO(arqu): attribute to user id; possibly with the re-introduction of request tokens or other auth - Some(key.to_string()), - )).await; - let nc = self.client_counter.update(key); - inc_by!(Metrics, unique_client_keys, nc); - - // build and register client, starting up read & write loops for the - // client connection - self.clients.register(client_builder); + ret +} +/// Runs a STUN server. +/// +/// When the future is dropped, the server stops. +async fn server_stun_listener(sock: UdpSocket) -> Result<()> { + info!(addr = ?sock.local_addr().ok(), "running STUN server"); + let sock = Arc::new(sock); + let mut buffer = vec![0u8; 64 << 10]; + let mut tasks = JoinSet::new(); + loop { + tokio::select! 
{ + biased; + _ = tasks.join_next(), if !tasks.is_empty() => (), + res = sock.recv_from(&mut buffer) => { + match res { + Ok((n, src_addr)) => { + inc!(StunMetrics, requests); + let pkt = &buffer[..n]; + if !stun::is(pkt) { + debug!(%src_addr, "STUN: ignoring non stun packet"); + inc!(StunMetrics, bad_requests); + continue; } - ServerMessage::RemoveClient((key, conn_num)) => { - inc!(Metrics, disconnects); - tracing::trace!("remove client: {:?}", key); - // ensure we still have the client in question - if self.clients.has_client(&key, conn_num) { - // remove the client from the map of clients, & notify any peers that it - // has sent messages that it has left the network - self.clients.unregister(&key); - } - } - ServerMessage::Shutdown => { - tracing::info!("server gracefully shutting down..."); - // close all client connections and client read/write loops - self.clients.shutdown().await; - return Ok(()); - } + let pkt = pkt.to_vec(); + tasks.spawn(handle_stun_request(src_addr, pkt, sock.clone())); + } + Err(err) => { + inc!(StunMetrics, failures); + warn!("failed to recv: {err:#}"); } } } @@ -339,565 +464,584 @@ impl ServerActor { } } -/// Initializes the [`Server`] with a self-signed x509 cert -/// encoding this server's public key and protocol version. "cmd/relay_server -/// then sends this after the Let's Encrypt leaf + intermediate certs after -/// the ServerHello (encrypted in TLS 1.3, not that is matters much). -/// -/// Then the client can save a round trime getting that and can start speaking -/// relay right away. (we don't use ALPN because that's sent in the clear and -/// we're being paranoid to not look too weird to any middleboxes, given that -/// relay is an ultimate fallback path). But since the post-ServerHello certs -/// are encrypted we can have the client also use them as a signal to be able -/// to start speaking relay right away, starting with its identity proof, -/// encrypted to the server's public key. -/// -/// This RTT optimization fails where there's a corp-mandated TLS proxy with -/// corp-mandated root certs on employee machines and TLS proxy cleans up -/// unnecessary certs. In that case we just fall back to the extra RTT. -fn init_meta_cert(server_key: &PublicKey) -> Vec { - let mut params = - rcgen::CertificateParams::new([format!("derpkey{}", hex::encode(server_key.as_bytes()))]); - params.serial_number = Some((PROTOCOL_VERSION as u64).into()); - // Windows requires not_after and not_before set: - params.not_after = time::OffsetDateTime::now_utc().saturating_add(30 * time::Duration::DAY); - params.not_before = time::OffsetDateTime::now_utc().saturating_sub(30 * time::Duration::DAY); - - rcgen::Certificate::from_params(params) - .expect("fixed inputs") - .serialize_der() - .expect("fixed allocations") -} - -#[derive(Debug)] -pub(crate) enum RelayIo { - Derp(Framed), - Ws(WebSocketStream), -} - -fn tung_to_io_err(e: tungstenite::Error) -> std::io::Error { - match e { - tungstenite::Error::Io(io_err) => io_err, - _ => std::io::Error::new(std::io::ErrorKind::Other, e.to_string()), - } -} - -impl Sink for RelayIo { - type Error = std::io::Error; - - fn poll_ready(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { - match *self { - Self::Derp(ref mut framed) => Pin::new(framed).poll_ready(cx), - Self::Ws(ref mut ws) => Pin::new(ws).poll_ready(cx).map_err(tung_to_io_err), +/// Handles a single STUN request, doing all logging required. 
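From a client's perspective the STUN exchange served above is a single datagram each way: send a binding request, read back the reflexive address. A sketch mirroring `test_stun` at the bottom of this file; the `iroh_net::stun` path is assumed (inside this crate it is `crate::stun`) and the server address is a placeholder:

```rust
use anyhow::{ensure, Result};
use tokio::net::UdpSocket;

use iroh_net::stun; // path assumed for illustration

#[tokio::main]
async fn main() -> Result<()> {
    let server_addr: std::net::SocketAddr = "127.0.0.1:3478".parse()?; // placeholder
    let txid = stun::TransactionId::default();
    let socket = UdpSocket::bind("0.0.0.0:0").await?;
    socket.send_to(&stun::request(txid), server_addr).await?;

    // Read the binding response and check it answers our transaction.
    let mut buf = vec![0u8; 64 << 10];
    let (len, _from) = socket.recv_from(&mut buf).await?;
    buf.truncate(len);
    let (txid_back, observed) = stun::parse_response(&buf)?;
    ensure!(txid_back == txid, "transaction id mismatch");
    println!("the server saw us as {observed}");
    Ok(())
}
```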
+async fn handle_stun_request(src_addr: SocketAddr, pkt: Vec, sock: Arc) { + let handle = AbortingJoinHandle::from(tokio::task::spawn_blocking(move || { + match stun::parse_binding_request(&pkt) { + Ok(txid) => { + debug!(%src_addr, %txid, "STUN: received binding request"); + Some((txid, stun::response(txid, src_addr))) + } + Err(err) => { + inc!(StunMetrics, bad_requests); + warn!(%src_addr, "STUN: invalid binding request: {:?}", err); + None + } } - } - - fn start_send(mut self: Pin<&mut Self>, item: Frame) -> Result<(), Self::Error> { - match *self { - Self::Derp(ref mut framed) => Pin::new(framed).start_send(item), - Self::Ws(ref mut ws) => Pin::new(ws) - .start_send(tungstenite::Message::Binary(item.encode_for_ws_msg())) - .map_err(tung_to_io_err), + })); + let (txid, response) = match handle.await { + Ok(Some(val)) => val, + Ok(None) => return, + Err(err) => { + error!("{err:#}"); + return; } - } - - fn poll_flush(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { - match *self { - Self::Derp(ref mut framed) => Pin::new(framed).poll_flush(cx), - Self::Ws(ref mut ws) => Pin::new(ws).poll_flush(cx).map_err(tung_to_io_err), + }; + match sock.send_to(&response, src_addr).await { + Ok(len) => { + if len != response.len() { + warn!( + %src_addr, + %txid, + "failed to write response, {len}/{} bytes sent", + response.len() + ); + } else { + match src_addr { + SocketAddr::V4(_) => inc!(StunMetrics, ipv4_success), + SocketAddr::V6(_) => inc!(StunMetrics, ipv6_success), + } + } + trace!(%src_addr, %txid, "sent {len} bytes"); } - } - - fn poll_close(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { - match *self { - Self::Derp(ref mut framed) => Pin::new(framed).poll_close(cx), - Self::Ws(ref mut ws) => Pin::new(ws).poll_close(cx).map_err(tung_to_io_err), + Err(err) => { + inc!(StunMetrics, failures); + warn!(%src_addr, %txid, "failed to write response: {err:#}"); } } } -impl Stream for RelayIo { - type Item = anyhow::Result; +fn relay_disabled_handler( + _r: Request, + response: ResponseBuilder, +) -> HyperResult> { + response + .status(StatusCode::NOT_FOUND) + .body(RELAY_DISABLED.into()) + .map_err(|err| Box::new(err) as HyperError) +} - fn poll_next(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { - match *self { - Self::Derp(ref mut framed) => Pin::new(framed).poll_next(cx), - Self::Ws(ref mut ws) => match Pin::new(ws).poll_next(cx) { - Poll::Ready(Some(Ok(tungstenite::Message::Binary(vec)))) => { - Poll::Ready(Some(Frame::decode_from_ws_msg(vec))) - } - Poll::Ready(Some(Ok(msg))) => { - tracing::warn!(?msg, "Got websocket message of unsupported type, skipping."); - Poll::Pending - } - Poll::Ready(Some(Err(e))) => Poll::Ready(Some(Err(e.into()))), - Poll::Ready(None) => Poll::Ready(None), - Poll::Pending => Poll::Pending, - }, - } - } +fn root_handler( + _r: Request, + response: ResponseBuilder, +) -> HyperResult> { + response + .status(StatusCode::OK) + .header("Content-Type", "text/html; charset=utf-8") + .body(INDEX.into()) + .map_err(|err| Box::new(err) as HyperError) } -/// Whether or not the underlying [`tokio::net::TcpStream`] is served over Tls -#[derive(Debug)] -pub enum MaybeTlsStream { - /// A plain non-Tls [`tokio::net::TcpStream`] - Plain(tokio::net::TcpStream), - /// A Tls wrapped [`tokio::net::TcpStream`] - Tls(tokio_rustls::server::TlsStream), - #[cfg(test)] - Test(tokio::io::DuplexStream), +/// HTTP latency queries +fn probe_handler( + _r: Request, + response: ResponseBuilder, +) -> HyperResult> { + response + .status(StatusCode::OK) + 
.header("Access-Control-Allow-Origin", "*") + .body(body_empty()) + .map_err(|err| Box::new(err) as HyperError) } -impl AsyncRead for MaybeTlsStream { - fn poll_read( - mut self: Pin<&mut Self>, - cx: &mut Context<'_>, - buf: &mut tokio::io::ReadBuf<'_>, - ) -> Poll> { - match &mut *self { - MaybeTlsStream::Plain(ref mut s) => Pin::new(s).poll_read(cx, buf), - MaybeTlsStream::Tls(ref mut s) => Pin::new(s).poll_read(cx, buf), - #[cfg(test)] - MaybeTlsStream::Test(ref mut s) => Pin::new(s).poll_read(cx, buf), - } - } +fn robots_handler( + _r: Request, + response: ResponseBuilder, +) -> HyperResult> { + response + .status(StatusCode::OK) + .body(ROBOTS_TXT.into()) + .map_err(|err| Box::new(err) as HyperError) } -impl AsyncWrite for MaybeTlsStream { - fn poll_flush( - mut self: Pin<&mut Self>, - cx: &mut Context<'_>, - ) -> Poll> { - match &mut *self { - MaybeTlsStream::Plain(ref mut s) => Pin::new(s).poll_flush(cx), - MaybeTlsStream::Tls(ref mut s) => Pin::new(s).poll_flush(cx), - #[cfg(test)] - MaybeTlsStream::Test(ref mut s) => Pin::new(s).poll_flush(cx), +/// For captive portal detection. +fn serve_no_content_handler( + r: Request, + mut response: ResponseBuilder, +) -> HyperResult> { + if let Some(challenge) = r.headers().get(NO_CONTENT_CHALLENGE_HEADER) { + if !challenge.is_empty() + && challenge.len() < 64 + && challenge + .as_bytes() + .iter() + .all(|c| is_challenge_char(*c as char)) + { + response = response.header( + NO_CONTENT_RESPONSE_HEADER, + format!("response {}", challenge.to_str()?), + ); } } - fn poll_shutdown( - mut self: Pin<&mut Self>, - cx: &mut Context<'_>, - ) -> Poll> { - match &mut *self { - MaybeTlsStream::Plain(ref mut s) => Pin::new(s).poll_shutdown(cx), - MaybeTlsStream::Tls(ref mut s) => Pin::new(s).poll_shutdown(cx), - #[cfg(test)] - MaybeTlsStream::Test(ref mut s) => Pin::new(s).poll_shutdown(cx), - } - } + response + .status(StatusCode::NO_CONTENT) + .body(body_empty()) + .map_err(|err| Box::new(err) as HyperError) +} - fn poll_write( - mut self: Pin<&mut Self>, - cx: &mut Context<'_>, - buf: &[u8], - ) -> Poll> { - match &mut *self { - MaybeTlsStream::Plain(ref mut s) => Pin::new(s).poll_write(cx, buf), - MaybeTlsStream::Tls(ref mut s) => Pin::new(s).poll_write(cx, buf), - #[cfg(test)] - MaybeTlsStream::Test(ref mut s) => Pin::new(s).poll_write(cx, buf), - } - } +fn is_challenge_char(c: char) -> bool { + // Semi-randomly chosen as a limited set of valid characters + c.is_ascii_lowercase() + || c.is_ascii_uppercase() + || c.is_ascii_digit() + || c == '.' + || c == '-' + || c == '_' +} - fn poll_write_vectored( - mut self: Pin<&mut Self>, - cx: &mut Context<'_>, - bufs: &[std::io::IoSlice<'_>], - ) -> Poll> { - match &mut *self { - MaybeTlsStream::Plain(ref mut s) => Pin::new(s).poll_write_vectored(cx, bufs), - MaybeTlsStream::Tls(ref mut s) => Pin::new(s).poll_write_vectored(cx, bufs), - #[cfg(test)] - MaybeTlsStream::Test(ref mut s) => Pin::new(s).poll_write_vectored(cx, bufs), +/// This is a future that never returns, drop it to cancel/abort. +async fn run_captive_portal_service(http_listener: TcpListener) -> Result<()> { + info!("serving"); + + // If this future is cancelled, this is dropped and all tasks are aborted. 
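+    // A dropped `JoinSet` aborts every task still in it, which is what gives
+    // this future its drop-to-cancel behaviour. Unlike the STUN loop above,
+    // this loop never calls `join_next`, so the results of finished
+    // connection tasks simply accumulate in the set until it is dropped.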
+ let mut tasks = JoinSet::new(); + + loop { + match http_listener.accept().await { + Ok((stream, peer_addr)) => { + debug!(%peer_addr, "Connection opened",); + let handler = CaptivePortalService; + + tasks.spawn(async move { + let stream = crate::relay::server::streams::MaybeTlsStream::Plain(stream); + let stream = hyper_util::rt::TokioIo::new(stream); + if let Err(err) = hyper::server::conn::http1::Builder::new() + .serve_connection(stream, handler) + .with_upgrades() + .await + { + error!("Failed to serve connection: {err:?}"); + } + }); + } + Err(err) => { + error!( + "[CaptivePortalService] failed to accept connection: {:#?}", + err + ); + } } } } -struct ClientCounter { - clients: HashMap, - last_clear_date: Date, -} +#[derive(Clone)] +struct CaptivePortalService; -impl Default for ClientCounter { - fn default() -> Self { - Self { - clients: HashMap::new(), - last_clear_date: OffsetDateTime::now_utc().date(), +impl hyper::service::Service> for CaptivePortalService { + type Response = Response; + type Error = HyperError; + type Future = Pin> + Send>>; + + fn call(&self, req: Request) -> Self::Future { + match (req.method(), req.uri().path()) { + // Captive Portal checker + (&Method::GET, "/generate_204") => { + Box::pin(async move { serve_no_content_handler(req, Response::builder()) }) + } + _ => { + // Return 404 not found response. + let r = Response::builder() + .status(StatusCode::NOT_FOUND) + .body(NOTFOUND.into()) + .map_err(|err| Box::new(err) as HyperError); + Box::pin(async move { r }) + } } } } -impl ClientCounter { - fn check_and_clear(&mut self) { - let today = OffsetDateTime::now_utc().date(); - if today != self.last_clear_date { - self.clients.clear(); - self.last_clear_date = today; +mod stun_metrics { + use iroh_metrics::{ + core::{Counter, Metric}, + struct_iterable::Iterable, + }; + + /// StunMetrics tracked for the DERPER + #[allow(missing_docs)] + #[derive(Debug, Clone, Iterable)] + pub struct StunMetrics { + /* + * Metrics about STUN requests over ipv6 + */ + /// Number of stun requests made + pub requests: Counter, + /// Number of successful requests over ipv4 + pub ipv4_success: Counter, + /// Number of successful requests over ipv6 + pub ipv6_success: Counter, + + /// Number of bad requests, either non-stun packets or incorrect binding request + pub bad_requests: Counter, + /// Number of failures + pub failures: Counter, + } + + impl Default for StunMetrics { + fn default() -> Self { + Self { + /* + * Metrics about STUN requests + */ + requests: Counter::new("Number of STUN requests made to the server."), + ipv4_success: Counter::new("Number of successful ipv4 STUN requests served."), + ipv6_success: Counter::new("Number of successful ipv6 STUN requests served."), + bad_requests: Counter::new("Number of bad requests made to the STUN endpoint."), + failures: Counter::new("Number of STUN requests that end in failure."), + } } } - /// Updates the client counter. 
- pub fn update(&mut self, client: PublicKey) -> u64 { - self.check_and_clear(); - let new_conn = !self.clients.contains_key(&client); - let counter = self.clients.entry(client).or_insert(0); - *counter += 1; - new_conn as u64 + impl Metric for StunMetrics { + fn name() -> &'static str { + "stun" + } } } #[cfg(test)] mod tests { - use super::*; - - use crate::relay::{ - client::{ClientBuilder, ConnReader, ConnWriter, ReceivedMessage}, - codec::{recv_frame, FrameType}, - http::streams::{MaybeTlsStreamReader, MaybeTlsStreamWriter}, - types::ClientInfo, - }; - use tokio_util::codec::{FramedRead, FramedWrite}; - use tracing_subscriber::{prelude::*, EnvFilter}; + use std::net::Ipv4Addr; + use std::time::Duration; use bytes::Bytes; - use tokio::io::DuplexStream; - - fn test_client_builder( - key: PublicKey, - conn_num: usize, - server_channel: mpsc::Sender, - ) -> (ClientConnBuilder, Framed) { - let (test_io, io) = tokio::io::duplex(1024); - ( - ClientConnBuilder { - key, - conn_num, - io: RelayIo::Derp(Framed::new(MaybeTlsStream::Test(io), DerpCodec)), - write_timeout: None, - channel_capacity: 10, - server_channel, - }, - Framed::new(test_io, DerpCodec), - ) + use http::header::UPGRADE; + use iroh_base::node_addr::RelayUrl; + + use crate::relay::client::conn::ReceivedMessage; + use crate::relay::client::ClientBuilder; + use crate::relay::http::{Protocol, HTTP_UPGRADE_PROTOCOL}; + + use super::*; + + async fn spawn_local_relay() -> Result { + Server::spawn(ServerConfig::<(), ()> { + relay: Some(RelayConfig { + secret_key: SecretKey::generate(), + http_bind_addr: (Ipv4Addr::LOCALHOST, 0).into(), + tls: None, + limits: Default::default(), + }), + stun: None, + metrics_addr: None, + }) + .await } #[tokio::test] - async fn test_server_actor() -> Result<()> { - let server_key = SecretKey::generate().public(); - - // make server actor - let (server_channel, server_channel_r) = mpsc::channel(20); - let server_actor: ServerActor = ServerActor::new(server_key, server_channel_r); - let done = CancellationToken::new(); - let server_done = done.clone(); - - // run server actor - let server_task = tokio::spawn( - async move { server_actor.run(server_done).await } - .instrument(info_span!("relay.server")), - ); - - let key_a = SecretKey::generate().public(); - let (client_a, mut a_io) = test_client_builder(key_a, 1, server_channel.clone()); - - // create client a - server_channel - .send(ServerMessage::CreateClient(client_a)) + async fn test_no_services() { + let _guard = iroh_test::logging::setup(); + let mut server = Server::spawn(ServerConfig::<(), ()>::default()) .await - .map_err(|_| anyhow::anyhow!("server gone"))?; - - // server message: create client b - let key_b = SecretKey::generate().public(); - let (client_b, mut b_io) = test_client_builder(key_b, 2, server_channel.clone()); - server_channel - .send(ServerMessage::CreateClient(client_b)) + .unwrap(); + let res = tokio::time::timeout(Duration::from_secs(5), server.task_handle()) .await - .map_err(|_| anyhow::anyhow!("server gone"))?; - - // write message from b to a - let msg = b"hello world!"; - crate::relay::client::send_packet(&mut b_io, &None, key_a, Bytes::from_static(msg)).await?; - - // get message on a's reader - let frame = recv_frame(FrameType::RecvPacket, &mut a_io).await?; - assert_eq!( - frame, - Frame::RecvPacket { - src_key: key_b, - content: msg.to_vec().into() - } - ); + .expect("timeout, server not finished") + .expect("server task JoinError"); + assert!(res.is_err()); + } - // remove b - server_channel - 
.send(ServerMessage::RemoveClient((key_b, 2))) + #[tokio::test] + async fn test_conflicting_bind() { + let _guard = iroh_test::logging::setup(); + let mut server = Server::spawn(ServerConfig::<(), ()> { + relay: Some(RelayConfig { + secret_key: SecretKey::generate(), + http_bind_addr: (Ipv4Addr::LOCALHOST, 1234).into(), + tls: None, + limits: Default::default(), + }), + stun: None, + metrics_addr: Some((Ipv4Addr::LOCALHOST, 1234).into()), + }) + .await + .unwrap(); + let res = tokio::time::timeout(Duration::from_secs(5), server.task_handle()) .await - .map_err(|_| anyhow::anyhow!("server gone"))?; + .expect("timeout, server not finished") + .expect("server task JoinError"); + assert!(res.is_err()); // AddrInUse + } - // get peer gone message on a about b leaving the network - // (we get this message because b has sent us a packet before) - let frame = recv_frame(FrameType::PeerGone, &mut a_io).await?; - assert_eq!(Frame::PeerGone { peer: key_b }, frame); + #[tokio::test] + async fn test_root_handler() { + let _guard = iroh_test::logging::setup(); + let server = spawn_local_relay().await.unwrap(); + let url = format!("http://{}", server.http_addr().unwrap()); + + let response = reqwest::get(&url).await.unwrap(); + assert_eq!(response.status(), 200); + let body = response.text().await.unwrap(); + assert!(body.contains("iroh.computer")); + } - // close gracefully - server_channel - .send(ServerMessage::Shutdown) + #[tokio::test] + async fn test_captive_portal_service() { + let _guard = iroh_test::logging::setup(); + let server = spawn_local_relay().await.unwrap(); + let url = format!("http://{}/generate_204", server.http_addr().unwrap()); + let challenge = "123az__."; + + let client = reqwest::Client::new(); + let response = client + .get(&url) + .header(NO_CONTENT_CHALLENGE_HEADER, challenge) + .send() .await - .map_err(|_| anyhow::anyhow!("server gone"))?; - server_task.await??; - Ok(()) + .unwrap(); + assert_eq!(response.status(), StatusCode::NO_CONTENT); + let header = response.headers().get(NO_CONTENT_RESPONSE_HEADER).unwrap(); + assert_eq!(header.to_str().unwrap(), format!("response {challenge}")); + let body = response.text().await.unwrap(); + assert!(body.is_empty()); } #[tokio::test] - async fn test_client_conn_handler() -> Result<()> { - // create client connection handler - let (server_channel_s, mut server_channel_r) = mpsc::channel(10); - let client_key = SecretKey::generate(); - let handler = ClientConnHandler { - secret_key: client_key.clone(), - write_timeout: None, - server_channel: server_channel_s, - default_headers: Default::default(), - }; + async fn test_relay_client_legacy_route() { + let _guard = iroh_test::logging::setup(); + let server = spawn_local_relay().await.unwrap(); + // We're testing the legacy endpoint at `/derp` + let endpoint_url = format!("http://{}/derp", server.http_addr().unwrap()); + + let client = reqwest::Client::new(); + let result = client + .get(endpoint_url) + .header(UPGRADE, HTTP_UPGRADE_PROTOCOL) + .send() + .await + .unwrap(); - // create the parts needed for a client - let (client, server_io) = tokio::io::duplex(10); - let (client_reader, client_writer) = tokio::io::split(client); - let _client_reader = FramedRead::new(client_reader, DerpCodec); - let mut client_writer = FramedWrite::new(client_writer, DerpCodec); - - // start a task as if a client is doing the "accept" handshake - let pub_client_key = client_key.public(); - let client_task: JoinHandle> = tokio::spawn(async move { - // send the client info - let client_info = ClientInfo { - 
version: PROTOCOL_VERSION, - }; - crate::relay::codec::send_client_key(&mut client_writer, &client_key, &client_info) - .await?; + assert_eq!(result.status(), StatusCode::SWITCHING_PROTOCOLS); + } - Ok(()) - }); - - // attempt to add the connection to the server - handler - .accept(Protocol::Relay, MaybeTlsStream::Test(server_io)) - .await?; - client_task.await??; - - // ensure we inform the server to create the client from the connection! - match server_channel_r.recv().await.unwrap() { - ServerMessage::CreateClient(builder) => { - assert_eq!(pub_client_key, builder.key); + #[tokio::test] + async fn test_relay_clients_both_derp() { + let _guard = iroh_test::logging::setup(); + let server = spawn_local_relay().await.unwrap(); + let relay_url = format!("http://{}", server.http_addr().unwrap()); + let relay_url: RelayUrl = relay_url.parse().unwrap(); + + // set up client a + let a_secret_key = SecretKey::generate(); + let a_key = a_secret_key.public(); + let resolver = crate::dns::default_resolver().clone(); + let (client_a, mut client_a_receiver) = + ClientBuilder::new(relay_url.clone()).build(a_secret_key, resolver); + let connect_client = client_a.clone(); + + // give the relay server some time to accept connections + if let Err(err) = tokio::time::timeout(Duration::from_secs(10), async move { + loop { + match connect_client.connect().await { + Ok(_) => break, + Err(err) => { + warn!("client unable to connect to relay server: {err:#}"); + tokio::time::sleep(Duration::from_millis(100)).await; + } + } } - _ => anyhow::bail!("unexpected server message"), + }) + .await + { + panic!("error connecting to relay server: {err:#}"); } - Ok(()) - } - fn make_test_client(secret_key: SecretKey) -> (tokio::io::DuplexStream, ClientBuilder) { - let (client, server) = tokio::io::duplex(10); - let (client_reader, client_writer) = tokio::io::split(client); - - let client_reader = MaybeTlsStreamReader::Mem(client_reader); - let client_writer = MaybeTlsStreamWriter::Mem(client_writer); + // set up client b + let b_secret_key = SecretKey::generate(); + let b_key = b_secret_key.public(); + let resolver = crate::dns::default_resolver().clone(); + let (client_b, mut client_b_receiver) = + ClientBuilder::new(relay_url.clone()).build(b_secret_key, resolver); + client_b.connect().await.unwrap(); + + // send message from a to b + let msg = Bytes::from("hello, b"); + client_a.send(b_key, msg.clone()).await.unwrap(); + + let (res, _) = client_b_receiver.recv().await.unwrap().unwrap(); + if let ReceivedMessage::ReceivedPacket { source, data } = res { + assert_eq!(a_key, source); + assert_eq!(msg, data); + } else { + panic!("client_b received unexpected message {res:?}"); + } - let client_reader = ConnReader::Derp(FramedRead::new(client_reader, DerpCodec)); - let client_writer = ConnWriter::Derp(FramedWrite::new(client_writer, DerpCodec)); + // send message from b to a + let msg = Bytes::from("howdy, a"); + client_b.send(a_key, msg.clone()).await.unwrap(); - ( - server, - ClientBuilder::new(secret_key, None, client_reader, client_writer), - ) + let (res, _) = client_a_receiver.recv().await.unwrap().unwrap(); + if let ReceivedMessage::ReceivedPacket { source, data } = res { + assert_eq!(b_key, source); + assert_eq!(msg, data); + } else { + panic!("client_a received unexpected message {res:?}"); + } } #[tokio::test] - async fn test_server_basic() -> Result<()> { + async fn test_relay_clients_both_websockets() { let _guard = iroh_test::logging::setup(); - - // create the server! 
- let server_key = SecretKey::generate(); - let server: Server = Server::new(server_key); - - // create client a and connect it to the server - let key_a = SecretKey::generate(); - let public_key_a = key_a.public(); - let (rw_a, client_a_builder) = make_test_client(key_a); - let handler = server.client_conn_handler(Default::default()); - let handler_task = tokio::spawn(async move { - handler - .accept(Protocol::Relay, MaybeTlsStream::Test(rw_a)) - .await - }); - let (client_a, mut client_receiver_a) = client_a_builder.build().await?; - handler_task.await??; - - // create client b and connect it to the server - let key_b = SecretKey::generate(); - let public_key_b = key_b.public(); - let (rw_b, client_b_builder) = make_test_client(key_b); - let handler = server.client_conn_handler(Default::default()); - let handler_task = tokio::spawn(async move { - handler - .accept(Protocol::Relay, MaybeTlsStream::Test(rw_b)) - .await - }); - let (client_b, mut client_receiver_b) = client_b_builder.build().await?; - handler_task.await??; - - // send message from a to b! - let msg = Bytes::from_static(b"hello client b!!"); - client_a.send(public_key_b, msg.clone()).await?; - match client_receiver_b.recv().await? { - ReceivedMessage::ReceivedPacket { source, data } => { - assert_eq!(public_key_a, source); - assert_eq!(&msg[..], data); - } - msg => { - anyhow::bail!("expected ReceivedPacket msg, got {msg:?}"); + let server = spawn_local_relay().await.unwrap(); + + let relay_url = format!("http://{}", server.http_addr().unwrap()); + let relay_url: RelayUrl = relay_url.parse().unwrap(); + + // set up client a + let a_secret_key = SecretKey::generate(); + let a_key = a_secret_key.public(); + let resolver = crate::dns::default_resolver().clone(); + let (client_a, mut client_a_receiver) = ClientBuilder::new(relay_url.clone()) + .protocol(Protocol::Websocket) + .build(a_secret_key, resolver); + let connect_client = client_a.clone(); + + // give the relay server some time to accept connections + if let Err(err) = tokio::time::timeout(Duration::from_secs(10), async move { + loop { + match connect_client.connect().await { + Ok(_) => break, + Err(err) => { + warn!("client unable to connect to relay server: {err:#}"); + tokio::time::sleep(Duration::from_millis(100)).await; + } + } } + }) + .await + { + panic!("error connecting to relay server: {err:#}"); } - // send message from b to a! - let msg = Bytes::from_static(b"nice to meet you client a!!"); - client_b.send(public_key_a, msg.clone()).await?; - match client_receiver_a.recv().await? 
{ - ReceivedMessage::ReceivedPacket { source, data } => { - assert_eq!(public_key_b, source); - assert_eq!(&msg[..], data); - } - msg => { - anyhow::bail!("expected ReceivedPacket msg, got {msg:?}"); - } + // set up client b + let b_secret_key = SecretKey::generate(); + let b_key = b_secret_key.public(); + let resolver = crate::dns::default_resolver().clone(); + let (client_b, mut client_b_receiver) = ClientBuilder::new(relay_url.clone()) + .protocol(Protocol::Websocket) // another websocket client + .build(b_secret_key, resolver); + client_b.connect().await.unwrap(); + + // send message from a to b + let msg = Bytes::from("hello, b"); + client_a.send(b_key, msg.clone()).await.unwrap(); + + let (res, _) = client_b_receiver.recv().await.unwrap().unwrap(); + if let ReceivedMessage::ReceivedPacket { source, data } = res { + assert_eq!(a_key, source); + assert_eq!(msg, data); + } else { + panic!("client_b received unexpected message {res:?}"); } - // close the server and clients - server.close().await; + // send message from b to a + let msg = Bytes::from("howdy, a"); + client_b.send(a_key, msg.clone()).await.unwrap(); - // client connections have been shutdown - let res = client_a - .send(public_key_b, Bytes::from_static(b"try to send")) - .await; - assert!(res.is_err()); - assert!(client_receiver_b.recv().await.is_err()); - Ok(()) + let (res, _) = client_a_receiver.recv().await.unwrap().unwrap(); + if let ReceivedMessage::ReceivedPacket { source, data } = res { + assert_eq!(b_key, source); + assert_eq!(msg, data); + } else { + panic!("client_a received unexpected message {res:?}"); + } } #[tokio::test] - async fn test_server_replace_client() -> Result<()> { - tracing_subscriber::registry() - .with(tracing_subscriber::fmt::layer().with_writer(std::io::stderr)) - .with(EnvFilter::from_default_env()) - .try_init() - .ok(); - - // create the server! - let server_key = SecretKey::generate(); - let server: Server = Server::new(server_key); - - // create client a and connect it to the server - let key_a = SecretKey::generate(); - let public_key_a = key_a.public(); - let (rw_a, client_a_builder) = make_test_client(key_a); - let handler = server.client_conn_handler(Default::default()); - let handler_task = tokio::spawn(async move { - handler - .accept(Protocol::Relay, MaybeTlsStream::Test(rw_a)) - .await - }); - let (client_a, mut client_receiver_a) = client_a_builder.build().await?; - handler_task.await??; - - // create client b and connect it to the server - let key_b = SecretKey::generate(); - let public_key_b = key_b.public(); - let (rw_b, client_b_builder) = make_test_client(key_b.clone()); - let handler = server.client_conn_handler(Default::default()); - let handler_task = tokio::spawn(async move { - handler - .accept(Protocol::Relay, MaybeTlsStream::Test(rw_b)) - .await - }); - let (client_b, mut client_receiver_b) = client_b_builder.build().await?; - handler_task.await??; - - // send message from a to b! - let msg = Bytes::from_static(b"hello client b!!"); - client_a.send(public_key_b, msg.clone()).await?; - match client_receiver_b.recv().await? 
{ - ReceivedMessage::ReceivedPacket { source, data } => { - assert_eq!(public_key_a, source); - assert_eq!(&msg[..], data); - } - msg => { - anyhow::bail!("expected ReceivedPacket msg, got {msg:?}"); + async fn test_relay_clients_websocket_and_derp() { + let _guard = iroh_test::logging::setup(); + let server = spawn_local_relay().await.unwrap(); + + let relay_url = format!("http://{}", server.http_addr().unwrap()); + let relay_url: RelayUrl = relay_url.parse().unwrap(); + + // set up client a + let a_secret_key = SecretKey::generate(); + let a_key = a_secret_key.public(); + let resolver = crate::dns::default_resolver().clone(); + let (client_a, mut client_a_receiver) = + ClientBuilder::new(relay_url.clone()).build(a_secret_key, resolver); + let connect_client = client_a.clone(); + + // give the relay server some time to accept connections + if let Err(err) = tokio::time::timeout(Duration::from_secs(10), async move { + loop { + match connect_client.connect().await { + Ok(_) => break, + Err(err) => { + warn!("client unable to connect to relay server: {err:#}"); + tokio::time::sleep(Duration::from_millis(100)).await; + } + } } + }) + .await + { + panic!("error connecting to relay server: {err:#}"); } - // send message from b to a! - let msg = Bytes::from_static(b"nice to meet you client a!!"); - client_b.send(public_key_a, msg.clone()).await?; - match client_receiver_a.recv().await? { - ReceivedMessage::ReceivedPacket { source, data } => { - assert_eq!(public_key_b, source); - assert_eq!(&msg[..], data); - } - msg => { - anyhow::bail!("expected ReceivedPacket msg, got {msg:?}"); - } + // set up client b + let b_secret_key = SecretKey::generate(); + let b_key = b_secret_key.public(); + let resolver = crate::dns::default_resolver().clone(); + let (client_b, mut client_b_receiver) = ClientBuilder::new(relay_url.clone()) + .protocol(Protocol::Websocket) // Use websockets + .build(b_secret_key, resolver); + client_b.connect().await.unwrap(); + + // send message from a to b + let msg = Bytes::from("hello, b"); + client_a.send(b_key, msg.clone()).await.unwrap(); + + let (res, _) = client_b_receiver.recv().await.unwrap().unwrap(); + if let ReceivedMessage::ReceivedPacket { source, data } = res { + assert_eq!(a_key, source); + assert_eq!(msg, data); + } else { + panic!("client_b received unexpected message {res:?}"); } - // create client b and connect it to the server - let (new_rw_b, new_client_b_builder) = make_test_client(key_b); - let handler = server.client_conn_handler(Default::default()); - let handler_task = tokio::spawn(async move { - handler - .accept(Protocol::Relay, MaybeTlsStream::Test(new_rw_b)) - .await - }); - let (new_client_b, mut new_client_receiver_b) = new_client_b_builder.build().await?; - handler_task.await??; - - // assert!(client_b.recv().await.is_err()); - - // send message from a to b! - let msg = Bytes::from_static(b"are you still there, b?!"); - client_a.send(public_key_b, msg.clone()).await?; - match new_client_receiver_b.recv().await? { - ReceivedMessage::ReceivedPacket { source, data } => { - assert_eq!(public_key_a, source); - assert_eq!(&msg[..], data); - } - msg => { - anyhow::bail!("expected ReceivedPacket msg, got {msg:?}"); - } - } + // send message from b to a + let msg = Bytes::from("howdy, a"); + client_b.send(a_key, msg.clone()).await.unwrap(); - // send message from b to a! - let msg = Bytes::from_static(b"just had a spot of trouble but I'm back now,a!!"); - new_client_b.send(public_key_a, msg.clone()).await?; - match client_receiver_a.recv().await? 
{ - ReceivedMessage::ReceivedPacket { source, data } => { - assert_eq!(public_key_b, source); - assert_eq!(&msg[..], data); - } - msg => { - anyhow::bail!("expected ReceivedPacket msg, got {msg:?}"); - } + let (res, _) = client_a_receiver.recv().await.unwrap().unwrap(); + if let ReceivedMessage::ReceivedPacket { source, data } = res { + assert_eq!(b_key, source); + assert_eq!(msg, data); + } else { + panic!("client_a received unexpected message {res:?}"); } + } - // close the server and clients - server.close().await; - - // client connections have been shutdown - let res = client_a - .send(public_key_b, Bytes::from_static(b"try to send")) - .await; - assert!(res.is_err()); - assert!(new_client_receiver_b.recv().await.is_err()); - Ok(()) + #[tokio::test] + async fn test_stun() { + let _guard = iroh_test::logging::setup(); + let server = Server::spawn(ServerConfig::<(), ()> { + relay: None, + stun: Some(StunConfig { + bind_addr: (Ipv4Addr::LOCALHOST, 0).into(), + }), + metrics_addr: None, + }) + .await + .unwrap(); + + let txid = stun::TransactionId::default(); + let req = stun::request(txid); + let socket = UdpSocket::bind("127.0.0.1:0").await.unwrap(); + socket + .send_to(&req, server.stun_addr().unwrap()) + .await + .unwrap(); + + // get response + let mut buf = vec![0u8; 64000]; + let (len, addr) = socket.recv_from(&mut buf).await.unwrap(); + assert_eq!(addr, server.stun_addr().unwrap()); + buf.truncate(len); + let (txid_back, response_addr) = stun::parse_response(&buf).unwrap(); + assert_eq!(txid, txid_back); + assert_eq!(response_addr, socket.local_addr().unwrap()); } } diff --git a/iroh-net/src/relay/server/actor.rs b/iroh-net/src/relay/server/actor.rs new file mode 100644 index 0000000000..1b561fdcdf --- /dev/null +++ b/iroh-net/src/relay/server/actor.rs @@ -0,0 +1,761 @@ +//! The main event loop for the relay server. +//! +//! based on tailscale/derp/derp_server.go + +use std::collections::HashMap; +use std::sync::atomic::{AtomicUsize, Ordering}; +use std::sync::Arc; +use std::time::Duration; + +use anyhow::{bail, Context as _, Result}; +use hyper::HeaderMap; +use iroh_metrics::core::UsageStatsReport; +use iroh_metrics::{inc, inc_by, report_usage_stats}; +use time::{Date, OffsetDateTime}; +use tokio::sync::mpsc; +use tokio_tungstenite::WebSocketStream; +use tokio_util::codec::Framed; +use tokio_util::sync::CancellationToken; +use tracing::{info_span, trace, Instrument}; +use tungstenite::protocol::Role; + +use crate::key::{PublicKey, SecretKey}; +use crate::relay::http::Protocol; +use crate::relay::server::streams::{MaybeTlsStream, RelayIo}; +use crate::relay::server::types::ServerMessage; +use crate::relay::{ + codec::{ + recv_client_key, DerpCodec, PER_CLIENT_SEND_QUEUE_DEPTH, PROTOCOL_VERSION, + SERVER_CHANNEL_SIZE, + }, + server::client_conn::ClientConnBuilder, + server::clients::Clients, + server::metrics::Metrics, +}; +use crate::util::AbortingJoinHandle; + +// TODO: skipping `verboseDropKeys` for now + +static CONN_NUM: AtomicUsize = AtomicUsize::new(1); +fn new_conn_num() -> usize { + CONN_NUM.fetch_add(1, Ordering::Relaxed) +} + +pub(crate) const WRITE_TIMEOUT: Duration = Duration::from_secs(2); + +/// The task for a running server actor. +/// +/// Will forcefully abort the server actor loop when dropped. +/// For stopping gracefully, use [`ServerActorTask::close`]. +/// +/// Responsible for managing connections to relay [`Conn`](crate::relay::RelayConn)s, sending packets from one client to another. 
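+///
+/// A rough lifecycle sketch (mirroring the tests at the bottom of this file;
+/// error handling elided):
+///
+/// ```ignore
+/// let server = ServerActorTask::new(SecretKey::generate());
+/// // hand each accepted connection to the actor through a handler:
+/// let handler = server.client_conn_handler(Default::default());
+/// // ... accept connections and call `handler.accept(...)` on each ...
+/// server.close().await; // drains clients, then stops the actor loop
+/// ```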
+#[derive(Debug)] +pub struct ServerActorTask { + /// Optionally specifies how long to wait before failing when writing + /// to a client + write_timeout: Option, + /// secret_key of the client + secret_key: SecretKey, + /// The DER encoded x509 cert to send after `LetsEncrypt` cert+intermediate. + meta_cert: Vec, + /// Channel on which to communicate to the [`ServerActor`] + server_channel: mpsc::Sender, + /// When true, the server has been shutdown. + closed: bool, + /// Server loop handler + loop_handler: AbortingJoinHandle>, + /// Done token, forces a hard shutdown. To gracefully shutdown, use [`ServerActorTask::close`] + cancel: CancellationToken, + // TODO: stats collection +} + +impl ServerActorTask { + /// TODO: replace with builder + pub fn new(key: SecretKey) -> Self { + let (server_channel_s, server_channel_r) = mpsc::channel(SERVER_CHANNEL_SIZE); + let server_actor = ServerActor::new(key.public(), server_channel_r); + let cancel_token = CancellationToken::new(); + let done = cancel_token.clone(); + let server_task = tokio::spawn( + async move { server_actor.run(done).await } + .instrument(info_span!("relay.server", me = %key.public().fmt_short())), + ) + .into(); + let meta_cert = init_meta_cert(&key.public()); + Self { + write_timeout: Some(WRITE_TIMEOUT), + secret_key: key, + meta_cert, + server_channel: server_channel_s, + closed: false, + loop_handler: server_task, + cancel: cancel_token, + } + } + + /// Returns the server's secret key. + pub fn secret_key(&self) -> &SecretKey { + &self.secret_key + } + + /// Returns the server's public key. + pub fn public_key(&self) -> PublicKey { + self.secret_key.public() + } + + /// Closes the server and waits for the connections to disconnect. + pub async fn close(mut self) { + if !self.closed { + if let Err(err) = self.server_channel.send(ServerMessage::Shutdown).await { + tracing::warn!( + "could not shutdown the server gracefully, doing a forced shutdown: {:?}", + err + ); + self.cancel.cancel(); + } + match self.loop_handler.await { + Ok(Ok(())) => {} + Ok(Err(e)) => tracing::warn!("error shutting down server: {e:?}"), + Err(e) => tracing::warn!("error waiting for the server process to close: {e:?}"), + } + self.closed = true; + } + } + + /// Aborts the server. + /// + /// You should prefer to use [`ServerActorTask::close`] for a graceful shutdown. + pub fn abort(&self) { + self.cancel.cancel(); + } + + /// Whether or not the relay [`ServerActorTask`] is closed. + pub fn is_closed(&self) -> bool { + self.closed + } + + /// Create a [`ClientConnHandler`], which can verify connections and add them to the + /// [`ServerActorTask`]. + pub fn client_conn_handler(&self, default_headers: HeaderMap) -> ClientConnHandler { + ClientConnHandler { + server_channel: self.server_channel.clone(), + secret_key: self.secret_key.clone(), + write_timeout: self.write_timeout, + default_headers: Arc::new(default_headers), + } + } + + /// Returns the server metadata cert that can be sent by the TLS server to + /// let the client skip a round trip during start-up. + pub fn meta_cert(&self) -> &[u8] { + &self.meta_cert + } +} + +/// Handle incoming connections to the Server. +/// +/// Created by the [`ServerActorTask`] by calling [`ServerActorTask::client_conn_handler`]. +/// +/// Can be cheaply cloned. 
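+///
+/// A sketch of how an accept loop might drive it (listener setup elided; the
+/// real TLS and HTTP-upgrade plumbing lives in `http_server.rs`):
+///
+/// ```ignore
+/// let handler = server.client_conn_handler(HeaderMap::new());
+/// loop {
+///     let (stream, _addr) = listener.accept().await?;
+///     let handler = handler.clone(); // cheap per-connection clone
+///     tokio::spawn(async move {
+///         handler
+///             .accept(Protocol::Relay, MaybeTlsStream::Plain(stream))
+///             .await
+///     });
+/// }
+/// ```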
+#[derive(Debug)] +pub struct ClientConnHandler { + server_channel: mpsc::Sender, + secret_key: SecretKey, + write_timeout: Option, + pub(crate) default_headers: Arc, +} + +impl Clone for ClientConnHandler { + fn clone(&self) -> Self { + Self { + server_channel: self.server_channel.clone(), + secret_key: self.secret_key.clone(), + write_timeout: self.write_timeout, + default_headers: Arc::clone(&self.default_headers), + } + } +} + +impl ClientConnHandler { + /// Adds a new connection to the server and serves it. + /// + /// Will error if it takes too long (10 sec) to write or read to the connection, if there is + /// some read or write error to the connection, if the server is meant to verify clients, + /// and is unable to verify this one, or if there is some issue communicating with the server. + /// + /// The provided [`AsyncRead`] and [`AsyncWrite`] must be already connected to the connection. + /// + /// [`AsyncRead`]: tokio::io::AsyncRead + /// [`AsyncWrite`]: tokio::io::AsyncWrite + pub async fn accept(&self, protocol: Protocol, io: MaybeTlsStream) -> Result<()> { + trace!(?protocol, "accept: start"); + let mut io = match protocol { + Protocol::Relay => { + inc!(Metrics, derp_accepts); + RelayIo::Derp(Framed::new(io, DerpCodec)) + } + Protocol::Websocket => { + inc!(Metrics, websocket_accepts); + RelayIo::Ws(WebSocketStream::from_raw_socket(io, Role::Server, None).await) + } + }; + trace!("accept: recv client key"); + let (client_key, info) = recv_client_key(&mut io) + .await + .context("unable to receive client information")?; + + if info.version != PROTOCOL_VERSION { + bail!( + "unexpected client version {}, expected {}", + info.version, + PROTOCOL_VERSION + ); + } + + trace!("accept: build client conn"); + let client_conn_builder = ClientConnBuilder { + key: client_key, + conn_num: new_conn_num(), + io, + write_timeout: self.write_timeout, + channel_capacity: PER_CLIENT_SEND_QUEUE_DEPTH, + server_channel: self.server_channel.clone(), + }; + trace!("accept: create client"); + self.server_channel + .send(ServerMessage::CreateClient(client_conn_builder)) + .await + .map_err(|_| { + anyhow::anyhow!("server channel closed, the server is probably shutdown") + })?; + Ok(()) + } +} + +struct ServerActor { + key: PublicKey, + receiver: mpsc::Receiver, + /// All clients connected to this server + clients: Clients, + client_counter: ClientCounter, +} + +impl ServerActor { + fn new(key: PublicKey, receiver: mpsc::Receiver) -> Self { + Self { + key, + receiver, + clients: Clients::new(), + client_counter: ClientCounter::default(), + } + } + + async fn run(mut self, done: CancellationToken) -> Result<()> { + loop { + tokio::select! 
{
+            biased;
+            _ = done.cancelled() => {
+                tracing::warn!("server actor loop cancelled, closing loop");
+                // TODO: stats: drain channel & count dropped packets etc
+                // close all client connections and client read/write loops
+                self.clients.shutdown().await;
+                return Ok(());
+            }
+            msg = self.receiver.recv() => {
+                let msg = match msg {
+                    Some(m) => m,
+                    None => {
+                        tracing::warn!("server channel sender closed unexpectedly, shutting down server loop");
+                        self.clients.shutdown().await;
+                        anyhow::bail!("server channel sender closed unexpectedly, closed client connections, and shutting down server loop");
+                    }
+                };
+                match msg {
+                    ServerMessage::SendPacket((key, packet)) => {
+                        tracing::trace!("send packet from: {:?} to: {:?} ({}b)", packet.src, key, packet.bytes.len());
+                        let src = packet.src;
+                        if self.clients.contains_key(&key) {
+                            // if this client is in our local network, just try to send the
+                            // packet
+                            if self.clients.send_packet(&key, packet).is_ok() {
+                                self.clients.record_send(&src, key);
+                            }
+                        } else {
+                            tracing::warn!("send packet: no way to reach client {key:?}, dropped packet");
+                            inc!(Metrics, send_packets_dropped);
+                        }
+                    }
+                    ServerMessage::SendDiscoPacket((key, packet)) => {
+                        tracing::trace!("send disco packet from: {:?} to: {:?} ({}b)", packet.src, key, packet.bytes.len());
+                        let src = packet.src;
+                        if self.clients.contains_key(&key) {
+                            // if this client is in our local network, just try to send the
+                            // packet
+                            if self.clients.send_disco_packet(&key, packet).is_ok() {
+                                self.clients.record_send(&src, key);
+                            }
+                        } else {
+                            tracing::warn!("send disco packet: no way to reach client {key:?}, dropped packet");
+                            inc!(Metrics, disco_packets_dropped);
+                        }
+                    }
+                    ServerMessage::CreateClient(client_builder) => {
+                        inc!(Metrics, accepts);
+
+                        tracing::trace!("create client: {:?}", client_builder.key);
+                        let key = client_builder.key;
+
+                        report_usage_stats(&UsageStatsReport::new(
+                            "relay_accepts".to_string(),
+                            self.key.to_string(),
+                            1,
+                            None, // TODO(arqu): attribute to user id; possibly with the re-introduction of request tokens or other auth
+                            Some(key.to_string()),
+                        )).await;
+                        let nc = self.client_counter.update(key);
+                        inc_by!(Metrics, unique_client_keys, nc);
+
+                        // build and register client, starting up read & write loops for the
+                        // client connection
+                        self.clients.register(client_builder);
+
+                    }
+                    ServerMessage::RemoveClient((key, conn_num)) => {
+                        inc!(Metrics, disconnects);
+                        tracing::trace!("remove client: {:?}", key);
+                        // ensure we still have the client in question
+                        if self.clients.has_client(&key, conn_num) {
+                            // remove the client from the map of clients, & notify any peers that it
+                            // has sent messages that it has left the network
+                            self.clients.unregister(&key);
+                        }
+                    }
+                    ServerMessage::Shutdown => {
+                        tracing::info!("server gracefully shutting down...");
+                        // close all client connections and client read/write loops
+                        self.clients.shutdown().await;
+                        return Ok(());
+                    }
+                }
+            }
+        }
+    }
+}
+
+/// Initializes the [`ServerActor`] with a self-signed x509 cert
+/// encoding this server's public key and protocol version. `cmd/relay_server`
+/// then sends this after the Let's Encrypt leaf + intermediate certs after
+/// the ServerHello (encrypted in TLS 1.3, not that it matters much).
+///
+/// Then the client can save a round trip getting that and can start speaking
+/// relay right away. 
(we don't use ALPN because that's sent in the clear and +/// we're being paranoid to not look too weird to any middleboxes, given that +/// relay is an ultimate fallback path). But since the post-ServerHello certs +/// are encrypted we can have the client also use them as a signal to be able +/// to start speaking relay right away, starting with its identity proof, +/// encrypted to the server's public key. +/// +/// This RTT optimization fails where there's a corp-mandated TLS proxy with +/// corp-mandated root certs on employee machines and TLS proxy cleans up +/// unnecessary certs. In that case we just fall back to the extra RTT. +fn init_meta_cert(server_key: &PublicKey) -> Vec { + let mut params = + rcgen::CertificateParams::new([format!("derpkey{}", hex::encode(server_key.as_bytes()))]); + params.serial_number = Some((PROTOCOL_VERSION as u64).into()); + // Windows requires not_after and not_before set: + params.not_after = time::OffsetDateTime::now_utc().saturating_add(30 * time::Duration::DAY); + params.not_before = time::OffsetDateTime::now_utc().saturating_sub(30 * time::Duration::DAY); + + rcgen::Certificate::from_params(params) + .expect("fixed inputs") + .serialize_der() + .expect("fixed allocations") +} + +struct ClientCounter { + clients: HashMap, + last_clear_date: Date, +} + +impl Default for ClientCounter { + fn default() -> Self { + Self { + clients: HashMap::new(), + last_clear_date: OffsetDateTime::now_utc().date(), + } + } +} + +impl ClientCounter { + fn check_and_clear(&mut self) { + let today = OffsetDateTime::now_utc().date(); + if today != self.last_clear_date { + self.clients.clear(); + self.last_clear_date = today; + } + } + + /// Updates the client counter. + pub fn update(&mut self, client: PublicKey) -> u64 { + self.check_and_clear(); + let new_conn = !self.clients.contains_key(&client); + let counter = self.clients.entry(client).or_insert(0); + *counter += 1; + new_conn as u64 + } +} + +#[cfg(test)] +mod tests { + use super::*; + + use crate::relay::{ + client::conn::{ConnBuilder, ConnReader, ConnWriter, ReceivedMessage}, + client::streams::{MaybeTlsStreamReader, MaybeTlsStreamWriter}, + codec::{recv_frame, ClientInfo, Frame, FrameType}, + }; + use tokio_util::codec::{FramedRead, FramedWrite}; + use tracing_subscriber::{prelude::*, EnvFilter}; + + use bytes::Bytes; + use tokio::io::DuplexStream; + + fn test_client_builder( + key: PublicKey, + conn_num: usize, + server_channel: mpsc::Sender, + ) -> (ClientConnBuilder, Framed) { + let (test_io, io) = tokio::io::duplex(1024); + ( + ClientConnBuilder { + key, + conn_num, + io: RelayIo::Derp(Framed::new(MaybeTlsStream::Test(io), DerpCodec)), + write_timeout: None, + channel_capacity: 10, + server_channel, + }, + Framed::new(test_io, DerpCodec), + ) + } + + #[tokio::test] + async fn test_server_actor() -> Result<()> { + let server_key = SecretKey::generate().public(); + + // make server actor + let (server_channel, server_channel_r) = mpsc::channel(20); + let server_actor: ServerActor = ServerActor::new(server_key, server_channel_r); + let done = CancellationToken::new(); + let server_done = done.clone(); + + // run server actor + let server_task = tokio::spawn( + async move { server_actor.run(server_done).await } + .instrument(info_span!("relay.server")), + ); + + let key_a = SecretKey::generate().public(); + let (client_a, mut a_io) = test_client_builder(key_a, 1, server_channel.clone()); + + // create client a + server_channel + .send(ServerMessage::CreateClient(client_a)) + .await + .map_err(|_| 
anyhow::anyhow!("server gone"))?; + + // server message: create client b + let key_b = SecretKey::generate().public(); + let (client_b, mut b_io) = test_client_builder(key_b, 2, server_channel.clone()); + server_channel + .send(ServerMessage::CreateClient(client_b)) + .await + .map_err(|_| anyhow::anyhow!("server gone"))?; + + // write message from b to a + let msg = b"hello world!"; + crate::relay::client::conn::send_packet(&mut b_io, &None, key_a, Bytes::from_static(msg)) + .await?; + + // get message on a's reader + let frame = recv_frame(FrameType::RecvPacket, &mut a_io).await?; + assert_eq!( + frame, + Frame::RecvPacket { + src_key: key_b, + content: msg.to_vec().into() + } + ); + + // remove b + server_channel + .send(ServerMessage::RemoveClient((key_b, 2))) + .await + .map_err(|_| anyhow::anyhow!("server gone"))?; + + // get peer gone message on a about b leaving the network + // (we get this message because b has sent us a packet before) + let frame = recv_frame(FrameType::PeerGone, &mut a_io).await?; + assert_eq!(Frame::PeerGone { peer: key_b }, frame); + + // close gracefully + server_channel + .send(ServerMessage::Shutdown) + .await + .map_err(|_| anyhow::anyhow!("server gone"))?; + server_task.await??; + Ok(()) + } + + #[tokio::test] + async fn test_client_conn_handler() -> Result<()> { + // create client connection handler + let (server_channel_s, mut server_channel_r) = mpsc::channel(10); + let client_key = SecretKey::generate(); + let handler = ClientConnHandler { + secret_key: client_key.clone(), + write_timeout: None, + server_channel: server_channel_s, + default_headers: Default::default(), + }; + + // create the parts needed for a client + let (client, server_io) = tokio::io::duplex(10); + let (client_reader, client_writer) = tokio::io::split(client); + let _client_reader = FramedRead::new(client_reader, DerpCodec); + let mut client_writer = FramedWrite::new(client_writer, DerpCodec); + + // start a task as if a client is doing the "accept" handshake + let pub_client_key = client_key.public(); + let client_task: AbortingJoinHandle> = tokio::spawn(async move { + // send the client info + let client_info = ClientInfo { + version: PROTOCOL_VERSION, + }; + crate::relay::codec::send_client_key(&mut client_writer, &client_key, &client_info) + .await?; + + Ok(()) + }) + .into(); + + // attempt to add the connection to the server + handler + .accept(Protocol::Relay, MaybeTlsStream::Test(server_io)) + .await?; + client_task.await??; + + // ensure we inform the server to create the client from the connection! + match server_channel_r.recv().await.unwrap() { + ServerMessage::CreateClient(builder) => { + assert_eq!(pub_client_key, builder.key); + } + _ => anyhow::bail!("unexpected server message"), + } + Ok(()) + } + + fn make_test_client(secret_key: SecretKey) -> (tokio::io::DuplexStream, ConnBuilder) { + let (client, server) = tokio::io::duplex(10); + let (client_reader, client_writer) = tokio::io::split(client); + + let client_reader = MaybeTlsStreamReader::Mem(client_reader); + let client_writer = MaybeTlsStreamWriter::Mem(client_writer); + + let client_reader = ConnReader::Derp(FramedRead::new(client_reader, DerpCodec)); + let client_writer = ConnWriter::Derp(FramedWrite::new(client_writer, DerpCodec)); + + ( + server, + ConnBuilder::new(secret_key, None, client_reader, client_writer), + ) + } + + #[tokio::test] + async fn test_server_basic() -> Result<()> { + let _guard = iroh_test::logging::setup(); + + // create the server! 
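+        // (`ServerActorTask::new` spawns the actor loop immediately; there is
+        // no separate start step)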
+ let server_key = SecretKey::generate(); + let server: ServerActorTask = ServerActorTask::new(server_key); + + // create client a and connect it to the server + let key_a = SecretKey::generate(); + let public_key_a = key_a.public(); + let (rw_a, client_a_builder) = make_test_client(key_a); + let handler = server.client_conn_handler(Default::default()); + let handler_task = tokio::spawn(async move { + handler + .accept(Protocol::Relay, MaybeTlsStream::Test(rw_a)) + .await + }); + let (client_a, mut client_receiver_a) = client_a_builder.build().await?; + handler_task.await??; + + // create client b and connect it to the server + let key_b = SecretKey::generate(); + let public_key_b = key_b.public(); + let (rw_b, client_b_builder) = make_test_client(key_b); + let handler = server.client_conn_handler(Default::default()); + let handler_task = tokio::spawn(async move { + handler + .accept(Protocol::Relay, MaybeTlsStream::Test(rw_b)) + .await + }); + let (client_b, mut client_receiver_b) = client_b_builder.build().await?; + handler_task.await??; + + // send message from a to b! + let msg = Bytes::from_static(b"hello client b!!"); + client_a.send(public_key_b, msg.clone()).await?; + match client_receiver_b.recv().await? { + ReceivedMessage::ReceivedPacket { source, data } => { + assert_eq!(public_key_a, source); + assert_eq!(&msg[..], data); + } + msg => { + anyhow::bail!("expected ReceivedPacket msg, got {msg:?}"); + } + } + + // send message from b to a! + let msg = Bytes::from_static(b"nice to meet you client a!!"); + client_b.send(public_key_a, msg.clone()).await?; + match client_receiver_a.recv().await? { + ReceivedMessage::ReceivedPacket { source, data } => { + assert_eq!(public_key_b, source); + assert_eq!(&msg[..], data); + } + msg => { + anyhow::bail!("expected ReceivedPacket msg, got {msg:?}"); + } + } + + // close the server and clients + server.close().await; + + // client connections have been shutdown + let res = client_a + .send(public_key_b, Bytes::from_static(b"try to send")) + .await; + assert!(res.is_err()); + assert!(client_receiver_b.recv().await.is_err()); + Ok(()) + } + + #[tokio::test] + async fn test_server_replace_client() -> Result<()> { + tracing_subscriber::registry() + .with(tracing_subscriber::fmt::layer().with_writer(std::io::stderr)) + .with(EnvFilter::from_default_env()) + .try_init() + .ok(); + + // create the server! + let server_key = SecretKey::generate(); + let server: ServerActorTask = ServerActorTask::new(server_key); + + // create client a and connect it to the server + let key_a = SecretKey::generate(); + let public_key_a = key_a.public(); + let (rw_a, client_a_builder) = make_test_client(key_a); + let handler = server.client_conn_handler(Default::default()); + let handler_task = tokio::spawn(async move { + handler + .accept(Protocol::Relay, MaybeTlsStream::Test(rw_a)) + .await + }); + let (client_a, mut client_receiver_a) = client_a_builder.build().await?; + handler_task.await??; + + // create client b and connect it to the server + let key_b = SecretKey::generate(); + let public_key_b = key_b.public(); + let (rw_b, client_b_builder) = make_test_client(key_b.clone()); + let handler = server.client_conn_handler(Default::default()); + let handler_task = tokio::spawn(async move { + handler + .accept(Protocol::Relay, MaybeTlsStream::Test(rw_b)) + .await + }); + let (client_b, mut client_receiver_b) = client_b_builder.build().await?; + handler_task.await??; + + // send message from a to b! 
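+        // (the packet travels client_a -> ServerMessage::SendPacket -> actor
+        // -> client_b's send queue)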
+ let msg = Bytes::from_static(b"hello client b!!"); + client_a.send(public_key_b, msg.clone()).await?; + match client_receiver_b.recv().await? { + ReceivedMessage::ReceivedPacket { source, data } => { + assert_eq!(public_key_a, source); + assert_eq!(&msg[..], data); + } + msg => { + anyhow::bail!("expected ReceivedPacket msg, got {msg:?}"); + } + } + + // send message from b to a! + let msg = Bytes::from_static(b"nice to meet you client a!!"); + client_b.send(public_key_a, msg.clone()).await?; + match client_receiver_a.recv().await? { + ReceivedMessage::ReceivedPacket { source, data } => { + assert_eq!(public_key_b, source); + assert_eq!(&msg[..], data); + } + msg => { + anyhow::bail!("expected ReceivedPacket msg, got {msg:?}"); + } + } + + // create client b and connect it to the server + let (new_rw_b, new_client_b_builder) = make_test_client(key_b); + let handler = server.client_conn_handler(Default::default()); + let handler_task = tokio::spawn(async move { + handler + .accept(Protocol::Relay, MaybeTlsStream::Test(new_rw_b)) + .await + }); + let (new_client_b, mut new_client_receiver_b) = new_client_b_builder.build().await?; + handler_task.await??; + + // assert!(client_b.recv().await.is_err()); + + // send message from a to b! + let msg = Bytes::from_static(b"are you still there, b?!"); + client_a.send(public_key_b, msg.clone()).await?; + match new_client_receiver_b.recv().await? { + ReceivedMessage::ReceivedPacket { source, data } => { + assert_eq!(public_key_a, source); + assert_eq!(&msg[..], data); + } + msg => { + anyhow::bail!("expected ReceivedPacket msg, got {msg:?}"); + } + } + + // send message from b to a! + let msg = Bytes::from_static(b"just had a spot of trouble but I'm back now,a!!"); + new_client_b.send(public_key_a, msg.clone()).await?; + match client_receiver_a.recv().await? { + ReceivedMessage::ReceivedPacket { source, data } => { + assert_eq!(public_key_b, source); + assert_eq!(&msg[..], data); + } + msg => { + anyhow::bail!("expected ReceivedPacket msg, got {msg:?}"); + } + } + + // close the server and clients + server.close().await; + + // client connections have been shutdown + let res = client_a + .send(public_key_b, Bytes::from_static(b"try to send")) + .await; + assert!(res.is_err()); + assert!(new_client_receiver_b.recv().await.is_err()); + Ok(()) + } +} diff --git a/iroh-net/src/relay/client_conn.rs b/iroh-net/src/relay/server/client_conn.rs similarity index 96% rename from iroh-net/src/relay/client_conn.rs rename to iroh-net/src/relay/server/client_conn.rs index c81b92c967..320c7c45bb 100644 --- a/iroh-net/src/relay/client_conn.rs +++ b/iroh-net/src/relay/server/client_conn.rs @@ -1,3 +1,5 @@ +//! The server-side representation of an ongoing client relaying connection. + use std::sync::atomic::{AtomicBool, Ordering}; use std::sync::Arc; use std::time::Duration; @@ -15,15 +17,18 @@ use crate::{disco::looks_like_disco_wrapper, key::PublicKey}; use iroh_metrics::{inc, inc_by}; -use super::codec::Frame; -use super::server::RelayIo; -use super::{ +use crate::relay::codec::Frame; +use crate::relay::server::streams::RelayIo; +use crate::relay::server::types::{Packet, ServerMessage}; +use crate::relay::{ codec::{write_frame, KEEP_ALIVE}, - metrics::Metrics, - types::{Packet, ServerMessage}, + server::metrics::Metrics, }; -/// The [`super::server::Server`] side representation of a [`super::client::Client`]'s connection +/// The [`Server`] side representation of a [`Client`]'s connection. 
+/// +/// [`Server`]: crate::relay::server::Server +/// [`Client`]: crate::relay::client::Client #[derive(Debug)] pub(crate) struct ClientConnManager { /// Static after construction, process-wide unique counter, incremented each time we accept @@ -452,8 +457,9 @@ impl ClientConnIo { #[cfg(test)] mod tests { use crate::key::SecretKey; + use crate::relay::client::conn; use crate::relay::codec::{recv_frame, DerpCodec, FrameType}; - use crate::relay::server::MaybeTlsStream; + use crate::relay::server::streams::MaybeTlsStream; use super::*; @@ -555,8 +561,7 @@ mod tests { // send packet println!(" send packet"); let data = b"hello world!"; - crate::relay::client::send_packet(&mut io_rw, &None, target, Bytes::from_static(data)) - .await?; + conn::send_packet(&mut io_rw, &None, target, Bytes::from_static(data)).await?; let msg = server_channel_r.recv().await.unwrap(); match msg { ServerMessage::SendPacket((got_target, packet)) => { @@ -575,8 +580,7 @@ mod tests { let mut disco_data = crate::disco::MAGIC.as_bytes().to_vec(); disco_data.extend_from_slice(target.as_bytes()); disco_data.extend_from_slice(data); - crate::relay::client::send_packet(&mut io_rw, &None, target, disco_data.clone().into()) - .await?; + conn::send_packet(&mut io_rw, &None, target, disco_data.clone().into()).await?; let msg = server_channel_r.recv().await.unwrap(); match msg { ServerMessage::SendDiscoPacket((got_target, packet)) => { @@ -630,8 +634,7 @@ mod tests { let data = b"hello world!"; let target = SecretKey::generate().public(); - crate::relay::client::send_packet(&mut io_rw, &None, target, Bytes::from_static(data)) - .await?; + conn::send_packet(&mut io_rw, &None, target, Bytes::from_static(data)).await?; let msg = server_channel_r.recv().await.unwrap(); match msg { ServerMessage::SendPacket((got_target, packet)) => { diff --git a/iroh-net/src/relay/clients.rs b/iroh-net/src/relay/server/clients.rs similarity index 98% rename from iroh-net/src/relay/clients.rs rename to iroh-net/src/relay/server/clients.rs index 9f77418332..06ae9b4dde 100644 --- a/iroh-net/src/relay/clients.rs +++ b/iroh-net/src/relay/server/clients.rs @@ -10,11 +10,9 @@ use tracing::{Instrument, Span}; use crate::key::PublicKey; -use super::{ - client_conn::{ClientConnBuilder, ClientConnManager}, - metrics::Metrics, - types::Packet, -}; +use super::client_conn::{ClientConnBuilder, ClientConnManager}; +use super::metrics::Metrics; +use super::types::Packet; /// Number of times we try to send to a client connection before dropping the data; const RETRIES: usize = 3; @@ -262,7 +260,7 @@ mod tests { key::SecretKey, relay::{ codec::{recv_frame, DerpCodec, Frame, FrameType}, - server::{MaybeTlsStream, RelayIo}, + server::streams::{MaybeTlsStream, RelayIo}, }, }; diff --git a/iroh-net/src/relay/http/server.rs b/iroh-net/src/relay/server/http_server.rs similarity index 72% rename from iroh-net/src/relay/http/server.rs rename to iroh-net/src/relay/server/http_server.rs index b0d6af86c6..fb1dd03e1a 100644 --- a/iroh-net/src/relay/http/server.rs +++ b/iroh-net/src/relay/server/http_server.rs @@ -16,17 +16,16 @@ use hyper::service::Service; use hyper::upgrade::Upgraded; use hyper::{HeaderMap, Method, Request, Response, StatusCode}; use tokio::net::{TcpListener, TcpStream}; -use tokio::task::JoinHandle; use tokio_rustls_acme::AcmeAcceptor; use tokio_util::sync::CancellationToken; use tracing::{debug, debug_span, error, info, info_span, warn, Instrument}; use tungstenite::handshake::derive_accept_key; use crate::key::SecretKey; -use 
crate::relay::http::SUPPORTED_WEBSOCKET_VERSION; -use crate::relay::server::{ClientConnHandler, MaybeTlsStream}; - -use super::{Protocol, LEGACY_RELAY_PATH, RELAY_PATH}; +use crate::relay::http::{Protocol, LEGACY_RELAY_PATH, RELAY_PATH, SUPPORTED_WEBSOCKET_VERSION}; +use crate::relay::server::actor::{ClientConnHandler, ServerActorTask}; +use crate::relay::server::streams::MaybeTlsStream; +use crate::util::AbortingJoinHandle; type BytesBody = http_body_util::Full; type HyperError = Box; @@ -84,7 +83,7 @@ async fn relay_connection_handler( #[derive(Debug)] pub struct Server { addr: SocketAddr, - http_server_task: JoinHandle<()>, + http_server_task: AbortingJoinHandle<()>, cancel_server_loop: CancellationToken, } @@ -95,7 +94,6 @@ impl Server { /// the server, in particular it allows gracefully shutting down the server. pub fn handle(&self) -> ServerHandle { ServerHandle { - addr: self.addr, cancel_token: self.cancel_server_loop.clone(), } } @@ -105,12 +103,12 @@ impl Server { self.cancel_server_loop.cancel(); } - /// Returns the [`JoinHandle`] for the supervisor task managing the server. + /// Returns the [`AbortingJoinHandle`] for the supervisor task managing the server. /// /// This is the root of all the tasks for the server. Aborting it will abort all the /// other tasks for the server. Awaiting it will complete when all the server tasks are /// completed. - pub fn task_handle(&mut self) -> &mut JoinHandle<()> { + pub fn task_handle(&mut self) -> &mut AbortingJoinHandle<()> { &mut self.http_server_task } @@ -125,7 +123,6 @@ impl Server { /// This does not allow access to the task but can communicate with it. #[derive(Debug, Clone)] pub struct ServerHandle { - addr: SocketAddr, cancel_token: CancellationToken, } @@ -134,11 +131,6 @@ impl ServerHandle { pub fn shutdown(&self) { self.cancel_token.cancel() } - - /// Returns the address the server is bound on. - pub fn addr(&self) -> SocketAddr { - self.addr - } } /// Configuration to use for the TLS connection @@ -152,8 +144,8 @@ pub struct TlsConfig { /// Builder for the Relay HTTP Server. /// -/// Defaults to handling relay requests on the "/derp" endpoint. Other HTTP endpoints can -/// be added using [`ServerBuilder::request_handler`]. +/// Defaults to handling relay requests on the "/relay" (and "/derp" for backwards compatibility) endpoint. +/// Other HTTP endpoints can be added using [`ServerBuilder::request_handler`]. /// /// If no [`SecretKey`] is provided, it is assumed that you will provide a /// [`ServerBuilder::relay_override`] function that handles requests to the relay @@ -231,6 +223,7 @@ impl ServerBuilder { } /// Sets a custom "404" handler. 
+ #[allow(unused)] pub fn not_found_handler(mut self, handler: HyperHandler) -> Self { self.not_found_fn = Some(handler); self @@ -260,7 +253,7 @@ impl ServerBuilder { ); let (relay_handler, relay_server) = if let Some(secret_key) = self.secret_key { // spawns a server actor/task - let server = crate::relay::server::Server::new(secret_key.clone()); + let server = ServerActorTask::new(secret_key.clone()); ( RelayHandler::ConnHandler(server.client_conn_handler(self.headers.clone())), Some(server), @@ -305,7 +298,7 @@ impl ServerBuilder { struct ServerState { addr: SocketAddr, tls_config: Option, - server: Option, + server: Option, service: RelayService, } @@ -376,7 +369,7 @@ impl ServerState { Ok(Server { addr, - http_server_task: task, + http_server_task: AbortingJoinHandle::from(task), cancel_server_loop, }) } @@ -682,3 +675,235 @@ impl std::ops::DerefMut for Handlers { &mut self.0 } } + +#[cfg(test)] +mod tests { + use super::*; + + use anyhow::Result; + use bytes::Bytes; + use reqwest::Url; + use tokio::sync::mpsc; + use tokio::task::JoinHandle; + use tracing::{info, info_span, Instrument}; + use tracing_subscriber::{prelude::*, EnvFilter}; + + use crate::key::{PublicKey, SecretKey}; + use crate::relay::client::conn::ReceivedMessage; + use crate::relay::client::{Client, ClientBuilder}; + + pub(crate) fn make_tls_config() -> TlsConfig { + let subject_alt_names = vec!["localhost".to_string()]; + + let cert = rcgen::generate_simple_self_signed(subject_alt_names).unwrap(); + let rustls_certificate = rustls::Certificate(cert.serialize_der().unwrap()); + let rustls_key = rustls::PrivateKey(cert.get_key_pair().serialize_der()); + let config = rustls::ServerConfig::builder() + .with_safe_defaults() + .with_no_client_auth() + .with_single_cert(vec![(rustls_certificate)], rustls_key) + .unwrap(); + + let config = std::sync::Arc::new(config); + let acceptor = tokio_rustls::TlsAcceptor::from(config.clone()); + + TlsConfig { + config, + acceptor: TlsAcceptor::Manual(acceptor), + } + } + + #[tokio::test] + async fn test_http_clients_and_server() -> Result<()> { + let _guard = iroh_test::logging::setup(); + + let server_key = SecretKey::generate(); + let a_key = SecretKey::generate(); + let b_key = SecretKey::generate(); + + // start server + let server = ServerBuilder::new("127.0.0.1:0".parse().unwrap()) + .secret_key(Some(server_key)) + .spawn() + .await?; + + let addr = server.addr(); + + // get dial info + let port = addr.port(); + let addr = { + if let std::net::IpAddr::V4(ipv4_addr) = addr.ip() { + ipv4_addr + } else { + anyhow::bail!("cannot get ipv4 addr from socket addr {addr:?}"); + } + }; + info!("addr: {addr}:{port}"); + let relay_addr: Url = format!("http://{addr}:{port}").parse().unwrap(); + + // create clients + let (a_key, mut a_recv, client_a_task, client_a) = { + let span = info_span!("client-a"); + let _guard = span.enter(); + create_test_client(a_key, relay_addr.clone()) + }; + info!("created client {a_key:?}"); + let (b_key, mut b_recv, client_b_task, client_b) = { + let span = info_span!("client-b"); + let _guard = span.enter(); + create_test_client(b_key, relay_addr) + }; + info!("created client {b_key:?}"); + + info!("ping a"); + client_a.ping().await?; + + info!("ping b"); + client_b.ping().await?; + + info!("sending message from a to b"); + let msg = Bytes::from_static(b"hi there, client b!"); + client_a.send(b_key, msg.clone()).await?; + info!("waiting for message from a on b"); + let (got_key, got_msg) = b_recv.recv().await.expect("expected message from client_a"); + 
assert_eq!(a_key, got_key); + assert_eq!(msg, got_msg); + + info!("sending message from b to a"); + let msg = Bytes::from_static(b"right back at ya, client b!"); + client_b.send(a_key, msg.clone()).await?; + info!("waiting for message b on a"); + let (got_key, got_msg) = a_recv.recv().await.expect("expected message from client_b"); + assert_eq!(b_key, got_key); + assert_eq!(msg, got_msg); + + client_a.close().await?; + client_a_task.abort(); + client_b.close().await?; + client_b_task.abort(); + server.shutdown(); + + Ok(()) + } + + fn create_test_client( + key: SecretKey, + server_url: Url, + ) -> ( + PublicKey, + mpsc::Receiver<(PublicKey, Bytes)>, + JoinHandle<()>, + Client, + ) { + let client = ClientBuilder::new(server_url).insecure_skip_cert_verify(true); + let dns_resolver = crate::dns::default_resolver(); + let (client, mut client_reader) = client.build(key.clone(), dns_resolver.clone()); + let public_key = key.public(); + let (received_msg_s, received_msg_r) = tokio::sync::mpsc::channel(10); + let client_reader_task = tokio::spawn( + async move { + loop { + info!("waiting for message on {:?}", key.public()); + match client_reader.recv().await { + None => { + info!("client received nothing"); + return; + } + Some(Err(e)) => { + info!("client {:?} `recv` error {e}", key.public()); + return; + } + Some(Ok((msg, _))) => { + info!("got message on {:?}: {msg:?}", key.public()); + if let ReceivedMessage::ReceivedPacket { source, data } = msg { + received_msg_s + .send((source, data)) + .await + .unwrap_or_else(|err| { + panic!( + "client {:?}, error sending message over channel: {:?}", + key.public(), + err + ) + }); + } + } + } + } + } + .instrument(info_span!("test-client-reader")), + ); + (public_key, received_msg_r, client_reader_task, client) + } + + #[tokio::test] + async fn test_https_clients_and_server() -> Result<()> { + tracing_subscriber::registry() + .with(tracing_subscriber::fmt::layer().with_writer(std::io::stderr)) + .with(EnvFilter::from_default_env()) + .try_init() + .ok(); + + let server_key = SecretKey::generate(); + let a_key = SecretKey::generate(); + let b_key = SecretKey::generate(); + + // create tls_config + let tls_config = make_tls_config(); + + // start server + let mut server = ServerBuilder::new("127.0.0.1:0".parse().unwrap()) + .secret_key(Some(server_key)) + .tls_config(Some(tls_config)) + .spawn() + .await?; + + let addr = server.addr(); + + // get dial info + let port = addr.port(); + let addr = { + if let std::net::IpAddr::V4(ipv4_addr) = addr.ip() { + ipv4_addr + } else { + anyhow::bail!("cannot get ipv4 addr from socket addr {addr:?}"); + } + }; + info!("Relay listening on: {addr}:{port}"); + + let url: Url = format!("https://localhost:{port}").parse().unwrap(); + + // create clients + let (a_key, mut a_recv, client_a_task, client_a) = create_test_client(a_key, url.clone()); + info!("created client {a_key:?}"); + let (b_key, mut b_recv, client_b_task, client_b) = create_test_client(b_key, url); + info!("created client {b_key:?}"); + + client_a.ping().await?; + client_b.ping().await?; + + info!("sending message from a to b"); + let msg = Bytes::from_static(b"hi there, client b!"); + client_a.send(b_key, msg.clone()).await?; + info!("waiting for message from a on b"); + let (got_key, got_msg) = b_recv.recv().await.expect("expected message from client_a"); + assert_eq!(a_key, got_key); + assert_eq!(msg, got_msg); + + info!("sending message from b to a"); + let msg = Bytes::from_static(b"right back at ya, client b!"); + client_b.send(a_key, 
msg.clone()).await?; + info!("waiting for message b on a"); + let (got_key, got_msg) = a_recv.recv().await.expect("expected message from client_b"); + assert_eq!(b_key, got_key); + assert_eq!(msg, got_msg); + + server.shutdown(); + server.task_handle().await?; + client_a.close().await?; + client_a_task.abort(); + client_b.close().await?; + client_b_task.abort(); + Ok(()) + } +} diff --git a/iroh-net/src/relay/metrics.rs b/iroh-net/src/relay/server/metrics.rs similarity index 100% rename from iroh-net/src/relay/metrics.rs rename to iroh-net/src/relay/server/metrics.rs diff --git a/iroh-net/src/relay/server/streams.rs b/iroh-net/src/relay/server/streams.rs new file mode 100644 index 0000000000..096f397c93 --- /dev/null +++ b/iroh-net/src/relay/server/streams.rs @@ -0,0 +1,163 @@ +//! Streams used in the server-side implementation of iroh relays. + +use std::pin::Pin; +use std::task::{Context, Poll}; + +use anyhow::Result; +use futures_lite::Stream; +use futures_sink::Sink; +use tokio::io::{AsyncRead, AsyncWrite}; +use tokio_tungstenite::WebSocketStream; +use tokio_util::codec::Framed; + +use crate::relay::codec::DerpCodec; +use crate::relay::codec::Frame; + +#[derive(Debug)] +pub(crate) enum RelayIo { + Derp(Framed), + Ws(WebSocketStream), +} + +fn tung_to_io_err(e: tungstenite::Error) -> std::io::Error { + match e { + tungstenite::Error::Io(io_err) => io_err, + _ => std::io::Error::new(std::io::ErrorKind::Other, e.to_string()), + } +} + +impl Sink for RelayIo { + type Error = std::io::Error; + + fn poll_ready(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { + match *self { + Self::Derp(ref mut framed) => Pin::new(framed).poll_ready(cx), + Self::Ws(ref mut ws) => Pin::new(ws).poll_ready(cx).map_err(tung_to_io_err), + } + } + + fn start_send(mut self: Pin<&mut Self>, item: Frame) -> Result<(), Self::Error> { + match *self { + Self::Derp(ref mut framed) => Pin::new(framed).start_send(item), + Self::Ws(ref mut ws) => Pin::new(ws) + .start_send(tungstenite::Message::Binary(item.encode_for_ws_msg())) + .map_err(tung_to_io_err), + } + } + + fn poll_flush(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { + match *self { + Self::Derp(ref mut framed) => Pin::new(framed).poll_flush(cx), + Self::Ws(ref mut ws) => Pin::new(ws).poll_flush(cx).map_err(tung_to_io_err), + } + } + + fn poll_close(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { + match *self { + Self::Derp(ref mut framed) => Pin::new(framed).poll_close(cx), + Self::Ws(ref mut ws) => Pin::new(ws).poll_close(cx).map_err(tung_to_io_err), + } + } +} + +impl Stream for RelayIo { + type Item = anyhow::Result; + + fn poll_next(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { + match *self { + Self::Derp(ref mut framed) => Pin::new(framed).poll_next(cx), + Self::Ws(ref mut ws) => match Pin::new(ws).poll_next(cx) { + Poll::Ready(Some(Ok(tungstenite::Message::Binary(vec)))) => { + Poll::Ready(Some(Frame::decode_from_ws_msg(vec))) + } + Poll::Ready(Some(Ok(msg))) => { + tracing::warn!(?msg, "Got websocket message of unsupported type, skipping."); + Poll::Pending + } + Poll::Ready(Some(Err(e))) => Poll::Ready(Some(Err(e.into()))), + Poll::Ready(None) => Poll::Ready(None), + Poll::Pending => Poll::Pending, + }, + } + } +} + +/// The main underlying IO stream type used for the relay server. 
+/// +/// Allows choosing whether or not the underlying [`tokio::net::TcpStream`] is served over Tls +#[derive(Debug)] +pub enum MaybeTlsStream { + /// A plain non-Tls [`tokio::net::TcpStream`] + Plain(tokio::net::TcpStream), + /// A Tls wrapped [`tokio::net::TcpStream`] + Tls(tokio_rustls::server::TlsStream), + #[cfg(test)] + Test(tokio::io::DuplexStream), +} + +impl AsyncRead for MaybeTlsStream { + fn poll_read( + mut self: Pin<&mut Self>, + cx: &mut Context<'_>, + buf: &mut tokio::io::ReadBuf<'_>, + ) -> Poll> { + match &mut *self { + MaybeTlsStream::Plain(ref mut s) => Pin::new(s).poll_read(cx, buf), + MaybeTlsStream::Tls(ref mut s) => Pin::new(s).poll_read(cx, buf), + #[cfg(test)] + MaybeTlsStream::Test(ref mut s) => Pin::new(s).poll_read(cx, buf), + } + } +} + +impl AsyncWrite for MaybeTlsStream { + fn poll_flush( + mut self: Pin<&mut Self>, + cx: &mut Context<'_>, + ) -> Poll> { + match &mut *self { + MaybeTlsStream::Plain(ref mut s) => Pin::new(s).poll_flush(cx), + MaybeTlsStream::Tls(ref mut s) => Pin::new(s).poll_flush(cx), + #[cfg(test)] + MaybeTlsStream::Test(ref mut s) => Pin::new(s).poll_flush(cx), + } + } + + fn poll_shutdown( + mut self: Pin<&mut Self>, + cx: &mut Context<'_>, + ) -> Poll> { + match &mut *self { + MaybeTlsStream::Plain(ref mut s) => Pin::new(s).poll_shutdown(cx), + MaybeTlsStream::Tls(ref mut s) => Pin::new(s).poll_shutdown(cx), + #[cfg(test)] + MaybeTlsStream::Test(ref mut s) => Pin::new(s).poll_shutdown(cx), + } + } + + fn poll_write( + mut self: Pin<&mut Self>, + cx: &mut Context<'_>, + buf: &[u8], + ) -> Poll> { + match &mut *self { + MaybeTlsStream::Plain(ref mut s) => Pin::new(s).poll_write(cx, buf), + MaybeTlsStream::Tls(ref mut s) => Pin::new(s).poll_write(cx, buf), + #[cfg(test)] + MaybeTlsStream::Test(ref mut s) => Pin::new(s).poll_write(cx, buf), + } + } + + fn poll_write_vectored( + mut self: Pin<&mut Self>, + cx: &mut Context<'_>, + bufs: &[std::io::IoSlice<'_>], + ) -> Poll> { + match &mut *self { + MaybeTlsStream::Plain(ref mut s) => Pin::new(s).poll_write_vectored(cx, bufs), + MaybeTlsStream::Tls(ref mut s) => Pin::new(s).poll_write_vectored(cx, bufs), + #[cfg(test)] + MaybeTlsStream::Test(ref mut s) => Pin::new(s).poll_write_vectored(cx, bufs), + } + } +} diff --git a/iroh-net/src/relay/server/types.rs b/iroh-net/src/relay/server/types.rs new file mode 100644 index 0000000000..008e180715 --- /dev/null +++ b/iroh-net/src/relay/server/types.rs @@ -0,0 +1,25 @@ +//! Types that are shared between [`super::actor`] and [`super::client_conn`]. + +use bytes::Bytes; + +use crate::key::PublicKey; +use crate::relay::server::client_conn::ClientConnBuilder; + +/// A request to write a dataframe to a Client +#[derive(Debug, Clone)] +pub(crate) struct Packet { + /// The sender of the packet + pub(crate) src: PublicKey, + /// The data packet bytes. 
+ pub(crate) bytes: Bytes, +} + +#[derive(derive_more::Debug)] +pub(crate) enum ServerMessage { + SendPacket((PublicKey, Packet)), + SendDiscoPacket((PublicKey, Packet)), + #[debug("CreateClient")] + CreateClient(ClientConnBuilder), + RemoveClient((PublicKey, usize)), + Shutdown, +} diff --git a/iroh-net/src/relay/types.rs b/iroh-net/src/relay/types.rs deleted file mode 100644 index 27445202df..0000000000 --- a/iroh-net/src/relay/types.rs +++ /dev/null @@ -1,73 +0,0 @@ -use std::num::NonZeroU32; - -use anyhow::{bail, Context, Result}; -#[cfg(feature = "iroh-relay")] -use bytes::Bytes; -use postcard::experimental::max_size::MaxSize; -use serde::{Deserialize, Serialize}; - -#[cfg(feature = "iroh-relay")] -use super::client_conn::ClientConnBuilder; -#[cfg(feature = "iroh-relay")] -use crate::key::PublicKey; - -pub(crate) struct RateLimiter { - inner: governor::RateLimiter< - governor::state::direct::NotKeyed, - governor::state::InMemoryState, - governor::clock::DefaultClock, - governor::middleware::NoOpMiddleware, - >, -} - -impl RateLimiter { - pub(crate) fn new(bytes_per_second: usize, bytes_burst: usize) -> Result> { - if bytes_per_second == 0 || bytes_burst == 0 { - return Ok(None); - } - let bytes_per_second = NonZeroU32::new(u32::try_from(bytes_per_second)?) - .context("bytes_per_second not non-zero")?; - let bytes_burst = - NonZeroU32::new(u32::try_from(bytes_burst)?).context("bytes_burst not non-zero")?; - Ok(Some(Self { - inner: governor::RateLimiter::direct( - governor::Quota::per_second(bytes_per_second).allow_burst(bytes_burst), - ), - })) - } - - pub(crate) fn check_n(&self, n: usize) -> Result<()> { - let n = NonZeroU32::new(u32::try_from(n)?).context("n not non-zero")?; - match self.inner.check_n(n) { - Ok(_) => Ok(()), - Err(_) => bail!("batch cannot go through"), - } - } -} - -/// A request to write a dataframe to a Client -#[derive(Debug, Clone)] -#[cfg(feature = "iroh-relay")] -pub(crate) struct Packet { - /// The sender of the packet - pub(crate) src: PublicKey, - /// The data packet bytes. - pub(crate) bytes: Bytes, -} - -#[derive(Debug, Serialize, Deserialize, MaxSize, PartialEq, Eq)] -pub(crate) struct ClientInfo { - /// The relay protocol version that the client was built with. - pub(crate) version: usize, -} - -#[cfg(feature = "iroh-relay")] -#[derive(derive_more::Debug)] -pub(crate) enum ServerMessage { - SendPacket((PublicKey, Packet)), - SendDiscoPacket((PublicKey, Packet)), - #[debug("CreateClient")] - CreateClient(ClientConnBuilder), - RemoveClient((PublicKey, usize)), - Shutdown, -} diff --git a/iroh-net/src/test_utils.rs b/iroh-net/src/test_utils.rs index 34cf957935..a1fad1dead 100644 --- a/iroh-net/src/test_utils.rs +++ b/iroh-net/src/test_utils.rs @@ -1,8 +1,10 @@ //! Internal utilities to support testing. +use std::net::Ipv4Addr; use anyhow::Result; use tokio::sync::oneshot; +use crate::relay::server::{CertConfig, RelayConfig, Server, ServerConfig, StunConfig, TlsConfig}; use crate::{ key::SecretKey, relay::{RelayMap, RelayNode, RelayUrl}, @@ -25,12 +27,7 @@ pub struct CleanupDropGuard(pub(crate) oneshot::Sender<()>); /// /// The returned `Url` is the url of the relay server in the returned [`RelayMap`]. /// When dropped, the returned [`Server`] does will stop running. 
-/// -/// [`Server`]: crate::relay::iroh_relay::Server -pub async fn run_relay_server() -> Result<(RelayMap, RelayUrl, crate::relay::iroh_relay::Server)> { - use crate::relay::iroh_relay::{CertConfig, RelayConfig, ServerConfig, StunConfig, TlsConfig}; - use std::net::Ipv4Addr; - +pub async fn run_relay_server() -> Result<(RelayMap, RelayUrl, Server)> { let secret_key = SecretKey::generate(); let cert = rcgen::generate_simple_self_signed(vec!["localhost".to_string()]).unwrap(); let rustls_cert = rustls::Certificate(cert.serialize_der().unwrap()); @@ -55,9 +52,7 @@ pub async fn run_relay_server() -> Result<(RelayMap, RelayUrl, crate::relay::iro #[cfg(feature = "metrics")] metrics_addr: None, }; - let server = crate::relay::iroh_relay::Server::spawn(config) - .await - .unwrap(); + let server = Server::spawn(config).await.unwrap(); let url: RelayUrl = format!("https://localhost:{}", server.https_addr().unwrap().port()) .parse() .unwrap(); From 7fdd6cb64f24c908862ccdf59fb5ca466e0b508f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Philipp=20Kr=C3=BCger?= Date: Wed, 31 Jul 2024 13:34:10 +0200 Subject: [PATCH 14/45] refactor(iroh-net): Switch to (now stable) `IpAddr::to_canonical` (#2569) ## Description Very small refactor. Just switching away from something vendored. - `IpAddr::to_canonical` is [stable since version 1.75](https://doc.rust-lang.org/std/net/enum.IpAddr.html#method.to_canonical). - We've set MSRV to 1.76 in lots of CI jobs (and 1.66 in some remaining ones?) - I want to cfg-out the `net/ip.rs` module in the Wasm case - `to_canonical` is used in a bunch of places, so I can only cfg it out, if that function is kept. Thus the switch to the stable version ## Breaking Changes - Removed `iroh::net::net::ip::to_canonical`. Use `std::net::IpAddr::to_canonical` instead. ## Notes & open questions ## Change checklist - [X] Self-review. - ~~[ ] Documentation updates following the [style guide](https://rust-lang.github.io/rfcs/1574-more-api-documentation-conventions.html#appendix-a-full-conventions-text), if relevant.~~ - ~~[ ] Tests if relevant.~~ - [X] All breaking changes documented. 
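For illustration, a minimal sketch of what the now-stable method does (nothing assumed beyond the std API): `to_canonical` only folds IPv4-mapped IPv6 addresses back into `IpAddr::V4` and returns every other address unchanged, which is the same behavior the removed helper implemented via `to_ipv4_mapped`.

```rust
use std::net::{IpAddr, Ipv4Addr, Ipv6Addr};

fn main() {
    // An IPv4-mapped IPv6 address (`::ffff:192.0.2.1`) canonicalizes to IPv4.
    let mapped = IpAddr::V6(Ipv6Addr::new(0, 0, 0, 0, 0, 0xffff, 0xc000, 0x0201));
    assert_eq!(mapped.to_canonical(), IpAddr::V4(Ipv4Addr::new(192, 0, 2, 1)));

    // Plain IPv4 and ordinary IPv6 addresses are returned as-is.
    let v4 = IpAddr::V4(Ipv4Addr::LOCALHOST);
    assert_eq!(v4.to_canonical(), v4);
    let v6 = IpAddr::V6(Ipv6Addr::LOCALHOST);
    assert_eq!(v6.to_canonical(), v6);
}
```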
--- iroh-net/src/disco.rs | 4 ++-- iroh-net/src/net/ip.rs | 20 +------------------- iroh-net/src/netcheck.rs | 3 +-- iroh-net/src/netcheck/reportgen.rs | 5 ++--- iroh-net/src/stun.rs | 6 ++---- 5 files changed, 8 insertions(+), 30 deletions(-) diff --git a/iroh-net/src/disco.rs b/iroh-net/src/disco.rs index 3914c6e89c..9e95eeeea4 100644 --- a/iroh-net/src/disco.rs +++ b/iroh-net/src/disco.rs @@ -27,7 +27,7 @@ use anyhow::{anyhow, bail, ensure, Context, Result}; use serde::{Deserialize, Serialize}; use url::Url; -use crate::{key, net::ip::to_canonical, relay::RelayUrl}; +use crate::{key, relay::RelayUrl}; use super::{key::PublicKey, stun}; @@ -269,7 +269,7 @@ fn socket_addr_from_bytes(p: [u8; EP_LENGTH]) -> SocketAddr { let raw_src_ip: [u8; 16] = p[..16].try_into().expect("array long enough"); let raw_port: [u8; 2] = p[16..].try_into().expect("array long enough"); - let src_ip = to_canonical(IpAddr::from(raw_src_ip)); + let src_ip = IpAddr::from(raw_src_ip).to_canonical(); let src_port = u16::from_le_bytes(raw_port); SocketAddr::new(src_ip, src_port) diff --git a/iroh-net/src/net/ip.rs b/iroh-net/src/net/ip.rs index 78bdeabfbf..180fcd908e 100644 --- a/iroh-net/src/net/ip.rs +++ b/iroh-net/src/net/ip.rs @@ -46,7 +46,7 @@ impl LocalAddresses { .chain(iface.ipv6.iter().map(|a| IpAddr::V6(a.addr))); for ip in addrs { - let ip = to_canonical(ip); + let ip = ip.to_canonical(); if ip.is_loopback() || ifc_is_loopback { loopback.push(ip); @@ -128,24 +128,6 @@ pub(super) fn is_link_local(ip: IpAddr) -> bool { } } -/// Converts IPv4-mappend IPv6 addresses to IPv4. -/// -/// Converts this address to an [`IpAddr::V4`] if it is an IPv4-mapped IPv6 addresses, -/// otherwise it return self as-is. -// TODO: replace with IpAddr::to_canonical once stabilized. -pub fn to_canonical(ip: IpAddr) -> IpAddr { - match ip { - ip @ IpAddr::V4(_) => ip, - IpAddr::V6(ip) => { - if let Some(ip) = ip.to_ipv4_mapped() { - IpAddr::V4(ip) - } else { - IpAddr::V6(ip) - } - } - } -} - /// Returns true if the address is a unicast address with link-local scope, as defined in RFC 4291. 
// Copied from std lib, not stable yet pub const fn is_unicast_link_local(addr: Ipv6Addr) -> bool { diff --git a/iroh-net/src/netcheck.rs b/iroh-net/src/netcheck.rs index e8e731f0a0..59890fbf96 100644 --- a/iroh-net/src/netcheck.rs +++ b/iroh-net/src/netcheck.rs @@ -20,7 +20,6 @@ use tokio_util::sync::CancellationToken; use tracing::{debug, error, info_span, trace, warn, Instrument}; use crate::dns::DnsResolver; -use crate::net::ip::to_canonical; use crate::net::{IpFamily, UdpSocket}; use crate::relay::RelayUrl; use crate::util::CancelOnDrop; @@ -764,7 +763,7 @@ async fn recv_stun_once(sock: &UdpSocket, buf: &mut [u8], actor_addr: &Addr) -> .await .context("Error reading from stun socket")?; let payload = &buf[..count]; - from_addr.set_ip(to_canonical(from_addr.ip())); + from_addr.set_ip(from_addr.ip().to_canonical()); let msg = Message::StunPacket { payload: Bytes::from(payload.to_vec()), from_addr, diff --git a/iroh-net/src/netcheck/reportgen.rs b/iroh-net/src/netcheck/reportgen.rs index 0bdd77ee43..d29055b774 100644 --- a/iroh-net/src/netcheck/reportgen.rs +++ b/iroh-net/src/netcheck/reportgen.rs @@ -34,7 +34,6 @@ use super::NetcheckMetrics; use crate::defaults::DEFAULT_STUN_PORT; use crate::dns::{DnsResolver, ResolverExt}; use crate::net::interfaces; -use crate::net::ip; use crate::net::UdpSocket; use crate::netcheck::{self, Report}; use crate::ping::{PingError, Pinger}; @@ -953,7 +952,7 @@ async fn get_relay_addr( { Ok(mut addrs) => addrs .next() - .map(ip::to_canonical) + .map(|ip| ip.to_canonical()) .map(|addr| SocketAddr::new(addr, port)) .ok_or(anyhow!("No suitable relay addr found")), Err(err) => Err(err.context("No suitable relay addr found")), @@ -973,7 +972,7 @@ async fn get_relay_addr( { Ok(mut addrs) => addrs .next() - .map(ip::to_canonical) + .map(|ip| ip.to_canonical()) .map(|addr| SocketAddr::new(addr, port)) .ok_or(anyhow!("No suitable relay addr found")), Err(err) => Err(err.context("No suitable relay addr found")), diff --git a/iroh-net/src/stun.rs b/iroh-net/src/stun.rs index e0ed936782..73cdea566f 100644 --- a/iroh-net/src/stun.rs +++ b/iroh-net/src/stun.rs @@ -11,8 +11,6 @@ pub use stun_rs::{ TransactionId, }; -use crate::net::ip::to_canonical; - /// Errors that can occur when handling a STUN packet. #[derive(Debug, thiserror::Error)] pub enum Error { @@ -126,12 +124,12 @@ pub fn parse_response(b: &[u8]) -> Result<(TransactionId, SocketAddr), Error> { match attr { StunAttribute::XorMappedAddress(a) => { let mut a = *a.socket_address(); - a.set_ip(to_canonical(a.ip())); + a.set_ip(a.ip().to_canonical()); addr = Some(a); } StunAttribute::MappedAddress(a) => { let mut a = *a.socket_address(); - a.set_ip(to_canonical(a.ip())); + a.set_ip(a.ip().to_canonical()); fallback_addr = Some(a); } _ => {} From 9e6b1e0897b15ea7096c95143e11e09e948c862e Mon Sep 17 00:00:00 2001 From: Divma <26765164+divagant-martian@users.noreply.github.com> Date: Wed, 31 Jul 2024 11:56:30 -0500 Subject: [PATCH 15/45] test(iroh-cli): make cli resumption tests not flaky (#2564) ## Description Takes a different approach to tests by doing the setup for them all first before running the tests, to prevent windows from whining about a locked blobs.db used by the provider. This still requires an ephemeral iroh to obtain the blobs.db but does not require much re-starting, making it less prone to weird shutdown issues. ## Breaking Changes n/a ## Notes & open questions Note that there is sleep as a synchronization point because otherwise the folder for the tests will not have the correct contents. 
This is unexpected, but we have seen it happen a lot. We also use a different folder for the iroh instance that actually provides during the tests, in case the instance that produced the blobs.db has not fully shut down by the time the providing one starts. ## Change checklist - [x] Self-review. - [ ] ~~Documentation updates following the [style guide](https://rust-lang.github.io/rfcs/1574-more-api-documentation-conventions.html#appendix-a-full-conventions-text), if relevant.~~ - [x] Tests if relevant. - [ ] ~~All breaking changes documented.~~ --- iroh-cli/tests/cli.rs | 202 ++++++++++++++++++++---------------- 1 file changed, 98 insertions(+), 104 deletions(-) diff --git a/iroh-cli/tests/cli.rs b/iroh-cli/tests/cli.rs index 5b48ed2a02..4de8df809b 100644 --- a/iroh-cli/tests/cli.rs +++ b/iroh-cli/tests/cli.rs @@ -112,8 +112,8 @@ fn cli_provide_tree() -> Result<()> { test_provide_get_loop(Input::Path(path), Output::Path) } +/// Test resumption with collections. #[test] -#[ignore = "flaky"] fn cli_provide_tree_resume() -> Result<()> { use iroh::blobs::store::fs::test_support::{make_partial, MakePartialResult}; @@ -128,7 +128,7 @@ fn cli_provide_tree_resume() -> Result<()> { let tmp = testdir!(); let src = tmp.join("src"); std::fs::create_dir(&src)?; - let src_iroh_data_dir = tmp.join("src_iroh_data_dir"); + let src_iroh_data_dir_pre = tmp.join("src_iroh_data_dir_pre"); let tgt = tmp.join("tgt"); { let foo_path = src.join("foo"); @@ -142,74 +142,70 @@ fn cli_provide_tree_resume() -> Result<()> { make_rand_file(100000, &file2)?; make_rand_file(5000, &file3)?; } - // leave the provider running for the entire test - let provider = make_provider_in(&src_iroh_data_dir, Input::Path(src.clone()), false)?; let count = count_input_files(&src); - let ticket = match_provide_output(&provider, count, BlobOrCollection::Collection)?; - { - println!("first test - empty work dir"); - let get_iroh_data_dir = tmp.join("get_iroh_data_dir_01"); - let get_output = run_get_cmd(&get_iroh_data_dir, &ticket, Some(tgt.clone()))?; - let matches = explicit_matches(match_get_stderr(get_output.stderr)?); - assert_eq!(matches, vec!["112.89 KiB"]); - compare_files(&src, &tgt)?; - std::fs::remove_dir_all(&tgt)?; - } - // second test - full work dir { - println!("second test - full work dir"); - let get_iroh_data_dir = tmp.join("get_iroh_data_dir_02"); - copy_blob_dirs(&src_iroh_data_dir, &get_iroh_data_dir)?; - let get_output = run_get_cmd(&get_iroh_data_dir, &ticket, Some(tgt.clone()))?; - let matches = explicit_matches(match_get_stderr(get_output.stderr)?); - assert_eq!(matches, vec!["0 B"]); - compare_files(&src, &tgt)?; - std::fs::remove_dir_all(&tgt)?; + // import the files into an ephemeral iroh to use the generated blobs db in tests + let provider = make_provider_in(&src_iroh_data_dir_pre, Input::Path(src.clone()), false)?; + // small synchronization points: allow iroh to be ready for transfer + std::thread::sleep(std::time::Duration::from_secs(5)); + let _ticket = match_provide_output(&provider, count, BlobOrCollection::Collection)?; } - // third test - partial work dir - remove some large files - { - println!("third test - partial work dir - remove some large files"); - let get_iroh_data_dir = tmp.join("get_iroh_data_dir_03"); - copy_blob_dirs(&src_iroh_data_dir, &get_iroh_data_dir)?; - make_partial(&get_iroh_data_dir, |_hash, size| { - if size == 100000 { - MakePartialResult::Remove - } else { - MakePartialResult::Retain - } - })?; - let get_output = run_get_cmd(&get_iroh_data_dir, &ticket, Some(tgt.clone()))?;
- let matches = explicit_matches(match_get_stderr(get_output.stderr)?); - assert_eq!(matches, vec!["98.04 KiB"]); - compare_files(&src, &tgt)?; - std::fs::remove_dir_all(&tgt)?; - } + // setup the data dir for the iroh instances that will get the blobs + let src_iroh_data_dir = tmp.join("src_iroh_data_dir"); + copy_blob_dirs(&src_iroh_data_dir_pre, &src_iroh_data_dir)?; + // first tests + let empty_dir = tmp.join("get_iroh_data_dir_01"); + // second test + let full_dir = tmp.join("get_iroh_data_dir_02"); + copy_blob_dirs(&src_iroh_data_dir, &full_dir)?; + // third test + let partial_dir_1 = tmp.join("get_iroh_data_dir_03"); + copy_blob_dirs(&src_iroh_data_dir, &partial_dir_1)?; + make_partial(&partial_dir_1, |_hash, size| { + if size == 100000 { + MakePartialResult::Remove + } else { + MakePartialResult::Retain + } + })?; + // fourth test + let partial_dir_2 = tmp.join("get_iroh_data_dir_04"); + copy_blob_dirs(&src_iroh_data_dir, &partial_dir_2)?; + make_partial(&partial_dir_2, |_hash, size| { + if size == 100000 { + MakePartialResult::Truncate(1024 * 32) + } else { + MakePartialResult::Retain + } + })?; + + // start the provider and run the test cases + let provider = make_provider_in(&src_iroh_data_dir, Input::Path(src.clone()), false)?; + let ticket = match_provide_output(&provider, count, BlobOrCollection::Collection)?; + + let run_test = + |name: &'static str, get_folder: PathBuf, transfer_size: &'static str| -> Result<()> { + println!("\n***\n{name}\n***"); + let get_output = run_get_cmd(&get_folder, &ticket, Some(tgt.clone()))?; + let matches = explicit_matches(match_get_stderr(get_output.stderr)?); + assert_eq!(matches, vec![transfer_size], "{name}: wrong transfer size"); + compare_files(&src, &tgt).context("file contents do not match")?; + std::fs::remove_dir_all(&tgt)?; + Ok(()) + }; + + run_test("no data needs full transfer", empty_dir, "112.89 KiB")?; + run_test("full data needs no transfer", full_dir, "0 B")?; + run_test("missing blobs needs transfer", partial_dir_1, "98.04 KiB")?; + run_test("partial blobs needs transfer", partial_dir_2, "65.98 KiB")?; - // fourth test - partial work dir - truncate some large files - { - println!("fourth test - partial work dir - truncate some large files"); - let get_iroh_data_dir = tmp.join("get_iroh_data_dir_04"); - copy_blob_dirs(&src_iroh_data_dir, &get_iroh_data_dir)?; - make_partial(&get_iroh_data_dir, |_hash, size| { - if size == 100000 { - MakePartialResult::Truncate(1024 * 32) - } else { - MakePartialResult::Retain - } - })?; - let get_output = run_get_cmd(&get_iroh_data_dir, &ticket, Some(tgt.clone()))?; - let matches = explicit_matches(match_get_stderr(get_output.stderr)?); - assert_eq!(matches, vec!["65.98 KiB"]); - compare_files(&src, &tgt)?; - std::fs::remove_dir_all(&tgt)?; - } drop(provider); Ok(()) } -#[ignore = "flaky"] #[test] fn cli_provide_file_resume() -> Result<()> { use iroh::blobs::store::fs::test_support::{make_partial, MakePartialResult}; @@ -226,58 +222,55 @@ fn cli_provide_file_resume() -> Result<()> { let src = tmp.join("src"); let tgt = tmp.join("tgt"); std::fs::create_dir(&src)?; - let src_iroh_data_dir = tmp.join("src_iroh_data_dir"); + let src_iroh_data_dir_pre = tmp.join("src_iroh_data_dir_pre"); let file = src.join("file"); let hash = make_rand_file(100000, &file)?; - // import the files into an ephemeral iroh to use the generated blobs db in tests - let provider = make_provider_in(&src_iroh_data_dir, Input::Path(file.clone()), false)?; let count = count_input_files(&src); - let ticket = 
match_provide_output(&provider, count, BlobOrCollection::Blob)?; - drop(provider); { - let provider = make_provider_in(&src_iroh_data_dir, Input::Path(file.clone()), false)?; - println!("first test - empty work dir"); - let get_iroh_data_dir = tmp.join("get_iroh_data_dir_01"); - let get_output = run_get_cmd(&get_iroh_data_dir, &ticket, Some(tgt.clone()))?; - let matches = explicit_matches(match_get_stderr(get_output.stderr)?); - assert_eq!(matches, vec!["98.04 KiB"]); - assert_eq!(Hash::new(std::fs::read(&tgt)?), hash); - // compare_files(&src, &tgt)?; - std::fs::remove_file(&tgt)?; - drop(provider); + // import the files into an ephemeral iroh to use the generated blobs db in tests + let provider = make_provider_in(&src_iroh_data_dir_pre, Input::Path(file.clone()), false)?; + // small synchronization points: allow iroh to be ready for transfer + std::thread::sleep(std::time::Duration::from_secs(5)); + let _ticket = match_provide_output(&provider, count, BlobOrCollection::Blob)?; } - // second test - full work dir - { - println!("second test - full work dir"); - let get_iroh_data_dir = tmp.join("get_iroh_data_dir_02"); - copy_blob_dirs(&src_iroh_data_dir, &get_iroh_data_dir)?; - let provider = make_provider_in(&src_iroh_data_dir, Input::Path(file.clone()), false)?; - let get_output = run_get_cmd(&get_iroh_data_dir, &ticket, Some(tgt.clone()))?; - let matches = explicit_matches(match_get_stderr(get_output.stderr)?); - assert_eq!(matches, vec!["0 B"]); - assert_eq!(Hash::new(std::fs::read(&tgt)?), hash); - std::fs::remove_file(&tgt)?; - drop(provider); - } + // setup the data dir for the iroh instances that will get the blobs + let src_iroh_data_dir = tmp.join("src_iroh_data_dir"); + copy_blob_dirs(&src_iroh_data_dir_pre, &src_iroh_data_dir)?; + + // first test: empty + let empty_data_dir = tmp.join("get_iroh_data_dir_01"); + // second test: all data available already + let full_data_dir = tmp.join("get_iroh_data_dir_02"); + copy_blob_dirs(&src_iroh_data_dir, &full_data_dir)?; + // third test: partial files + let partial_data_dir = tmp.join("get_iroh_data_dir_03"); + copy_blob_dirs(&src_iroh_data_dir, &partial_data_dir)?; + make_partial(&partial_data_dir, |_hash, _size| { + MakePartialResult::Truncate(1024 * 32) + })?; + + // start the provider and run the test cases - // third test - partial work dir - truncate some large files - { - println!("fourth test - partial work dir - truncate some large files"); - let get_iroh_data_dir = tmp.join("get_iroh_data_dir_04"); - copy_blob_dirs(&src_iroh_data_dir, &get_iroh_data_dir)?; - let provider = make_provider_in(&src_iroh_data_dir, Input::Path(file.clone()), false)?; - make_partial(&get_iroh_data_dir, |_hash, _size| { - MakePartialResult::Truncate(1024 * 32) - })?; - let get_output = run_get_cmd(&get_iroh_data_dir, &ticket, Some(tgt.clone()))?; - let matches = explicit_matches(match_get_stderr(get_output.stderr)?); - assert_eq!(matches, vec!["65.98 KiB"]); - assert_eq!(Hash::new(std::fs::read(&tgt)?), hash); - std::fs::remove_file(&tgt)?; - drop(provider); - } + let provider = make_provider_in(&src_iroh_data_dir, Input::Path(file.clone()), false)?; + let ticket = match_provide_output(&provider, count, BlobOrCollection::Blob)?; + + let run_test = + |name: &'static str, get_folder: PathBuf, transfer_size: &'static str| -> Result<()> { + println!("\n***\n{name}\n***"); + let get_output = run_get_cmd(&get_folder, &ticket, Some(tgt.clone()))?; + let matches = explicit_matches(match_get_stderr(get_output.stderr)?); + assert_eq!(matches, 
vec![transfer_size], "{name}: wrong transfer size"); + let current_hash = Hash::new(std::fs::read(&tgt)?); + assert_eq!(current_hash, hash, "{name}: wrong blob contents"); + std::fs::remove_file(&tgt)?; + Ok(()) + }; + + run_test("no data needs full transfer", empty_data_dir, "98.04 KiB")?; + run_test("full folder needs no transfer", full_data_dir, "0 B")?; + run_test("partial data needs transfer", partial_data_dir, "65.98 KiB")?; Ok(()) } @@ -1076,6 +1069,7 @@ fn copy_dir_all(src: impl AsRef<Path>, dst: impl AsRef<Path>) -> anyhow::Result< ) })?; } else { + println!("copying {} to {}", src.display(), dst.display()); std::fs::copy(&src, &dst).with_context(|| { format!( "failed to copy file `{}` to `{}`", From 15f36b373ec3dc86d9a81caeef54f8a165c10001 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Philipp=20Kr=C3=BCger?= Date: Thu, 1 Aug 2024 09:29:33 +0200 Subject: [PATCH 16/45] test(iroh-blobs): comment out ignored test (that is not a flaky test) (#2559) ## Description There's this test that was introduced in [the `local_pool` PR](https://github.com/n0-computer/iroh/pull/2517). It was ignored via `#[ignore = "todo"]`. Notably, it's *not flaky*, it always fails. Our flaky tests are run with `cargo nextest run --run-ignored all [...]`. We can't be more specific with the `ignore`d tests. The only options are `default`, `ignored-only` and `all`. This kind of test is really hard to write. IIUC, `#[should_panic]` can only test for the panic happening in the thread that the test is initiated in; it doesn't detect panics that are thrown in threads spawned from the test. I assume this is the reason writing this test was abandoned. Keeping this test with the `#[ignore = "todo"]` on it means we're always running it in our flaky test suite, which is confusing. We thought this test was flaky, but it's not. IMO it's better to comment it out/remove it than to pollute our flaky test results. ## Breaking Changes None ## Notes & open questions In this PR I'm commenting out this test. Should we remove it instead? Or do people have ideas on how to make this test work? Do we have an idea what we're *expecting* of our `LocalPool` implementation? Should a panic on one of the threads cause a panic in the `finish()` function? ## Change checklist - [X] Self-review. - ~~[ ] Documentation updates following the [style guide](https://rust-lang.github.io/rfcs/1574-more-api-documentation-conventions.html#appendix-a-full-conventions-text), if relevant.~~ - ~~[ ] Tests if relevant.~~ - [X] All breaking changes documented. --- iroh-blobs/src/util/local_pool.rs | 32 +++++++++++++++---------------- iroh/tests/client.rs | 12 +++++++----- 2 files changed, 23 insertions(+), 21 deletions(-) diff --git a/iroh-blobs/src/util/local_pool.rs b/iroh-blobs/src/util/local_pool.rs index d1270a9d9a..4465d86b66 100644 --- a/iroh-blobs/src/util/local_pool.rs +++ b/iroh-blobs/src/util/local_pool.rs @@ -634,20 +634,20 @@ mod tests { assert_eq!(c2.load(std::sync::atomic::Ordering::SeqCst), 0); } - #[tokio::test] - #[should_panic] - #[ignore = "todo"] - async fn test_panic() { - let _ = tracing_subscriber::fmt::try_init(); - let pool = LocalPool::new(Config { - threads: 2, - ..Config::default() - }); - pool.spawn_detached(|| async { - panic!("test panic"); - }); - // we can't use shutdown here, because we need to allow time for the - // panic to happen.
- pool.finish().await; - } + // #[tokio::test] + // #[should_panic] + // #[ignore = "todo"] + // async fn test_panic() { + // let _ = tracing_subscriber::fmt::try_init(); + // let pool = LocalPool::new(Config { + // threads: 2, + // ..Config::default() + // }); + // pool.spawn_detached(|| async { + // panic!("test panic"); + // }); + // // we can't use shutdown here, because we need to allow time for the + // // panic to happen. + // pool.finish().await; + // } } diff --git a/iroh/tests/client.rs b/iroh/tests/client.rs index e725896e2f..cd1297f0c9 100644 --- a/iroh/tests/client.rs +++ b/iroh/tests/client.rs @@ -8,7 +8,6 @@ use iroh_gossip::{ }; use iroh_net::{key::SecretKey, NodeAddr}; use testresult::TestResult; -use tokio::task::JoinHandle; /// Spawn an iroh node in a separate thread and tokio runtime, and return /// the address and client. @@ -35,11 +34,11 @@ fn spawn_node() -> (NodeAddr, Iroh) { } /// Await `n` messages from a stream of gossip events. -fn await_messages( +async fn await_messages( mut stream: impl Stream<Item = Result<Event>> + Unpin + Send + Sync + 'static, n: usize, -) -> JoinHandle<Vec<Bytes>> { - tokio::spawn(async move { +) -> TestResult<Vec<Bytes>> { + let handle = tokio::spawn(async move { let mut res = Vec::new(); #[allow(clippy::single_match)] while let Some(msg) = stream.next().await { @@ -54,10 +53,13 @@ fn await_messages( } } res - }) + }); + + Ok(tokio::time::timeout(std::time::Duration::from_secs(60), handle).await??) } #[tokio::test] +#[ignore = "flaky"] async fn gossip_smoke() -> TestResult { let _ = tracing_subscriber::fmt::try_init(); let (addr1, node1) = spawn_node(); From 8e4e586cece3968700a13562058f3a5c152c1805 Mon Sep 17 00:00:00 2001 From: Divma <26765164+divagant-martian@users.noreply.github.com> Date: Thu, 1 Aug 2024 06:22:00 -0500 Subject: [PATCH 17/45] docs(iroh-cli): point to the configuration reference from each iroh subcommand (#2571) ## Description #2537 mentions someone not finding the configuration options after running `iroh start --help`. `iroh --help` already mentions both in the second line and in the config flag (this one being a fairly recent addition to the help text). I'd say the only thing missing here is to be able to see the docs from the subcommands, which is what this PR does. ## Breaking Changes n/a ## Notes & open questions n/a ## Change checklist - [x] Self-review. - [ ] ~~Documentation updates following the [style guide](https://rust-lang.github.io/rfcs/1574-more-api-documentation-conventions.html#appendix-a-full-conventions-text), if relevant.~~ - [ ] ~~Tests if relevant.~~ - [ ] ~~All breaking changes documented.~~ --- iroh-cli/src/commands.rs | 4 ++++ iroh-cli/src/commands/rpc.rs | 10 ++++++++++ 2 files changed, 14 insertions(+) diff --git a/iroh-cli/src/commands.rs b/iroh-cli/src/commands.rs index 59529cd671..071a6ee65d 100644 --- a/iroh-cli/src/commands.rs +++ b/iroh-cli/src/commands.rs @@ -81,6 +81,8 @@ pub(crate) enum Commands { /// start optionally takes a `--add SOURCE` option, which can be a file or a folder /// to serve on startup. Data can also be added after startup with commands like /// `iroh blob add` or by adding content to documents. + /// + /// For general configuration options see . Start { /// Optionally add a file or folder to the node. /// @@ -99,6 +101,8 @@ pub(crate) enum Commands { /// /// The console is a REPL for interacting with a running iroh node. /// For more info on available commands, see https://iroh.computer/docs/api + /// + /// For general configuration options see .
Console, #[clap(flatten)] diff --git a/iroh-cli/src/commands/rpc.rs b/iroh-cli/src/commands/rpc.rs index cb177f4911..da3594ad68 100644 --- a/iroh-cli/src/commands/rpc.rs +++ b/iroh-cli/src/commands/rpc.rs @@ -16,6 +16,8 @@ pub enum RpcCommands { /// /// Documents are mutable, syncable key-value stores. /// For more on docs see https://iroh.computer/docs/layers/documents + /// + /// For general configuration options see . Docs { #[clap(subcommand)] command: DocCommands, @@ -24,6 +26,8 @@ pub enum RpcCommands { /// Manage document authors /// /// Authors are keypairs that identify writers to documents. + /// + /// For general configuration options see . Authors { #[clap(subcommand)] command: AuthorCommands, @@ -32,6 +36,8 @@ pub enum RpcCommands { /// /// Blobs are immutable, opaque chunks of arbitrary-sized data. /// For more on blobs see https://iroh.computer/docs/layers/blobs + /// + /// For general configuration options see . Blobs { #[clap(subcommand)] command: BlobCommands, @@ -44,6 +50,8 @@ pub enum RpcCommands { /// Manage gossip /// /// Gossip is a way to broadcast messages to a group of nodes. + /// + /// For general configuration options see . Gossip { #[clap(subcommand)] command: GossipCommands, @@ -57,6 +65,8 @@ pub enum RpcCommands { /// a tag. /// /// Any data iroh fetches without a tag will be periodically deleted. + /// + /// For general configuration options see . Tags { #[clap(subcommand)] command: TagCommands, From 32bb0f3be432676ca49473e75c7eb00db32a3673 Mon Sep 17 00:00:00 2001 From: Franz Heinzmann Date: Thu, 1 Aug 2024 19:21:49 +0200 Subject: [PATCH 18/45] fix(iroh-gossip): connection loop misuses `tokio::select!` leading to read errors (#2572) ## Description The connection loop of iroh-gossip misused tokio-select by selecting over a future that is not cancellation safe. This means that if the timings are bad, the message reading future would be aborted midway in a message, and then restart by reading a length, which would then yield some random number because it would be reading some random bytes in the middle of a message. This means it would lead to random connection drops. ## Breaking Changes Backport from #2570 ## Notes & open questions ## Change checklist - [x] Self-review. - [ ] ~~Documentation updates following the [style guide](https://rust-lang.github.io/rfcs/1574-more-api-documentation-conventions.html#appendix-a-full-conventions-text), if relevant.~~ - [ ] ~~Tests if relevant.~~ - [x] All breaking changes documented. --- iroh-gossip/src/net.rs | 35 +++++++++++++++++------------------ 1 file changed, 17 insertions(+), 18 deletions(-) diff --git a/iroh-gossip/src/net.rs b/iroh-gossip/src/net.rs index 7844117509..29f428ca49 100644 --- a/iroh-gossip/src/net.rs +++ b/iroh-gossip/src/net.rs @@ -652,27 +652,26 @@ async fn connection_loop( }; let mut send_buf = BytesMut::new(); let mut recv_buf = BytesMut::new(); - loop { - tokio::select! { - biased; - // If `send_rx` is closed, - // stop selecting it but don't quit. - // We are not going to use connection for sending anymore, - // but the other side may still want to use it to - // send data to us. - Some(msg) = send_rx.recv(), if !send_rx.is_closed() => { - write_message(&mut send, &mut send_buf, &msg, max_message_size).await? - } + let send_loop = async { + while let Some(msg) = send_rx.recv().await { + write_message(&mut send, &mut send_buf, &msg, max_message_size).await? 
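+ // each frame is written to completion before the next `recv()`: with `tokio::try_join!` below (rather than the old `select!`) neither loop is cancelled halfway through a length-prefixed message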
+ } + Ok::<_, anyhow::Error>(()) + }; - msg = read_message(&mut recv, &mut recv_buf, max_message_size) => { - let msg = msg?; - match msg { - None => break, - Some(msg) => in_event_tx.send(InEvent::RecvMessage(from, msg)).await? - } + let recv_loop = async { + loop { + let msg = read_message(&mut recv, &mut recv_buf, max_message_size).await?; + match msg { + None => break, + Some(msg) => in_event_tx.send(InEvent::RecvMessage(from, msg)).await?, } } - } + Ok::<_, anyhow::Error>(()) + }; + + tokio::try_join!(send_loop, recv_loop)?; + Ok(()) } From 347d45c3de3bcba878657566a67f4e1825b03bc4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?R=C3=BCdiger=20Klaehn?= Date: Fri, 2 Aug 2024 09:07:00 +0300 Subject: [PATCH 19/45] refactor(iroh): remove flume from iroh-cli and iroh (#2543) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Description refactor(iroh): remove flume from iroh-cli and iroh Removes most usages of flume from iroh and all usages of flume from iroh-cli. ## Breaking Changes None ## Notes & open questions Note: this does not remove all usages because we need the iroh docs purge PR to be merged before that. ## Change checklist - [ ] Self-review. - [ ] ~~Documentation updates following the [style guide](https://rust-lang.github.io/rfcs/1574-more-api-documentation-conventions.html#appendix-a-full-conventions-text), if relevant.~~ - [ ] ~~Tests if relevant.~~ - [ ] ~~All breaking changes documented.~~ --------- Co-authored-by: Philipp Krüger Co-authored-by: Kasey Co-authored-by: Kasey Huizinga --- Cargo.lock | 2 +- iroh-blobs/src/downloader/progress.rs | 4 +- iroh-blobs/src/downloader/test.rs | 26 +++--- iroh-blobs/src/util/progress.rs | 128 ++++++++++++++++++++++++++ iroh-cli/Cargo.toml | 2 +- iroh-cli/src/commands/doctor.rs | 14 +-- iroh/src/node/rpc.rs | 99 ++++++++++---------- iroh/tests/gc.rs | 23 +++-- 8 files changed, 213 insertions(+), 85 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index a7afd8edd7..4b06c11628 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2629,6 +2629,7 @@ name = "iroh-cli" version = "0.21.0" dependencies = [ "anyhow", + "async-channel", "bao-tree", "bytes", "clap", @@ -2640,7 +2641,6 @@ dependencies = [ "dialoguer", "dirs-next", "duct", - "flume", "futures-buffered", "futures-lite 2.3.0", "futures-util", diff --git a/iroh-blobs/src/downloader/progress.rs b/iroh-blobs/src/downloader/progress.rs index 8a0114dda2..eac80985d5 100644 --- a/iroh-blobs/src/downloader/progress.rs +++ b/iroh-blobs/src/downloader/progress.rs @@ -11,13 +11,13 @@ use parking_lot::Mutex; use crate::{ get::{db::DownloadProgress, progress::TransferState}, - util::progress::{FlumeProgressSender, IdGenerator, ProgressSendError, ProgressSender}, + util::progress::{AsyncChannelProgressSender, IdGenerator, ProgressSendError, ProgressSender}, }; use super::DownloadKind; /// The channel that can be used to subscribe to progress updates. -pub type ProgressSubscriber = FlumeProgressSender; +pub type ProgressSubscriber = AsyncChannelProgressSender; /// Track the progress of downloads. 
/// diff --git a/iroh-blobs/src/downloader/test.rs b/iroh-blobs/src/downloader/test.rs index 871b835ba7..2e734eaf3b 100644 --- a/iroh-blobs/src/downloader/test.rs +++ b/iroh-blobs/src/downloader/test.rs @@ -12,7 +12,7 @@ use crate::{ get::{db::BlobId, progress::TransferState}, util::{ local_pool::LocalPool, - progress::{FlumeProgressSender, IdGenerator}, + progress::{AsyncChannelProgressSender, IdGenerator}, }, }; @@ -276,13 +276,13 @@ async fn concurrent_progress() { let hash = Hash::new([0u8; 32]); let kind_1 = HashAndFormat::raw(hash); - let (prog_a_tx, prog_a_rx) = flume::bounded(64); - let prog_a_tx = FlumeProgressSender::new(prog_a_tx); + let (prog_a_tx, prog_a_rx) = async_channel::bounded(64); + let prog_a_tx = AsyncChannelProgressSender::new(prog_a_tx); let req = DownloadRequest::new(kind_1, vec![peer]).progress_sender(prog_a_tx); let handle_a = downloader.queue(req).await; - let (prog_b_tx, prog_b_rx) = flume::bounded(64); - let prog_b_tx = FlumeProgressSender::new(prog_b_tx); + let (prog_b_tx, prog_b_rx) = async_channel::bounded(64); + let prog_b_tx = AsyncChannelProgressSender::new(prog_b_tx); let req = DownloadRequest::new(kind_1, vec![peer]).progress_sender(prog_b_tx); let handle_b = downloader.queue(req).await; @@ -292,8 +292,8 @@ async fn concurrent_progress() { let mut state_b = TransferState::new(hash); let mut state_c = TransferState::new(hash); - let prog1_a = prog_a_rx.recv_async().await.unwrap(); - let prog1_b = prog_b_rx.recv_async().await.unwrap(); + let prog1_a = prog_a_rx.recv().await.unwrap(); + let prog1_b = prog_b_rx.recv().await.unwrap(); assert!(matches!(prog1_a, DownloadProgress::Found { hash, size: 100, ..} if hash == hash)); assert!(matches!(prog1_b, DownloadProgress::Found { hash, size: 100, ..} if hash == hash)); @@ -301,12 +301,12 @@ async fn concurrent_progress() { state_b.on_progress(prog1_b); assert_eq!(state_a, state_b); - let (prog_c_tx, prog_c_rx) = flume::bounded(64); - let prog_c_tx = FlumeProgressSender::new(prog_c_tx); + let (prog_c_tx, prog_c_rx) = async_channel::bounded(64); + let prog_c_tx = AsyncChannelProgressSender::new(prog_c_tx); let req = DownloadRequest::new(kind_1, vec![peer]).progress_sender(prog_c_tx); let handle_c = downloader.queue(req).await; - let prog1_c = prog_c_rx.recv_async().await.unwrap(); + let prog1_c = prog_c_rx.recv().await.unwrap(); assert!(matches!(&prog1_c, DownloadProgress::InitialState(state) if state == &state_a)); state_c.on_progress(prog1_c); @@ -317,9 +317,9 @@ async fn concurrent_progress() { res_b.unwrap(); res_c.unwrap(); - let prog_a: Vec<_> = prog_a_rx.into_stream().collect().await; - let prog_b: Vec<_> = prog_b_rx.into_stream().collect().await; - let prog_c: Vec<_> = prog_c_rx.into_stream().collect().await; + let prog_a: Vec<_> = prog_a_rx.collect().await; + let prog_b: Vec<_> = prog_b_rx.collect().await; + let prog_c: Vec<_> = prog_c_rx.collect().await; assert_eq!(prog_a.len(), 1); assert_eq!(prog_b.len(), 1); diff --git a/iroh-blobs/src/util/progress.rs b/iroh-blobs/src/util/progress.rs index 8915b1cfb2..6f1f678655 100644 --- a/iroh-blobs/src/util/progress.rs +++ b/iroh-blobs/src/util/progress.rs @@ -518,6 +518,98 @@ impl ProgressSender for FlumeProgressSender { } } +/// A progress sender that uses an async channel. 
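+/// +/// Wraps an [`async_channel::Sender`] together with a shared atomic counter, so every clone of a sender keeps handing out unique progress ids via [`IdGenerator`].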
+pub struct AsyncChannelProgressSender { + sender: async_channel::Sender, + id: std::sync::Arc, +} + +impl std::fmt::Debug for AsyncChannelProgressSender { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("AsyncChannelProgressSender") + .field("id", &self.id) + .field("sender", &self.sender) + .finish() + } +} + +impl Clone for AsyncChannelProgressSender { + fn clone(&self) -> Self { + Self { + sender: self.sender.clone(), + id: self.id.clone(), + } + } +} + +impl AsyncChannelProgressSender { + /// Create a new progress sender from an async channel sender. + pub fn new(sender: async_channel::Sender) -> Self { + Self { + sender, + id: std::sync::Arc::new(std::sync::atomic::AtomicU64::new(0)), + } + } + + /// Returns true if `other` sends on the same `async_channel` channel as `self`. + pub fn same_channel(&self, other: &AsyncChannelProgressSender) -> bool { + same_channel(&self.sender, &other.sender) + } +} + +/// Given a value that is aligned and sized like a pointer, return the value of +/// the pointer as a usize. +fn get_as_ptr(value: &T) -> Option { + use std::mem; + if mem::size_of::() == std::mem::size_of::() + && mem::align_of::() == mem::align_of::() + { + // SAFETY: size and alignment requirements are checked and met + unsafe { Some(mem::transmute_copy(value)) } + } else { + None + } +} + +fn same_channel(a: &async_channel::Sender, b: &async_channel::Sender) -> bool { + // This relies on async_channel::Sender being just a newtype wrapper around + // an Arc>, so if two senders point to the same channel, the + // pointers will be the same. + get_as_ptr(a).unwrap() == get_as_ptr(b).unwrap() +} + +impl IdGenerator for AsyncChannelProgressSender { + fn new_id(&self) -> u64 { + self.id.fetch_add(1, std::sync::atomic::Ordering::SeqCst) + } +} + +impl ProgressSender for AsyncChannelProgressSender { + type Msg = T; + + async fn send(&self, msg: Self::Msg) -> std::result::Result<(), ProgressSendError> { + self.sender + .send(msg) + .await + .map_err(|_| ProgressSendError::ReceiverDropped) + } + + fn try_send(&self, msg: Self::Msg) -> std::result::Result<(), ProgressSendError> { + match self.sender.try_send(msg) { + Ok(_) => Ok(()), + Err(async_channel::TrySendError::Full(_)) => Ok(()), + Err(async_channel::TrySendError::Closed(_)) => Err(ProgressSendError::ReceiverDropped), + } + } + + fn blocking_send(&self, msg: Self::Msg) -> std::result::Result<(), ProgressSendError> { + match self.sender.send_blocking(msg) { + Ok(_) => Ok(()), + Err(_) => Err(ProgressSendError::ReceiverDropped), + } + } +} + /// An error that can occur when sending progress messages. /// /// Really the only error that can occur is if the receiver is dropped. 
@@ -628,3 +720,39 @@ impl io::Result<()> + 'stati self.0.set_len(size).await } } + +#[cfg(test)] +mod tests { + use std::sync::Arc; + + use super::*; + + #[test] + fn get_as_ptr_works() { + struct Wrapper(Arc); + let x = Wrapper(Arc::new(1u64)); + assert_eq!( + get_as_ptr(&x).unwrap(), + Arc::as_ptr(&x.0) as usize - 2 * std::mem::size_of::() + ); + } + + #[test] + fn get_as_ptr_wrong_use() { + struct Wrapper(#[allow(dead_code)] u8); + let x = Wrapper(1); + assert!(get_as_ptr(&x).is_none()); + } + + #[test] + fn test_sender_is_ptr() { + assert_eq!( + std::mem::size_of::(), + std::mem::size_of::>() + ); + assert_eq!( + std::mem::align_of::(), + std::mem::align_of::>() + ); + } +} diff --git a/iroh-cli/Cargo.toml b/iroh-cli/Cargo.toml index 6d4f0e5cbc..d4bd389a4c 100644 --- a/iroh-cli/Cargo.toml +++ b/iroh-cli/Cargo.toml @@ -23,6 +23,7 @@ doc = false [dependencies] anyhow = "1.0.81" +async-channel = "2.3.1" bao-tree = "0.13" bytes = "1.5.0" clap = { version = "4", features = ["derive"] } @@ -33,7 +34,6 @@ crossterm = "0.27.0" derive_more = { version = "1.0.0-beta.1", features = ["display"] } dialoguer = { version = "0.11.0", default-features = false } dirs-next = "2.0.0" -flume = "0.11.0" futures-buffered = "0.2.4" futures-lite = "2.3" futures-util = { version = "0.3.30", features = ["futures-sink"] } diff --git a/iroh-cli/src/commands/doctor.rs b/iroh-cli/src/commands/doctor.rs index 93e3221b6d..6f98302a52 100644 --- a/iroh-cli/src/commands/doctor.rs +++ b/iroh-cli/src/commands/doctor.rs @@ -23,7 +23,7 @@ use iroh::{ base::ticket::{BlobTicket, Ticket}, blobs::{ store::{ReadableStore, Store as _}, - util::progress::{FlumeProgressSender, ProgressSender}, + util::progress::{AsyncChannelProgressSender, ProgressSender}, }, docs::{Capability, DocTicket}, net::{ @@ -1145,28 +1145,28 @@ pub async fn run(command: Commands, config: &NodeConfig) -> anyhow::Result<()> { Commands::TicketInspect { ticket, zbase32 } => inspect_ticket(&ticket, zbase32), Commands::BlobConsistencyCheck { path, repair } => { let blob_store = iroh::blobs::store::fs::Store::load(path).await?; - let (send, recv) = flume::bounded(1); + let (send, recv) = async_channel::bounded(1); let task = tokio::spawn(async move { - while let Ok(msg) = recv.recv_async().await { + while let Ok(msg) = recv.recv().await { println!("{:?}", msg); } }); blob_store - .consistency_check(repair, FlumeProgressSender::new(send).boxed()) + .consistency_check(repair, AsyncChannelProgressSender::new(send).boxed()) .await?; task.await?; Ok(()) } Commands::BlobValidate { path, repair } => { let blob_store = iroh::blobs::store::fs::Store::load(path).await?; - let (send, recv) = flume::bounded(1); + let (send, recv) = async_channel::bounded(1); let task = tokio::spawn(async move { - while let Ok(msg) = recv.recv_async().await { + while let Ok(msg) = recv.recv().await { println!("{:?}", msg); } }); blob_store - .validate(repair, FlumeProgressSender::new(send).boxed()) + .validate(repair, AsyncChannelProgressSender::new(send).boxed()) .await?; task.await?; Ok(()) diff --git a/iroh/src/node/rpc.rs b/iroh/src/node/rpc.rs index 0796a0d86e..467e91d402 100644 --- a/iroh/src/node/rpc.rs +++ b/iroh/src/node/rpc.rs @@ -15,13 +15,12 @@ use iroh_blobs::get::db::DownloadProgress; use iroh_blobs::get::Stats; use iroh_blobs::store::{ConsistencyCheckProgress, ExportFormat, ImportProgress, MapEntry}; use iroh_blobs::util::local_pool::LocalPoolHandle; -use iroh_blobs::util::progress::ProgressSender; +use iroh_blobs::util::progress::{AsyncChannelProgressSender, ProgressSender}; 
use iroh_blobs::util::SetTagOption; use iroh_blobs::BlobFormat; use iroh_blobs::{ provider::AddProgress, store::{Store as BaoStore, ValidateProgress}, - util::progress::FlumeProgressSender, HashAndFormat, }; use iroh_io::AsyncSliceReader; @@ -527,18 +526,18 @@ impl Handler { self, msg: ValidateRequest, ) -> impl Stream + Send + 'static { - let (tx, rx) = flume::bounded(1); + let (tx, rx) = async_channel::bounded(1); let tx2 = tx.clone(); let db = self.inner.db.clone(); tokio::task::spawn(async move { if let Err(e) = db - .validate(msg.repair, FlumeProgressSender::new(tx).boxed()) + .validate(msg.repair, AsyncChannelProgressSender::new(tx).boxed()) .await { - tx2.send_async(ValidateProgress::Abort(e.into())).await.ok(); + tx2.send(ValidateProgress::Abort(e.into())).await.ok(); } }); - rx.into_stream() + rx } /// Invoke validate on the database and stream out the result @@ -546,59 +545,59 @@ impl Handler { self, msg: ConsistencyCheckRequest, ) -> impl Stream + Send + 'static { - let (tx, rx) = flume::bounded(1); + let (tx, rx) = async_channel::bounded(1); let tx2 = tx.clone(); let db = self.inner.db.clone(); tokio::task::spawn(async move { if let Err(e) = db - .consistency_check(msg.repair, FlumeProgressSender::new(tx).boxed()) + .consistency_check(msg.repair, AsyncChannelProgressSender::new(tx).boxed()) .await { - tx2.send_async(ConsistencyCheckProgress::Abort(e.into())) + tx2.send(ConsistencyCheckProgress::Abort(e.into())) .await .ok(); } }); - rx.into_stream() + rx } fn blob_add_from_path(self, msg: AddPathRequest) -> impl Stream { // provide a little buffer so that we don't slow down the sender - let (tx, rx) = flume::bounded(32); + let (tx, rx) = async_channel::bounded(32); let tx2 = tx.clone(); self.local_pool_handle().spawn_detached(|| async move { if let Err(e) = self.blob_add_from_path0(msg, tx).await { - tx2.send_async(AddProgress::Abort(e.into())).await.ok(); + tx2.send(AddProgress::Abort(e.into())).await.ok(); } }); - rx.into_stream().map(AddPathResponse) + rx.map(AddPathResponse) } fn doc_import_file(self, msg: ImportFileRequest) -> impl Stream { // provide a little buffer so that we don't slow down the sender - let (tx, rx) = flume::bounded(32); + let (tx, rx) = async_channel::bounded(32); let tx2 = tx.clone(); self.local_pool_handle().spawn_detached(|| async move { if let Err(e) = self.doc_import_file0(msg, tx).await { - tx2.send_async(crate::client::docs::ImportProgress::Abort(e.into())) + tx2.send(crate::client::docs::ImportProgress::Abort(e.into())) .await .ok(); } }); - rx.into_stream().map(ImportFileResponse) + rx.map(ImportFileResponse) } async fn doc_import_file0( self, msg: ImportFileRequest, - progress: flume::Sender, + progress: async_channel::Sender, ) -> anyhow::Result<()> { let docs = self.docs().ok_or_else(|| anyhow!("docs are disabled"))?; use crate::client::docs::ImportProgress as DocImportProgress; use iroh_blobs::store::ImportMode; use std::collections::BTreeMap; - let progress = FlumeProgressSender::new(progress); + let progress = AsyncChannelProgressSender::new(progress); let names = Arc::new(Mutex::new(BTreeMap::new())); // convert import progress to provide progress let import_progress = progress.clone().with_filter_map(move |x| match x { @@ -660,23 +659,23 @@ impl Handler { } fn doc_export_file(self, msg: ExportFileRequest) -> impl Stream { - let (tx, rx) = flume::bounded(1024); + let (tx, rx) = async_channel::bounded(1024); let tx2 = tx.clone(); self.local_pool_handle().spawn_detached(|| async move { if let Err(e) = self.doc_export_file0(msg, tx).await 
{ - tx2.send_async(ExportProgress::Abort(e.into())).await.ok(); + tx2.send(ExportProgress::Abort(e.into())).await.ok(); } }); - rx.into_stream().map(ExportFileResponse) + rx.map(ExportFileResponse) } async fn doc_export_file0( self, msg: ExportFileRequest, - progress: flume::Sender, + progress: async_channel::Sender, ) -> anyhow::Result<()> { let _docs = self.docs().ok_or_else(|| anyhow!("docs are disabled"))?; - let progress = FlumeProgressSender::new(progress); + let progress = AsyncChannelProgressSender::new(progress); let ExportFileRequest { entry, path, mode } = msg; let key = bytes::Bytes::from(entry.key().to_vec()); let export_progress = progress.clone().with_map(move |mut x| { @@ -700,11 +699,11 @@ impl Handler { } fn blob_download(self, msg: BlobDownloadRequest) -> impl Stream { - let (sender, receiver) = flume::bounded(1024); + let (sender, receiver) = async_channel::bounded(1024); let db = self.inner.db.clone(); let downloader = self.inner.downloader.clone(); let endpoint = self.inner.endpoint.clone(); - let progress = FlumeProgressSender::new(sender); + let progress = AsyncChannelProgressSender::new(sender); self.local_pool_handle().spawn_detached(move || async move { if let Err(err) = download(&db, endpoint, &downloader, msg, progress.clone()).await { progress @@ -714,12 +713,12 @@ impl Handler { } }); - receiver.into_stream().map(DownloadResponse) + receiver.map(DownloadResponse) } fn blob_export(self, msg: ExportRequest) -> impl Stream { - let (tx, rx) = flume::bounded(1024); - let progress = FlumeProgressSender::new(tx); + let (tx, rx) = async_channel::bounded(1024); + let progress = AsyncChannelProgressSender::new(tx); self.local_pool_handle().spawn_detached(move || async move { let res = iroh_blobs::export::export( &self.inner.db, @@ -735,18 +734,18 @@ impl Handler { Err(err) => progress.send(ExportProgress::Abort(err.into())).await.ok(), }; }); - rx.into_stream().map(ExportResponse) + rx.map(ExportResponse) } async fn blob_add_from_path0( self, msg: AddPathRequest, - progress: flume::Sender, + progress: async_channel::Sender, ) -> anyhow::Result<()> { use iroh_blobs::store::ImportMode; use std::collections::BTreeMap; - let progress = FlumeProgressSender::new(progress); + let progress = AsyncChannelProgressSender::new(progress); let names = Arc::new(Mutex::new(BTreeMap::new())); // convert import progress to provide progress let import_progress = progress.clone().with_filter_map(move |x| match x { @@ -923,25 +922,25 @@ impl Handler { msg: AddStreamRequest, stream: impl Stream + Send + Unpin + 'static, ) -> impl Stream { - let (tx, rx) = flume::bounded(32); + let (tx, rx) = async_channel::bounded(32); let this = self.clone(); self.local_pool_handle().spawn_detached(|| async move { if let Err(err) = this.blob_add_stream0(msg, stream, tx.clone()).await { - tx.send_async(AddProgress::Abort(err.into())).await.ok(); + tx.send(AddProgress::Abort(err.into())).await.ok(); } }); - rx.into_stream().map(AddStreamResponse) + rx.map(AddStreamResponse) } async fn blob_add_stream0( self, msg: AddStreamRequest, stream: impl Stream + Send + Unpin + 'static, - progress: flume::Sender, + progress: async_channel::Sender, ) -> anyhow::Result<()> { - let progress = FlumeProgressSender::new(progress); + let progress = AsyncChannelProgressSender::new(progress); let stream = stream.map(|item| match item { AddStreamUpdate::Chunk(chunk) => Ok(chunk), @@ -993,24 +992,24 @@ impl Handler { self, req: ReadAtRequest, ) -> impl Stream> + Send + 'static { - let (tx, rx) = 
flume::bounded(RPC_BLOB_GET_CHANNEL_CAP); + let (tx, rx) = async_channel::bounded(RPC_BLOB_GET_CHANNEL_CAP); let db = self.inner.db.clone(); self.local_pool_handle().spawn_detached(move || async move { if let Err(err) = read_loop(req, db, tx.clone(), RPC_BLOB_GET_CHUNK_SIZE).await { - tx.send_async(RpcResult::Err(err.into())).await.ok(); + tx.send(RpcResult::Err(err.into())).await.ok(); } }); async fn read_loop( req: ReadAtRequest, db: D, - tx: flume::Sender>, + tx: async_channel::Sender>, max_chunk_size: usize, ) -> anyhow::Result<()> { let entry = db.get(&req.hash).await?; let entry = entry.ok_or_else(|| anyhow!("Blob not found"))?; let size = entry.size(); - tx.send_async(Ok(ReadAtResponse::Entry { + tx.send(Ok(ReadAtResponse::Entry { size, is_complete: entry.is_complete(), })) @@ -1037,7 +1036,7 @@ impl Handler { let chunk = reader.read_at(req.offset + read, chunk_size).await?; let chunk_len = chunk.len(); if !chunk.is_empty() { - tx.send_async(Ok(ReadAtResponse::Data { chunk })).await?; + tx.send(Ok(ReadAtResponse::Data { chunk })).await?; } if chunk_len < chunk_size { break; @@ -1048,7 +1047,7 @@ impl Handler { Ok(()) } - rx.into_stream() + rx } fn node_connections( @@ -1056,17 +1055,15 @@ impl Handler { _: ConnectionsRequest, ) -> impl Stream> + Send + 'static { // provide a little buffer so that we don't slow down the sender - let (tx, rx) = flume::bounded(32); + let (tx, rx) = async_channel::bounded(32); let mut conn_infos = self.inner.endpoint.connection_infos(); conn_infos.sort_by_key(|n| n.node_id.to_string()); self.local_pool_handle().spawn_detached(|| async move { for conn_info in conn_infos { - tx.send_async(Ok(ConnectionsResponse { conn_info })) - .await - .ok(); + tx.send(Ok(ConnectionsResponse { conn_info })).await.ok(); } }); - rx.into_stream() + rx } // This method is called as an RPC method, which have to be async @@ -1125,7 +1122,7 @@ async fn download( endpoint: Endpoint, downloader: &Downloader, req: BlobDownloadRequest, - progress: FlumeProgressSender, + progress: AsyncChannelProgressSender, ) -> Result<()> where D: iroh_blobs::store::Store, @@ -1175,7 +1172,7 @@ async fn download_queued( downloader: &Downloader, hash_and_format: HashAndFormat, nodes: Vec, - progress: FlumeProgressSender, + progress: AsyncChannelProgressSender, ) -> Result { let mut node_ids = Vec::with_capacity(nodes.len()); let mut any_added = false; @@ -1199,7 +1196,7 @@ async fn download_direct_from_nodes( endpoint: Endpoint, hash_and_format: HashAndFormat, nodes: Vec, - progress: FlumeProgressSender, + progress: AsyncChannelProgressSender, ) -> Result where D: BaoStore, @@ -1232,7 +1229,7 @@ async fn download_direct( endpoint: Endpoint, hash_and_format: HashAndFormat, node: NodeAddr, - progress: FlumeProgressSender, + progress: AsyncChannelProgressSender, ) -> Result where D: BaoStore, diff --git a/iroh/tests/gc.rs b/iroh/tests/gc.rs index e032691df9..83a21f8d7f 100644 --- a/iroh/tests/gc.rs +++ b/iroh/tests/gc.rs @@ -37,11 +37,14 @@ pub fn simulate_remote(data: &[u8]) -> (blake3::Hash, Cursor) { } /// Wrap a bao store in a node that has gc enabled. 
-async fn wrap_in_node<S>(bao_store: S, gc_period: Duration) -> (Node<S>, flume::Receiver<()>)
+async fn wrap_in_node<S>(
+    bao_store: S,
+    gc_period: Duration,
+) -> (Node<S>, async_channel::Receiver<()>)
 where
     S: iroh_blobs::store::Store,
 {
-    let (gc_send, gc_recv) = flume::unbounded();
+    let (gc_send, gc_recv) = async_channel::unbounded();
     let node = node::Builder::with_db_and_store(
         bao_store,
         DocsStorage::Memory,
@@ -49,7 +52,7 @@ where
     )
     .gc_policy(iroh::node::GcPolicy::Interval(gc_period))
     .register_gc_done_cb(Box::new(move || {
-        gc_send.send(()).ok();
+        gc_send.send_blocking(()).ok();
     }))
     .spawn()
     .await
@@ -60,19 +63,19 @@ where
 async fn gc_test_node() -> (
     Node<iroh_blobs::store::mem::Store>,
     iroh_blobs::store::mem::Store,
-    flume::Receiver<()>,
+    async_channel::Receiver<()>,
 ) {
     let bao_store = iroh_blobs::store::mem::Store::new();
     let (node, gc_recv) = wrap_in_node(bao_store.clone(), Duration::from_millis(500)).await;
     (node, bao_store, gc_recv)
 }
 
-async fn step(evs: &flume::Receiver<()>) {
+async fn step(evs: &async_channel::Receiver<()>) {
     // drain the event queue, we want a new GC
     while evs.try_recv().is_ok() {}
     // wait for several GC cycles
     for _ in 0..3 {
-        evs.recv_async().await.unwrap();
+        evs.recv().await.unwrap();
     }
 }
 
@@ -191,7 +194,7 @@ mod file {
 
     use iroh_blobs::{
         store::{BaoBatchWriter, ConsistencyCheckProgress, Map, MapEntryMut, ReportLevel},
-        util::progress::{FlumeProgressSender, ProgressSender as _},
+        util::progress::{AsyncChannelProgressSender, ProgressSender as _},
         TempTag,
     };
     use tokio::io::AsyncReadExt;
@@ -212,16 +215,16 @@ mod file {
 
     async fn check_consistency(store: &impl Store) -> anyhow::Result<ReportLevel> {
         let mut max_level = ReportLevel::Trace;
-        let (tx, rx) = flume::bounded(1);
+        let (tx, rx) = async_channel::bounded(1);
         let task = tokio::task::spawn(async move {
-            while let Ok(ev) = rx.recv_async().await {
+            while let Ok(ev) = rx.recv().await {
                 if let ConsistencyCheckProgress::Update { level, .. } = &ev {
                     max_level = max_level.max(*level);
                 }
             }
         });
         store
-            .consistency_check(false, FlumeProgressSender::new(tx).boxed())
+            .consistency_check(false, AsyncChannelProgressSender::new(tx).boxed())
             .await?;
         task.await?;
         Ok(max_level)

From 3f3fec5010a97f7d11f00b9c3eb2f05e167a1472 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Philipp=20Kr=C3=BCger?=
Date: Fri, 2 Aug 2024 11:33:51 +0200
Subject: [PATCH 20/45] fix(iroh-net): Fix a compiler error with newer
 `derive_more` versions (#2578)

## Description

The `#[derive(Debug)]` used to generate an impl like this:

```rust
#[automatically_derived]
impl<'a> ::core::fmt::Debug for Accept<'a>
where
    quinn::Accept<'a>: ::core::fmt::Debug,
    Endpoint: ::core::fmt::Debug,
{
    // ...
}
```

But `quinn::Accept` doesn't implement `Debug`.

## Breaking Changes

None

## Notes & open questions

I'm... Really stumped on what changed in `derive_more`. When I expand the macro, in both versions there's a `Debug` bound on `quinn::Accept`, which just can't be right... Anyhow.

## Change checklist

- [X] Self-review.
- ~~[ ] Documentation updates following the [style guide](https://rust-lang.github.io/rfcs/1574-more-api-documentation-conventions.html#appendix-a-full-conventions-text), if relevant.~~
- ~~[ ] Tests if relevant.~~
- [X] All breaking changes documented.
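The fix that follows relies on `derive_more`'s field-level `#[debug("...")]` attribute: giving the field an explicit format string means the derive no longer needs (or emits) a `Debug` bound for that field's type. A small sketch of the mechanism, with a stand-in type instead of `quinn::Accept`:

```rust
use derive_more::Debug;

// Stands in for a foreign type without a Debug impl, like quinn::Accept<'a>.
struct Opaque;

#[derive(Debug)]
struct Wrapper {
    // The literal replaces the field's own Debug output, so no
    // `Opaque: Debug` bound is generated for it.
    #[debug("Opaque")]
    inner: Opaque,
    name: &'static str,
}
```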
---
 Cargo.lock                       | 8 ++++----
 iroh-docs/src/store/fs/tables.rs | 1 +
 iroh-net/src/endpoint.rs         | 1 +
 3 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index 4b06c11628..af8e7a5224 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1178,18 +1178,18 @@ dependencies = [
 
 [[package]]
 name = "derive_more"
-version = "1.0.0-beta.6"
+version = "1.0.0-beta.7"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f7abbfc297053be59290e3152f8cbcd52c8642e0728b69ee187d991d4c1af08d"
+checksum = "3249c0372e72f5f93b5c0ca54c0ab76bbf6216b6f718925476fd9bc4ffabb4fe"
 dependencies = [
  "derive_more-impl",
 ]
 
 [[package]]
 name = "derive_more-impl"
-version = "1.0.0-beta.6"
+version = "1.0.0-beta.7"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "2bba3e9872d7c58ce7ef0fcf1844fcc3e23ef2a58377b50df35dd98e42a5726e"
+checksum = "27d919ced7590fc17b5d5a3c63b662e8a7d2324212c4e4dbbed975cafd22d16d"
 dependencies = [
  "proc-macro2",
  "quote",
diff --git a/iroh-docs/src/store/fs/tables.rs b/iroh-docs/src/store/fs/tables.rs
index f3f1f207a6..898fffca4d 100644
--- a/iroh-docs/src/store/fs/tables.rs
+++ b/iroh-docs/src/store/fs/tables.rs
@@ -118,6 +118,7 @@ pub struct Tables<'tx> {
     pub records_by_key: Table<'tx, RecordsByKeyId<'static>, ()>,
     pub namespaces: Table<'tx, &'static [u8; 32], (u8, &'static [u8; 32])>,
     pub latest_per_author: Table<'tx, LatestPerAuthorKey<'static>, LatestPerAuthorValue<'static>>,
+    #[debug("MultimapTable")]
     pub namespace_peers: MultimapTable<'tx, &'static [u8; 32], (Nanos, &'static PeerIdBytes)>,
     pub download_policy: Table<'tx, &'static [u8; 32], &'static [u8]>,
     pub authors: Table<'tx, &'static [u8; 32], &'static [u8; 32]>,
diff --git a/iroh-net/src/endpoint.rs b/iroh-net/src/endpoint.rs
index 69019ad148..9b844467aa 100644
--- a/iroh-net/src/endpoint.rs
+++ b/iroh-net/src/endpoint.rs
@@ -898,6 +898,7 @@ impl Endpoint {
 #[pin_project::pin_project]
 pub struct Accept<'a> {
     #[pin]
+    #[debug("quinn::Accept")]
     inner: quinn::Accept<'a>,
     magic_ep: Endpoint,
 }

From d662bfc663ad956bbb38716bd5b8022a699bfce4 Mon Sep 17 00:00:00 2001
From: Floris Bruynooghe
Date: Fri, 2 Aug 2024 17:25:33 +0200
Subject: [PATCH 21/45] ref(iroh-net): Remove need for relay info in best_addr
 (#2579)

## Description

This moves the metrics for switches between direct connections and relayed connections into the NodeState. This is, after all, where this decision is made. BestAddr only knows about the best UDP address; it has no business keeping track of relays.

Cleaning this up enables further refactoring of BestAddr and PathState, which need to improve because they depend on each other but are hindered by the relay state sneaking into there.

## Breaking Changes

None

## Notes & open questions

These metrics ignore the fact that we have mixed as a possibility, so they over-simplify the situation. This makes the logic to increment them complex, which in turn means no one will really know what they mean. I need this to move on with #2546 so don't really want to get into designing our metrics though. I believe this way we keep the same behaviour.

## Change checklist

- [x] Self-review.
- [x] Documentation updates following the [style guide](https://rust-lang.github.io/rfcs/1574-more-api-documentation-conventions.html#appendix-a-full-conventions-text), if relevant.
- [x] Tests if relevant.
- [x] All breaking changes documented.
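The bookkeeping added below boils down to two predicates: a connection type counts as direct only when it is `Direct`, and as relayed when it is `Relay` or `Mixed`; the counters record the transitions where those predicates flip. A hedged sketch with a stand-in enum and plain integers in place of the `iroh-metrics` counters:

```rust
#[derive(Clone, Copy, PartialEq, Eq)]
enum Conn { Direct, Relay, Mixed, None }

#[derive(Default)]
struct Counters {
    direct_added: u64,
    direct_removed: u64,
    relay_added: u64,
    relay_removed: u64,
}

impl Counters {
    // Equivalent to the exhaustive match in the patch below, expressed via
    // two predicates over the old and new connection type.
    fn on_change(&mut self, prev: Conn, next: Conn) {
        let direct = |c: Conn| matches!(c, Conn::Direct);
        let relay = |c: Conn| matches!(c, Conn::Relay | Conn::Mixed);
        match (direct(prev), direct(next)) {
            (false, true) => self.direct_added += 1,
            (true, false) => self.direct_removed += 1,
            _ => {}
        }
        match (relay(prev), relay(next)) {
            (false, true) => self.relay_added += 1,
            (true, false) => self.relay_removed += 1,
            _ => {}
        }
    }
}
```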
--- iroh-net/src/magicsock/node_map/best_addr.rs | 25 +--------- iroh-net/src/magicsock/node_map/node_state.rs | 50 +++++++++++++++++-- 2 files changed, 48 insertions(+), 27 deletions(-) diff --git a/iroh-net/src/magicsock/node_map/best_addr.rs b/iroh-net/src/magicsock/node_map/best_addr.rs index 95b47b361f..d5550b7c49 100644 --- a/iroh-net/src/magicsock/node_map/best_addr.rs +++ b/iroh-net/src/magicsock/node_map/best_addr.rs @@ -5,11 +5,8 @@ use std::{ time::{Duration, Instant}, }; -use iroh_metrics::inc; use tracing::{debug, info}; -use crate::magicsock::metrics::Metrics as MagicsockMetrics; - /// How long we trust a UDP address as the exclusive path (without using relay) without having heard a Pong reply. const TRUST_UDP_ADDR_DURATION: Duration = Duration::from_millis(6500); @@ -93,12 +90,6 @@ impl BestAddr { let old = self.0.take(); if let Some(old_addr) = old.as_ref().map(BestAddrInner::addr) { info!(?reason, ?has_relay, %old_addr, "clearing best_addr"); - // no longer relying on the direct connection - inc!(MagicsockMetrics, num_direct_conns_removed); - if has_relay { - // we are now relying on the relay connection, add a relay conn - inc!(MagicsockMetrics, num_relay_conns_added); - } } } @@ -126,16 +117,15 @@ impl BestAddr { latency: Duration, source: Source, confirmed_at: Instant, - has_relay: bool, ) { match self.0.as_mut() { None => { - self.insert(addr, latency, source, confirmed_at, has_relay); + self.insert(addr, latency, source, confirmed_at); } Some(state) => { let candidate = AddrLatency { addr, latency }; if !state.is_trusted(confirmed_at) || candidate.is_better_than(&state.addr) { - self.insert(addr, latency, source, confirmed_at, has_relay); + self.insert(addr, latency, source, confirmed_at); } else if state.addr.addr == addr { state.confirmed_at = confirmed_at; state.trust_until = Some(source.trust_until(confirmed_at)); @@ -160,7 +150,6 @@ impl BestAddr { latency: Duration, source: Source, confirmed_at: Instant, - has_relay: bool, ) { let trust_until = source.trust_until(confirmed_at); @@ -184,22 +173,12 @@ impl BestAddr { "selecting new direct path for node" ); } - let was_empty = self.is_empty(); let inner = BestAddrInner { addr: AddrLatency { addr, latency }, trust_until: Some(trust_until), confirmed_at, }; self.0 = Some(inner); - if was_empty && has_relay { - // we now have a direct connection, adjust direct connection count - inc!(MagicsockMetrics, num_direct_conns_added); - if has_relay { - // we no longer rely on the relay connection, decrease the relay connection - // count - inc!(MagicsockMetrics, num_relay_conns_removed); - } - } } pub fn state(&self, now: Instant) -> State { diff --git a/iroh-net/src/magicsock/node_map/node_state.rs b/iroh-net/src/magicsock/node_map/node_state.rs index 4c98d0b5bc..df3e88a82d 100644 --- a/iroh-net/src/magicsock/node_map/node_state.rs +++ b/iroh-net/src/magicsock/node_map/node_state.rs @@ -295,8 +295,8 @@ impl NodeState { (None, Some(relay_url)) => ConnectionType::Relay(relay_url), (None, None) => ConnectionType::None, }; - if self.conn_type.update(typ).is_ok() { - let typ = self.conn_type.get(); + if let Ok(prev_typ) = self.conn_type.update(typ.clone()) { + // The connection type has changed. 
event!( target: "events.net.conn_type.changed", Level::DEBUG, @@ -304,6 +304,50 @@ impl NodeState { conn_type = ?typ, ); info!(%typ, "new connection type"); + + // Update some metrics + match (prev_typ, typ) { + (ConnectionType::Direct(_), ConnectionType::Direct(_)) => (), + (ConnectionType::Direct(_), ConnectionType::Relay(_)) => { + inc!(MagicsockMetrics, num_direct_conns_removed); + inc!(MagicsockMetrics, num_relay_conns_added); + } + (ConnectionType::Direct(_), ConnectionType::Mixed(_, _)) => { + inc!(MagicsockMetrics, num_direct_conns_removed); + inc!(MagicsockMetrics, num_relay_conns_added); + } + (ConnectionType::Direct(_), ConnectionType::None) => { + inc!(MagicsockMetrics, num_direct_conns_removed) + } + (ConnectionType::Relay(_), ConnectionType::Direct(_)) => { + inc!(MagicsockMetrics, num_direct_conns_added); + inc!(MagicsockMetrics, num_relay_conns_removed); + } + (ConnectionType::Relay(_), ConnectionType::Relay(_)) => (), + (ConnectionType::Relay(_), ConnectionType::Mixed(_, _)) => (), + (ConnectionType::Relay(_), ConnectionType::None) => { + inc!(MagicsockMetrics, num_relay_conns_removed) + } + (ConnectionType::Mixed(_, _), ConnectionType::Direct(_)) => { + inc!(MagicsockMetrics, num_direct_conns_added); + inc!(MagicsockMetrics, num_relay_conns_removed); + } + (ConnectionType::Mixed(_, _), ConnectionType::Relay(_)) => (), + (ConnectionType::Mixed(_, _), ConnectionType::Mixed(_, _)) => (), + (ConnectionType::Mixed(_, _), ConnectionType::None) => { + inc!(MagicsockMetrics, num_relay_conns_removed) + } + (ConnectionType::None, ConnectionType::Direct(_)) => { + inc!(MagicsockMetrics, num_direct_conns_added) + } + (ConnectionType::None, ConnectionType::Relay(_)) => { + inc!(MagicsockMetrics, num_relay_conns_added) + } + (ConnectionType::None, ConnectionType::Mixed(_, _)) => { + inc!(MagicsockMetrics, num_relay_conns_added) + } + (ConnectionType::None, ConnectionType::None) => (), + } } (best_addr, relay_url) } @@ -367,7 +411,6 @@ impl NodeState { pong.latency, best_addr::Source::BestCandidate, pong.pong_at, - self.relay_url.is_some(), ) } } @@ -916,7 +959,6 @@ impl NodeState { latency, best_addr::Source::ReceivedPong, now, - self.relay_url.is_some(), ); } From 1ba033cf0cc601c7ffd4c09822190ddbb2fb8197 Mon Sep 17 00:00:00 2001 From: Friedel Ziegelmayer Date: Sat, 3 Aug 2024 15:04:19 +0200 Subject: [PATCH 22/45] fix: pin derive_more to avoid sudden breakages (#2584) To avoid sudden breakages in new releases, pin `deriver_more` to a specific `beta` --- iroh-base/Cargo.toml | 2 +- iroh-blobs/Cargo.toml | 2 +- iroh-cli/Cargo.toml | 2 +- iroh-dns-server/Cargo.toml | 2 +- iroh-docs/Cargo.toml | 2 +- iroh-gossip/Cargo.toml | 2 +- iroh-net/Cargo.toml | 2 +- iroh/Cargo.toml | 2 +- 8 files changed, 8 insertions(+), 8 deletions(-) diff --git a/iroh-base/Cargo.toml b/iroh-base/Cargo.toml index b7368c7a74..c6fbc9b2df 100644 --- a/iroh-base/Cargo.toml +++ b/iroh-base/Cargo.toml @@ -27,7 +27,7 @@ thiserror = "1" # key module aead = { version = "0.5.2", features = ["bytes"], optional = true } -derive_more = { version = "1.0.0-beta.6", features = ["debug", "display", "from_str"], optional = true } +derive_more = { version = "=1.0.0-beta.7", features = ["debug", "display", "from_str"], optional = true } ed25519-dalek = { version = "2.0.0", features = ["serde", "rand_core"], optional = true } once_cell = { version = "1.18.0", optional = true } rand = { version = "0.8", optional = true } diff --git a/iroh-blobs/Cargo.toml b/iroh-blobs/Cargo.toml index d423624b75..d084a35760 100644 --- 
a/iroh-blobs/Cargo.toml +++ b/iroh-blobs/Cargo.toml @@ -21,7 +21,7 @@ async-channel = "2.3.1" bao-tree = { version = "0.13", features = ["tokio_fsm", "validate"], default-features = false } bytes = { version = "1.4", features = ["serde"] } chrono = "0.4.31" -derive_more = { version = "1.0.0-beta.6", features = ["debug", "display", "deref", "deref_mut", "from", "try_into", "into"] } +derive_more = { version = "=1.0.0-beta.7", features = ["debug", "display", "deref", "deref_mut", "from", "try_into", "into"] } flume = "0.11" futures-buffered = "0.2.4" futures-lite = "2.3" diff --git a/iroh-cli/Cargo.toml b/iroh-cli/Cargo.toml index d4bd389a4c..512d5da2a6 100644 --- a/iroh-cli/Cargo.toml +++ b/iroh-cli/Cargo.toml @@ -31,7 +31,7 @@ colored = "2.0.4" comfy-table = "7.0.1" console = "0.15.5" crossterm = "0.27.0" -derive_more = { version = "1.0.0-beta.1", features = ["display"] } +derive_more = { version = "=1.0.0-beta.7", features = ["display"] } dialoguer = { version = "0.11.0", default-features = false } dirs-next = "2.0.0" futures-buffered = "0.2.4" diff --git a/iroh-dns-server/Cargo.toml b/iroh-dns-server/Cargo.toml index 366c3631f9..0a65b68755 100644 --- a/iroh-dns-server/Cargo.toml +++ b/iroh-dns-server/Cargo.toml @@ -17,7 +17,7 @@ axum-server = { version = "0.6.0", features = ["tls-rustls"] } base64-url = "2.0.2" bytes = "1.5.0" clap = { version = "4.5.1", features = ["derive"] } -derive_more = { version = "1.0.0-beta.6", features = ["debug", "display", "into", "from"] } +derive_more = { version = "=1.0.0-beta.7", features = ["debug", "display", "into", "from"] } dirs-next = "2.0.0" futures-lite = "2.3.0" governor = "0.6.3" diff --git a/iroh-docs/Cargo.toml b/iroh-docs/Cargo.toml index 180898d423..f50afda51c 100644 --- a/iroh-docs/Cargo.toml +++ b/iroh-docs/Cargo.toml @@ -19,7 +19,7 @@ anyhow = "1" async-channel = "2.3.1" blake3 = { package = "iroh-blake3", version = "1.4.5"} bytes = { version = "1.4", features = ["serde"] } -derive_more = { version = "1.0.0-beta.6", features = ["debug", "deref", "display", "from", "try_into", "into", "as_ref"] } +derive_more = { version = "=1.0.0-beta.7", features = ["debug", "deref", "display", "from", "try_into", "into", "as_ref"] } ed25519-dalek = { version = "2.0.0", features = ["serde", "rand_core"] } futures-buffered = "0.2.4" futures-lite = "2.3.0" diff --git a/iroh-gossip/Cargo.toml b/iroh-gossip/Cargo.toml index f521e41f19..3aed7435fa 100644 --- a/iroh-gossip/Cargo.toml +++ b/iroh-gossip/Cargo.toml @@ -19,7 +19,7 @@ workspace = true anyhow = { version = "1" } blake3 = { package = "iroh-blake3", version = "1.4.5"} bytes = { version = "1.4.0", features = ["serde"] } -derive_more = { version = "1.0.0-beta.6", features = ["add", "debug", "deref", "display", "from", "try_into", "into"] } +derive_more = { version = "=1.0.0-beta.7", features = ["add", "debug", "deref", "display", "from", "try_into", "into"] } ed25519-dalek = { version = "2.0.0", features = ["serde", "rand_core"] } indexmap = "2.0" postcard = { version = "1", default-features = false, features = ["alloc", "use-std", "experimental-derive"] } diff --git a/iroh-net/Cargo.toml b/iroh-net/Cargo.toml index 8b677fbfb4..5145d35a83 100644 --- a/iroh-net/Cargo.toml +++ b/iroh-net/Cargo.toml @@ -23,7 +23,7 @@ backoff = "0.4.0" bytes = "1" netdev = "0.30.0" der = { version = "0.7", features = ["alloc", "derive"] } -derive_more = { version = "1.0.0-beta.6", features = ["debug", "display", "from", "try_into", "deref"] } +derive_more = { version = "=1.0.0-beta.7", features = ["debug", "display", 
"from", "try_into", "deref"] } futures-buffered = "0.2.4" futures-concurrency = "7.6.0" futures-lite = "2.3" diff --git a/iroh/Cargo.toml b/iroh/Cargo.toml index d31990e63f..855f65803a 100644 --- a/iroh/Cargo.toml +++ b/iroh/Cargo.toml @@ -20,7 +20,7 @@ anyhow = { version = "1" } async-channel = "2.3.1" bao-tree = { version = "0.13", features = ["tokio_fsm"], default-features = false } bytes = "1" -derive_more = { version = "1.0.0-beta.6", features = ["debug", "display", "from", "try_into", "from_str"] } +derive_more = { version = "=1.0.0-beta.7", features = ["debug", "display", "from", "try_into", "from_str"] } flume = "0.11" futures-buffered = "0.2.4" futures-lite = "2.3" From 605a85d9c121f8d2b48f91c2eb1e86cfa451bd22 Mon Sep 17 00:00:00 2001 From: Kasey Date: Sat, 3 Aug 2024 10:26:12 -0400 Subject: [PATCH 23/45] test(iroh-net): increase timeout for local swarm discovery test (#2574) ## Description The `local_swarm_discovery` test was reliably timing out when run on the linux CI. After checking the logs and NOT finding any evidence that anything goes wrong, I increased the timeout and can no longer trigger the failure. ## Change checklist - [x] Self-review. --------- Co-authored-by: Kasey Huizinga --- iroh-net/src/discovery/local_swarm_discovery.rs | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/iroh-net/src/discovery/local_swarm_discovery.rs b/iroh-net/src/discovery/local_swarm_discovery.rs index b1e9bfffc7..d721f16729 100644 --- a/iroh-net/src/discovery/local_swarm_discovery.rs +++ b/iroh-net/src/discovery/local_swarm_discovery.rs @@ -296,8 +296,8 @@ mod tests { use testresult::TestResult; #[tokio::test] - #[ignore = "flaky"] async fn test_local_swarm_discovery() -> TestResult { + let _guard = iroh_test::logging::setup(); let (node_id_a, discovery_a) = make_discoverer()?; let (_, discovery_b) = make_discoverer()?; @@ -312,12 +312,13 @@ mod tests { // resolve twice to ensure we can create separate streams for the same node_id let mut s1 = discovery_b.resolve(ep.clone(), node_id_a).unwrap(); let mut s2 = discovery_b.resolve(ep, node_id_a).unwrap(); + tracing::debug!(?node_id_a, "Discovering node id a"); // publish discovery_a's address discovery_a.publish(&addr_info); - let s1_res = tokio::time::timeout(Duration::from_secs(5), s1.next()) + let s1_res = tokio::time::timeout(Duration::from_secs(10), s1.next()) .await? .unwrap()?; - let s2_res = tokio::time::timeout(Duration::from_secs(5), s2.next()) + let s2_res = tokio::time::timeout(Duration::from_secs(10), s2.next()) .await? .unwrap()?; assert_eq!(s1_res.addr_info, addr_info); From f5b3918b8d4a0077334980b91ca6339acaa1c55f Mon Sep 17 00:00:00 2001 From: Floris Bruynooghe Date: Mon, 5 Aug 2024 17:07:23 +0200 Subject: [PATCH 24/45] fix(iroh-net): Make a single direct address in NodeAddr instant (#2580) ## Description In #2509 (c1c3539c0a07d8659979ffacdb5bd1fc23d6939c) we removed chosing a random unconfirmed direct address and through both it and the relay if we did not yet have any confirmed direct address. This required any direct address passed in via a NodeAddr to do a DISCO Ping-Pong round trip before it would be used. It turns out this is used a lot in the sense that a common socket address is used: when you know an IP is reachable. Requiring a ping would mean the first QUIC packets would be dropped, Quinn would wait to detect this before retrying and connections are delayed by an observable second by the end of it all. 
Not great for this scenario, even if it can be argued that outside of tests this is not a common scenario. So this brings back the random selection of unconfirmed addresses. It tries to do this without adding more complexity to the NodeState. Instead it consolidates the path information and best address into the NodeUdpState struct. But **without any logic changes.** It then adds just this extra caching of the chosen address to this struct. This is a huge simplification of a much larger refactor I was trying to do first. I believe that was the right track but resulted in too many changes to do in one large PR. So the long-term goal here is to continue removing state manipulation outside of NodeUdpState struct so that it gains more control of how to manage the UDP paths. This will help the logic of how UDP paths are chosen to be based entirely on the PathState - which is the right model for this. ## Breaking Changes None ## Notes & open questions Fixed #2546 ## Change checklist - [x] Self-review. - [x] Documentation updates following the [style guide](https://rust-lang.github.io/rfcs/1574-more-api-documentation-conventions.html#appendix-a-full-conventions-text), if relevant. - [x] Tests if relevant. - [x] All breaking changes documented. --- iroh-net/src/magicsock.rs | 5 +- iroh-net/src/magicsock/node_map.rs | 4 +- iroh-net/src/magicsock/node_map/node_state.rs | 264 ++++++++---------- iroh-net/src/magicsock/node_map/udp_paths.rs | 179 ++++++++++++ 4 files changed, 301 insertions(+), 151 deletions(-) create mode 100644 iroh-net/src/magicsock/node_map/udp_paths.rs diff --git a/iroh-net/src/magicsock.rs b/iroh-net/src/magicsock.rs index 0547953dc1..2f54e459ce 100644 --- a/iroh-net/src/magicsock.rs +++ b/iroh-net/src/magicsock.rs @@ -499,7 +499,10 @@ impl MagicSock { let dest = QuicMappedAddr(dest); let mut transmits_sent = 0; - match self.node_map.get_send_addrs(dest) { + match self + .node_map + .get_send_addrs(dest, self.ipv6_reported.load(Ordering::Relaxed)) + { Some((public_key, udp_addr, relay_url, mut msgs)) => { let mut pings_sent = false; // If we have pings to send, we *have* to send them out first. 
diff --git a/iroh-net/src/magicsock/node_map.rs b/iroh-net/src/magicsock/node_map.rs index 7632dce0c5..3550f34bfb 100644 --- a/iroh-net/src/magicsock/node_map.rs +++ b/iroh-net/src/magicsock/node_map.rs @@ -30,6 +30,7 @@ use crate::{ mod best_addr; mod node_state; +mod udp_paths; pub use node_state::{ConnectionType, ControlMsg, DirectAddrInfo, NodeInfo}; pub(super) use node_state::{DiscoPingPurpose, PingAction, PingRole, SendPing}; @@ -186,6 +187,7 @@ impl NodeMap { pub(super) fn get_send_addrs( &self, addr: QuicMappedAddr, + have_ipv6: bool, ) -> Option<( PublicKey, Option, @@ -195,7 +197,7 @@ impl NodeMap { let mut inner = self.inner.lock(); let ep = inner.get_mut(NodeStateKey::QuicMappedAddr(addr))?; let public_key = *ep.public_key(); - let (udp_addr, relay_url, msgs) = ep.get_send_addrs(); + let (udp_addr, relay_url, msgs) = ep.get_send_addrs(have_ipv6); Some((public_key, udp_addr, relay_url, msgs)) } diff --git a/iroh-net/src/magicsock/node_map/node_state.rs b/iroh-net/src/magicsock/node_map/node_state.rs index df3e88a82d..7a69b57117 100644 --- a/iroh-net/src/magicsock/node_map/node_state.rs +++ b/iroh-net/src/magicsock/node_map/node_state.rs @@ -1,9 +1,7 @@ -use std::{ - collections::{btree_map::Entry, BTreeMap, BTreeSet, HashMap}, - hash::Hash, - net::{IpAddr, SocketAddr}, - time::{Duration, Instant}, -}; +use std::collections::{btree_map::Entry, BTreeMap, BTreeSet, HashMap}; +use std::hash::Hash; +use std::net::{IpAddr, SocketAddr}; +use std::time::{Duration, Instant}; use iroh_metrics::inc; use serde::{Deserialize, Serialize}; @@ -11,21 +9,17 @@ use tokio::sync::mpsc; use tracing::{debug, event, info, instrument, trace, warn, Level}; use watchable::{Watchable, WatcherStream}; -use crate::{ - disco::{self, SendAddr}, - endpoint::AddrInfo, - key::PublicKey, - magicsock::{Timer, HEARTBEAT_INTERVAL}, - net::ip::is_unicast_link_local, - relay::RelayUrl, - stun, - util::relay_only_mode, - NodeAddr, NodeId, -}; - -use crate::magicsock::{metrics::Metrics as MagicsockMetrics, ActorMessage, QuicMappedAddr}; - -use super::best_addr::{self, BestAddr, ClearReason, Source}; +use crate::disco::{self, SendAddr}; +use crate::endpoint::AddrInfo; +use crate::key::PublicKey; +use crate::magicsock::{ActorMessage, MagicsockMetrics, QuicMappedAddr, Timer, HEARTBEAT_INTERVAL}; +use crate::net::ip::is_unicast_link_local; +use crate::relay::RelayUrl; +use crate::util::relay_only_mode; +use crate::{stun, NodeAddr, NodeId}; + +use super::best_addr::{self, ClearReason, Source}; +use super::udp_paths::{NodeUdpPaths, UdpSendAddr}; use super::IpPort; /// Number of addresses that are not active that we keep around per node. @@ -116,10 +110,7 @@ pub(super) struct NodeState { /// /// The fallback/bootstrap path, if non-zero (non-zero for well-behaved clients). relay_url: Option<(RelayUrl, PathState)>, - /// Best non-relay path, i.e. a UDP address. - best_addr: BestAddr, - /// State for each of this node's direct paths. - direct_addr_state: BTreeMap, + udp_paths: NodeUdpPaths, sent_pings: HashMap, /// Last time this node was used. 
/// @@ -169,9 +160,8 @@ impl NodeState { PathState::new(options.node_id, SendAddr::Relay(url)), ) }), - best_addr: Default::default(), + udp_paths: NodeUdpPaths::new(), sent_pings: HashMap::new(), - direct_addr_state: BTreeMap::new(), last_used: options.active.then(Instant::now), last_call_me_maybe: None, conn_type: Watchable::new(ConnectionType::None), @@ -203,7 +193,8 @@ impl NodeState { let conn_type = self.conn_type.get(); let latency = match conn_type { ConnectionType::Direct(addr) => self - .direct_addr_state + .udp_paths + .paths .get(&addr.into()) .and_then(|state| state.latency()), ConnectionType::Relay(ref url) => self @@ -213,7 +204,8 @@ impl NodeState { .and_then(|(_, state)| state.latency()), ConnectionType::Mixed(addr, ref url) => { let addr_latency = self - .direct_addr_state + .udp_paths + .paths .get(&addr.into()) .and_then(|state| state.latency()); let relay_latency = self @@ -226,7 +218,8 @@ impl NodeState { ConnectionType::None => None, }; let addrs = self - .direct_addr_state + .udp_paths + .paths .iter() .map(|(addr, endpoint_state)| DirectAddrInfo { addr: SocketAddr::from(*addr), @@ -261,31 +254,34 @@ impl NodeState { /// Returns the address(es) that should be used for sending the next packet. /// /// This may return to send on one, both or no paths. - fn addr_for_send(&mut self, now: &Instant) -> (Option, Option) { + fn addr_for_send( + &mut self, + now: &Instant, + have_ipv6: bool, + ) -> (Option, Option) { if relay_only_mode() { debug!("in `DEV_relay_ONLY` mode, giving the relay address as the only viable address for this endpoint"); return (None, self.relay_url()); } - // Update our best addr from candidate addresses (only if it is empty and if we have - // recent pongs). - self.assign_best_addr_from_candidates_if_empty(); - let (best_addr, relay_url) = match self.best_addr.state(*now) { - best_addr::State::Valid(best_addr) => { + let (best_addr, relay_url) = match self.udp_paths.send_addr(*now, have_ipv6) { + UdpSendAddr::Valid(addr) => { // If we have a valid address we use it. - trace!(addr = %best_addr.addr, latency = ?best_addr.latency, - "best_addr is set and valid, use best_addr only"); - (Some(best_addr.addr), None) + trace!(%addr, "UdpSendAddr is valid, use it"); + (Some(addr), None) } - best_addr::State::Outdated(best_addr) => { + UdpSendAddr::Outdated(addr) => { // If the address is outdated we use it, but send via relay at the same time. // We also send disco pings so that it will become valid again if it still // works (i.e. we don't need to holepunch again). - trace!(addr = %best_addr.addr, latency = ?best_addr.latency, - "best_addr is set but outdated, use best_addr and relay"); - (Some(best_addr.addr), self.relay_url()) + trace!(%addr, "UdpSendAddr is outdated, use it together with relay"); + (Some(addr), self.relay_url()) } - best_addr::State::Empty => { - trace!("best_addr is unset, use relay"); + UdpSendAddr::Unconfirmed(addr) => { + trace!(%addr, "UdpSendAddr is unconfirmed, use it together with relay"); + (Some(addr), self.relay_url()) + } + UdpSendAddr::None => { + trace!("No UdpSendAddr, use relay"); (None, self.relay_url()) } }; @@ -356,7 +352,7 @@ impl NodeState { /// /// If this is also the best address, it will be cleared as well. 
pub(super) fn remove_direct_addr(&mut self, ip_port: &IpPort, reason: ClearReason) { - let Some(state) = self.direct_addr_state.remove(ip_port) else { + let Some(state) = self.udp_paths.paths.remove(ip_port) else { return; }; @@ -365,55 +361,11 @@ impl NodeState { None => debug!(%ip_port, last_seen=%"never", ?reason, "pruning address"), } - self.best_addr - .clear_if_equals((*ip_port).into(), reason, self.relay_url.is_some()); - } - - /// Fixup best_adrr from candidates. - /// - /// If somehow we end up in a state where we failed to set a best_addr, while we do have - /// valid candidates, this will chose a candidate and set best_addr again. Most likely - /// this is a bug elsewhere though. - fn assign_best_addr_from_candidates_if_empty(&mut self) { - if !self.best_addr.is_empty() { - return; - } - - // The highest acceptable latency for an endpoint path. If the latency is higher - // then this the path will be ignored. - const MAX_LATENCY: Duration = Duration::from_secs(60 * 60); - let best_pong = self - .direct_addr_state - .iter() - .fold(None, |best_pong, (ipp, state)| { - let best_latency = best_pong - .map(|p: &PongReply| p.latency) - .unwrap_or(MAX_LATENCY); - match state.recent_pong() { - // This pong is better if it has a lower latency, or if it has the same - // latency but on an IPv6 path. - Some(pong) - if pong.latency < best_latency - || (pong.latency == best_latency && ipp.ip().is_ipv6()) => - { - Some(pong) - } - _ => best_pong, - } - }); - - // If we found a candidate, set to best addr - if let Some(pong) = best_pong { - if let SendAddr::Udp(addr) = pong.from { - warn!(%addr, "No best_addr was set, choose candidate with lowest latency"); - self.best_addr.insert_if_better_or_reconfirm( - addr, - pong.latency, - best_addr::Source::BestCandidate, - pong.pong_at, - ) - } - } + self.udp_paths.best_addr.clear_if_equals( + (*ip_port).into(), + reason, + self.relay_url.is_some(), + ); } /// Whether we need to send another call-me-maybe to the endpoint. @@ -430,7 +382,7 @@ impl NodeState { debug!("no previous full ping: need full ping"); return true; }; - match self.best_addr.state(*now) { + match self.udp_paths.best_addr.state(*now) { best_addr::State::Empty => { debug!("best addr not set: need full ping"); true @@ -461,7 +413,7 @@ impl NodeState { debug!(tx = %hex::encode(txid), addr = %sp.to, "pong not received in timeout"); match sp.to { SendAddr::Udp(addr) => { - if let Some(path_state) = self.direct_addr_state.get_mut(&addr.into()) { + if let Some(path_state) = self.udp_paths.paths.get_mut(&addr.into()) { path_state.last_ping = None; // only clear the best address if there was no sign of life from this path // within the time the pong should have arrived @@ -470,7 +422,7 @@ impl NodeState { .map(|last_alive| last_alive.elapsed() <= PING_TIMEOUT_DURATION) .unwrap_or(false); if !consider_alive { - self.best_addr.clear_if_equals( + self.udp_paths.best_addr.clear_if_equals( addr, ClearReason::PongTimeout, self.relay_url().is_some(), @@ -479,7 +431,7 @@ impl NodeState { } else { // If we have no state for the best addr it should have been cleared // anyway. 
- self.best_addr.clear_if_equals( + self.udp_paths.best_addr.clear_if_equals( addr, ClearReason::PongTimeout, self.relay_url.is_some(), @@ -539,7 +491,7 @@ impl NodeState { let mut path_found = false; match to { SendAddr::Udp(addr) => { - if let Some(st) = self.direct_addr_state.get_mut(&addr.into()) { + if let Some(st) = self.udp_paths.paths.get_mut(&addr.into()) { st.last_ping.replace(now); path_found = true } @@ -633,7 +585,7 @@ impl NodeState { #[must_use = "actions must be handled"] fn send_pings(&mut self, now: Instant) -> Vec { // We allocate +1 in case the caller wants to add a call-me-maybe message. - let mut ping_msgs = Vec::with_capacity(self.direct_addr_state.len() + 1); + let mut ping_msgs = Vec::with_capacity(self.udp_paths.paths.len() + 1); if let Some((url, state)) = self.relay_url.as_ref() { if state.needs_ping(&now) { @@ -653,7 +605,8 @@ impl NodeState { } self.prune_direct_addresses(); let mut ping_dsts = String::from("["); - self.direct_addr_state + self.udp_paths + .paths .iter() .filter_map(|(ipp, state)| state.needs_ping(&now).then_some(*ipp)) .filter_map(|ipp| { @@ -668,7 +621,7 @@ impl NodeState { debug!( %ping_dsts, dst = %self.node_id.fmt_short(), - paths = %summarize_node_paths(&self.direct_addr_state), + paths = %summarize_node_paths(&self.udp_paths.paths), "sending pings to node", ); self.last_full_ping.replace(now); @@ -676,7 +629,7 @@ impl NodeState { } pub(super) fn update_from_node_addr(&mut self, n: &AddrInfo) { - if self.best_addr.is_empty() { + if self.udp_paths.best_addr.is_empty() { // we do not have a direct connection, so changing the relay information may // have an effect on our connection status if self.relay_url.is_none() && n.relay_url.is_some() { @@ -702,11 +655,12 @@ impl NodeState { } for &addr in n.direct_addresses.iter() { - self.direct_addr_state + self.udp_paths + .paths .entry(addr.into()) .or_insert_with(|| PathState::new(self.node_id, SendAddr::from(addr))); } - let paths = summarize_node_paths(&self.direct_addr_state); + let paths = summarize_node_paths(&self.udp_paths.paths); debug!(new = ?n.direct_addresses , %paths, "added new direct paths for endpoint"); } @@ -714,10 +668,11 @@ impl NodeState { #[instrument(skip_all, fields(node = %self.node_id.fmt_short()))] pub(super) fn reset(&mut self) { self.last_full_ping = None; - self.best_addr + self.udp_paths + .best_addr .clear(ClearReason::Reset, self.relay_url.is_some()); - for es in self.direct_addr_state.values_mut() { + for es in self.udp_paths.paths.values_mut() { es.last_ping = None; } } @@ -739,7 +694,7 @@ impl NodeState { let now = Instant::now(); let role = match path { - SendAddr::Udp(addr) => match self.direct_addr_state.entry(addr.into()) { + SendAddr::Udp(addr) => match self.udp_paths.paths.entry(addr.into()) { Entry::Occupied(mut occupied) => occupied.get_mut().handle_ping(tx_id, now), Entry::Vacant(vacant) => { info!(%addr, "new direct addr for node"); @@ -787,7 +742,7 @@ impl NodeState { // if the endpoint does not yet have a best_addrr let needs_ping_back = if matches!(path, SendAddr::Udp(_)) && matches!( - self.best_addr.state(now), + self.udp_paths.best_addr.state(now), best_addr::State::Empty | best_addr::State::Outdated(_) ) { // We also need to send a ping to make this path available to us as well. 
This @@ -803,7 +758,7 @@ impl NodeState { debug!( ?role, needs_ping_back = ?needs_ping_back.is_some(), - paths = %summarize_node_paths(&self.direct_addr_state), + paths = %summarize_node_paths(&self.udp_paths.paths), "endpoint handled ping", ); PingHandled { @@ -819,7 +774,8 @@ impl NodeState { pub(super) fn prune_direct_addresses(&mut self) { // prune candidates are addresses that are not active let mut prune_candidates: Vec<_> = self - .direct_addr_state + .udp_paths + .paths .iter() .filter(|(_ip_port, state)| !state.is_active()) .map(|(ip_port, state)| (*ip_port, state.last_alive())) @@ -834,7 +790,7 @@ impl NodeState { if prune_count == 0 { // nothing to do, within limits debug!( - paths = %summarize_node_paths(&self.direct_addr_state), + paths = %summarize_node_paths(&self.udp_paths.paths), "prune addresses: {prune_count} pruned", ); return; @@ -848,7 +804,7 @@ impl NodeState { self.remove_direct_addr(&ip_port, ClearReason::Inactive) } debug!( - paths = %summarize_node_paths(&self.direct_addr_state), + paths = %summarize_node_paths(&self.udp_paths.paths), "prune addresses: {prune_count} pruned", ); } @@ -857,8 +813,8 @@ impl NodeState { /// assumptions about which paths work. #[instrument("disco", skip_all, fields(node = %self.node_id.fmt_short()))] pub(super) fn note_connectivity_change(&mut self) { - self.best_addr.clear_trust("connectivity changed"); - for es in self.direct_addr_state.values_mut() { + self.udp_paths.best_addr.clear_trust("connectivity changed"); + for es in self.udp_paths.paths.values_mut() { es.clear(); } } @@ -906,7 +862,7 @@ impl NodeState { match src { SendAddr::Udp(addr) => { - match self.direct_addr_state.get_mut(&addr.into()) { + match self.udp_paths.paths.get_mut(&addr.into()) { None => { warn!("ignoring pong: no state for src addr"); // This is no longer an endpoint we care about. @@ -923,7 +879,7 @@ impl NodeState { } } debug!( - paths = %summarize_node_paths(&self.direct_addr_state), + paths = %summarize_node_paths(&self.udp_paths.paths), "handled pong", ); } @@ -954,7 +910,7 @@ impl NodeState { // TODO(bradfitz): decide how latency vs. preference order affects decision if let SendAddr::Udp(to) = sp.to { debug_assert!(!is_relay, "mismatching relay & udp"); - self.best_addr.insert_if_better_or_reconfirm( + self.udp_paths.best_addr.insert_if_better_or_reconfirm( to, latency, best_addr::Source::ReceivedPong, @@ -991,7 +947,8 @@ impl NodeState { } let ipp = IpPort::from(*peer_sockaddr); call_me_maybe_ipps.insert(ipp); - self.direct_addr_state + self.udp_paths + .paths .entry(ipp) .or_insert_with(|| PathState::new(self.node_id, SendAddr::from(*peer_sockaddr))) .call_me_maybe_time @@ -1001,7 +958,7 @@ impl NodeState { // Zero out all the last_ping times to force send_pings to send new ones, even if // it's been less than 5 seconds ago. Also clear pongs for direct addresses not // included in the updated set. - for (ipp, st) in self.direct_addr_state.iter_mut() { + for (ipp, st) in self.udp_paths.paths.iter_mut() { st.last_ping = None; if !call_me_maybe_ipps.contains(ipp) { // TODO: This seems like a weird way to signal that the endpoint no longer @@ -1014,16 +971,17 @@ impl NodeState { } // Clear trust on our best_addr if it is not included in the updated set. Also // clear the last call-me-maybe send time so we will send one again. 
- if let Some(addr) = self.best_addr.addr() { + if let Some(addr) = self.udp_paths.best_addr.addr() { let ipp: IpPort = addr.into(); if !call_me_maybe_ipps.contains(&ipp) { - self.best_addr + self.udp_paths + .best_addr .clear_trust("best_addr not in new call-me-maybe"); self.last_call_me_maybe = None; } } debug!( - paths = %summarize_node_paths(&self.direct_addr_state), + paths = %summarize_node_paths(&self.udp_paths.paths), "updated endpoint paths from call-me-maybe", ); self.send_pings(now) @@ -1031,13 +989,14 @@ impl NodeState { /// Marks this endpoint as having received a UDP payload message. pub(super) fn receive_udp(&mut self, addr: IpPort, now: Instant) { - let Some(state) = self.direct_addr_state.get_mut(&addr) else { + let Some(state) = self.udp_paths.paths.get_mut(&addr) else { debug_assert!(false, "node map inconsistency by_ip_port <-> direct addr"); return; }; state.last_payload_msg = Some(now); self.last_used = Some(now); - self.best_addr + self.udp_paths + .best_addr .reconfirm_if_used(addr.into(), Source::Udp, now); } @@ -1063,7 +1022,8 @@ impl NodeState { pub(super) fn last_ping(&self, addr: &SendAddr) -> Option { match addr { SendAddr::Udp(addr) => self - .direct_addr_state + .udp_paths + .paths .get(&(*addr).into()) .and_then(|ep| ep.last_ping), SendAddr::Relay(url) => self @@ -1100,7 +1060,7 @@ impl NodeState { } // Send heartbeat ping to keep the current addr going as long as we need it. - if let Some(udp_addr) = self.best_addr.addr() { + if let Some(udp_addr) = self.udp_paths.best_addr.addr() { let elapsed = self.last_ping(&SendAddr::Udp(udp_addr)).map(|l| now - l); // Send a ping if the last ping is older than 2 seconds. let needs_ping = match elapsed { @@ -1131,10 +1091,11 @@ impl NodeState { #[instrument("get_send_addrs", skip_all, fields(node = %self.node_id.fmt_short()))] pub(crate) fn get_send_addrs( &mut self, + have_ipv6: bool, ) -> (Option, Option, Vec) { let now = Instant::now(); self.last_used.replace(now); - let (udp_addr, relay_url) = self.addr_for_send(&now); + let (udp_addr, relay_url) = self.addr_for_send(&now, have_ipv6); let mut ping_msgs = Vec::new(); if self.want_call_me_maybe(&now) { @@ -1153,12 +1114,12 @@ impl NodeState { /// Get the direct addresses for this endpoint. pub(super) fn direct_addresses(&self) -> impl Iterator + '_ { - self.direct_addr_state.keys().copied() + self.udp_paths.paths.keys().copied() } #[cfg(test)] pub(super) fn direct_address_states(&self) -> impl Iterator + '_ { - self.direct_addr_state.iter() + self.udp_paths.paths.iter() } pub(super) fn last_used(&self) -> Option { @@ -1223,6 +1184,13 @@ impl PathState { } } + pub(super) fn udp_addr(&self) -> Option { + match self.path { + SendAddr::Udp(addr) => Some(addr), + SendAddr::Relay(_) => None, + } + } + pub(super) fn with_last_payload(node_id: NodeId, path: SendAddr, now: Instant) -> Self { PathState { node_id, @@ -1337,7 +1305,7 @@ impl PathState { } /// Returns the most recent pong if available. 
- fn recent_pong(&self) -> Option<&PongReply> { + pub(super) fn recent_pong(&self) -> Option<&PongReply> { self.recent_pong.as_ref() } @@ -1604,6 +1572,8 @@ pub enum ConnectionType { mod tests { use std::net::Ipv4Addr; + use best_addr::BestAddr; + use super::{ super::{NodeMap, NodeMapInner}, *, @@ -1659,13 +1629,15 @@ mod tests { node_id: key.public(), last_full_ping: None, relay_url: None, - best_addr: BestAddr::from_parts( - ip_port.into(), - latency, - now, - now + Duration::from_secs(100), + udp_paths: NodeUdpPaths::from_parts( + endpoint_state, + BestAddr::from_parts( + ip_port.into(), + latency, + now, + now + Duration::from_secs(100), + ), ), - direct_addr_state: endpoint_state, sent_pings: HashMap::new(), last_used: Some(now), last_call_me_maybe: None, @@ -1684,8 +1656,7 @@ mod tests { node_id: key.public(), last_full_ping: None, relay_url: relay_and_state(key.public(), send_addr.clone()), - best_addr: BestAddr::default(), - direct_addr_state: BTreeMap::default(), + udp_paths: NodeUdpPaths::new(), sent_pings: HashMap::new(), last_used: Some(now), last_call_me_maybe: None, @@ -1696,7 +1667,6 @@ mod tests { // endpoint w/ no best addr but a relay w/ no latency let c_endpoint = { // let socket_addr = "0.0.0.0:8".parse().unwrap(); - let endpoint_state = BTreeMap::new(); let key = SecretKey::generate(); NodeState { id: 2, @@ -1707,8 +1677,7 @@ mod tests { send_addr.clone(), PathState::new(key.public(), SendAddr::from(send_addr.clone())), )), - best_addr: BestAddr::default(), - direct_addr_state: endpoint_state, + udp_paths: NodeUdpPaths::new(), sent_pings: HashMap::new(), last_used: Some(now), last_call_me_maybe: None, @@ -1741,13 +1710,10 @@ mod tests { node_id: key.public(), last_full_ping: None, relay_url: relay_and_state(key.public(), send_addr.clone()), - best_addr: BestAddr::from_parts( - socket_addr, - Duration::from_millis(80), - now, - expired, + udp_paths: NodeUdpPaths::from_parts( + endpoint_state, + BestAddr::from_parts(socket_addr, Duration::from_millis(80), now, expired), ), - direct_addr_state: endpoint_state, sent_pings: HashMap::new(), last_used: Some(now), last_call_me_maybe: None, diff --git a/iroh-net/src/magicsock/node_map/udp_paths.rs b/iroh-net/src/magicsock/node_map/udp_paths.rs new file mode 100644 index 0000000000..1154bc19c1 --- /dev/null +++ b/iroh-net/src/magicsock/node_map/udp_paths.rs @@ -0,0 +1,179 @@ +//! Path state for UDP addresses of a single peer node. +//! +//! This started as simply moving the [`NodeState`]'s `direct_addresses` and `best_addr` +//! into one place together. The aim is for external places to not directly interact with +//! the inside and instead only notifies this struct of state changes to each path. +//! +//! [`NodeState`]: super::node_state::NodeState +use std::collections::BTreeMap; +use std::net::SocketAddr; +use std::time::{Duration, Instant}; + +use rand::seq::IteratorRandom; +use tracing::warn; + +use crate::disco::SendAddr; + +use super::best_addr::{self, BestAddr}; +use super::node_state::{PathState, PongReply}; +use super::IpPort; + +/// The address on which to send datagrams over UDP. +/// +/// The [`MagicSock`] sends packets to zero or one UDP address, depending on the known paths +/// to the remote node. This conveys the UDP address to send on from the [`NodeUdpPaths`] +/// to the [`NodeState`]. +/// +/// [`NodeUdpPaths`] contains all the UDP path states, while [`NodeState`] has to decide the +/// bigger picture including the relay server. +/// +/// See [`NodeUdpPaths::send_addr`]. 
+/// +/// [`MagicSock`]: crate::magicsock::MagicSock +/// [`NodeState`]: super::node_state::NodeState +#[derive(Debug)] +pub(super) enum UdpSendAddr { + /// The UDP address can be relied on to deliver data to the remote node. + /// + /// This means this path is usable with a reasonable latency and can be fully trusted to + /// transport payload data to the remote node. + Valid(SocketAddr), + /// The UDP address is highly likely to work, but has not been used for a while. + /// + /// The path should be usable but has not carried DISCO or payload data for a little too + /// long. It is best to also use a backup, i.e. relay, path if possible. + Outdated(SocketAddr), + /// The UDP address is not known to work, but it might. + /// + /// We know this UDP address belongs to the remote node, but we do not know if the path + /// already works or may need holepunching before it will start to work. It might even + /// never work. It is still useful to send to this together with backup path, + /// i.e. relay, in case the path works: if the path does not need holepunching it might + /// be much faster. And if there is no relay path at all it might be the only way to + /// establish a connection. + Unconfirmed(SocketAddr), + /// No known UDP path exists to the remote node. + None, +} + +/// The UDP paths for a single node. +/// +/// Paths are identified by the [`IpPort`] of their UDP address. +/// +/// Initially this collects two structs directly from the [`NodeState`] into one place, +/// leaving the APIs and astractions the same. The goal is that this slowly migrates +/// directly interacting with this data into only receiving [`PathState`] updates. This +/// will consolidate the logic of direct path selection and make this simpler to reason +/// about. However doing that all at once is too large a refactor. +/// +/// [`NodeState`]: super::node_state::NodeState +#[derive(Debug, Default)] +pub(super) struct NodeUdpPaths { + /// The state for each of this node's direct paths. + pub(super) paths: BTreeMap, + /// Best UDP path currently selected. + pub(super) best_addr: BestAddr, + /// If we had to choose a path because we had no `best_addr` it is stored here. + chosen_candidate: Option, +} + +impl NodeUdpPaths { + pub(super) fn new() -> Self { + Default::default() + } + + #[cfg(test)] + pub(super) fn from_parts(paths: BTreeMap, best_addr: BestAddr) -> Self { + Self { + paths, + best_addr, + chosen_candidate: None, + } + } + + /// Returns the current UDP address to send on. + /// + /// TODO: The goal here is for this to simply return the already known send address, so + /// it should be `&self` and not `&mut self`. This is only possible once the state from + /// [`NodeUdpPaths`] is no longer modified from outside. + pub(super) fn send_addr(&mut self, now: Instant, have_ipv6: bool) -> UdpSendAddr { + self.assign_best_addr_from_candidates_if_empty(); + match self.best_addr.state(now) { + best_addr::State::Valid(addr) => UdpSendAddr::Valid(addr.addr), + best_addr::State::Outdated(addr) => UdpSendAddr::Outdated(addr.addr), + best_addr::State::Empty => { + // No direct connection has been used before. If we know of any possible + // candidate addresses, randomly try to use one. This path is most + // effective when folks use a NodeAddr with exactly one direct address which + // they know to work, effectively like using a traditional socket or QUIC + // endpoint. 
+                let addr = self
+                    .chosen_candidate
+                    .and_then(|ipp| self.paths.get(&ipp))
+                    .and_then(|path| path.udp_addr())
+                    .filter(|addr| addr.is_ipv4() || have_ipv6)
+                    .or_else(|| {
+                        // Look for a new candidate in all the known paths. This may look
+                        // like a RNG use on the hot-path but this is normally invoked at
+                        // most once at startup.
+                        let addr = self
+                            .paths
+                            .values()
+                            .filter_map(|path| path.udp_addr())
+                            .filter(|addr| addr.is_ipv4() || have_ipv6)
+                            .choose(&mut rand::thread_rng());
+                        self.chosen_candidate = addr.map(IpPort::from);
+                        addr
+                    });
+                match addr {
+                    Some(addr) => UdpSendAddr::Unconfirmed(addr),
+                    None => UdpSendAddr::None,
+                }
+            }
+        }
+    }
+
+    /// Fixes up `best_addr` from the candidates.
+    ///
+    /// If somehow we end up in a state where we failed to set a `best_addr`, while we do
+    /// have valid candidates, this will choose a candidate and set `best_addr` again.
+    /// Most likely this is a bug elsewhere though.
+    fn assign_best_addr_from_candidates_if_empty(&mut self) {
+        if !self.best_addr.is_empty() {
+            return;
+        }
+
+        // The highest acceptable latency for an endpoint path. If the latency is higher
+        // than this the path will be ignored.
+        const MAX_LATENCY: Duration = Duration::from_secs(60 * 60);
+        let best_pong = self.paths.iter().fold(None, |best_pong, (ipp, state)| {
+            let best_latency = best_pong
+                .map(|p: &PongReply| p.latency)
+                .unwrap_or(MAX_LATENCY);
+            match state.recent_pong() {
+                // This pong is better if it has a lower latency, or if it has the same
+                // latency but on an IPv6 path.
+                Some(pong)
+                    if pong.latency < best_latency
+                        || (pong.latency == best_latency && ipp.ip().is_ipv6()) =>
+                {
+                    Some(pong)
+                }
+                _ => best_pong,
+            }
+        });
+
+        // If we found a candidate, set it as the best addr.
+        if let Some(pong) = best_pong {
+            if let SendAddr::Udp(addr) = pong.from {
+                warn!(%addr, "No best_addr was set, choose candidate with lowest latency");
+                self.best_addr.insert_if_better_or_reconfirm(
+                    addr,
+                    pong.latency,
+                    best_addr::Source::BestCandidate,
+                    pong.pong_at,
+                )
+            }
+        }
+    }
+}

From 43ef8b6e87048f7f28ddb4c2b97d7bf4fe853b90 Mon Sep 17 00:00:00 2001
From: Floris Bruynooghe
Date: Mon, 5 Aug 2024 17:07:56 +0200
Subject: [PATCH 25/45] ref(iroh-net): Don't write the match as fully exhaustive (#2585)

## Description

This makes it a bit easier to read. It's harder to figure out if there are
changes to the ConnectionType though, but let's assume that doesn't happen too
often.

## Breaking Changes

None

## Notes & open questions

I'm open to other ways of structuring this code. Not entirely sure what the
best shape would be.

## Change checklist

- [x] Self-review.
- [x] Documentation updates following the [style guide](https://rust-lang.github.io/rfcs/1574-more-api-documentation-conventions.html#appendix-a-full-conventions-text), if relevant.
- [x] Tests if relevant.
- [x] All breaking changes documented.
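A toy sketch of the shape this moves to (illustrative only; `Conn` and the
printed "metrics" are stand-ins, not iroh-net types): the arms that share a
metrics update are grouped with `|` patterns, and a single `_ => ()` catch-all
absorbs every combination that changes nothing.

```rust
#[derive(Clone, Copy)]
enum Conn {
    Direct,
    Relay,
    Mixed,
    None,
}

fn update_metrics(prev: Conn, next: Conn) {
    use Conn::*;
    match (prev, next) {
        // A direct path appeared where there was none before.
        (Relay, Direct) | (Mixed, Direct) => println!("+direct -relay"),
        // The direct path was lost in favour of a relayed one.
        (Direct, Relay) | (Direct, Mixed) => println!("-direct +relay"),
        // Every other transition leaves the counters unchanged; the catch-all
        // replaces spelling each remaining pair out.
        _ => (),
    }
}

fn main() {
    update_metrics(Conn::Relay, Conn::Direct);
    update_metrics(Conn::Direct, Conn::Mixed);
    update_metrics(Conn::None, Conn::None);
}
```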
--- iroh-net/src/magicsock/node_map/node_state.rs | 45 +++++++------------ 1 file changed, 15 insertions(+), 30 deletions(-) diff --git a/iroh-net/src/magicsock/node_map/node_state.rs b/iroh-net/src/magicsock/node_map/node_state.rs index 7a69b57117..375fb706d7 100644 --- a/iroh-net/src/magicsock/node_map/node_state.rs +++ b/iroh-net/src/magicsock/node_map/node_state.rs @@ -303,46 +303,31 @@ impl NodeState { // Update some metrics match (prev_typ, typ) { - (ConnectionType::Direct(_), ConnectionType::Direct(_)) => (), - (ConnectionType::Direct(_), ConnectionType::Relay(_)) => { - inc!(MagicsockMetrics, num_direct_conns_removed); - inc!(MagicsockMetrics, num_relay_conns_added); - } - (ConnectionType::Direct(_), ConnectionType::Mixed(_, _)) => { - inc!(MagicsockMetrics, num_direct_conns_removed); - inc!(MagicsockMetrics, num_relay_conns_added); - } - (ConnectionType::Direct(_), ConnectionType::None) => { - inc!(MagicsockMetrics, num_direct_conns_removed) - } - (ConnectionType::Relay(_), ConnectionType::Direct(_)) => { - inc!(MagicsockMetrics, num_direct_conns_added); - inc!(MagicsockMetrics, num_relay_conns_removed); - } - (ConnectionType::Relay(_), ConnectionType::Relay(_)) => (), - (ConnectionType::Relay(_), ConnectionType::Mixed(_, _)) => (), - (ConnectionType::Relay(_), ConnectionType::None) => { - inc!(MagicsockMetrics, num_relay_conns_removed) - } - (ConnectionType::Mixed(_, _), ConnectionType::Direct(_)) => { + (ConnectionType::Relay(_), ConnectionType::Direct(_)) + | (ConnectionType::Mixed(_, _), ConnectionType::Direct(_)) => { inc!(MagicsockMetrics, num_direct_conns_added); inc!(MagicsockMetrics, num_relay_conns_removed); } - (ConnectionType::Mixed(_, _), ConnectionType::Relay(_)) => (), - (ConnectionType::Mixed(_, _), ConnectionType::Mixed(_, _)) => (), - (ConnectionType::Mixed(_, _), ConnectionType::None) => { - inc!(MagicsockMetrics, num_relay_conns_removed) + (ConnectionType::Direct(_), ConnectionType::Relay(_)) + | (ConnectionType::Direct(_), ConnectionType::Mixed(_, _)) => { + inc!(MagicsockMetrics, num_direct_conns_removed); + inc!(MagicsockMetrics, num_relay_conns_added); } (ConnectionType::None, ConnectionType::Direct(_)) => { inc!(MagicsockMetrics, num_direct_conns_added) } - (ConnectionType::None, ConnectionType::Relay(_)) => { - inc!(MagicsockMetrics, num_relay_conns_added) + (ConnectionType::Direct(_), ConnectionType::None) => { + inc!(MagicsockMetrics, num_direct_conns_removed) } - (ConnectionType::None, ConnectionType::Mixed(_, _)) => { + (ConnectionType::None, ConnectionType::Relay(_)) + | (ConnectionType::None, ConnectionType::Mixed(_, _)) => { inc!(MagicsockMetrics, num_relay_conns_added) } - (ConnectionType::None, ConnectionType::None) => (), + (ConnectionType::Relay(_), ConnectionType::None) + | (ConnectionType::Mixed(_, _), ConnectionType::None) => { + inc!(MagicsockMetrics, num_relay_conns_removed) + } + _ => (), } } (best_addr, relay_url) From 07844031c3e568e34c64a825803c9cd3f91a2035 Mon Sep 17 00:00:00 2001 From: Franz Heinzmann Date: Mon, 5 Aug 2024 18:38:23 +0200 Subject: [PATCH 26/45] fix(iroh-blobs): do not hit the network when downloading blobs which are complete (#2586) ## Description Two changes to the downloader: * Never try to download from ourselves. If the only provider node added is our own node, fail with error "no providers". * The actual download request flow is turned into a generator (while keeping API compatibility for the existing `get_to_db` public function). 
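In sketch form, the resulting two-step flow looks like this from the caller's
side (illustrative only; `db`, `hash_and_format`, `progress` and the `dial`
helper are placeholders, not code from this patch; the `GetState` names are
introduced below):

```rust
match get_to_db_in_steps(db.clone(), hash_and_format, progress).await? {
    // Everything was already in the local store: nothing was dialed.
    GetState::Complete(stats) => stats,
    // Some data is missing: only now do we pay for a network connection.
    GetState::NeedsConn(needs_conn) => {
        let conn = dial().await?;
        needs_conn.proceed(conn).await?
    }
};
```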
A new `get_to_db_in_steps` function either runs to completion if the requested data is fully available locally, or yields a `NeedsConn` struct at the point where it needs a network connection to proceed. The `NeedsConn` has an `async proceed(self, conn: Connection)`, which must be called with a connection for the actual download to start. This two-step process allows the downloader to check if we should dial nodes at all, or are already done without doing anything, while emitting the exact same flow of events (because we run the same loop) to the client. To achieve this, `get_to_db` now uses a genawaiter generator internally. This means that the big loop that is the iroh-blobs protocol request flow does not have to be changed at all, only that instead of a closure we yield and resume, which makes this much easier to integrate into an external state machine like the downloader. The changes needed for this for the downloader are a bit verbose because the downloader itself is generic over a `Getter`, with impls for the actual impl and a test impl that does not use networking; therefore the new `NeedsConn` state has to be modeled with an additional associated type and trait here. This PR adds three tests: * Downloading a missing blob from the local node fails without trying to connect to ourselves * Downloading an existing blob succeeds without trying to download * Downloading an existing collection succeeds without trying to download Closes #2575 Replaced #2576 ## Notes and open questions ## Breaking changes None, only an API addition to the public API of iroh_blobs: `iroh_blobs::get::check_local_with_progress_if_complete` --------- Co-authored-by: dignifiedquire --- iroh-blobs/src/downloader.rs | 191 ++++++++++++++++------- iroh-blobs/src/downloader/get.rs | 90 ++++++----- iroh-blobs/src/downloader/invariants.rs | 2 +- iroh-blobs/src/downloader/progress.rs | 3 + iroh-blobs/src/downloader/test.rs | 23 ++- iroh-blobs/src/downloader/test/dialer.rs | 7 + iroh-blobs/src/downloader/test/getter.rs | 26 ++- iroh-blobs/src/get/db.rs | 128 +++++++++++++-- iroh-net/src/dialer.rs | 5 + iroh/src/client/blobs.rs | 184 ++++++++++++++++++++++ iroh/src/node/rpc.rs | 98 +++++++----- 11 files changed, 594 insertions(+), 163 deletions(-) diff --git a/iroh-blobs/src/downloader.rs b/iroh-blobs/src/downloader.rs index dd26a8bc6d..21644e8d93 100644 --- a/iroh-blobs/src/downloader.rs +++ b/iroh-blobs/src/downloader.rs @@ -27,8 +27,12 @@ //! requests to a single node is also limited. use std::{ - collections::{hash_map::Entry, HashMap, HashSet}, + collections::{ + hash_map::{self, Entry}, + HashMap, HashSet, + }, fmt, + future::Future, num::NonZeroUsize, sync::{ atomic::{AtomicU64, Ordering}, @@ -46,7 +50,7 @@ use tokio::{ sync::{mpsc, oneshot}, task::JoinSet, }; -use tokio_util::{sync::CancellationToken, time::delay_queue}; +use tokio_util::{either::Either, sync::CancellationToken, time::delay_queue}; use tracing::{debug, error_span, trace, warn, Instrument}; use crate::{ @@ -75,13 +79,15 @@ pub struct IntentId(pub u64); /// Trait modeling a dialer. This allows for IO-less testing. pub trait Dialer: Stream)> + Unpin { /// Type of connections returned by the Dialer. - type Connection: Clone; + type Connection: Clone + 'static; /// Dial a node. fn queue_dial(&mut self, node_id: NodeId); /// Get the number of dialing nodes. fn pending_count(&self) -> usize; /// Check if a node is being dialed. fn is_pending(&self, node: NodeId) -> bool; + /// Get the node id of our node. 
+    fn node_id(&self) -> NodeId;
 }
 
 /// Signals what should be done with the request when it fails.
@@ -97,20 +103,39 @@ pub enum FailureAction {
     RetryLater(anyhow::Error),
 }
 
-/// Future of a get request.
-type GetFut = BoxedLocal<InternalDownloadResult>;
+/// Future of a get request, for the checking stage.
+type GetStartFut<N> = BoxedLocal<Result<GetOutput<N>, FailureAction>>;
+/// Future of a get request, for the downloading stage.
+type GetProceedFut = BoxedLocal<InternalDownloadResult>;
 
 /// Trait modelling performing a single request over a connection. This allows for IO-less testing.
 pub trait Getter {
     /// Type of connections the Getter requires to perform a download.
-    type Connection;
-    /// Return a future that performs the download using the given connection.
+    type Connection: 'static;
+    /// Type of the intermediary state returned from [`Self::get`] if a connection is needed.
+    type NeedsConn: NeedsConn<Self::Connection>;
+    /// Returns a future that checks the local store if the request is already complete, returning
+    /// a struct implementing [`NeedsConn`] if we need a network connection to proceed.
     fn get(
         &mut self,
         kind: DownloadKind,
-        conn: Self::Connection,
         progress_sender: BroadcastProgressSender,
-    ) -> GetFut;
+    ) -> GetStartFut<Self::NeedsConn>;
+}
+
+/// Trait modelling the intermediary state when a connection is needed to proceed.
+pub trait NeedsConn<C>: std::fmt::Debug + 'static {
+    /// Proceeds the download with the given connection.
+    fn proceed(self, conn: C) -> GetProceedFut;
+}
+
+/// Output returned from [`Getter::get`].
+#[derive(Debug)]
+pub enum GetOutput<N> {
+    /// The request is already complete in the local store.
+    Complete(Stats),
+    /// The request needs a connection to continue.
+    NeedsConn(N),
 }
 
 /// Concurrency limits for the [`Downloader`].
@@ -280,7 +305,7 @@ pub struct DownloadHandle {
     receiver: oneshot::Receiver<ExternalDownloadResult>,
 }
 
-impl std::future::Future for DownloadHandle {
+impl Future for DownloadHandle {
     type Output = ExternalDownloadResult;
 
     fn poll(
@@ -424,10 +449,12 @@ struct IntentHandlers {
 }
 
 /// Information about a request.
-#[derive(Debug, Default)]
-struct RequestInfo {
+#[derive(Debug)]
+struct RequestInfo<NC> {
     /// Registered intents with progress senders and result callbacks.
     intents: HashMap<IntentId, IntentHandlers>,
+    progress_sender: BroadcastProgressSender,
+    get_state: Option<NC>,
 }
 
 /// Information about a request in progress.
@@ -529,7 +556,7 @@ struct Service<G: Getter, D: Dialer> {
     /// Queue of pending downloads.
     queue: Queue,
     /// Information about pending and active requests.
-    requests: HashMap<DownloadKind, RequestInfo>,
+    requests: HashMap<DownloadKind, RequestInfo<G::NeedsConn>>,
     /// State of running downloads.
     active_requests: HashMap<DownloadKind, ActiveRequestInfo>,
     /// Tasks for currently running downloads.
@@ -666,48 +693,85 @@ impl<G: Getter<Connection = D::Connection>, D: Dialer> Service<G, D> {
             on_progress: progress,
         };
 
-        // early exit if no providers.
-        if nodes.is_empty() && self.providers.get_candidates(&kind.hash()).next().is_none() {
-            self.finalize_download(
-                kind,
-                [(intent_id, intent_handlers)].into(),
-                Err(DownloadError::NoProviders),
-            );
-            return;
-        }
-
         // add the nodes to the provider map
-        let updated = self
-            .providers
-            .add_hash_with_nodes(kind.hash(), nodes.iter().map(|n| n.node_id));
+        // (skip the node id of our own node - we should never attempt to download from ourselves)
+        let node_ids = nodes
+            .iter()
+            .map(|n| n.node_id)
+            .filter(|node_id| *node_id != self.dialer.node_id());
+        let updated = self.providers.add_hash_with_nodes(kind.hash(), node_ids);
 
         // queue the transfer (if not running) or attach to transfer progress (if already running)
-        if self.active_requests.contains_key(&kind) {
-            // the transfer is already running, so attach the progress sender
-            if let Some(on_progress) = &intent_handlers.on_progress {
-                // this is async because it sends the current state over the progress channel
-                if let Err(err) = self
-                    .progress_tracker
-                    .subscribe(kind, on_progress.clone())
-                    .await
-                {
-                    debug!(?err, %kind, "failed to subscribe progress sender to transfer");
+        match self.requests.entry(kind) {
+            hash_map::Entry::Occupied(mut entry) => {
+                if let Some(on_progress) = &intent_handlers.on_progress {
+                    // this is async because it sends the current state over the progress channel
+                    if let Err(err) = self
+                        .progress_tracker
+                        .subscribe(kind, on_progress.clone())
+                        .await
+                    {
+                        debug!(?err, %kind, "failed to subscribe progress sender to transfer");
+                    }
                 }
+                entry.get_mut().intents.insert(intent_id, intent_handlers);
             }
-        } else {
-            // the transfer is not running.
-            if updated && self.queue.is_parked(&kind) {
-                // the transfer is on hold for pending retries, and we added new nodes, so move back to queue.
-                self.queue.unpark(&kind);
-            } else if !self.queue.contains(&kind) {
-                // the transfer is not yet queued: add to queue.
+            hash_map::Entry::Vacant(entry) => {
+                tracing::warn!("is new, queue");
+                let progress_sender = self.progress_tracker.track(
+                    kind,
+                    intent_handlers
+                        .on_progress
+                        .clone()
+                        .into_iter()
+                        .collect::<Vec<_>>(),
+                );
+
+                let get_state = match self.getter.get(kind, progress_sender.clone()).await {
+                    Err(_err) => {
+                        self.finalize_download(
+                            kind,
+                            [(intent_id, intent_handlers)].into(),
+                            // TODO: add better error variant? this is only triggered if the local
+                            // store failed with local IO.
+                            Err(DownloadError::DownloadFailed),
+                        );
+                        return;
+                    }
+                    Ok(GetOutput::Complete(stats)) => {
+                        self.finalize_download(
+                            kind,
+                            [(intent_id, intent_handlers)].into(),
+                            Ok(stats),
+                        );
+                        return;
+                    }
+                    Ok(GetOutput::NeedsConn(state)) => {
+                        // early exit if no providers.
+                        if self.providers.get_candidates(&kind.hash()).next().is_none() {
+                            self.finalize_download(
+                                kind,
+                                [(intent_id, intent_handlers)].into(),
+                                Err(DownloadError::NoProviders),
+                            );
+                            return;
+                        }
+                        state
+                    }
+                };
+                entry.insert(RequestInfo {
+                    intents: [(intent_id, intent_handlers)].into_iter().collect(),
+                    progress_sender,
+                    get_state: Some(get_state),
+                });
                 self.queue.insert(kind);
             }
         }
 
-        // store the request info
-        let request_info = self.requests.entry(kind).or_default();
-        request_info.intents.insert(intent_id, intent_handlers);
+        if updated && self.queue.is_parked(&kind) {
+            // the transfer is on hold for pending retries, and we added new nodes, so move back to queue.
+            self.queue.unpark(&kind);
+        }
     }
 
     /// Cancels a download intent.
@@ -860,7 +924,6 @@ impl, D: Dialer> Service { ) { self.progress_tracker.remove(&kind); self.remove_hash_if_not_queued(&kind.hash()); - let result = result.map_err(|_| DownloadError::DownloadFailed); for (_id, handlers) in intents.into_iter() { handlers.on_finish.send(result.clone()).ok(); } @@ -1082,14 +1145,9 @@ impl, D: Dialer> Service { /// Panics if hash is not in self.requests or node is not in self.nodes. fn start_download(&mut self, kind: DownloadKind, node: NodeId) { let node_info = self.connected_nodes.get_mut(&node).expect("node exists"); - let request_info = self.requests.get(&kind).expect("hash exists"); - - // create a progress sender and subscribe all intents to the progress sender - let subscribers = request_info - .intents - .values() - .flat_map(|state| state.on_progress.clone()); - let progress_sender = self.progress_tracker.track(kind, subscribers); + let request_info = self.requests.get_mut(&kind).expect("request exists"); + let progress = request_info.progress_sender.clone(); + // .expect("queued state exists"); // create the active request state let cancellation = CancellationToken::new(); @@ -1098,7 +1156,15 @@ impl, D: Dialer> Service { node, }; let conn = node_info.conn.clone(); - let get_fut = self.getter.get(kind, conn, progress_sender); + + // If this is the first provider node we try, we have an initial state + // from starting the generator in Self::handle_queue_new_download. + // If this not the first provider node we try, we have to recreate the generator, because + // we can only resume it once. + let get_state = match request_info.get_state.take() { + Some(state) => Either::Left(async move { Ok(GetOutput::NeedsConn(state)) }), + None => Either::Right(self.getter.get(kind, progress)), + }; let fut = async move { // NOTE: it's an open question if we should do timeouts at this point. Considerations from @Frando: // > at this stage we do not know the size of the download, so the timeout would have @@ -1106,9 +1172,16 @@ impl, D: Dialer> Service { // > this means that a super slow node would block a download from succeeding for a long // > time, while faster nodes could be readily available. // As a conclusion, timeouts should be added only after downloads are known to be bounded + let fut = async move { + match get_state.await? { + GetOutput::Complete(stats) => Ok(stats), + GetOutput::NeedsConn(state) => state.proceed(conn).await, + } + }; + tokio::pin!(fut); let res = tokio::select! { _ = cancellation.cancelled() => Err(FailureAction::AllIntentsDropped), - res = get_fut => res + res = &mut fut => res }; trace!("transfer finished"); @@ -1433,4 +1506,8 @@ impl Dialer for iroh_net::dialer::Dialer { fn is_pending(&self, node: NodeId) -> bool { self.is_pending(node) } + + fn node_id(&self) -> NodeId { + self.endpoint().node_id() + } } diff --git a/iroh-blobs/src/downloader/get.rs b/iroh-blobs/src/downloader/get.rs index e48370d42c..b43cbaba92 100644 --- a/iroh-blobs/src/downloader/get.rs +++ b/iroh-blobs/src/downloader/get.rs @@ -3,18 +3,13 @@ //! 
[`Connection`]: iroh_net::endpoint::Connection use crate::{ - get::{db::get_to_db, error::GetError}, + get::{db::get_to_db_in_steps, error::GetError}, store::Store, }; use futures_lite::FutureExt; -#[cfg(feature = "metrics")] -use iroh_metrics::{inc, inc_by}; use iroh_net::endpoint; -#[cfg(feature = "metrics")] -use crate::metrics::Metrics; - -use super::{progress::BroadcastProgressSender, DownloadKind, FailureAction, GetFut, Getter}; +use super::{progress::BroadcastProgressSender, DownloadKind, FailureAction, GetStartFut, Getter}; impl From for FailureAction { fn from(e: GetError) -> Self { @@ -39,46 +34,63 @@ pub(crate) struct IoGetter { impl Getter for IoGetter { type Connection = endpoint::Connection; + type NeedsConn = crate::get::db::GetStateNeedsConn; fn get( &mut self, kind: DownloadKind, - conn: Self::Connection, progress_sender: BroadcastProgressSender, - ) -> GetFut { + ) -> GetStartFut { let store = self.store.clone(); - let fut = async move { - let get_conn = || async move { Ok(conn) }; - let res = get_to_db(&store, get_conn, &kind.hash_and_format(), progress_sender).await; - match res { - Ok(stats) => { - #[cfg(feature = "metrics")] - { - let crate::get::Stats { - bytes_written, - bytes_read: _, - elapsed, - } = stats; - - inc!(Metrics, downloads_success); - inc_by!(Metrics, download_bytes_total, bytes_written); - inc_by!(Metrics, download_time_total, elapsed.as_millis() as u64); - } - Ok(stats) + async move { + match get_to_db_in_steps(store, kind.hash_and_format(), progress_sender).await { + Err(err) => Err(err.into()), + Ok(crate::get::db::GetState::Complete(stats)) => { + Ok(super::GetOutput::Complete(stats)) } - Err(e) => { - // record metrics according to the error - #[cfg(feature = "metrics")] - { - match &e { - GetError::NotFound(_) => inc!(Metrics, downloads_notfound), - _ => inc!(Metrics, downloads_error), - } - } - Err(e.into()) + Ok(crate::get::db::GetState::NeedsConn(needs_conn)) => { + Ok(super::GetOutput::NeedsConn(needs_conn)) } } - }; - fut.boxed_local() + } + .boxed_local() + } +} + +impl super::NeedsConn for crate::get::db::GetStateNeedsConn { + fn proceed(self, conn: endpoint::Connection) -> super::GetProceedFut { + async move { + let res = self.proceed(conn).await; + #[cfg(feature = "metrics")] + track_metrics(&res); + match res { + Ok(stats) => Ok(stats), + Err(err) => Err(err.into()), + } + } + .boxed_local() + } +} + +#[cfg(feature = "metrics")] +fn track_metrics(res: &Result) { + use crate::metrics::Metrics; + use iroh_metrics::{inc, inc_by}; + match res { + Ok(stats) => { + let crate::get::Stats { + bytes_written, + bytes_read: _, + elapsed, + } = stats; + + inc!(Metrics, downloads_success); + inc_by!(Metrics, download_bytes_total, *bytes_written); + inc_by!(Metrics, download_time_total, elapsed.as_millis() as u64); + } + Err(e) => match &e { + GetError::NotFound(_) => inc!(Metrics, downloads_notfound), + _ => inc!(Metrics, downloads_error), + }, } } diff --git a/iroh-blobs/src/downloader/invariants.rs b/iroh-blobs/src/downloader/invariants.rs index e4a2656368..0409e3d922 100644 --- a/iroh-blobs/src/downloader/invariants.rs +++ b/iroh-blobs/src/downloader/invariants.rs @@ -77,8 +77,8 @@ impl, D: Dialer> Service { // check that the count of futures we are polling for downloads is consistent with the // number of requests assert_eq!( - self.in_progress_downloads.len(), self.active_requests.len(), + self.in_progress_downloads.len(), "active_requests and in_progress_downloads are out of sync" ); // check that the count of requests per peer matches the 
number of requests that have that diff --git a/iroh-blobs/src/downloader/progress.rs b/iroh-blobs/src/downloader/progress.rs index eac80985d5..60ded0e7a5 100644 --- a/iroh-blobs/src/downloader/progress.rs +++ b/iroh-blobs/src/downloader/progress.rs @@ -103,6 +103,7 @@ struct Inner { impl Inner { fn subscribe(&mut self, subscriber: ProgressSubscriber) -> DownloadProgress { + tracing::warn!(state=?self.state, "subscribe! emit initial"); let msg = DownloadProgress::InitialState(self.state.clone()); self.subscribers.push(subscriber); msg @@ -136,7 +137,9 @@ impl ProgressSender for BroadcastProgressSender { // making sure that the lock is not held across an await point. let futs = { let mut inner = self.shared.lock(); + tracing::warn!(?msg, state_pre=?inner.state, "send to {}", inner.subscribers.len()); inner.on_progress(msg.clone()); + tracing::warn!(state_post=?inner.state, "send"); let futs = inner .subscribers .iter_mut() diff --git a/iroh-blobs/src/downloader/test.rs b/iroh-blobs/src/downloader/test.rs index 2e734eaf3b..b2bd4c751a 100644 --- a/iroh-blobs/src/downloader/test.rs +++ b/iroh-blobs/src/downloader/test.rs @@ -9,7 +9,10 @@ use futures_util::future::FutureExt; use iroh_net::key::SecretKey; use crate::{ - get::{db::BlobId, progress::TransferState}, + get::{ + db::BlobId, + progress::{BlobProgress, TransferState}, + }, util::{ local_pool::LocalPool, progress::{AsyncChannelProgressSender, IdGenerator}, @@ -286,16 +289,26 @@ async fn concurrent_progress() { let req = DownloadRequest::new(kind_1, vec![peer]).progress_sender(prog_b_tx); let handle_b = downloader.queue(req).await; - start_tx.send(()).unwrap(); - let mut state_a = TransferState::new(hash); let mut state_b = TransferState::new(hash); let mut state_c = TransferState::new(hash); + let prog0_b = prog_b_rx.recv().await.unwrap(); + assert!(matches!( + prog0_b, + DownloadProgress::InitialState(state) if state.root.hash == hash && state.root.progress == BlobProgress::Pending, + )); + + start_tx.send(()).unwrap(); + let prog1_a = prog_a_rx.recv().await.unwrap(); let prog1_b = prog_b_rx.recv().await.unwrap(); - assert!(matches!(prog1_a, DownloadProgress::Found { hash, size: 100, ..} if hash == hash)); - assert!(matches!(prog1_b, DownloadProgress::Found { hash, size: 100, ..} if hash == hash)); + assert!( + matches!(prog1_a, DownloadProgress::Found { hash: found_hash, size: 100, ..} if found_hash == hash) + ); + assert!( + matches!(prog1_b, DownloadProgress::Found { hash: found_hash, size: 100, ..} if found_hash == hash) + ); state_a.on_progress(prog1_a); state_b.on_progress(prog1_b); diff --git a/iroh-blobs/src/downloader/test/dialer.rs b/iroh-blobs/src/downloader/test/dialer.rs index d099552a11..fc5a939959 100644 --- a/iroh-blobs/src/downloader/test/dialer.rs +++ b/iroh-blobs/src/downloader/test/dialer.rs @@ -21,6 +21,8 @@ struct TestingDialerInner { dial_duration: Duration, /// Fn deciding if a dial is successful. 
dial_outcome: Box bool + Send + Sync + 'static>, + /// Our own node id + node_id: NodeId, } impl Default for TestingDialerInner { @@ -31,6 +33,7 @@ impl Default for TestingDialerInner { dial_history: Vec::default(), dial_duration: Duration::from_millis(10), dial_outcome: Box::new(|_| true), + node_id: NodeId::from_bytes(&[0u8; 32]).unwrap(), } } } @@ -55,6 +58,10 @@ impl Dialer for TestingDialer { fn is_pending(&self, node: NodeId) -> bool { self.0.read().dialing.contains(&node) } + + fn node_id(&self) -> NodeId { + self.0.read().node_id + } } impl Stream for TestingDialer { diff --git a/iroh-blobs/src/downloader/test/getter.rs b/iroh-blobs/src/downloader/test/getter.rs index 397f1134f1..c3686a71c4 100644 --- a/iroh-blobs/src/downloader/test/getter.rs +++ b/iroh-blobs/src/downloader/test/getter.rs @@ -3,9 +3,12 @@ use futures_lite::{future::Boxed as BoxFuture, FutureExt}; use parking_lot::RwLock; +use crate::downloader; + use super::*; -#[derive(Default, Clone)] +#[derive(Default, Clone, derive_more::Debug)] +#[debug("TestingGetter")] pub(super) struct TestingGetter(Arc>); pub(super) type RequestHandlerFn = Arc< @@ -34,14 +37,29 @@ impl Getter for TestingGetter { // since for testing we don't need a real connection, just keep track of what peer is the // request being sent to type Connection = NodeId; + type NeedsConn = GetStateNeedsConn; fn get( &mut self, kind: DownloadKind, - peer: NodeId, progress_sender: BroadcastProgressSender, - ) -> GetFut { - let mut inner = self.0.write(); + ) -> GetStartFut { + std::future::ready(Ok(downloader::GetOutput::NeedsConn(GetStateNeedsConn( + self.clone(), + kind, + progress_sender, + )))) + .boxed_local() + } +} + +#[derive(Debug)] +pub(super) struct GetStateNeedsConn(TestingGetter, DownloadKind, BroadcastProgressSender); + +impl downloader::NeedsConn for GetStateNeedsConn { + fn proceed(self, peer: NodeId) -> super::GetProceedFut { + let GetStateNeedsConn(getter, kind, progress_sender) = self; + let mut inner = getter.0.write(); inner.request_history.push((kind, peer)); let request_duration = inner.request_duration; let handler = inner.request_handler.clone(); diff --git a/iroh-blobs/src/get/db.rs b/iroh-blobs/src/get/db.rs index 08ef2f82c7..afcdea6972 100644 --- a/iroh-blobs/src/get/db.rs +++ b/iroh-blobs/src/get/db.rs @@ -3,12 +3,18 @@ use std::future::Future; use std::io; use std::num::NonZeroU64; +use std::pin::Pin; use futures_lite::StreamExt; +use genawaiter::{ + rc::{Co, Gen}, + GeneratorState, +}; use iroh_base::hash::Hash; use iroh_base::rpc::RpcError; use iroh_net::endpoint::Connection; use serde::{Deserialize, Serialize}; +use tokio::sync::oneshot; use crate::hashseq::parse_hash_seq; use crate::protocol::RangeSpec; @@ -34,6 +40,9 @@ use bao_tree::{ChunkNum, ChunkRanges}; use iroh_io::AsyncSliceReader; use tracing::trace; +type GetGenerator = Gen>>>>; +type GetFuture = Pin> + 'static>>; + /// Get a blob or collection into a store. /// /// This considers data that is already in the store, and will only request @@ -50,12 +59,105 @@ pub async fn get_to_db< db: &D, get_conn: C, hash_and_format: &HashAndFormat, - sender: impl ProgressSender + IdGenerator, + progress_sender: impl ProgressSender + IdGenerator, +) -> Result { + match get_to_db_in_steps(db.clone(), *hash_and_format, progress_sender).await? 
{ + GetState::Complete(res) => Ok(res), + GetState::NeedsConn(state) => { + let conn = get_conn().await.map_err(GetError::Io)?; + state.proceed(conn).await + } + } +} + +/// Get a blob or collection into a store, yielding if a connection is needed. +/// +/// This checks a get request against a local store, and returns [`GetState`], +/// which is either `Complete` in case the requested data is fully available in the local store, or +/// `NeedsConn`, once a connection is needed to proceed downloading the missing data. +/// +/// In the latter case, call [`GetStateNeedsConn::proceed`] with a connection to a provider to +/// proceed with the download. +/// +/// Progress reporting works in the same way as documented in [`get_to_db`]. +pub async fn get_to_db_in_steps< + D: BaoStore, + P: ProgressSender + IdGenerator, +>( + db: D, + hash_and_format: HashAndFormat, + progress_sender: P, +) -> Result { + let mut gen: GetGenerator = genawaiter::rc::Gen::new(move |co| { + let fut = async move { producer(co, &db, &hash_and_format, progress_sender).await }; + let fut: GetFuture = Box::pin(fut); + fut + }); + match gen.async_resume().await { + GeneratorState::Yielded(Yield::NeedConn(reply)) => { + Ok(GetState::NeedsConn(GetStateNeedsConn(gen, reply))) + } + GeneratorState::Complete(res) => res.map(GetState::Complete), + } +} + +/// Intermediary state returned from [`get_to_db_in_steps`] for a download request that needs a +/// connection to proceed. +#[derive(derive_more::Debug)] +#[debug("GetStateNeedsConn")] +pub struct GetStateNeedsConn(GetGenerator, oneshot::Sender); + +impl GetStateNeedsConn { + /// Proceed with the download by providing a connection to a provider. + pub async fn proceed(mut self, conn: Connection) -> Result { + self.1.send(conn).expect("receiver is not dropped"); + match self.0.async_resume().await { + GeneratorState::Yielded(y) => match y { + Yield::NeedConn(_) => panic!("NeedsConn may only be yielded once"), + }, + GeneratorState::Complete(res) => res, + } + } +} + +/// Output of [`get_to_db_in_steps`]. +#[derive(Debug)] +pub enum GetState { + /// The requested data is completely available in the local store, no network requests are + /// needed. + Complete(Stats), + /// The requested data is not fully available in the local store, we need a connection to + /// proceed. + /// + /// Once a connection is available, call [`GetStateNeedsConn::proceed`] to continue. + NeedsConn(GetStateNeedsConn), +} + +struct GetCo(Co); + +impl GetCo { + async fn get_conn(&self) -> Connection { + let (tx, rx) = oneshot::channel(); + self.0.yield_(Yield::NeedConn(tx)).await; + rx.await.expect("sender may not be dropped") + } +} + +enum Yield { + NeedConn(oneshot::Sender), +} + +async fn producer( + co: Co, + db: &D, + hash_and_format: &HashAndFormat, + progress: impl ProgressSender + IdGenerator, ) -> Result { let HashAndFormat { hash, format } = hash_and_format; + let co = GetCo(co); match format { - BlobFormat::Raw => get_blob(db, get_conn, hash, sender).await, - BlobFormat::HashSeq => get_hash_seq(db, get_conn, hash, sender).await, + BlobFormat::Raw => get_blob(db, co, hash, progress).await, + BlobFormat::HashSeq => get_hash_seq(db, co, hash, progress).await, } } @@ -63,9 +165,9 @@ pub async fn get_to_db< /// /// We need to create our own files and handle the case where an outboard /// is not needed. 
-async fn get_blob F, F: Future>>( +async fn get_blob( db: &D, - get_conn: C, + co: GetCo, hash: &Hash, progress: impl ProgressSender + IdGenerator, ) -> Result { @@ -100,7 +202,7 @@ async fn get_blob F, F: Future F, F: Future { // full request - let conn = get_conn().await.map_err(GetError::Io)?; + let conn = co.get_conn().await; let request = get::fsm::start(conn, GetRequest::single(*hash)); // create a new bidi stream let connected = request.next().await?; @@ -299,13 +401,9 @@ async fn blob_infos(db: &D, hash_seq: &[Hash]) -> io::Result F, - F: Future>, ->( +async fn get_hash_seq( db: &D, - get_conn: C, + co: GetCo, root_hash: &Hash, sender: impl ProgressSender + IdGenerator, ) -> Result { @@ -364,7 +462,7 @@ async fn get_hash_seq< .collect::>(); log!("requesting chunks {:?}", missing_iter); let request = GetRequest::new(*root_hash, RangeSpecSeq::from_ranges(missing_iter)); - let conn = get_conn().await.map_err(GetError::Io)?; + let conn = co.get_conn().await; let request = get::fsm::start(conn, request); // create a new bidi stream let connected = request.next().await?; @@ -410,7 +508,7 @@ async fn get_hash_seq< _ => { tracing::debug!("don't have collection - doing full download"); // don't have the collection, so probably got nothing - let conn = get_conn().await.map_err(GetError::Io)?; + let conn = co.get_conn().await; let request = get::fsm::start(conn, GetRequest::all(*root_hash)); // create a new bidi stream let connected = request.next().await?; diff --git a/iroh-net/src/dialer.rs b/iroh-net/src/dialer.rs index 7a7685d97b..8c37b08c08 100644 --- a/iroh-net/src/dialer.rs +++ b/iroh-net/src/dialer.rs @@ -99,6 +99,11 @@ impl Dialer { pub fn pending_count(&self) -> usize { self.pending_dials.len() } + + /// Returns a reference to the endpoint used in this dialer. + pub fn endpoint(&self) -> &Endpoint { + &self.endpoint + } } impl Stream for Dialer { diff --git a/iroh/src/client/blobs.rs b/iroh/src/client/blobs.rs index 3151c3fb1f..04e544e8b1 100644 --- a/iroh/src/client/blobs.rs +++ b/iroh/src/client/blobs.rs @@ -944,7 +944,10 @@ mod tests { use super::*; use anyhow::Context as _; + use iroh_blobs::hashseq::HashSeq; + use iroh_net::NodeId; use rand::RngCore; + use testresult::TestResult; use tokio::io::AsyncWriteExt; #[tokio::test] @@ -1248,4 +1251,185 @@ mod tests { Ok(()) } + + /// Download a existing blob from oneself + #[tokio::test] + async fn test_blob_get_self_existing() -> TestResult<()> { + let _guard = iroh_test::logging::setup(); + + let node = crate::node::Node::memory().spawn().await?; + let node_id = node.node_id(); + let client = node.client(); + + let AddOutcome { hash, size, .. } = client.blobs().add_bytes("foo").await?; + + // Direct + let res = client + .blobs() + .download_with_opts( + hash, + DownloadOptions { + format: BlobFormat::Raw, + nodes: vec![node_id.into()], + tag: SetTagOption::Auto, + mode: DownloadMode::Direct, + }, + ) + .await? + .await?; + + assert_eq!(res.local_size, size); + assert_eq!(res.downloaded_size, 0); + + // Queued + let res = client + .blobs() + .download_with_opts( + hash, + DownloadOptions { + format: BlobFormat::Raw, + nodes: vec![node_id.into()], + tag: SetTagOption::Auto, + mode: DownloadMode::Queued, + }, + ) + .await? 
+ .await?; + + assert_eq!(res.local_size, size); + assert_eq!(res.downloaded_size, 0); + + Ok(()) + } + + /// Download a missing blob from oneself + #[tokio::test] + async fn test_blob_get_self_missing() -> TestResult<()> { + let _guard = iroh_test::logging::setup(); + + let node = crate::node::Node::memory().spawn().await?; + let node_id = node.node_id(); + let client = node.client(); + + let hash = Hash::from_bytes([0u8; 32]); + + // Direct + let res = client + .blobs() + .download_with_opts( + hash, + DownloadOptions { + format: BlobFormat::Raw, + nodes: vec![node_id.into()], + tag: SetTagOption::Auto, + mode: DownloadMode::Direct, + }, + ) + .await? + .await; + assert!(res.is_err()); + assert_eq!( + res.err().unwrap().to_string().as_str(), + "No nodes to download from provided" + ); + + // Queued + let res = client + .blobs() + .download_with_opts( + hash, + DownloadOptions { + format: BlobFormat::Raw, + nodes: vec![node_id.into()], + tag: SetTagOption::Auto, + mode: DownloadMode::Queued, + }, + ) + .await? + .await; + assert!(res.is_err()); + assert_eq!( + res.err().unwrap().to_string().as_str(), + "No provider nodes found" + ); + + Ok(()) + } + + /// Download a existing collection. Check that things succeed and no download is performed. + #[tokio::test] + async fn test_blob_get_existing_collection() -> TestResult<()> { + let _guard = iroh_test::logging::setup(); + + let node = crate::node::Node::memory().spawn().await?; + // We use a nonexisting node id because we just want to check that this succeeds without + // hitting the network. + let node_id = NodeId::from_bytes(&[0u8; 32])?; + let client = node.client(); + + let mut collection = Collection::default(); + let mut tags = Vec::new(); + let mut size = 0; + for value in ["iroh", "is", "cool"] { + let import_outcome = client.blobs().add_bytes(value).await.context("add bytes")?; + collection.push(value.to_string(), import_outcome.hash); + tags.push(import_outcome.tag); + size += import_outcome.size; + } + + let (hash, _tag) = client + .blobs() + .create_collection(collection, SetTagOption::Auto, tags) + .await?; + + // load the hashseq and collection header manually to calculate our expected size + let hashseq_bytes = client.blobs().read_to_bytes(hash).await?; + size += hashseq_bytes.len() as u64; + let hashseq = HashSeq::try_from(hashseq_bytes)?; + let collection_header_bytes = client + .blobs() + .read_to_bytes(hashseq.into_iter().next().expect("header to exist")) + .await?; + size += collection_header_bytes.len() as u64; + + // Direct + let res = client + .blobs() + .download_with_opts( + hash, + DownloadOptions { + format: BlobFormat::HashSeq, + nodes: vec![node_id.into()], + tag: SetTagOption::Auto, + mode: DownloadMode::Direct, + }, + ) + .await? + .await + .context("direct (download)")?; + + assert_eq!(res.local_size, size); + assert_eq!(res.downloaded_size, 0); + + // Queued + let res = client + .blobs() + .download_with_opts( + hash, + DownloadOptions { + format: BlobFormat::HashSeq, + nodes: vec![node_id.into()], + tag: SetTagOption::Auto, + mode: DownloadMode::Queued, + }, + ) + .await? 
+ .await + .context("queued")?; + + assert_eq!(res.local_size, size); + assert_eq!(res.downloaded_size, 0); + + Ok(()) + } } diff --git a/iroh/src/node/rpc.rs b/iroh/src/node/rpc.rs index 467e91d402..e51e233ce8 100644 --- a/iroh/src/node/rpc.rs +++ b/iroh/src/node/rpc.rs @@ -3,12 +3,11 @@ use std::io; use std::sync::{Arc, Mutex}; use std::time::Duration; -use anyhow::{anyhow, ensure, Result}; +use anyhow::{anyhow, Result}; use futures_buffered::BufferedStreamExt; use futures_lite::{Stream, StreamExt}; use genawaiter::sync::{Co, Gen}; use iroh_base::rpc::{RpcError, RpcResult}; -use iroh_blobs::downloader::{DownloadRequest, Downloader}; use iroh_blobs::export::ExportProgress; use iroh_blobs::format::collection::Collection; use iroh_blobs::get::db::DownloadProgress; @@ -18,6 +17,10 @@ use iroh_blobs::util::local_pool::LocalPoolHandle; use iroh_blobs::util::progress::{AsyncChannelProgressSender, ProgressSender}; use iroh_blobs::util::SetTagOption; use iroh_blobs::BlobFormat; +use iroh_blobs::{ + downloader::{DownloadRequest, Downloader}, + get::db::GetState, +}; use iroh_blobs::{ provider::AddProgress, store::{Store as BaoStore, ValidateProgress}, @@ -1191,6 +1194,7 @@ async fn download_queued( Ok(stats) } +#[tracing::instrument("download_direct", skip_all, fields(hash=%hash_and_format.hash.fmt_short()))] async fn download_direct_from_nodes( db: &D, endpoint: Endpoint, @@ -1201,51 +1205,61 @@ async fn download_direct_from_nodes( where D: BaoStore, { - ensure!(!nodes.is_empty(), "No nodes to download from provided."); let mut last_err = None; - for node in nodes { - let node_id = node.node_id; - match download_direct( - db, - endpoint.clone(), - hash_and_format, - node, - progress.clone(), - ) - .await + let mut remaining_nodes = nodes.len(); + let mut nodes_iter = nodes.into_iter(); + 'outer: loop { + match iroh_blobs::get::db::get_to_db_in_steps(db.clone(), hash_and_format, progress.clone()) + .await? 
{ - Ok(stats) => return Ok(stats), - Err(err) => { - debug!(?err, node = &node_id.fmt_short(), "Download failed"); - last_err = Some(err) + GetState::Complete(stats) => return Ok(stats), + GetState::NeedsConn(needs_conn) => { + let (conn, node_id) = 'inner: loop { + match nodes_iter.next() { + None => break 'outer, + Some(node) => { + remaining_nodes -= 1; + let node_id = node.node_id; + if node_id == endpoint.node_id() { + debug!( + ?remaining_nodes, + "skip node {} (it is the node id of ourselves)", + node_id.fmt_short() + ); + continue 'inner; + } + match endpoint.connect(node, iroh_blobs::protocol::ALPN).await { + Ok(conn) => break 'inner (conn, node_id), + Err(err) => { + debug!( + ?remaining_nodes, + "failed to connect to {}: {err}", + node_id.fmt_short() + ); + continue 'inner; + } + } + } + } + }; + match needs_conn.proceed(conn).await { + Ok(stats) => return Ok(stats), + Err(err) => { + warn!( + ?remaining_nodes, + "failed to download from {}: {err}", + node_id.fmt_short() + ); + last_err = Some(err); + } + } } } } - Err(last_err.unwrap()) -} - -async fn download_direct( - db: &D, - endpoint: Endpoint, - hash_and_format: HashAndFormat, - node: NodeAddr, - progress: AsyncChannelProgressSender, -) -> Result -where - D: BaoStore, -{ - let get_conn = { - let progress = progress.clone(); - move || async move { - let conn = endpoint.connect(node, iroh_blobs::protocol::ALPN).await?; - progress.send(DownloadProgress::Connected).await?; - Ok(conn) - } - }; - - let res = iroh_blobs::get::db::get_to_db(db, get_conn, &hash_and_format, progress).await; - - res.map_err(Into::into) + match last_err { + Some(err) => Err(err.into()), + None => Err(anyhow!("No nodes to download from provided")), + } } fn docs_disabled() -> RpcError { From da2e10c0a1fd0def8813991eeb296ee9dc8c5030 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 5 Aug 2024 19:05:54 +0200 Subject: [PATCH 27/45] chore(deps): bump mozilla-actions/sccache-action from 0.0.4 to 0.0.5 in the github-actions group (#2531) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Bumps the github-actions group with 1 update: [mozilla-actions/sccache-action](https://github.com/mozilla-actions/sccache-action). Updates `mozilla-actions/sccache-action` from 0.0.4 to 0.0.5
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Co-authored-by: Floris Bruynooghe Co-authored-by: Friedel Ziegelmayer --- .github/workflows/ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index d1bbff6ef1..689c48e048 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -324,7 +324,7 @@ jobs: uses: dtolnay/rust-toolchain@stable - name: Install sccache - uses: mozilla-actions/sccache-action@v0.0.4 + uses: mozilla-actions/sccache-action@v0.0.5 - name: Build iroh run: | From 55836fa5ca56fe6964be52046bb0c7f77e62b647 Mon Sep 17 00:00:00 2001 From: Kasey Date: Mon, 5 Aug 2024 13:42:10 -0400 Subject: [PATCH 28/45] docs(iroh): add documentations and examples for the `iroh::node::Client` (#2582) ## Description Happy Docs Day! Added documentation to the `iroh::node::Client`, including examples. ## Change checklist - [x] Self-review. - [x] Documentation updates following the [style guide](https://rust-lang.github.io/rfcs/1574-more-api-documentation-conventions.html#appendix-a-full-conventions-text), if relevant. --------- Co-authored-by: Kasey Huizinga --- Cargo.lock | 1 + iroh/Cargo.toml | 3 ++ iroh/src/client/node.rs | 81 ++++++++++++++++++++++++++++++++++++++++- 3 files changed, 84 insertions(+), 1 deletion(-) diff --git a/Cargo.lock b/Cargo.lock index af8e7a5224..32932a5825 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2525,6 +2525,7 @@ dependencies = [ "tokio-util", "tracing", "tracing-subscriber", + "url", "walkdir", ] diff --git a/iroh/Cargo.toml b/iroh/Cargo.toml index 855f65803a..69ec85d66b 100644 --- a/iroh/Cargo.toml +++ b/iroh/Cargo.toml @@ -59,6 +59,9 @@ indicatif = { version = "0.17", features = ["tokio"], optional = true } ref-cast = "1.0.23" console = { version = "0.15.5", optional = true } +# Documentation tests +url = { version = "2.5.0", features = ["serde"] } + [features] default = ["metrics", "fs-store"] metrics = ["iroh-metrics", "iroh-blobs/metrics"] diff --git a/iroh/src/client/node.rs b/iroh/src/client/node.rs index 265d3c2a00..c12c5bd5bb 100644 --- a/iroh/src/client/node.rs +++ b/iroh/src/client/node.rs @@ -29,7 +29,86 @@ use crate::rpc_protocol::node::{ use super::{flatten, RpcClient}; -/// Iroh node client. +/// Iroh node Client. +/// +/// Cheaply clonable and threadsafe. Use the iroh `node::Client` to access the +/// iroh node methods from a different thread, process, or remote machine. +/// The [`Iroh`](crate::client::Iroh) client dereferences to a `node::Client`, +/// so you have access to this api from the [`Iroh`](crate::client::Iroh) client itself. +/// +/// The `node::Client` api allows you to get information *about* the iroh node, +/// its status, and connection status to other nodes. It also allows you to +/// provide address information about *other* nodes to your node. +/// +/// Obtain an iroh `node::Client` via [`Iroh::node()`](crate::client::Iroh::node). +/// +/// It also provides a way to [shutdown](Client::shutdown) the entire iroh node. 
+/// +/// # Examples +/// ``` +/// use std::str::FromStr; +/// use iroh_base::{key::NodeId, node_addr::{RelayUrl, NodeAddr}}; +/// use url::Url; +/// +/// # async fn run() -> anyhow::Result<()> { +/// // Create an iroh node: +/// let iroh = iroh::node::Node::memory().spawn().await?; +/// // Create a node client, a client that gives you access to `node` subsystem +/// let node_client = iroh.client().node(); +/// // Get the node status, including its node id, addresses, the version of iroh +/// // it is running, and more. +/// let status = node_client.status().await?; +/// println!("Node status: {status:?}"); +/// // Provide your node an address for another node +/// let relay_url = RelayUrl::from(Url::parse("https://example.com").unwrap()); +/// let addr = NodeAddr::from_parts( +/// // the node_id +/// NodeId::from_str("ae58ff8833241ac82d6ff7611046ed67b5072d142c588d0063e942d9a75502b6").unwrap(), +/// // the home relay +/// Some(relay_url), +/// // the direct addresses +/// vec!["120.0.0.1:0".parse().unwrap()], +/// ); +/// node_client.add_node_addr(addr).await?; +/// // Shut down the node. Passing `true` will force the shutdown, passing in +/// // `false` will allow the node to shut down gracefully. +/// node_client.shutdown(false).await?; +/// # Ok(()) +/// # } +/// ``` +/// You can also use the `node::Client` methods from the `Iroh` client: +/// +/// ``` +/// use std::str::FromStr; +/// use iroh_base::{key::NodeId, node_addr::{RelayUrl, NodeAddr}}; +/// use url::Url; +/// +/// # async fn run() -> anyhow::Result<()> { +/// // Create an iroh node: +/// let iroh = iroh::node::Node::memory().spawn().await?; +/// // Create a client: +/// let client = iroh.client(); +/// // Get the node status, including its node id, addresses, the version of iroh +/// // it is running, and more. +/// let status = client.status().await?; +/// println!("Node status: {status:?}"); +/// // Provide your node an address for another node +/// let relay_url = RelayUrl::from(Url::parse("https://example.com").unwrap()); +/// let addr = NodeAddr::from_parts( +/// // the node_id +/// NodeId::from_str("ae58ff8833241ac82d6ff7611046ed67b5072d142c588d0063e942d9a75502b6").unwrap(), +/// // the home relay +/// Some(relay_url), +/// // the direct addresses +/// vec!["120.0.0.1:0".parse().unwrap()], +/// ); +/// client.add_node_addr(addr).await?; +/// // Shut down the node. Passing `true` will force the shutdown, passing in +/// // `false` will allow the node to shut down gracefully. +/// client.shutdown(false).await?; +/// # Ok(()) +/// # } +/// ``` #[derive(Debug, Clone, RefCast)] #[repr(transparent)] pub struct Client { From bdc1c45b8251d23773ec6064416a3ea2f4264df9 Mon Sep 17 00:00:00 2001 From: Franz Heinzmann Date: Mon, 5 Aug 2024 21:04:54 +0200 Subject: [PATCH 29/45] [wip] refactor(iroh-gossip)!: dispatch gossip events and updates by topic (#2570) ## Description This PR changes the main public `iroh_gossip` to keep track of client-side gossip subscriptions. The `net::Gossip` struct now keeps track of client-side subscribers per topic, which are made up of a pair of two streams/channels: from the client to the actor a stream of updates (outgoing messages) and from the actor to the client a stream of events (incoming messages). Once all client streams&sinks for a topic are dropped, the topic is being quit. This builds on the client API added in #2258, but completely removes the `dispatcher` module, integrating its features directly into the gossip actor. See below for a short list of the API changes. 
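As a rough usage sketch (assuming the names this PR introduces; `gossip`,
`topic` and `bootstrap` are placeholders and error handling is elided):

```rust
use futures_lite::StreamExt;
use iroh_gossip::net::{Event, GossipEvent, JoinOptions};

// Join a topic; the returned handle can be split into a sender and a receiver.
let sub = gossip.join_with_opts(topic, JoinOptions::with_bootstrap(bootstrap));
let (sender, mut receiver) = sub.split();
sender.broadcast("hello".into()).await?;
while let Some(event) = receiver.try_next().await? {
    if let Event::Gossip(GossipEvent::Received(msg)) = event {
        println!("received: {:?}", msg.content);
    }
}
// Once every sender and receiver for the topic is dropped, the topic is quit.
```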
The new API can be browsed [here](https://n0-computer.github.io/iroh/pr/2570/docs/iroh/gossip/net/index.html). The refactor turned out bigger than initially intended, sorry for that, but I did not see a good way to reduce the scope. What's still missing (can also be follow-ups)?: - [ ] Review the new public API - [ ] Align the client API to the iroh_gossip API. The `GossipTopic` can be made to work on both the client and the native API, as it only deals with streams and sinks. ## Breaking Changes * `iroh_gossip::dispatcher` is removed with everything that was in it. use the new API from `iroh_gossip::net::Gossip` instead (see below). * `iroh_gossip::net::Gossip` methods changed: * changed: `join` now returns a `GossipTopic` * removed: `broadcast`, `broadcast_neighbors`, `subscribe`, `subscribe_all`, `quit`. * for `subscribe` use `join` instead, which returns a `GossipTopic` * for `broadcast` and `broadcast_neighbors` use the respective methods on `GossipTopic` . * `quit` is obsolete now, the topic will be quitted once all `GossipTopic` handles are dropped. * `subscribe_all` is no longer available * `iroh_gossip::net::JoinTopicFut` is removed (is now obsolete) ## Notes & open questions ## Change checklist - [x] Self-review. - [x] Documentation updates following the [style guide](https://rust-lang.github.io/rfcs/1574-more-api-documentation-conventions.html#appendix-a-full-conventions-text), if relevant. - [ ] Tests if relevant. - [ ] All breaking changes documented. --- Cargo.lock | 2 +- iroh-cli/src/commands/gossip.rs | 8 +- iroh-docs/src/engine.rs | 11 +- iroh-docs/src/engine/gossip.rs | 283 ++++++----- iroh-docs/src/engine/live.rs | 87 ++-- iroh-gossip/Cargo.toml | 27 +- iroh-gossip/examples/chat.rs | 40 +- iroh-gossip/src/dispatcher.rs | 503 -------------------- iroh-gossip/src/lib.rs | 2 - iroh-gossip/src/net.rs | 811 +++++++++++++++++++------------- iroh-gossip/src/net/handles.rs | 254 ++++++++++ iroh-gossip/src/proto/state.rs | 10 +- iroh/src/client/gossip.rs | 8 +- iroh/src/metrics.rs | 5 - iroh/src/node.rs | 2 - iroh/src/node/builder.rs | 7 +- iroh/src/node/rpc.rs | 9 +- iroh/src/rpc_protocol/gossip.rs | 4 +- iroh/tests/client.rs | 38 +- 19 files changed, 971 insertions(+), 1140 deletions(-) delete mode 100644 iroh-gossip/src/dispatcher.rs create mode 100644 iroh-gossip/src/net/handles.rs diff --git a/Cargo.lock b/Cargo.lock index 32932a5825..87bf9e4359 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2781,9 +2781,9 @@ dependencies = [ "clap", "derive_more", "ed25519-dalek", + "futures-concurrency", "futures-lite 2.3.0", "futures-util", - "genawaiter", "indexmap 2.2.6", "iroh-base", "iroh-blake3", diff --git a/iroh-cli/src/commands/gossip.rs b/iroh-cli/src/commands/gossip.rs index 0ab236bdce..7995491fd0 100644 --- a/iroh-cli/src/commands/gossip.rs +++ b/iroh-cli/src/commands/gossip.rs @@ -65,7 +65,7 @@ impl GossipCommands { line = input_lines.next_line() => { let line = line.context("failed to read from stdin")?; if let Some(line) = line { - sink.send(iroh_gossip::dispatcher::Command::Broadcast(line.into())).await?; + sink.send(iroh_gossip::net::Command::Broadcast(line.into())).await?; } else { break; } @@ -73,14 +73,14 @@ impl GossipCommands { res = stream.next() => { let res = res.context("gossip stream ended")?.context("failed to read gossip stream")?; match res { - iroh_gossip::dispatcher::Event::Gossip(event) => { + iroh_gossip::net::Event::Gossip(event) => { if verbose { println!("{:?}", event); - } else if let 
iroh_gossip::dispatcher::GossipEvent::Received(iroh_gossip::dispatcher::Message { content, .. }) = event { + } else if let iroh_gossip::net::GossipEvent::Received(iroh_gossip::net::Message { content, .. }) = event { println!("{:?}", content); } } - iroh_gossip::dispatcher::Event::Lagged => { + iroh_gossip::net::Event::Lagged => { anyhow::bail!("gossip stream lagged"); } }; diff --git a/iroh-docs/src/engine.rs b/iroh-docs/src/engine.rs index f6a2ae73aa..332497a78e 100644 --- a/iroh-docs/src/engine.rs +++ b/iroh-docs/src/engine.rs @@ -23,7 +23,6 @@ use tracing::{error, error_span, Instrument}; use crate::{actor::SyncHandle, ContentStatus, ContentStatusCallback, Entry, NamespaceId}; use crate::{Author, AuthorId}; -use self::gossip::GossipActor; use self::live::{LiveActor, ToLiveActor}; pub use self::live::SyncEvent; @@ -69,7 +68,6 @@ impl Engine { default_author_storage: DefaultAuthorStorage, ) -> anyhow::Result { let (live_actor_tx, to_live_actor_recv) = mpsc::channel(ACTOR_CHANNEL_CAP); - let (to_gossip_actor, to_gossip_actor_recv) = mpsc::channel(ACTOR_CHANNEL_CAP); let me = endpoint.node_id().fmt_short(); let content_status_cb = { @@ -86,17 +84,10 @@ impl Engine { downloader, to_live_actor_recv, live_actor_tx.clone(), - to_gossip_actor, - ); - let gossip_actor = GossipActor::new( - to_gossip_actor_recv, - sync.clone(), - gossip, - live_actor_tx.clone(), ); let actor_handle = tokio::task::spawn( async move { - if let Err(err) = actor.run(gossip_actor).await { + if let Err(err) = actor.run().await { error!("sync actor failed: {err:?}"); } } diff --git a/iroh-docs/src/engine/gossip.rs b/iroh-docs/src/engine/gossip.rs index 210356c969..ab261ef1e6 100644 --- a/iroh-docs/src/engine/gossip.rs +++ b/iroh-docs/src/engine/gossip.rs @@ -1,170 +1,156 @@ -use std::collections::HashSet; +use std::collections::{hash_map, HashMap}; use anyhow::{Context, Result}; +use bytes::Bytes; use futures_lite::StreamExt; use futures_util::FutureExt; -use iroh_gossip::net::{Event, Gossip}; -use iroh_metrics::inc; -use iroh_net::key::PublicKey; +use iroh_gossip::net::{Event, Gossip, GossipEvent, GossipReceiver, GossipSender, JoinOptions}; +use iroh_net::NodeId; use tokio::{ - sync::{broadcast, mpsc}, - task::JoinSet, + sync::mpsc, + task::{AbortHandle, JoinSet}, }; -use tokio_stream::{ - wrappers::{errors::BroadcastStreamRecvError, BroadcastStream}, - StreamMap, -}; -use tracing::{debug, error, trace, warn}; +use tracing::{debug, instrument, warn}; -use crate::metrics::Metrics; use crate::{actor::SyncHandle, ContentStatus, NamespaceId}; use super::live::{Op, ToLiveActor}; -#[derive(strum::Display, Debug)] -pub enum ToGossipActor { - Shutdown, - Join { - namespace: NamespaceId, - peers: Vec, - }, - Leave { - namespace: NamespaceId, - }, +#[derive(Debug)] +struct ActiveState { + sender: GossipSender, + abort_handle: AbortHandle, } -/// This actor subscribes to all gossip events. When receiving entries, they are inserted in the -/// replica (if open). Other events are forwarded to the main actor to be handled there. 
-pub struct GossipActor { - inbox: mpsc::Receiver, - sync: SyncHandle, +#[derive(Debug)] +pub struct GossipState { gossip: Gossip, - to_sync_actor: mpsc::Sender, - joined: HashSet, - want_join: HashSet, - pending_joins: JoinSet<(NamespaceId, Result>)>, - gossip_events: StreamMap>, + sync: SyncHandle, + to_live_actor: mpsc::Sender, + active: HashMap, + active_tasks: JoinSet<(NamespaceId, Result<()>)>, } -impl GossipActor { - pub fn new( - inbox: mpsc::Receiver, - sync: SyncHandle, - gossip: Gossip, - to_sync_actor: mpsc::Sender, - ) -> Self { +impl GossipState { + pub fn new(gossip: Gossip, sync: SyncHandle, to_live_actor: mpsc::Sender) -> Self { Self { - inbox, - sync, gossip, - to_sync_actor, - joined: Default::default(), - want_join: Default::default(), - pending_joins: Default::default(), - gossip_events: Default::default(), + sync, + to_live_actor, + active: Default::default(), + active_tasks: Default::default(), } } - pub async fn run(&mut self) -> anyhow::Result<()> { - let mut i = 0; - loop { - i += 1; - trace!(?i, "tick wait"); - inc!(Metrics, doc_gossip_tick_main); - tokio::select! { - next = self.gossip_events.next(), if !self.gossip_events.is_empty() => { - trace!(?i, "tick: gossip_event"); - inc!(Metrics, doc_gossip_tick_event); - if let Err(err) = self.on_gossip_event(next).await { - error!("gossip actor died: {err:?}"); - return Err(err); - } - }, - msg = self.inbox.recv() => { - let msg = msg.context("to_actor closed")?; - trace!(%msg, ?i, "tick: to_actor"); - inc!(Metrics, doc_gossip_tick_actor); - if !self.on_actor_message(msg).await.context("on_actor_message")? { - break; - } - } - Some(res) = self.pending_joins.join_next(), if !self.pending_joins.is_empty() => { - trace!(?i, "tick: pending_joins"); - inc!(Metrics, doc_gossip_tick_pending_join); - let (namespace, res) = res.context("pending_joins closed")?; - match res { - Ok(stream) => { - debug!(namespace = %namespace.fmt_short(), "joined gossip"); - self.joined.insert(namespace); - let stream = BroadcastStream::new(stream); - self.gossip_events.insert(namespace, stream); - }, - Err(err) => { - if self.want_join.contains(&namespace) { - error!(?namespace, ?err, "failed to join gossip"); - } - } - } - } + pub async fn join(&mut self, namespace: NamespaceId, bootstrap: Vec) -> Result<()> { + match self.active.entry(namespace) { + hash_map::Entry::Occupied(entry) => { + if !bootstrap.is_empty() { + entry.get().sender.join_peers(bootstrap).await?; + } + } + hash_map::Entry::Vacant(entry) => { + let sub = self + .gossip + .join_with_opts(namespace.into(), JoinOptions::with_bootstrap(bootstrap)); + let (sender, stream) = sub.split(); + let abort_handle = self.active_tasks.spawn( + receive_loop( + namespace, + stream, + self.to_live_actor.clone(), + self.sync.clone(), + ) + .map(move |res| (namespace, res)), + ); + entry.insert(ActiveState { + sender, + abort_handle, + }); } } Ok(()) } - async fn on_actor_message(&mut self, msg: ToGossipActor) -> anyhow::Result { - match msg { - ToGossipActor::Shutdown => { - for namespace in self.joined.iter() { - self.gossip.quit((*namespace).into()).await.ok(); - } - return Ok(false); - } - ToGossipActor::Join { namespace, peers } => { - debug!(?namespace, peers = peers.len(), "join gossip"); - let gossip = self.gossip.clone(); - // join gossip for the topic to receive and send message - let fut = async move { - let stream = gossip.subscribe(namespace.into()).await?; - let _topic = gossip.join(namespace.into(), peers).await?.await?; - Ok(stream) - }; - let fut = fut.map(move |res| 
(namespace, res)); - self.want_join.insert(namespace); - self.pending_joins.spawn(fut); - } - ToGossipActor::Leave { namespace } => { - self.gossip.quit(namespace.into()).await?; - self.joined.remove(&namespace); - self.want_join.remove(&namespace); - } + pub fn quit(&mut self, topic: &NamespaceId) { + if let Some(state) = self.active.remove(topic) { + state.abort_handle.abort(); } - Ok(true) } - async fn on_gossip_event( - &mut self, - event: Option<(NamespaceId, Result)>, - ) -> Result<()> { - let (namespace, event) = event.context("Gossip event channel closed")?; - let event = match event { - Ok(event) => event, - Err(BroadcastStreamRecvError::Lagged(n)) => { - warn!("GossipActor too slow (lagged by {n}) - dropping gossip event"); - return Ok(()); - } - }; - if !self.joined.contains(&namespace) && !self.want_join.contains(&namespace) { - error!(namespace = %namespace.fmt_short(), "received gossip event for unknown topic"); - return Ok(()); + + pub async fn shutdown(&mut self) -> Result<()> { + for (_, state) in self.active.drain() { + state.abort_handle.abort(); } - if let Err(err) = self.on_gossip_event_inner(namespace, event).await { - error!(namespace = %namespace.fmt_short(), ?err, "Failed to process gossip event"); + self.progress().await + } + + pub async fn broadcast(&self, namespace: &NamespaceId, message: Bytes) { + if let Some(state) = self.active.get(namespace) { + state.sender.broadcast(message).await.ok(); + } + } + + pub async fn broadcast_neighbors(&self, namespace: &NamespaceId, message: Bytes) { + if let Some(state) = self.active.get(namespace) { + state.sender.broadcast_neighbors(message).await.ok(); + } + } + + pub fn max_message_size(&self) -> usize { + self.gossip.max_message_size() + } + + pub fn is_empty(&self) -> bool { + self.active.is_empty() + } + + /// Progress the internal task queues. + /// + /// Returns an error if any of the active tasks panic. + /// + /// ## Cancel safety + /// + /// This function is fully cancel-safe. + pub async fn progress(&mut self) -> Result<()> { + while let Some(res) = self.active_tasks.join_next().await { + match res { + Err(err) if err.is_cancelled() => continue, + Err(err) => return Err(err).context("gossip receive loop panicked"), + Ok((namespace, res)) => { + self.active.remove(&namespace); + if let Err(err) = res { + warn!(?err, ?namespace, "gossip receive loop failed") + } + } + } } Ok(()) } +} - async fn on_gossip_event_inner(&mut self, namespace: NamespaceId, event: Event) -> Result<()> { +#[instrument("gossip-recv", skip_all, fields(namespace=%namespace.fmt_short()))] +async fn receive_loop( + namespace: NamespaceId, + mut recv: GossipReceiver, + to_sync_actor: mpsc::Sender, + sync: SyncHandle, +) -> Result<()> { + for peer in recv.neighbors() { + to_sync_actor + .send(ToLiveActor::NeighborUp { namespace, peer }) + .await?; + } + while let Some(event) = recv.try_next().await? 
{ + let event = match event { + Event::Gossip(event) => event, + Event::Lagged => { + debug!("gossip loop lagged - dropping gossip event"); + continue; + } + }; match event { - Event::Received(msg) => { + GossipEvent::Received(msg) => { let op: Op = postcard::from_bytes(&msg.content)?; match op { Op::Put(entry) => { @@ -180,12 +166,15 @@ impl GossipActor { false => ContentStatus::Missing, }; let from = *msg.delivered_from.as_bytes(); - self.sync + if let Err(err) = sync .insert_remote(namespace, entry, from, content_status) - .await?; + .await + { + debug!("ignoring entry received via gossip: {err}"); + } } Op::ContentReady(hash) => { - self.to_sync_actor + to_sync_actor .send(ToLiveActor::NeighborContentReady { namespace, node: msg.delivered_from, @@ -194,7 +183,7 @@ impl GossipActor { .await?; } Op::SyncReport(report) => { - self.to_sync_actor + to_sync_actor .send(ToLiveActor::IncomingSyncReport { from: msg.delivered_from, report, @@ -203,20 +192,24 @@ impl GossipActor { } } } - // A new neighbor appeared in the gossip swarm. Try to sync with it directly. - // [Self::sync_with_peer] will check to not resync with peers synced previously in the - // same session. TODO: Maybe this is too broad and leads to too many sync requests. - Event::NeighborUp(peer) => { - self.to_sync_actor + GossipEvent::NeighborUp(peer) => { + to_sync_actor .send(ToLiveActor::NeighborUp { namespace, peer }) .await?; } - Event::NeighborDown(peer) => { - self.to_sync_actor + GossipEvent::NeighborDown(peer) => { + to_sync_actor .send(ToLiveActor::NeighborDown { namespace, peer }) .await?; } + GossipEvent::Joined(peers) => { + for peer in peers { + to_sync_actor + .send(ToLiveActor::NeighborUp { namespace, peer }) + .await?; + } + } } - Ok(()) } + Ok(()) } diff --git a/iroh-docs/src/engine/live.rs b/iroh-docs/src/engine/live.rs index 6e49536baa..84cb52e953 100644 --- a/iroh-docs/src/engine/live.rs +++ b/iroh-docs/src/engine/live.rs @@ -9,7 +9,7 @@ use iroh_blobs::downloader::{DownloadError, DownloadRequest, Downloader}; use iroh_blobs::get::Stats; use iroh_blobs::HashAndFormat; use iroh_blobs::{store::EntryStatus, Hash}; -use iroh_gossip::{net::Gossip, proto::TopicId}; +use iroh_gossip::net::Gossip; use iroh_metrics::inc; use iroh_net::NodeId; use iroh_net::{key::PublicKey, Endpoint, NodeAddr}; @@ -18,9 +18,8 @@ use tokio::{ sync::{self, mpsc, oneshot}, task::JoinSet, }; -use tracing::{debug, error, error_span, info, instrument, trace, warn, Instrument, Span}; +use tracing::{debug, error, info, instrument, trace, warn, Instrument, Span}; -use crate::metrics::Metrics; use crate::{ actor::{OpenOpts, SyncHandle}, net::{ @@ -29,8 +28,9 @@ use crate::{ }, AuthorHeads, ContentStatus, NamespaceId, SignedEntry, }; +use crate::{engine::gossip::GossipState, metrics::Metrics}; -use super::gossip::{GossipActor, ToGossipActor}; +// use super::gossip::{GossipActor, ToGossipActor}; use super::state::{NamespaceStates, Origin, SyncReason}; /// Name used for logging when new node addresses are added from the docs engine. @@ -150,7 +150,6 @@ pub struct LiveActor { inbox: mpsc::Receiver, sync: SyncHandle, endpoint: Endpoint, - gossip: Gossip, bao_store: B, downloader: Downloader, replica_events_tx: async_channel::Sender, @@ -160,7 +159,7 @@ pub struct LiveActor { /// Note: Must not be used in methods called from `Self::run` directly to prevent deadlocks. /// Only clone into newly spawned tasks. sync_actor_tx: mpsc::Sender, - gossip_actor_tx: mpsc::Sender, + gossip: GossipState, /// Running sync futures (from connect). 
running_sync_connect: JoinSet, @@ -190,20 +189,19 @@ impl LiveActor { downloader: Downloader, inbox: mpsc::Receiver, sync_actor_tx: mpsc::Sender, - gossip_actor_tx: mpsc::Sender, ) -> Self { let (replica_events_tx, replica_events_rx) = async_channel::bounded(1024); + let gossip_state = GossipState::new(gossip, sync.clone(), sync_actor_tx.clone()); Self { inbox, sync, replica_events_rx, replica_events_tx, endpoint, - gossip, + gossip: gossip_state, bao_store, downloader, sync_actor_tx, - gossip_actor_tx, running_sync_connect: Default::default(), running_sync_accept: Default::default(), subscribers: Default::default(), @@ -215,22 +213,11 @@ impl LiveActor { } /// Run the actor loop. - pub async fn run(mut self, mut gossip_actor: GossipActor) -> Result<()> { - let me = self.endpoint.node_id().fmt_short(); - let gossip_handle = tokio::task::spawn( - async move { - if let Err(err) = gossip_actor.run().await { - error!("gossip recv actor failed: {err:?}"); - } - } - .instrument(error_span!("sync", %me)), - ); - + pub async fn run(mut self) -> Result<()> { let shutdown_reply = self.run_inner().await; if let Err(err) = self.shutdown().await { error!(?err, "Error during shutdown"); } - gossip_handle.await?; drop(self); match shutdown_reply { Ok(reply) => { @@ -288,7 +275,11 @@ impl LiveActor { inc!(Metrics, doc_live_tick_pending_downloads); let (namespace, hash, res) = res.context("pending_downloads closed")?; self.on_download_ready(namespace, hash, res).await; - + } + res = self.gossip.progress(), if !self.gossip.is_empty() => { + if let Err(error) = res { + warn!(?error, "gossip state failed"); + } } } } @@ -379,13 +370,15 @@ impl LiveActor { async fn shutdown(&mut self) -> anyhow::Result<()> { // cancel all subscriptions self.subscribers.clear(); - // shutdown gossip actor - self.gossip_actor_tx - .send(ToGossipActor::Shutdown) - .await - .ok(); - // shutdown sync thread - let _store = self.sync.shutdown().await; + let (gossip_shutdown_res, _store) = tokio::join!( + // quit the gossip topics and task loops. + self.gossip.shutdown(), + // shutdown sync thread + self.sync.shutdown() + ); + gossip_shutdown_res?; + // TODO: abort_all and join_next all JoinSets to catch panics + // (they are aborted on drop, but that swallows panics) Ok(()) } @@ -439,10 +432,7 @@ impl LiveActor { .unsubscribe(namespace, self.replica_events_tx.clone()) .await?; self.sync.close(namespace).await?; - self.gossip_actor_tx - .send(ToGossipActor::Leave { namespace }) - .await - .context("gossip actor failure")?; + self.gossip.quit(&namespace); } if kill_subscribers { self.subscribers.remove(&namespace); @@ -450,11 +440,7 @@ impl LiveActor { Ok(()) } - async fn join_peers( - &mut self, - namespace: NamespaceId, - peers: Vec, - ) -> anyhow::Result<()> { + async fn join_peers(&mut self, namespace: NamespaceId, peers: Vec) -> Result<()> { let mut peer_ids = Vec::new(); // add addresses of peers to our endpoint address book @@ -477,12 +463,7 @@ impl LiveActor { } // tell gossip to join - self.gossip_actor_tx - .send(ToGossipActor::Join { - namespace, - peers: peer_ids.clone(), - }) - .await?; + self.gossip.join(namespace, peer_ids.clone()).await?; if !peer_ids.is_empty() { // trigger initial sync with initial peers @@ -644,18 +625,9 @@ impl LiveActor { } }; // TODO: We should debounce and merge these neighbor announcements likely. 
- if let Err(err) = self - .gossip - .broadcast_neighbors(namespace.into(), msg.into()) - .await - { - error!( - namespace = %namespace.fmt_short(), - %op, - ?err, - "Failed to broadcast to neighbors" - ); - } + self.gossip + .broadcast_neighbors(&namespace, msg.into()) + .await; } async fn on_download_ready( @@ -725,12 +697,11 @@ impl LiveActor { match event { crate::Event::LocalInsert { namespace, entry } => { debug!(namespace=%namespace.fmt_short(), "replica event: LocalInsert"); - let topic = TopicId::from_bytes(*namespace.as_bytes()); // A new entry was inserted locally. Broadcast a gossip message. if self.state.is_syncing(&namespace) { let op = Op::Put(entry.clone()); let message = postcard::to_stdvec(&op)?.into(); - self.gossip.broadcast(topic, message).await?; + self.gossip.broadcast(&namespace, message).await; } } crate::Event::RemoteInsert { diff --git a/iroh-gossip/Cargo.toml b/iroh-gossip/Cargo.toml index 3aed7435fa..b39e4d76a8 100644 --- a/iroh-gossip/Cargo.toml +++ b/iroh-gossip/Cargo.toml @@ -15,43 +15,38 @@ rust-version = "1.76" workspace = true [dependencies] -# proto dependencies (required) anyhow = { version = "1" } +async-channel = { version = "2.3.1", optional = true } blake3 = { package = "iroh-blake3", version = "1.4.5"} bytes = { version = "1.4.0", features = ["serde"] } derive_more = { version = "=1.0.0-beta.7", features = ["add", "debug", "deref", "display", "from", "try_into", "into"] } ed25519-dalek = { version = "2.0.0", features = ["serde", "rand_core"] } +futures-concurrency = { version = "7.6.1", optional = true } +futures-lite = { version = "2.3", optional = true } +futures-util = { version = "0.3.30", optional = true } indexmap = "2.0" +iroh-base = { version = "0.21.0", path = "../iroh-base" } +iroh-metrics = { version = "0.21.0", path = "../iroh-metrics" } +iroh-net = { path = "../iroh-net", version = "0.21.0", optional = true, default-features = false } postcard = { version = "1", default-features = false, features = ["alloc", "use-std", "experimental-derive"] } rand = { version = "0.8.5", features = ["std_rng"] } rand_core = "0.6.4" serde = { version = "1.0.164", features = ["derive"] } -tracing = "0.1" -iroh-metrics = { version = "0.21.0", path = "../iroh-metrics" } -iroh-base = { version = "0.21.0", path = "../iroh-base" } - -# net dependencies (optional) -futures-lite = { version = "2.3", optional = true } -iroh-net = { path = "../iroh-net", version = "0.21.0", optional = true, default-features = false, features = ["test-utils"] } tokio = { version = "1", optional = true, features = ["io-util", "sync", "rt", "macros", "net", "fs"] } tokio-util = { version = "0.7.8", optional = true, features = ["codec"] } -genawaiter = { version = "0.99.1", default-features = false, features = ["futures03"] } - -# dispatcher dependencies (optional) -async-channel = { version = "2.3.1", optional = true } -futures-util = { version = "0.3.30", optional = true } +tracing = "0.1" [dev-dependencies] clap = { version = "4", features = ["derive"] } +iroh-net = { path = "../iroh-net", version = "0.21.0", default-features = false, features = ["test-utils"] } iroh-test = { path = "../iroh-test" } rand_chacha = "0.3.1" tracing-subscriber = { version = "0.3", features = ["env-filter"] } url = "2.4.0" [features] -default = ["net", "dispatcher"] -net = ["dep:futures-lite", "dep:iroh-net", "dep:tokio", "dep:tokio-util"] -dispatcher = ["dep:async-channel", "dep:futures-util"] +default = ["net"] +net = ["dep:futures-lite", "dep:iroh-net", "dep:tokio", "dep:tokio-util", 
"dep:async-channel", "dep:futures-util", "dep:futures-concurrency"] [[example]] name = "chat" diff --git a/iroh-gossip/examples/chat.rs b/iroh-gossip/examples/chat.rs index a9b3a3999d..4aa36a2f56 100644 --- a/iroh-gossip/examples/chat.rs +++ b/iroh-gossip/examples/chat.rs @@ -1,13 +1,14 @@ use std::{collections::HashMap, fmt, str::FromStr}; -use anyhow::{bail, Context}; +use anyhow::{bail, Context, Result}; use bytes::Bytes; use clap::Parser; use ed25519_dalek::Signature; +use futures_lite::StreamExt; use iroh_base::base32; use iroh_gossip::{ - net::{Gossip, GOSSIP_ALPN}, - proto::{Event, TopicId}, + net::{Event, Gossip, GossipEvent, GossipReceiver, GOSSIP_ALPN}, + proto::TopicId, }; use iroh_net::{ key::{PublicKey, SecretKey}, @@ -65,7 +66,7 @@ enum Command { } #[tokio::main] -async fn main() -> anyhow::Result<()> { +async fn main() -> Result<()> { tracing_subscriber::fmt::init(); let args = Args::parse(); @@ -134,18 +135,18 @@ async fn main() -> anyhow::Result<()> { endpoint.add_node_addr(peer)?; } }; - gossip.join(topic, peer_ids).await?.await?; + let (sender, receiver) = gossip.join(topic, peer_ids).await?.split(); println!("> connected!"); // broadcast our name, if set if let Some(name) = args.name { let message = Message::AboutMe { name }; let encoded_message = SignedMessage::sign_and_encode(endpoint.secret_key(), &message)?; - gossip.broadcast(topic, encoded_message).await?; + sender.broadcast(encoded_message).await?; } // subscribe and print loop - tokio::spawn(subscribe_loop(gossip.clone(), topic)); + tokio::spawn(subscribe_loop(receiver)); // spawn an input thread that reads stdin // not using tokio here because they recommend this for "technical reasons" @@ -157,21 +158,18 @@ async fn main() -> anyhow::Result<()> { while let Some(text) = line_rx.recv().await { let message = Message::Message { text: text.clone() }; let encoded_message = SignedMessage::sign_and_encode(endpoint.secret_key(), &message)?; - gossip.broadcast(topic, encoded_message).await?; + sender.broadcast(encoded_message).await?; println!("> sent: {text}"); } Ok(()) } -async fn subscribe_loop(gossip: Gossip, topic: TopicId) -> anyhow::Result<()> { +async fn subscribe_loop(mut receiver: GossipReceiver) -> Result<()> { // init a peerid -> name hashmap let mut names = HashMap::new(); - // get a stream that emits updates on our topic - let mut stream = gossip.subscribe(topic).await?; - loop { - let event = stream.recv().await?; - if let Event::Received(msg) = event { + while let Some(event) = receiver.try_next().await? 
{ + if let Event::Gossip(GossipEvent::Received(msg)) = event { let (from, message) = SignedMessage::verify_and_decode(&msg.content)?; match message { Message::AboutMe { name } => { @@ -187,6 +185,7 @@ async fn subscribe_loop(gossip: Gossip, topic: TopicId) -> anyhow::Result<()> { } } } + Ok(()) } async fn endpoint_loop(endpoint: Endpoint, gossip: Gossip) { @@ -199,10 +198,7 @@ async fn endpoint_loop(endpoint: Endpoint, gossip: Gossip) { }); } } -async fn handle_connection( - mut conn: iroh_net::endpoint::Connecting, - gossip: Gossip, -) -> anyhow::Result<()> { +async fn handle_connection(mut conn: iroh_net::endpoint::Connecting, gossip: Gossip) -> Result<()> { let alpn = conn.alpn().await?; let conn = conn.await?; let peer_id = iroh_net::endpoint::get_remote_node_id(&conn)?; @@ -216,7 +212,7 @@ async fn handle_connection( Ok(()) } -fn input_loop(line_tx: tokio::sync::mpsc::Sender) -> anyhow::Result<()> { +fn input_loop(line_tx: tokio::sync::mpsc::Sender) -> Result<()> { let mut buffer = String::new(); let stdin = std::io::stdin(); // We get `Stdin` here. loop { @@ -234,7 +230,7 @@ struct SignedMessage { } impl SignedMessage { - pub fn verify_and_decode(bytes: &[u8]) -> anyhow::Result<(PublicKey, Message)> { + pub fn verify_and_decode(bytes: &[u8]) -> Result<(PublicKey, Message)> { let signed_message: Self = postcard::from_bytes(bytes)?; let key: PublicKey = signed_message.from; key.verify(&signed_message.data, &signed_message.signature)?; @@ -242,7 +238,7 @@ impl SignedMessage { Ok((signed_message.from, message)) } - pub fn sign_and_encode(secret_key: &SecretKey, message: &Message) -> anyhow::Result { + pub fn sign_and_encode(secret_key: &SecretKey, message: &Message) -> Result { let data: Bytes = postcard::to_stdvec(&message)?.into(); let signature = secret_key.sign(&data); let from: PublicKey = secret_key.public(); @@ -269,7 +265,7 @@ struct Ticket { } impl Ticket { /// Deserializes from bytes. - fn from_bytes(bytes: &[u8]) -> anyhow::Result { + fn from_bytes(bytes: &[u8]) -> Result { postcard::from_bytes(bytes).map_err(Into::into) } /// Serializes to bytes. diff --git a/iroh-gossip/src/dispatcher.rs b/iroh-gossip/src/dispatcher.rs deleted file mode 100644 index e724741ff2..0000000000 --- a/iroh-gossip/src/dispatcher.rs +++ /dev/null @@ -1,503 +0,0 @@ -//! A higher level wrapper for the gossip engine that manages multiple gossip subscriptions and updates. -use std::{ - collections::{btree_map::Entry, BTreeMap, BTreeSet}, - pin::Pin, - sync::{Arc, Mutex}, -}; - -use crate::{ - net::{Event as IrohGossipEvent, Gossip}, - proto::{DeliveryScope, TopicId}, -}; -use bytes::Bytes; -use futures_lite::StreamExt; -use futures_util::Stream; -use iroh_base::rpc::{RpcError, RpcResult}; -use iroh_net::{key::PublicKey, util::AbortingJoinHandle, NodeId}; -use serde::{Deserialize, Serialize}; - -/// Join a gossip topic -#[derive(Serialize, Deserialize, Debug)] -pub struct SubscribeOptions { - /// The initial bootstrap nodes - pub bootstrap: BTreeSet, - /// The maximum number of messages that can be buffered in a subscription. - /// - /// If this limit is reached, the subscriber will receive a `Lagged` response, - /// the message will be dropped, and the subscriber will be closed. - /// - /// This is to prevent a single slow subscriber from blocking the dispatch loop. - /// If a subscriber is lagging, it should be closed and re-opened. 
- pub subscription_capacity: usize, -} - -/// Send a gossip message -#[derive(Serialize, Deserialize, Debug)] -pub enum Command { - /// Broadcast a message to all nodes in the swarm - Broadcast(Bytes), - /// Broadcast a message to all direct neighbors - BroadcastNeighbors(Bytes), -} - -/// Update from a subscribed gossip topic -#[derive(Serialize, Deserialize, Debug)] -pub enum Event { - /// A message was received - Gossip(GossipEvent), - /// We missed some messages - Lagged, -} - -/// Gossip event -/// An event to be emitted to the application for a particular topic. -#[derive(Clone, PartialEq, Eq, PartialOrd, Ord, Debug, Serialize, Deserialize)] -pub enum GossipEvent { - /// We have a new, direct neighbor in the swarm membership layer for this topic - NeighborUp(NodeId), - /// We dropped direct neighbor in the swarm membership layer for this topic - NeighborDown(NodeId), - /// A gossip message was received for this topic - Received(Message), -} - -impl From> for GossipEvent { - fn from(event: crate::proto::Event) -> Self { - match event { - crate::proto::Event::NeighborUp(node_id) => Self::NeighborUp(node_id), - crate::proto::Event::NeighborDown(node_id) => Self::NeighborDown(node_id), - crate::proto::Event::Received(message) => Self::Received(Message { - content: message.content, - scope: message.scope, - delivered_from: message.delivered_from, - }), - } - } -} - -/// A gossip message -#[derive(Clone, PartialEq, Eq, PartialOrd, Ord, Debug, Serialize, Deserialize)] -pub struct Message { - /// The content of the message - pub content: Bytes, - /// The scope of the message. - /// This tells us if the message is from a direct neighbor or actual gossip. - pub scope: DeliveryScope, - /// The node that delivered the message. This is not the same as the original author. - pub delivered_from: NodeId, -} - -/// A gossip engine that manages gossip subscriptions and updates. -#[derive(Debug, Clone)] -pub struct GossipDispatcher { - gossip: Gossip, - inner: Arc>, -} - -/// The mutable state of the gossip engine. -#[derive(Debug)] -struct State { - current_subscriptions: BTreeMap, - /// the single task that dispatches gossip events to all subscribed streams - /// - /// this isn't really part of the mutable state, but it needs to live somewhere - task: Option>, -} - -/// Type alias for a stream of gossip updates, so we don't have to repeat all the bounds. -type CommandStream = Box + Send + Sync + Unpin + 'static>; -/// Type alias for a sink of gossip events. -type EventSink = async_channel::Sender>; - -#[derive(derive_more::Debug)] -enum TopicState { - /// The topic is currently joining. - /// Making new subscriptions is allowed, but they will have to wait for the join to finish. - Joining { - /// Stream/sink pairs that are waiting for the topic to become live. - #[debug(skip)] - waiting: Vec<(CommandStream, EventSink)>, - /// Set of bootstrap nodes we are using. - bootstrap: BTreeSet, - /// The task that is driving the join future. - _join_task: AbortingJoinHandle<()>, - }, - /// The topic is currently live. - /// New subscriptions can be immediately added. - Live { - update_tasks: Vec>, - event_sinks: Vec, - }, - /// The topic is currently quitting. - /// We can't make new subscriptions without waiting for the quit to finish. - Quitting { - /// Stream/sink pairs that are waiting for the topic to quit so - /// it can be joined again. - #[debug(skip)] - waiting: Vec<(CommandStream, EventSink)>, - /// Set of bootstrap nodes we are using. - /// - /// This is used to re-join the topic after quitting. 
- bootstrap: BTreeSet, - /// The task that is driving the quit future. - #[allow(dead_code)] - quit_task: AbortingJoinHandle<()>, - }, -} - -impl TopicState { - /// Extract all senders from the state. - fn into_senders(self) -> Vec { - match self { - TopicState::Joining { waiting, .. } | TopicState::Quitting { waiting, .. } => { - waiting.into_iter().map(|(_, send)| send).collect() - } - TopicState::Live { event_sinks, .. } => event_sinks, - } - } -} - -impl GossipDispatcher { - /// Create a new gossip dispatcher with the given gossip instance. - pub fn new(gossip: Gossip) -> Self { - let inner = Arc::new(Mutex::new(State { - current_subscriptions: BTreeMap::new(), - task: None, - })); - let res = Self { gossip, inner }; - let dispatch_task = spawn_owned(res.clone().dispatch_task()); - res.inner.lock().unwrap().task = Some(dispatch_task); - res - } - - /// Quit a gossip topic and handle the result of the quitting. - /// - /// On quit success, will try to join the topic again with the bootstrap nodes we have accumulated while waiting for quit to finish. - /// On quit failure, all waiting streams will be notified with the error and removed. - async fn quit_task(self, topic: TopicId) { - let res = self.gossip.quit(topic).await; - let mut inner = self.inner.lock().unwrap(); - if let Some(TopicState::Quitting { - waiting, - bootstrap: peers, - .. - }) = inner.current_subscriptions.remove(&topic) - { - match res { - Ok(()) => { - if waiting.is_empty() { - return; - } - let bootstrap = peers.clone(); - let _join_task = spawn_owned(self.clone().join_task(topic, bootstrap)); - inner.current_subscriptions.insert( - topic, - TopicState::Joining { - waiting, - bootstrap: peers, - _join_task, - }, - ); - } - Err(e) => { - // notify all waiting streams that there is something wrong with the topic - let error = RpcError::from(e); - for (_, send) in waiting { - send.try_send(Err(error.clone())).ok(); - } - } - } - } - } - - /// Try to send an event to a sink. - /// - /// This will not wait until the sink is full, but send a `Lagged` response if the sink is almost full. - fn try_send(send: &EventSink, event: &IrohGossipEvent) -> bool { - // If the stream is disconnected, we don't need to send to it. - if send.is_closed() { - return false; - } - // Check if the send buffer is almost full, and send a lagged response if it is. - if let Some(cap) = send.capacity() { - if send.len() >= cap - 1 { - send.try_send(Ok(Event::Lagged)).ok(); - return false; - } - } - // Send the event to the stream. - // We are the owner of the stream, so we can be sure that there is still room. - send.try_send(Ok(Event::Gossip(event.clone().into()))) - .is_ok() - } - - /// Dispatch gossip events to all subscribed streams. - /// - /// This should not fail unless the gossip instance is faulty. - async fn dispatch_loop(mut self) -> anyhow::Result<()> { - let stream = self.gossip.clone().subscribe_all(); - tokio::pin!(stream); - while let Some(item) = stream.next().await { - let (topic, event) = item?; - // The loop is only for the case that the topic is still in joining state, - // where we switch it to live here and have to re-lock the mutex afterwards. - loop { - let mut inner = self.inner.lock().unwrap(); - let Some(state) = inner.current_subscriptions.get_mut(&topic) else { - tracing::trace!("Received event for unknown topic, possibly sync {topic}",); - break; - }; - match state { - // The topic is in joining state. It can happen that we receive an event before - // our join task completed. 
In this case, we switch the topic to live here. - TopicState::Joining { .. } => { - drop(inner); - self.on_join(topic, Ok(())); - continue; - } - TopicState::Live { - update_tasks, - event_sinks, - } => { - // Send the message to all our senders, and remove disconnected senders. - event_sinks.retain(|sink| Self::try_send(sink, &event)); - // If no senders are left, and all update tasks are finished, we can quit - // the topic. - if event_sinks.is_empty() - && update_tasks.iter().all(|task| task.is_finished()) - { - let quit_task = tokio::task::spawn(self.clone().quit_task(topic)); - inner.current_subscriptions.insert( - topic, - TopicState::Quitting { - waiting: vec![], - bootstrap: BTreeSet::new(), - quit_task: quit_task.into(), - }, - ); - } - } - _ => {} - } - break; - } - } - Ok(()) - } - - /// Dispatch gossip events to all subscribed streams, and handle the unlikely case of a dispatch loop failure. - async fn dispatch_task(self) { - if let Err(cause) = self.clone().dispatch_loop().await { - // dispatch task failed. Not sure what to do here. - tracing::error!("Gossip dispatch task failed: {}", cause); - let mut inner = self.inner.lock().unwrap(); - let error = RpcError::from(cause); - for (_, state) in std::mem::take(&mut inner.current_subscriptions) { - for sender in state.into_senders() { - sender.try_send(Err(error.clone())).ok(); - } - } - } - } - - /// Handle updates from the client. - async fn update_loop( - gossip: Gossip, - topic: TopicId, - mut updates: CommandStream, - ) -> anyhow::Result<()> { - while let Some(update) = Pin::new(&mut updates).next().await { - match update { - Command::Broadcast(msg) => { - gossip.broadcast(topic, msg).await?; - } - Command::BroadcastNeighbors(msg) => { - gossip.broadcast_neighbors(topic, msg).await?; - } - } - } - Ok(()) - } - - /// Handle updates from the client, and handle update loop failure. - async fn update_task(self, topic: TopicId, updates: CommandStream) { - let res = Self::update_loop(self.gossip.clone(), topic, updates).await; - let mut inner = self.inner.lock().unwrap(); - - match res { - Err(err) => { - // we got an error while sending to the topic - if let Some(TopicState::Live { event_sinks, .. }) = - inner.current_subscriptions.remove(&topic) - { - let error = RpcError::from(err); - // notify all live streams that sending to the topic failed - for send in event_sinks { - send.try_send(Err(error.clone())).ok(); - } - } - } - Ok(()) => { - // check if we should quit the topic. - if let Some(TopicState::Live { - event_sinks, - update_tasks, - }) = inner.current_subscriptions.get(&topic) - { - if event_sinks.is_empty() && update_tasks.iter().all(|t| t.is_finished()) { - let quit_task = tokio::task::spawn(self.clone().quit_task(topic)); - inner.current_subscriptions.insert( - topic, - TopicState::Quitting { - waiting: vec![], - bootstrap: BTreeSet::new(), - quit_task: quit_task.into(), - }, - ); - } - } - } - } - } - - /// Call join, then await the result. - /// - /// Basically just flattens the two stages of joining into one. - async fn join(gossip: Gossip, topic: TopicId, bootstrap: Vec) -> anyhow::Result<()> { - let join = gossip.join(topic, bootstrap).await?; - join.await?; - Ok(()) - } - - /// Join a gossip topic and handle turning waiting streams into live streams. - async fn join_task(mut self, topic: TopicId, bootstrap: BTreeSet) { - let res = Self::join(self.gossip.clone(), topic, bootstrap.into_iter().collect()).await; - self.on_join(topic, res); - } - - /// Switch the state of a topic to live. 
- /// - /// If the topic is already live, this is a noop. - fn on_join(&mut self, topic: TopicId, res: anyhow::Result<()>) { - let mut inner = self.inner.lock().unwrap(); - let Some(state) = inner.current_subscriptions.remove(&topic) else { - return; - }; - match state { - TopicState::Live { - update_tasks, - event_sinks, - } => { - inner.current_subscriptions.insert( - topic, - TopicState::Live { - update_tasks, - event_sinks, - }, - ); - } - TopicState::Joining { waiting, .. } => { - match res { - Ok(()) => { - let mut event_sinks = vec![]; - let mut update_tasks = vec![]; - for (updates, event_sink) in waiting { - // if the stream is disconnected, we don't need to keep it and start the update task - if event_sink.is_closed() { - continue; - } - event_sinks.push(event_sink); - let task = spawn_owned(self.clone().update_task(topic, updates)); - update_tasks.push(task); - } - inner.current_subscriptions.insert( - topic, - TopicState::Live { - event_sinks, - update_tasks, - }, - ); - } - Err(e) => { - // notify all waiting streams that the subscription failed - let error = RpcError::from(e); - for (_, send) in waiting { - send.try_send(Err(error.clone())).ok(); - } - } - } - } - TopicState::Quitting { .. } => {} - } - } - - /// Subscribe to a gossip topic. - pub fn subscribe_with_opts( - &self, - topic: TopicId, - options: SubscribeOptions, - updates: CommandStream, - ) -> impl Stream> + Unpin { - let mut inner = self.inner.lock().unwrap(); - let (send, recv) = async_channel::bounded(options.subscription_capacity); - match inner.current_subscriptions.entry(topic) { - Entry::Vacant(entry) => { - // There is no existing subscription, so we need to start a new one. - let waiting = vec![(updates, send)]; - let this = self.clone(); - let _join_task = - spawn_owned(this.clone().join_task(topic, options.bootstrap.clone())); - entry.insert(TopicState::Joining { - waiting, - bootstrap: options.bootstrap, - _join_task, - }); - } - Entry::Occupied(mut entry) => { - // There is already a subscription - let state = entry.get_mut(); - match state { - TopicState::Joining { - waiting, - bootstrap: peers, - .. - } => { - // We are joining, so we need to wait with creating the update task. - // - // TODO: should we merge the bootstrap nodes and try to join with all of them? - peers.extend(options.bootstrap); - waiting.push((updates, send)); - } - TopicState::Quitting { - waiting, - bootstrap: peers, - .. - } => { - // We are quitting, so we need to wait with creating the update task. - peers.extend(options.bootstrap); - waiting.push((updates, send)); - } - TopicState::Live { - event_sinks, - update_tasks, - } => { - // There is already a live subscription, so we can immediately start the update task. - let task = spawn_owned(self.clone().update_task(topic, updates)); - event_sinks.push(send); - update_tasks.push(task); - } - } - } - } - recv.boxed() - } -} - -/// tokio::spawn but returns an `AbortingJoinHandle` that owns the task. 
-fn spawn_owned(f: F) -> AbortingJoinHandle -where - F: std::future::Future + Send + 'static, - T: Send + 'static, -{ - tokio::spawn(f).into() -} diff --git a/iroh-gossip/src/lib.rs b/iroh-gossip/src/lib.rs index 70a015a787..9c6dd3f27e 100644 --- a/iroh-gossip/src/lib.rs +++ b/iroh-gossip/src/lib.rs @@ -2,8 +2,6 @@ #![deny(missing_docs, rustdoc::broken_intra_doc_links)] -#[cfg(feature = "dispatcher")] -pub mod dispatcher; pub mod metrics; #[cfg(feature = "net")] pub mod net; diff --git a/iroh-gossip/src/net.rs b/iroh-gossip/src/net.rs index 29f428ca49..f813927d47 100644 --- a/iroh-gossip/src/net.rs +++ b/iroh-gossip/src/net.rs @@ -1,23 +1,30 @@ //! Networking for the `iroh-gossip` protocol -use anyhow::{anyhow, Context}; -use bytes::{Bytes, BytesMut}; -use futures_lite::stream::Stream; -use genawaiter::sync::{Co, Gen}; +use anyhow::{anyhow, Context as _, Result}; +use bytes::BytesMut; +use futures_concurrency::{ + future::TryJoin, + stream::{stream_group, StreamGroup}, +}; +use futures_lite::{stream::Stream, StreamExt}; +use futures_util::TryFutureExt; use iroh_metrics::inc; use iroh_net::{ dialer::Dialer, endpoint::{get_remote_node_id, Connection}, key::PublicKey, - AddrInfo, Endpoint, NodeAddr, + util::SharedAbortingJoinHandle, + AddrInfo, Endpoint, NodeAddr, NodeId, }; use rand::rngs::StdRng; use rand_core::SeedableRng; -use std::{collections::HashMap, future::Future, pin::Pin, sync::Arc, task::Poll, time::Instant}; -use tokio::{ - sync::{broadcast, mpsc, oneshot}, - task::JoinHandle, +use std::{ + collections::{BTreeSet, HashMap, HashSet, VecDeque}, + pin::Pin, + task::{Context, Poll}, + time::Instant, }; +use tokio::{sync::mpsc, task::JoinSet}; use tracing::{debug, error_span, trace, warn, Instrument}; use self::util::{read_message, write_message, Timers}; @@ -26,14 +33,20 @@ use crate::{ proto::{self, PeerData, Scope, TopicId}, }; +mod handles; pub mod util; +pub use self::handles::{ + Command, CommandStream, Event, GossipEvent, GossipReceiver, GossipSender, GossipTopic, + JoinOptions, Message, +}; + /// ALPN protocol name pub const GOSSIP_ALPN: &[u8] = b"/iroh-gossip/0"; -/// Channel capacity for all subscription broadcast channels (single) -const SUBSCRIBE_ALL_CAP: usize = 2048; -/// Channel capacity for topic subscription broadcast channels (one per topic) -const SUBSCRIBE_TOPIC_CAP: usize = 2048; +/// Default channel capacity for topic subscription channels (one per topic) +const TOPIC_EVENTS_DEFAULT_CAP: usize = 2048; +/// Default channel capacity for topic subscription channels (one per topic) +const TOPIC_COMMANDS_DEFAULT_CAP: usize = 2048; /// Channel capacity for the send queue (one per connection) const SEND_QUEUE_CAP: usize = 64; /// Channel capacity for the ToActor message queue (single) @@ -46,9 +59,9 @@ const ON_ENDPOINTS_CAP: usize = 64; const SOURCE_NAME: &str = "gossip"; /// Events emitted from the gossip protocol -pub type Event = proto::Event; +pub type ProtoEvent = proto::Event; /// Commands for the gossip protocol -pub type Command = proto::Command; +pub type ProtoCommand = proto::Command; type InEvent = proto::InEvent; type OutEvent = proto::OutEvent; @@ -77,7 +90,7 @@ type ProtoMessage = proto::Message; pub struct Gossip { to_actor_tx: mpsc::Sender, on_direct_addrs_tx: mpsc::Sender>, - _actor_handle: Arc>>, + _actor_handle: SharedAbortingJoinHandle<()>, max_message_size: usize, } @@ -106,21 +119,18 @@ impl Gossip { in_event_rx, in_event_tx, on_direct_addr_rx: on_endpoints_rx, - conns: Default::default(), - conn_send_tx: Default::default(), - pending_sends: 
Default::default(), timers: Timers::new(), - subscribers_all: None, - subscribers_topic: Default::default(), + command_rx: StreamGroup::new().keyed(), + peers: Default::default(), + topics: Default::default(), + quit_queue: Default::default(), + connection_tasks: Default::default(), }; let actor_handle = tokio::spawn( async move { if let Err(err) = actor.run().await { warn!("gossip actor closed with error: {err:?}"); - Err(err) - } else { - Ok(()) } } .instrument(error_span!("gossip", %me)), @@ -128,7 +138,7 @@ impl Gossip { Self { to_actor_tx, on_direct_addrs_tx: on_endpoints_tx, - _actor_handle: Arc::new(actor_handle), + _actor_handle: actor_handle.into(), max_message_size, } } @@ -138,113 +148,77 @@ impl Gossip { self.max_message_size } - /// Join a topic and connect to peers. - /// - /// - /// This method only asks for [`PublicKey`]s. You must supply information on how to - /// connect to these peers manually before, by calling [`Endpoint::add_node_addr`] on - /// the underlying [`Endpoint`]. - /// - /// This method returns a future that completes once the request reached the local actor. - /// This completion returns a [`JoinTopicFut`] which completes once at least peer was joined - /// successfully and the swarm thus becomes operational. - /// - /// The [`JoinTopicFut`] has no timeout, so it will remain pending indefinitely if no peer - /// could be contacted. Usually you will want to add a timeout yourself. - /// - /// TODO: Resolve to an error once all connection attempts failed. - pub async fn join( - &self, - topic: TopicId, - peers: Vec, - ) -> anyhow::Result { - let (tx, rx) = oneshot::channel(); - self.send(ToActor::Join(topic, peers, tx)).await?; - Ok(JoinTopicFut(rx)) - } - - /// Quit a topic. - /// - /// This sends a disconnect message to all active peers and then drops the state - /// for this topic. - pub async fn quit(&self, topic: TopicId) -> anyhow::Result<()> { - self.send(ToActor::Quit(topic)).await?; - Ok(()) - } - - /// Broadcast a message on a topic to all peers in the swarm. - /// - /// This does not join the topic automatically, so you have to call [`Self::join`] yourself - /// for messages to be broadcast to peers. + /// Handle an incoming [`Connection`]. /// - /// Messages with the same content are only delivered once. - pub async fn broadcast(&self, topic: TopicId, message: Bytes) -> anyhow::Result<()> { - let (tx, rx) = oneshot::channel(); - self.send(ToActor::Broadcast(topic, message, Scope::Swarm, tx)) + /// Make sure to check the ALPN protocol yourself before passing the connection. + pub async fn handle_connection(&self, conn: Connection) -> anyhow::Result<()> { + let peer_id = get_remote_node_id(&conn)?; + self.send(ToActor::HandleConnection(peer_id, ConnOrigin::Accept, conn)) .await?; - rx.await??; Ok(()) } - /// Broadcast a message on a topic to the immediate neighbors. - /// - /// This does not join the topic automatically, so you have to call [`Self::join`] yourself - /// for messages to be broadcast to peers. - pub async fn broadcast_neighbors(&self, topic: TopicId, message: Bytes) -> anyhow::Result<()> { - let (tx, rx) = oneshot::channel(); - self.send(ToActor::Broadcast(topic, message, Scope::Neighbors, tx)) - .await?; - rx.await??; - Ok(()) + /// Join a gossip topic with the default options and wait for at least one active connection. 
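+    ///
+    /// A minimal usage sketch (not part of this patch; it assumes an existing `gossip` handle
+    /// and hypothetical `topic_id` and `bootstrap` values):
+    ///
+    /// ```ignore
+    /// let topic = gossip.join(topic_id, bootstrap).await?;
+    /// let (sender, mut receiver) = topic.split();
+    /// sender.broadcast(bytes::Bytes::from("hello")).await?;
+    /// while let Some(event) = receiver.try_next().await? {
+    ///     // `Event::Gossip(..)` carries messages and neighbor changes;
+    ///     // `Event::Lagged` means the subscriber fell behind and events were dropped.
+    /// }
+    /// ```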
+    pub async fn join(&self, topic_id: TopicId, bootstrap: Vec<NodeId>) -> Result<GossipTopic> {
+        let mut sub = self.join_with_opts(topic_id, JoinOptions::with_bootstrap(bootstrap));
+        sub.joined().await?;
+        Ok(sub)
     }
 
-    /// Subscribe to messages and event notifications for a topic.
+    /// Join a gossip topic with options.
     ///
-    /// Does not join the topic automatically, so you have to call [`Self::join`] yourself
-    /// to actually receive messages.
-    pub async fn subscribe(&self, topic: TopicId) -> anyhow::Result<broadcast::Receiver<Event>> {
-        let (tx, rx) = oneshot::channel();
-        self.send(ToActor::Subscribe(topic, tx)).await?;
-        let res = rx.await.map_err(|_| anyhow!("subscribe_tx dropped"))??;
-        Ok(res)
-    }
-
-    /// Subscribe to all events published on topics that you joined.
+    /// Returns a [`GossipTopic`] instantly. To wait for at least one connection to be established,
+    /// you can await [`GossipTopic::joined`].
     ///
-    /// Note that this method takes self by value. Usually you would clone the [`Gossip`] handle.
-    /// before.
-    pub fn subscribe_all(
-        self,
-    ) -> impl Stream<Item = Result<(TopicId, Event), broadcast::error::RecvError>> {
-        Gen::new(|co| async move {
-            if let Err(err) = self.subscribe_all0(&co).await {
-                warn!("subscribe_all produced error: {err:?}");
-                co.yield_(Err(broadcast::error::RecvError::Closed)).await
-            }
-        })
+    /// Messages will be queued until a first connection is available. If the internal channel becomes full,
+    /// the oldest messages will be dropped from the channel.
+    pub fn join_with_opts(&self, topic_id: TopicId, opts: JoinOptions) -> GossipTopic {
+        let (command_tx, command_rx) = async_channel::bounded(TOPIC_COMMANDS_DEFAULT_CAP);
+        let command_rx: CommandStream = Box::pin(command_rx);
+        let event_rx = self.join_with_stream(topic_id, opts, command_rx);
+        GossipTopic::new(command_tx, Box::pin(event_rx))
     }
 
-    async fn subscribe_all0(
+    /// Join a gossip topic with options and an externally-created update stream.
+    ///
+    /// This method differs from [`Self::join_with_opts`] by letting you pass in an `updates` command stream yourself
+    /// instead of using a channel created for you.
+    ///
+    /// It returns a stream of events. If you want to wait for the topic to become active, wait for
+    /// the [`GossipEvent::Joined`] event.
+    pub fn join_with_stream(
         &self,
-        co: &Co<Result<(TopicId, Event), broadcast::error::RecvError>>,
-    ) -> anyhow::Result<()> {
-        let (tx, rx) = oneshot::channel();
-        self.send(ToActor::SubscribeAll(tx)).await?;
-        let mut res = rx.await??;
-        loop {
-            let event = res.recv().await;
-            co.yield_(event).await;
+        topic_id: TopicId,
+        options: JoinOptions,
+        updates: CommandStream,
+    ) -> impl Stream<Item = Result<Event>> + Send + 'static {
+        let (event_tx, event_rx) = async_channel::bounded(options.subscription_capacity);
+        let to_actor_tx = self.to_actor_tx.clone();
+        let channels = SubscriberChannels {
+            command_rx: updates,
+            event_tx,
+        };
+        // We spawn a task to send the subscribe action to the actor because we want the send to
+        // succeed even if the returned stream is dropped right away without being polled: it is
+        // legitimate to keep only the `updates` stream and drop the event stream. This situation
+        // is handled fine within the actor, but we have to make sure that the message reaches the
+        // actor.
+        let task = tokio::task::spawn(async move {
+            to_actor_tx
+                .send(ToActor::Join {
+                    topic_id,
+                    bootstrap: options.bootstrap,
+                    channels,
+                })
+                .await
+                .map_err(|_| anyhow!("Gossip actor dropped"))
+        });
+        async move {
+            task.await
+                .map_err(|err| anyhow!("Task for sending to gossip actor failed: {err:?}"))??;
+            Ok(event_rx)
         }
-    }
-
-    /// Handle an incoming [`Connection`].
- /// - /// Make sure to check the ALPN protocol yourself before passing the connection. - pub async fn handle_connection(&self, conn: Connection) -> anyhow::Result<()> { - let peer_id = get_remote_node_id(&conn)?; - self.send(ToActor::ConnIncoming(peer_id, ConnOrigin::Accept, conn)) - .await?; - Ok(()) + .try_flatten_stream() } /// Set info on our direct addresses. @@ -272,75 +246,24 @@ impl Gossip { } } -/// Future that completes once at least one peer is joined for this topic. -/// -/// The future has no timeout, so it will remain pending indefinitely if no peer -/// could be contacted. Usually you will want to add a timeout yourself. -/// -/// TODO: Optionally resolve to an error once all connection attempts failed. -#[derive(Debug)] -pub struct JoinTopicFut(oneshot::Receiver>); -impl Future for JoinTopicFut { - type Output = anyhow::Result; - - fn poll( - mut self: std::pin::Pin<&mut Self>, - cx: &mut std::task::Context<'_>, - ) -> std::task::Poll { - let res = Pin::new(&mut self.0).poll(cx); - match res { - Poll::Pending => Poll::Pending, - Poll::Ready(Err(_err)) => Poll::Ready(Err(anyhow!("gossip actor dropped"))), - Poll::Ready(Ok(res)) => Poll::Ready(res), - } - } -} - -/// Whether a connection is initiated by us (Dial) or by the remote peer (Accept) -#[derive(Debug)] -enum ConnOrigin { - Accept, - Dial, -} - /// Input messages for the gossip [`Actor`]. #[derive(derive_more::Debug)] enum ToActor { /// Handle a new QUIC connection, either from accept (external to the actor) or from connect /// (happens internally in the actor). - ConnIncoming(PublicKey, ConnOrigin, #[debug(skip)] Connection), - /// Join a topic with a list of peers. Reply with oneshot once at least one peer joined. - Join( - TopicId, - Vec, - #[debug(skip)] oneshot::Sender>, - ), - /// Leave a topic, send disconnect messages and drop all state. - Quit(TopicId), - /// Broadcast a message on a topic. - Broadcast( - TopicId, - #[debug("<{}b>", _1.len())] Bytes, - Scope, - #[debug(skip)] oneshot::Sender>, - ), - /// Subscribe to a topic. Return oneshot which resolves to a broadcast receiver for events on a - /// topic. - Subscribe( - TopicId, - #[debug(skip)] oneshot::Sender>>, - ), - /// Subscribe to a topic. Return oneshot which resolves to a broadcast receiver for events on a - /// topic. - SubscribeAll( - #[debug(skip)] oneshot::Sender>>, - ), + HandleConnection(PublicKey, ConnOrigin, #[debug("Connection")] Connection), + Join { + topic_id: TopicId, + bootstrap: BTreeSet, + channels: SubscriberChannels, + }, } /// Actor that sends and handles messages between the connection and main state loops struct Actor { /// Protocol state state: proto::State, + /// The endpoint through which we dial peers endpoint: Endpoint, /// Dial machine to connect to peers dialer: Dialer, @@ -354,16 +277,16 @@ struct Actor { on_direct_addr_rx: mpsc::Receiver>, /// Queued timers timers: Timers, - /// Currently opened quinn connections to peers - conns: HashMap, - /// Channels to send outbound messages into the connection loops - conn_send_tx: HashMap>, - /// Queued messages that were to be sent before a dial completed - pending_sends: HashMap>, - /// Broadcast senders for active topic subscriptions from the application - subscribers_topic: HashMap>, - /// Broadcast senders for wildcard subscriptions from the application - subscribers_all: Option>, + /// Map of topics to their state. + topics: HashMap, + /// Map of peers to their state. + peers: HashMap, + /// Stream of commands from topic handles. 
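+    ///
+    /// Each stream yields its `TopicId` together with the command, and its key in the group is
+    /// tracked in the owning [`TopicState`], so the topic can be quit once all of its command
+    /// streams and event senders are gone.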
+ command_rx: stream_group::Keyed, + /// Internal queue of topic to close because all handles were dropped. + quit_queue: VecDeque, + /// Tasks for the connection loops, to keep track of panics. + connection_tasks: JoinSet<()>, } impl Actor { @@ -386,7 +309,12 @@ impl Actor { } } }, + Some((key, (topic, command))) = self.command_rx.next(), if !self.command_rx.is_empty() => { + trace!(?i, "tick: command_rx"); + self.handle_command(topic, key, command).await?; + }, new_endpoints = self.on_direct_addr_rx.recv() => { + trace!(?i, "tick: new_endpoints"); match new_endpoints { Some(endpoints) => { inc!(Metrics, actor_tick_endpoint); @@ -411,7 +339,7 @@ impl Actor { Ok(conn) => { debug!(peer = ?peer_id, "dial successful"); inc!(Metrics, actor_tick_dialer_success); - self.handle_to_actor_msg(ToActor::ConnIncoming(peer_id, ConnOrigin::Dial, conn), Instant::now()).await.context("dialer.next -> conn -> handle_to_actor_msg")?; + self.handle_connection(peer_id, ConnOrigin::Dial, conn); } Err(err) => { warn!(peer = ?peer_id, "dial failed: {err}"); @@ -437,155 +365,262 @@ impl Actor { self.handle_in_event(InEvent::TimerExpired(timer), now).await.context("timers.drain_expired -> handle_in_event")?; } } - + Some(res) = self.connection_tasks.join_next(), if !self.connection_tasks.is_empty() => { + if let Err(err) = res { + if !err.is_cancelled() { + warn!("connection task panicked: {err:?}"); + } + } + } } } Ok(()) } - async fn handle_to_actor_msg(&mut self, msg: ToActor, now: Instant) -> anyhow::Result<()> { - trace!("handle to_actor {msg:?}"); - match msg { - ToActor::ConnIncoming(peer_id, origin, conn) => { - self.conns.insert(peer_id, conn.clone()); - self.dialer.abort_dial(peer_id); - let (send_tx, send_rx) = mpsc::channel(SEND_QUEUE_CAP); - self.conn_send_tx.insert(peer_id, send_tx.clone()); - - let max_message_size = self.state.max_message_size(); - - // Spawn a task for this connection - let in_event_tx = self.in_event_tx.clone(); - tokio::spawn( - async move { - debug!("connection established"); - match connection_loop( - peer_id, - conn, - origin, - send_rx, - &in_event_tx, - max_message_size, - ) - .await - { - Ok(()) => { - debug!("connection closed without error") - } - Err(err) => { - debug!("connection closed with error {err:?}") - } - } - in_event_tx - .send(InEvent::PeerDisconnected(peer_id)) - .await - .ok(); - } - .instrument(error_span!("gossip_conn", peer = %peer_id.fmt_short())), - ); - - // Forward queued pending sends - if let Some(send_queue) = self.pending_sends.remove(&peer_id) { - for msg in send_queue { - send_tx.send(msg).await?; + async fn handle_command( + &mut self, + topic: TopicId, + key: stream_group::Key, + command: Option, + ) -> anyhow::Result<()> { + debug!(?topic, ?key, ?command, "handle command"); + let Some(state) = self.topics.get_mut(&topic) else { + // TODO: unreachable? + warn!("received command for unknown topic"); + return Ok(()); + }; + let TopicState { + command_rx_keys, + event_senders, + .. 
+ } = state; + match command { + Some(command) => { + let command = match command { + Command::Broadcast(message) => ProtoCommand::Broadcast(message, Scope::Swarm), + Command::BroadcastNeighbors(message) => { + ProtoCommand::Broadcast(message, Scope::Neighbors) } + Command::JoinPeers(peers) => ProtoCommand::Join(peers), + }; + self.handle_in_event(proto::InEvent::Command(topic, command), Instant::now()) + .await?; + } + None => { + command_rx_keys.remove(&key); + if command_rx_keys.is_empty() && event_senders.is_empty() { + self.quit_queue.push_back(topic); + self.process_quit_queue().await?; } } - ToActor::Join(topic_id, peers, reply) => { - self.handle_in_event(InEvent::Command(topic_id, Command::Join(peers)), now) - .await?; - if self.state.has_active_peers(&topic_id) { - // If the active_view contains at least one peer, reply now - reply.send(Ok(topic_id)).ok(); - } else { - // Otherwise, wait for any peer to come up as neighbor. - let sub = self.subscribe(topic_id); - tokio::spawn(async move { - let res = wait_for_neighbor_up(sub).await; - let res = res.map(|_| topic_id); - reply.send(res).ok(); - }); + } + Ok(()) + } + + fn handle_connection(&mut self, peer_id: NodeId, origin: ConnOrigin, conn: Connection) { + // Check that we only keep one connection per peer per direction. + if let Some(peer_info) = self.peers.get(&peer_id) { + if matches!(origin, ConnOrigin::Dial) && peer_info.conn_dialed.is_some() { + warn!(?peer_id, ?origin, "ignoring connection: already accepted"); + return; + } + if matches!(origin, ConnOrigin::Accept) && peer_info.conn_accepted.is_some() { + warn!(?peer_id, ?origin, "ignoring connection: already accepted"); + return; + } + } + + let mut peer_info = self.peers.remove(&peer_id).unwrap_or_default(); + + // Store the connection so that we can terminate it when the peer is removed. + match origin { + ConnOrigin::Dial => { + peer_info.conn_dialed = Some(conn.clone()); + } + ConnOrigin::Accept => { + peer_info.conn_accepted = Some(conn.clone()); + } + } + + // Extract the queue of pending messages. + let queue = match &mut peer_info.state { + PeerState::Pending { queue } => std::mem::take(queue), + PeerState::Active { .. } => Default::default(), + }; + + let (send_tx, send_rx) = mpsc::channel(SEND_QUEUE_CAP); + let max_message_size = self.state.max_message_size(); + let in_event_tx = self.in_event_tx.clone(); + + // Spawn a task for this connection + self.connection_tasks.spawn( + async move { + match connection_loop( + peer_id, + conn, + origin, + send_rx, + &in_event_tx, + max_message_size, + queue, + ) + .await + { + Ok(()) => debug!("connection closed without error"), + Err(err) => warn!("connection closed: {err:?}"), } + in_event_tx + .send(InEvent::PeerDisconnected(peer_id)) + .await + .ok(); } - ToActor::Quit(topic_id) => { - self.handle_in_event(InEvent::Command(topic_id, Command::Quit), now) - .await?; - self.subscribers_topic.remove(&topic_id); + .instrument(error_span!("gossip_conn", peer = %peer_id.fmt_short())), + ); + + peer_info.state = match peer_info.state { + PeerState::Pending { .. 
} => PeerState::Active { send_tx }, + PeerState::Active { send_tx } => PeerState::Active { send_tx }, + }; + + self.peers.insert(peer_id, peer_info); + } + + async fn handle_to_actor_msg(&mut self, msg: ToActor, now: Instant) -> anyhow::Result<()> { + trace!("handle to_actor {msg:?}"); + match msg { + ToActor::HandleConnection(peer_id, origin, conn) => { + self.handle_connection(peer_id, origin, conn) } - ToActor::Broadcast(topic_id, message, scope, reply) => { + ToActor::Join { + topic_id, + bootstrap, + channels, + } => { + let state = self.topics.entry(topic_id).or_default(); + let TopicState { + neighbors, + event_senders, + command_rx_keys, + } = state; + if !neighbors.is_empty() { + let neighbors = neighbors.iter().copied().collect(); + channels + .event_tx + .try_send(Ok(Event::Gossip(GossipEvent::Joined(neighbors)))) + .ok(); + } + + event_senders.push(channels.event_tx); + let command_rx = TopicCommandStream::new(topic_id, channels.command_rx); + let key = self.command_rx.insert(command_rx); + command_rx_keys.insert(key); + self.handle_in_event( - InEvent::Command(topic_id, Command::Broadcast(message, scope)), + InEvent::Command( + topic_id, + ProtoCommand::Join(bootstrap.into_iter().collect()), + ), now, ) .await?; - reply.send(Ok(())).ok(); } - ToActor::Subscribe(topic_id, reply) => { - let rx = self.subscribe(topic_id); - reply.send(Ok(rx)).ok(); - } - ToActor::SubscribeAll(reply) => { - let rx = self.subscribe_all(); - reply.send(Ok(rx)).ok(); - } - }; + } Ok(()) } async fn handle_in_event(&mut self, event: InEvent, now: Instant) -> anyhow::Result<()> { + self.handle_in_event_inner(event, now).await?; + self.process_quit_queue().await?; + Ok(()) + } + + async fn process_quit_queue(&mut self) -> anyhow::Result<()> { + while let Some(topic_id) = self.quit_queue.pop_front() { + self.handle_in_event_inner( + InEvent::Command(topic_id, ProtoCommand::Quit), + Instant::now(), + ) + .await?; + self.topics.remove(&topic_id); + } + Ok(()) + } + + async fn handle_in_event_inner(&mut self, event: InEvent, now: Instant) -> anyhow::Result<()> { if matches!(event, InEvent::TimerExpired(_)) { - trace!("handle in_event {event:?}"); + trace!(?event, "handle in_event"); } else { - debug!("handle in_event {event:?}"); + debug!(?event, "handle in_event"); }; if let InEvent::PeerDisconnected(peer) = &event { - self.conn_send_tx.remove(peer); + self.peers.remove(peer); } let out = self.state.handle(event, now); for event in out { if matches!(event, OutEvent::ScheduleTimer(_, _)) { - trace!("handle out_event {event:?}"); + trace!(?event, "handle out_event"); } else { - debug!("handle out_event {event:?}"); + debug!(?event, "handle out_event"); }; match event { OutEvent::SendMessage(peer_id, message) => { - if let Some(send) = self.conn_send_tx.get(&peer_id) { - if let Err(_err) = send.send(message).await { - warn!("conn receiver for {peer_id:?} dropped"); - self.conn_send_tx.remove(&peer_id); + let info = self.peers.entry(peer_id).or_default(); + match &mut info.state { + PeerState::Active { send_tx } => { + if let Err(_err) = send_tx.send(message).await { + // Removing the peer is handled by the in_event PeerDisconnected sent + // at the end of the connection task. 
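+                                // (The connection task sends `InEvent::PeerDisconnected` when it
+                                // exits, and `handle_in_event_inner` drops the `PeerInfo` when it
+                                // sees that event, so no extra cleanup is needed here.)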
+ warn!("connection loop for {peer_id:?} dropped"); + } + } + PeerState::Pending { queue } => { + if queue.is_empty() { + self.dialer.queue_dial(peer_id, GOSSIP_ALPN); + } + queue.push(message); } - } else { - debug!(peer = ?peer_id, "dial"); - self.dialer.queue_dial(peer_id, GOSSIP_ALPN); - // TODO: Enforce max length - self.pending_sends.entry(peer_id).or_default().push(message); } } OutEvent::EmitEvent(topic_id, event) => { - if let Some(sender) = self.subscribers_all.as_mut() { - if let Err(_event) = sender.send((topic_id, event.clone())) { - self.subscribers_all = None; - } - } - if let Some(sender) = self.subscribers_topic.get(&topic_id) { - // Only error case is that all [broadcast::Receivers] have been dropped. - // If so, remove the sender as well. - if let Err(_event) = sender.send(event) { - self.subscribers_topic.remove(&topic_id); + let Some(state) = self.topics.get_mut(&topic_id) else { + // TODO: unreachable? + warn!(?topic_id, "gossip state emitted event for unknown topic"); + continue; + }; + let TopicState { + neighbors, + event_senders, + command_rx_keys, + } = state; + let event = if let ProtoEvent::NeighborUp(neighbor) = event { + let was_empty = neighbors.is_empty(); + neighbors.insert(neighbor); + if was_empty { + GossipEvent::Joined(vec![neighbor]) + } else { + GossipEvent::NeighborUp(neighbor) } + } else { + event.into() + }; + event_senders.send(&event); + if event_senders.is_empty() && command_rx_keys.is_empty() { + self.quit_queue.push_back(topic_id); } } OutEvent::ScheduleTimer(delay, timer) => { self.timers.insert(now + delay, timer); } - OutEvent::DisconnectPeer(peer) => { - if let Some(conn) = self.conns.remove(&peer) { - conn.close(0u8.into(), b"close from disconnect"); + OutEvent::DisconnectPeer(peer_id) => { + if let Some(peer) = self.peers.remove(&peer_id) { + if let Some(conn) = peer.conn_dialed { + conn.close(0u8.into(), b"close from disconnect"); + } + if let Some(conn) = peer.conn_accepted { + conn.close(0u8.into(), b"close from disconnect"); + } + drop(peer.state); } - self.conn_send_tx.remove(&peer); - self.pending_sends.remove(&peer); - self.dialer.abort_dial(peer); } OutEvent::PeerData(node_id, data) => match decode_peer_data(&data) { Err(err) => warn!("Failed to decode {data:?} from {node_id}: {err}"), @@ -604,40 +639,47 @@ impl Actor { } Ok(()) } +} - fn subscribe_all(&mut self) -> broadcast::Receiver<(TopicId, Event)> { - if let Some(tx) = self.subscribers_all.as_mut() { - tx.subscribe() - } else { - let (tx, rx) = broadcast::channel(SUBSCRIBE_ALL_CAP); - self.subscribers_all = Some(tx); - rx - } - } +#[derive(Debug, Default)] +struct PeerInfo { + state: PeerState, + conn_dialed: Option, + conn_accepted: Option, +} - fn subscribe(&mut self, topic_id: TopicId) -> broadcast::Receiver { - if let Some(tx) = self.subscribers_topic.get(&topic_id) { - tx.subscribe() - } else { - let (tx, rx) = broadcast::channel(SUBSCRIBE_TOPIC_CAP); - self.subscribers_topic.insert(topic_id, tx); - rx - } - } +#[derive(Debug)] +enum PeerState { + Pending { queue: Vec }, + Active { send_tx: mpsc::Sender }, } -async fn wait_for_neighbor_up(mut sub: broadcast::Receiver) -> anyhow::Result<()> { - loop { - match sub.recv().await { - Ok(Event::NeighborUp(_neighbor)) => break Ok(()), - Ok(_) | Err(broadcast::error::RecvError::Lagged(_)) => {} - Err(broadcast::error::RecvError::Closed) => { - break Err(anyhow!("Failed to join swarm: channel closed")) - } - } +impl Default for PeerState { + fn default() -> Self { + PeerState::Pending { queue: Vec::new() } } } +#[derive(Debug, 
Default)] +struct TopicState { + neighbors: BTreeSet, + event_senders: EventSenders, + command_rx_keys: HashSet, +} + +/// Whether a connection is initiated by us (Dial) or by the remote peer (Accept) +#[derive(Debug, Clone, Copy)] +enum ConnOrigin { + Accept, + Dial, +} +#[derive(derive_more::Debug)] +struct SubscriberChannels { + event_tx: async_channel::Sender>, + #[debug("CommandStream")] + command_rx: CommandStream, +} + async fn connection_loop( from: PublicKey, conn: Connection, @@ -645,14 +687,20 @@ async fn connection_loop( mut send_rx: mpsc::Receiver, in_event_tx: &mpsc::Sender, max_message_size: usize, + queue: Vec, ) -> anyhow::Result<()> { let (mut send, mut recv) = match origin { ConnOrigin::Accept => conn.accept_bi().await?, ConnOrigin::Dial => conn.open_bi().await?, }; + debug!("connection established"); let mut send_buf = BytesMut::new(); let mut recv_buf = BytesMut::new(); + let send_loop = async { + for msg in queue { + write_message(&mut send, &mut send_buf, &msg, max_message_size).await? + } while let Some(msg) = send_rx.recv().await { write_message(&mut send, &mut send_buf, &msg, max_message_size).await? } @@ -670,7 +718,7 @@ async fn connection_loop( Ok::<_, anyhow::Error>(()) }; - tokio::try_join!(send_loop, recv_loop)?; + (send_loop, recv_loop).try_join().await?; Ok(()) } @@ -690,10 +738,92 @@ fn decode_peer_data(peer_data: &PeerData) -> anyhow::Result { Ok(info) } +#[derive(Debug, Default)] +struct EventSenders { + senders: Vec<(async_channel::Sender>, bool)>, +} + +impl EventSenders { + fn is_empty(&self) -> bool { + self.senders.is_empty() + } + + fn push(&mut self, sender: async_channel::Sender>) { + self.senders.push((sender, false)); + } + + /// Send an event to all subscribers. + /// + /// This will not wait until the sink is full, but send a `Lagged` response if the sink is almost full. + fn send(&mut self, event: &GossipEvent) { + self.senders.retain_mut(|(send, lagged)| { + // If the stream is disconnected, we don't need to send to it. + if send.is_closed() { + return false; + } + + // Check if the send buffer is almost full, and send a lagged response if it is. 
+ let cap = send.capacity().expect("we only use bounded channels"); + let event = if send.len() >= cap - 1 { + if *lagged { + return true; + } + *lagged = true; + Event::Lagged + } else { + *lagged = false; + Event::Gossip(event.clone()) + }; + match send.try_send(Ok(event)) { + Ok(()) => true, + Err(async_channel::TrySendError::Full(_)) => true, + Err(async_channel::TrySendError::Closed(_)) => false, + } + }) + } +} + +#[derive(derive_more::Debug)] +struct TopicCommandStream { + topic_id: TopicId, + #[debug("CommandStream")] + stream: CommandStream, + closed: bool, +} + +impl TopicCommandStream { + fn new(topic_id: TopicId, stream: CommandStream) -> Self { + Self { + topic_id, + stream, + closed: false, + } + } +} + +impl Stream for TopicCommandStream { + type Item = (TopicId, Option); + fn poll_next(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { + if self.closed { + return Poll::Ready(None); + } + match Pin::new(&mut self.stream).poll_next(cx) { + Poll::Ready(Some(item)) => Poll::Ready(Some((self.topic_id, Some(item)))), + Poll::Ready(None) => { + self.closed = true; + Poll::Ready(Some((self.topic_id, None))) + } + Poll::Pending => Poll::Pending, + } + } +} + #[cfg(test)] mod test { use std::time::Duration; + use bytes::Bytes; + use futures_concurrency::future::TryJoin; use iroh_net::key::SecretKey; use iroh_net::relay::{RelayMap, RelayMode}; use tokio::spawn; @@ -764,6 +894,7 @@ mod test { debug!("peer2 {:?}", ep2.node_id()); debug!("peer3 {:?}", ep3.node_id()); let pi1 = ep1.node_id(); + let pi2 = ep2.node_id(); let cancel = CancellationToken::new(); let tasks = [ @@ -774,31 +905,33 @@ mod test { debug!("----- adding peers ----- "); let topic: TopicId = blake3::hash(b"foobar").into(); - // share info that pi1 is on the same relay_node - let addr1 = NodeAddr::new(pi1).with_relay_url(relay_url); + + let addr1 = NodeAddr::new(pi1).with_relay_url(relay_url.clone()); + let addr2 = NodeAddr::new(pi2).with_relay_url(relay_url); ep2.add_node_addr(addr1.clone()).unwrap(); - ep3.add_node_addr(addr1).unwrap(); + ep3.add_node_addr(addr2).unwrap(); debug!("----- joining ----- "); // join the topics and wait for the connection to succeed - go1.join(topic, vec![]).await.unwrap(); - go2.join(topic, vec![pi1]).await.unwrap().await.unwrap(); - go3.join(topic, vec![pi1]).await.unwrap().await.unwrap(); + let [sub1, mut sub2, mut sub3] = [ + go1.join(topic, vec![]), + go2.join(topic, vec![pi1]), + go3.join(topic, vec![pi2]), + ] + .try_join() + .await + .unwrap(); - let len = 2; + let (sink1, _stream1) = sub1.split(); - // subscribe nodes 2 and 3 to the topic - let mut stream2 = go2.subscribe(topic).await.unwrap(); - let mut stream3 = go3.subscribe(topic).await.unwrap(); + let len = 2; // publish messages on node1 let pub1 = spawn(async move { for i in 0..len { let message = format!("hi{}", i); info!("go1 broadcast: {message:?}"); - go1.broadcast(topic, message.into_bytes().into()) - .await - .unwrap(); + sink1.broadcast(message.into_bytes().into()).await.unwrap(); tokio::time::sleep(Duration::from_micros(1)).await; } }); @@ -807,9 +940,9 @@ mod test { let sub2 = spawn(async move { let mut recv = vec![]; loop { - let ev = stream2.recv().await.unwrap(); + let ev = sub2.next().await.unwrap().unwrap(); info!("go2 event: {ev:?}"); - if let Event::Received(msg) = ev { + if let Event::Gossip(GossipEvent::Received(msg)) = ev { recv.push(msg.content); } if recv.len() == len { @@ -822,9 +955,9 @@ mod test { let sub3 = spawn(async move { let mut recv = vec![]; loop { - let ev = 
stream3.recv().await.unwrap(); + let ev = sub3.next().await.unwrap().unwrap(); info!("go3 event: {ev:?}"); - if let Event::Received(msg) = ev { + if let Event::Gossip(GossipEvent::Received(msg)) = ev { recv.push(msg.content); } if recv.len() == len { diff --git a/iroh-gossip/src/net/handles.rs b/iroh-gossip/src/net/handles.rs new file mode 100644 index 0000000000..c082192224 --- /dev/null +++ b/iroh-gossip/src/net/handles.rs @@ -0,0 +1,254 @@ +//! Topic handles for sending and receiving on a gossip topic. +//! +//! These are returned from [`super::Gossip`]. + +use std::{ + collections::{BTreeSet, HashSet}, + pin::Pin, + task::{Context, Poll}, +}; + +use anyhow::{anyhow, Result}; +use bytes::Bytes; +use futures_lite::{Stream, StreamExt}; +use iroh_net::NodeId; +use serde::{Deserialize, Serialize}; + +use crate::{net::TOPIC_EVENTS_DEFAULT_CAP, proto::DeliveryScope}; + +/// Sender for a gossip topic. +#[derive(Debug)] +pub struct GossipSender(async_channel::Sender); + +impl GossipSender { + pub(crate) fn new(sender: async_channel::Sender) -> Self { + Self(sender) + } + + /// Broadcast a message to all nodes. + pub async fn broadcast(&self, message: Bytes) -> anyhow::Result<()> { + self.0 + .send(Command::Broadcast(message)) + .await + .map_err(|_| anyhow!("Gossip actor dropped")) + } + + /// Broadcast a message to our direct neighbors. + pub async fn broadcast_neighbors(&self, message: Bytes) -> anyhow::Result<()> { + self.0 + .send(Command::BroadcastNeighbors(message)) + .await + .map_err(|_| anyhow!("Gossip actor dropped")) + } + + /// Join a set of peers. + pub async fn join_peers(&self, peers: Vec) -> anyhow::Result<()> { + self.0 + .send(Command::JoinPeers(peers)) + .await + .map_err(|_| anyhow!("Gossip actor dropped")) + } +} + +type EventStream = Pin> + Send + 'static>>; + +/// Subscribed gossip topic. +/// +/// This handle is a [`Stream`] of [`Event`]s from the topic, and can be used to send messages. +/// +/// It may be split into sender and receiver parts with [`Self::split`]. +#[derive(Debug)] +pub struct GossipTopic { + sender: GossipSender, + receiver: GossipReceiver, +} + +impl GossipTopic { + pub(crate) fn new(sender: async_channel::Sender, receiver: EventStream) -> Self { + Self { + sender: GossipSender::new(sender), + receiver: GossipReceiver::new(Box::pin(receiver)), + } + } + + /// Splits `self` into [`GossipSender`] and [`GossipReceiver`] parts. + pub fn split(self) -> (GossipSender, GossipReceiver) { + (self.sender, self.receiver) + } + + /// Sends a message to all peers. + pub async fn broadcast(&self, message: Bytes) -> anyhow::Result<()> { + self.sender.broadcast(message).await + } + + /// Sends a message to our direct neighbors in the swarm. + pub async fn broadcast_neighbors(&self, message: Bytes) -> anyhow::Result<()> { + self.sender.broadcast_neighbors(message).await + } + + /// Waits until we are connected to at least one node. + pub async fn joined(&mut self) -> Result<()> { + self.receiver.joined().await + } + + /// Returns true if we are connected to at least one node. + pub fn is_joined(&self) -> bool { + self.receiver.is_joined() + } +} + +impl Stream for GossipTopic { + type Item = Result; + fn poll_next(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { + Pin::new(&mut self.receiver).poll_next(cx) + } +} + +/// Receiver for gossip events on a topic. +/// +/// This is a [`Stream`] of [`Event`]s emitted from the topic. 
+#[derive(derive_more::Debug)] +pub struct GossipReceiver { + #[debug("EventStream")] + stream: EventStream, + neighbors: HashSet, +} + +impl GossipReceiver { + pub(crate) fn new(events_rx: EventStream) -> Self { + Self { + stream: events_rx, + neighbors: Default::default(), + } + } + + /// Lists our current direct neighbors. + pub fn neighbors(&self) -> impl Iterator + '_ { + self.neighbors.iter().copied() + } + + /// Waits until we are connected to at least one node. + pub async fn joined(&mut self) -> Result<()> { + while self.neighbors.is_empty() { + let _ = self.try_next().await?; + } + Ok(()) + } + + /// Returns true if we are connected to at least one node. + pub fn is_joined(&self) -> bool { + !self.neighbors.is_empty() + } +} + +impl Stream for GossipReceiver { + type Item = Result; + fn poll_next(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { + let item = std::task::ready!(Pin::new(&mut self.stream).poll_next(cx)); + if let Some(Ok(item)) = &item { + match item { + Event::Gossip(GossipEvent::Joined(neighbors)) => { + self.neighbors.extend(neighbors.iter().copied()); + } + Event::Gossip(GossipEvent::NeighborUp(node_id)) => { + self.neighbors.insert(*node_id); + } + Event::Gossip(GossipEvent::NeighborDown(node_id)) => { + self.neighbors.remove(node_id); + } + _ => {} + } + } + Poll::Ready(item) + } +} + +/// Update from a subscribed gossip topic. +#[derive(Serialize, Deserialize, Debug, Eq, PartialEq)] +pub enum Event { + /// A message was received. + Gossip(GossipEvent), + /// We missed some messages. + Lagged, +} + +/// Gossip event +/// An event to be emitted to the application for a particular topic. +#[derive(Clone, PartialEq, Eq, PartialOrd, Ord, Debug, Serialize, Deserialize)] +pub enum GossipEvent { + /// We joined the topic with at least one peer. + Joined(Vec), + /// We have a new, direct neighbor in the swarm membership layer for this topic + NeighborUp(NodeId), + /// We dropped direct neighbor in the swarm membership layer for this topic + NeighborDown(NodeId), + /// A gossip message was received for this topic + Received(Message), +} + +impl From> for GossipEvent { + fn from(event: crate::proto::Event) -> Self { + match event { + crate::proto::Event::NeighborUp(node_id) => Self::NeighborUp(node_id), + crate::proto::Event::NeighborDown(node_id) => Self::NeighborDown(node_id), + crate::proto::Event::Received(message) => Self::Received(Message { + content: message.content, + scope: message.scope, + delivered_from: message.delivered_from, + }), + } + } +} + +/// A gossip message +#[derive(Clone, PartialEq, Eq, PartialOrd, Ord, derive_more::Debug, Serialize, Deserialize)] +pub struct Message { + /// The content of the message + #[debug("Bytes({})", self.content.len())] + pub content: Bytes, + /// The scope of the message. + /// This tells us if the message is from a direct neighbor or actual gossip. + pub scope: DeliveryScope, + /// The node that delivered the message. This is not the same as the original author. + pub delivered_from: NodeId, +} + +/// A stream of commands for a gossip subscription. +pub type CommandStream = Pin + Send + Sync + 'static>>; + +/// Send a gossip message +#[derive(Serialize, Deserialize, derive_more::Debug)] +pub enum Command { + /// Broadcast a message to all nodes in the swarm + Broadcast(#[debug("Bytes({})", _0.len())] Bytes), + /// Broadcast a message to all direct neighbors + BroadcastNeighbors(#[debug("Bytes({})", _0.len())] Bytes), + /// Connect to a set of peers + JoinPeers(Vec), +} + +/// Options for joining a gossip topic. 
+#[derive(Serialize, Deserialize, Debug)] +pub struct JoinOptions { + /// The initial bootstrap nodes + pub bootstrap: BTreeSet, + /// The maximum number of messages that can be buffered in a subscription. + /// + /// If this limit is reached, the subscriber will receive a `Lagged` response, + /// the message will be dropped, and the subscriber will be closed. + /// + /// This is to prevent a single slow subscriber from blocking the dispatch loop. + /// If a subscriber is lagging, it should be closed and re-opened. + pub subscription_capacity: usize, +} + +impl JoinOptions { + /// Creates [`JoinOptions`] with the provided bootstrap nodes and the default subscription + /// capacity. + pub fn with_bootstrap(nodes: impl IntoIterator) -> Self { + Self { + bootstrap: nodes.into_iter().collect(), + subscription_capacity: TOPIC_EVENTS_DEFAULT_CAP, + } + } +} diff --git a/iroh-gossip/src/proto/state.rs b/iroh-gossip/src/proto/state.rs index a841342014..b8561aeeef 100644 --- a/iroh-gossip/src/proto/state.rs +++ b/iroh-gossip/src/proto/state.rs @@ -216,11 +216,6 @@ impl State { match event { InEventMapped::TopicEvent(topic, event) => { - // when receiving messages, update our conn map to take note that this topic state may want - // to keep this connection - if let topic::InEvent::RecvMessage(from, _message) = &event { - self.peer_topics.entry(*from).or_default().insert(topic); - } // when receiving a join command, initialize state if it doesn't exist if matches!(&event, topic::InEvent::Command(Command::Join(_peers))) { if let hash_map::Entry::Vacant(e) = self.states.entry(topic) { @@ -239,6 +234,11 @@ impl State { // pass the event to the state handler if let Some(state) = self.states.get_mut(&topic) { + // when receiving messages, update our conn map to take note that this topic state may want + // to keep this connection + if let topic::InEvent::RecvMessage(from, _message) = &event { + self.peer_topics.entry(*from).or_default().insert(topic); + } let out = state.handle(event, now); for event in out { handle_out_event(topic, event, &mut self.peer_topics, &mut self.outbox); diff --git a/iroh/src/client/gossip.rs b/iroh/src/client/gossip.rs index 9f24736365..dce8fd611c 100644 --- a/iroh/src/client/gossip.rs +++ b/iroh/src/client/gossip.rs @@ -55,14 +55,14 @@ impl Client { /// /// Returns a sink to send updates to the topic and a stream of responses. /// - /// Updates are either [Broadcast](iroh_gossip::dispatcher::Command::Broadcast) - /// or [BroadcastNeighbors](iroh_gossip::dispatcher::Command::BroadcastNeighbors). + /// Updates are either [Broadcast](iroh_gossip::net::Command::Broadcast) + /// or [BroadcastNeighbors](iroh_gossip::net::Command::BroadcastNeighbors). /// /// Broadcasts are gossiped to the entire swarm, while BroadcastNeighbors are sent to /// just the immediate neighbors of the node. /// - /// Responses are either [Gossip](iroh_gossip::dispatcher::Event::Gossip) or - /// [Lagged](iroh_gossip::dispatcher::Event::Lagged). + /// Responses are either [Gossip](iroh_gossip::net::Event::Gossip) or + /// [Lagged](iroh_gossip::net::Event::Lagged). /// /// Gossip events contain the actual message content, as well as information about the /// immediate neighbors of the node. 
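
To make the subscribe flow described in these docs concrete, here is a minimal usage sketch. It mirrors the updated `iroh/tests/client.rs` later in this patch; the exact method signatures (for instance, that `subscribe` accepts a `Vec` of bootstrap node ids, and that stream errors convert into `anyhow::Error`) are assumptions, not guaranteed API:

```rust
// Sketch only: names and signatures mirror the test code in this patch
// and may not match the final public API exactly.
use futures_lite::StreamExt;
use futures_util::SinkExt;
use iroh_gossip::{
    net::{Command, Event, GossipEvent},
    proto::TopicId,
};
use iroh_net::NodeId;

async fn gossip_demo(
    client: &iroh::client::Iroh,
    topic: TopicId,
    bootstrap: Vec<NodeId>,
) -> anyhow::Result<()> {
    // `subscribe` returns a sink for updates and a stream of responses.
    let (mut sink, mut stream) = client.gossip().subscribe(topic, bootstrap).await?;

    // Updates: `Broadcast` gossips to the whole swarm,
    // `BroadcastNeighbors` only reaches direct neighbors.
    sink.send(Command::Broadcast("hello".into())).await?;

    // Responses: `Gossip` events carry messages and membership changes,
    // `Lagged` means this subscriber fell behind and missed messages.
    while let Some(event) = stream.next().await {
        match event? {
            Event::Gossip(GossipEvent::Received(msg)) => {
                println!("received: {:?}", msg.content)
            }
            Event::Gossip(other) => println!("membership change: {other:?}"),
            Event::Lagged => eprintln!("missed some messages"),
        }
    }
    Ok(())
}
```
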
diff --git a/iroh/src/metrics.rs b/iroh/src/metrics.rs index fb8e1cc93d..646fddf26d 100644 --- a/iroh/src/metrics.rs +++ b/iroh/src/metrics.rs @@ -74,7 +74,6 @@ pub fn try_init_metrics_collection() -> std::io::Result<()> { metrics.insert(iroh_net::metrics::MagicsockMetrics::new(reg)); metrics.insert(iroh_net::metrics::NetcheckMetrics::new(reg)); metrics.insert(iroh_net::metrics::PortmapMetrics::new(reg)); - metrics.insert(iroh_net::metrics::RelayMetrics::new(reg)); }) } @@ -101,10 +100,6 @@ pub fn get_metrics() -> anyhow::Result> { core.get_collector::(), &mut map, ); - collect( - core.get_collector::(), - &mut map, - ); Ok(map) } diff --git a/iroh/src/node.rs b/iroh/src/node.rs index 182f1dcc50..1ebd808132 100644 --- a/iroh/src/node.rs +++ b/iroh/src/node.rs @@ -46,7 +46,6 @@ use iroh_base::key::PublicKey; use iroh_blobs::store::{GcMarkEvent, GcSweepEvent, Store as BaoStore}; use iroh_blobs::util::local_pool::{LocalPool, LocalPoolHandle}; use iroh_blobs::{downloader::Downloader, protocol::Closed}; -use iroh_gossip::dispatcher::GossipDispatcher; use iroh_gossip::net::Gossip; use iroh_net::key::SecretKey; use iroh_net::{ @@ -116,7 +115,6 @@ struct NodeInner { cancel_token: CancellationToken, client: crate::client::Iroh, downloader: Downloader, - gossip_dispatcher: GossipDispatcher, local_pool_handle: LocalPoolHandle, } diff --git a/iroh/src/node/builder.rs b/iroh/src/node/builder.rs index a208cdaa94..456bf44d5a 100644 --- a/iroh/src/node/builder.rs +++ b/iroh/src/node/builder.rs @@ -15,10 +15,7 @@ use iroh_blobs::{ }; use iroh_docs::engine::DefaultAuthorStorage; use iroh_docs::net::DOCS_ALPN; -use iroh_gossip::{ - dispatcher::GossipDispatcher, - net::{Gossip, GOSSIP_ALPN}, -}; +use iroh_gossip::net::{Gossip, GOSSIP_ALPN}; #[cfg(not(test))] use iroh_net::discovery::local_swarm_discovery::LocalSwarmDiscovery; use iroh_net::{ @@ -557,7 +554,6 @@ where downloader.clone(), ) .await?; - let gossip_dispatcher = GossipDispatcher::new(gossip.clone()); // Initialize the internal RPC connection. 
let (internal_rpc, controller) = quic_rpc::transport::flume::connection::(32); @@ -577,7 +573,6 @@ where cancel_token: CancellationToken::new(), downloader, gossip, - gossip_dispatcher, local_pool_handle: lp.handle().clone(), }); diff --git a/iroh/src/node/rpc.rs b/iroh/src/node/rpc.rs index e51e233ce8..fb6f98d6f3 100644 --- a/iroh/src/node/rpc.rs +++ b/iroh/src/node/rpc.rs @@ -212,14 +212,15 @@ impl Handler { match msg { Subscribe(msg) => { chan.bidi_streaming(msg, self, |handler, req, updates| { - handler.inner.gossip_dispatcher.subscribe_with_opts( + let stream = handler.inner.gossip.join_with_stream( req.topic, - iroh_gossip::dispatcher::SubscribeOptions { + iroh_gossip::net::JoinOptions { bootstrap: req.bootstrap, subscription_capacity: req.subscription_capacity, }, - Box::new(updates), - ) + Box::pin(updates), + ); + futures_util::TryStreamExt::map_err(stream, RpcError::from) }) .await } diff --git a/iroh/src/rpc_protocol/gossip.rs b/iroh/src/rpc_protocol/gossip.rs index f9a64dda5b..8e877ad297 100644 --- a/iroh/src/rpc_protocol/gossip.rs +++ b/iroh/src/rpc_protocol/gossip.rs @@ -9,8 +9,8 @@ use serde::{Deserialize, Serialize}; use super::RpcService; -pub use iroh_gossip::dispatcher::Command as SubscribeUpdate; -pub use iroh_gossip::dispatcher::Event as SubscribeResponse; +pub use iroh_gossip::net::Command as SubscribeUpdate; +pub use iroh_gossip::net::Event as SubscribeResponse; #[allow(missing_docs)] #[derive(strum::Display, Debug, Serialize, Deserialize)] diff --git a/iroh/tests/client.rs b/iroh/tests/client.rs index cd1297f0c9..a22ce04eec 100644 --- a/iroh/tests/client.rs +++ b/iroh/tests/client.rs @@ -3,7 +3,7 @@ use futures_lite::{Stream, StreamExt}; use futures_util::SinkExt; use iroh::client::Iroh; use iroh_gossip::{ - dispatcher::{Command, Event, GossipEvent}, + net::{Command, Event, GossipEvent}, proto::TopicId, }; use iroh_net::{key::SecretKey, NodeAddr}; @@ -11,8 +11,8 @@ use testresult::TestResult; /// Spawn an iroh node in a separate thread and tokio runtime, and return /// the address and client. -fn spawn_node() -> (NodeAddr, Iroh) { - let (sender, receiver) = std::sync::mpsc::channel(); +async fn spawn_node() -> (NodeAddr, Iroh) { + let (sender, receiver) = tokio::sync::oneshot::channel(); std::thread::spawn(move || { let runtime = tokio::runtime::Builder::new_multi_thread() .enable_all() @@ -21,16 +21,18 @@ fn spawn_node() -> (NodeAddr, Iroh) { let secret_key = SecretKey::generate(); let node = iroh::node::Builder::default() .secret_key(secret_key) + .relay_mode(iroh_net::relay::RelayMode::Disabled) + .node_discovery(iroh::node::DiscoveryConfig::None) .spawn() .await?; let addr = node.node_addr().await?; - sender.send((addr, node.client().clone()))?; + sender.send((addr, node.client().clone())).unwrap(); node.cancel_token().cancelled().await; anyhow::Ok(()) })?; anyhow::Ok(()) }); - receiver.recv().unwrap() + receiver.await.unwrap() } /// Await `n` messages from a stream of gossip events. 
@@ -62,15 +64,23 @@ async fn await_messages( #[ignore = "flaky"] async fn gossip_smoke() -> TestResult { let _ = tracing_subscriber::fmt::try_init(); - let (addr1, node1) = spawn_node(); - let (addr2, node2) = spawn_node(); + let (addr1, node1) = spawn_node().await; + let (addr2, node2) = spawn_node().await; let gossip1 = node1.gossip(); let gossip2 = node2.gossip(); node1.add_node_addr(addr2.clone()).await?; node2.add_node_addr(addr1.clone()).await?; + let topic = TopicId::from([0u8; 32]); - let (mut sink1, _stream1) = gossip1.subscribe(topic, [addr2.node_id]).await?; + let (mut sink1, mut stream1) = gossip1.subscribe(topic, [addr2.node_id]).await?; let (_sink2, stream2) = gossip2.subscribe(topic, [addr1.node_id]).await?; + + assert_eq!( + stream1.next().await.unwrap().unwrap(), + Event::Gossip(GossipEvent::Joined(vec![addr2.node_id])) + ); + drop(stream1); + sink1.send(Command::Broadcast("hello".into())).await?; let msgs = await_messages(stream2, 1).await?; assert_eq!(msgs, vec![Bytes::from("hello")]); @@ -78,11 +88,10 @@ async fn gossip_smoke() -> TestResult { } #[tokio::test] -#[ignore = "flaky"] async fn gossip_drop_sink() -> TestResult { let _ = tracing_subscriber::fmt::try_init(); - let (addr1, node1) = spawn_node(); - let (addr2, node2) = spawn_node(); + let (addr1, node1) = spawn_node().await; + let (addr2, node2) = spawn_node().await; let gossip1 = node1.gossip(); let gossip2 = node2.gossip(); node1.add_node_addr(addr2.clone()).await?; @@ -90,9 +99,14 @@ async fn gossip_drop_sink() -> TestResult { let topic = TopicId::from([0u8; 32]); - let (mut sink1, stream1) = gossip1.subscribe(topic, [addr2.node_id]).await?; + let (mut sink1, mut stream1) = gossip1.subscribe(topic, [addr2.node_id]).await?; let (sink2, stream2) = gossip2.subscribe(topic, [addr1.node_id]).await?; + assert_eq!( + stream1.next().await.unwrap().unwrap(), + Event::Gossip(GossipEvent::Joined(vec![addr2.node_id])) + ); + drop(stream1); drop(sink2); From 215cd1d8ffdc4b7fbaeceb792da981c40f59b41a Mon Sep 17 00:00:00 2001 From: Floris Bruynooghe Date: Mon, 5 Aug 2024 22:30:49 +0200 Subject: [PATCH 30/45] fix(iroh): Do not set low max streams in builder (#2593) ## Description This removes the max streams in the builder. This makes little sense when users are allowed to create their custom protocol. right now both uni and bidirectional streams default to 100 max, which is reasonable for now. We should allow fully customising the TransportConfig later. ## Breaking Changes I don't think this counts as a breaking change. If we want to lower one of these later it would be, but we're probably fine with 100 by default? ## Notes & open questions See #2592 ## Change checklist - [x] Self-review. - [x] Documentation updates following the [style guide](https://rust-lang.github.io/rfcs/1574-more-api-documentation-conventions.html#appendix-a-full-conventions-text), if relevant. - [ ] Tests if relevant. - [x] All breaking changes documented. --- iroh/src/node/builder.rs | 7 ------- 1 file changed, 7 deletions(-) diff --git a/iroh/src/node/builder.rs b/iroh/src/node/builder.rs index 456bf44d5a..3ea3330fd8 100644 --- a/iroh/src/node/builder.rs +++ b/iroh/src/node/builder.rs @@ -54,7 +54,6 @@ const ENDPOINT_WAIT: Duration = Duration::from_secs(5); const DEFAULT_GC_INTERVAL: Duration = Duration::from_secs(60 * 5); const MAX_CONNECTIONS: u32 = 1024; -const MAX_STREAMS: u64 = 10; /// Storage backend for documents. 
#[derive(Debug, Clone)] @@ -458,11 +457,6 @@ where ..Default::default() }); let (endpoint, nodes_data_path) = { - let mut transport_config = quinn::TransportConfig::default(); - transport_config - .max_concurrent_bidi_streams(MAX_STREAMS.try_into()?) - .max_concurrent_uni_streams(0u32.into()); - let discovery: Option> = match self.node_discovery { DiscoveryConfig::None => None, DiscoveryConfig::Custom(discovery) => Some(discovery), @@ -501,7 +495,6 @@ where .secret_key(self.secret_key.clone()) .proxy_from_env() .keylog(self.keylog) - .transport_config(transport_config) .concurrent_connections(MAX_CONNECTIONS) .relay_mode(self.relay_mode); let endpoint = match discovery { From 7259ab584d509bde8f45654700a4bd9e74e4405c Mon Sep 17 00:00:00 2001 From: dignifiedquire Date: Mon, 5 Aug 2024 22:52:04 +0200 Subject: [PATCH 31/45] docs: update description in cargo.toml --- iroh/Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/iroh/Cargo.toml b/iroh/Cargo.toml index 69ec85d66b..3c2168a87f 100644 --- a/iroh/Cargo.toml +++ b/iroh/Cargo.toml @@ -3,7 +3,7 @@ name = "iroh" version = "0.21.0" edition = "2021" readme = "README.md" -description = "Bytes. Distributed." +description = "A toolkit for building distributed applications" license = "MIT OR Apache-2.0" authors = ["dignifiedquire ", "n0 team"] repository = "https://github.com/n0-computer/iroh" From d54a5deb099754eaccd28fdb3cc8da93122f1376 Mon Sep 17 00:00:00 2001 From: dignifiedquire Date: Mon, 5 Aug 2024 22:52:25 +0200 Subject: [PATCH 32/45] chore: Release --- CHANGELOG.md | 60 +++++++++++++++++++++++++++++++++++++- Cargo.lock | 22 +++++++------- iroh-base/Cargo.toml | 2 +- iroh-blobs/Cargo.toml | 8 ++--- iroh-cli/Cargo.toml | 8 ++--- iroh-dns-server/Cargo.toml | 6 ++-- iroh-docs/Cargo.toml | 12 ++++---- iroh-gossip/Cargo.toml | 10 +++---- iroh-metrics/Cargo.toml | 2 +- iroh-net/Cargo.toml | 6 ++-- iroh-net/bench/Cargo.toml | 2 +- iroh-test/Cargo.toml | 2 +- iroh/Cargo.toml | 14 ++++----- 13 files changed, 106 insertions(+), 48 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 040d68a893..c8b57d8367 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,7 +2,64 @@ All notable changes to iroh will be documented in this file. 
-## [0.21.0](https://github.com/n0-computer/iroh/compare/v0.20.0..0.21.0) - 2024-07-22 +## [0.22.0](https://github.com/n0-computer/iroh/compare/v0.21.0..0.22.0) - 2024-08-05 + +### ⛰️ Features + +- *(iroh)* Improve documentation and canonicalize docs in `iroh::client` ([#2553](https://github.com/n0-computer/iroh/issues/2553)) - ([d937234](https://github.com/n0-computer/iroh/commit/d937234621791338a65338678badc35345784296)) +- Override to staging relays ([#2551](https://github.com/n0-computer/iroh/issues/2551)) - ([ed4420b](https://github.com/n0-computer/iroh/commit/ed4420b5df75d4cfe3623c3e722f33a8a19449ce)) + +### 🐛 Bug Fixes + +- *(iroh)* Do not set low max streams in builder ([#2593](https://github.com/n0-computer/iroh/issues/2593)) - ([215cd1d](https://github.com/n0-computer/iroh/commit/215cd1d8ffdc4b7fbaeceb792da981c40f59b41a)) +- *(iroh-blobs)* Use async_channel instead of flume for local_pool ([#2533](https://github.com/n0-computer/iroh/issues/2533)) - ([9052905](https://github.com/n0-computer/iroh/commit/9052905d0d75d62c761139f02294d6abc1c53af6)) +- *(iroh-blobs)* Do not hit the network when downloading blobs which are complete ([#2586](https://github.com/n0-computer/iroh/issues/2586)) - ([0784403](https://github.com/n0-computer/iroh/commit/07844031c3e568e34c64a825803c9cd3f91a2035)) +- *(iroh-cli)* [**breaking**] Improve cli and configuration file ([#2532](https://github.com/n0-computer/iroh/issues/2532)) - ([0fc3794](https://github.com/n0-computer/iroh/commit/0fc37942be3d68399fbe45401ba7d67be43a83a6)) +- *(iroh-gossip)* Connection loop misuses `tokio::select!` leading to read errors ([#2572](https://github.com/n0-computer/iroh/issues/2572)) - ([32bb0f3](https://github.com/n0-computer/iroh/commit/32bb0f3be432676ca49473e75c7eb00db32a3673)) +- *(iroh-net)* Fix a compiler error with newer `derive_more` versions ([#2578](https://github.com/n0-computer/iroh/issues/2578)) - ([3f3fec5](https://github.com/n0-computer/iroh/commit/3f3fec5010a97f7d11f00b9c3eb2f05e167a1472)) +- *(iroh-net)* Make a single direct address in NodeAddr instant ([#2580](https://github.com/n0-computer/iroh/issues/2580)) - ([f5b3918](https://github.com/n0-computer/iroh/commit/f5b3918b8d4a0077334980b91ca6339acaa1c55f)) +- Docker image builds ([#2530](https://github.com/n0-computer/iroh/issues/2530)) - ([5c60a52](https://github.com/n0-computer/iroh/commit/5c60a52dd442525852f1b1a0b0f5fc62b463060e)) +- Disable docs preview on forks ([#2558](https://github.com/n0-computer/iroh/issues/2558)) - ([741b42f](https://github.com/n0-computer/iroh/commit/741b42fa4260c94b4e80b633bffdf5add6ee24aa)) +- Force CI to use staging relays ([#2560](https://github.com/n0-computer/iroh/issues/2560)) - ([ffeb1a9](https://github.com/n0-computer/iroh/commit/ffeb1a901387a56a1544ef058a86843f500eb84a)) +- Pin derive_more to avoid sudden breakages ([#2584](https://github.com/n0-computer/iroh/issues/2584)) - ([1ba033c](https://github.com/n0-computer/iroh/commit/1ba033cf0cc601c7ffd4c09822190ddbb2fb8197)) + +### 🚜 Refactor + +- *(iroh)* Remove flume from iroh gossip ([#2542](https://github.com/n0-computer/iroh/issues/2542)) - ([2964569](https://github.com/n0-computer/iroh/commit/29645698ca794d88314ff9c1117e962ec6260650)) +- *(iroh)* Remove flume from iroh-cli and iroh ([#2543](https://github.com/n0-computer/iroh/issues/2543)) - ([347d45c](https://github.com/n0-computer/iroh/commit/347d45c3de3bcba878657566a67f4e1825b03bc4)) +- *(iroh-docs)* Replace flume with async_channel in docs ([#2540](https://github.com/n0-computer/iroh/issues/2540)) - 
([e7a7552](https://github.com/n0-computer/iroh/commit/e7a7552191b71b476cab0a75544f129e657d8dfe)) +- *(iroh-net)* Replace flume in iroh-net with async_channel ([#2539](https://github.com/n0-computer/iroh/issues/2539)) - ([22314a1](https://github.com/n0-computer/iroh/commit/22314a18228799e26de8ba2c0e44b45aec3b2af4)) +- *(iroh-net)* Move more server code behind `iroh-relay` feature flag ([#2566](https://github.com/n0-computer/iroh/issues/2566)) - ([1dda2f7](https://github.com/n0-computer/iroh/commit/1dda2f7ab706cf794d2c8f4e6b47b24caf2f1c78)) +- *(iroh-net)* [**breaking**] Improve server modules structure & rename structs ([#2568](https://github.com/n0-computer/iroh/issues/2568)) - ([29d2e82](https://github.com/n0-computer/iroh/commit/29d2e82a577ebc8cb4029c0df0138fe662031d5c)) +- *(iroh-net)* Switch to (now stable) `IpAddr::to_canonical` ([#2569](https://github.com/n0-computer/iroh/issues/2569)) - ([7fdd6cb](https://github.com/n0-computer/iroh/commit/7fdd6cb64f24c908862ccdf59fb5ca466e0b508f)) + +### 📚 Documentation + +- *(iroh)* Add documentations and examples for the `iroh::node::Client` ([#2582](https://github.com/n0-computer/iroh/issues/2582)) - ([55836fa](https://github.com/n0-computer/iroh/commit/55836fa5ca56fe6964be52046bb0c7f77e62b647)) +- *(iroh-cli)* Point to the configuration refernce from each iroh subcommand ([#2571](https://github.com/n0-computer/iroh/issues/2571)) - ([8e4e586](https://github.com/n0-computer/iroh/commit/8e4e586cece3968700a13562058f3a5c152c1805)) +- Fix typos discovered by codespell ([#2534](https://github.com/n0-computer/iroh/issues/2534)) - ([8435a45](https://github.com/n0-computer/iroh/commit/8435a45e3ee273d5a8dcb083eadc333426024b8b)) +- Update description in cargo.toml - ([7259ab5](https://github.com/n0-computer/iroh/commit/7259ab584d509bde8f45654700a4bd9e74e4405c)) + +### 🧪 Testing + +- *(iroh-blobs)* Comment out ignored test (that is not a flaky test) ([#2559](https://github.com/n0-computer/iroh/issues/2559)) - ([15f36b3](https://github.com/n0-computer/iroh/commit/15f36b373ec3dc86d9a81caeef54f8a165c10001)) +- *(iroh-cli)* Update to new api ([#2549](https://github.com/n0-computer/iroh/issues/2549)) - ([f97c1c0](https://github.com/n0-computer/iroh/commit/f97c1c0858161a8c0e0f64b862aaceea0035d371)) +- *(iroh-cli)* Remove flaky mark from 5 tests and improve logs ([#2562](https://github.com/n0-computer/iroh/issues/2562)) - ([14fccee](https://github.com/n0-computer/iroh/commit/14fcceed53e9633402ba1b978f2002901b615ba8)) +- *(iroh-cli)* Reduce flakyness of cli_provide_file_resume ([#2563](https://github.com/n0-computer/iroh/issues/2563)) - ([f085e63](https://github.com/n0-computer/iroh/commit/f085e633c82531b7d24a70703ae48a2562eccfdd)) +- *(iroh-cli)* Make cli resumption tests not flaky ([#2564](https://github.com/n0-computer/iroh/issues/2564)) - ([9e6b1e0](https://github.com/n0-computer/iroh/commit/9e6b1e0897b15ea7096c95143e11e09e948c862e)) +- *(iroh-net)* Increase timeout for local swarm discovery test ([#2574](https://github.com/n0-computer/iroh/issues/2574)) - ([605a85d](https://github.com/n0-computer/iroh/commit/605a85d9c121f8d2b48f91c2eb1e86cfa451bd22)) + +### ⚙️ Miscellaneous Tasks + +- *(iroh-net)* Remove need for relay info in best_addr ([#2579](https://github.com/n0-computer/iroh/issues/2579)) - ([d662bfc](https://github.com/n0-computer/iroh/commit/d662bfc663ad956bbb38716bd5b8022a699bfce4)) +- Fix clippy warnings ([#2550](https://github.com/n0-computer/iroh/issues/2550)) - 
([73de21b](https://github.com/n0-computer/iroh/commit/73de21b35d6b83def03f51caca06c1931ea8ee77)) +- Generate docs for each PR ([#2547](https://github.com/n0-computer/iroh/issues/2547)) - ([0812333](https://github.com/n0-computer/iroh/commit/081233357d4dbe0cabe890009d674839d9de18be)) + +### Ref + +- *(iroh-net)* Don't write the match as fully exhaustive ([#2585](https://github.com/n0-computer/iroh/issues/2585)) - ([43ef8b6](https://github.com/n0-computer/iroh/commit/43ef8b6e87048f7f28ddb4c2b97d7bf4fe853b90)) + +## [0.21.0](https://github.com/n0-computer/iroh/compare/v0.20.0..v0.21.0) - 2024-07-22 ### ⛰️ Features @@ -42,6 +99,7 @@ All notable changes to iroh will be documented in this file. - *(bytes)* Bytes v1.6.0 was yanked so upgrade to bytes v1.6.1 ([#2503](https://github.com/n0-computer/iroh/issues/2503)) - ([ecfbed3](https://github.com/n0-computer/iroh/commit/ecfbed3d5e1bdaca36ab1ddd2ebcd01a6b286a94)) - Add a flaky tests failure report to our discord notification ([#2496](https://github.com/n0-computer/iroh/issues/2496)) - ([f84c06e](https://github.com/n0-computer/iroh/commit/f84c06eb87ed8b93b1bce71c8502732db7faeedb)) - Keep GitHub Actions up to date with GitHub's Dependabot ([#2498](https://github.com/n0-computer/iroh/issues/2498)) - ([538efbf](https://github.com/n0-computer/iroh/commit/538efbfc6575733114292ddcfdc040adb50a246c)) +- Release - ([1145b34](https://github.com/n0-computer/iroh/commit/1145b34a2f8001a37bcf907626dc8ebd8dd77da4)) ### Deprecation diff --git a/Cargo.lock b/Cargo.lock index 87bf9e4359..40d0580df4 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2475,7 +2475,7 @@ checksum = "8f518f335dce6725a761382244631d86cf0ccb2863413590b31338feb467f9c3" [[package]] name = "iroh" -version = "0.21.0" +version = "0.22.0" dependencies = [ "anyhow", "async-channel", @@ -2531,7 +2531,7 @@ dependencies = [ [[package]] name = "iroh-base" -version = "0.21.0" +version = "0.22.0" dependencies = [ "aead", "anyhow", @@ -2575,7 +2575,7 @@ dependencies = [ [[package]] name = "iroh-blobs" -version = "0.21.0" +version = "0.22.0" dependencies = [ "anyhow", "async-channel", @@ -2627,7 +2627,7 @@ dependencies = [ [[package]] name = "iroh-cli" -version = "0.21.0" +version = "0.22.0" dependencies = [ "anyhow", "async-channel", @@ -2683,7 +2683,7 @@ dependencies = [ [[package]] name = "iroh-dns-server" -version = "0.21.0" +version = "0.22.0" dependencies = [ "anyhow", "async-trait", @@ -2732,7 +2732,7 @@ dependencies = [ [[package]] name = "iroh-docs" -version = "0.21.0" +version = "0.22.0" dependencies = [ "anyhow", "async-channel", @@ -2773,7 +2773,7 @@ dependencies = [ [[package]] name = "iroh-gossip" -version = "0.21.0" +version = "0.22.0" dependencies = [ "anyhow", "async-channel", @@ -2817,7 +2817,7 @@ dependencies = [ [[package]] name = "iroh-metrics" -version = "0.21.0" +version = "0.22.0" dependencies = [ "anyhow", "erased_set", @@ -2836,7 +2836,7 @@ dependencies = [ [[package]] name = "iroh-net" -version = "0.21.0" +version = "0.22.0" dependencies = [ "anyhow", "async-channel", @@ -2931,7 +2931,7 @@ dependencies = [ [[package]] name = "iroh-net-bench" -version = "0.21.0" +version = "0.22.0" dependencies = [ "anyhow", "bytes", @@ -2997,7 +2997,7 @@ dependencies = [ [[package]] name = "iroh-test" -version = "0.21.0" +version = "0.22.0" dependencies = [ "anyhow", "tokio", diff --git a/iroh-base/Cargo.toml b/iroh-base/Cargo.toml index c6fbc9b2df..30a7a8a841 100644 --- a/iroh-base/Cargo.toml +++ b/iroh-base/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "iroh-base" -version = "0.21.0" +version = 
"0.22.0" edition = "2021" readme = "README.md" description = "base type and utilities for Iroh" diff --git a/iroh-blobs/Cargo.toml b/iroh-blobs/Cargo.toml index d084a35760..040da24aa2 100644 --- a/iroh-blobs/Cargo.toml +++ b/iroh-blobs/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "iroh-blobs" -version = "0.21.0" +version = "0.22.0" edition = "2021" readme = "README.md" description = "blob and collection transfer support for iroh" @@ -28,10 +28,10 @@ futures-lite = "2.3" genawaiter = { version = "0.99.1", features = ["futures03"] } hashlink = { version = "0.9.0", optional = true } hex = "0.4.3" -iroh-base = { version = "0.21.0", features = ["redb"], path = "../iroh-base" } +iroh-base = { version = "0.22.0", features = ["redb"], path = "../iroh-base" } iroh-io = { version = "0.6.0", features = ["stats"] } -iroh-metrics = { version = "0.21.0", path = "../iroh-metrics", default-features = false } -iroh-net = { version = "0.21.0", path = "../iroh-net" } +iroh-metrics = { version = "0.22.0", path = "../iroh-metrics", default-features = false } +iroh-net = { version = "0.22.0", path = "../iroh-net" } num_cpus = "1.15.0" parking_lot = { version = "0.12.1", optional = true } pin-project = "1.1.5" diff --git a/iroh-cli/Cargo.toml b/iroh-cli/Cargo.toml index 512d5da2a6..7a330dcf6c 100644 --- a/iroh-cli/Cargo.toml +++ b/iroh-cli/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "iroh-cli" -version = "0.21.0" +version = "0.22.0" edition = "2021" readme = "README.md" description = "Bytes. Distributed." @@ -40,9 +40,9 @@ futures-util = { version = "0.3.30", features = ["futures-sink"] } hex = "0.4.3" human-time = "0.1.6" indicatif = { version = "0.17", features = ["tokio"] } -iroh = { version = "0.21.0", path = "../iroh", features = ["metrics"] } -iroh-gossip = { version = "0.21.0", path = "../iroh-gossip" } -iroh-metrics = { version = "0.21.0", path = "../iroh-metrics" } +iroh = { version = "0.22.0", path = "../iroh", features = ["metrics"] } +iroh-gossip = { version = "0.22.0", path = "../iroh-gossip" } +iroh-metrics = { version = "0.22.0", path = "../iroh-metrics" } parking_lot = "0.12.1" pkarr = { version = "1.1.5", default-features = false } portable-atomic = "1" diff --git a/iroh-dns-server/Cargo.toml b/iroh-dns-server/Cargo.toml index 0a65b68755..17c4a93b80 100644 --- a/iroh-dns-server/Cargo.toml +++ b/iroh-dns-server/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "iroh-dns-server" -version = "0.21.0" +version = "0.22.0" edition = "2021" description = "A pkarr relay and DNS server" license = "MIT OR Apache-2.0" @@ -24,7 +24,7 @@ governor = "0.6.3" hickory-proto = "0.24.0" hickory-server = { version = "0.24.0", features = ["dns-over-rustls"] } http = "1.0.0" -iroh-metrics = { version = "0.21.0", path = "../iroh-metrics" } +iroh-metrics = { version = "0.22.0", path = "../iroh-metrics" } lru = "0.12.3" mainline = "2.0.1" parking_lot = "0.12.1" @@ -53,7 +53,7 @@ z32 = "1.1.1" [dev-dependencies] hickory-resolver = "0.24.0" -iroh-net = { version = "0.21.0", path = "../iroh-net" } +iroh-net = { version = "0.22.0", path = "../iroh-net" } iroh-test = { path = "../iroh-test" } pkarr = { version = "2.0.0", features = ["rand"] } diff --git a/iroh-docs/Cargo.toml b/iroh-docs/Cargo.toml index f50afda51c..c6a593640c 100644 --- a/iroh-docs/Cargo.toml +++ b/iroh-docs/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "iroh-docs" -version = "0.21.0" +version = "0.22.0" edition = "2021" readme = "README.md" description = "Iroh sync" @@ -25,11 +25,11 @@ futures-buffered = "0.2.4" futures-lite = "2.3.0" futures-util = { version = 
"0.3.25" } hex = "0.4" -iroh-base = { version = "0.21.0", path = "../iroh-base" } -iroh-blobs = { version = "0.21.0", path = "../iroh-blobs", optional = true, features = ["downloader"] } -iroh-gossip = { version = "0.21.0", path = "../iroh-gossip", optional = true } -iroh-metrics = { version = "0.21.0", path = "../iroh-metrics", default-features = false } -iroh-net = { version = "0.21.0", optional = true, path = "../iroh-net" } +iroh-base = { version = "0.22.0", path = "../iroh-base" } +iroh-blobs = { version = "0.22.0", path = "../iroh-blobs", optional = true, features = ["downloader"] } +iroh-gossip = { version = "0.22.0", path = "../iroh-gossip", optional = true } +iroh-metrics = { version = "0.22.0", path = "../iroh-metrics", default-features = false } +iroh-net = { version = "0.22.0", optional = true, path = "../iroh-net" } lru = "0.12" num_enum = "0.7" postcard = { version = "1", default-features = false, features = ["alloc", "use-std", "experimental-derive"] } diff --git a/iroh-gossip/Cargo.toml b/iroh-gossip/Cargo.toml index b39e4d76a8..6b1b92dc70 100644 --- a/iroh-gossip/Cargo.toml +++ b/iroh-gossip/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "iroh-gossip" -version = "0.21.0" +version = "0.22.0" edition = "2021" readme = "README.md" description = "gossip messages over broadcast trees" @@ -25,9 +25,9 @@ futures-concurrency = { version = "7.6.1", optional = true } futures-lite = { version = "2.3", optional = true } futures-util = { version = "0.3.30", optional = true } indexmap = "2.0" -iroh-base = { version = "0.21.0", path = "../iroh-base" } -iroh-metrics = { version = "0.21.0", path = "../iroh-metrics" } -iroh-net = { path = "../iroh-net", version = "0.21.0", optional = true, default-features = false } +iroh-base = { version = "0.22.0", path = "../iroh-base" } +iroh-metrics = { version = "0.22.0", path = "../iroh-metrics" } +iroh-net = { path = "../iroh-net", version = "0.22.0", optional = true, default-features = false } postcard = { version = "1", default-features = false, features = ["alloc", "use-std", "experimental-derive"] } rand = { version = "0.8.5", features = ["std_rng"] } rand_core = "0.6.4" @@ -38,7 +38,7 @@ tracing = "0.1" [dev-dependencies] clap = { version = "4", features = ["derive"] } -iroh-net = { path = "../iroh-net", version = "0.21.0", default-features = false, features = ["test-utils"] } +iroh-net = { path = "../iroh-net", version = "0.22.0", default-features = false, features = ["test-utils"] } iroh-test = { path = "../iroh-test" } rand_chacha = "0.3.1" tracing-subscriber = { version = "0.3", features = ["env-filter"] } diff --git a/iroh-metrics/Cargo.toml b/iroh-metrics/Cargo.toml index 05c30768d4..8619ce025b 100644 --- a/iroh-metrics/Cargo.toml +++ b/iroh-metrics/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "iroh-metrics" -version = "0.21.0" +version = "0.22.0" edition = "2021" readme = "README.md" description = "metrics for iroh" diff --git a/iroh-net/Cargo.toml b/iroh-net/Cargo.toml index 5145d35a83..757b3a7e00 100644 --- a/iroh-net/Cargo.toml +++ b/iroh-net/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "iroh-net" -version = "0.21.0" +version = "0.22.0" edition = "2021" readme = "README.md" description = "networking support for iroh" @@ -39,7 +39,7 @@ http-body-util = "0.1.0" hyper = { version = "1", features = ["server", "client", "http1"] } hyper-util = "0.1.1" igd-next = { version = "0.14.3", features = ["aio_tokio"] } -iroh-base = { version = "0.21.0", path = "../iroh-base", features = ["key"] } +iroh-base = { version = "0.22.0", path = "../iroh-base", 
features = ["key"] } libc = "0.2.139" num_enum = "0.7" once_cell = "1.18.0" @@ -89,7 +89,7 @@ tracing-subscriber = { version = "0.3", features = ["env-filter"], optional = tr tokio-rustls-acme = { version = "0.3", optional = true } # metrics -iroh-metrics = { version = "0.21.0", path = "../iroh-metrics", default-features = false } +iroh-metrics = { version = "0.22.0", path = "../iroh-metrics", default-features = false } strum = { version = "0.26.2", features = ["derive"] } [target.'cfg(any(target_os = "linux", target_os = "android"))'.dependencies] diff --git a/iroh-net/bench/Cargo.toml b/iroh-net/bench/Cargo.toml index 28a401bd40..9bc7ad7990 100644 --- a/iroh-net/bench/Cargo.toml +++ b/iroh-net/bench/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "iroh-net-bench" -version = "0.21.0" +version = "0.22.0" edition = "2021" license = "MIT OR Apache-2.0" publish = false diff --git a/iroh-test/Cargo.toml b/iroh-test/Cargo.toml index c90305b291..fe592c6fbe 100644 --- a/iroh-test/Cargo.toml +++ b/iroh-test/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "iroh-test" -version = "0.21.0" +version = "0.22.0" edition = "2021" readme = "README.md" description = "Internal utilities to support testing of iroh." diff --git a/iroh/Cargo.toml b/iroh/Cargo.toml index 3c2168a87f..3a4264a54a 100644 --- a/iroh/Cargo.toml +++ b/iroh/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "iroh" -version = "0.21.0" +version = "0.22.0" edition = "2021" readme = "README.md" description = "A toolkit for building distributed applications" @@ -27,16 +27,16 @@ futures-lite = "2.3" futures-util = "0.3" genawaiter = { version = "0.99", default-features = false, features = ["futures03"] } hex = { version = "0.4.3" } -iroh-blobs = { version = "0.21.0", path = "../iroh-blobs", features = ["downloader"] } -iroh-base = { version = "0.21.0", path = "../iroh-base", features = ["key"] } +iroh-blobs = { version = "0.22.0", path = "../iroh-blobs", features = ["downloader"] } +iroh-base = { version = "0.22.0", path = "../iroh-base", features = ["key"] } iroh-io = { version = "0.6.0", features = ["stats"] } -iroh-metrics = { version = "0.21.0", path = "../iroh-metrics", optional = true } -iroh-net = { version = "0.21.0", path = "../iroh-net", features = ["local_swarm_discovery"] } +iroh-metrics = { version = "0.22.0", path = "../iroh-metrics", optional = true } +iroh-net = { version = "0.22.0", path = "../iroh-net", features = ["local_swarm_discovery"] } nested_enum_utils = "0.1.0" num_cpus = { version = "1.15.0" } portable-atomic = "1" -iroh-docs = { version = "0.21.0", path = "../iroh-docs" } -iroh-gossip = { version = "0.21.0", path = "../iroh-gossip" } +iroh-docs = { version = "0.22.0", path = "../iroh-docs" } +iroh-gossip = { version = "0.22.0", path = "../iroh-gossip" } parking_lot = "0.12.1" postcard = { version = "1", default-features = false, features = ["alloc", "use-std", "experimental-derive"] } quic-rpc = { version = "0.11", default-features = false, features = ["flume-transport", "quinn-transport"] } From 3f2eb1853aa9acfa07441909b703cb3aab2a876c Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Fri, 9 Aug 2024 11:52:19 +0200 Subject: [PATCH 33/45] chore(deps): bump EmbarkStudios/cargo-deny-action from 1 to 2 in the github-actions group (#2591) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Bumps the github-actions group with 1 update: [EmbarkStudios/cargo-deny-action](https://github.com/embarkstudios/cargo-deny-action). 
Updates `EmbarkStudios/cargo-deny-action` from 1 to 2
Release notes (sourced from EmbarkStudios/cargo-deny-action's releases):

Release 2.0.1 - cargo-deny 0.16.1

Fixed

  • PR#691 fixed an issue where workspace dependencies that used the current dir '.' path component would incorrectly trigger the unused-workspace-dependency lint.

Release 2.0.0 - cargo-deny 0.16.0

Action

Changed

  • This release includes breaking changes in cargo-deny, so this release begins the v2 tag, using v1 will be stable but not follow future cargo-deny releases.

cargo-deny

Removed

  • PR#681 finished the deprecation introduced in PR#611, making the usage of the deprecated fields into errors.

[advisories]

The following fields have all been removed in favor of denying all advisories by default. To ignore an advisory the ignore field can be used as before.

  • vulnerability - Vulnerability advisories are now deny by default
  • unmaintained - Unmaintained advisories are now deny by default
  • unsound - Unsound advisories are now deny by default
  • notice - Notice advisories are now deny by default
  • severity-threshold - The severity of vulnerabilities is now irrelevant

[licenses]

The following fields have all been removed in favor of denying all licenses that are not explicitly allowed via either allow or exceptions.

  • unlicensed - Crates whose license(s) cannot be confidently determined are now always errors. The clarify field can be used to help cargo-deny determine the license.
  • allow-osi-fsf-free - The OSI/FSF Free attributes are now irrelevant, only whether it is explicitly allowed.
  • copyleft - The copyleft attribute is now irrelevant, only whether it is explicitly allowed.
  • default - The default is now deny.
  • deny - All licenses are now denied by default, this field added nothing.

Changed

  • PR#685 follows up on PR#673, moving the fields that were added to their own separate bans.workspace-dependencies section. This is an unannounced breaking change but is fairly minor and 0.15.0 was never released on github actions so the amount of people affected by this will be (hopefully) small. This also makes the workspace duplicate detection off by default since the field is optional, but makes it so that if not specified workspace duplicates are now deny instead of warn.

Fixed

  • PR#685 resolved #682 by adding the include-path-dependencies field, allowing path dependencies to be ignored if it is false.

Release 1.6.3 - cargo-deny 0.14.21

Fixed

  • PR#643 resolved #629 by making the hosted git (github, gitlab, bitbucket) org/user name comparison case-insensitive. Thanks @pmnlla!
  • PR#649 fixed an issue where depending on the same crate multiple times by using different cfg()/triple targets could cause features to be resolved incorrectly and thus crates to be not pulled into the graph used for checking.

[0.14.20] - 2024-03-23

Fixed

  • PR#642 resolved #641 by pinning gix-transport (and its unique dependencies) to 0.41.2 as a workaround for cargo install not using the lockfile. See this issue for more information.

... (truncated)
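
For crates consuming this bump, the removals above mostly translate into deletions from `deny.toml`. A minimal sketch of a v2-style config, assuming only the field names taken from the notes above (the advisory ID and license list are placeholders):

```toml
# Hypothetical deny.toml shape after migrating to cargo-deny 2.x.

[advisories]
# vulnerability, unmaintained, unsound, notice and severity-threshold are
# removed: every advisory is now denied unless explicitly ignored.
ignore = ["RUSTSEC-0000-0000"] # placeholder advisory ID

[licenses]
# unlicensed, allow-osi-fsf-free, copyleft, default and deny are removed:
# a license must be explicitly allowed (or listed under exceptions).
allow = ["MIT", "Apache-2.0"]
```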

Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Co-authored-by: Floris Bruynooghe --- .github/workflows/ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 689c48e048..6c4d27ac99 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -299,7 +299,7 @@ jobs: runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 - - uses: EmbarkStudios/cargo-deny-action@v1 + - uses: EmbarkStudios/cargo-deny-action@v2 with: arguments: --workspace --all-features command: check From 2e937a834d25c6ea003f6666099d73f72f3e09f3 Mon Sep 17 00:00:00 2001 From: Floris Bruynooghe Date: Fri, 9 Aug 2024 12:12:06 +0200 Subject: [PATCH 34/45] ref(iroh-net): Move PathState to its own module (#2587) # Description There are no functional changes, this only moves code and fixes up imports. ## Breaking Changes None ## Notes & open questions This is on top of #2580, which needs to be merged first. ## Change checklist - [x] Self-review. - [x] Documentation updates following the [style guide](https://rust-lang.github.io/rfcs/1574-more-api-documentation-conventions.html#appendix-a-full-conventions-text), if relevant. - [x] Tests if relevant. - [x] All breaking changes documented. --- iroh-net/src/magicsock/node_map.rs | 1 + iroh-net/src/magicsock/node_map/node_state.rs | 281 +---------------- iroh-net/src/magicsock/node_map/path_state.rs | 284 ++++++++++++++++++ iroh-net/src/magicsock/node_map/udp_paths.rs | 3 +- 4 files changed, 295 insertions(+), 274 deletions(-) create mode 100644 iroh-net/src/magicsock/node_map/path_state.rs diff --git a/iroh-net/src/magicsock/node_map.rs b/iroh-net/src/magicsock/node_map.rs index 3550f34bfb..a91510674c 100644 --- a/iroh-net/src/magicsock/node_map.rs +++ b/iroh-net/src/magicsock/node_map.rs @@ -30,6 +30,7 @@ use crate::{ mod best_addr; mod node_state; +mod path_state; mod udp_paths; pub use node_state::{ConnectionType, ControlMsg, DirectAddrInfo, NodeInfo}; diff --git a/iroh-net/src/magicsock/node_map/node_state.rs b/iroh-net/src/magicsock/node_map/node_state.rs index 375fb706d7..c387b97647 100644 --- a/iroh-net/src/magicsock/node_map/node_state.rs +++ b/iroh-net/src/magicsock/node_map/node_state.rs @@ -1,4 +1,4 @@ -use std::collections::{btree_map::Entry, BTreeMap, BTreeSet, HashMap}; +use std::collections::{btree_map::Entry, BTreeSet, HashMap}; use std::hash::Hash; use std::net::{IpAddr, SocketAddr}; use std::time::{Duration, Instant}; @@ -19,6 +19,7 @@ use crate::util::relay_only_mode; use crate::{stun, NodeAddr, NodeId}; use super::best_addr::{self, ClearReason, Source}; +use super::path_state::{summarize_node_paths, PathState}; use super::udp_paths::{NodeUdpPaths, UdpSendAddr}; use super::IpPort; @@ -33,16 +34,12 @@ const LAST_ALIVE_PRUNE_DURATION: Duration = Duration::from_secs(120); /// How long we wait for a pong reply before assuming it's never coming. const PING_TIMEOUT_DURATION: Duration = Duration::from_secs(5); -/// The minimum time between pings to an endpoint. (Except in the case of CallMeMaybe frames -/// resetting the counter, as the first pings likely didn't through the firewall) -const DISCO_PING_INTERVAL: Duration = Duration::from_secs(5); - /// The latency at or under which we don't try to upgrade to a better path. const GOOD_ENOUGH_LATENCY: Duration = Duration::from_millis(5); /// How long since the last activity we try to keep an established endpoint peering alive. 
/// It's also the idle time at which we stop doing STUN queries to keep NAT mappings alive. -const SESSION_ACTIVE_TIMEOUT: Duration = Duration::from_secs(45); +pub(super) const SESSION_ACTIVE_TIMEOUT: Duration = Duration::from_secs(45); /// How often we try to upgrade to a better patheven if we have some non-relay route that works. const UPGRADE_INTERVAL: Duration = Duration::from_secs(60); @@ -1130,267 +1127,6 @@ impl From for NodeAddr { } } -/// State about a particular path to another [`NodeState`]. -/// -/// This state is used for both the relay path and any direct UDP paths. -#[derive(Debug, Clone, PartialEq, Eq, Hash)] -pub(super) struct PathState { - /// The node for which this path exists. - node_id: NodeId, - /// The path this applies for. - path: SendAddr, - /// The last (outgoing) ping time. - last_ping: Option, - - /// If non-zero, means that this was an endpoint that we learned about at runtime (from an - /// incoming ping). If so, we keep the time updated and use it to discard old candidates. - // NOTE: tx_id Originally added in tailscale due to . - last_got_ping: Option<(Instant, stun::TransactionId)>, - - /// If non-zero, is the time this endpoint was advertised last via a call-me-maybe disco message. - call_me_maybe_time: Option, - - /// Last [`PongReply`] received. - pub(super) recent_pong: Option, - /// When was this endpoint last used to transmit payload data (removing ping, pong, etc). - pub(super) last_payload_msg: Option, -} - -impl PathState { - fn new(node_id: NodeId, path: SendAddr) -> Self { - Self { - node_id, - path, - last_ping: None, - last_got_ping: None, - call_me_maybe_time: None, - recent_pong: None, - last_payload_msg: None, - } - } - - pub(super) fn udp_addr(&self) -> Option { - match self.path { - SendAddr::Udp(addr) => Some(addr), - SendAddr::Relay(_) => None, - } - } - - pub(super) fn with_last_payload(node_id: NodeId, path: SendAddr, now: Instant) -> Self { - PathState { - node_id, - path, - last_ping: None, - last_got_ping: None, - call_me_maybe_time: None, - recent_pong: None, - last_payload_msg: Some(now), - } - } - - pub(super) fn with_ping( - node_id: NodeId, - path: SendAddr, - tx_id: stun::TransactionId, - now: Instant, - ) -> Self { - let mut new = PathState::new(node_id, path); - new.handle_ping(tx_id, now); - new - } - - pub(super) fn add_pong_reply(&mut self, r: PongReply) { - if let SendAddr::Udp(ref path) = self.path { - if self.recent_pong.is_none() { - event!( - target: "events.net.holepunched", - Level::DEBUG, - node = %self.node_id.fmt_short(), - path = ?path, - direction = "outgoing", - ); - } - } - self.recent_pong = Some(r); - } - - #[cfg(test)] - pub(super) fn with_pong_reply(node_id: NodeId, r: PongReply) -> Self { - PathState { - node_id, - path: r.from.clone(), - last_ping: None, - last_got_ping: None, - call_me_maybe_time: None, - recent_pong: Some(r), - last_payload_msg: None, - } - } - - /// Check whether this path is considered active. - /// - /// Active means the path has received payload messages within the last - /// [`SESSION_ACTIVE_TIMEOUT`]. - /// - /// Note that a path might be alive but not active if it's contactable but not in - /// use. - pub(super) fn is_active(&self) -> bool { - self.last_payload_msg - .as_ref() - .map(|instant| instant.elapsed() <= SESSION_ACTIVE_TIMEOUT) - .unwrap_or(false) - } - - /// Returns the instant the last incoming ping was received. 
- pub(super) fn last_incoming_ping(&self) -> Option<&Instant> { - self.last_got_ping.as_ref().map(|(time, _tx_id)| time) - } - - /// Reports the last instant this path was considered alive. - /// - /// Alive means the path is considered in use by the remote endpoint. Either because we - /// received a payload message, a DISCO message (ping, pong) or it was advertised in a - /// call-me-maybe message. - /// - /// This is the most recent instant between: - /// - when last pong was received. - /// - when this path was last advertised in a received CallMeMaybe message. - /// - When the last payload transmission occurred. - /// - when the last ping from them was received. - pub(super) fn last_alive(&self) -> Option { - self.recent_pong() - .map(|pong| &pong.pong_at) - .into_iter() - .chain(self.last_payload_msg.as_ref()) - .chain(self.call_me_maybe_time.as_ref()) - .chain(self.last_incoming_ping()) - .max() - .copied() - } - - pub(super) fn last_control_msg(&self, now: Instant) -> Option<(Duration, ControlMsg)> { - // get every control message and assign it its kind - let last_pong = self - .recent_pong() - .map(|pong| (pong.pong_at, ControlMsg::Pong)); - let last_call_me_maybe = self - .call_me_maybe_time - .as_ref() - .map(|call_me| (*call_me, ControlMsg::CallMeMaybe)); - let last_ping = self - .last_incoming_ping() - .map(|ping| (*ping, ControlMsg::Ping)); - - last_pong - .into_iter() - .chain(last_call_me_maybe) - .chain(last_ping) - .max_by_key(|(instant, _kind)| *instant) - .map(|(instant, kind)| (now.duration_since(instant), kind)) - } - - /// Returns the most recent pong if available. - pub(super) fn recent_pong(&self) -> Option<&PongReply> { - self.recent_pong.as_ref() - } - - /// Returns the latency from the most recent pong, if available. - fn latency(&self) -> Option { - self.recent_pong.as_ref().map(|p| p.latency) - } - - fn needs_ping(&self, now: &Instant) -> bool { - match self.last_ping { - None => true, - Some(last_ping) => { - let elapsed = now.duration_since(last_ping); - - // TODO: remove! - // This logs "ping is too new" for each send whenever the endpoint does *not* need - // a ping. Pretty sure this is not a useful log, but maybe there was a reason? 
- // if !needs_ping { - // debug!("ping is too new: {}ms", elapsed.as_millis()); - // } - elapsed > DISCO_PING_INTERVAL - } - } - } - - fn handle_ping(&mut self, tx_id: stun::TransactionId, now: Instant) -> PingRole { - if Some(&tx_id) == self.last_got_ping.as_ref().map(|(_t, tx_id)| tx_id) { - PingRole::Duplicate - } else { - let prev = self.last_got_ping.replace((now, tx_id)); - let heartbeat_deadline = HEARTBEAT_INTERVAL + (HEARTBEAT_INTERVAL / 2); - match prev { - Some((prev_time, _tx)) if now.duration_since(prev_time) <= heartbeat_deadline => { - PingRole::LikelyHeartbeat - } - Some((prev_time, _tx)) => { - debug!( - elapsed = ?now.duration_since(prev_time), - "heartbeat missed, reactivating", - ); - PingRole::Activate - } - None => { - if let SendAddr::Udp(ref addr) = self.path { - event!( - target: "events.net.holepunched", - Level::DEBUG, - node = %self.node_id.fmt_short(), - path = ?addr, - direction = "incoming", - ); - } - PingRole::Activate - } - } - } - } - - fn clear(&mut self) { - self.last_ping = None; - self.last_got_ping = None; - self.call_me_maybe_time = None; - self.recent_pong = None; - } - - fn summary(&self, mut w: impl std::fmt::Write) -> std::fmt::Result { - write!(w, "{{ ")?; - if self.is_active() { - write!(w, "active ")?; - } - if let Some(ref pong) = self.recent_pong { - write!(w, "pong-received({:?} ago) ", pong.pong_at.elapsed())?; - } - if let Some(when) = self.last_incoming_ping() { - write!(w, "ping-received({:?} ago) ", when.elapsed())?; - } - if let Some(ref when) = self.last_ping { - write!(w, "ping-sent({:?} ago) ", when.elapsed())?; - } - write!(w, "}}") - } -} - -// TODO: Make an `EndpointPaths` struct and do things nicely. -fn summarize_node_paths(paths: &BTreeMap) -> String { - use std::fmt::Write; - - let mut w = String::new(); - write!(&mut w, "[").ok(); - for (i, (ipp, state)) in paths.iter().enumerate() { - if i > 0 { - write!(&mut w, ", ").ok(); - } - write!(&mut w, "{ipp}").ok(); - state.summary(&mut w).ok(); - } - write!(&mut w, "]").ok(); - w -} - /// Whether to send a call-me-maybe message after sending pings to all known paths. /// /// `IfNoRecent` will only send a call-me-maybe if no previous one was sent in the last @@ -1555,15 +1291,14 @@ pub enum ConnectionType { #[cfg(test)] mod tests { - use std::net::Ipv4Addr; + use std::{collections::BTreeMap, net::Ipv4Addr}; + + use crate::key::SecretKey; + use crate::magicsock::node_map::{NodeMap, NodeMapInner}; use best_addr::BestAddr; - use super::{ - super::{NodeMap, NodeMapInner}, - *, - }; - use crate::key::SecretKey; + use super::*; #[test] fn test_endpoint_infos() { diff --git a/iroh-net/src/magicsock/node_map/path_state.rs b/iroh-net/src/magicsock/node_map/path_state.rs new file mode 100644 index 0000000000..6121d8242d --- /dev/null +++ b/iroh-net/src/magicsock/node_map/path_state.rs @@ -0,0 +1,284 @@ +//! The state kept for each network path to a remote node. + +use std::collections::BTreeMap; +use std::net::SocketAddr; +use std::time::{Duration, Instant}; + +use iroh_base::key::NodeId; +use tracing::{debug, event, Level}; + +use crate::disco::SendAddr; +use crate::magicsock::HEARTBEAT_INTERVAL; +use crate::stun; + +use super::node_state::{PongReply, SESSION_ACTIVE_TIMEOUT}; +use super::{ControlMsg, IpPort, PingRole}; + +/// The minimum time between pings to an endpoint. +/// +/// Except in the case of CallMeMaybe frames resetting the counter, as the first pings +/// likely didn't through the firewall. 
+const DISCO_PING_INTERVAL: Duration = Duration::from_secs(5); + +/// State about a particular path to another [`NodeState`]. +/// +/// This state is used for both the relay path and any direct UDP paths. +/// +/// [`NodeState`]: super::node_state::NodeState +#[derive(Debug, Clone, PartialEq, Eq, Hash)] +pub(super) struct PathState { + /// The node for which this path exists. + node_id: NodeId, + /// The path this applies for. + path: SendAddr, + /// The last (outgoing) ping time. + pub(super) last_ping: Option, + + /// If non-zero, means that this was an endpoint that we learned about at runtime (from an + /// incoming ping). If so, we keep the time updated and use it to discard old candidates. + // NOTE: tx_id Originally added in tailscale due to . + last_got_ping: Option<(Instant, stun::TransactionId)>, + + /// If non-zero, is the time this endpoint was advertised last via a call-me-maybe disco message. + pub(super) call_me_maybe_time: Option, + + /// Last [`PongReply`] received. + pub(super) recent_pong: Option, + /// When was this endpoint last used to transmit payload data (removing ping, pong, etc). + pub(super) last_payload_msg: Option, +} + +impl PathState { + pub(super) fn new(node_id: NodeId, path: SendAddr) -> Self { + Self { + node_id, + path, + last_ping: None, + last_got_ping: None, + call_me_maybe_time: None, + recent_pong: None, + last_payload_msg: None, + } + } + + pub(super) fn udp_addr(&self) -> Option { + match self.path { + SendAddr::Udp(addr) => Some(addr), + SendAddr::Relay(_) => None, + } + } + + pub(super) fn with_last_payload(node_id: NodeId, path: SendAddr, now: Instant) -> Self { + PathState { + node_id, + path, + last_ping: None, + last_got_ping: None, + call_me_maybe_time: None, + recent_pong: None, + last_payload_msg: Some(now), + } + } + + pub(super) fn with_ping( + node_id: NodeId, + path: SendAddr, + tx_id: stun::TransactionId, + now: Instant, + ) -> Self { + let mut new = PathState::new(node_id, path); + new.handle_ping(tx_id, now); + new + } + + pub(super) fn add_pong_reply(&mut self, r: PongReply) { + if let SendAddr::Udp(ref path) = self.path { + if self.recent_pong.is_none() { + event!( + target: "events.net.holepunched", + Level::DEBUG, + node = %self.node_id.fmt_short(), + path = ?path, + direction = "outgoing", + ); + } + } + self.recent_pong = Some(r); + } + + #[cfg(test)] + pub(super) fn with_pong_reply(node_id: NodeId, r: PongReply) -> Self { + PathState { + node_id, + path: r.from.clone(), + last_ping: None, + last_got_ping: None, + call_me_maybe_time: None, + recent_pong: Some(r), + last_payload_msg: None, + } + } + + /// Check whether this path is considered active. + /// + /// Active means the path has received payload messages within the last + /// [`SESSION_ACTIVE_TIMEOUT`]. + /// + /// Note that a path might be alive but not active if it's contactable but not in + /// use. + pub(super) fn is_active(&self) -> bool { + self.last_payload_msg + .as_ref() + .map(|instant| instant.elapsed() <= SESSION_ACTIVE_TIMEOUT) + .unwrap_or(false) + } + + /// Returns the instant the last incoming ping was received. + pub(super) fn last_incoming_ping(&self) -> Option<&Instant> { + self.last_got_ping.as_ref().map(|(time, _tx_id)| time) + } + + /// Reports the last instant this path was considered alive. + /// + /// Alive means the path is considered in use by the remote endpoint. Either because we + /// received a payload message, a DISCO message (ping, pong) or it was advertised in a + /// call-me-maybe message. 
+ /// + /// This is the most recent instant between: + /// - when last pong was received. + /// - when this path was last advertised in a received CallMeMaybe message. + /// - When the last payload transmission occurred. + /// - when the last ping from them was received. + pub(super) fn last_alive(&self) -> Option { + self.recent_pong() + .map(|pong| &pong.pong_at) + .into_iter() + .chain(self.last_payload_msg.as_ref()) + .chain(self.call_me_maybe_time.as_ref()) + .chain(self.last_incoming_ping()) + .max() + .copied() + } + + pub(super) fn last_control_msg(&self, now: Instant) -> Option<(Duration, ControlMsg)> { + // get every control message and assign it its kind + let last_pong = self + .recent_pong() + .map(|pong| (pong.pong_at, ControlMsg::Pong)); + let last_call_me_maybe = self + .call_me_maybe_time + .as_ref() + .map(|call_me| (*call_me, ControlMsg::CallMeMaybe)); + let last_ping = self + .last_incoming_ping() + .map(|ping| (*ping, ControlMsg::Ping)); + + last_pong + .into_iter() + .chain(last_call_me_maybe) + .chain(last_ping) + .max_by_key(|(instant, _kind)| *instant) + .map(|(instant, kind)| (now.duration_since(instant), kind)) + } + + /// Returns the most recent pong if available. + pub(super) fn recent_pong(&self) -> Option<&PongReply> { + self.recent_pong.as_ref() + } + + /// Returns the latency from the most recent pong, if available. + pub(super) fn latency(&self) -> Option { + self.recent_pong.as_ref().map(|p| p.latency) + } + + pub(super) fn needs_ping(&self, now: &Instant) -> bool { + match self.last_ping { + None => true, + Some(last_ping) => { + let elapsed = now.duration_since(last_ping); + + // TODO: remove! + // This logs "ping is too new" for each send whenever the endpoint does *not* need + // a ping. Pretty sure this is not a useful log, but maybe there was a reason? + // if !needs_ping { + // debug!("ping is too new: {}ms", elapsed.as_millis()); + // } + elapsed > DISCO_PING_INTERVAL + } + } + } + + pub(super) fn handle_ping(&mut self, tx_id: stun::TransactionId, now: Instant) -> PingRole { + if Some(&tx_id) == self.last_got_ping.as_ref().map(|(_t, tx_id)| tx_id) { + PingRole::Duplicate + } else { + let prev = self.last_got_ping.replace((now, tx_id)); + let heartbeat_deadline = HEARTBEAT_INTERVAL + (HEARTBEAT_INTERVAL / 2); + match prev { + Some((prev_time, _tx)) if now.duration_since(prev_time) <= heartbeat_deadline => { + PingRole::LikelyHeartbeat + } + Some((prev_time, _tx)) => { + debug!( + elapsed = ?now.duration_since(prev_time), + "heartbeat missed, reactivating", + ); + PingRole::Activate + } + None => { + if let SendAddr::Udp(ref addr) = self.path { + event!( + target: "events.net.holepunched", + Level::DEBUG, + node = %self.node_id.fmt_short(), + path = ?addr, + direction = "incoming", + ); + } + PingRole::Activate + } + } + } + } + + pub(super) fn clear(&mut self) { + self.last_ping = None; + self.last_got_ping = None; + self.call_me_maybe_time = None; + self.recent_pong = None; + } + + fn summary(&self, mut w: impl std::fmt::Write) -> std::fmt::Result { + write!(w, "{{ ")?; + if self.is_active() { + write!(w, "active ")?; + } + if let Some(ref pong) = self.recent_pong { + write!(w, "pong-received({:?} ago) ", pong.pong_at.elapsed())?; + } + if let Some(when) = self.last_incoming_ping() { + write!(w, "ping-received({:?} ago) ", when.elapsed())?; + } + if let Some(ref when) = self.last_ping { + write!(w, "ping-sent({:?} ago) ", when.elapsed())?; + } + write!(w, "}}") + } +} + +// TODO: Make an `EndpointPaths` struct and do things nicely. 
+pub(super) fn summarize_node_paths(paths: &BTreeMap) -> String { + use std::fmt::Write; + + let mut w = String::new(); + write!(&mut w, "[").ok(); + for (i, (ipp, state)) in paths.iter().enumerate() { + if i > 0 { + write!(&mut w, ", ").ok(); + } + write!(&mut w, "{ipp}").ok(); + state.summary(&mut w).ok(); + } + write!(&mut w, "]").ok(); + w +} diff --git a/iroh-net/src/magicsock/node_map/udp_paths.rs b/iroh-net/src/magicsock/node_map/udp_paths.rs index 1154bc19c1..345fff9daf 100644 --- a/iroh-net/src/magicsock/node_map/udp_paths.rs +++ b/iroh-net/src/magicsock/node_map/udp_paths.rs @@ -15,7 +15,8 @@ use tracing::warn; use crate::disco::SendAddr; use super::best_addr::{self, BestAddr}; -use super::node_state::{PathState, PongReply}; +use super::node_state::PongReply; +use super::path_state::PathState; use super::IpPort; /// The address on which to send datagrams over UDP. From 1c86dace54e243f9d1a65634bf1bfc385d573236 Mon Sep 17 00:00:00 2001 From: Divma <26765164+divagant-martian@users.noreply.github.com> Date: Fri, 9 Aug 2024 05:35:10 -0500 Subject: [PATCH 35/45] fix(iroh-blobs): do not skip empty partial blobs in migration (#2604) ## Description When a blob found during migration is missing an outboard, according to the `cli_bao_store_migration` test, we should still migrate it. Before this change the blob would be ignored. This makes the test no longer broken. ## Breaking Changes n/a ## Notes & open questions 1. I can reproduce the failure 100% of the time, so before this change **I consider the test broken, not flaky.** 2. I reproduced the failure by running an iroh node with the exact v0 data the test uses. After the change, I did this again, and also ran both `validate` and `consistency-check` and got no errors. ## Change checklist - [x] Self-review. - [ ] ~~Documentation updates following the [style guide](https://rust-lang.github.io/rfcs/1574-more-api-documentation-conventions.html#appendix-a-full-conventions-text), if relevant.~~ - [ ] ~~Tests if relevant.~~ - [ ] ~~All breaking changes documented.~~ --- iroh-blobs/src/store/fs/import_flat_store.rs | 14 +++++++++----- iroh-cli/tests/cli.rs | 1 - 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/iroh-blobs/src/store/fs/import_flat_store.rs b/iroh-blobs/src/store/fs/import_flat_store.rs index 97771423bf..b80cdd2729 100644 --- a/iroh-blobs/src/store/fs/import_flat_store.rs +++ b/iroh-blobs/src/store/fs/import_flat_store.rs @@ -310,11 +310,15 @@ impl ActorState { if let Some(((data_path, data_size), outboard)) = largest_partial { let needs_outboard = data_size >= IROH_BLOCK_SIZE.bytes() as u64; let outboard_path = if needs_outboard { - let Some((outboard_path, _)) = outboard else { - tracing::warn!("missing outboard file for {}", hash.to_hex()); - continue; - }; - Some(outboard_path) + if let Some((outboard_path, _)) = outboard { + Some(outboard_path) + } else { + tracing::warn!( + hash = hash.fmt_short(), + "missing outboard file. 
assuming empty partial" + ); + None + } } else { None }; diff --git a/iroh-cli/tests/cli.rs b/iroh-cli/tests/cli.rs index 4de8df809b..edd1aaac2c 100644 --- a/iroh-cli/tests/cli.rs +++ b/iroh-cli/tests/cli.rs @@ -350,7 +350,6 @@ fn run_cli( } #[test] -#[ignore = "flaky"] fn cli_bao_store_migration() -> anyhow::Result<()> { let dir = testdir!(); let iroh_data_dir = dir.join("iroh_data_dir"); From a9c96a92fe7d2bb0f92f574b9c5b78e6f27316cf Mon Sep 17 00:00:00 2001 From: Kasey Date: Fri, 9 Aug 2024 06:37:32 -0400 Subject: [PATCH 36/45] feat(iroh-net): upgrade to new `swarm-discovery` api (#2605) ## Description The new `swarm-discovery` api allows you to remove and add addresses you want to publish, without needing to restart the swarm-discovery service. There is a chance that this process (restarting the swarm-discovery service) caused flaky-ness in our `test_local_swarm_discovery` test. ## Change checklist - [x] Self-review. --------- Co-authored-by: Kasey Huizinga --- Cargo.lock | 4 +- iroh-net/Cargo.toml | 2 +- .../src/discovery/local_swarm_discovery.rs | 82 +++++++------------ 3 files changed, 34 insertions(+), 54 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 40d0580df4..00f2b29a0f 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -5578,9 +5578,9 @@ dependencies = [ [[package]] name = "swarm-discovery" -version = "0.2.0" +version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a0685d4eda80e2dfee7fc413ba861ef11411ca813d836a77ab8f0d3a00286488" +checksum = "39769914108ae68e261d85ceac7bce7095947130f79c29d4535e9b31fc702a40" dependencies = [ "acto", "anyhow", diff --git a/iroh-net/Cargo.toml b/iroh-net/Cargo.toml index 757b3a7e00..6e0e154dc8 100644 --- a/iroh-net/Cargo.toml +++ b/iroh-net/Cargo.toml @@ -58,7 +58,7 @@ ring = "0.17" rustls = { version = "0.21.11", default-features = false, features = ["dangerous_configuration"] } serde = { version = "1", features = ["derive", "rc"] } smallvec = "1.11.1" -swarm-discovery = { version = "0.2.0", optional = true } +swarm-discovery = { version = "0.2.1", optional = true } socket2 = "0.5.3" stun-rs = "0.1.5" surge-ping = "0.8.0" diff --git a/iroh-net/src/discovery/local_swarm_discovery.rs b/iroh-net/src/discovery/local_swarm_discovery.rs index d721f16729..a42594a6e8 100644 --- a/iroh-net/src/discovery/local_swarm_discovery.rs +++ b/iroh-net/src/discovery/local_swarm_discovery.rs @@ -65,12 +65,12 @@ impl LocalSwarmDiscovery { let (send, recv) = async_channel::bounded(64); let task_sender = send.clone(); let rt = tokio::runtime::Handle::current(); - let mut guard = Some(LocalSwarmDiscovery::spawn_discoverer( + let discovery = LocalSwarmDiscovery::spawn_discoverer( node_id, task_sender.clone(), BTreeSet::new(), &rt, - )?); + )?; let handle = tokio::spawn(async move { let mut node_addrs: HashMap = HashMap::default(); @@ -171,21 +171,12 @@ impl LocalSwarmDiscovery { } Message::ChangeLocalAddrs(addrs) => { trace!(?addrs, "LocalSwarmDiscovery Message::ChangeLocalAddrs"); - let callback_send = task_sender.clone(); - let g = guard.take(); - drop(g); - guard = match LocalSwarmDiscovery::spawn_discoverer( - node_id, - callback_send.clone(), - addrs.direct_addresses, - &rt, - ) { - Ok(guard) => Some(guard), - Err(e) => { - error!("LocalSwarmDiscovery error creating discovery service: {e}"); - return; - } - }; + discovery.remove_all(); + let addrs = + LocalSwarmDiscovery::socketaddrs_to_addrs(addrs.direct_addresses); + for addr in addrs { + discovery.add(addr.0, addr.1) + } } } } @@ -213,38 +204,27 @@ impl 
LocalSwarmDiscovery { .send_blocking(Message::Discovery(node_id.to_string(), peer.clone())) .ok(); }; - let mut addrs: HashMap> = HashMap::default(); - let mut has_ipv4 = false; - let mut has_ipv6 = false; - for socketaddr in socketaddrs { - if !has_ipv6 && socketaddr.is_ipv6() { - has_ipv6 = true; - }; - if !has_ipv4 && socketaddr.is_ipv4() { - has_ipv4 = true; - }; - addrs - .entry(socketaddr.port()) - .and_modify(|a| a.push(socketaddr.ip())) - .or_insert(vec![socketaddr.ip()]); - } - - let ip_class = match (has_ipv4, has_ipv6) { - (true, true) => IpClass::V4AndV6, - (true, false) => IpClass::V4Only, - (false, true) => IpClass::V6Only, - // this case indicates no ip addresses were supplied, in which case, default to ipv4 - (false, false) => IpClass::V4Only, - }; + let addrs = LocalSwarmDiscovery::socketaddrs_to_addrs(socketaddrs); let mut discoverer = Discoverer::new_interactive(N0_LOCAL_SWARM.to_string(), node_id.to_string()) .with_callback(callback) - .with_ip_class(ip_class); + .with_ip_class(IpClass::Auto); for addr in addrs { discoverer = discoverer.with_addrs(addr.0, addr.1); } discoverer.spawn(rt) } + + fn socketaddrs_to_addrs(socketaddrs: BTreeSet) -> HashMap> { + let mut addrs: HashMap> = HashMap::default(); + for socketaddr in socketaddrs { + addrs + .entry(socketaddr.port()) + .and_modify(|a| a.push(socketaddr.ip())) + .or_insert(vec![socketaddr.ip()]); + } + addrs + } } impl From<&Peer> for DiscoveryItem { @@ -298,10 +278,10 @@ mod tests { #[tokio::test] async fn test_local_swarm_discovery() -> TestResult { let _guard = iroh_test::logging::setup(); - let (node_id_a, discovery_a) = make_discoverer()?; - let (_, discovery_b) = make_discoverer()?; + let (_, discovery_a) = make_discoverer()?; + let (node_id_b, discovery_b) = make_discoverer()?; - // make addr info for discoverer a + // make addr info for discoverer b let addr_info = AddrInfo { relay_url: None, direct_addresses: BTreeSet::from(["0.0.0.0:11111".parse()?]), @@ -310,15 +290,15 @@ mod tests { // pass in endpoint, this is never used let ep = crate::endpoint::Builder::default().bind(0).await?; // resolve twice to ensure we can create separate streams for the same node_id - let mut s1 = discovery_b.resolve(ep.clone(), node_id_a).unwrap(); - let mut s2 = discovery_b.resolve(ep, node_id_a).unwrap(); - tracing::debug!(?node_id_a, "Discovering node id a"); - // publish discovery_a's address - discovery_a.publish(&addr_info); - let s1_res = tokio::time::timeout(Duration::from_secs(10), s1.next()) + let mut s1 = discovery_a.resolve(ep.clone(), node_id_b).unwrap(); + let mut s2 = discovery_a.resolve(ep, node_id_b).unwrap(); + tracing::debug!(?node_id_b, "Discovering node id b"); + // publish discovery_b's address + discovery_b.publish(&addr_info); + let s1_res = tokio::time::timeout(Duration::from_secs(5), s1.next()) .await? .unwrap()?; - let s2_res = tokio::time::timeout(Duration::from_secs(10), s2.next()) + let s2_res = tokio::time::timeout(Duration::from_secs(5), s2.next()) .await? .unwrap()?; assert_eq!(s1_res.addr_info, addr_info); From 7494566ef2da183f49b8d8e8418a33bebfb03bb0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Philipp=20Kr=C3=BCger?= Date: Fri, 9 Aug 2024 13:08:00 +0200 Subject: [PATCH 37/45] test(iroh-cli): Replace `cli_provide_one_file_large` with a faster test (#2607) ## Description The `cli_provide_one_file_large` test takes multiple minutes to complete. We talked about removing it altogether in the discord. The reasoning for removing it is that we already exercise big file provide & get loops via netsim. 
So there is no need to run the same code in debug mode taking >3 minutes in CI. However, I ended up not removing the test but instead changing it, because the other tests test really small file sizes, e.g. `1000` bytes. At those sizes, outboards aren't even generated and files are put inline into the database. So instead I worked out the file size required for exercising the whole external-outboard (and external-file) machinery of iroh-blobs. The cutoff point is at ~4.1MB, which is really far away from the 1GB size and should transfer within seconds. ## Breaking Changes None ## Notes & open questions Should we just delete the test altogether? ## Change checklist - [x] Self-review. - [x] Documentation updates following the [style guide](https://rust-lang.github.io/rfcs/1574-more-api-documentation-conventions.html#appendix-a-full-conventions-text), if relevant. - [x] Tests if relevant. - [x] All breaking changes documented. --- iroh-cli/tests/cli.rs | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/iroh-cli/tests/cli.rs b/iroh-cli/tests/cli.rs index edd1aaac2c..325b8a09bf 100644 --- a/iroh-cli/tests/cli.rs +++ b/iroh-cli/tests/cli.rs @@ -50,10 +50,20 @@ fn cli_provide_one_file_basic() -> Result<()> { } #[test] -fn cli_provide_one_file_large() -> Result<()> { +fn cli_provide_one_file_external_outboard() -> Result<()> { let dir = testdir!(); let path = dir.join("foo"); - make_rand_file(1024 * 1024 * 1024, &path)?; + // The cutoff point at which an outboard is stored externally is 16KiB by default. + // Outboards end up approaching ~1/256th the size of the source file. + // So if the source file is 16 KiB * 256, we *almost* have a file big enough that + // causes its outboard to be stored externally. + // We add a bit of margin, just to be safe. + let outboard_size_to_file_size = 256; + let safety_margin = 20; + let file_size = iroh::blobs::store::fs::InlineOptions::default().max_outboard_inlined + * (outboard_size_to_file_size + safety_margin); + // At current defaults, `file_size` ends up being ~4.5MB + make_rand_file(file_size as usize, &path)?; // provide a path to a file, do not pipe from stdin, do not pipe to stdout test_provide_get_loop(Input::Path(path), Output::Path) } From 5eee643e8b52b40c7a48e41de2f9867403b30d79 Mon Sep 17 00:00:00 2001 From: Floris Bruynooghe Date: Fri, 9 Aug 2024 13:18:20 +0200 Subject: [PATCH 38/45] fix(tests): For DNS discovery only use a local DNS server (#2598) ## Description Because the Node builder by default configures DNS discovery the tests all do a lookup. This went to the staging DNS server, but even so, we hit 429 Too Many Requests on this sometimes. Instead this configures a domain under `.test.`. This domain is reserved for tests and any DNS server should return an emtpy response without going through upstream DNS servers - unless the records were directly added to the DNS server receiving the query in which case it will respond. So now all our default DNS discovery just doesn't discover anything, which is fine as all tests have other mechanisms to find the nodes. The tests that do use DNS discovery run their own local DNS server for this, which is also switched to `.test.`, because that's the right domain to use for this. ## Breaking Changes When using `iroh::node::Node` in tests, when `cfg(test)` is enabled, the default node discovery will no longer work. You must explicitly configure the node discovery, options to use a test-local node discovery or a staging node discovery are described in the API docs. 
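For example, a test that still needs working discovery could opt back in along these lines. This is a sketch, not code in this diff: `DnsPkarrServer::run` and `DiscoveryConfig::Custom` do appear in this PR, but the `discovery(..)` helper used to build the custom mechanism and the exact builder calls are assumptions.

```rust
// Sketch only. Assumes `DnsPkarrServer` exposes a `discovery(secret_key)`
// helper returning a boxed `Discovery`; this PR does not show that part
// of the API.
use iroh_net::{key::SecretKey, test_utils::DnsPkarrServer};

let dns_pkarr_server = DnsPkarrServer::run().await?;
let secret_key = SecretKey::generate();
let node = iroh::node::Node::memory()
    .secret_key(secret_key.clone())
    // Wire the test-local DNS/pkarr server in as a custom discovery mechanism.
    .node_discovery(iroh::node::DiscoveryConfig::Custom(
        dns_pkarr_server.discovery(secret_key),
    ))
    .spawn()
    .await?;
```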
## Notes & open questions I feel like we should also try this with the relay server. That will probably be a bit harder and it is not an issue for me right now. I actually need this for the quinn11 branch, but the fix is so generic that it makes more sense to do on main and merge it back. ## Change checklist - [x] Self-review. - [x] Documentation updates following the [style guide](https://rust-lang.github.io/rfcs/1574-more-api-documentation-conventions.html#appendix-a-full-conventions-text), if relevant. - [x] Tests if relevant. - [x] All breaking changes documented. --- iroh-net/src/discovery/dns.rs | 27 ++++++++++++-- iroh-net/src/discovery/pkarr.rs | 13 +++++-- iroh-net/src/relay/map.rs | 6 ++- iroh-net/src/test_utils.rs | 2 +- iroh/src/node/builder.rs | 65 ++++++++++++++++++++++++++++----- 5 files changed, 94 insertions(+), 19 deletions(-) diff --git a/iroh-net/src/discovery/dns.rs b/iroh-net/src/discovery/dns.rs index 0490a7a826..5d88f4ca8b 100644 --- a/iroh-net/src/discovery/dns.rs +++ b/iroh-net/src/discovery/dns.rs @@ -13,6 +13,10 @@ use crate::{ pub const N0_DNS_NODE_ORIGIN_PROD: &str = "dns.iroh.link"; /// The n0 testing DNS node origin, for testing. pub const N0_DNS_NODE_ORIGIN_STAGING: &str = "staging-dns.iroh.link"; +/// Testing DNS node origin, must run server from [`crate::test_utils::DnsPkarrServer`]. +#[cfg(any(test, feature = "test-utils"))] +pub const TEST_DNS_NODE_ORIGIN: &str = "dns.iroh.test"; + const DNS_STAGGERING_MS: &[u64] = &[200, 300]; /// DNS node discovery @@ -41,13 +45,28 @@ pub struct DnsDiscovery { } impl DnsDiscovery { - /// Create a new DNS discovery. + /// Creates a new DNS discovery. pub fn new(origin_domain: String) -> Self { Self { origin_domain } } - /// Create a new DNS discovery which uses the [`N0_DNS_NODE_ORIGIN_PROD`] origin domain and in testing - /// uses [`N0_DNS_NODE_ORIGIN_STAGING`]. + /// Creates a new DNS discovery using the `iroh.link` domain. + /// + /// This uses the [`N0_DNS_NODE_ORIGIN_PROD`] domain. + /// + /// # Usage during tests + /// + /// When `cfg(test)` is enabled or when using the `test-utils` cargo feature the + /// [`TEST_DNS_NODE_ORIGIN`] is used. + /// + /// Note that the `iroh.test` domain is not integrated with the global DNS network and + /// thus node discovery is effectively disabled. To use node discovery in a test use + /// the [`crate::test_utils::DnsPkarrServer`] in the test and configure it as a + /// custom discovery mechanism. + /// + /// For testing it is also possible to use the [`N0_DNS_NODE_ORIGIN_STAGING`] domain + /// with [`DnsDiscovery::new`]. This would then use a hosted discovery service again, + /// but for testing purposes. 
pub fn n0_dns() -> Self { #[cfg(not(any(test, feature = "test-utils")))] { @@ -55,7 +74,7 @@ impl DnsDiscovery { } #[cfg(any(test, feature = "test-utils"))] { - Self::new(N0_DNS_NODE_ORIGIN_STAGING.to_string()) + Self::new(TEST_DNS_NODE_ORIGIN.to_string()) } } } diff --git a/iroh-net/src/discovery/pkarr.rs b/iroh-net/src/discovery/pkarr.rs index 2a6cc69d5f..c7008f241a 100644 --- a/iroh-net/src/discovery/pkarr.rs +++ b/iroh-net/src/discovery/pkarr.rs @@ -153,12 +153,17 @@ impl PublisherService { loop { if let Some(info) = self.watcher.get() { if let Err(err) = self.publish_current(info).await { - warn!(?err, url = %self.pkarr_client.pkarr_relay_url , "Failed to publish to pkarr"); failed_attempts += 1; // Retry after increasing timeout - republish - .as_mut() - .reset(Instant::now() + Duration::from_secs(failed_attempts)); + let retry_after = Duration::from_secs(failed_attempts); + republish.as_mut().reset(Instant::now() + retry_after); + warn!( + err = %format!("{err:#}"), + url = %self.pkarr_client.pkarr_relay_url , + ?retry_after, + %failed_attempts, + "Failed to publish to pkarr", + ); } else { failed_attempts = 0; // Republish after fixed interval diff --git a/iroh-net/src/relay/map.rs b/iroh-net/src/relay/map.rs index d6313892cd..8c1a2fd24f 100644 --- a/iroh-net/src/relay/map.rs +++ b/iroh-net/src/relay/map.rs @@ -9,12 +9,16 @@ use crate::defaults::DEFAULT_STUN_PORT; use super::RelayUrl; -/// Configuration options for the relay servers of the magic endpoint. +/// Configuration of the relay servers for an [`Endpoint`]. +/// +/// [`Endpoint`]: crate::endpoint::Endpoint #[derive(Debug, Clone, PartialEq, Eq)] pub enum RelayMode { /// Disable relay servers completely. Disabled, /// Use the default relay map, with production relay servers from n0. + /// + /// See [`crate::defaults::prod`] for the severs used. Default, /// Use the staging relay servers from n0. Staging, diff --git a/iroh-net/src/test_utils.rs b/iroh-net/src/test_utils.rs index a1fad1dead..6a8b411daa 100644 --- a/iroh-net/src/test_utils.rs +++ b/iroh-net/src/test_utils.rs @@ -101,7 +101,7 @@ pub(crate) mod dns_and_pkarr_servers { impl DnsPkarrServer { /// Run DNS and Pkarr servers on localhost. pub async fn run() -> anyhow::Result { - Self::run_with_origin("dns.example".to_string()).await + Self::run_with_origin("dns.iroh.test".to_string()).await } /// Run DNS and Pkarr servers on localhost with the specified `node_origin` domain. diff --git a/iroh/src/node/builder.rs b/iroh/src/node/builder.rs index 3ea3330fd8..52b97b1b93 100644 --- a/iroh/src/node/builder.rs +++ b/iroh/src/node/builder.rs @@ -73,12 +73,23 @@ pub enum DocsStorage { /// Blob store implementations are available in [`iroh_blobs::store`]. /// Document store implementations are available in [`iroh_docs::store`]. /// -/// Everything else is optional. +/// Everything else is optional, with some sensible defaults. +/// +/// The default **relay servers** are hosted by [number 0] on the `iroh.network` domain. To +/// customise this use the [`Builder::relay_mode`] function. +/// +/// For **node discovery** the default is to use the [number 0] hosted DNS server hosted on +/// `iroh.link`. To customise this use the [`Builder::node_discovery`] function. +/// +/// Note that some defaults change when running using `cfg(test)`, see the individual +/// methods for details. /// /// Finally you can create and run the node by calling [`Builder::spawn`]. /// /// The returned [`Node`] is awaitable to know when it finishes. It can be terminated /// using [`Node::shutdown`]. 
+/// +/// [number 0]: https://n0.computer #[derive(derive_more::Debug)] pub struct Builder where @@ -125,13 +136,37 @@ impl StorageConfig { } /// Configuration for node discovery. +/// +/// Node discovery enables connecting to other peers by only the [`NodeId`]. This usually +/// works by the nodes publishing their [`RelayUrl`] and/or their direct addresses to some +/// publicly available service. +/// +/// [`NodeId`]: crate::base::key::NodeId +/// [`RelayUrl`]: crate::base::node_addr::RelayUrl #[derive(Debug, Default)] pub enum DiscoveryConfig { /// Use no node discovery mechanism. None, /// Use the default discovery mechanism. /// - /// This enables the [`DnsDiscovery`] service. + /// This uses two discovery services concurrently: + /// + /// - It publishes to a pkarr service operated by [number 0] which makes the information + /// available via DNS in the `iroh.link` domain. + /// + /// - It uses an mDNS-like system to announce itself on the local network. + /// + /// # Usage during tests + /// + /// Note that the default changes when compiling with `cfg(test)` or the `test-utils` + /// cargo feature from [iroh-net] is enabled. In this case only the Pkarr/DNS service + /// is used, but on the `iroh.test` domain. This domain is not integrated with the + /// global DNS network and thus node discovery is effectively disabled. To use node + /// discovery in a test use the [`iroh_net::test_utils::DnsPkarrServer`] in the test and + /// configure it here as a custom discovery mechanism ([`DiscoveryConfig::Custom`]). + /// + /// [number 0]: https://n0.computer + /// [iroh-net]: crate::net #[default] Default, /// Use a custom discovery mechanism. @@ -354,18 +389,30 @@ where /// establish connections between peers by being an initial relay for traffic while /// assisting in holepunching to establish a direct connection between peers. /// - /// When using [RelayMode::Custom], the provided `relay_map` must contain at least one - /// configured relay node. If an invalid [`iroh_net::relay::RelayMode`] - /// is provided [`Self::spawn`] will result in an error. - pub fn relay_mode(mut self, dm: RelayMode) -> Self { - self.relay_mode = dm; + /// When using [`RelayMode::Custom`], the provided `relay_map` must contain at least one + /// configured relay node. If an invalid [`iroh_net::relay::RelayMode`] is provided + /// [`Self::spawn`] will result in an error. + /// + /// # Usage during tests + /// + /// Note that while the default is [`RelayMode::Default`], when using `cfg(test)` or + /// when the `test-utils` cargo feature [`RelayMode::Staging`] is the default. + pub fn relay_mode(mut self, relay_mode: RelayMode) -> Self { + self.relay_mode = relay_mode; self } /// Sets the node discovery mechanism. /// - /// The default is [`DiscoveryConfig::Default`]. Use [`DiscoveryConfig::Custom`] to pass a - /// custom [`Discovery`]. + /// Node discovery enables connecting to other peers by only the [`NodeId`]. This + /// usually works by the nodes publishing their [`RelayUrl`] and/or their direct + /// addresses to some publicly available service. + /// + /// See [`DiscoveryConfig::default`] for the defaults, note that the defaults change + /// when using `cfg(test)`. 
+ /// + /// [`NodeId`]: crate::base::key::NodeId + /// [`RelayUrl`]: crate::base::node_addr::RelayUrl pub fn node_discovery(mut self, config: DiscoveryConfig) -> Self { self.node_discovery = config; self From 3b7881cccbd0b8fe09317695bd4c4808608cb149 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Philipp=20Kr=C3=BCger?= Date: Fri, 9 Aug 2024 14:35:23 +0200 Subject: [PATCH 39/45] docs: Also list `iroh-gossip` as a re-export (#2606) ## Description Just a missing doc change that I noticed when trying to answer a Q on discord. ## Breaking Changes None ## Change checklist - [x] Self-review. - [x] Documentation updates following the [style guide](https://rust-lang.github.io/rfcs/1574-more-api-documentation-conventions.html#appendix-a-full-conventions-text), if relevant. - ~~[ ] Tests if relevant.~~ - [x] All breaking changes documented. --- iroh/src/lib.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/iroh/src/lib.rs b/iroh/src/lib.rs index 3e0a5c3ad6..59b5ce215c 100644 --- a/iroh/src/lib.rs +++ b/iroh/src/lib.rs @@ -80,6 +80,7 @@ //! - [iroh_base] as [`base`] //! - [iroh_blobs] as [`blobs`] //! - [iroh_docs] as [`docs`] +//! - [iroh_gossip] as [`gossip`] //! - [iroh_net] as [`net`] //! //! ## Feature Flags From a2d2ec69e327da09b32e0e90a148d371e37d4f3a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Philipp=20Kr=C3=BCger?= Date: Fri, 9 Aug 2024 14:36:01 +0200 Subject: [PATCH 40/45] test(iroh): Reduce entry amount in `sync_gossip_bulk` (#2608) ## Description The test currently runs 55s on our windows CI machines, it runs 40s when I run it on my laptop locally. If anyone is curious: In release mode, the test runs 0.4s. :upside_down_face: The test basically sync 1000 entries twice. The number is configurable and the default was arbitrarily chosen according to @Frando, so I've reduced it by a factor of 10. With `n_entries = 100` the test runs ~4.5s for me, much more reasonable. ## Breaking Changes None ## Notes & open questions ## Change checklist - [x] Self-review. - ~~[ ] Documentation updates following the [style guide](https://rust-lang.github.io/rfcs/1574-more-api-documentation-conventions.html#appendix-a-full-conventions-text), if relevant.~~ - [x] Tests if relevant. - [x] All breaking changes documented. --- iroh/tests/sync.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/iroh/tests/sync.rs b/iroh/tests/sync.rs index cf94c4926a..026d14c6fc 100644 --- a/iroh/tests/sync.rs +++ b/iroh/tests/sync.rs @@ -156,7 +156,7 @@ async fn sync_subscribe_no_sync() -> Result<()> { async fn sync_gossip_bulk() -> Result<()> { let n_entries: usize = std::env::var("N_ENTRIES") .map(|x| x.parse().expect("N_ENTRIES must be a number")) - .unwrap_or(1000); + .unwrap_or(100); let mut rng = test_rng(b"sync_gossip_bulk"); setup_logging(); From 5d98a5cb8194be58aff995a6aa463c36571d5399 Mon Sep 17 00:00:00 2001 From: Franz Heinzmann Date: Mon, 12 Aug 2024 13:14:28 +0200 Subject: [PATCH 41/45] fix(iroh-gossip): clarify docs and semantics of gossip joined event (#2597) ## Description Improves documentation around the `GossipEvent::Joined` event: It is only emitted once at the beginning of the stream, and the event will not be emitted when awaiting `GossipReceiver::joined`. Also makes sure that the event is actually only emitted once per intent (it potentially could have been emitted multiple times before if the neighbor count first got down to 0 and then up again for `GossipTopics` subscribing inbetween). 
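To illustrate the clarified semantics, here is a minimal consumer sketch. It is not code from this PR: it assumes a `GossipReceiver` obtained from an earlier subscribe call, and the import paths and the `neighbors()` signature are assumptions.

```rust
// Sketch of the intended consumer pattern, under the assumptions above.
use anyhow::Result;
use futures_lite::StreamExt;
use iroh_gossip::net::{Event, GossipEvent, GossipReceiver};

async fn run(mut receiver: GossipReceiver) -> Result<()> {
    // Consumes the single initial `GossipEvent::Joined` item from the stream.
    receiver.joined().await?;
    // The initial neighbors remain queryable after `joined()` returns.
    println!("joined, {} neighbors", receiver.neighbors().count());
    while let Some(event) = receiver.next().await {
        match event? {
            Event::Gossip(GossipEvent::Received(msg)) => println!("message: {msg:?}"),
            Event::Gossip(GossipEvent::NeighborUp(n)) => println!("neighbor up: {n}"),
            Event::Gossip(GossipEvent::NeighborDown(n)) => println!("neighbor down: {n}"),
            // Emitted at most once, and already consumed by `joined()` above.
            Event::Gossip(GossipEvent::Joined(_)) => unreachable!("Joined is only emitted once"),
            Event::Lagged => println!("lagged: some events were missed"),
        }
    }
    Ok(())
}
```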
## Breaking Changes ## Notes & open questions Inspired by the discussion in https://github.com/deltachat/deltachat-core-rust/pull/5860 ## Change checklist - [x] Self-review. - [x] Documentation updates following the [style guide](https://rust-lang.github.io/rfcs/1574-more-api-documentation-conventions.html#appendix-a-full-conventions-text), if relevant. - [ ] Tests if relevant. - [x] All breaking changes documented. Co-authored-by: Friedel Ziegelmayer --- iroh-gossip/src/net.rs | 9 ++++--- iroh-gossip/src/net/handles.rs | 44 +++++++++++++++++++++++++--------- 2 files changed, 39 insertions(+), 14 deletions(-) diff --git a/iroh-gossip/src/net.rs b/iroh-gossip/src/net.rs index f813927d47..1ecb3fcf39 100644 --- a/iroh-gossip/src/net.rs +++ b/iroh-gossip/src/net.rs @@ -501,8 +501,9 @@ impl Actor { neighbors, event_senders, command_rx_keys, + joined, } = state; - if !neighbors.is_empty() { + if *joined { let neighbors = neighbors.iter().copied().collect(); channels .event_tx @@ -588,14 +589,15 @@ impl Actor { continue; }; let TopicState { + joined, neighbors, event_senders, command_rx_keys, } = state; let event = if let ProtoEvent::NeighborUp(neighbor) = event { - let was_empty = neighbors.is_empty(); neighbors.insert(neighbor); - if was_empty { + if !*joined { + *joined = true; GossipEvent::Joined(vec![neighbor]) } else { GossipEvent::NeighborUp(neighbor) @@ -662,6 +664,7 @@ impl Default for PeerState { #[derive(Debug, Default)] struct TopicState { + joined: bool, neighbors: BTreeSet, event_senders: EventSenders, command_rx_keys: HashSet, diff --git a/iroh-gossip/src/net/handles.rs b/iroh-gossip/src/net/handles.rs index c082192224..c944805afa 100644 --- a/iroh-gossip/src/net/handles.rs +++ b/iroh-gossip/src/net/handles.rs @@ -8,7 +8,7 @@ use std::{ task::{Context, Poll}, }; -use anyhow::{anyhow, Result}; +use anyhow::{anyhow, Context as _, Result}; use bytes::Bytes; use futures_lite::{Stream, StreamExt}; use iroh_net::NodeId; @@ -112,6 +112,7 @@ pub struct GossipReceiver { #[debug("EventStream")] stream: EventStream, neighbors: HashSet, + joined: bool, } impl GossipReceiver { @@ -119,6 +120,7 @@ impl GossipReceiver { Self { stream: events_rx, neighbors: Default::default(), + joined: false, } } @@ -128,9 +130,22 @@ impl GossipReceiver { } /// Waits until we are connected to at least one node. + /// + /// This progresses the stream until we received [`GossipEvent::Joined`], which is the first + /// item emitted on the stream. + /// + /// Note that this consumes the [`GossipEvent::Joined`] event. If you want to act on these + /// initial neighbors, use [`Self::neighbors`] after awaiting [`Self::joined`]. pub async fn joined(&mut self) -> Result<()> { - while self.neighbors.is_empty() { - let _ = self.try_next().await?; + if !self.joined { + match self + .try_next() + .await? + .context("Gossip receiver closed before Joined event was received.")? + { + Event::Gossip(GossipEvent::Joined(_)) => {} + _ => anyhow::bail!("Expected Joined event to be the first event received."), + } } Ok(()) } @@ -148,6 +163,7 @@ impl Stream for GossipReceiver { if let Some(Ok(item)) = &item { match item { Event::Gossip(GossipEvent::Joined(neighbors)) => { + self.joined = true; self.neighbors.extend(neighbors.iter().copied()); } Event::Gossip(GossipEvent::NeighborUp(node_id)) => { @@ -163,26 +179,32 @@ impl Stream for GossipReceiver { } } -/// Update from a subscribed gossip topic. +/// Events emitted from a gossip topic with a lagging notification. +/// +/// This is the item of the [`GossipReceiver`] stream. 
It wraps the actual gossip events to also +/// provide a notification if we missed gossip events for the topic. #[derive(Serialize, Deserialize, Debug, Eq, PartialEq)] pub enum Event { - /// A message was received. + /// We received an event. Gossip(GossipEvent), - /// We missed some messages. + /// We missed some messages because our [`GossipReceiver`] was not progressing fast enough. Lagged, } -/// Gossip event -/// An event to be emitted to the application for a particular topic. +/// Events emitted from a gossip topic. +/// +/// These are the events emitted from a [`GossipReceiver`], wrapped in [`Event::Gossip`]. #[derive(Clone, PartialEq, Eq, PartialOrd, Ord, Debug, Serialize, Deserialize)] pub enum GossipEvent { /// We joined the topic with at least one peer. + /// + /// This is the first event on a [`GossipReceiver`] and will only be emitted once. Joined(Vec), - /// We have a new, direct neighbor in the swarm membership layer for this topic + /// We have a new, direct neighbor in the swarm membership layer for this topic. NeighborUp(NodeId), - /// We dropped direct neighbor in the swarm membership layer for this topic + /// We dropped direct neighbor in the swarm membership layer for this topic. NeighborDown(NodeId), - /// A gossip message was received for this topic + /// We received a gossip message for this topic. Received(Message), } From ceb94dab985400958da8f9902c6bde4ef5ccdc7c Mon Sep 17 00:00:00 2001 From: Divma <26765164+divagant-martian@users.noreply.github.com> Date: Mon, 12 Aug 2024 08:41:20 -0500 Subject: [PATCH 42/45] docs(iroh-cli): fix help text for incomplete blobs (#2615) ## Description Aligns the help text for `blobs list incomplete-blobs` with the docs in iroh.computer. ## Breaking Changes n/a ## Notes & open questions n/a ## Change checklist - [x] Self-review. - [x] Documentation updates following the [style guide](https://rust-lang.github.io/rfcs/1574-more-api-documentation-conventions.html#appendix-a-full-conventions-text), if relevant. - [ ] ~~Tests if relevant.~~ - [ ] ~~All breaking changes documented.~~ --- iroh-cli/src/commands/blobs.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/iroh-cli/src/commands/blobs.rs b/iroh-cli/src/commands/blobs.rs index 6e4c0aeba2..f1ed55500f 100644 --- a/iroh-cli/src/commands/blobs.rs +++ b/iroh-cli/src/commands/blobs.rs @@ -436,7 +436,7 @@ pub struct BlobAddOptions { pub enum ListCommands { /// List the available blobs on the running provider. Blobs, - /// List the available blobs on the running provider. + /// List the blobs on the running provider that are not full files. IncompleteBlobs, /// List the available collections on the running provider. Collections, From bcc87a24c722362358c68251749b52eeaca31b53 Mon Sep 17 00:00:00 2001 From: Friedel Ziegelmayer Date: Tue, 13 Aug 2024 16:19:23 +0200 Subject: [PATCH 43/45] feat: allow custom blob providing event handling (#2583) ## Description This adds the ability back to track blob provide events from an iroh node. ## Breaking Changes - Modifies `Event` enum: - removes CustomGetRequestReceived - adds TransferProgress - send_blob now takes an EventSender - trait `EventSender` has been renamed `CustomEventSender` - the concrete (boxed) event sender is now called just `EventSender` ## Notes & open questions Notes: I added progress events that get sent every single 16 KiB chunk. These events contain the usual data (connection id and request id) plus offset and hash. Adding the hash makes the event larger, but makes it very much self-contained. 
The event is fired after the read, but that does not mean that the data has been successfully transferred. Should be good enough to show progress anyway. These will be a lot of events, so they are sent using try_send in order not to slow down the transfer. That means that you might miss some events, but since they only contain the end_offset that should be fine. Open questions: I called the trait CustomEventSender and the concrete handle for an optional CustomEventSender just `EventSender`. I can revert this if somebody can think of a better name for the trait or the handle... ## Change checklist - [x] Self-review. - [x] Documentation updates following the [style guide](https://rust-lang.github.io/rfcs/1574-more-api-documentation-conventions.html#appendix-a-full-conventions-text), if relevant. - [x] Tests if relevant. - [x] All breaking changes documented. --------- Co-authored-by: Ruediger Klaehn --- iroh-blobs/examples/provide-bytes.rs | 13 +- iroh-blobs/src/provider.rs | 170 +++++++++++++++++++++------ iroh/src/client/blobs.rs | 91 +++++++++++++- iroh/src/node/builder.rs | 25 +++- iroh/src/node/protocol.rs | 18 +-- 5 files changed, 253 insertions(+), 64 deletions(-) diff --git a/iroh-blobs/examples/provide-bytes.rs b/iroh-blobs/examples/provide-bytes.rs index eab9fddf5a..60211563f8 100644 --- a/iroh-blobs/examples/provide-bytes.rs +++ b/iroh-blobs/examples/provide-bytes.rs @@ -100,7 +100,7 @@ async fn main() -> Result<()> { return; } }; - iroh_blobs::provider::handle_connection(conn, db, MockEventSender, lp).await + iroh_blobs::provider::handle_connection(conn, db, Default::default(), lp).await }); } }); @@ -114,14 +114,3 @@ async fn main() -> Result<()> { Err(e) => Err(anyhow::anyhow!("unable to listen for ctrl-c: {e}")), } } - -#[derive(Clone)] -struct MockEventSender; - -use futures_lite::future::FutureExt; - -impl iroh_blobs::provider::EventSender for MockEventSender { - fn send(&self, _event: iroh_blobs::provider::Event) -> futures_lite::future::Boxed<()> { - async move {}.boxed() - } -} diff --git a/iroh-blobs/src/provider.rs b/iroh-blobs/src/provider.rs index 54b2515158..f4d185ffdb 100644 --- a/iroh-blobs/src/provider.rs +++ b/iroh-blobs/src/provider.rs @@ -1,5 +1,6 @@ //! The server side API use std::fmt::Debug; +use std::sync::Arc; use std::time::Duration; use anyhow::{Context, Result}; @@ -49,23 +50,28 @@ pub enum Event { /// The hash for which the client wants to receive data. hash: Hash, }, - /// A request was received from a client. - CustomGetRequestReceived { + /// A sequence of hashes has been found and is being transferred. + TransferHashSeqStarted { /// An unique connection id. connection_id: u64, /// An identifier uniquely identifying this transfer request. request_id: u64, - /// The size of the custom get request. - len: usize, + /// The number of blobs in the sequence. + num_blobs: u64, }, - /// A sequence of hashes has been found and is being transferred. - TransferHashSeqStarted { + /// A chunk of a blob was transferred. + /// + /// These events will be sent with try_send, so you can not assume that you + /// will receive all of them. + TransferProgress { /// An unique connection id. connection_id: u64, /// An identifier uniquely identifying this transfer request. request_id: u64, - /// The number of blobs in the sequence. - num_blobs: u64, + /// The hash for which we are transferring data. + hash: Hash, + /// Offset up to which we have transferred data. + end_offset: u64, }, /// A blob in a sequence was transferred. 
TransferBlobCompleted { @@ -179,18 +185,21 @@ pub async fn read_request(mut reader: RecvStream) -> Result { /// close the writer, and return with `Ok(SentStatus::NotFound)`. /// /// If the transfer does _not_ end in error, the buffer will be empty and the writer is gracefully closed. -pub async fn transfer_collection( +pub async fn transfer_collection( request: GetRequest, // Store from which to fetch blobs. db: &D, // Response writer, containing the quinn stream. - writer: &mut ResponseWriter, + writer: &mut ResponseWriter, // the collection to transfer mut outboard: impl Outboard, mut data: impl AsyncSliceReader, stats: &mut TransferStats, ) -> Result { let hash = request.hash; + let events = writer.events.clone(); + let request_id = writer.request_id(); + let connection_id = writer.connection_id(); // if the request is just for the root, we don't need to deserialize the collection let just_root = matches!(request.ranges.as_single(), Some((0, _))); @@ -199,7 +208,7 @@ pub async fn transfer_collection( let (stream, num_blobs) = parse_hash_seq(&mut data).await?; writer .events - .send(Event::TransferHashSeqStarted { + .send(|| Event::TransferHashSeqStarted { connection_id: writer.connection_id(), request_id: writer.request_id(), num_blobs, @@ -210,6 +219,13 @@ pub async fn transfer_collection( None }; + let mk_progress = |end_offset| Event::TransferProgress { + connection_id, + request_id, + hash, + end_offset, + }; + let mut prev = 0; for (offset, ranges) in request.ranges.iter_non_empty() { // create a tracking writer so we can get some stats for writing @@ -218,11 +234,13 @@ pub async fn transfer_collection( debug!("writing ranges '{:?}' of sequence {}", ranges, hash); // wrap the data reader in a tracking reader so we can get some stats for reading let mut tracking_reader = TrackingSliceReader::new(&mut data); + let mut sending_reader = + SendingSliceReader::new(&mut tracking_reader, &events, mk_progress); // send the root tw.write(outboard.tree().size().to_le_bytes().as_slice()) .await?; encode_ranges_validated( - &mut tracking_reader, + &mut sending_reader, &mut outboard, &ranges.to_chunk_ranges(), &mut tw, @@ -243,7 +261,8 @@ pub async fn transfer_collection( } if let Some(hash) = c.next().await? { tokio::task::yield_now().await; - let (status, size, blob_read_stats) = send_blob(db, hash, ranges, &mut tw).await?; + let (status, size, blob_read_stats) = + send_blob(db, hash, ranges, &mut tw, events.clone(), mk_progress).await?; stats.send += tw.stats(); stats.read += blob_read_stats; if SentStatus::NotFound == status { @@ -253,7 +272,7 @@ pub async fn transfer_collection( writer .events - .send(Event::TransferBlobCompleted { + .send(|| Event::TransferBlobCompleted { connection_id: writer.connection_id(), request_id: writer.request_id(), hash, @@ -273,17 +292,98 @@ pub async fn transfer_collection( Ok(SentStatus::Sent) } -/// Trait for sending events. -pub trait EventSender: Clone + Sync + Send + 'static { - /// Send an event. 
+struct SendingSliceReader<'a, R, F> { + inner: R, + sender: &'a EventSender, + make_event: F, +} + +impl<'a, R: AsyncSliceReader, F: Fn(u64) -> Event> SendingSliceReader<'a, R, F> { + fn new(inner: R, sender: &'a EventSender, make_event: F) -> Self { + Self { + inner, + sender, + make_event, + } + } +} + +impl<'a, R: AsyncSliceReader, F: Fn(u64) -> Event> AsyncSliceReader + for SendingSliceReader<'a, R, F> +{ + async fn read_at(&mut self, offset: u64, len: usize) -> std::io::Result { + let res = self.inner.read_at(offset, len).await; + if let Ok(res) = res.as_ref() { + let end_offset = offset + res.len() as u64; + self.sender.try_send(|| (self.make_event)(end_offset)); + } + res + } + + async fn size(&mut self) -> std::io::Result { + self.inner.size().await + } +} + +/// Trait for sending blob events. +pub trait CustomEventSender: std::fmt::Debug + Sync + Send + 'static { + /// Send an event and wait for it to be sent. fn send(&self, event: Event) -> BoxFuture<()>; + + /// Try to send an event. + fn try_send(&self, event: Event); +} + +/// A possibly disabled sender for events. +#[derive(Debug, Clone, Default)] +pub struct EventSender { + inner: Option>, +} + +impl From for EventSender { + fn from(inner: T) -> Self { + Self { + inner: Some(Arc::new(inner)), + } + } +} + +impl EventSender { + /// Create a new event sender. + pub fn new(inner: Option>) -> Self { + Self { inner } + } + + /// Send an event. + /// + /// If the inner sender is not set, the function to produce the event will + /// not be called. So any cost associated with gathering information for the + /// event will not be incurred. + pub async fn send(&self, event: impl FnOnce() -> Event) { + if let Some(inner) = &self.inner { + let event = event(); + inner.as_ref().send(event).await; + } + } + + /// Try to send an event. + /// + /// This will just drop the event if it can not be sent immediately. So it + /// is only appropriate for events that are not critical, such as + /// self-contained progress updates. + pub fn try_send(&self, event: impl FnOnce() -> Event) { + if let Some(inner) = &self.inner { + let event = event(); + inner.as_ref().try_send(event); + } + } } /// Handle a single connection. -pub async fn handle_connection( +pub async fn handle_connection( connection: endpoint::Connection, db: D, - events: E, + events: EventSender, rt: LocalPoolHandle, ) { let remote_addr = connection.remote_address(); @@ -300,7 +400,9 @@ pub async fn handle_connection( events: events.clone(), inner: writer, }; - events.send(Event::ClientConnected { connection_id }).await; + events + .send(|| Event::ClientConnected { connection_id }) + .await; let db = db.clone(); rt.spawn_detached(|| { async move { @@ -316,11 +418,7 @@ pub async fn handle_connection( .await } -async fn handle_stream( - db: D, - reader: RecvStream, - writer: ResponseWriter, -) -> Result<()> { +async fn handle_stream(db: D, reader: RecvStream, writer: ResponseWriter) -> Result<()> { // 1. Decode the request. debug!("reading request"); let request = match read_request(reader).await { @@ -337,16 +435,16 @@ async fn handle_stream( } /// Handle a single standard get request. 
@@ -337,16 +435,16 @@
 }
 
 /// Handle a single standard get request.
-pub async fn handle_get<D: Map, E: EventSender>(
+pub async fn handle_get<D: Map>(
     db: D,
     request: GetRequest,
-    mut writer: ResponseWriter<E>,
+    mut writer: ResponseWriter,
 ) -> Result<()> {
     let hash = request.hash;
     debug!(%hash, "received request");
     writer
         .events
-        .send(Event::GetRequestReceived {
+        .send(|| Event::GetRequestReceived {
             hash,
             connection_id: writer.connection_id(),
             request_id: writer.request_id(),
@@ -397,13 +495,13 @@ pub async fn handle_get<D: Map, E: EventSender>(
 
 /// A helper struct that combines a quinn::SendStream with auxiliary information
 #[derive(Debug)]
-pub struct ResponseWriter<E> {
+pub struct ResponseWriter {
     inner: SendStream,
-    events: E,
+    events: EventSender,
     connection_id: u64,
 }
 
-impl<E: EventSender> ResponseWriter<E> {
+impl ResponseWriter {
     fn tracking_writer(&mut self) -> TrackingStreamWriter<TokioStreamWriter<&mut SendStream>> {
         TrackingStreamWriter::new(TokioStreamWriter(&mut self.inner))
     }
@@ -449,7 +547,7 @@ impl<E: EventSender> ResponseWriter<E> {
         info!("transfer completed for {}", hash);
         Self::print_stats(&stats);
         self.events
-            .send(Event::TransferCompleted {
+            .send(move || Event::TransferCompleted {
                 connection_id: self.connection_id(),
                 request_id: self.request_id(),
                 stats,
@@ -462,7 +560,7 @@ impl<E: EventSender> ResponseWriter<E> {
             Self::print_stats(stats);
         };
         self.events
-            .send(Event::TransferAborted {
+            .send(move || Event::TransferAborted {
                 connection_id: self.connection_id(),
                 request_id: self.request_id(),
                 stats,
@@ -486,15 +584,19 @@ pub async fn send_blob<D: Map, W: AsyncStreamWriter>(
     db: &D,
     hash: Hash,
     ranges: &RangeSpec,
     mut writer: W,
+    events: EventSender,
+    mk_progress: impl Fn(u64) -> Event,
 ) -> Result<(SentStatus, u64, SliceReaderStats)> {
     match db.get(&hash).await? {
         Some(entry) => {
             let outboard = entry.outboard().await?;
             let size = outboard.tree().size();
             let mut file_reader = TrackingSliceReader::new(entry.data_reader().await?);
+            let mut sending_reader =
+                SendingSliceReader::new(&mut file_reader, &events, mk_progress);
             writer.write(size.to_le_bytes().as_slice()).await?;
             encode_ranges_validated(
-                &mut file_reader,
+                &mut sending_reader,
                 outboard,
                 &ranges.to_chunk_ranges(),
                 writer,
diff --git a/iroh/src/client/blobs.rs b/iroh/src/client/blobs.rs
index 04e544e8b1..2935ba891a 100644
--- a/iroh/src/client/blobs.rs
+++ b/iroh/src/client/blobs.rs
@@ -948,7 +948,7 @@ mod tests {
     use iroh_net::NodeId;
     use rand::RngCore;
     use testresult::TestResult;
-    use tokio::io::AsyncWriteExt;
+    use tokio::{io::AsyncWriteExt, sync::mpsc};
 
     #[tokio::test]
     async fn test_blob_create_collection() -> Result<()> {
@@ -1252,6 +1252,95 @@ mod tests {
         Ok(())
     }
 
+    #[derive(Debug, Clone)]
+    struct BlobEvents {
+        sender: mpsc::Sender<iroh_blobs::provider::Event>,
+    }
+
+    impl BlobEvents {
+        fn new(cap: usize) -> (Self, mpsc::Receiver<iroh_blobs::provider::Event>) {
+            let (s, r) = mpsc::channel(cap);
+            (Self { sender: s }, r)
+        }
+    }
+
+    impl iroh_blobs::provider::CustomEventSender for BlobEvents {
+        fn send(&self, event: iroh_blobs::provider::Event) -> futures_lite::future::Boxed<()> {
+            let sender = self.sender.clone();
+            Box::pin(async move {
+                sender.send(event).await.ok();
+            })
+        }
+
+        fn try_send(&self, event: iroh_blobs::provider::Event) {
+            self.sender.try_send(event).ok();
+        }
+    }
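`BlobEvents` above makes the two delivery modes concrete: `send` awaits channel capacity, while `try_send` drops the event when the channel is full, which is the right trade-off for high-frequency `TransferProgress` updates. A tiny standalone illustration of that difference with a plain tokio channel (not code from the PR):

```rust
use tokio::sync::mpsc;

#[tokio::main]
async fn main() {
    let (tx, mut rx) = mpsc::channel::<u32>(1);
    tx.send(1).await.unwrap(); // fills the only slot
    // `try_send` fails instead of waiting; a progress event would simply be dropped
    assert!(tx.try_send(2).is_err());
    assert_eq!(rx.recv().await, Some(1));
}
```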
+    #[tokio::test]
+    async fn test_blob_provide_events() -> Result<()> {
+        let _guard = iroh_test::logging::setup();
+
+        let (node1_events, mut node1_events_r) = BlobEvents::new(16);
+        let node1 = crate::node::Node::memory()
+            .blobs_events(node1_events)
+            .spawn()
+            .await?;
+
+        let (node2_events, mut node2_events_r) = BlobEvents::new(16);
+        let node2 = crate::node::Node::memory()
+            .blobs_events(node2_events)
+            .spawn()
+            .await?;
+
+        let import_outcome = node1.blobs().add_bytes(&b"hello world"[..]).await?;
+
+        // Download in node2
+        let node1_addr = node1.node_addr().await?;
+        let res = node2
+            .blobs()
+            .download(import_outcome.hash, node1_addr)
+            .await?
+            .await?;
+        dbg!(&res);
+        assert_eq!(res.local_size, 0);
+        assert_eq!(res.downloaded_size, 11);
+
+        node1.shutdown().await?;
+        node2.shutdown().await?;
+
+        let mut ev1 = Vec::new();
+        while let Some(ev) = node1_events_r.recv().await {
+            ev1.push(ev);
+        }
+        // assert_eq!(ev1.len(), 3);
+        assert!(matches!(
+            ev1[0],
+            iroh_blobs::provider::Event::ClientConnected { .. }
+        ));
+        assert!(matches!(
+            ev1[1],
+            iroh_blobs::provider::Event::GetRequestReceived { .. }
+        ));
+        assert!(matches!(
+            ev1[2],
+            iroh_blobs::provider::Event::TransferProgress { .. }
+        ));
+        assert!(matches!(
+            ev1[3],
+            iroh_blobs::provider::Event::TransferCompleted { .. }
+        ));
+        dbg!(&ev1);
+
+        let mut ev2 = Vec::new();
+        while let Some(ev) = node2_events_r.recv().await {
+            ev2.push(ev);
+        }
+
+        // Node 2 did not provide anything
+        assert!(ev2.is_empty());
+        Ok(())
+    }
 
     /// Download an existing blob from oneself
     #[tokio::test]
     async fn test_blob_get_self_existing() -> TestResult<()> {
diff --git a/iroh/src/node/builder.rs b/iroh/src/node/builder.rs
index 52b97b1b93..34dbf4b3c1 100644
--- a/iroh/src/node/builder.rs
+++ b/iroh/src/node/builder.rs
@@ -10,6 +10,7 @@ use futures_lite::StreamExt;
 use iroh_base::key::SecretKey;
 use iroh_blobs::{
     downloader::Downloader,
+    provider::EventSender,
     store::{Map, Store as BaoStore},
     util::local_pool::{self, LocalPool, LocalPoolHandle, PanicMode},
 };
@@ -112,6 +113,7 @@ where
     /// Callback to register when a gc loop is done
     #[debug("callback")]
     gc_done_callback: Option<Box<dyn Fn() + Send>>,
+    blob_events: EventSender,
 }
 
 /// Configuration for storage.
@@ -237,6 +239,7 @@ impl Default for Builder<iroh_blobs::store::mem::Store> {
             #[cfg(any(test, feature = "test-utils"))]
             insecure_skip_relay_cert_verify: false,
             gc_done_callback: None,
+            blob_events: Default::default(),
         }
     }
 }
@@ -270,6 +273,7 @@ impl<D: Map> Builder<D> {
             #[cfg(any(test, feature = "test-utils"))]
             insecure_skip_relay_cert_verify: false,
             gc_done_callback: None,
+            blob_events: Default::default(),
         }
     }
 }
@@ -278,6 +282,15 @@ impl<D> Builder<D>
 where
     D: BaoStore,
 {
+    /// Configure a blob events sender. This will replace the previous blob
+    /// event sender. By default, no events are sent.
+    ///
+    /// To define an event sender, implement the [`iroh_blobs::provider::CustomEventSender`] trait.
+    pub fn blobs_events(mut self, blob_events: impl Into<EventSender>) -> Self {
+        self.blob_events = blob_events.into();
+        self
+    }
+
     /// Persist all node data in the provided directory.
     pub async fn persist(
         self,
@@ -334,6 +347,7 @@ where
             #[cfg(any(test, feature = "test-utils"))]
             insecure_skip_relay_cert_verify: false,
             gc_done_callback: self.gc_done_callback,
+            blob_events: self.blob_events,
         })
     }
 
@@ -627,7 +641,7 @@ where
             local_pool: lp,
         };
 
-        let protocol_builder = protocol_builder.register_iroh_protocols();
+        let protocol_builder = protocol_builder.register_iroh_protocols(self.blob_events);
 
         Ok(protocol_builder)
     }
@@ -750,10 +764,13 @@ impl<D: BaoStore> ProtocolBuilder<D> {
     }
 
     /// Registers the core iroh protocols (blobs, gossip, docs).
-    fn register_iroh_protocols(mut self) -> Self {
+    fn register_iroh_protocols(mut self, blob_events: EventSender) -> Self {
         // Register blobs.
-        let blobs_proto =
-            BlobsProtocol::new(self.blobs_db().clone(), self.local_pool_handle().clone());
+        let blobs_proto = BlobsProtocol::new_with_events(
+            self.blobs_db().clone(),
+            self.local_pool_handle().clone(),
+            blob_events,
+        );
         self = self.accept(iroh_blobs::protocol::ALPN, Arc::new(blobs_proto));
 
         // Register gossip.
diff --git a/iroh/src/node/protocol.rs b/iroh/src/node/protocol.rs
index ce342ab249..aef3b170f4 100644
--- a/iroh/src/node/protocol.rs
+++ b/iroh/src/node/protocol.rs
@@ -3,7 +3,7 @@ use std::{any::Any, collections::BTreeMap, fmt, sync::Arc};
 use anyhow::Result;
 use futures_lite::future::Boxed as BoxedFuture;
 use futures_util::future::join_all;
-use iroh_blobs::util::local_pool::LocalPoolHandle;
+use iroh_blobs::{provider::EventSender, util::local_pool::LocalPoolHandle};
 use iroh_net::endpoint::Connecting;
 
 /// Handler for incoming connections.
@@ -81,11 +81,12 @@ impl ProtocolMap {
 pub(crate) struct BlobsProtocol<S> {
     rt: LocalPoolHandle,
     store: S,
+    events: EventSender,
 }
 
 impl<S: iroh_blobs::store::Store> BlobsProtocol<S> {
-    pub fn new(store: S, rt: LocalPoolHandle) -> Self {
-        Self { rt, store }
+    pub fn new_with_events(store: S, rt: LocalPoolHandle, events: EventSender) -> Self {
+        Self { rt, store, events }
     }
 }
 
@@ -95,7 +96,7 @@ impl<S: iroh_blobs::store::Store> ProtocolHandler for BlobsProtocol<S> {
             iroh_blobs::provider::handle_connection(
                 conn.await?,
                 self.store.clone(),
-                MockEventSender,
+                self.events.clone(),
                 self.rt.clone(),
             )
             .await;
@@ -104,15 +105,6 @@ impl<S: iroh_blobs::store::Store> ProtocolHandler for BlobsProtocol<S> {
     }
 }
 
-#[derive(Debug, Clone)]
-struct MockEventSender;
-
-impl iroh_blobs::provider::EventSender for MockEventSender {
-    fn send(&self, _event: iroh_blobs::provider::Event) -> futures_lite::future::Boxed<()> {
-        Box::pin(std::future::ready(()))
-    }
-}
-
 impl ProtocolHandler for iroh_gossip::net::Gossip {
     fn accept(self: Arc<Self>, conn: Connecting) -> BoxedFuture<Result<()>> {
         Box::pin(async move { self.handle_connection(conn.await?).await })

From a5072c3a0a11d931b3fc4e95ac48c32f12959a5b Mon Sep 17 00:00:00 2001
From: Divma <26765164+divagant-martian@users.noreply.github.com>
Date: Tue, 13 Aug 2024 11:38:22 -0500
Subject: [PATCH 44/45] chore(ci): use nextest groups to isolate some tests
 (#2617)

## Description

After a lot of debugging we realized the integration tests (`gc` tests, for example) run with swarm discovery on: since they are not technically using test iroh, they bypass the cfg that prevents it from being turned on. This causes the `test_local_swarm_discovery` test to be flooded with nodes from other tests, making it flaky.

Isolating the test should make it no longer flaky.

## Breaking Changes

n/a

## Notes & open questions

We should still create a test that creates a bunch of nodes and makes sure they all can find each other within a reasonable time. There is still reason to believe swarm discovery is slow, particularly on Linux, but we should address it in a test under our control, not via the non-deterministic nature of concurrent tests in CI. A sketch of such a test follows.
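Sketching what that follow-up test could look like, reusing the `make_discoverer` helper and the `publish`/`resolve` flow from the test module in this patch; the node count and the 10-second deadline are placeholder choices, not values from this PR:

```rust
// Hypothetical follow-up test, not included in this PR: several nodes
// publish, and one resolver must find each of them before a deadline.
#[tokio::test]
async fn test_local_swarm_discovery_many_nodes() -> TestResult {
    let _guard = iroh_test::logging::setup();
    let (_, resolver) = make_discoverer()?;
    let ep = crate::endpoint::Builder::default().bind(0).await?;
    let mut published = Vec::new();
    for _ in 0..10 {
        let (node_id, discovery) = make_discoverer()?;
        discovery.publish(&AddrInfo {
            relay_url: None,
            direct_addresses: BTreeSet::from(["0.0.0.0:11111".parse()?]),
        });
        published.push((node_id, discovery)); // keep the discoverers alive
    }
    for (node_id, _keep) in &published {
        let mut stream = resolver.resolve(ep.clone(), *node_id).unwrap();
        // "within a reasonable time" is the open question; 10s is a guess
        tokio::time::timeout(Duration::from_secs(10), stream.next())
            .await?
            .unwrap()?;
    }
    Ok(())
}
```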
## Change checklist

- [x] Self-review.
- [ ] ~~Documentation updates following the [style guide](https://rust-lang.github.io/rfcs/1574-more-api-documentation-conventions.html#appendix-a-full-conventions-text), if relevant.~~
- [ ] ~~Tests if relevant.~~
- [ ] ~~All breaking changes documented.~~
---
 .config/nextest.toml                               | 10 +++
 .github/workflows/tests.yaml                       |  4 +-
 .../src/discovery/local_swarm_discovery.rs         | 73 ++++++++++---------
 3 files changed, 51 insertions(+), 36 deletions(-)
 create mode 100644 .config/nextest.toml

diff --git a/.config/nextest.toml b/.config/nextest.toml
new file mode 100644
index 0000000000..fc3ed95a21
--- /dev/null
+++ b/.config/nextest.toml
@@ -0,0 +1,10 @@
+[test-groups]
+run-in-isolation = { max-threads = 16 }
+# These are tests that must not run concurrently with other tests. All tests in
+# this group can take up at most 16 threads among them, and each test also
+# requires 16 threads, so the effect is that these tests run in isolation.
+
+[[profile.ci.overrides]]
+filter = 'test(::run_in_isolation::)'
+test-group = 'run-in-isolation'
+threads-required = 16
diff --git a/.github/workflows/tests.yaml b/.github/workflows/tests.yaml
index 94b07f652b..7f58de3733 100644
--- a/.github/workflows/tests.yaml
+++ b/.github/workflows/tests.yaml
@@ -114,7 +114,7 @@ jobs:
       - name: run tests
         run: |
           mkdir -p output
-          cargo nextest run --workspace ${{ env.FEATURES }} --lib --bins --tests --run-ignored ${{ inputs.flaky && 'all' || 'default' }} --no-fail-fast --message-format ${{ inputs.flaky && 'libtest-json' || 'human' }} > output/${{ matrix.name }}_${{ matrix.features }}_${{ matrix.rust }}.json
+          cargo nextest run --workspace ${{ env.FEATURES }} --lib --bins --tests --profile ci --run-ignored ${{ inputs.flaky && 'all' || 'default' }} --no-fail-fast --message-format ${{ inputs.flaky && 'libtest-json' || 'human' }} > output/${{ matrix.name }}_${{ matrix.features }}_${{ matrix.rust }}.json
         env:
           RUST_LOG: ${{ runner.debug && 'TRACE' || 'DEBUG'}}
           NEXTEST_EXPERIMENTAL_LIBTEST_JSON: 1
@@ -213,7 +213,7 @@ jobs:
       - name: tests
         run: |
           mkdir -p output
-          cargo nextest run --workspace ${{ env.FEATURES }} --lib --bins --tests --target ${{ matrix.target }} --run-ignored ${{ inputs.flaky && 'all' || 'default' }} --no-fail-fast --message-format ${{ inputs.flaky && 'libtest-json' || 'human' }} > output/${{ matrix.name }}_${{ matrix.features }}_${{ matrix.rust }}.json
+          cargo nextest run --workspace ${{ env.FEATURES }} --lib --bins --tests --profile ci --target ${{ matrix.target }} --run-ignored ${{ inputs.flaky && 'all' || 'default' }} --no-fail-fast --message-format ${{ inputs.flaky && 'libtest-json' || 'human' }} > output/${{ matrix.name }}_${{ matrix.features }}_${{ matrix.rust }}.json
         env:
           RUST_LOG: ${{ runner.debug && 'TRACE' || 'DEBUG'}}
           NEXTEST_EXPERIMENTAL_LIBTEST_JSON: 1
diff --git a/iroh-net/src/discovery/local_swarm_discovery.rs b/iroh-net/src/discovery/local_swarm_discovery.rs
index a42594a6e8..4c83b44cd2 100644
--- a/iroh-net/src/discovery/local_swarm_discovery.rs
+++ b/iroh-net/src/discovery/local_swarm_discovery.rs
@@ -127,7 +127,7 @@ impl LocalSwarmDiscovery {
                             sender.send(Ok(item)).await.ok();
                         }
                     }
-                    trace!(
+                    debug!(
                         ?discovered_node_id,
                         ?peer_info,
                         "adding node to LocalSwarmDiscovery address book"
@@ -272,42 +272,47 @@ impl Discovery for LocalSwarmDiscovery {
 
 #[cfg(test)]
 mod tests {
-    use super::*;
-    use testresult::TestResult;
-
-    #[tokio::test]
-    async fn test_local_swarm_discovery() -> TestResult {
-        let _guard = iroh_test::logging::setup();
-        let (_, discovery_a) = make_discoverer()?;
-        let (node_id_b, discovery_b) = make_discoverer()?;
+    /// This module's name signals nextest to run tests in a single thread (no other concurrent
+    /// tests)
+    mod run_in_isolation {
+        use super::super::*;
+        use testresult::TestResult;
 
-        // make addr info for discoverer b
-        let addr_info = AddrInfo {
-            relay_url: None,
-            direct_addresses: BTreeSet::from(["0.0.0.0:11111".parse()?]),
-        };
+        #[tokio::test]
+        async fn test_local_swarm_discovery() -> TestResult {
+            let _guard = iroh_test::logging::setup();
+            let (_, discovery_a) = make_discoverer()?;
+            let (node_id_b, discovery_b) = make_discoverer()?;
 
-        // pass in endpoint, this is never used
-        let ep = crate::endpoint::Builder::default().bind(0).await?;
-        // resolve twice to ensure we can create separate streams for the same node_id
-        let mut s1 = discovery_a.resolve(ep.clone(), node_id_b).unwrap();
-        let mut s2 = discovery_a.resolve(ep, node_id_b).unwrap();
-        tracing::debug!(?node_id_b, "Discovering node id b");
-        // publish discovery_b's address
-        discovery_b.publish(&addr_info);
-        let s1_res = tokio::time::timeout(Duration::from_secs(5), s1.next())
-            .await?
-            .unwrap()?;
-        let s2_res = tokio::time::timeout(Duration::from_secs(5), s2.next())
-            .await?
-            .unwrap()?;
-        assert_eq!(s1_res.addr_info, addr_info);
-        assert_eq!(s2_res.addr_info, addr_info);
-        Ok(())
-    }
+            // make addr info for discoverer b
+            let addr_info = AddrInfo {
+                relay_url: None,
+                direct_addresses: BTreeSet::from(["0.0.0.0:11111".parse()?]),
+            };
 
-    fn make_discoverer() -> Result<(PublicKey, LocalSwarmDiscovery)> {
-        let node_id = crate::key::SecretKey::generate().public();
-        Ok((node_id, LocalSwarmDiscovery::new(node_id)?))
+            // pass in endpoint, this is never used
+            let ep = crate::endpoint::Builder::default().bind(0).await?;
+            // resolve twice to ensure we can create separate streams for the same node_id
+            let mut s1 = discovery_a.resolve(ep.clone(), node_id_b).unwrap();
+            let mut s2 = discovery_a.resolve(ep, node_id_b).unwrap();
+            tracing::debug!(?node_id_b, "Discovering node id b");
+            // publish discovery_b's address
+            discovery_b.publish(&addr_info);
+            let s1_res = tokio::time::timeout(Duration::from_secs(5), s1.next())
+                .await?
+                .unwrap()?;
+            let s2_res = tokio::time::timeout(Duration::from_secs(5), s2.next())
+                .await?
+                .unwrap()?;
+            assert_eq!(s1_res.addr_info, addr_info);
+            assert_eq!(s2_res.addr_info, addr_info);
+            Ok(())
+        }
+
+        fn make_discoverer() -> Result<(PublicKey, LocalSwarmDiscovery)> {
+            let node_id = crate::key::SecretKey::generate().public();
+            Ok((node_id, LocalSwarmDiscovery::new(node_id)?))
+        }
     }
 }

From 74a527b9699e5da06c0b85bcb32a873397906472 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?R=C3=BCdiger=20Klaehn?=
Date: Wed, 14 Aug 2024 09:38:59 +0300
Subject: [PATCH 45/45] refactor(iroh-net)!: remove async channel (#2620)

## Description

Removes async-channel from iroh-net and replaces it with the tokio mpsc channel.

This is a mergeable version of https://github.com/n0-computer/iroh/pull/2614

## Breaking Changes

LocalSwarmDiscovery is no longer UnwindSafe

## Notes & open questions

Open question: I am using blocking_send from inside the windows RouteMonitor. This depends on the callback thread not being a tokio thread, and I have no idea if that is a reasonable assumption to make. Otherwise we would have to bring back the runtime check condition or just capture a runtime handle and spawn a task.

Note: what even is this? The only message is NetworkMessage::Change, and there is no timestamp or anything. So could I just use try_send? If the queue is full, there is already a Change in it, so it is as good as the new one...
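A sketch of the capture-a-handle alternative mentioned above; this is not the code the PR ships, and the `NetworkMessage` stub stands in for the real netmon type. Spawning from the callback is safe no matter which thread the OS invokes it on, and it is the same trick `spawn_discoverer` in local_swarm_discovery.rs uses for the swarm-discovery callback:

```rust
use tokio::sync::mpsc;

// Stand-in for iroh-net's netmon message type.
enum NetworkMessage {
    Change,
}

// Sketch: a route-change callback that is safe on tokio and non-tokio threads.
fn make_route_callback(
    sender: mpsc::Sender<NetworkMessage>,
    rt: tokio::runtime::Handle,
) -> impl Fn() + Send + 'static {
    move || {
        let sender = sender.clone();
        // Spawning instead of `blocking_send` avoids the panic that
        // `blocking_send` raises when called from a runtime thread.
        rt.spawn(async move {
            sender.send(NetworkMessage::Change).await.ok();
        });
    }
}
```

And since the only payload is `Change`, `try_send` would indeed be equivalent: if the queue is full, a change notification is already pending.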
## Change checklist

- [x] Self-review.
- [ ] Documentation updates following the [style guide](https://rust-lang.github.io/rfcs/1574-more-api-documentation-conventions.html#appendix-a-full-conventions-text), if relevant.
- [ ] Tests if relevant.
- [ ] All breaking changes documented.

---------

Co-authored-by: dignifiedquire
---
 Cargo.lock                                         |  2 +-
 iroh-net/Cargo.toml                                |  9 ++--
 .../src/discovery/local_swarm_discovery.rs         | 41 +++++++++++--------
 iroh-net/src/magicsock.rs                          | 15 +++----
 iroh-net/src/magicsock/udp_conn.rs                 |  5 ++-
 iroh-net/src/net/netmon/actor.rs                   |  6 +--
 iroh-net/src/net/netmon/android.rs                 |  3 +-
 iroh-net/src/net/netmon/bsd.rs                     |  4 +-
 iroh-net/src/net/netmon/linux.rs                   |  4 +-
 iroh-net/src/net/netmon/windows.rs                 |  7 ++--
 10 files changed, 55 insertions(+), 41 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index 00f2b29a0f..f229a55ce6 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -2839,7 +2839,6 @@ name = "iroh-net"
 version = "0.22.0"
 dependencies = [
  "anyhow",
- "async-channel",
  "axum",
  "backoff",
  "base64 0.22.1",
@@ -2913,6 +2912,7 @@ dependencies = [
  "tokio",
  "tokio-rustls 0.24.1",
  "tokio-rustls-acme",
+ "tokio-stream",
 "tokio-tungstenite",
 "tokio-tungstenite-wasm",
 "tokio-util",
diff --git a/iroh-net/Cargo.toml b/iroh-net/Cargo.toml
index 6e0e154dc8..4c00a036c2 100644
--- a/iroh-net/Cargo.toml
+++ b/iroh-net/Cargo.toml
@@ -17,7 +17,6 @@ workspace = true
 
 [dependencies]
 anyhow = { version = "1" }
-async-channel = "2.3.1"
 base64 = "0.22.1"
 backoff = "0.4.0"
 bytes = "1"
@@ -58,7 +57,6 @@ ring = "0.17"
 rustls = { version = "0.21.11", default-features = false, features = ["dangerous_configuration"] }
 serde = { version = "1", features = ["derive", "rc"] }
 smallvec = "1.11.1"
-swarm-discovery = { version = "0.2.1", optional = true }
 socket2 = "0.5.3"
 stun-rs = "0.1.5"
 surge-ping = "0.8.0"
@@ -92,6 +90,11 @@ tokio-rustls-acme = { version = "0.3", optional = true }
 iroh-metrics = { version = "0.22.0", path = "../iroh-metrics", default-features = false }
 strum = { version = "0.26.2", features = ["derive"] }
 
+# local_swarm_discovery
+swarm-discovery = { version = "0.2.1", optional = true }
+tokio-stream = { version = "0.1.15", optional = true }
+
+
 [target.'cfg(any(target_os = "linux", target_os = "android"))'.dependencies]
 netlink-packet-core = "0.7.0"
 netlink-packet-route = "0.17.0"
@@ -140,7 +143,7 @@ iroh-relay = [
 ]
 metrics = ["iroh-metrics/metrics"]
 test-utils = ["iroh-relay"]
-local_swarm_discovery = ["dep:swarm-discovery"]
+local_swarm_discovery = ["dep:swarm-discovery", "dep:tokio-stream"]
 
 [[bin]]
 name = "iroh-relay"
diff --git a/iroh-net/src/discovery/local_swarm_discovery.rs b/iroh-net/src/discovery/local_swarm_discovery.rs
index 4c83b44cd2..b952a06b08 100644
--- a/iroh-net/src/discovery/local_swarm_discovery.rs
+++ b/iroh-net/src/discovery/local_swarm_discovery.rs
@@ -11,13 +11,12 @@ use std::{
 
 use anyhow::Result;
 use derive_more::FromStr;
-use futures_lite::{stream::Boxed as BoxStream, StreamExt};
+use futures_lite::stream::Boxed as BoxStream;
 use tracing::{debug, error, trace, warn};
 
-use async_channel::Sender;
 use iroh_base::key::PublicKey;
 use swarm_discovery::{Discoverer, DropGuard, IpClass, Peer};
-use tokio::task::JoinSet;
+use tokio::{sync::mpsc, task::JoinSet};
 
 use crate::{
     discovery::{Discovery, DiscoveryItem},
@@ -39,13 +38,13 @@ const DISCOVERY_DURATION: Duration = Duration::from_secs(10);
 pub struct LocalSwarmDiscovery {
     #[allow(dead_code)]
     handle: AbortingJoinHandle<()>,
-    sender: Sender<Message>,
+    sender: mpsc::Sender<Message>,
 }
 
 #[derive(Debug)]
 enum Message {
     Discovery(String, Peer),
-    SendAddrs(NodeId, Sender<Result<DiscoveryItem>>),
+    SendAddrs(NodeId, mpsc::Sender<Result<DiscoveryItem>>),
     ChangeLocalAddrs(AddrInfo),
     Timeout(NodeId, usize),
 }
@@ -62,7 +61,7 @@ impl LocalSwarmDiscovery {
     /// This relies on [`tokio::runtime::Handle::current`] and will panic if called outside of the context of a tokio runtime.
     pub fn new(node_id: NodeId) -> Result<Self> {
         debug!("Creating new LocalSwarmDiscovery service");
-        let (send, recv) = async_channel::bounded(64);
+        let (send, mut recv) = mpsc::channel(64);
         let task_sender = send.clone();
         let rt = tokio::runtime::Handle::current();
         let discovery = LocalSwarmDiscovery::spawn_discoverer(
@@ -75,19 +74,21 @@ impl LocalSwarmDiscovery {
         let handle = tokio::spawn(async move {
             let mut node_addrs: HashMap<PublicKey, Peer> = HashMap::default();
             let mut last_id = 0;
-            let mut senders: HashMap<PublicKey, HashMap<usize, Sender<Result<DiscoveryItem>>>> =
-                HashMap::default();
+            let mut senders: HashMap<
+                PublicKey,
+                HashMap<usize, mpsc::Sender<Result<DiscoveryItem>>>,
+            > = HashMap::default();
             let mut timeouts = JoinSet::new();
             loop {
                 trace!(?node_addrs, "LocalSwarmDiscovery Service loop tick");
                 let msg = match recv.recv().await {
-                    Err(err) => {
-                        error!("LocalSwarmDiscovery service error: {err:?}");
+                    None => {
+                        error!("LocalSwarmDiscovery channel closed");
                         error!("closing LocalSwarmDiscovery");
                         timeouts.abort_all();
                         return;
                     }
-                    Ok(msg) => msg,
+                    Some(msg) => msg,
                 };
                 match msg {
                     Message::Discovery(discovered_node_id, peer_info) => {
@@ -189,10 +190,11 @@ impl LocalSwarmDiscovery {
 
     fn spawn_discoverer(
         node_id: PublicKey,
-        sender: Sender<Message>,
+        sender: mpsc::Sender<Message>,
         socketaddrs: BTreeSet<SocketAddr>,
         rt: &tokio::runtime::Handle,
     ) -> Result<DropGuard> {
+        let spawn_rt = rt.clone();
         let callback = move |node_id: &str, peer: &Peer| {
             trace!(
                 node_id,
@@ -200,9 +202,12 @@ impl LocalSwarmDiscovery {
                 "Received peer information from LocalSwarmDiscovery"
             );
 
-            sender
-                .send_blocking(Message::Discovery(node_id.to_string(), peer.clone()))
-                .ok();
+            let sender = sender.clone();
+            let node_id = node_id.to_string();
+            let peer = peer.clone();
+            spawn_rt.spawn(async move {
+                sender.send(Message::Discovery(node_id, peer)).await.ok();
+            });
         };
         let addrs = LocalSwarmDiscovery::socketaddrs_to_addrs(socketaddrs);
         let mut discoverer =
@@ -247,7 +252,7 @@ impl From<&Peer> for DiscoveryItem {
 
 impl Discovery for LocalSwarmDiscovery {
     fn resolve(&self, _ep: Endpoint, node_id: NodeId) -> Option<BoxStream<Result<DiscoveryItem>>> {
-        let (send, recv) = async_channel::bounded(20);
+        let (send, recv) = mpsc::channel(20);
         let discovery_sender = self.sender.clone();
         tokio::spawn(async move {
             discovery_sender
@@ -255,7 +260,8 @@ impl Discovery for LocalSwarmDiscovery {
                 .await
                 .ok();
         });
-        Some(recv.boxed())
+        let stream = tokio_stream::wrappers::ReceiverStream::new(recv);
+        Some(Box::pin(stream))
     }
 
     fn publish(&self, info: &AddrInfo) {
@@ -277,6 +283,7 @@ mod tests {
     /// tests)
     mod run_in_isolation {
         use super::super::*;
+        use futures_lite::StreamExt;
         use testresult::TestResult;
 
         #[tokio::test]
diff --git a/iroh-net/src/magicsock.rs b/iroh-net/src/magicsock.rs
index 2f54e459ce..995ec326f4 100644
--- a/iroh-net/src/magicsock.rs
+++ b/iroh-net/src/magicsock.rs
@@ -177,7 +177,7 @@ pub(crate) struct MagicSock {
     proxy_url: Option<Url>,
 
     /// Used for receiving relay messages.
-    relay_recv_receiver: async_channel::Receiver<RelayRecvResult>,
+    relay_recv_receiver: parking_lot::Mutex<mpsc::Receiver<RelayRecvResult>>,
     /// Stores wakers, to be called when relay_recv_ch receives new data.
     network_recv_wakers: parking_lot::Mutex<Option<Waker>>,
     network_send_wakers: parking_lot::Mutex<Option<Waker>>,
@@ -788,12 +788,13 @@ impl MagicSock {
             if self.is_closed() {
                 break;
             }
-            match self.relay_recv_receiver.try_recv() {
-                Err(async_channel::TryRecvError::Empty) => {
+            let mut relay_recv_receiver = self.relay_recv_receiver.lock();
+            match relay_recv_receiver.try_recv() {
+                Err(mpsc::error::TryRecvError::Empty) => {
                     self.network_recv_wakers.lock().replace(cx.waker().clone());
                     break;
                 }
-                Err(async_channel::TryRecvError::Closed) => {
+                Err(mpsc::error::TryRecvError::Disconnected) => {
                     return Poll::Ready(Err(io::Error::new(
                         io::ErrorKind::NotConnected,
                         "connection closed",
@@ -1378,7 +1379,7 @@ impl Handle {
             insecure_skip_relay_cert_verify,
         } = opts;
 
-        let (relay_recv_sender, relay_recv_receiver) = async_channel::bounded(128);
+        let (relay_recv_sender, relay_recv_receiver) = mpsc::channel(128);
 
         let (pconn4, pconn6) = bind(port)?;
         let port = pconn4.port();
@@ -1412,7 +1413,7 @@ impl Handle {
             local_addrs: std::sync::RwLock::new((ipv4_addr, ipv6_addr)),
             closing: AtomicBool::new(false),
             closed: AtomicBool::new(false),
-            relay_recv_receiver,
+            relay_recv_receiver: parking_lot::Mutex::new(relay_recv_receiver),
             network_recv_wakers: parking_lot::Mutex::new(None),
             network_send_wakers: parking_lot::Mutex::new(None),
             actor_sender: actor_sender.clone(),
@@ -1704,7 +1705,7 @@ struct Actor {
     relay_actor_sender: mpsc::Sender<RelayActorMessage>,
     relay_actor_cancel_token: CancellationToken,
     /// Channel to send received relay messages on, for processing.
-    relay_recv_sender: async_channel::Sender<RelayRecvResult>,
+    relay_recv_sender: mpsc::Sender<RelayRecvResult>,
     /// When set, is an AfterFunc timer that will call MagicSock::do_periodic_stun.
     periodic_re_stun_timer: time::Interval,
     /// The `NetInfo` provided in the last call to `net_info_func`. It's used to deduplicate calls to netInfoFunc.
diff --git a/iroh-net/src/magicsock/udp_conn.rs b/iroh-net/src/magicsock/udp_conn.rs
index f4d641db26..e6d6444d09 100644
--- a/iroh-net/src/magicsock/udp_conn.rs
+++ b/iroh-net/src/magicsock/udp_conn.rs
@@ -151,6 +151,7 @@ mod tests {
     use super::*;
     use anyhow::Result;
+    use tokio::sync::mpsc;
 
     const ALPN: &[u8] = b"n0/test/1";
 
@@ -192,7 +193,7 @@ mod tests {
         let (m2, _m2_key) = wrap_socket(m2)?;
 
         let m1_addr = SocketAddr::new(network.local_addr(), m1.local_addr()?.port());
-        let (m1_send, m1_recv) = async_channel::bounded(8);
+        let (m1_send, mut m1_recv) = mpsc::channel(8);
 
         let m1_task = tokio::task::spawn(async move {
             if let Some(conn) = m1.accept().await {
@@ -220,7 +221,7 @@ mod tests {
         drop(send_bi);
 
         // make sure the right values arrived
-        let val = m1_recv.recv().await?;
+        let val = m1_recv.recv().await.unwrap();
         assert_eq!(val, b"hello");
 
         m1_task.await??;
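The magicsock hunk above keeps a poll-based receive path on top of the new channel: `try_recv` runs under the `parking_lot::Mutex`, and on `Empty` the task parks its waker so the relay actor can wake it after the next send. The same pattern reduced to its essentials (a sketch, not code from the PR):

```rust
use std::task::{Context, Poll, Waker};
use tokio::sync::mpsc;

// Sketch of the poll/waker bridge used by `MagicSock::poll_recv` above.
struct PollBridge<T> {
    receiver: parking_lot::Mutex<mpsc::Receiver<T>>,
    waker: parking_lot::Mutex<Option<Waker>>,
}

impl<T> PollBridge<T> {
    fn poll_next(&self, cx: &mut Context<'_>) -> Poll<Option<T>> {
        match self.receiver.lock().try_recv() {
            Ok(item) => Poll::Ready(Some(item)),
            Err(mpsc::error::TryRecvError::Empty) => {
                // Park the waker; the sending side must call `wake()` after
                // every successful send, or this task is never polled again.
                self.waker.lock().replace(cx.waker().clone());
                Poll::Pending
            }
            Err(mpsc::error::TryRecvError::Disconnected) => Poll::Ready(None),
        }
    }
}
```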
diff --git a/iroh-net/src/net/netmon/actor.rs b/iroh-net/src/net/netmon/actor.rs
index 083e482caa..b30aee228a 100644
--- a/iroh-net/src/net/netmon/actor.rs
+++ b/iroh-net/src/net/netmon/actor.rs
@@ -57,7 +57,7 @@ pub(super) struct Actor {
     /// OS specific monitor.
     #[allow(dead_code)]
     route_monitor: RouteMonitor,
-    mon_receiver: async_channel::Receiver<NetworkMessage>,
+    mon_receiver: mpsc::Receiver<NetworkMessage>,
     actor_receiver: mpsc::Receiver<ActorMessage>,
     actor_sender: mpsc::Sender<ActorMessage>,
     /// Callback registry.
@@ -84,7 +84,7 @@ impl Actor {
         let wall_time = Instant::now();
 
         // Use flume channels, as tokio::mpsc is not safe to use across ffi boundaries.
-        let (mon_sender, mon_receiver) = async_channel::bounded(MON_CHAN_CAPACITY);
+        let (mon_sender, mon_receiver) = mpsc::channel(MON_CHAN_CAPACITY);
         let route_monitor = RouteMonitor::new(mon_sender)?;
         let (actor_sender, actor_receiver) = mpsc::channel(ACTOR_CHAN_CAPACITY);
@@ -129,7 +129,7 @@ impl Actor {
                         debounce_interval.reset_immediately();
                     }
                 }
-                Ok(_event) = self.mon_receiver.recv() => {
+                Some(_event) = self.mon_receiver.recv() => {
                     trace!("network activity detected");
                     last_event.replace(false);
                     debounce_interval.reset_immediately();
diff --git a/iroh-net/src/net/netmon/android.rs b/iroh-net/src/net/netmon/android.rs
index f92eb721f0..ace7e8f326 100644
--- a/iroh-net/src/net/netmon/android.rs
+++ b/iroh-net/src/net/netmon/android.rs
@@ -1,4 +1,5 @@
 use anyhow::Result;
+use tokio::sync::mpsc;
 
 use super::actor::NetworkMessage;
 
@@ -6,7 +7,7 @@ use super::actor::NetworkMessage;
 pub(super) struct RouteMonitor {}
 
 impl RouteMonitor {
-    pub(super) fn new(_sender: async_channel::Sender<NetworkMessage>) -> Result<Self> {
+    pub(super) fn new(_sender: mpsc::Sender<NetworkMessage>) -> Result<Self> {
         // Very sad monitor. Android doesn't allow us to do this
 
         Ok(RouteMonitor {})
diff --git a/iroh-net/src/net/netmon/bsd.rs b/iroh-net/src/net/netmon/bsd.rs
index 20daef64ba..969f85a6d3 100644
--- a/iroh-net/src/net/netmon/bsd.rs
+++ b/iroh-net/src/net/netmon/bsd.rs
@@ -1,5 +1,5 @@
 use anyhow::Result;
-use tokio::{io::AsyncReadExt, task::JoinHandle};
+use tokio::{io::AsyncReadExt, sync::mpsc, task::JoinHandle};
 use tracing::{trace, warn};
 
 #[cfg(any(target_os = "freebsd", target_os = "netbsd", target_os = "openbsd"))]
@@ -23,7 +23,7 @@ impl Drop for RouteMonitor {
 }
 
 impl RouteMonitor {
-    pub(super) fn new(sender: async_channel::Sender<NetworkMessage>) -> Result<Self> {
+    pub(super) fn new(sender: mpsc::Sender<NetworkMessage>) -> Result<Self> {
         let socket = socket2::Socket::new(libc::AF_ROUTE.into(), socket2::Type::RAW, None)?;
         socket.set_nonblocking(true)?;
         let socket_std: std::os::unix::net::UnixStream = socket.into();
diff --git a/iroh-net/src/net/netmon/linux.rs b/iroh-net/src/net/netmon/linux.rs
index 12976b37e8..7a422ad9c3 100644
--- a/iroh-net/src/net/netmon/linux.rs
+++ b/iroh-net/src/net/netmon/linux.rs
@@ -9,7 +9,7 @@ use netlink_packet_core::NetlinkPayload;
 use netlink_packet_route::{address, constants::*, route, RtnlMessage};
 use netlink_sys::{AsyncSocket, SocketAddr};
 use rtnetlink::new_connection;
-use tokio::task::JoinHandle;
+use tokio::{sync::mpsc, task::JoinHandle};
 use tracing::{info, trace, warn};
 
 use crate::net::ip::is_link_local;
@@ -49,7 +49,7 @@ macro_rules! get_nla {
 }
 
 impl RouteMonitor {
-    pub(super) fn new(sender: async_channel::Sender<NetworkMessage>) -> Result<Self> {
+    pub(super) fn new(sender: mpsc::Sender<NetworkMessage>) -> Result<Self> {
         let (mut conn, mut _handle, mut messages) = new_connection()?;
 
         // Specify flags to listen on.
diff --git a/iroh-net/src/net/netmon/windows.rs b/iroh-net/src/net/netmon/windows.rs
index da77899d0f..1b60a425cf 100644
--- a/iroh-net/src/net/netmon/windows.rs
+++ b/iroh-net/src/net/netmon/windows.rs
@@ -2,6 +2,7 @@ use std::{collections::HashMap, sync::Arc};
 
 use anyhow::Result;
 use libc::c_void;
+use tokio::sync::mpsc;
 use tracing::{trace, warn};
 use windows::Win32::{
     Foundation::{BOOLEAN, HANDLE as Handle},
@@ -19,21 +20,21 @@ pub(super) struct RouteMonitor {
 }
 
 impl RouteMonitor {
-    pub(super) fn new(sender: async_channel::Sender<NetworkMessage>) -> Result<Self> {
+    pub(super) fn new(sender: mpsc::Sender<NetworkMessage>) -> Result<Self> {
         // Register two callbacks with the windows api
         let mut cb_handler = CallbackHandler::default();
         // 1. Unicast Address Changes
         let s = sender.clone();
         cb_handler.register_unicast_address_change_callback(Box::new(move || {
-            if let Err(err) = s.send_blocking(NetworkMessage::Change) {
+            if let Err(err) = s.blocking_send(NetworkMessage::Change) {
                 warn!("unable to send: unicast change notification: {:?}", err);
             }
         }))?;
 
         // 2. Route Changes
         cb_handler.register_route_change_callback(Box::new(move || {
-            if let Err(err) = sender.send_blocking(NetworkMessage::Change) {
+            if let Err(err) = sender.blocking_send(NetworkMessage::Change) {
                 warn!("unable to send: route change notification: {:?}", err);
            }
         }))?;
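Finally, the one caller that still needs a `Stream` once async-channel is gone is `resolve`, which wraps the tokio receiver in `tokio_stream::wrappers::ReceiverStream`. The conversion in isolation, assuming the `tokio-stream` dependency this PR adds:

```rust
use futures_lite::StreamExt;
use tokio::sync::mpsc;
use tokio_stream::wrappers::ReceiverStream;

// Sketch: turning a tokio mpsc receiver into a `Stream`, as `resolve` does.
#[tokio::main]
async fn main() {
    let (send, recv) = mpsc::channel::<u32>(8);
    send.send(1).await.ok();
    drop(send); // closing all senders ends the stream
    let mut stream = ReceiverStream::new(recv);
    assert_eq!(stream.next().await, Some(1));
    assert_eq!(stream.next().await, None);
}
```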