diff --git a/Cargo.lock b/Cargo.lock index b9a1f5199e..dfa6268180 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1579,7 +1579,6 @@ version = "0.7.2" dependencies = [ "cached", "check-if-email-exists", - "deadpool", "doc-comment", "fast_chemail", "glob", diff --git a/examples/builder/builder.rs b/examples/builder/builder.rs index 08f189e899..1b3f53bd93 100644 --- a/examples/builder/builder.rs +++ b/examples/builder/builder.rs @@ -40,7 +40,7 @@ async fn main() -> Result<()> { .build() .client()?; - let response = client.check("http://example.org").await?; + let response = client.check("https://example.org").await?; dbg!(&response); assert!(response.status().is_success()); Ok(()) diff --git a/examples/client_pool/client_pool.rs b/examples/client_pool/client_pool.rs index ef4bc8d46b..064fe83691 100644 --- a/examples/client_pool/client_pool.rs +++ b/examples/client_pool/client_pool.rs @@ -1,4 +1,4 @@ -use lychee_lib::{ClientBuilder, ClientPool, Input, Request, Result, Uri}; +use lychee_lib::{ClientBuilder, Input, Request, Result, Uri}; use std::convert::TryFrom; use tokio::sync::mpsc; @@ -9,7 +9,7 @@ const CONCURRENT_REQUESTS: usize = 4; async fn main() -> Result<()> { // These channels are used to send requests and receive responses to and // from the lychee client pool - let (send_req, recv_req) = mpsc::channel(CONCURRENT_REQUESTS); + let (send_req, mut recv_req) = mpsc::channel(CONCURRENT_REQUESTS); let (send_resp, mut recv_resp) = mpsc::channel(CONCURRENT_REQUESTS); // Add as many requests as you like @@ -29,13 +29,17 @@ async fn main() -> Result<()> { // Create a default lychee client let client = ClientBuilder::default().client()?; - // Create a pool with four lychee clients - let clients = vec![client; CONCURRENT_REQUESTS]; - let mut clients = ClientPool::new(send_resp, recv_req, clients); - // Handle requests in a client pool tokio::spawn(async move { - clients.listen().await; + while let Some(req) = recv_req.recv().await { + // Client::check() may fail only because Request::try_from() may fail + // here request is already Request, so it never fails + let resp = client.check(req).await.unwrap(); + send_resp + .send(resp) + .await + .expect("Cannot send response to channel"); + } }); // Finally, listen to incoming responses from lychee diff --git a/fixtures/TEST_REPETITION.txt b/fixtures/TEST_REPETITION.txt deleted file mode 100644 index 0395c8e26e..0000000000 --- a/fixtures/TEST_REPETITION.txt +++ /dev/null @@ -1,4 +0,0 @@ -These links are all the same and should only be checked once: -https://example.org/ -https://example.org/ -https://example.org diff --git a/fixtures/TEST_REPETITION_1.txt b/fixtures/TEST_REPETITION_1.txt new file mode 100644 index 0000000000..14570d507c --- /dev/null +++ b/fixtures/TEST_REPETITION_1.txt @@ -0,0 +1,5 @@ +These links are all the same and match those of TEST_REPETITION_2.txt. +All links in both files should be counted as one and checked once only. +https://example.org/ +https://example.org/ +https://example.org diff --git a/fixtures/TEST_REPETITION_2.txt b/fixtures/TEST_REPETITION_2.txt new file mode 100644 index 0000000000..0b6091f1a8 --- /dev/null +++ b/fixtures/TEST_REPETITION_2.txt @@ -0,0 +1,5 @@ +These links are all the same and match those of TEST_REPETITION_1.txt. +All links in both files should be counted as one and checked once only. +https://example.org/ +https://example.org/ +https://example.org diff --git a/lychee-bin/src/main.rs b/lychee-bin/src/main.rs index b1f2c7307c..8c49c5810c 100644 --- a/lychee-bin/src/main.rs +++ b/lychee-bin/src/main.rs @@ -69,7 +69,7 @@ use std::{collections::HashSet, fs, str::FromStr}; use anyhow::{anyhow, Context, Result}; use headers::HeaderMapExt; use indicatif::{ProgressBar, ProgressStyle}; -use lychee_lib::{ClientBuilder, ClientPool, Collector, Input, Request, Response}; +use lychee_lib::{ClientBuilder, Collector, Input, Request, Response}; use openssl_sys as _; // required for vendored-openssl feature use regex::RegexSet; use ring as _; // required for apple silicon @@ -228,7 +228,7 @@ async fn run(cfg: &Config, inputs: Vec) -> Result { Some(bar) }; - let (send_req, recv_req) = mpsc::channel(max_concurrency); + let (send_req, mut recv_req) = mpsc::channel(max_concurrency); let (send_resp, mut recv_resp) = mpsc::channel(max_concurrency); let mut stats = ResponseStats::new(); @@ -245,9 +245,15 @@ async fn run(cfg: &Config, inputs: Vec) -> Result { // Start receiving requests tokio::spawn(async move { - let clients = vec![client; max_concurrency]; - let mut clients = ClientPool::new(send_resp, recv_req, clients); - clients.listen().await; + while let Some(req) = recv_req.recv().await { + // `Client::check()` may fail only because `Request::try_from()` may + // fail. Here `req` is already a valid `Request`, so it never fails. + let resp = client.check(req).await.unwrap(); + send_resp + .send(resp) + .await + .expect("Cannot send response to channel"); + } }); while let Some(response) = recv_resp.recv().await { diff --git a/lychee-bin/src/stats.rs b/lychee-bin/src/stats.rs index 9ba51f91cc..579c30a21e 100644 --- a/lychee-bin/src/stats.rs +++ b/lychee-bin/src/stats.rs @@ -174,7 +174,7 @@ mod test { stats.add(Response( Input::Stdin, ResponseBody { - uri: website("http://example.org/ok"), + uri: website("https://example.org/ok"), status: Status::Ok(StatusCode::OK), }, )); diff --git a/lychee-bin/tests/cli.rs b/lychee-bin/tests/cli.rs index d095c50c37..52cd66ee96 100644 --- a/lychee-bin/tests/cli.rs +++ b/lychee-bin/tests/cli.rs @@ -216,11 +216,35 @@ mod cli { } #[test] - fn test_repetition() { + fn test_caching_single_file() { let mut cmd = main_command(); - let test_schemes_path = fixtures_path().join("TEST_REPETITION.txt"); + // Repetitions in one file shall all be checked and counted only once. + let test_schemes_path_1 = fixtures_path().join("TEST_REPETITION_1.txt"); - cmd.arg(&test_schemes_path) + cmd.arg(&test_schemes_path_1) + .env_clear() + .assert() + .success() + .stdout(contains("Total............1")) + .stdout(contains("Successful.......1")); + } + + #[test] + #[ignore] + // Test that two identical requests don't get executed twice. + // Note: This currently fails, because we currently don't cache responses. We + // used to, but there were issues. + // See https://github.com/lycheeverse/lychee/pull/349. + // We're planning to add back caching support at a later point in time, + // which is why we keep the test around. + fn test_caching_across_files() { + let mut cmd = main_command(); + // Repetitions across multiple files shall all be checked and counted only once. + let test_schemes_path_1 = fixtures_path().join("TEST_REPETITION_1.txt"); + let test_schemes_path_2 = fixtures_path().join("TEST_REPETITION_2.txt"); + + cmd.arg(&test_schemes_path_1) + .arg(&test_schemes_path_2) .env_clear() .assert() .success() diff --git a/lychee-lib/Cargo.toml b/lychee-lib/Cargo.toml index 1ae84abeb0..67b5043534 100644 --- a/lychee-lib/Cargo.toml +++ b/lychee-lib/Cargo.toml @@ -18,7 +18,6 @@ version = "0.7.2" [dependencies] check-if-email-exists = "0.8.25" -deadpool = "0.7.0" fast_chemail = "0.9.6" glob = "0.3.0" html5ever = "0.25.1" diff --git a/lychee-lib/src/client.rs b/lychee-lib/src/client.rs index 4c7d8ec856..8b13d8d0ce 100644 --- a/lychee-lib/src/client.rs +++ b/lychee-lib/src/client.rs @@ -26,6 +26,9 @@ use crate::{ const DEFAULT_MAX_REDIRECTS: usize = 5; const DEFAULT_USER_AGENT: &str = concat!("lychee/", env!("CARGO_PKG_VERSION")); +/// Handles incoming requests and returns responses. Usually you would not +/// initialize a `Client` yourself, but use the `ClientBuilder` because it +/// provides sane defaults for all configuration options. #[derive(Debug, Clone)] pub struct Client { /// Underlying reqwest client instance that handles the HTTP requests. @@ -123,7 +126,13 @@ impl ClientBuilder { } /// The build method instantiates the client. - #[allow(clippy::missing_errors_doc)] + /// + /// # Errors + /// + /// Returns an `Err` if: + /// - The user agent cannot be parsed + /// - The request client cannot be created + /// - The Github client cannot be created pub fn client(&self) -> Result { let mut headers = self.custom_headers.clone(); headers.insert(header::USER_AGENT, HeaderValue::from_str(&self.user_agent)?); @@ -169,6 +178,13 @@ impl ClientBuilder { } impl Client { + /// Check a single request + /// + /// # Errors + /// + /// This returns an `Err` if + /// - The request cannot be parsed + /// - An HTTP website with an invalid URI format gets checked pub async fn check(&self, request: T) -> Result where Request: TryFrom, @@ -185,7 +201,10 @@ impl Client { match self.check_website(&uri).await { Status::Ok(code) if self.require_https && uri.scheme() == "http" => { let mut https_uri = uri.clone(); - https_uri.url.set_scheme("https").unwrap(); + https_uri + .url + .set_scheme("https") + .map_err(|_| ErrorKind::InvalidURI(uri.clone()))?; if self.check_website(&https_uri).await.is_success() { Status::Error(Box::new(ErrorKind::InsecureURL(https_uri))) } else { @@ -200,10 +219,12 @@ impl Client { } /// Check if the given URI is filtered by the client + #[must_use] pub fn filtered(&self, uri: &Uri) -> bool { self.filter.is_excluded(uri) } + /// Check a website URI pub async fn check_website(&self, uri: &Uri) -> Status { let mut retries: i64 = 3; let mut wait: u64 = 1; @@ -256,6 +277,7 @@ impl Client { } } + /// Check a file URI pub async fn check_file(&self, uri: &Uri) -> Status { if let Ok(path) = uri.url.to_file_path() { if path.exists() { @@ -265,6 +287,7 @@ impl Client { ErrorKind::InvalidFilePath(uri.clone()).into() } + /// Check a mail address pub async fn check_mail(&self, uri: &Uri) -> Status { let input = CheckEmailInput::new(vec![uri.as_str().to_owned()]); let result = &(check_email(&input).await)[0]; diff --git a/lychee-lib/src/client_pool.rs b/lychee-lib/src/client_pool.rs deleted file mode 100644 index 92b4d5615c..0000000000 --- a/lychee-lib/src/client_pool.rs +++ /dev/null @@ -1,45 +0,0 @@ -use client::Client; -use deadpool::unmanaged::Pool; -use tokio::sync::mpsc; - -use crate::{client, types}; - -#[allow(missing_debug_implementations)] -/// Manages a channel for incoming requests -/// and a pool of lychee clients to handle them -pub struct ClientPool { - tx: mpsc::Sender, - rx: mpsc::Receiver, - pool: deadpool::unmanaged::Pool, -} - -impl ClientPool { - #[must_use] - /// Creates a new client pool - pub fn new( - tx: mpsc::Sender, - rx: mpsc::Receiver, - clients: Vec, - ) -> Self { - let pool = Pool::from(clients); - ClientPool { tx, rx, pool } - } - - #[allow(clippy::missing_panics_doc)] - /// Start listening for incoming requests and send each of them - /// asynchronously to a client from the pool - pub async fn listen(&mut self) { - while let Some(req) = self.rx.recv().await { - let client = self.pool.get().await; - let tx = self.tx.clone(); - tokio::spawn(async move { - // Client::check() may fail only because Request::try_from() may fail - // here request is already Request, so it never fails - let resp = client.check(req).await.unwrap(); - tx.send(resp) - .await - .expect("Cannot send response to channel"); - }); - } - } -} diff --git a/lychee-lib/src/collector.rs b/lychee-lib/src/collector.rs index 871e869557..2089e304f7 100644 --- a/lychee-lib/src/collector.rs +++ b/lychee-lib/src/collector.rs @@ -1,4 +1,4 @@ -use crate::{extract::Extractor, Base, Input, Request, Result, Uri}; +use crate::{extract::Extractor, Base, Input, Request, Result}; use std::collections::HashSet; /// Collector keeps the state of link collection @@ -7,18 +7,20 @@ pub struct Collector { base: Option, skip_missing_inputs: bool, max_concurrency: usize, - cache: HashSet, } impl Collector { /// Create a new collector with an empty cache #[must_use] - pub fn new(base: Option, skip_missing_inputs: bool, max_concurrency: usize) -> Self { + pub const fn new( + base: Option, + skip_missing_inputs: bool, + max_concurrency: usize, + ) -> Self { Collector { base, skip_missing_inputs, max_concurrency, - cache: HashSet::new(), } } @@ -29,7 +31,7 @@ impl Collector { /// # Errors /// /// Will return `Err` if links cannot be extracted from an input - pub async fn collect_links(mut self, inputs: &[Input]) -> Result> { + pub async fn collect_links(self, inputs: &[Input]) -> Result> { let (contents_tx, mut contents_rx) = tokio::sync::mpsc::channel(self.max_concurrency); // extract input contents @@ -71,17 +73,8 @@ impl Collector { links.extend(new_links?); } - // Filter out already cached links (duplicates) - links.retain(|l| !self.cache.contains(&l.uri)); - - self.update_cache(&links); Ok(links) } - - /// Update internal link cache - fn update_cache(&mut self, links: &HashSet) { - self.cache.extend(links.iter().cloned().map(|l| l.uri)); - } } #[cfg(test)] diff --git a/lychee-lib/src/extract.rs b/lychee-lib/src/extract.rs index f8ade534c9..1e4e92cfc5 100644 --- a/lychee-lib/src/extract.rs +++ b/lychee-lib/src/extract.rs @@ -154,7 +154,7 @@ impl Extractor { match resolved { Some(path) => Url::from_file_path(&path) .map(Some) - .map_err(|_e| ErrorKind::InvalidUrl(path)), + .map_err(|_e| ErrorKind::InvalidUrlFromPath(path)), None => Ok(None), } } @@ -239,7 +239,7 @@ mod test { #[test] fn test_extract_link_at_end_of_line() { - let input = "http://www.apache.org/licenses/LICENSE-2.0\n"; + let input = "https://www.apache.org/licenses/LICENSE-2.0\n"; let link = input.trim_end(); let mut extractor = Extractor::new(None); diff --git a/lychee-lib/src/filter/mod.rs b/lychee-lib/src/filter/mod.rs index 07e0913a2d..0a7f8c39da 100644 --- a/lychee-lib/src/filter/mod.rs +++ b/lychee-lib/src/filter/mod.rs @@ -9,7 +9,7 @@ pub use includes::Includes; use crate::Uri; /// Pre-defined exclusions for known false-positives -static FALSE_POSITIVE_PAT: &[&str] = &[r"http://www.w3.org/1999/xhtml"]; +const FALSE_POSITIVE_PAT: &[&str] = &[r"http://www.w3.org/1999/xhtml"]; #[inline] #[must_use] @@ -298,7 +298,7 @@ mod test { ..Filter::default() }; - assert!(filter.is_excluded(&website("http://github.com"))); + assert!(filter.is_excluded(&website("https://github.com"))); assert!(filter.is_excluded(&website("http://exclude.org"))); assert!(filter.is_excluded(&mail("mail@example.org"))); diff --git a/lychee-lib/src/lib.rs b/lychee-lib/src/lib.rs index 58c0a6abc6..e4d1de057a 100644 --- a/lychee-lib/src/lib.rs +++ b/lychee-lib/src/lib.rs @@ -47,7 +47,6 @@ doc_comment::doctest!("../../README.md"); mod client; -mod client_pool; /// A pool of clients, to handle concurrent checks pub mod collector; mod helpers; @@ -73,8 +72,7 @@ use ring as _; // required for apple silicon #[doc(inline)] pub use crate::{ - client::{check, ClientBuilder}, - client_pool::ClientPool, + client::{check, Client, ClientBuilder}, collector::Collector, filter::{Excludes, Filter, Includes}, types::{Base, ErrorKind, Input, Request, Response, ResponseBody, Result, Status, Uri}, diff --git a/lychee-lib/src/types/error.rs b/lychee-lib/src/types/error.rs index 4a761416d3..08391cbad9 100644 --- a/lychee-lib/src/types/error.rs +++ b/lychee-lib/src/types/error.rs @@ -25,7 +25,7 @@ pub enum ErrorKind { /// The given URI cannot be converted to a file path InvalidFilePath(Uri), /// The given path cannot be converted to a URI - InvalidUrl(PathBuf), + InvalidUrlFromPath(PathBuf), /// The given mail address is unreachable UnreachableEmailAddress(Uri), /// The given header could not be parsed. @@ -40,8 +40,10 @@ pub enum ErrorKind { InvalidGlobPattern(glob::PatternError), /// The Github API could not be called because of a missing Github token. MissingGitHubToken, - /// The website is available in HTTPS protocol, but HTTP scheme is used. + /// The website is available through HTTPS, but HTTP scheme is used. InsecureURL(Uri), + /// Invalid URI + InvalidURI(Uri), } impl PartialEq for ErrorKind { @@ -76,7 +78,8 @@ impl Hash for ErrorKind { Self::HubcapsError(e) => e.to_string().hash(state), Self::FileNotFound(e) => e.to_string_lossy().hash(state), Self::UrlParseError(s, e) => (s, e.type_id()).hash(state), - Self::InvalidUrl(p) => p.hash(state), + Self::InvalidURI(u) => u.hash(state), + Self::InvalidUrlFromPath(p) => p.hash(state), Self::Utf8Error(e) => e.to_string().hash(state), Self::InvalidFilePath(u) | Self::UnreachableEmailAddress(u) | Self::InsecureURL(u) => { u.hash(state); @@ -113,7 +116,10 @@ impl Display for ErrorKind { write!(f, "Cannot parse {} as website url ({})", s, url_err) } Self::InvalidFilePath(u) => write!(f, "Invalid file URI: {}", u), - Self::InvalidUrl(p) => write!(f, "Invalid path: {}", p.display()), + Self::InvalidURI(u) => write!(f, "Invalid URI: {}", u), + Self::InvalidUrlFromPath(p) => { + write!(f, "Invalid path to URL conversion: {}", p.display()) + } Self::UnreachableEmailAddress(uri) => write!(f, "Unreachable mail address: {}", uri), Self::InvalidHeader(e) => e.fmt(f), Self::InvalidGlobPattern(e) => e.fmt(f), diff --git a/lychee-lib/src/types/uri.rs b/lychee-lib/src/types/uri.rs index 6ad126c3d4..02b017608c 100644 --- a/lychee-lib/src/types/uri.rs +++ b/lychee-lib/src/types/uri.rs @@ -160,12 +160,12 @@ mod test { fn test_uri_from_str() { assert!(Uri::try_from("").is_err()); assert_eq!( - Uri::try_from("http://example.org"), - Ok(website("http://example.org")) + Uri::try_from("https://example.org"), + Ok(website("https://example.org")) ); assert_eq!( - Uri::try_from("http://example.org/@test/testing"), - Ok(website("http://example.org/@test/testing")) + Uri::try_from("https://example.org/@test/testing"), + Ok(website("https://example.org/@test/testing")) ); assert_eq!( Uri::try_from("mail@example.org"),