diff --git a/Cargo.lock b/Cargo.lock
index b9a1f5199e..dfa6268180 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1579,7 +1579,6 @@ version = "0.7.2"
dependencies = [
"cached",
"check-if-email-exists",
- "deadpool",
"doc-comment",
"fast_chemail",
"glob",
diff --git a/examples/builder/builder.rs b/examples/builder/builder.rs
index 08f189e899..1b3f53bd93 100644
--- a/examples/builder/builder.rs
+++ b/examples/builder/builder.rs
@@ -40,7 +40,7 @@ async fn main() -> Result<()> {
.build()
.client()?;
- let response = client.check("http://example.org").await?;
+ let response = client.check("https://example.org").await?;
dbg!(&response);
assert!(response.status().is_success());
Ok(())
diff --git a/examples/client_pool/client_pool.rs b/examples/client_pool/client_pool.rs
index ef4bc8d46b..064fe83691 100644
--- a/examples/client_pool/client_pool.rs
+++ b/examples/client_pool/client_pool.rs
@@ -1,4 +1,4 @@
-use lychee_lib::{ClientBuilder, ClientPool, Input, Request, Result, Uri};
+use lychee_lib::{ClientBuilder, Input, Request, Result, Uri};
use std::convert::TryFrom;
use tokio::sync::mpsc;
@@ -9,7 +9,7 @@ const CONCURRENT_REQUESTS: usize = 4;
async fn main() -> Result<()> {
// These channels are used to send requests and receive responses to and
// from the lychee client pool
- let (send_req, recv_req) = mpsc::channel(CONCURRENT_REQUESTS);
+ let (send_req, mut recv_req) = mpsc::channel(CONCURRENT_REQUESTS);
let (send_resp, mut recv_resp) = mpsc::channel(CONCURRENT_REQUESTS);
// Add as many requests as you like
@@ -29,13 +29,17 @@ async fn main() -> Result<()> {
// Create a default lychee client
let client = ClientBuilder::default().client()?;
- // Create a pool with four lychee clients
- let clients = vec![client; CONCURRENT_REQUESTS];
- let mut clients = ClientPool::new(send_resp, recv_req, clients);
-
// Handle requests in a client pool
tokio::spawn(async move {
- clients.listen().await;
+ while let Some(req) = recv_req.recv().await {
+ // Client::check() may fail only because Request::try_from() may fail
+ // here request is already Request, so it never fails
+ let resp = client.check(req).await.unwrap();
+ send_resp
+ .send(resp)
+ .await
+ .expect("Cannot send response to channel");
+ }
});
// Finally, listen to incoming responses from lychee
diff --git a/fixtures/TEST_REPETITION.txt b/fixtures/TEST_REPETITION.txt
deleted file mode 100644
index 0395c8e26e..0000000000
--- a/fixtures/TEST_REPETITION.txt
+++ /dev/null
@@ -1,4 +0,0 @@
-These links are all the same and should only be checked once:
-https://example.org/
-https://example.org/
-https://example.org
diff --git a/fixtures/TEST_REPETITION_1.txt b/fixtures/TEST_REPETITION_1.txt
new file mode 100644
index 0000000000..14570d507c
--- /dev/null
+++ b/fixtures/TEST_REPETITION_1.txt
@@ -0,0 +1,5 @@
+These links are all the same and match those of TEST_REPETITION_2.txt.
+All links in both files should be counted as one and checked once only.
+https://example.org/
+https://example.org/
+https://example.org
diff --git a/fixtures/TEST_REPETITION_2.txt b/fixtures/TEST_REPETITION_2.txt
new file mode 100644
index 0000000000..0b6091f1a8
--- /dev/null
+++ b/fixtures/TEST_REPETITION_2.txt
@@ -0,0 +1,5 @@
+These links are all the same and match those of TEST_REPETITION_1.txt.
+All links in both files should be counted as one and checked once only.
+https://example.org/
+https://example.org/
+https://example.org
diff --git a/lychee-bin/src/main.rs b/lychee-bin/src/main.rs
index b1f2c7307c..8c49c5810c 100644
--- a/lychee-bin/src/main.rs
+++ b/lychee-bin/src/main.rs
@@ -69,7 +69,7 @@ use std::{collections::HashSet, fs, str::FromStr};
use anyhow::{anyhow, Context, Result};
use headers::HeaderMapExt;
use indicatif::{ProgressBar, ProgressStyle};
-use lychee_lib::{ClientBuilder, ClientPool, Collector, Input, Request, Response};
+use lychee_lib::{ClientBuilder, Collector, Input, Request, Response};
use openssl_sys as _; // required for vendored-openssl feature
use regex::RegexSet;
use ring as _; // required for apple silicon
@@ -228,7 +228,7 @@ async fn run(cfg: &Config, inputs: Vec) -> Result {
Some(bar)
};
- let (send_req, recv_req) = mpsc::channel(max_concurrency);
+ let (send_req, mut recv_req) = mpsc::channel(max_concurrency);
let (send_resp, mut recv_resp) = mpsc::channel(max_concurrency);
let mut stats = ResponseStats::new();
@@ -245,9 +245,15 @@ async fn run(cfg: &Config, inputs: Vec) -> Result {
// Start receiving requests
tokio::spawn(async move {
- let clients = vec![client; max_concurrency];
- let mut clients = ClientPool::new(send_resp, recv_req, clients);
- clients.listen().await;
+ while let Some(req) = recv_req.recv().await {
+ // `Client::check()` may fail only because `Request::try_from()` may
+ // fail. Here `req` is already a valid `Request`, so it never fails.
+ let resp = client.check(req).await.unwrap();
+ send_resp
+ .send(resp)
+ .await
+ .expect("Cannot send response to channel");
+ }
});
while let Some(response) = recv_resp.recv().await {
diff --git a/lychee-bin/src/stats.rs b/lychee-bin/src/stats.rs
index 9ba51f91cc..579c30a21e 100644
--- a/lychee-bin/src/stats.rs
+++ b/lychee-bin/src/stats.rs
@@ -174,7 +174,7 @@ mod test {
stats.add(Response(
Input::Stdin,
ResponseBody {
- uri: website("http://example.org/ok"),
+ uri: website("https://example.org/ok"),
status: Status::Ok(StatusCode::OK),
},
));
diff --git a/lychee-bin/tests/cli.rs b/lychee-bin/tests/cli.rs
index d095c50c37..52cd66ee96 100644
--- a/lychee-bin/tests/cli.rs
+++ b/lychee-bin/tests/cli.rs
@@ -216,11 +216,35 @@ mod cli {
}
#[test]
- fn test_repetition() {
+ fn test_caching_single_file() {
let mut cmd = main_command();
- let test_schemes_path = fixtures_path().join("TEST_REPETITION.txt");
+ // Repetitions in one file shall all be checked and counted only once.
+ let test_schemes_path_1 = fixtures_path().join("TEST_REPETITION_1.txt");
- cmd.arg(&test_schemes_path)
+ cmd.arg(&test_schemes_path_1)
+ .env_clear()
+ .assert()
+ .success()
+ .stdout(contains("Total............1"))
+ .stdout(contains("Successful.......1"));
+ }
+
+ #[test]
+ #[ignore]
+ // Test that two identical requests don't get executed twice.
+ // Note: This currently fails, because we currently don't cache responses. We
+ // used to, but there were issues.
+ // See https://github.com/lycheeverse/lychee/pull/349.
+ // We're planning to add back caching support at a later point in time,
+ // which is why we keep the test around.
+ fn test_caching_across_files() {
+ let mut cmd = main_command();
+ // Repetitions across multiple files shall all be checked and counted only once.
+ let test_schemes_path_1 = fixtures_path().join("TEST_REPETITION_1.txt");
+ let test_schemes_path_2 = fixtures_path().join("TEST_REPETITION_2.txt");
+
+ cmd.arg(&test_schemes_path_1)
+ .arg(&test_schemes_path_2)
.env_clear()
.assert()
.success()
diff --git a/lychee-lib/Cargo.toml b/lychee-lib/Cargo.toml
index 1ae84abeb0..67b5043534 100644
--- a/lychee-lib/Cargo.toml
+++ b/lychee-lib/Cargo.toml
@@ -18,7 +18,6 @@ version = "0.7.2"
[dependencies]
check-if-email-exists = "0.8.25"
-deadpool = "0.7.0"
fast_chemail = "0.9.6"
glob = "0.3.0"
html5ever = "0.25.1"
diff --git a/lychee-lib/src/client.rs b/lychee-lib/src/client.rs
index 4c7d8ec856..8b13d8d0ce 100644
--- a/lychee-lib/src/client.rs
+++ b/lychee-lib/src/client.rs
@@ -26,6 +26,9 @@ use crate::{
const DEFAULT_MAX_REDIRECTS: usize = 5;
const DEFAULT_USER_AGENT: &str = concat!("lychee/", env!("CARGO_PKG_VERSION"));
+/// Handles incoming requests and returns responses. Usually you would not
+/// initialize a `Client` yourself, but use the `ClientBuilder` because it
+/// provides sane defaults for all configuration options.
#[derive(Debug, Clone)]
pub struct Client {
/// Underlying reqwest client instance that handles the HTTP requests.
@@ -123,7 +126,13 @@ impl ClientBuilder {
}
/// The build method instantiates the client.
- #[allow(clippy::missing_errors_doc)]
+ ///
+ /// # Errors
+ ///
+ /// Returns an `Err` if:
+ /// - The user agent cannot be parsed
+ /// - The request client cannot be created
+ /// - The Github client cannot be created
pub fn client(&self) -> Result {
let mut headers = self.custom_headers.clone();
headers.insert(header::USER_AGENT, HeaderValue::from_str(&self.user_agent)?);
@@ -169,6 +178,13 @@ impl ClientBuilder {
}
impl Client {
+ /// Check a single request
+ ///
+ /// # Errors
+ ///
+ /// This returns an `Err` if
+ /// - The request cannot be parsed
+ /// - An HTTP website with an invalid URI format gets checked
pub async fn check(&self, request: T) -> Result
where
Request: TryFrom,
@@ -185,7 +201,10 @@ impl Client {
match self.check_website(&uri).await {
Status::Ok(code) if self.require_https && uri.scheme() == "http" => {
let mut https_uri = uri.clone();
- https_uri.url.set_scheme("https").unwrap();
+ https_uri
+ .url
+ .set_scheme("https")
+ .map_err(|_| ErrorKind::InvalidURI(uri.clone()))?;
if self.check_website(&https_uri).await.is_success() {
Status::Error(Box::new(ErrorKind::InsecureURL(https_uri)))
} else {
@@ -200,10 +219,12 @@ impl Client {
}
/// Check if the given URI is filtered by the client
+ #[must_use]
pub fn filtered(&self, uri: &Uri) -> bool {
self.filter.is_excluded(uri)
}
+ /// Check a website URI
pub async fn check_website(&self, uri: &Uri) -> Status {
let mut retries: i64 = 3;
let mut wait: u64 = 1;
@@ -256,6 +277,7 @@ impl Client {
}
}
+ /// Check a file URI
pub async fn check_file(&self, uri: &Uri) -> Status {
if let Ok(path) = uri.url.to_file_path() {
if path.exists() {
@@ -265,6 +287,7 @@ impl Client {
ErrorKind::InvalidFilePath(uri.clone()).into()
}
+ /// Check a mail address
pub async fn check_mail(&self, uri: &Uri) -> Status {
let input = CheckEmailInput::new(vec![uri.as_str().to_owned()]);
let result = &(check_email(&input).await)[0];
diff --git a/lychee-lib/src/client_pool.rs b/lychee-lib/src/client_pool.rs
deleted file mode 100644
index 92b4d5615c..0000000000
--- a/lychee-lib/src/client_pool.rs
+++ /dev/null
@@ -1,45 +0,0 @@
-use client::Client;
-use deadpool::unmanaged::Pool;
-use tokio::sync::mpsc;
-
-use crate::{client, types};
-
-#[allow(missing_debug_implementations)]
-/// Manages a channel for incoming requests
-/// and a pool of lychee clients to handle them
-pub struct ClientPool {
- tx: mpsc::Sender,
- rx: mpsc::Receiver,
- pool: deadpool::unmanaged::Pool,
-}
-
-impl ClientPool {
- #[must_use]
- /// Creates a new client pool
- pub fn new(
- tx: mpsc::Sender,
- rx: mpsc::Receiver,
- clients: Vec,
- ) -> Self {
- let pool = Pool::from(clients);
- ClientPool { tx, rx, pool }
- }
-
- #[allow(clippy::missing_panics_doc)]
- /// Start listening for incoming requests and send each of them
- /// asynchronously to a client from the pool
- pub async fn listen(&mut self) {
- while let Some(req) = self.rx.recv().await {
- let client = self.pool.get().await;
- let tx = self.tx.clone();
- tokio::spawn(async move {
- // Client::check() may fail only because Request::try_from() may fail
- // here request is already Request, so it never fails
- let resp = client.check(req).await.unwrap();
- tx.send(resp)
- .await
- .expect("Cannot send response to channel");
- });
- }
- }
-}
diff --git a/lychee-lib/src/collector.rs b/lychee-lib/src/collector.rs
index 871e869557..2089e304f7 100644
--- a/lychee-lib/src/collector.rs
+++ b/lychee-lib/src/collector.rs
@@ -1,4 +1,4 @@
-use crate::{extract::Extractor, Base, Input, Request, Result, Uri};
+use crate::{extract::Extractor, Base, Input, Request, Result};
use std::collections::HashSet;
/// Collector keeps the state of link collection
@@ -7,18 +7,20 @@ pub struct Collector {
base: Option,
skip_missing_inputs: bool,
max_concurrency: usize,
- cache: HashSet,
}
impl Collector {
/// Create a new collector with an empty cache
#[must_use]
- pub fn new(base: Option, skip_missing_inputs: bool, max_concurrency: usize) -> Self {
+ pub const fn new(
+ base: Option,
+ skip_missing_inputs: bool,
+ max_concurrency: usize,
+ ) -> Self {
Collector {
base,
skip_missing_inputs,
max_concurrency,
- cache: HashSet::new(),
}
}
@@ -29,7 +31,7 @@ impl Collector {
/// # Errors
///
/// Will return `Err` if links cannot be extracted from an input
- pub async fn collect_links(mut self, inputs: &[Input]) -> Result> {
+ pub async fn collect_links(self, inputs: &[Input]) -> Result> {
let (contents_tx, mut contents_rx) = tokio::sync::mpsc::channel(self.max_concurrency);
// extract input contents
@@ -71,17 +73,8 @@ impl Collector {
links.extend(new_links?);
}
- // Filter out already cached links (duplicates)
- links.retain(|l| !self.cache.contains(&l.uri));
-
- self.update_cache(&links);
Ok(links)
}
-
- /// Update internal link cache
- fn update_cache(&mut self, links: &HashSet) {
- self.cache.extend(links.iter().cloned().map(|l| l.uri));
- }
}
#[cfg(test)]
diff --git a/lychee-lib/src/extract.rs b/lychee-lib/src/extract.rs
index f8ade534c9..1e4e92cfc5 100644
--- a/lychee-lib/src/extract.rs
+++ b/lychee-lib/src/extract.rs
@@ -154,7 +154,7 @@ impl Extractor {
match resolved {
Some(path) => Url::from_file_path(&path)
.map(Some)
- .map_err(|_e| ErrorKind::InvalidUrl(path)),
+ .map_err(|_e| ErrorKind::InvalidUrlFromPath(path)),
None => Ok(None),
}
}
@@ -239,7 +239,7 @@ mod test {
#[test]
fn test_extract_link_at_end_of_line() {
- let input = "http://www.apache.org/licenses/LICENSE-2.0\n";
+ let input = "https://www.apache.org/licenses/LICENSE-2.0\n";
let link = input.trim_end();
let mut extractor = Extractor::new(None);
diff --git a/lychee-lib/src/filter/mod.rs b/lychee-lib/src/filter/mod.rs
index 07e0913a2d..0a7f8c39da 100644
--- a/lychee-lib/src/filter/mod.rs
+++ b/lychee-lib/src/filter/mod.rs
@@ -9,7 +9,7 @@ pub use includes::Includes;
use crate::Uri;
/// Pre-defined exclusions for known false-positives
-static FALSE_POSITIVE_PAT: &[&str] = &[r"http://www.w3.org/1999/xhtml"];
+const FALSE_POSITIVE_PAT: &[&str] = &[r"http://www.w3.org/1999/xhtml"];
#[inline]
#[must_use]
@@ -298,7 +298,7 @@ mod test {
..Filter::default()
};
- assert!(filter.is_excluded(&website("http://github.com")));
+ assert!(filter.is_excluded(&website("https://github.com")));
assert!(filter.is_excluded(&website("http://exclude.org")));
assert!(filter.is_excluded(&mail("mail@example.org")));
diff --git a/lychee-lib/src/lib.rs b/lychee-lib/src/lib.rs
index 58c0a6abc6..e4d1de057a 100644
--- a/lychee-lib/src/lib.rs
+++ b/lychee-lib/src/lib.rs
@@ -47,7 +47,6 @@
doc_comment::doctest!("../../README.md");
mod client;
-mod client_pool;
/// A pool of clients, to handle concurrent checks
pub mod collector;
mod helpers;
@@ -73,8 +72,7 @@ use ring as _; // required for apple silicon
#[doc(inline)]
pub use crate::{
- client::{check, ClientBuilder},
- client_pool::ClientPool,
+ client::{check, Client, ClientBuilder},
collector::Collector,
filter::{Excludes, Filter, Includes},
types::{Base, ErrorKind, Input, Request, Response, ResponseBody, Result, Status, Uri},
diff --git a/lychee-lib/src/types/error.rs b/lychee-lib/src/types/error.rs
index 4a761416d3..08391cbad9 100644
--- a/lychee-lib/src/types/error.rs
+++ b/lychee-lib/src/types/error.rs
@@ -25,7 +25,7 @@ pub enum ErrorKind {
/// The given URI cannot be converted to a file path
InvalidFilePath(Uri),
/// The given path cannot be converted to a URI
- InvalidUrl(PathBuf),
+ InvalidUrlFromPath(PathBuf),
/// The given mail address is unreachable
UnreachableEmailAddress(Uri),
/// The given header could not be parsed.
@@ -40,8 +40,10 @@ pub enum ErrorKind {
InvalidGlobPattern(glob::PatternError),
/// The Github API could not be called because of a missing Github token.
MissingGitHubToken,
- /// The website is available in HTTPS protocol, but HTTP scheme is used.
+ /// The website is available through HTTPS, but HTTP scheme is used.
InsecureURL(Uri),
+ /// Invalid URI
+ InvalidURI(Uri),
}
impl PartialEq for ErrorKind {
@@ -76,7 +78,8 @@ impl Hash for ErrorKind {
Self::HubcapsError(e) => e.to_string().hash(state),
Self::FileNotFound(e) => e.to_string_lossy().hash(state),
Self::UrlParseError(s, e) => (s, e.type_id()).hash(state),
- Self::InvalidUrl(p) => p.hash(state),
+ Self::InvalidURI(u) => u.hash(state),
+ Self::InvalidUrlFromPath(p) => p.hash(state),
Self::Utf8Error(e) => e.to_string().hash(state),
Self::InvalidFilePath(u) | Self::UnreachableEmailAddress(u) | Self::InsecureURL(u) => {
u.hash(state);
@@ -113,7 +116,10 @@ impl Display for ErrorKind {
write!(f, "Cannot parse {} as website url ({})", s, url_err)
}
Self::InvalidFilePath(u) => write!(f, "Invalid file URI: {}", u),
- Self::InvalidUrl(p) => write!(f, "Invalid path: {}", p.display()),
+ Self::InvalidURI(u) => write!(f, "Invalid URI: {}", u),
+ Self::InvalidUrlFromPath(p) => {
+ write!(f, "Invalid path to URL conversion: {}", p.display())
+ }
Self::UnreachableEmailAddress(uri) => write!(f, "Unreachable mail address: {}", uri),
Self::InvalidHeader(e) => e.fmt(f),
Self::InvalidGlobPattern(e) => e.fmt(f),
diff --git a/lychee-lib/src/types/uri.rs b/lychee-lib/src/types/uri.rs
index 6ad126c3d4..02b017608c 100644
--- a/lychee-lib/src/types/uri.rs
+++ b/lychee-lib/src/types/uri.rs
@@ -160,12 +160,12 @@ mod test {
fn test_uri_from_str() {
assert!(Uri::try_from("").is_err());
assert_eq!(
- Uri::try_from("http://example.org"),
- Ok(website("http://example.org"))
+ Uri::try_from("https://example.org"),
+ Ok(website("https://example.org"))
);
assert_eq!(
- Uri::try_from("http://example.org/@test/testing"),
- Ok(website("http://example.org/@test/testing"))
+ Uri::try_from("https://example.org/@test/testing"),
+ Ok(website("https://example.org/@test/testing"))
);
assert_eq!(
Uri::try_from("mail@example.org"),