From a365f6ad9afae9308f21792bea2828c017edcb13 Mon Sep 17 00:00:00 2001 From: Matthias Endler Date: Thu, 28 Jan 2021 17:55:30 +0100 Subject: [PATCH 01/23] wip --- src/bin/lychee/main.rs | 39 +++++++++++++++++++++++++++++++++++---- src/client_pool.rs | 21 +++++++++++++++------ src/collector.rs | 6 +++--- src/types.rs | 4 ++-- 4 files changed, 55 insertions(+), 15 deletions(-) diff --git a/src/bin/lychee/main.rs b/src/bin/lychee/main.rs index 73202fead1..35d5cc1b70 100644 --- a/src/bin/lychee/main.rs +++ b/src/bin/lychee/main.rs @@ -7,7 +7,7 @@ use regex::RegexSet; use std::{collections::HashSet, time::Duration}; use std::{fs, str::FromStr}; use structopt::StructOpt; -use tokio::sync::mpsc; +use tokio::{sync::mpsc, task}; mod options; mod stats; @@ -130,12 +130,13 @@ async fn run(cfg: &Config, inputs: Vec) -> Result { let mut stats = ResponseStats::new(); let bar = pb.clone(); + let sr = send_req.clone(); tokio::spawn(async move { for link in links { if let Some(pb) = &bar { pb.set_message(&link.to_string()); }; - send_req.send(link).await.unwrap(); + sr.send(link).await.unwrap(); } }); @@ -148,9 +149,39 @@ async fn run(cfg: &Config, inputs: Vec) -> Result { while let Some(response) = recv_resp.recv().await { show_progress(&pb, &response, cfg.verbose); - stats.add(response); - } + stats.add(response.clone()); + if !response.status.is_success() { + continue; + } + if let lychee::Uri::Website(url) = response.uri { + println!("add url: {}", &url); + let input = collector::Input::RemoteUrl(url.clone()); + // TODO: Check recursion level + let links = collector::collect_links( + &[input], + cfg.base_url.clone(), + cfg.skip_missing, + max_concurrency, + ) + .await?; + + let bar = pb.clone(); + let sr = send_req.clone(); + let real_url = url.clone(); + task::spawn_blocking(|| async move { + println!("Adding {} links from {}", links.len(), real_url); + for link in links { + if let Some(pb) = &bar { + pb.inc_length(1); + pb.set_message(&link.to_string()); + }; + sr.send(link).await.unwrap(); + } + println!("Done with all links from {}", real_url); + }); + } + } // Note that print statements may interfere with the progress bar, so this // must go before printing the stats if let Some(pb) = &pb { diff --git a/src/client_pool.rs b/src/client_pool.rs index 3770cb67b9..cde3df5ca3 100644 --- a/src/client_pool.rs +++ b/src/client_pool.rs @@ -1,14 +1,17 @@ -use client::Client; -use deadpool::unmanaged::Pool; -use tokio::sync::mpsc; +use std::collections::HashSet; use crate::uri; use crate::{client, types}; +use client::Client; +use deadpool::unmanaged::Pool; +use tokio::sync::mpsc; +use tokio::task; pub struct ClientPool { tx: mpsc::Sender, rx: mpsc::Receiver, pool: deadpool::unmanaged::Pool, + cache: HashSet, } impl ClientPool { @@ -17,16 +20,22 @@ impl ClientPool { rx: mpsc::Receiver, clients: Vec, ) -> Self { + let cache : HashSet = HashSet::new(); let pool = Pool::from(clients); - ClientPool { tx, rx, pool } + ClientPool { tx, rx, pool, cache} } pub async fn listen(&mut self) { - while let Some(req) = self.rx.recv().await { + while let Some(uri) = self.rx.recv().await { + if self.cache.contains(&uri) { + println!("Already seen: {}", &uri); + continue; + } + self.cache.insert(uri.clone()); let client = self.pool.get().await; let tx = self.tx.clone(); tokio::spawn(async move { - let resp = client.check(req).await; + let resp = client.check(uri).await; tx.send(resp).await.unwrap(); }); } diff --git a/src/collector.rs b/src/collector.rs index 2a7a285c2e..eecfc2105c 100644 --- a/src/collector.rs +++ 
b/src/collector.rs @@ -198,14 +198,14 @@ pub async fn collect_links( drop(contents_tx); // extract links from input contents - let mut extract_links_handles = vec![]; + let mut extract_link_handles = vec![]; while let Some(result) = contents_rx.recv().await { for input_content in result? { let base_url = base_url.clone(); let handle = tokio::task::spawn_blocking(move || extract_links(&input_content, base_url)); - extract_links_handles.push(handle); + extract_link_handles.push(handle); } } @@ -215,7 +215,7 @@ pub async fn collect_links( // a lot of inputs and/or the inputs are large (e.g. big files). let mut collected_links = HashSet::new(); - for handle in extract_links_handles { + for handle in extract_link_handles { let links = handle.await?; collected_links.extend(links); } diff --git a/src/types.rs b/src/types.rs index 7a0ce5618d..41ead9fb14 100644 --- a/src/types.rs +++ b/src/types.rs @@ -19,7 +19,7 @@ impl TryFrom for RequestMethod { } } -#[derive(Debug)] +#[derive(Debug, Clone)] pub struct Response { pub uri: Uri, pub status: Status, @@ -32,7 +32,7 @@ impl Response { } /// Response status of the request -#[derive(Debug)] +#[derive(Debug, Clone)] pub enum Status { /// Request was successful Ok(http::StatusCode), From ef564d6d5386aa5ed9d66eefc3c79a1900d677df Mon Sep 17 00:00:00 2001 From: Matthias Endler Date: Thu, 28 Jan 2021 17:58:27 +0100 Subject: [PATCH 02/23] wip --- src/bin/lychee/main.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/bin/lychee/main.rs b/src/bin/lychee/main.rs index 35d5cc1b70..069f6d4afd 100644 --- a/src/bin/lychee/main.rs +++ b/src/bin/lychee/main.rs @@ -169,7 +169,7 @@ async fn run(cfg: &Config, inputs: Vec) -> Result { let bar = pb.clone(); let sr = send_req.clone(); let real_url = url.clone(); - task::spawn_blocking(|| async move { + tokio::spawn(async move { println!("Adding {} links from {}", links.len(), real_url); for link in links { if let Some(pb) = &bar { From f65543ae16c98a476026a2458831b9477cbd053c Mon Sep 17 00:00:00 2001 From: Matthias Endler Date: Mon, 1 Feb 2021 17:55:46 +0100 Subject: [PATCH 03/23] wip --- src/bin/lychee/cache.rs | 23 +++++++++++++++++++++++ src/bin/lychee/main.rs | 12 +++++++++++- src/client_pool.rs | 12 +----------- 3 files changed, 35 insertions(+), 12 deletions(-) create mode 100644 src/bin/lychee/cache.rs diff --git a/src/bin/lychee/cache.rs b/src/bin/lychee/cache.rs new file mode 100644 index 0000000000..a9ce3b5ecf --- /dev/null +++ b/src/bin/lychee/cache.rs @@ -0,0 +1,23 @@ +use std::collections::HashMap; + +use lychee::Uri; + +/// Link cache for recursion and to avoid checking a link multiple times +pub struct Cache { + pub cache: HashMap, +} + +impl Cache { + pub fn new() -> Self { + let cache = HashMap::new(); + Cache { cache } + } + + pub fn add(&mut self, uri: Uri) { + *self.cache.entry(uri).or_insert(0) += 1; + } + + pub fn contains(&self, uri: &Uri) -> bool { + self.cache.contains_key(uri) + } +} diff --git a/src/bin/lychee/main.rs b/src/bin/lychee/main.rs index 069f6d4afd..c318bc98cb 100644 --- a/src/bin/lychee/main.rs +++ b/src/bin/lychee/main.rs @@ -7,11 +7,13 @@ use regex::RegexSet; use std::{collections::HashSet, time::Duration}; use std::{fs, str::FromStr}; use structopt::StructOpt; -use tokio::{sync::mpsc, task}; +use tokio::sync::mpsc; +mod cache; mod options; mod stats; +use crate::cache::Cache; use crate::options::{Config, LycheeOptions}; use crate::stats::ResponseStats; @@ -104,6 +106,9 @@ async fn run(cfg: &Config, inputs: Vec) -> Result { .accepted(accepted) .build()?; + // 
Create link cache to keep track of seen links + let mut cache = Cache::new(); + let links = collector::collect_links( &inputs, cfg.base_url.clone(), @@ -154,6 +159,11 @@ async fn run(cfg: &Config, inputs: Vec) -> Result { if !response.status.is_success() { continue; } + if cache.contains(&response.uri) { + continue; + } + cache.add(response.uri.clone()); + if let lychee::Uri::Website(url) = response.uri { println!("add url: {}", &url); let input = collector::Input::RemoteUrl(url.clone()); diff --git a/src/client_pool.rs b/src/client_pool.rs index cde3df5ca3..e47763a3da 100644 --- a/src/client_pool.rs +++ b/src/client_pool.rs @@ -1,17 +1,13 @@ -use std::collections::HashSet; - use crate::uri; use crate::{client, types}; use client::Client; use deadpool::unmanaged::Pool; use tokio::sync::mpsc; -use tokio::task; pub struct ClientPool { tx: mpsc::Sender, rx: mpsc::Receiver, pool: deadpool::unmanaged::Pool, - cache: HashSet, } impl ClientPool { @@ -20,18 +16,12 @@ impl ClientPool { rx: mpsc::Receiver, clients: Vec, ) -> Self { - let cache : HashSet = HashSet::new(); let pool = Pool::from(clients); - ClientPool { tx, rx, pool, cache} + ClientPool { tx, rx, pool } } pub async fn listen(&mut self) { while let Some(uri) = self.rx.recv().await { - if self.cache.contains(&uri) { - println!("Already seen: {}", &uri); - continue; - } - self.cache.insert(uri.clone()); let client = self.pool.get().await; let tx = self.tx.clone(); tokio::spawn(async move { From 9c6735f20da754566046d4359ea0641bd90d0095 Mon Sep 17 00:00:00 2001 From: Matthias Endler Date: Fri, 26 Feb 2021 01:33:27 +0100 Subject: [PATCH 04/23] wip --- src/bin/lychee/cache.rs | 15 ++++++++------- src/bin/lychee/main.rs | 21 ++++++++++++++++++--- src/collector.rs | 2 +- 3 files changed, 27 insertions(+), 11 deletions(-) diff --git a/src/bin/lychee/cache.rs b/src/bin/lychee/cache.rs index a9ce3b5ecf..052b1287da 100644 --- a/src/bin/lychee/cache.rs +++ b/src/bin/lychee/cache.rs @@ -1,23 +1,24 @@ -use std::collections::HashMap; +use std::collections::HashSet; use lychee::Uri; /// Link cache for recursion and to avoid checking a link multiple times +#[derive(Debug)] pub struct Cache { - pub cache: HashMap, + pub cache: HashSet, } impl Cache { pub fn new() -> Self { - let cache = HashMap::new(); + let cache = HashSet::new(); Cache { cache } } - pub fn add(&mut self, uri: Uri) { - *self.cache.entry(uri).or_insert(0) += 1; + pub fn add(&mut self, uri: String) { + self.cache.insert(uri); } - pub fn contains(&self, uri: &Uri) -> bool { - self.cache.contains_key(uri) + pub fn contains(&self, uri: String) -> bool { + self.cache.contains(&uri) } } diff --git a/src/bin/lychee/main.rs b/src/bin/lychee/main.rs index c318bc98cb..2836848fd6 100644 --- a/src/bin/lychee/main.rs +++ b/src/bin/lychee/main.rs @@ -4,6 +4,7 @@ use headers::{Authorization, HeaderMap, HeaderMapExt, HeaderName}; use indicatif::{ProgressBar, ProgressStyle}; use options::Format; use regex::RegexSet; +use reqwest::Url; use std::{collections::HashSet, time::Duration}; use std::{fs, str::FromStr}; use structopt::StructOpt; @@ -152,6 +153,14 @@ async fn run(cfg: &Config, inputs: Vec) -> Result { clients.listen().await; }); + let original_domains: Vec<_> = inputs + .iter() + .filter_map(|i| match i { + Input::RemoteUrl(url) => Some(url.domain()), + _ => None, + }) + .collect(); + while let Some(response) = recv_resp.recv().await { show_progress(&pb, &response, cfg.verbose); stats.add(response.clone()); @@ -159,14 +168,20 @@ async fn run(cfg: &Config, inputs: Vec) -> Result { if 
!response.status.is_success() { continue; } - if cache.contains(&response.uri) { + if cache.contains(response.uri.as_str().to_string()) { continue; } - cache.add(response.uri.clone()); + cache.add(response.uri.as_str().to_string()); + + println!("cache {:?}", cache); if let lychee::Uri::Website(url) = response.uri { - println!("add url: {}", &url); let input = collector::Input::RemoteUrl(url.clone()); + + if !original_domains.contains(&url.domain()) { + continue; + } + // TODO: Check recursion level let links = collector::collect_links( &[input], diff --git a/src/collector.rs b/src/collector.rs index eecfc2105c..e84d3779d4 100644 --- a/src/collector.rs +++ b/src/collector.rs @@ -12,7 +12,7 @@ use tokio::io::{stdin, AsyncReadExt}; const STDIN: &str = "-"; -#[derive(Debug, Clone)] +#[derive(Debug, Clone, PartialEq)] #[non_exhaustive] pub enum Input { RemoteUrl(Url), From 5be75b76cabb187c3494a7ce89dd39972fdce77d Mon Sep 17 00:00:00 2001 From: Matthias Endler Date: Sun, 28 Feb 2021 20:03:16 +0100 Subject: [PATCH 05/23] update --- src/bin/lychee/cache.rs | 2 -- src/client_pool.rs | 4 +--- src/types.rs | 4 ++-- 3 files changed, 3 insertions(+), 7 deletions(-) diff --git a/src/bin/lychee/cache.rs b/src/bin/lychee/cache.rs index 052b1287da..bfc0f7720e 100644 --- a/src/bin/lychee/cache.rs +++ b/src/bin/lychee/cache.rs @@ -1,7 +1,5 @@ use std::collections::HashSet; -use lychee::Uri; - /// Link cache for recursion and to avoid checking a link multiple times #[derive(Debug)] pub struct Cache { diff --git a/src/client_pool.rs b/src/client_pool.rs index f6f63c622c..2c4f813db3 100644 --- a/src/client_pool.rs +++ b/src/client_pool.rs @@ -1,5 +1,3 @@ -use crate::uri; -use crate::{client, types}; use client::Client; use deadpool::unmanaged::Pool; use tokio::sync::mpsc; @@ -27,7 +25,7 @@ impl ClientPool { let client = self.pool.get().await; let tx = self.tx.clone(); tokio::spawn(async move { - let resp = client.check(req).await.expect("Invalid URI"); + let resp = client.check(uri).await.expect("Invalid URI"); tx.send(resp) .await .expect("Cannot send response to channel"); diff --git a/src/types.rs b/src/types.rs index 935174d332..4f6b881367 100644 --- a/src/types.rs +++ b/src/types.rs @@ -56,7 +56,7 @@ impl TryFrom for RequestMethod { } } -#[derive(Debug, PartialEq, Eq, Hash, Serialize)] +#[derive(Debug, PartialEq, Eq, Hash, Serialize, Clone)] pub struct Response { #[serde(flatten)] pub uri: Uri, @@ -90,7 +90,7 @@ impl Display for Response { } /// Response status of the request -#[derive(Debug, Hash, PartialEq, Eq)] +#[derive(Debug, Hash, PartialEq, Eq, Clone)] pub enum Status { /// Request was successful Ok(http::StatusCode), From 756e5b46e76e77f9e556f67552dc810a801064ce Mon Sep 17 00:00:00 2001 From: Matthias Endler Date: Sun, 28 Feb 2021 20:06:37 +0100 Subject: [PATCH 06/23] Replace own cache type with simple hashset --- src/bin/lychee/cache.rs | 22 ---------------------- src/bin/lychee/main.rs | 8 +++----- src/types.rs | 2 ++ 3 files changed, 5 insertions(+), 27 deletions(-) delete mode 100644 src/bin/lychee/cache.rs diff --git a/src/bin/lychee/cache.rs b/src/bin/lychee/cache.rs deleted file mode 100644 index bfc0f7720e..0000000000 --- a/src/bin/lychee/cache.rs +++ /dev/null @@ -1,22 +0,0 @@ -use std::collections::HashSet; - -/// Link cache for recursion and to avoid checking a link multiple times -#[derive(Debug)] -pub struct Cache { - pub cache: HashSet, -} - -impl Cache { - pub fn new() -> Self { - let cache = HashSet::new(); - Cache { cache } - } - - pub fn add(&mut self, uri: String) { - 
self.cache.insert(uri); - } - - pub fn contains(&self, uri: String) -> bool { - self.cache.contains(&uri) - } -} diff --git a/src/bin/lychee/main.rs b/src/bin/lychee/main.rs index ad2388056d..17e714e803 100644 --- a/src/bin/lychee/main.rs +++ b/src/bin/lychee/main.rs @@ -10,15 +10,13 @@ use std::{fs, str::FromStr}; use structopt::StructOpt; use tokio::sync::mpsc; -mod cache; mod options; mod stats; -use crate::cache::Cache; use crate::options::{Config, LycheeOptions}; use crate::stats::ResponseStats; -use lychee::collector::{self, Input}; +use lychee::{Cache, collector::{self, Input}}; use lychee::{ClientBuilder, ClientPool, Response}; /// A C-like enum that can be cast to `i32` and used as process exit code. @@ -181,10 +179,10 @@ async fn run(cfg: &Config, inputs: Vec) -> Result { if !response.status.is_success() { continue; } - if cache.contains(response.uri.as_str().to_string()) { + if cache.contains(response.uri.as_str()) { continue; } - cache.add(response.uri.as_str().to_string()); + cache.insert(response.uri.to_string()); println!("cache {:?}", cache); diff --git a/src/types.rs b/src/types.rs index 4f6b881367..d208cbb5bc 100644 --- a/src/types.rs +++ b/src/types.rs @@ -3,6 +3,8 @@ use anyhow::anyhow; use serde::{Serialize, Serializer}; use std::{collections::HashSet, convert::TryFrom, fmt::Display}; +pub type Cache = HashSet; + #[derive(Debug, PartialEq, Eq, Hash, Clone)] pub struct Request { pub uri: Uri, From a5a68e87ec58eb63703b40fd256c9e4e233defe7 Mon Sep 17 00:00:00 2001 From: Matthias Endler Date: Sun, 28 Feb 2021 20:43:14 +0100 Subject: [PATCH 07/23] Move recursion to separate function --- src/bin/lychee/main.rs | 109 ++++++++++++++++++++++++----------------- 1 file changed, 65 insertions(+), 44 deletions(-) diff --git a/src/bin/lychee/main.rs b/src/bin/lychee/main.rs index 17e714e803..6db5f5d4d6 100644 --- a/src/bin/lychee/main.rs +++ b/src/bin/lychee/main.rs @@ -8,7 +8,7 @@ use stats::color_response; use std::{collections::HashSet, time::Duration}; use std::{fs, str::FromStr}; use structopt::StructOpt; -use tokio::sync::mpsc; +use tokio::sync::mpsc::{self, Sender}; mod options; mod stats; @@ -16,7 +16,10 @@ mod stats; use crate::options::{Config, LycheeOptions}; use crate::stats::ResponseStats; -use lychee::{Cache, collector::{self, Input}}; +use lychee::{ + collector::{self, Input}, + Cache, Request, +}; use lychee::{ClientBuilder, ClientPool, Response}; /// A C-like enum that can be cast to `i32` and used as process exit code. 
@@ -164,7 +167,7 @@ async fn run(cfg: &Config, inputs: Vec) -> Result { clients.listen().await; }); - let original_domains: Vec<_> = inputs + let input_domains: Vec<_> = inputs .iter() .filter_map(|i| match i { Input::RemoteUrl(url) => Some(url.domain()), @@ -176,47 +179,15 @@ async fn run(cfg: &Config, inputs: Vec) -> Result { show_progress(&pb, &response, cfg.verbose); stats.add(response.clone()); - if !response.status.is_success() { - continue; - } - if cache.contains(response.uri.as_str()) { - continue; - } - cache.insert(response.uri.to_string()); - - println!("cache {:?}", cache); - - if let lychee::Uri::Website(url) = response.uri { - let input = collector::Input::RemoteUrl(url.clone()); - - if !original_domains.contains(&url.domain()) { - continue; - } - - // TODO: Check recursion level - let links = collector::collect_links( - &[input], - cfg.base_url.clone(), - cfg.skip_missing, - max_concurrency, - ) - .await?; - - let bar = pb.clone(); - let sr = send_req.clone(); - let real_url = url.clone(); - tokio::spawn(async move { - println!("Adding {} links from {}", links.len(), real_url); - for link in links { - if let Some(pb) = &bar { - pb.inc_length(1); - pb.set_message(&link.to_string()); - }; - sr.send(link).await.unwrap(); - } - println!("Done with all links from {}", real_url); - }); - } + recurse( + response, + &mut cache, + &input_domains, + &cfg, + &pb, + send_req.clone(), + ) + .await?; } // Note that print statements may interfere with the progress bar, so this // must go before printing the stats @@ -237,6 +208,56 @@ async fn run(cfg: &Config, inputs: Vec) -> Result { } } +async fn recurse( + response: Response, + cache: &mut Cache, + input_domains: &[Option<&str>], + cfg: &Config, + pb: &Option, + send_req: Sender, +) -> Result<()> { + if !response.status.is_success() { + return Ok(()); + } + if cache.contains(response.uri.as_str()) { + return Ok(()); + } + cache.insert(response.uri.to_string()); + + if let lychee::Uri::Website(url) = response.uri { + let input = collector::Input::RemoteUrl(url.clone()); + + if !input_domains.contains(&url.domain()) { + return Ok(()); + } + + // TODO: Check recursion level + let links = collector::collect_links( + &[input], + cfg.base_url.clone(), + cfg.skip_missing, + cfg.max_concurrency, + ) + .await?; + + let bar = pb.clone(); + let real_url = url.clone(); + tokio::spawn(async move { + println!("Adding {} links from {}", links.len(), real_url); + for link in links { + if let Some(pb) = &bar { + pb.inc_length(1); + pb.set_message(&link.to_string()); + }; + send_req.send(link).await.unwrap(); + } + println!("Done with all links from {}", real_url); + }); + }; + + Ok(()) +} + fn read_header(input: &str) -> Result<(String, String)> { let elements: Vec<_> = input.split('=').collect(); if elements.len() != 2 { From 0b86f1a415d681a3fe25174d640500a298802552 Mon Sep 17 00:00:00 2001 From: Matthias Endler Date: Sun, 28 Feb 2021 20:43:20 +0100 Subject: [PATCH 08/23] Fix documentation --- src/client.rs | 2 +- src/lib.rs | 65 ++++++++++++++++++++++++++------------------------- 2 files changed, 34 insertions(+), 33 deletions(-) diff --git a/src/client.rs b/src/client.rs index eb493bd9ec..c6d35d2ed5 100644 --- a/src/client.rs +++ b/src/client.rs @@ -264,7 +264,7 @@ impl Client { } } -/// A convenience function to check a single URI +/// A convenience function to check a single URI. /// This is the most simple link check and avoids having to create a client manually. /// For more complex scenarios, look into using the `ClientBuilder` instead. 
pub async fn check>(request: T) -> Result { diff --git a/src/lib.rs b/src/lib.rs index 4afc5043fe..62071e2a79 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,36 +1,37 @@ -#[deny(missing_docs)] +//! +//! `lychee` is a library for checking links. +//! It is asynchronous and supports multiple input formats like Markdown and HTML. +//! Here is a basic usage example: +//! +//! ``` +//! use std::error::Error; +//! +//! #[tokio::main] +//! async fn main() -> Result<(), Box> { +//! let response = lychee::check("https://github.com/lycheeverse/lychee").await?; +//! println!("{}", response); +//! Ok(()) +//! } +//! ``` +//! +//! For more specific use-cases you can build a lychee client yourself, +//! using the `ClientBuilder` which can be used to +//! configure and run your own link checker and grants full flexibility: +//! +//! ``` +//! use lychee::{ClientBuilder, Status}; +//! use std::error::Error; +//! +//! #[tokio::main] +//! async fn main() -> Result<(), Box> { +//! let client = ClientBuilder::default().build()?; +//! let response = client.check("https://github.com/lycheeverse/lychee").await?; +//! assert!(matches!(response.status, Status::Ok(_))); +//! Ok(()) +//! } +//! ``` -/** -* `lychee` is a library for checking links. -* "Hello world" example: -* ``` -* use std::error::Error; -* -* #[tokio::main] -* async fn main() -> Result<(), Box> { -* let response = lychee::check("https://github.com/lycheeverse/lychee").await?; -* println!("{}", response); -* Ok(()) -* } -* ``` -* -* For more specific use-cases you can build a lychee client yourself, -* using the `ClientBuilder` which can be used to -* configure and run your own link checker and grants full flexibility: -* -* ``` -* use lychee::{ClientBuilder, Status}; -* use std::error::Error; -* -* #[tokio::main] -* async fn main() -> Result<(), Box> { -* let client = ClientBuilder::default().build()?; -* let response = client.check("https://github.com/lycheeverse/lychee").await?; -* assert!(matches!(response.status, Status::Ok(_))); -* Ok(()) -* } -* ``` -*/ +#[deny(missing_docs)] #[cfg(doctest)] #[macro_use] From 5310e8b0388b554d5908bbbee0b354cfc9056194 Mon Sep 17 00:00:00 2001 From: Matthias Endler Date: Sun, 28 Feb 2021 20:50:28 +0100 Subject: [PATCH 09/23] Add recursion flag --- src/bin/lychee/main.rs | 20 +++++++++++--------- src/bin/lychee/options.rs | 6 ++++++ 2 files changed, 17 insertions(+), 9 deletions(-) diff --git a/src/bin/lychee/main.rs b/src/bin/lychee/main.rs index 6db5f5d4d6..cf744e53be 100644 --- a/src/bin/lychee/main.rs +++ b/src/bin/lychee/main.rs @@ -179,15 +179,17 @@ async fn run(cfg: &Config, inputs: Vec) -> Result { show_progress(&pb, &response, cfg.verbose); stats.add(response.clone()); - recurse( - response, - &mut cache, - &input_domains, - &cfg, - &pb, - send_req.clone(), - ) - .await?; + if cfg.recursion { + recurse( + response, + &mut cache, + &input_domains, + &cfg, + &pb, + send_req.clone(), + ) + .await?; + } } // Note that print statements may interfere with the progress bar, so this // must go before printing the stats diff --git a/src/bin/lychee/options.rs b/src/bin/lychee/options.rs index 912cd969c5..65d0363ac8 100644 --- a/src/bin/lychee/options.rs +++ b/src/bin/lychee/options.rs @@ -245,6 +245,11 @@ pub struct Config { #[structopt(short, long, default_value = "string")] #[serde(default)] pub format: Format, + + /// Enable recursion (make sub-requests for detected links) + #[structopt(short, long)] + #[serde(default)] + pub recursion: bool, } impl Config { @@ -299,6 +304,7 @@ impl Config { skip_missing: false; 
glob_ignore_case: false; output: None; + recursion: false; } } } From e2a8624355a8b1b9a0768b1c7ac5150611305021 Mon Sep 17 00:00:00 2001 From: Matthias Endler Date: Tue, 2 Mar 2021 01:36:00 +0100 Subject: [PATCH 10/23] Add logic to terminate the program with recursion --- src/bin/lychee/main.rs | 73 ++++++++++++++++++++++++++++++------------ 1 file changed, 52 insertions(+), 21 deletions(-) diff --git a/src/bin/lychee/main.rs b/src/bin/lychee/main.rs index cf744e53be..79e0658575 100644 --- a/src/bin/lychee/main.rs +++ b/src/bin/lychee/main.rs @@ -1,4 +1,4 @@ -use anyhow::{anyhow, Context, Result}; +use anyhow::{anyhow, bail, Context, Result}; use headers::authorization::Basic; use headers::{Authorization, HeaderMap, HeaderMapExt, HeaderName}; use indicatif::{ProgressBar, ProgressStyle}; @@ -89,6 +89,20 @@ fn fmt(stats: &ResponseStats, format: &Format) -> Result { }) } +// Get the set of input domains +// This is needed for supporting recursion +fn input_domains(inputs: Vec) -> HashSet { + let mut domains = HashSet::new(); + for input in inputs { + if let Input::RemoteUrl(url) = input { + if let Some(domain) = url.domain() { + domains.insert(domain.to_string()); + } + } + } + return domains; +} + async fn run(cfg: &Config, inputs: Vec) -> Result { let mut headers = parse_headers(&cfg.headers)?; if let Some(auth) = &cfg.basic_auth { @@ -131,6 +145,7 @@ async fn run(cfg: &Config, inputs: Vec) -> Result { max_concurrency, ) .await?; + let mut total_requests = links.len(); let pb = match cfg.no_progress { true => None, @@ -167,20 +182,31 @@ async fn run(cfg: &Config, inputs: Vec) -> Result { clients.listen().await; }); - let input_domains: Vec<_> = inputs - .iter() - .filter_map(|i| match i { - Input::RemoteUrl(url) => Some(url.domain()), - _ => None, - }) - .collect(); + let input_domains: HashSet = input_domains(inputs); + + // We keep track of the total number of requests + // and exit the loop once we are done. + // Otherwise the sender would never be dropped and + // we'd be stuck indefinitely. 
+ let mut curr = 0; + loop { + if curr == total_requests { + break; + } + curr += 1; + let response = recv_resp.recv().await; + + if response.is_none() { + // receiver was dropped + break; + } + let response = response.unwrap(); - while let Some(response) = recv_resp.recv().await { show_progress(&pb, &response, cfg.verbose); stats.add(response.clone()); if cfg.recursion { - recurse( + let count = recurse( response, &mut cache, &input_domains, @@ -189,8 +215,10 @@ async fn run(cfg: &Config, inputs: Vec) -> Result { send_req.clone(), ) .await?; + total_requests += count; } } + // Note that print statements may interfere with the progress bar, so this // must go before printing the stats if let Some(pb) = &pb { @@ -213,24 +241,29 @@ async fn run(cfg: &Config, inputs: Vec) -> Result { async fn recurse( response: Response, cache: &mut Cache, - input_domains: &[Option<&str>], + input_domains: &HashSet, cfg: &Config, pb: &Option, send_req: Sender, -) -> Result<()> { +) -> Result { if !response.status.is_success() { - return Ok(()); + return Ok(0); } if cache.contains(response.uri.as_str()) { - return Ok(()); + return Ok(0); } cache.insert(response.uri.to_string()); if let lychee::Uri::Website(url) = response.uri { let input = collector::Input::RemoteUrl(url.clone()); - if !input_domains.contains(&url.domain()) { - return Ok(()); + match url.domain() { + None => bail!("Cannot find domain in url: {}", url), + Some(domain) => { + if !input_domains.contains(domain) { + return Ok(0); + } + } } // TODO: Check recursion level @@ -241,11 +274,10 @@ async fn recurse( cfg.max_concurrency, ) .await?; + let count = links.len(); let bar = pb.clone(); - let real_url = url.clone(); tokio::spawn(async move { - println!("Adding {} links from {}", links.len(), real_url); for link in links { if let Some(pb) = &bar { pb.inc_length(1); @@ -253,11 +285,10 @@ async fn recurse( }; send_req.send(link).await.unwrap(); } - println!("Done with all links from {}", real_url); }); + return Ok(count); }; - - Ok(()) + Ok(0) } fn read_header(input: &str) -> Result<(String, String)> { From aa9c88bc3dc39775413426cae8233523b7c7988a Mon Sep 17 00:00:00 2001 From: Matthias Endler Date: Tue, 2 Mar 2021 01:40:12 +0100 Subject: [PATCH 11/23] Rename flag from "recursion" to "recursive" --- README.md | 3 ++- src/bin/lychee/main.rs | 2 +- src/bin/lychee/options.rs | 4 ++-- 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 5ae3b04355..343de92dbd 100644 --- a/README.md +++ b/README.md @@ -150,7 +150,7 @@ There is an extensive list of commandline parameters to customize the behavior, see below for a full list. ```sh -USAGE: +iUSAGE: lychee [FLAGS] [OPTIONS] [--] [inputs]... FLAGS: @@ -165,6 +165,7 @@ FLAGS: -i, --insecure Proceed for server connections considered insecure (invalid TLS) -n, --no-progress Do not show progress bar. This is recommended for non-interactive shells (e.g. 
for continuos integration) + -r, --recursive Enable recursion (make sub-requests for detected links) --skip-missing Skip missing input files (default is to error if they don't exist) -V, --version Prints version information -v, --verbose Verbose program output diff --git a/src/bin/lychee/main.rs b/src/bin/lychee/main.rs index 79e0658575..6e49ea0328 100644 --- a/src/bin/lychee/main.rs +++ b/src/bin/lychee/main.rs @@ -205,7 +205,7 @@ async fn run(cfg: &Config, inputs: Vec) -> Result { show_progress(&pb, &response, cfg.verbose); stats.add(response.clone()); - if cfg.recursion { + if cfg.recursive { let count = recurse( response, &mut cache, diff --git a/src/bin/lychee/options.rs b/src/bin/lychee/options.rs index 65d0363ac8..e692a61d11 100644 --- a/src/bin/lychee/options.rs +++ b/src/bin/lychee/options.rs @@ -249,7 +249,7 @@ pub struct Config { /// Enable recursion (make sub-requests for detected links) #[structopt(short, long)] #[serde(default)] - pub recursion: bool, + pub recursive: bool, } impl Config { @@ -304,7 +304,7 @@ impl Config { skip_missing: false; glob_ignore_case: false; output: None; - recursion: false; + recursive: false; } } } From 148fbcd932dd8984da0c4e571b703b5fd62a8ab3 Mon Sep 17 00:00:00 2001 From: Matthias Endler Date: Tue, 2 Mar 2021 01:57:23 +0100 Subject: [PATCH 12/23] Add support for max recursion level --- README.md | 3 ++- src/bin/lychee/main.rs | 11 ++++++++++- src/bin/lychee/options.rs | 5 +++++ src/bin/lychee/stats.rs | 5 +++++ src/client.rs | 4 ++-- src/extract.rs | 4 ++-- src/filter/mod.rs | 4 +++- src/types.rs | 14 +++++++++----- 8 files changed, 38 insertions(+), 12 deletions(-) diff --git a/README.md b/README.md index 343de92dbd..5e764ef8b8 100644 --- a/README.md +++ b/README.md @@ -150,7 +150,7 @@ There is an extensive list of commandline parameters to customize the behavior, see below for a full list. ```sh -iUSAGE: +USAGE: lychee [FLAGS] [OPTIONS] [--] [inputs]... FLAGS: @@ -182,6 +182,7 @@ OPTIONS: -h, --headers ... Custom request headers --include ... URLs to check (supports regex). 
Has preference over all excludes --max-concurrency Maximum number of concurrent network requests [default: 128] + --max-recursion Set maximum recursion depth (recommended for big inputs) -m, --max-redirects Maximum number of allowed redirects [default: 10] -X, --method Request method [default: get] -o, --output Output file of status report diff --git a/src/bin/lychee/main.rs b/src/bin/lychee/main.rs index 6e49ea0328..f00f2a1340 100644 --- a/src/bin/lychee/main.rs +++ b/src/bin/lychee/main.rs @@ -246,6 +246,14 @@ async fn recurse( pb: &Option, send_req: Sender, ) -> Result { + let recursion_level = response.recursion_level + 1; + + if let Some(max_recursion) = cfg.max_recursion { + if recursion_level > max_recursion { + return Ok(0); + } + } + if !response.status.is_success() { return Ok(0); } @@ -278,7 +286,8 @@ async fn recurse( let bar = pb.clone(); tokio::spawn(async move { - for link in links { + for mut link in links { + link.recursion_level = recursion_level; if let Some(pb) = &bar { pb.inc_length(1); pb.set_message(&link.to_string()); diff --git a/src/bin/lychee/options.rs b/src/bin/lychee/options.rs index e692a61d11..6d508e555b 100644 --- a/src/bin/lychee/options.rs +++ b/src/bin/lychee/options.rs @@ -250,6 +250,11 @@ pub struct Config { #[structopt(short, long)] #[serde(default)] pub recursive: bool, + + /// Set maximum recursion depth (recommended for big inputs) + #[structopt(long)] + pub max_recursion: Option, + } impl Config { diff --git a/src/bin/lychee/stats.rs b/src/bin/lychee/stats.rs index 5e0446dd93..38d54e9e26 100644 --- a/src/bin/lychee/stats.rs +++ b/src/bin/lychee/stats.rs @@ -127,16 +127,19 @@ mod test_super { uri: website("http://example.org/ok"), status: Status::Ok(http::StatusCode::OK), source: Input::Stdin, + recursion_level: 0, }); stats.add(Response { uri: website("http://example.org/failed"), status: Status::Failed(http::StatusCode::BAD_GATEWAY), source: Input::Stdin, + recursion_level: 0, }); stats.add(Response { uri: website("http://example.org/redirect"), status: Status::Redirected(http::StatusCode::PERMANENT_REDIRECT), source: Input::Stdin, + recursion_level: 0, }); let mut expected_map = HashMap::new(); expected_map.insert( @@ -146,11 +149,13 @@ mod test_super { uri: website("http://example.org/failed"), status: Status::Failed(http::StatusCode::BAD_GATEWAY), source: Input::Stdin, + recursion_level: 0, }, Response { uri: website("http://example.org/redirect"), status: Status::Redirected(http::StatusCode::PERMANENT_REDIRECT), source: Input::Stdin, + recursion_level: 0, }, ] .into_iter() diff --git a/src/client.rs b/src/client.rs index c6d35d2ed5..4729e396ad 100644 --- a/src/client.rs +++ b/src/client.rs @@ -169,7 +169,7 @@ impl Client { Err(_e) => bail!("Invalid URI"), }; if self.filter.excluded(&request) { - return Ok(Response::new(request.uri, Status::Excluded, request.source)); + return Ok(Response::new(request.uri, Status::Excluded, request.source, request.recursion_level)); } let status = match request.uri { Uri::Website(ref url) => self.check_website(&url).await, @@ -181,7 +181,7 @@ impl Client { } } }; - Ok(Response::new(request.uri, status, request.source)) + Ok(Response::new(request.uri, status, request.source, request.recursion_level)) } pub async fn check_website(&self, url: &Url) -> Status { diff --git a/src/extract.rs b/src/extract.rs index 020b6c79bd..c1adf7e565 100644 --- a/src/extract.rs +++ b/src/extract.rs @@ -157,7 +157,7 @@ pub(crate) fn extract_links( for link in links { match Uri::try_from(link.as_str()) { Ok(uri) => { - 
requests.insert(Request::new(uri, input_content.input.clone())); + requests.insert(Request::new(uri, input_content.input.clone(), 0)); } Err(_) => { if !Path::new(&link).exists() { @@ -165,7 +165,7 @@ pub(crate) fn extract_links( if let Ok(new_url) = base_url.join(&link) { requests.insert(Request::new( Uri::Website(new_url), - input_content.input.clone(), + input_content.input.clone(), 0 )); } } diff --git a/src/filter/mod.rs b/src/filter/mod.rs index adc0f3a970..1945c73b73 100644 --- a/src/filter/mod.rs +++ b/src/filter/mod.rs @@ -103,7 +103,7 @@ mod test { /// Helper method to convert a string into a Request /// Note: This panics on error, so it should only be used for testing pub fn request(url: &str) -> Request { - Request::new(website(url), Input::Stdin) + Request::new(website(url), Input::Stdin, 0) } #[test] @@ -173,6 +173,7 @@ mod test { filter.excluded(&Request::new( Uri::Mail("mail@example.org".to_string()), Input::Stdin, + 0, )), true ); @@ -182,6 +183,7 @@ mod test { filter.excluded(&Request::new( Uri::Mail("foo@bar.dev".to_string()), Input::Stdin, + 0, )), false ); diff --git a/src/types.rs b/src/types.rs index d208cbb5bc..8e6eb785b0 100644 --- a/src/types.rs +++ b/src/types.rs @@ -9,11 +9,12 @@ pub type Cache = HashSet; pub struct Request { pub uri: Uri, pub source: Input, + pub recursion_level: usize, } impl Request { - pub fn new(uri: Uri, source: Input) -> Self { - Request { uri, source } + pub fn new(uri: Uri, source: Input, recursion_level: usize) -> Self { + Request { uri, source, recursion_level } } } @@ -28,7 +29,7 @@ impl TryFrom for Request { fn try_from(s: String) -> Result { let uri = Uri::try_from(s.as_str())?; - Ok(Request::new(uri, Input::String(s))) + Ok(Request::new(uri, Input::String(s), 0)) } } @@ -37,7 +38,7 @@ impl TryFrom<&str> for Request { fn try_from(s: &str) -> Result { let uri = Uri::try_from(s)?; - Ok(Request::new(uri, Input::String(s.to_owned()))) + Ok(Request::new(uri, Input::String(s.to_owned()), 0)) } } @@ -65,14 +66,17 @@ pub struct Response { pub status: Status, #[serde(skip)] pub source: Input, + #[serde(skip)] + pub recursion_level: usize, } impl Response { - pub fn new(uri: Uri, status: Status, source: Input) -> Self { + pub fn new(uri: Uri, status: Status, source: Input, recursion_level: usize) -> Self { Response { uri, status, source, + recursion_level, } } } From 23743ed41740cc123cc9b322a3ddcc9da52ee0f5 Mon Sep 17 00:00:00 2001 From: Matthias Endler Date: Tue, 2 Mar 2021 02:00:21 +0100 Subject: [PATCH 13/23] Simplify loop --- src/bin/lychee/main.rs | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/src/bin/lychee/main.rs b/src/bin/lychee/main.rs index f00f2a1340..d3f48eeef1 100644 --- a/src/bin/lychee/main.rs +++ b/src/bin/lychee/main.rs @@ -185,14 +185,12 @@ async fn run(cfg: &Config, inputs: Vec) -> Result { let input_domains: HashSet = input_domains(inputs); // We keep track of the total number of requests - // and exit the loop once we are done. + // and exit the loop once we reach it. // Otherwise the sender would never be dropped and // we'd be stuck indefinitely. 
let mut curr = 0; - loop { - if curr == total_requests { - break; - } + + while curr < total_requests { curr += 1; let response = recv_resp.recv().await; From 8d42cbb01dd9b1d35164cd0c08e118971121955c Mon Sep 17 00:00:00 2001 From: Matthias Endler Date: Tue, 2 Mar 2021 02:03:28 +0100 Subject: [PATCH 14/23] Make code more idiomatic --- src/bin/lychee/main.rs | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/src/bin/lychee/main.rs b/src/bin/lychee/main.rs index d3f48eeef1..604066a40a 100644 --- a/src/bin/lychee/main.rs +++ b/src/bin/lychee/main.rs @@ -100,7 +100,7 @@ fn input_domains(inputs: Vec) -> HashSet { } } } - return domains; + domains } async fn run(cfg: &Config, inputs: Vec) -> Result { @@ -192,13 +192,7 @@ async fn run(cfg: &Config, inputs: Vec) -> Result { while curr < total_requests { curr += 1; - let response = recv_resp.recv().await; - - if response.is_none() { - // receiver was dropped - break; - } - let response = response.unwrap(); + let response = recv_resp.recv().await.context("Receive channel closed")?; show_progress(&pb, &response, cfg.verbose); stats.add(response.clone()); From b73f709f7f9c43079ef19cd117a44e1322a2d541 Mon Sep 17 00:00:00 2001 From: Matthias Endler Date: Tue, 2 Mar 2021 02:07:18 +0100 Subject: [PATCH 15/23] Update comments --- src/bin/lychee/main.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/bin/lychee/main.rs b/src/bin/lychee/main.rs index 604066a40a..ab1109280b 100644 --- a/src/bin/lychee/main.rs +++ b/src/bin/lychee/main.rs @@ -242,6 +242,7 @@ async fn recurse( if let Some(max_recursion) = cfg.max_recursion { if recursion_level > max_recursion { + // Maximum recursion depth reached; stop link checking. return Ok(0); } } @@ -266,7 +267,6 @@ async fn recurse( } } - // TODO: Check recursion level let links = collector::collect_links( &[input], cfg.base_url.clone(), From 15ec139132d98e55e3e470e2e041373c1101f56c Mon Sep 17 00:00:00 2001 From: Matthias Endler Date: Tue, 2 Mar 2021 02:08:55 +0100 Subject: [PATCH 16/23] Formatting --- src/lib.rs | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index 62071e2a79..5b6ccf11a1 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,6 +1,6 @@ //! //! `lychee` is a library for checking links. -//! It is asynchronous and supports multiple input formats like Markdown and HTML. +//! It is asynchronous and supports multiple input formats like Markdown and HTML. //! Here is a basic usage example: //! //! ``` @@ -32,7 +32,6 @@ //! 
``` #[deny(missing_docs)] - #[cfg(doctest)] #[macro_use] extern crate doc_comment; From 0f73fcb852fb589a2be6d6905d2ab30451258228 Mon Sep 17 00:00:00 2001 From: Matthias Endler Date: Tue, 2 Mar 2021 02:09:20 +0100 Subject: [PATCH 17/23] Formatting --- src/bin/lychee/options.rs | 1 - src/client.rs | 14 ++++++++++++-- src/extract.rs | 3 ++- src/types.rs | 6 +++++- 4 files changed, 19 insertions(+), 5 deletions(-) diff --git a/src/bin/lychee/options.rs b/src/bin/lychee/options.rs index 6d508e555b..37f9e058c6 100644 --- a/src/bin/lychee/options.rs +++ b/src/bin/lychee/options.rs @@ -254,7 +254,6 @@ pub struct Config { /// Set maximum recursion depth (recommended for big inputs) #[structopt(long)] pub max_recursion: Option, - } impl Config { diff --git a/src/client.rs b/src/client.rs index 4729e396ad..c7efd9d636 100644 --- a/src/client.rs +++ b/src/client.rs @@ -169,7 +169,12 @@ impl Client { Err(_e) => bail!("Invalid URI"), }; if self.filter.excluded(&request) { - return Ok(Response::new(request.uri, Status::Excluded, request.source, request.recursion_level)); + return Ok(Response::new( + request.uri, + Status::Excluded, + request.source, + request.recursion_level, + )); } let status = match request.uri { Uri::Website(ref url) => self.check_website(&url).await, @@ -181,7 +186,12 @@ impl Client { } } }; - Ok(Response::new(request.uri, status, request.source, request.recursion_level)) + Ok(Response::new( + request.uri, + status, + request.source, + request.recursion_level, + )) } pub async fn check_website(&self, url: &Url) -> Status { diff --git a/src/extract.rs b/src/extract.rs index c1adf7e565..d24996de01 100644 --- a/src/extract.rs +++ b/src/extract.rs @@ -165,7 +165,8 @@ pub(crate) fn extract_links( if let Ok(new_url) = base_url.join(&link) { requests.insert(Request::new( Uri::Website(new_url), - input_content.input.clone(), 0 + input_content.input.clone(), + 0, )); } } diff --git a/src/types.rs b/src/types.rs index 8e6eb785b0..b6feaf286e 100644 --- a/src/types.rs +++ b/src/types.rs @@ -14,7 +14,11 @@ pub struct Request { impl Request { pub fn new(uri: Uri, source: Input, recursion_level: usize) -> Self { - Request { uri, source, recursion_level } + Request { + uri, + source, + recursion_level, + } } } From 0544799703c9481df6e5070cd261f350f392d7c0 Mon Sep 17 00:00:00 2001 From: Matthias Endler Date: Tue, 2 Mar 2021 13:35:26 +0100 Subject: [PATCH 18/23] Mention recursion support in readme --- README.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 5e764ef8b8..46d951c301 100644 --- a/README.md +++ b/README.md @@ -20,8 +20,8 @@ use | -------------------- | ------- | ------------- | -------- | --------------------- | ------------ | ------------- | --------------------- | ------ | | Language | Rust | Ruby | Go | JS | TypeScript | Python | JS | PHP | | Async/Parallel | ![yes] | ![yes] | ![yes] | ![yes] | ![yes] | ![yes] | ![yes] | ![yes] | -| JSON output | ![yes] | ![no] | ![yes] | ![yes] | ![yes] | ![maybe]1 | ![yes] | ![yes] | -| Static binary | ![yes] | ![no] | ![yes] | ![no] | ![no] | ️ ![no] | ![no] | ![no] | +| JSON output | ![yes] | ![no] | ![yes] | ![yes] | ![yes] | ![maybe]1 | ![yes] | ![yes] | +| Static binary | ![yes] | ![no] | ![yes] | ![no] | ![no] | ️ ![no] | ![no] | ![no] | | Markdown files | ![yes] | ![yes] | ![no] | ![no] | ![no] | ![yes] | ️ ![yes] | ![no] | | HTML files | ![yes] | ![no] | ![no] | ![yes] | ![yes] | ![no] | ![yes] | ![no] | | Text files | ![yes] | ![no] | ![no] | ![no] | ![no] | ![no] | ![no] | ![no] | @@ 
-32,7 +32,7 @@ use | Custom user agent | ![yes] | ![no] | ![no] | ![yes] | ![no] | ![yes] | ![no] | ![no] | | Relative URLs | ![yes] | ![yes] | ![no] | ![yes] | ![yes] | ![yes] | ![yes] | ![yes] | | Skip relative URLs | ![yes] | ![no] | ![no] | ![maybe] | ![no] | ![no] | ![no] | ![no] | -| Include patterns | ![yes]️ | ![yes] | ![no] | ![yes] | ![no] | ![no] | ![no] | ![no] | +| Include patterns | ![yes]️ | ![yes] | ![no] | ![yes] | ![no] | ![no] | ![no] | ![no] | | Exclude patterns | ![yes] | ![no] | ![yes] | ![yes] | ![yes] | ![yes] | ![yes] | ![yes] | | Handle redirects | ![yes] | ![yes] | ![yes] | ![yes] | ![yes] | ![yes] | ![yes] | ![yes] | | Ignore insecure SSL | ![yes] | ![yes] | ![yes] | ![no] | ![no] | ![yes] | ![no] | ![yes] | @@ -51,7 +51,7 @@ use | [Use as library] | ![yes] | ![yes] | ![no] | ![yes] | ![yes] | ![no] | ![yes] | ![no] | | Quiet mode | ![yes] | ![no] | ![no] | ![no] | ![yes] | ![yes] | ![yes] | ![yes] | | Config file | ![yes] | ![no] | ![no] | ![no] | ![yes] | ![yes] | ![yes] | ![no] | -| Recursion | ![no] | ![no] | ![no] | ![yes] | ![yes] | ![yes] | ![yes] | ![no] | +| Recursion | ![yes] | ![no] | ![no] | ![yes] | ![yes] | ![yes] | ![yes] | ![no] | | Amazing lychee logo | ![yes] | ![no] | ![no] | ![no] | ![no] | ![no] | ![no] | ![no] | [awesome_bot]: https://github.com/dkhamsing/awesome_bot From 1d736363703c908ffd2725187163170ee7aaeace Mon Sep 17 00:00:00 2001 From: Matthias Endler Date: Wed, 3 Mar 2021 13:07:56 +0100 Subject: [PATCH 19/23] Change max-recursion param to depth --- README.md | 3 ++- src/bin/lychee/main.rs | 4 ++-- src/bin/lychee/options.rs | 4 ++-- 3 files changed, 6 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 46d951c301..ed3803afb8 100644 --- a/README.md +++ b/README.md @@ -175,6 +175,8 @@ OPTIONS: -b, --base-url Base URL to check relative URLs --basic-auth Basic authentication support. E.g. `username:password` -c, --config Configuration file to use [default: ./lychee.toml] + --depth Stop link checking beyond this maximum recursion depth. (Recommended for + large inputs.) --exclude ... Exclude URLs from checking (supports regex) -f, --format Output file format of status report (json, string) [default: string] --github-token GitHub API token to use when checking github.com links, to avoid rate @@ -182,7 +184,6 @@ OPTIONS: -h, --headers ... Custom request headers --include ... URLs to check (supports regex). Has preference over all excludes --max-concurrency Maximum number of concurrent network requests [default: 128] - --max-recursion Set maximum recursion depth (recommended for big inputs) -m, --max-redirects Maximum number of allowed redirects [default: 10] -X, --method Request method [default: get] -o, --output Output file of status report diff --git a/src/bin/lychee/main.rs b/src/bin/lychee/main.rs index ab1109280b..f4563c13ba 100644 --- a/src/bin/lychee/main.rs +++ b/src/bin/lychee/main.rs @@ -240,8 +240,8 @@ async fn recurse( ) -> Result { let recursion_level = response.recursion_level + 1; - if let Some(max_recursion) = cfg.max_recursion { - if recursion_level > max_recursion { + if let Some(depth) = cfg.depth { + if recursion_level > depth { // Maximum recursion depth reached; stop link checking. 
return Ok(0); } diff --git a/src/bin/lychee/options.rs b/src/bin/lychee/options.rs index 37f9e058c6..53076ff476 100644 --- a/src/bin/lychee/options.rs +++ b/src/bin/lychee/options.rs @@ -251,9 +251,9 @@ pub struct Config { #[serde(default)] pub recursive: bool, - /// Set maximum recursion depth (recommended for big inputs) + /// Stop link checking beyond this maximum recursion depth. (Recommended for large inputs.) #[structopt(long)] - pub max_recursion: Option, + pub depth: Option, } impl Config { From c6611ea1e8cd9888efead8fbf71e61f182ed1e19 Mon Sep 17 00:00:00 2001 From: Matthias Endler Date: Tue, 9 Mar 2021 01:47:36 +0100 Subject: [PATCH 20/23] Paint progress bar in lychee color --- src/bin/lychee/main.rs | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/bin/lychee/main.rs b/src/bin/lychee/main.rs index f4563c13ba..481592282d 100644 --- a/src/bin/lychee/main.rs +++ b/src/bin/lychee/main.rs @@ -152,8 +152,9 @@ async fn run(cfg: &Config, inputs: Vec) -> Result { false => { let bar = ProgressBar::new(links.len() as u64) .with_style(ProgressStyle::default_bar().template( - "{spinner:.red.bright} {pos}/{len:.dim} [{elapsed_precise}] {bar:25} {wide_msg}", - )); + "{spinner:.red.bright} {pos}/{len:.dim} [{elapsed_precise}] {bar:25.magenta.bright/white} {wide_msg}", + ) + .progress_chars("██")); bar.enable_steady_tick(100); Some(bar) } From 3c4e646937c79676724a7da1ad8f5ffb1286ae42 Mon Sep 17 00:00:00 2001 From: Matthias Endler Date: Mon, 15 Mar 2021 23:52:29 +0100 Subject: [PATCH 21/23] Add local link extraction test --- src/extract.rs | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/src/extract.rs b/src/extract.rs index 377b7c4852..6d937a6a27 100644 --- a/src/extract.rs +++ b/src/extract.rs @@ -222,6 +222,26 @@ mod test { ); } + #[test] + fn test_extract_local_links() { + let input = "http://127.0.0.1/ and http://127.0.0.1:8888/ are local links."; + let links: HashSet = + extract_links(&InputContent::from_string(input, FileType::Plaintext), None) + .into_iter() + .map(|r| r.uri) + .collect(); + assert_eq!( + links, + [ + website("http://127.0.0.1/"), + website("http://127.0.0.1:8888/") + ] + .iter() + .cloned() + .collect() + ) + } + #[test] fn test_extract_markdown_links() { let input = "This is [a test](https://endler.dev). This is a relative link test [Relative Link Test](relative_link)"; From eb427144e04db6f95cc96fc19b73d7cf72f259ec Mon Sep 17 00:00:00 2001 From: Matthias Endler Date: Fri, 19 Mar 2021 18:12:57 +0100 Subject: [PATCH 22/23] Accept localhost in recursion --- src/bin/lychee/main.rs | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/bin/lychee/main.rs b/src/bin/lychee/main.rs index 5c8bff1d66..9507ae7b66 100644 --- a/src/bin/lychee/main.rs +++ b/src/bin/lychee/main.rs @@ -1,4 +1,4 @@ -use anyhow::{anyhow, bail, Context, Result}; +use anyhow::{anyhow, Context, Result}; use headers::authorization::Basic; use headers::{Authorization, HeaderMap, HeaderMapExt, HeaderName}; use indicatif::{ProgressBar, ProgressStyle}; @@ -264,12 +264,12 @@ async fn recurse( if let lychee::Uri::Website(url) = response.uri { let input = collector::Input::RemoteUrl(url.clone()); - match url.domain() { - None => bail!("Cannot find domain in url: {}", url), - Some(domain) => { - if !input_domains.contains(domain) { - return Ok(0); - } + // Check domain against known domains + // If no domain is given, it might be a local link (e.g. 
127.0.0.1), + // which we accept + if let Some(domain) = url.domain() { + if !input_domains.contains(domain) { + return Ok(0); } } From b94f4ef5023a8a5c00fdf2ee4dc207fda8b33aac Mon Sep 17 00:00:00 2001 From: Matthias Endler Date: Fri, 19 Mar 2021 18:13:28 +0100 Subject: [PATCH 23/23] WIP integration test for recursion --- src/test_utils.rs | 26 ++++++++++++++++++++++++ tests/recursion.rs | 49 ++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 75 insertions(+) create mode 100644 tests/recursion.rs diff --git a/src/test_utils.rs b/src/test_utils.rs index 9d6b429ab5..aa80c8eb90 100644 --- a/src/test_utils.rs +++ b/src/test_utils.rs @@ -1,3 +1,5 @@ +use std::collections::HashMap; + use http::StatusCode; use reqwest::Url; use wiremock::matchers::path; @@ -34,6 +36,30 @@ where mock_server } +pub async fn get_mock_server_map(pages: HashMap<&str, (S, Option<&str>)>) -> MockServer +where + S: Into, +{ + let mock_server = MockServer::start().await; + + for (route, (response_code, content)) in pages { + let template = ResponseTemplate::new(response_code.into()); + + let template = if let Some(s) = content { + template.set_body_string(s) + } else { + template + }; + + Mock::given(path(route)) + .respond_with(template) + .mount(&mock_server) + .await; + } + + mock_server +} + /// Helper method to convert a string into a URI /// Note: This panics on error, so it should only be used for testing pub fn website(url: &str) -> Uri { diff --git a/tests/recursion.rs b/tests/recursion.rs new file mode 100644 index 0000000000..a1176a2ac3 --- /dev/null +++ b/tests/recursion.rs @@ -0,0 +1,49 @@ +#[cfg(test)] +mod cli { + use assert_cmd::Command; + use lychee::test_utils; + use predicates::str::contains; + use std::{collections::HashMap, thread, time}; + + fn main_command() -> Command { + // this gets the "main" binary name (e.g. `lychee`) + Command::cargo_bin(env!("CARGO_PKG_NAME")).expect("Couldn't get cargo package name") + } + + #[tokio::test] + async fn test_recursion() { + let mut cmd = main_command(); + + let mut routes = HashMap::new(); + routes.insert("/", (http::StatusCode::OK, Some("./foo.html"))); + routes.insert("/foo.html", (http::StatusCode::OK, Some("./bar.html"))); + routes.insert( + "/bar.html", + ( + http::StatusCode::OK, + Some("./baz.html ./path/to/frabz.html ./foo.html"), + ), + ); + routes.insert("/path/to/frabz.html", (http::StatusCode::OK, Some("ok"))); + + let mock_server = test_utils::get_mock_server_map(routes).await; + + let endpoint = mock_server.uri(); + + // println!("{}", endpoint); + // let ten_millis = time::Duration::from_millis(100000000000); + // thread::sleep(ten_millis); + + cmd.arg("--recursive") + .arg("--base-url") + .arg(&endpoint) + .arg("--") + .arg(&endpoint) + .assert() + .success() + .stdout(contains("Total............4")) + .stdout(contains("Excluded.........0")) + .stdout(contains("Successful.......4")) + .stdout(contains("Errors...........0")); + } +}
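With the series applied, recursion is driven entirely from the command line. A minimal invocation, assuming the `lychee` binary built from this branch and using a placeholder URL for the site to crawl, might look like the following sketch; the `--recursive` and `--depth` flags are the ones introduced in the patches above, and links are only followed while they stay on the domains of the original inputs.

```sh
# Check the given site, recurse into links that resolve to the same input domain,
# and stop once the recursion level exceeds 2 (the --depth limit added above).
lychee --recursive --depth 2 -- https://example.com
```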