-
-
Notifications
You must be signed in to change notification settings - Fork 139
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Recursion Support (closes #55, #78) #165
Changes from all commits
a365f6a
ef564d6
f65543a
9c6735f
1d47d2b
5be75b7
756e5b4
a5a68e8
0b86f1a
5310e8b
e2a8624
aa9c88b
148fbcd
23743ed
8d42cbb
b73f709
15ec139
0f73fcb
0544799
1d73636
c6611ea
7516dc7
5be722d
3c4e646
363cf63
eb42714
b94f4ef
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -8,15 +8,18 @@ use stats::color_response; | |
use std::{collections::HashSet, time::Duration}; | ||
use std::{fs, str::FromStr}; | ||
use structopt::StructOpt; | ||
use tokio::sync::mpsc; | ||
use tokio::sync::mpsc::{self, Sender}; | ||
|
||
mod options; | ||
mod stats; | ||
|
||
use crate::options::{Config, LycheeOptions}; | ||
use crate::stats::ResponseStats; | ||
|
||
use lychee::collector::{self, Input}; | ||
use lychee::{ | ||
collector::{self, Input}, | ||
Cache, Request, | ||
}; | ||
use lychee::{ClientBuilder, ClientPool, Response}; | ||
|
||
/// A C-like enum that can be cast to `i32` and used as process exit code. | ||
|
@@ -86,6 +89,20 @@ fn fmt(stats: &ResponseStats, format: &Format) -> Result<String> { | |
}) | ||
} | ||
|
||
// Get the set of input domains | ||
// This is needed for supporting recursion | ||
fn input_domains(inputs: Vec<Input>) -> HashSet<String> { | ||
let mut domains = HashSet::new(); | ||
for input in inputs { | ||
if let Input::RemoteUrl(url) = input { | ||
if let Some(domain) = url.domain() { | ||
domains.insert(domain.to_string()); | ||
} | ||
} | ||
} | ||
domains | ||
} | ||
|
||
async fn run(cfg: &Config, inputs: Vec<Input>) -> Result<i32> { | ||
let mut headers = parse_headers(&cfg.headers)?; | ||
if let Some(auth) = &cfg.basic_auth { | ||
|
@@ -118,21 +135,26 @@ async fn run(cfg: &Config, inputs: Vec<Input>) -> Result<i32> { | |
.accepted(accepted) | ||
.build()?; | ||
|
||
// Create link cache to keep track of seen links | ||
let mut cache = Cache::new(); | ||
|
||
let links = collector::collect_links( | ||
&inputs, | ||
cfg.base_url.clone(), | ||
cfg.skip_missing, | ||
max_concurrency, | ||
) | ||
.await?; | ||
let mut total_requests = links.len(); | ||
|
||
let pb = match cfg.no_progress { | ||
true => None, | ||
false => { | ||
let bar = ProgressBar::new(links.len() as u64) | ||
.with_style(ProgressStyle::default_bar().template( | ||
"{spinner:.red.bright} {pos}/{len:.dim} [{elapsed_precise}] {bar:25} {wide_msg}", | ||
)); | ||
"{spinner:.red.bright} {pos}/{len:.dim} [{elapsed_precise}] {bar:25.magenta.bright/white} {wide_msg}", | ||
) | ||
.progress_chars("██")); | ||
bar.enable_steady_tick(100); | ||
Some(bar) | ||
} | ||
|
@@ -144,12 +166,13 @@ async fn run(cfg: &Config, inputs: Vec<Input>) -> Result<i32> { | |
let mut stats = ResponseStats::new(); | ||
|
||
let bar = pb.clone(); | ||
let sr = send_req.clone(); | ||
tokio::spawn(async move { | ||
for link in links { | ||
if let Some(pb) = &bar { | ||
pb.set_message(&link.to_string()); | ||
}; | ||
send_req.send(link).await.unwrap(); | ||
sr.send(link).await.unwrap(); | ||
} | ||
}); | ||
|
||
|
@@ -160,9 +183,33 @@ async fn run(cfg: &Config, inputs: Vec<Input>) -> Result<i32> { | |
clients.listen().await; | ||
}); | ||
|
||
while let Some(response) = recv_resp.recv().await { | ||
let input_domains: HashSet<String> = input_domains(inputs); | ||
|
||
// We keep track of the total number of requests | ||
// and exit the loop once we reach it. | ||
// Otherwise the sender would never be dropped and | ||
// we'd be stuck indefinitely. | ||
let mut curr = 0; | ||
|
||
while curr < total_requests { | ||
curr += 1; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @lebensterben, I'm not too happy about the above three lines. Is there a better way to handle the request queue compared to counting the number of total requests like I did? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. tokio-stream crate has a wrapper type for Reciever of the channel. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Ah that's great. Sounds like what we need. |
||
let response = recv_resp.recv().await.context("Receive channel closed")?; | ||
|
||
show_progress(&pb, &response, cfg.verbose); | ||
stats.add(response); | ||
stats.add(response.clone()); | ||
|
||
if cfg.recursive { | ||
let count = recurse( | ||
response, | ||
&mut cache, | ||
&input_domains, | ||
&cfg, | ||
&pb, | ||
send_req.clone(), | ||
) | ||
.await?; | ||
total_requests += count; | ||
} | ||
} | ||
|
||
// Note that print statements may interfere with the progress bar, so this | ||
|
@@ -189,6 +236,68 @@ async fn run(cfg: &Config, inputs: Vec<Input>) -> Result<i32> { | |
} | ||
} | ||
|
||
async fn recurse( | ||
response: Response, | ||
cache: &mut Cache, | ||
input_domains: &HashSet<String>, | ||
cfg: &Config, | ||
pb: &Option<ProgressBar>, | ||
send_req: Sender<Request>, | ||
) -> Result<usize> { | ||
let recursion_level = response.recursion_level + 1; | ||
|
||
if let Some(depth) = cfg.depth { | ||
if recursion_level > depth { | ||
// Maximum recursion depth reached; stop link checking. | ||
return Ok(0); | ||
} | ||
} | ||
|
||
if !response.status.is_success() { | ||
return Ok(0); | ||
} | ||
if cache.contains(response.uri.as_str()) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Nitpick: I'm a bit skeptical of the name There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. How about |
||
return Ok(0); | ||
} | ||
cache.insert(response.uri.to_string()); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Since we're allowing infinite recursion, this data structure can really take up the entire memory with sufficient number of links. Depending on the expected maximum number of links checked in recursive mode, we might consider a bloom filter here:
Super-rough calculations below. (Note that I'm just calculating the size of this one structure, there's probably going to be more memory overheads from various bits of the program). Assumptions:
Calculations:
So, HashSet with 1 million unique links will take up ~64+24+1 = ~89 MB (~85 MiB) For comparison, a Bloom Filter with ~1e-7 false positive probability is expected to take up:
So, to be honest, HashSet should be absolutely fine for the most users. But we should make a remark in the README that memory usage when using recursive mode with tens millions of links might cause RAM usage issues. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Hi, would it be merged soon? and Docker image updated with the change to make GH action use the latest? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. |
||
|
||
if let lychee::Uri::Website(url) = response.uri { | ||
let input = collector::Input::RemoteUrl(url.clone()); | ||
|
||
// Check domain against known domains | ||
// If no domain is given, it might be a local link (e.g. 127.0.0.1), | ||
// which we accept | ||
if let Some(domain) = url.domain() { | ||
if !input_domains.contains(domain) { | ||
return Ok(0); | ||
} | ||
} | ||
|
||
let links = collector::collect_links( | ||
&[input], | ||
cfg.base_url.clone(), | ||
cfg.skip_missing, | ||
cfg.max_concurrency, | ||
) | ||
.await?; | ||
let count = links.len(); | ||
|
||
let bar = pb.clone(); | ||
tokio::spawn(async move { | ||
for mut link in links { | ||
link.recursion_level = recursion_level; | ||
if let Some(pb) = &bar { | ||
pb.inc_length(1); | ||
pb.set_message(&link.to_string()); | ||
}; | ||
send_req.send(link).await.unwrap(); | ||
} | ||
}); | ||
return Ok(count); | ||
}; | ||
Ok(0) | ||
} | ||
|
||
fn read_header(input: &str) -> Result<(String, String)> { | ||
let elements: Vec<_> = input.split('=').collect(); | ||
if elements.len() != 2 { | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -245,6 +245,15 @@ pub struct Config { | |
#[structopt(short, long, default_value = "string")] | ||
#[serde(default)] | ||
pub format: Format, | ||
|
||
/// Enable recursion (make sub-requests for detected links) | ||
#[structopt(short, long)] | ||
#[serde(default)] | ||
pub recursive: bool, | ||
|
||
/// Stop link checking beyond this maximum recursion depth. (Recommended for large inputs.) | ||
#[structopt(long)] | ||
pub depth: Option<usize>, | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'd consider allowing this to be signed (e.g. |
||
} | ||
|
||
impl Config { | ||
|
@@ -299,6 +308,7 @@ impl Config { | |
skip_missing: false; | ||
glob_ignore_case: false; | ||
output: None; | ||
recursive: false; | ||
} | ||
} | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -149,16 +149,19 @@ mod test_super { | |
uri: website("http://example.org/ok"), | ||
status: Status::Ok(http::StatusCode::OK), | ||
source: Input::Stdin, | ||
recursion_level: 0, | ||
}); | ||
stats.add(Response { | ||
uri: website("http://example.org/failed"), | ||
status: Status::Failed(http::StatusCode::BAD_GATEWAY), | ||
source: Input::Stdin, | ||
recursion_level: 0, | ||
}); | ||
stats.add(Response { | ||
uri: website("http://example.org/redirect"), | ||
status: Status::Redirected(http::StatusCode::PERMANENT_REDIRECT), | ||
source: Input::Stdin, | ||
recursion_level: 0, | ||
}); | ||
let mut expected_map = HashMap::new(); | ||
expected_map.insert( | ||
|
@@ -168,11 +171,13 @@ mod test_super { | |
uri: website("http://example.org/failed"), | ||
status: Status::Failed(http::StatusCode::BAD_GATEWAY), | ||
source: Input::Stdin, | ||
recursion_level: 0, | ||
}, | ||
Response { | ||
uri: website("http://example.org/redirect"), | ||
status: Status::Redirected(http::StatusCode::PERMANENT_REDIRECT), | ||
source: Input::Stdin, | ||
recursion_level: 0, | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. For convenience and to cover the "most common case", it might be worth considering adding an alternative constructor for Or even have |
||
}, | ||
] | ||
.into_iter() | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I believe we should mention that depth less than zero also means infinite.