From 3239f6181acce4fb46599909ce09ffbbefcc2155 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pawe=C5=82=20Romanowski?= Date: Wed, 25 Nov 2020 19:44:19 +0100 Subject: [PATCH 01/12] Add skip_missing flag, add Input enum --- src/extract.rs | 15 +++++++ src/main.rs | 5 ++- src/options.rs | 13 ++++-- src/types.rs | 118 ++++++++++++++++++++++++++++++++++++++++++++++++- 4 files changed, 145 insertions(+), 6 deletions(-) diff --git a/src/extract.rs b/src/extract.rs index feaa8dd946..4814b8f778 100644 --- a/src/extract.rs +++ b/src/extract.rs @@ -13,6 +13,21 @@ pub(crate) enum FileType { Plaintext, } +impl> From

for FileType { + /// Detect if the given path points to a Markdown, HTML, or plaintext file. + fn from(p: P) -> FileType { + let path = p.as_ref(); + match path.extension() { + Some(ext) => match ext.to_str().unwrap() { + "md" => FileType::Markdown, + "html" | "htm" => FileType::HTML, + _ => FileType::Plaintext, + }, + None => FileType::Plaintext, + } + } +} + // Use LinkFinder here to offload the actual link searching fn find_links(input: &str) -> Vec { let finder = LinkFinder::new(); diff --git a/src/main.rs b/src/main.rs index 66ac79bf42..c0d1fe4836 100644 --- a/src/main.rs +++ b/src/main.rs @@ -59,7 +59,10 @@ fn main() -> Result<()> { } None => tokio::runtime::Runtime::new()?, }; - let errorcode = runtime.block_on(run(cfg, opts.inputs))?; + let errorcode = runtime.block_on(run( + cfg, + opts.inputs.iter().map(|i| i.to_string()).collect(), + ))?; std::process::exit(errorcode); } diff --git a/src/options.rs b/src/options.rs index dece34f918..8f860aeba3 100644 --- a/src/options.rs +++ b/src/options.rs @@ -1,7 +1,9 @@ +use crate::types::Input; use anyhow::{Error, Result}; use serde::Deserialize; use std::{fs, io::ErrorKind}; use structopt::{clap::crate_version, StructOpt}; +use url::Url; pub(crate) const USER_AGENT: &str = concat!("lychee/", crate_version!()); const METHOD: &str = "get"; @@ -33,9 +35,9 @@ macro_rules! fold_in { #[derive(Debug, StructOpt)] #[structopt(name = "lychee", about = "A glorious link checker")] pub(crate) struct LycheeOptions { - /// Input files - #[structopt(default_value = "README.md")] - pub inputs: Vec, + /// TODO: Inputs + #[structopt(default_value = "README.md", parse(from_str = Input::from))] + pub inputs: Vec, /// Configuration file to use #[structopt(short, long = "config", default_value = "./lychee.toml")] @@ -52,6 +54,11 @@ pub struct Config { #[serde(default)] pub verbose: bool, + /// TODO: Skip missing input files + #[structopt(long)] + #[serde(default)] + pub skip_missing: bool, + /// Show progress #[structopt(short, long)] #[serde(default)] diff --git a/src/types.rs b/src/types.rs index d2e86e84c2..b4f939b31b 100644 --- a/src/types.rs +++ b/src/types.rs @@ -1,10 +1,17 @@ +use crate::extract::FileType; use crate::options::Config; -use anyhow::anyhow; +use anyhow::{anyhow, Result}; +use glob::glob; use regex::RegexSet; use std::net::IpAddr; +use std::path::{Path, PathBuf}; use std::{collections::HashSet, convert::TryFrom, fmt::Display}; +use tokio::fs::read_to_string; +use tokio::io::{stdin, AsyncReadExt}; use url::Url; +const STDIN: &str = "-"; + #[derive(Clone, Debug, PartialEq, Eq, Hash)] pub enum Uri { Website(Url), @@ -119,7 +126,7 @@ impl From for Status { } /// Exclude configuration for the link checker. -/// You can ignore links based on +/// You can ignore links based on regex patterns or pre-defined IP ranges. #[derive(Clone, Debug)] pub struct Excludes { pub regex: Option, @@ -158,6 +165,113 @@ impl Default for Excludes { } } +#[derive(Debug)] +#[non_exhaustive] +pub(crate) enum Input { + RemoteUrl(Url), + FsGlob(String), + FsPath(PathBuf), + Stdin, +} + +impl ToString for Input { + fn to_string(&self) -> String { + match self { + Self::RemoteUrl(url) => url.to_string(), + Self::FsGlob(s) => s.clone(), + Self::FsPath(p) => p.to_str().unwrap_or_default().to_owned(), + Self::Stdin => STDIN.to_owned(), + } + } +} + +#[derive(Debug)] +pub(crate) struct InputContent { + input: Input, + file_type: FileType, + content: String, +} + +impl From<&str> for Input { + fn from(value: &str) -> Self { + if value == STDIN { + Self::Stdin + } else { + match Url::parse(&value) { + Ok(url) => Self::RemoteUrl(url), + Err(_) => Self::FsGlob(value.to_owned()), + } + } + } +} + +impl Input { + async fn get_contents(self) -> Result> { + use Input::*; + + let contents = match self { + RemoteUrl(url) => vec![Self::url_contents(url).await?], + FsGlob(path_glob) => Self::glob_contents(path_glob).await?, + FsPath(path) => vec![Self::path_content(&path).await?], + Stdin => vec![Self::stdin_content().await?], + }; + + Ok(contents) + } + + async fn url_contents(url: Url) -> Result { + let res = reqwest::get(url.clone()).await?; + let content = res.text().await?; + let input_content = InputContent { + file_type: FileType::from(&url.as_str()), + input: Input::RemoteUrl(url), + content, + }; + + Ok(input_content) + } + + async fn glob_contents(path_glob: String) -> Result> { + let mut contents = vec![]; + + for entry in glob(&path_glob)? { + match entry { + Ok(path) => { + let content = Self::path_content(&path).await?; + contents.push(content); + } + Err(e) => println!("{:?}", e), + } + } + + Ok(contents) + } + + async fn path_content + AsRef>(path: P) -> Result { + let input_content = InputContent { + file_type: FileType::from(path.as_ref()), + content: read_to_string(&path).await?, + input: Input::FsPath(path.into()), + }; + + Ok(input_content) + } + + async fn stdin_content() -> Result { + let mut content = String::new(); + let mut stdin = stdin(); + stdin.read_to_string(&mut content).await?; + + let input_content = InputContent { + input: Input::Stdin, + content, + file_type: FileType::Plaintext, + }; + + Ok(input_content) + } +} + #[cfg(test)] mod test { use super::*; From 61b2dbf666cb451602d07c4b2d54683f8a25f1b7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pawe=C5=82=20Romanowski?= Date: Wed, 25 Nov 2020 19:59:39 +0100 Subject: [PATCH 02/12] Unify help texts in options --- src/options.rs | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/src/options.rs b/src/options.rs index 8f860aeba3..1bc473dcd9 100644 --- a/src/options.rs +++ b/src/options.rs @@ -35,7 +35,9 @@ macro_rules! fold_in { #[derive(Debug, StructOpt)] #[structopt(name = "lychee", about = "A glorious link checker")] pub(crate) struct LycheeOptions { - /// TODO: Inputs + /// The inputs (where to get links to check from). + /// These can be: files (e.g. `README.md`), file globs (e.g. `"~/git/*/README.md"`), + /// remote URLs (e.g. `https://example.com/README.md`) or standard input (`-`). #[structopt(default_value = "README.md", parse(from_str = Input::from))] pub inputs: Vec, @@ -54,7 +56,7 @@ pub struct Config { #[serde(default)] pub verbose: bool, - /// TODO: Skip missing input files + /// Skip missing input files (default is to error if they don't exist) #[structopt(long)] #[serde(default)] pub skip_missing: bool, @@ -147,19 +149,18 @@ pub struct Config { #[serde(default = "method")] pub method: String, - #[structopt(short, long, help = "Base URL to check relative URls")] + /// Base URL to check relative URLs + #[structopt(short, long)] #[serde(default)] pub base_url: Option, - #[structopt(long, help = "Basic authentication support. Ex 'username:password'")] + /// Basic authentication support. E.g. `username:password` + #[structopt(long)] #[serde(default)] pub basic_auth: Option, - #[structopt( - long, - help = "GitHub API token to use when checking github.com links, to avoid rate limiting", - env = "GITHUB_TOKEN" - )] + /// GitHub API token to use when checking github.com links, to avoid rate limiting + #[structopt(long, env = "GITHUB_TOKEN")] #[serde(default)] pub github_token: Option, } From 808ec201b89c866d907b0ebddb41921348e79cb2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pawe=C5=82=20Romanowski?= Date: Wed, 25 Nov 2020 20:02:50 +0100 Subject: [PATCH 03/12] Include project home page in help text --- src/options.rs | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/options.rs b/src/options.rs index 1bc473dcd9..604081dbdc 100644 --- a/src/options.rs +++ b/src/options.rs @@ -33,7 +33,10 @@ macro_rules! fold_in { } #[derive(Debug, StructOpt)] -#[structopt(name = "lychee", about = "A glorious link checker")] +#[structopt( + name = "lychee", + about = "A glorious link checker.\n\nProject home page: https://github.com/lycheeverse/lychee" +)] pub(crate) struct LycheeOptions { /// The inputs (where to get links to check from). /// These can be: files (e.g. `README.md`), file globs (e.g. `"~/git/*/README.md"`), From c40be9cd89aa58536d5a89b5a312d2e430324016 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pawe=C5=82=20Romanowski?= Date: Sun, 29 Nov 2020 15:39:57 +0100 Subject: [PATCH 04/12] Overhaul input handling --- Cargo.lock | 77 +++++++++++++++++++ Cargo.toml | 1 + README.md | 26 ++++++- src/collector.rs | 196 +++++++++++++++++++++++++++++++++++++---------- src/extract.rs | 35 +++++---- src/main.rs | 28 +++---- src/options.rs | 42 +++++++--- src/types.rs | 114 --------------------------- 8 files changed, 318 insertions(+), 201 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index c54c696f36..78ad2e4fc3 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -119,6 +119,12 @@ version = "0.4.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4d25d88fd6b8041580a654f9d0c581a047baee2b3efee13275f2fc392fc75034" +[[package]] +name = "arrayref" +version = "0.3.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a4c527152e37cf757a3f78aae5a06fbeefdb07ccc535c980a3208ee3060dd544" + [[package]] name = "arrayvec" version = "0.5.1" @@ -476,6 +482,17 @@ version = "1.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "cf1de2fe8c75bc145a2f577add951f8134889b4795d47466a54a5c846d691693" +[[package]] +name = "blake2b_simd" +version = "0.5.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "afa748e348ad3be8263be728124b24a24f268266f6f5d58af9d75f6a40b5c587" +dependencies = [ + "arrayref", + "arrayvec", + "constant_time_eq", +] + [[package]] name = "block-buffer" version = "0.7.3" @@ -712,6 +729,12 @@ version = "0.4.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ce90df4c658c62f12d78f7508cf92f9173e5184a539c10bfe54a3107b3ffd0f2" +[[package]] +name = "constant_time_eq" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "245097e9a4535ee1e3e3931fcfcd55a796a44c643e8596ff6566d68f09b87bbc" + [[package]] name = "cookie" version = "0.14.2" @@ -943,6 +966,27 @@ dependencies = [ "generic-array 0.14.4", ] +[[package]] +name = "dirs" +version = "2.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "13aea89a5c93364a98e9b37b2fa237effbb694d5cfe01c5b70941f7eb087d5e3" +dependencies = [ + "cfg-if 0.1.10", + "dirs-sys", +] + +[[package]] +name = "dirs-sys" +version = "0.3.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8e93d7f5705de3e49895a2b5e0b8855a1c27f080192ae9c32a6432d50741a57a" +dependencies = [ + "libc", + "redox_users", + "winapi 0.3.9", +] + [[package]] name = "discard" version = "1.0.4" @@ -1799,6 +1843,7 @@ dependencies = [ "regex", "reqwest", "serde", + "shellexpand", "structopt", "tokio 0.2.22", "toml", @@ -2421,6 +2466,17 @@ version = "0.1.57" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "41cc0f7e4d5d4544e8861606a285bb08d3e70712ccc7d2b84d7c0ccfaf4b05ce" +[[package]] +name = "redox_users" +version = "0.3.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "de0737333e7a9502c789a36d7c7fa6092a49895d4faa31ca5df163857ded2e9d" +dependencies = [ + "getrandom", + "redox_syscall", + "rust-argon2", +] + [[package]] name = "regex" version = "1.4.2" @@ -2522,6 +2578,18 @@ dependencies = [ "winapi 0.3.9", ] +[[package]] +name = "rust-argon2" +version = "0.8.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4b18820d944b33caa75a71378964ac46f58517c92b6ae5f762636247c09e78fb" +dependencies = [ + "base64 0.13.0", + "blake2b_simd", + "constant_time_eq", + "crossbeam-utils 0.8.0", +] + [[package]] name = "rustc-demangle" version = "0.1.16" @@ -2698,6 +2766,15 @@ dependencies = [ "lazy_static", ] +[[package]] +name = "shellexpand" +version = "2.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9a2b22262a9aaf9464d356f656fea420634f78c881c5eebd5ef5e66d8b9bc603" +dependencies = [ + "dirs", +] + [[package]] name = "signal-hook-registry" version = "1.2.1" diff --git a/Cargo.toml b/Cargo.toml index 6ce36f5a4b..5074044a77 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -30,6 +30,7 @@ quick-xml = "0.20.0" headers = "0.3.2" derive_builder = "0.9.0" deadpool = "0.6.0" +shellexpand = "2.0" [dependencies.reqwest] features = ["gzip"] diff --git a/README.md b/README.md index 96b3fc751c..6aa60b39b7 100644 --- a/README.md +++ b/README.md @@ -80,10 +80,32 @@ cargo install lychee ## Usage -Run it inside a repository with a `README.md`, or specify a file with +Run it inside a repository with a `README.md`: ``` -lychee +lychee +``` + +You can also specify various types of inputs: + +``` +# check links on a website: +lychee https://endler.dev/ + +# check links in a remote file: +lychee https://raw.githubusercontent.com/lycheeverse/lychee/master/README.md + +# check links in local file(s): +lychee README.md +lychee test.html info.txt + +# check links in local files (by shell glob): +lychee ~/projects/*/README.md + +# check links in local files (lychee supports advanced globbing and ~ expansion): +lychee "~/projects/big_project/**/README.*" +# ignore case when globbing, displaying progress and check result for each link: +lychee --glob-ignore-case --progress --verbose "~/projects/**/[r]eadme.*" ``` Optional (to avoid getting rate-limited): set an environment variable with your Github token diff --git a/src/collector.rs b/src/collector.rs index 0542c6fbc3..d819c2b3ce 100644 --- a/src/collector.rs +++ b/src/collector.rs @@ -1,24 +1,164 @@ use crate::extract::{extract_links, FileType}; use crate::types::Uri; use anyhow::Result; -use glob::glob; +use glob::glob_with; use reqwest::Url; -use std::{collections::HashSet, fs}; -use std::{ffi::OsStr, path::Path}; +use shellexpand::tilde; +use std::collections::HashSet; +use std::path::Path; +use std::path::PathBuf; +use tokio::fs::read_to_string; +use tokio::io::{stdin, AsyncReadExt}; -/// Detect if the given path points to a Markdown, HTML, or plaintext file. -fn resolve_file_type_by_path>(path: P) -> FileType { - match path.as_ref().extension().and_then(OsStr::to_str) { - Some("md") => FileType::Markdown, - Some("html") => FileType::HTML, - _ => FileType::Plaintext, +const STDIN: &str = "-"; + +#[derive(Debug)] +#[non_exhaustive] +pub(crate) enum Input { + RemoteUrl(Url), + FsGlob { pattern: String, ignore_case: bool }, + FsPath(PathBuf), + Stdin, + String(String), +} + +#[derive(Debug)] +pub(crate) struct InputContent { + pub input: Input, + pub file_type: FileType, + pub content: String, +} + +impl InputContent { + pub fn from_string(s: &str, file_type: FileType) -> Self { + // TODO: consider using Cow (to avoid one .clone() for String types) + Self { + input: Input::String(s.to_owned()), + file_type, + content: s.to_owned(), + } } } -/// Fetch all unique links from a vector of inputs +impl Input { + pub(crate) fn new(value: &str, glob_ignore_case: bool) -> Self { + if value == STDIN { + Self::Stdin + } else { + match Url::parse(&value) { + Ok(url) => Self::RemoteUrl(url), + // we assume that it's cheaper to just do the globbing, without + // checking if the `value` actually is a glob pattern + Err(_) => Self::FsGlob { + pattern: value.to_owned(), + ignore_case: glob_ignore_case, + }, + } + } + } + + pub async fn get_contents( + &self, + file_type_hint: Option, + ) -> Result> { + use Input::*; + + let contents = match self { + RemoteUrl(url) => vec![Self::url_contents(url).await?], + FsGlob { + pattern, + ignore_case, + } => Self::glob_contents(pattern, *ignore_case).await?, + FsPath(path) => vec![Self::path_content(&path).await?], + Stdin => vec![Self::stdin_content(file_type_hint).await?], + String(s) => vec![Self::string_content(s, file_type_hint)?], + }; + + Ok(contents) + } + + async fn url_contents(url: &Url) -> Result { + let res = reqwest::get(url.clone()).await?; + let content = res.text().await?; + let input_content = InputContent { + input: Input::RemoteUrl(url.clone()), + file_type: FileType::from(url.as_str()), + content, + }; + + Ok(input_content) + } + + async fn glob_contents(path_glob: &str, ignore_case: bool) -> Result> { + let mut contents = vec![]; + let glob_expanded = tilde(&path_glob); + let mut match_opts = glob::MatchOptions::new(); + + match_opts.case_sensitive = !ignore_case; + + println!("GLOB {:?} ignore case {:?}", path_glob, ignore_case); + + for entry in glob_with(&glob_expanded, match_opts)? { + match entry { + Ok(path) => { + let content = Self::path_content(&path).await?; + contents.push(content); + } + Err(e) => println!("{:?}", e), + } + } + + Ok(contents) + } + + async fn path_content + AsRef>(path: P) -> Result { + let input_content = InputContent { + file_type: FileType::from(path.as_ref()), + content: read_to_string(&path).await?, + input: Input::FsPath(path.into()), + }; + + Ok(input_content) + } + + async fn stdin_content(file_type_hint: Option) -> Result { + let mut content = String::new(); + let mut stdin = stdin(); + stdin.read_to_string(&mut content).await?; + + let input_content = InputContent { + input: Input::Stdin, + file_type: file_type_hint.unwrap_or_default(), + content, + }; + + Ok(input_content) + } + + fn string_content(s: &str, file_type_hint: Option) -> Result { + Ok(InputContent::from_string( + s, + file_type_hint.unwrap_or_default(), + )) + } +} + +impl ToString for Input { + fn to_string(&self) -> String { + match self { + Self::RemoteUrl(url) => url.to_string(), + Self::FsGlob { pattern, .. } => pattern.clone(), + Self::FsPath(p) => p.to_str().unwrap_or_default().to_owned(), + Self::Stdin => STDIN.to_owned(), + Self::String(s) => s.clone(), + } + } +} + +/// Fetch all unique links from a slice of inputs /// All relative URLs get prefixed with `base_url` if given. pub(crate) async fn collect_links( - inputs: Vec, + inputs: &[Input], base_url: Option, ) -> Result> { let base_url = match base_url { @@ -29,35 +169,11 @@ pub(crate) async fn collect_links( let mut links = HashSet::new(); for input in inputs { - match Url::parse(&input) { - Ok(url) => { - let path = String::from(url.path()); - let res = reqwest::get(url).await?; - let content = res.text().await?; - - links.extend(extract_links( - resolve_file_type_by_path(path), - &content, - base_url.clone(), - )); - } - Err(_) => { - // Assume we got a single file or a glob on our hands - for entry in glob(&input)? { - match entry { - Ok(path) => { - let content = fs::read_to_string(&path)?; - links.extend(extract_links( - resolve_file_type_by_path(&path), - &content, - base_url.clone(), - )); - } - Err(e) => println!("Error handling file pattern {}: {:?}", input, e), - } - } - } - }; + let input_contents = input.get_contents(None).await?; + + for input_content in input_contents { + links.extend(extract_links(&input_content, base_url.clone())); + } } Ok(links) } diff --git a/src/extract.rs b/src/extract.rs index 4814b8f778..7a6c14b0a5 100644 --- a/src/extract.rs +++ b/src/extract.rs @@ -1,3 +1,4 @@ +use crate::collector::InputContent; use crate::types::Uri; use linkify::LinkFinder; use pulldown_cmark::{Event as MDEvent, Parser, Tag}; @@ -13,12 +14,18 @@ pub(crate) enum FileType { Plaintext, } +impl Default for FileType { + fn default() -> Self { + Self::Plaintext + } +} + impl> From

for FileType { /// Detect if the given path points to a Markdown, HTML, or plaintext file. fn from(p: P) -> FileType { let path = p.as_ref(); match path.extension() { - Some(ext) => match ext.to_str().unwrap() { + Some(ext) => match ext.to_str().expect("Couldn't convert OsStr to str") { "md" => FileType::Markdown, "html" | "htm" => FileType::HTML, _ => FileType::Plaintext, @@ -120,15 +127,11 @@ fn extract_links_from_plaintext(input: &str) -> Vec { .collect() } -pub(crate) fn extract_links( - file_type: FileType, - input: &str, - base_url: Option, -) -> HashSet { - let links = match file_type { - FileType::Markdown => extract_links_from_markdown(input), - FileType::HTML => extract_links_from_html(input), - FileType::Plaintext => extract_links_from_plaintext(input), +pub(crate) fn extract_links(input_content: &InputContent, base_url: Option) -> HashSet { + let links = match input_content.file_type { + FileType::Markdown => extract_links_from_markdown(&input_content.content), + FileType::HTML => extract_links_from_html(&input_content.content), + FileType::Plaintext => extract_links_from_plaintext(&input_content.content), }; // Only keep legit URLs. This sorts out things like anchors. @@ -164,8 +167,7 @@ mod test { fn test_extract_markdown_links() { let input = "This is [a test](https://endler.dev). This is a relative link test [Relative Link Test](relative_link)"; let links = extract_links( - FileType::Markdown, - input, + &InputContent::from_string(input, FileType::Markdown), Some(Url::parse("https://github.com/hello-rust/lychee/").unwrap()), ); assert_eq!( @@ -193,8 +195,7 @@ mod test { "#; let links = extract_links( - FileType::HTML, - input, + &InputContent::from_string(input, FileType::HTML), Some(Url::parse("https://github.com/hello-rust/").unwrap()), ); @@ -211,14 +212,14 @@ mod test { #[test] fn test_skip_markdown_anchors() { let input = "This is [a test](#lol)."; - let links = extract_links(FileType::Markdown, input, None); + let links = extract_links(&InputContent::from_string(input, FileType::Markdown), None); assert_eq!(links, HashSet::new()) } #[test] fn test_skip_markdown_internal_urls() { let input = "This is [a test](./internal)."; - let links = extract_links(FileType::Markdown, input, None); + let links = extract_links(&InputContent::from_string(input, FileType::Markdown), None); assert_eq!(links, HashSet::new()) } @@ -226,7 +227,7 @@ mod test { fn test_non_markdown_links() { let input = "https://endler.dev and https://hello-rust.show/foo/bar?lol=1 at test@example.com"; - let links = extract_links(FileType::Plaintext, input, None); + let links = extract_links(&InputContent::from_string(input, FileType::Plaintext), None); let expected = HashSet::from_iter( [ Uri::Website(Url::parse("https://endler.dev").unwrap()), diff --git a/src/main.rs b/src/main.rs index c0d1fe4836..8f13b2f6b0 100644 --- a/src/main.rs +++ b/src/main.rs @@ -21,10 +21,10 @@ mod types; use client::ClientBuilder; use client_pool::ClientPool; +use collector::Input; use options::{Config, LycheeOptions}; use stats::ResponseStats; -use types::Response; -use types::{Excludes, Status}; +use types::{Excludes, Response, Status}; /// A C-like enum that can be cast to `i32` and used as process exit code. enum ExitCode { @@ -39,14 +39,13 @@ enum ExitCode { fn main() -> Result<()> { pretty_env_logger::init(); - let opts = LycheeOptions::from_args(); + let mut opts = LycheeOptions::from_args(); // Load a potentially existing config file and merge it into the config from the CLI - let cfg = if let Some(c) = Config::load_from_file(&opts.config_file)? { + if let Some(c) = Config::load_from_file(&opts.config_file)? { opts.config.merge(c) - } else { - opts.config - }; + } + let cfg = &opts.config; let mut runtime = match cfg.threads { Some(threads) => { @@ -59,10 +58,7 @@ fn main() -> Result<()> { } None => tokio::runtime::Runtime::new()?, }; - let errorcode = runtime.block_on(run( - cfg, - opts.inputs.iter().map(|i| i.to_string()).collect(), - ))?; + let errorcode = runtime.block_on(run(cfg, opts.inputs()))?; std::process::exit(errorcode); } @@ -79,7 +75,7 @@ fn show_progress(progress_bar: &Option, response: &Response, verbos }; } -async fn run(cfg: Config, inputs: Vec) -> Result { +async fn run(cfg: &Config, inputs: Vec) -> Result { let mut headers = parse_headers(&cfg.headers)?; if let Some(auth) = &cfg.basic_auth { let auth_header = parse_basic_auth(&auth)?; @@ -97,18 +93,18 @@ async fn run(cfg: Config, inputs: Vec) -> Result { .includes(includes) .excludes(excludes) .max_redirects(cfg.max_redirects) - .user_agent(cfg.user_agent) + .user_agent(cfg.user_agent.clone()) .allow_insecure(cfg.insecure) .custom_headers(headers) .method(method) .timeout(timeout) .verbose(cfg.verbose) - .github_token(cfg.github_token) - .scheme(cfg.scheme) + .github_token(cfg.github_token.clone()) + .scheme(cfg.scheme.clone()) .accepted(accepted) .build()?; - let links = collector::collect_links(inputs, cfg.base_url.clone()).await?; + let links = collector::collect_links(&inputs, cfg.base_url.clone()).await?; let pb = if cfg.progress { Some( ProgressBar::new(links.len() as u64) diff --git a/src/options.rs b/src/options.rs index 604081dbdc..716209f1ca 100644 --- a/src/options.rs +++ b/src/options.rs @@ -1,9 +1,9 @@ -use crate::types::Input; +use crate::collector::Input; + use anyhow::{Error, Result}; use serde::Deserialize; use std::{fs, io::ErrorKind}; use structopt::{clap::crate_version, StructOpt}; -use url::Url; pub(crate) const USER_AGENT: &str = concat!("lychee/", crate_version!()); const METHOD: &str = "get"; @@ -41,8 +41,8 @@ pub(crate) struct LycheeOptions { /// The inputs (where to get links to check from). /// These can be: files (e.g. `README.md`), file globs (e.g. `"~/git/*/README.md"`), /// remote URLs (e.g. `https://example.com/README.md`) or standard input (`-`). - #[structopt(default_value = "README.md", parse(from_str = Input::from))] - pub inputs: Vec, + #[structopt(name = "inputs", default_value = "README.md")] + raw_inputs: Vec, /// Configuration file to use #[structopt(short, long = "config", default_value = "./lychee.toml")] @@ -52,6 +52,19 @@ pub(crate) struct LycheeOptions { pub config: Config, } +impl LycheeOptions { + // This depends on config, which is why a method is required (we could + // accept a `Vec` in `LycheeOptions` and do the conversion there, + // but we'd get no access to `glob_ignore_case`. + /// Get parsed inputs from options. + pub(crate) fn inputs(&self) -> Vec { + self.raw_inputs + .iter() + .map(|s| Input::new(s, self.config.glob_ignore_case)) + .collect() + } +} + #[derive(Debug, Deserialize, StructOpt)] pub struct Config { /// Verbose program output @@ -59,11 +72,6 @@ pub struct Config { #[serde(default)] pub verbose: bool, - /// Skip missing input files (default is to error if they don't exist) - #[structopt(long)] - #[serde(default)] - pub skip_missing: bool, - /// Show progress #[structopt(short, long)] #[serde(default)] @@ -166,6 +174,16 @@ pub struct Config { #[structopt(long, env = "GITHUB_TOKEN")] #[serde(default)] pub github_token: Option, + + /// Skip missing input files (default is to error if they don't exist) + #[structopt(long)] + #[serde(default)] + pub skip_missing: bool, + + /// Ignore case when expanding filesystem path glob inputs + #[structopt(long)] + #[serde(default)] + pub glob_ignore_case: bool, } impl Config { @@ -189,7 +207,7 @@ impl Config { } /// Merge the configuration from TOML into the CLI configuration - pub(crate) fn merge(mut self, toml: Config) -> Config { + pub(crate) fn merge(&mut self, toml: Config) { fold_in! { // Destination and source configs self, toml; @@ -216,9 +234,9 @@ impl Config { base_url: None; basic_auth: None; github_token: None; + skip_missing: false; + glob_ignore_case: false; } - - self } } diff --git a/src/types.rs b/src/types.rs index b4f939b31b..32c3417e80 100644 --- a/src/types.rs +++ b/src/types.rs @@ -1,17 +1,10 @@ -use crate::extract::FileType; use crate::options::Config; use anyhow::{anyhow, Result}; -use glob::glob; use regex::RegexSet; use std::net::IpAddr; -use std::path::{Path, PathBuf}; use std::{collections::HashSet, convert::TryFrom, fmt::Display}; -use tokio::fs::read_to_string; -use tokio::io::{stdin, AsyncReadExt}; use url::Url; -const STDIN: &str = "-"; - #[derive(Clone, Debug, PartialEq, Eq, Hash)] pub enum Uri { Website(Url), @@ -165,113 +158,6 @@ impl Default for Excludes { } } -#[derive(Debug)] -#[non_exhaustive] -pub(crate) enum Input { - RemoteUrl(Url), - FsGlob(String), - FsPath(PathBuf), - Stdin, -} - -impl ToString for Input { - fn to_string(&self) -> String { - match self { - Self::RemoteUrl(url) => url.to_string(), - Self::FsGlob(s) => s.clone(), - Self::FsPath(p) => p.to_str().unwrap_or_default().to_owned(), - Self::Stdin => STDIN.to_owned(), - } - } -} - -#[derive(Debug)] -pub(crate) struct InputContent { - input: Input, - file_type: FileType, - content: String, -} - -impl From<&str> for Input { - fn from(value: &str) -> Self { - if value == STDIN { - Self::Stdin - } else { - match Url::parse(&value) { - Ok(url) => Self::RemoteUrl(url), - Err(_) => Self::FsGlob(value.to_owned()), - } - } - } -} - -impl Input { - async fn get_contents(self) -> Result> { - use Input::*; - - let contents = match self { - RemoteUrl(url) => vec![Self::url_contents(url).await?], - FsGlob(path_glob) => Self::glob_contents(path_glob).await?, - FsPath(path) => vec![Self::path_content(&path).await?], - Stdin => vec![Self::stdin_content().await?], - }; - - Ok(contents) - } - - async fn url_contents(url: Url) -> Result { - let res = reqwest::get(url.clone()).await?; - let content = res.text().await?; - let input_content = InputContent { - file_type: FileType::from(&url.as_str()), - input: Input::RemoteUrl(url), - content, - }; - - Ok(input_content) - } - - async fn glob_contents(path_glob: String) -> Result> { - let mut contents = vec![]; - - for entry in glob(&path_glob)? { - match entry { - Ok(path) => { - let content = Self::path_content(&path).await?; - contents.push(content); - } - Err(e) => println!("{:?}", e), - } - } - - Ok(contents) - } - - async fn path_content + AsRef>(path: P) -> Result { - let input_content = InputContent { - file_type: FileType::from(path.as_ref()), - content: read_to_string(&path).await?, - input: Input::FsPath(path.into()), - }; - - Ok(input_content) - } - - async fn stdin_content() -> Result { - let mut content = String::new(); - let mut stdin = stdin(); - stdin.read_to_string(&mut content).await?; - - let input_content = InputContent { - input: Input::Stdin, - content, - file_type: FileType::Plaintext, - }; - - Ok(input_content) - } -} - #[cfg(test)] mod test { use super::*; From 2e60d858ee59705af6e23629b6300dbe43367619 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pawe=C5=82=20Romanowski?= Date: Sun, 29 Nov 2020 19:36:54 +0100 Subject: [PATCH 05/12] Parallelize input parsing and link collection --- src/collector.rs | 45 ++++++++++++++++++++++++++++++++++++--------- 1 file changed, 36 insertions(+), 9 deletions(-) diff --git a/src/collector.rs b/src/collector.rs index d819c2b3ce..f4d5510ea4 100644 --- a/src/collector.rs +++ b/src/collector.rs @@ -12,7 +12,7 @@ use tokio::io::{stdin, AsyncReadExt}; const STDIN: &str = "-"; -#[derive(Debug)] +#[derive(Debug, Clone)] #[non_exhaustive] pub(crate) enum Input { RemoteUrl(Url), @@ -96,8 +96,6 @@ impl Input { match_opts.case_sensitive = !ignore_case; - println!("GLOB {:?} ignore case {:?}", path_glob, ignore_case); - for entry in glob_with(&glob_expanded, match_opts)? { match entry { Ok(path) => { @@ -166,14 +164,43 @@ pub(crate) async fn collect_links( _ => None, }; - let mut links = HashSet::new(); + let (contents_tx, mut contents_rx) = tokio::sync::mpsc::channel(32); + + // extract input contents + for input in inputs.iter().cloned() { + let mut sender = contents_tx.clone(); + + tokio::spawn(async move { + let contents = input.get_contents(None).await; + sender.send(contents).await + }); + } + + // receiver will get None once all tasks are done + drop(contents_tx); - for input in inputs { - let input_contents = input.get_contents(None).await?; + // extract links from input contents + let mut extract_links_handles = vec![]; - for input_content in input_contents { - links.extend(extract_links(&input_content, base_url.clone())); + while let Some(result) = contents_rx.recv().await { + for input_content in result? { + let base_url = base_url.clone(); + let handle = + tokio::task::spawn_blocking(move || extract_links(&input_content, base_url)); + extract_links_handles.push(handle); } } - Ok(links) + + // Note: we could dispatch links to be checked as soon as we get them, + // instead of building a HashSet will all links. + // This optimization would speed up cases where there's + // a lot of inputs and/or the inputs are large (e.g. big files). + let mut collected_links = HashSet::new(); + + for handle in extract_links_handles { + let links = handle.await?; + collected_links.extend(links); + } + + Ok(collected_links) } From 7d64c536e67277cf09a2a4ed000ab9866afb3799 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pawe=C5=82=20Romanowski?= Date: Mon, 30 Nov 2020 21:07:44 +0100 Subject: [PATCH 06/12] Properly handle skip_missing flag --- src/collector.rs | 53 ++++++++++++++++++++++++++++++++---------------- src/main.rs | 2 +- 2 files changed, 37 insertions(+), 18 deletions(-) diff --git a/src/collector.rs b/src/collector.rs index f4d5510ea4..7102422a66 100644 --- a/src/collector.rs +++ b/src/collector.rs @@ -1,6 +1,6 @@ use crate::extract::{extract_links, FileType}; use crate::types::Uri; -use anyhow::Result; +use anyhow::{anyhow, Context, Result}; use glob::glob_with; use reqwest::Url; use shellexpand::tilde; @@ -47,12 +47,19 @@ impl Input { } else { match Url::parse(&value) { Ok(url) => Self::RemoteUrl(url), - // we assume that it's cheaper to just do the globbing, without - // checking if the `value` actually is a glob pattern - Err(_) => Self::FsGlob { - pattern: value.to_owned(), - ignore_case: glob_ignore_case, - }, + Err(_) => { + // this seems to be the only way to determine if this is a glob pattern + let is_glob = glob::Pattern::escape(value) != value; + + if is_glob { + Self::FsGlob { + pattern: value.to_owned(), + ignore_case: glob_ignore_case, + } + } else { + Self::FsPath(value.into()) + } + } } } } @@ -60,21 +67,32 @@ impl Input { pub async fn get_contents( &self, file_type_hint: Option, + skip_missing: bool, ) -> Result> { use Input::*; - let contents = match self { - RemoteUrl(url) => vec![Self::url_contents(url).await?], + match self { + RemoteUrl(url) => Ok(vec![Self::url_contents(url).await?]), FsGlob { pattern, ignore_case, - } => Self::glob_contents(pattern, *ignore_case).await?, - FsPath(path) => vec![Self::path_content(&path).await?], - Stdin => vec![Self::stdin_content(file_type_hint).await?], - String(s) => vec![Self::string_content(s, file_type_hint)?], - }; - - Ok(contents) + } => Ok(Self::glob_contents(pattern, *ignore_case).await?), + FsPath(path) => { + let content = Self::path_content(&path).await.with_context(|| { + format!( + "Failed to read file: `{}`", + path.to_str().unwrap_or("") + ) + }); + match content { + Ok(input_content) => Ok(vec![input_content]), + Err(_) if skip_missing => Ok(vec![]), + Err(arg) => Err(anyhow!(arg)), + } + } + Stdin => Ok(vec![Self::stdin_content(file_type_hint).await?]), + String(s) => Ok(vec![Self::string_content(s, file_type_hint)?]), + } } async fn url_contents(url: &Url) -> Result { @@ -158,6 +176,7 @@ impl ToString for Input { pub(crate) async fn collect_links( inputs: &[Input], base_url: Option, + skip_missing_inputs: bool, ) -> Result> { let base_url = match base_url { Some(url) => Some(Url::parse(&url)?), @@ -171,7 +190,7 @@ pub(crate) async fn collect_links( let mut sender = contents_tx.clone(); tokio::spawn(async move { - let contents = input.get_contents(None).await; + let contents = input.get_contents(None, skip_missing_inputs).await; sender.send(contents).await }); } diff --git a/src/main.rs b/src/main.rs index 8f13b2f6b0..c45ba118d1 100644 --- a/src/main.rs +++ b/src/main.rs @@ -104,7 +104,7 @@ async fn run(cfg: &Config, inputs: Vec) -> Result { .accepted(accepted) .build()?; - let links = collector::collect_links(&inputs, cfg.base_url.clone()).await?; + let links = collector::collect_links(&inputs, cfg.base_url.clone(), cfg.skip_missing).await?; let pb = if cfg.progress { Some( ProgressBar::new(links.len() as u64) From 23fd0d8e691f63f95b0e245ac73f49c58bd903b9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pawe=C5=82=20Romanowski?= Date: Tue, 1 Dec 2020 19:29:05 +0100 Subject: [PATCH 07/12] Add integration tests for stdin, missing files and globs --- Cargo.lock | 2 + Cargo.toml | 2 + README.md | 2 +- fixtures/TEST_404.md | 3 - src/collector.rs | 3 +- tests/cli.rs | 216 ++++++++++++++++++++++++++++++++++++------- 6 files changed, 188 insertions(+), 40 deletions(-) delete mode 100644 fixtures/TEST_404.md diff --git a/Cargo.lock b/Cargo.lock index 78ad2e4fc3..fb236de91d 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1845,9 +1845,11 @@ dependencies = [ "serde", "shellexpand", "structopt", + "tempfile", "tokio 0.2.22", "toml", "url", + "uuid", "wiremock", ] diff --git a/Cargo.toml b/Cargo.toml index 5074044a77..61d68c2b76 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -44,3 +44,5 @@ version = "0.2" wiremock = "0.3" assert_cmd = "1.0" predicates = "1.0" +uuid = { version = "0.8", features = ["v4"] } +tempfile = "3.1" diff --git a/README.md b/README.md index 6aa60b39b7..80cf13f36d 100644 --- a/README.md +++ b/README.md @@ -115,7 +115,7 @@ config file. ### CLI exit codes - `0` for success (all links checked successfully or excluded/skipped as configured) -- `1` for any unexpected runtime failures or config errors +- `1` for missing inputs and any unexpected runtime failures or config errors - `2` for link check failures (if any non-excluded link failed the check) ## Troubleshooting and workarounds diff --git a/fixtures/TEST_404.md b/fixtures/TEST_404.md deleted file mode 100644 index e2b546ac63..0000000000 --- a/fixtures/TEST_404.md +++ /dev/null @@ -1,3 +0,0 @@ -Test file: this link should be a valid link but return a HTTP 404 when followed. - -http://httpbin.org/status/404 diff --git a/src/collector.rs b/src/collector.rs index 7102422a66..fd44ff2802 100644 --- a/src/collector.rs +++ b/src/collector.rs @@ -72,6 +72,7 @@ impl Input { use Input::*; match self { + // TODO: should skip_missing also affect URLs? RemoteUrl(url) => Ok(vec![Self::url_contents(url).await?]), FsGlob { pattern, @@ -211,7 +212,7 @@ pub(crate) async fn collect_links( } // Note: we could dispatch links to be checked as soon as we get them, - // instead of building a HashSet will all links. + // instead of building a HashSet with all links. // This optimization would speed up cases where there's // a lot of inputs and/or the inputs are large (e.g. big files). let mut collected_links = HashSet::new(); diff --git a/tests/cli.rs b/tests/cli.rs index 1a5f100d8a..fff27d061f 100644 --- a/tests/cli.rs +++ b/tests/cli.rs @@ -1,20 +1,43 @@ #[cfg(test)] mod cli { + use anyhow::Result; use assert_cmd::Command; + use http::StatusCode; use predicates::str::contains; - use std::path::Path; + use std::fs::File; + use std::io::Write; + use std::path::{Path, PathBuf}; + use wiremock::matchers::path; + use wiremock::{Mock, MockServer, ResponseTemplate}; + + fn main_command() -> Command { + // this gets the "main" binary name (e.g. `lychee`) + Command::cargo_bin(env!("CARGO_PKG_NAME")).expect("Couldn't get cargo package name") + } + + fn fixtures_path() -> PathBuf { + Path::new(module_path!()).parent().unwrap().join("fixtures") + } + + async fn get_mock_server(response_code: S) -> MockServer + where + S: Into, + { + let mock_server = MockServer::start().await; + + Mock::given(path("/")) + .respond_with(ResponseTemplate::new(response_code.into())) + .mount(&mock_server) + .await; + + mock_server + } #[test] fn test_exclude_all_private() { - // this gets the "main" binary name (e.g. `lychee`) - let mut cmd = - Command::cargo_bin(env!("CARGO_PKG_NAME")).expect("Couldn't get cargo package name"); + let mut cmd = main_command(); - let test_all_private_path = Path::new(module_path!()) - .parent() - .unwrap() - .join("fixtures") - .join("TEST_ALL_PRIVATE.md"); + let test_all_private_path = fixtures_path().join("TEST_ALL_PRIVATE.md"); // assert that the command runs OK, and that it excluded all the links cmd.arg("--exclude-all-private") @@ -31,14 +54,8 @@ mod cli { /// Test that a GitHub link can be checked without specifying the token. #[test] fn test_check_github_no_token() { - let mut cmd = - Command::cargo_bin(env!("CARGO_PKG_NAME")).expect("Couldn't get cargo package name"); - - let test_github_path = Path::new(module_path!()) - .parent() - .unwrap() - .join("fixtures") - .join("TEST_GITHUB.md"); + let mut cmd = main_command(); + let test_github_path = fixtures_path().join("TEST_GITHUB.md"); cmd.arg("--verbose") .arg(test_github_path) @@ -50,30 +67,27 @@ mod cli { .stdout(contains("Errors: 0")); } - #[test] - fn test_failure_404_link() { - let mut cmd = - Command::cargo_bin(env!("CARGO_PKG_NAME")).expect("Couldn't get cargo package name"); + #[tokio::test] + async fn test_failure_404_link() { + let mut cmd = main_command(); + let mock_server = get_mock_server(http::StatusCode::NOT_FOUND).await; + let dir = tempfile::tempdir().expect("Failed to create tempdir"); + let file_path = dir.path().join("test.txt"); + let mut file = File::create(&file_path).expect("Failed to create tempfile"); - let test_404_path = Path::new(module_path!()) - .parent() - .unwrap() - .join("fixtures") - .join("TEST_404.md"); + writeln!(file, "{}", mock_server.uri()).expect("Failed to write to file"); - cmd.arg(test_404_path).assert().failure().code(2); + cmd.arg(file_path) + .write_stdin(mock_server.uri()) + .assert() + .failure() + .code(2); } #[test] fn test_failure_github_404_no_token() { - let mut cmd = - Command::cargo_bin(env!("CARGO_PKG_NAME")).expect("Couldn't get cargo package name"); - - let test_github_404_path = Path::new(module_path!()) - .parent() - .unwrap() - .join("fixtures") - .join("TEST_GITHUB_404.md"); + let mut cmd = main_command(); + let test_github_404_path = fixtures_path().join("TEST_GITHUB_404.md"); cmd.arg(test_github_404_path) .env_clear() @@ -83,4 +97,136 @@ mod cli { .stdout(contains("https://github.com/mre/idiomatic-rust-doesnt-exist-man \ (GitHub token not specified. To check GitHub links reliably, use `--github-token` flag / `GITHUB_TOKEN` env var.)")); } + + #[tokio::test] + async fn test_stdin_input() { + let mut cmd = main_command(); + let mock_server = get_mock_server(http::StatusCode::OK).await; + + cmd.arg("-") + .write_stdin(mock_server.uri()) + .assert() + .success(); + } + + #[tokio::test] + async fn test_stdin_input_failure() { + let mut cmd = main_command(); + let mock_server = get_mock_server(http::StatusCode::INTERNAL_SERVER_ERROR).await; + + cmd.arg("-") + .write_stdin(mock_server.uri()) + .assert() + .failure() + .code(2); + } + + #[tokio::test] + async fn test_stdin_input_multiple() { + let mut cmd = main_command(); + let mock_server_a = get_mock_server(http::StatusCode::OK).await; + let mock_server_b = get_mock_server(http::StatusCode::OK).await; + + // this behavior (treating multiple `-` as separate inputs) is the same as most CLI tools + // that accept `-` as stdin, e.g. `cat`, `bat`, `grep` etc. + cmd.arg("-") + .arg("-") + .write_stdin(mock_server_a.uri()) + .write_stdin(mock_server_b.uri()) + .assert() + .success(); + } + + #[test] + fn test_missing_file_error() { + let mut cmd = main_command(); + let filename = format!("non-existing-file-{}", uuid::Uuid::new_v4().to_string()); + + cmd.arg(&filename) + .assert() + .failure() + .code(1) + .stderr(contains(format!( + "Error: Failed to read file: `{}`", + filename + ))); + } + + #[test] + fn test_missing_file_ok_if_skip_missing() { + let mut cmd = main_command(); + let filename = format!("non-existing-file-{}", uuid::Uuid::new_v4().to_string()); + + cmd.arg(&filename).arg("--skip-missing").assert().success(); + } + + #[tokio::test] + async fn test_glob() -> Result<()> { + // using Result to be able to use `?` + let mut cmd = main_command(); + + let dir = tempfile::tempdir()?; + let mock_server_a = get_mock_server(http::StatusCode::OK).await; + let mock_server_b = get_mock_server(http::StatusCode::OK).await; + let mut file_a = File::create(dir.path().join("a.md"))?; + let mut file_b = File::create(dir.path().join("b.md"))?; + + writeln!(file_a, "{}", mock_server_a.uri().as_str())?; + writeln!(file_b, "{}", mock_server_b.uri().as_str())?; + + cmd.arg(dir.path().join("*.md")) + .arg("--verbose") + .assert() + .success() + .stdout(contains("Total: 2")); + + Ok(()) + } + + #[cfg(target_os = "linux")] // MacOS and Windows have case-insensitive filesystems + #[tokio::test] + async fn test_glob_ignore_case() -> Result<()> { + let mut cmd = main_command(); + + let dir = tempfile::tempdir()?; + let mock_server_a = get_mock_server(http::StatusCode::OK).await; + let mock_server_b = get_mock_server(http::StatusCode::OK).await; + let mut file_a = File::create(dir.path().join("README.md"))?; + let mut file_b = File::create(dir.path().join("readme.md"))?; + + writeln!(file_a, "{}", mock_server_a.uri().as_str())?; + writeln!(file_b, "{}", mock_server_b.uri().as_str())?; + + cmd.arg(dir.path().join("[r]eadme.md")) + .arg("--verbose") + .arg("--glob-ignore-case") + .assert() + .success() + .stdout(contains("Total: 2")); + + Ok(()) + } + + #[tokio::test] + async fn test_glob_recursive() -> Result<()> { + let mut cmd = main_command(); + + let dir = tempfile::tempdir()?; + let subdir_level_1 = tempfile::tempdir_in(&dir)?; + let subdir_level_2 = tempfile::tempdir_in(&subdir_level_1)?; + + let mock_server = get_mock_server(http::StatusCode::OK).await; + let mut file = File::create(subdir_level_2.path().join("test.md"))?; + + writeln!(file, "{}", mock_server.uri().as_str())?; + + // ** should be a recursive glob + cmd.arg(dir.path().join("**/*.md")) + .arg("--verbose") + .assert() + .success() + .stdout(contains("Total: 1")); + + Ok(()) + } } From d7cbc58ec81afc39e2935e4caec623f1406c1a12 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pawe=C5=82=20Romanowski?= Date: Tue, 1 Dec 2020 20:02:40 +0100 Subject: [PATCH 08/12] Move cli tests to main, add collector tests --- src/collector.rs | 56 +++++++++++ src/main.rs | 220 +++++++++++++++++++++++++++++++++++++++++++ src/test_utils.rs | 36 +++++++ tests/cli.rs | 232 ---------------------------------------------- 4 files changed, 312 insertions(+), 232 deletions(-) create mode 100644 src/test_utils.rs delete mode 100644 tests/cli.rs diff --git a/src/collector.rs b/src/collector.rs index fd44ff2802..54a30ecf6b 100644 --- a/src/collector.rs +++ b/src/collector.rs @@ -224,3 +224,59 @@ pub(crate) async fn collect_links( Ok(collected_links) } + +#[cfg(test)] +mod test { + use super::*; + use crate::test_utils::get_mock_server_with_content; + use std::fs::File; + use std::io::Write; + use std::str::FromStr; + + const TEST_STRING: &str = "http://test-string.com"; + const TEST_URL: &str = "https://test-url.org"; + const TEST_FILE: &str = "https://test-file.io"; + const TEST_GLOB_1: &str = "https://test-glob-1.io"; + const TEST_GLOB_2_MAIL: &str = "test@glob-2.io"; + + #[tokio::test] + async fn test_collect_links() -> Result<()> { + let dir = tempfile::tempdir()?; + let file_path = dir.path().join("f"); + let file_glob_1_path = dir.path().join("glob-1"); + let file_glob_2_path = dir.path().join("glob-2"); + + let mut file = File::create(&file_path)?; + let mut file_glob_1 = File::create(file_glob_1_path)?; + let mut file_glob_2 = File::create(file_glob_2_path)?; + + writeln!(file, "{}", TEST_FILE)?; + writeln!(file_glob_1, "{}", TEST_GLOB_1)?; + writeln!(file_glob_2, "{}", TEST_GLOB_2_MAIL)?; + + let mock_server = get_mock_server_with_content(http::StatusCode::OK, Some(TEST_URL)).await; + + let inputs = vec![ + Input::String(TEST_STRING.to_string()), + Input::RemoteUrl(Url::from_str(&mock_server.uri())?), + Input::FsPath(file_path), + Input::FsGlob { + pattern: dir.path().join("glob*").to_str().unwrap().to_string(), + ignore_case: true, + }, + ]; + + let links = collect_links(&inputs, None, false).await?; + + let mut expected_links = HashSet::new(); + expected_links.insert(Uri::Website(Url::from_str(TEST_STRING)?)); + expected_links.insert(Uri::Website(Url::from_str(TEST_URL)?)); + expected_links.insert(Uri::Website(Url::from_str(TEST_FILE)?)); + expected_links.insert(Uri::Website(Url::from_str(TEST_GLOB_1)?)); + expected_links.insert(Uri::Mail(TEST_GLOB_2_MAIL.to_string())); + + assert_eq!(links, expected_links); + + Ok(()) + } +} diff --git a/src/main.rs b/src/main.rs index c45ba118d1..28224ff0ab 100644 --- a/src/main.rs +++ b/src/main.rs @@ -19,6 +19,9 @@ mod options; mod stats; mod types; +#[cfg(test)] +pub(crate) mod test_utils; + use client::ClientBuilder; use client_pool::ClientPool; use collector::Input; @@ -261,3 +264,220 @@ mod test { assert_eq!(expected, actual); } } + +#[cfg(test)] +mod cli_test { + use crate::test_utils::get_mock_server; + use anyhow::Result; + use assert_cmd::Command; + use predicates::str::contains; + use std::fs::File; + use std::io::Write; + use std::path::{Path, PathBuf}; + + fn main_command() -> Command { + // this gets the "main" binary name (e.g. `lychee`) + Command::cargo_bin(env!("CARGO_PKG_NAME")).expect("Couldn't get cargo package name") + } + + fn fixtures_path() -> PathBuf { + Path::new(module_path!()).parent().unwrap().join("fixtures") + } + + #[test] + fn test_exclude_all_private() { + let mut cmd = main_command(); + + let test_all_private_path = fixtures_path().join("TEST_ALL_PRIVATE.md"); + + // assert that the command runs OK, and that it excluded all the links + cmd.arg("--exclude-all-private") + .arg("--verbose") + .arg(test_all_private_path) + .assert() + .success() + .stdout(contains("Total: 7")) + .stdout(contains("Excluded: 7")) + .stdout(contains("Successful: 0")) + .stdout(contains("Errors: 0")); + } + + /// Test that a GitHub link can be checked without specifying the token. + #[test] + fn test_check_github_no_token() { + let mut cmd = main_command(); + let test_github_path = fixtures_path().join("TEST_GITHUB.md"); + + cmd.arg("--verbose") + .arg(test_github_path) + .assert() + .success() + .stdout(contains("Total: 1")) + .stdout(contains("Excluded: 0")) + .stdout(contains("Successful: 1")) + .stdout(contains("Errors: 0")); + } + + #[tokio::test] + async fn test_failure_404_link() { + let mut cmd = main_command(); + let mock_server = get_mock_server(http::StatusCode::NOT_FOUND).await; + let dir = tempfile::tempdir().expect("Failed to create tempdir"); + let file_path = dir.path().join("test.txt"); + let mut file = File::create(&file_path).expect("Failed to create tempfile"); + + writeln!(file, "{}", mock_server.uri()).expect("Failed to write to file"); + + cmd.arg(file_path) + .write_stdin(mock_server.uri()) + .assert() + .failure() + .code(2); + } + + #[test] + fn test_failure_github_404_no_token() { + let mut cmd = main_command(); + let test_github_404_path = fixtures_path().join("TEST_GITHUB_404.md"); + + cmd.arg(test_github_404_path) + .env_clear() + .assert() + .failure() + .code(2) + .stdout(contains("https://github.com/mre/idiomatic-rust-doesnt-exist-man \ + (GitHub token not specified. To check GitHub links reliably, use `--github-token` flag / `GITHUB_TOKEN` env var.)")); + } + + #[tokio::test] + async fn test_stdin_input() { + let mut cmd = main_command(); + let mock_server = get_mock_server(http::StatusCode::OK).await; + + cmd.arg("-") + .write_stdin(mock_server.uri()) + .assert() + .success(); + } + + #[tokio::test] + async fn test_stdin_input_failure() { + let mut cmd = main_command(); + let mock_server = get_mock_server(http::StatusCode::INTERNAL_SERVER_ERROR).await; + + cmd.arg("-") + .write_stdin(mock_server.uri()) + .assert() + .failure() + .code(2); + } + + #[tokio::test] + async fn test_stdin_input_multiple() { + let mut cmd = main_command(); + let mock_server_a = get_mock_server(http::StatusCode::OK).await; + let mock_server_b = get_mock_server(http::StatusCode::OK).await; + + // this behavior (treating multiple `-` as separate inputs) is the same as most CLI tools + // that accept `-` as stdin, e.g. `cat`, `bat`, `grep` etc. + cmd.arg("-") + .arg("-") + .write_stdin(mock_server_a.uri()) + .write_stdin(mock_server_b.uri()) + .assert() + .success(); + } + + #[test] + fn test_missing_file_error() { + let mut cmd = main_command(); + let filename = format!("non-existing-file-{}", uuid::Uuid::new_v4().to_string()); + + cmd.arg(&filename) + .assert() + .failure() + .code(1) + .stderr(contains(format!( + "Error: Failed to read file: `{}`", + filename + ))); + } + + #[test] + fn test_missing_file_ok_if_skip_missing() { + let mut cmd = main_command(); + let filename = format!("non-existing-file-{}", uuid::Uuid::new_v4().to_string()); + + cmd.arg(&filename).arg("--skip-missing").assert().success(); + } + + #[tokio::test] + async fn test_glob() -> Result<()> { + // using Result to be able to use `?` + let mut cmd = main_command(); + + let dir = tempfile::tempdir()?; + let mock_server_a = get_mock_server(http::StatusCode::OK).await; + let mock_server_b = get_mock_server(http::StatusCode::OK).await; + let mut file_a = File::create(dir.path().join("a.md"))?; + let mut file_b = File::create(dir.path().join("b.md"))?; + + writeln!(file_a, "{}", mock_server_a.uri().as_str())?; + writeln!(file_b, "{}", mock_server_b.uri().as_str())?; + + cmd.arg(dir.path().join("*.md")) + .arg("--verbose") + .assert() + .success() + .stdout(contains("Total: 2")); + + Ok(()) + } + + #[cfg(target_os = "linux")] // MacOS and Windows have case-insensitive filesystems + #[tokio::test] + async fn test_glob_ignore_case() -> Result<()> { + let mut cmd = main_command(); + + let dir = tempfile::tempdir()?; + let mock_server_a = get_mock_server(http::StatusCode::OK).await; + let mock_server_b = get_mock_server(http::StatusCode::OK).await; + let mut file_a = File::create(dir.path().join("README.md"))?; + let mut file_b = File::create(dir.path().join("readme.md"))?; + + writeln!(file_a, "{}", mock_server_a.uri().as_str())?; + writeln!(file_b, "{}", mock_server_b.uri().as_str())?; + + cmd.arg(dir.path().join("[r]eadme.md")) + .arg("--verbose") + .arg("--glob-ignore-case") + .assert() + .success() + .stdout(contains("Total: 2")); + + Ok(()) + } + + #[tokio::test] + async fn test_glob_recursive() -> Result<()> { + let mut cmd = main_command(); + + let dir = tempfile::tempdir()?; + let subdir_level_1 = tempfile::tempdir_in(&dir)?; + let subdir_level_2 = tempfile::tempdir_in(&subdir_level_1)?; + + let mock_server = get_mock_server(http::StatusCode::OK).await; + let mut file = File::create(subdir_level_2.path().join("test.md"))?; + + writeln!(file, "{}", mock_server.uri().as_str())?; + + // ** should be a recursive glob + cmd.arg(dir.path().join("**/*.md")) + .arg("--verbose") + .assert() + .success() + .stdout(contains("Total: 1")); + + Ok(()) + } +} diff --git a/src/test_utils.rs b/src/test_utils.rs new file mode 100644 index 0000000000..3dd2400aad --- /dev/null +++ b/src/test_utils.rs @@ -0,0 +1,36 @@ +#![cfg(test)] + +use http::StatusCode; +use wiremock::matchers::path; +use wiremock::{Mock, MockServer, ResponseTemplate}; + +pub(crate) async fn get_mock_server(response_code: S) -> MockServer +where + S: Into, +{ + get_mock_server_with_content(response_code, None).await +} + +pub(crate) async fn get_mock_server_with_content( + response_code: S, + content: Option<&str>, +) -> MockServer +where + S: Into, +{ + let mock_server = MockServer::start().await; + let template = ResponseTemplate::new(response_code.into()); + + let template = if let Some(s) = content { + template.set_body_string(s) + } else { + template + }; + + Mock::given(path("/")) + .respond_with(template) + .mount(&mock_server) + .await; + + mock_server +} diff --git a/tests/cli.rs b/tests/cli.rs deleted file mode 100644 index fff27d061f..0000000000 --- a/tests/cli.rs +++ /dev/null @@ -1,232 +0,0 @@ -#[cfg(test)] -mod cli { - use anyhow::Result; - use assert_cmd::Command; - use http::StatusCode; - use predicates::str::contains; - use std::fs::File; - use std::io::Write; - use std::path::{Path, PathBuf}; - use wiremock::matchers::path; - use wiremock::{Mock, MockServer, ResponseTemplate}; - - fn main_command() -> Command { - // this gets the "main" binary name (e.g. `lychee`) - Command::cargo_bin(env!("CARGO_PKG_NAME")).expect("Couldn't get cargo package name") - } - - fn fixtures_path() -> PathBuf { - Path::new(module_path!()).parent().unwrap().join("fixtures") - } - - async fn get_mock_server(response_code: S) -> MockServer - where - S: Into, - { - let mock_server = MockServer::start().await; - - Mock::given(path("/")) - .respond_with(ResponseTemplate::new(response_code.into())) - .mount(&mock_server) - .await; - - mock_server - } - - #[test] - fn test_exclude_all_private() { - let mut cmd = main_command(); - - let test_all_private_path = fixtures_path().join("TEST_ALL_PRIVATE.md"); - - // assert that the command runs OK, and that it excluded all the links - cmd.arg("--exclude-all-private") - .arg("--verbose") - .arg(test_all_private_path) - .assert() - .success() - .stdout(contains("Total: 7")) - .stdout(contains("Excluded: 7")) - .stdout(contains("Successful: 0")) - .stdout(contains("Errors: 0")); - } - - /// Test that a GitHub link can be checked without specifying the token. - #[test] - fn test_check_github_no_token() { - let mut cmd = main_command(); - let test_github_path = fixtures_path().join("TEST_GITHUB.md"); - - cmd.arg("--verbose") - .arg(test_github_path) - .assert() - .success() - .stdout(contains("Total: 1")) - .stdout(contains("Excluded: 0")) - .stdout(contains("Successful: 1")) - .stdout(contains("Errors: 0")); - } - - #[tokio::test] - async fn test_failure_404_link() { - let mut cmd = main_command(); - let mock_server = get_mock_server(http::StatusCode::NOT_FOUND).await; - let dir = tempfile::tempdir().expect("Failed to create tempdir"); - let file_path = dir.path().join("test.txt"); - let mut file = File::create(&file_path).expect("Failed to create tempfile"); - - writeln!(file, "{}", mock_server.uri()).expect("Failed to write to file"); - - cmd.arg(file_path) - .write_stdin(mock_server.uri()) - .assert() - .failure() - .code(2); - } - - #[test] - fn test_failure_github_404_no_token() { - let mut cmd = main_command(); - let test_github_404_path = fixtures_path().join("TEST_GITHUB_404.md"); - - cmd.arg(test_github_404_path) - .env_clear() - .assert() - .failure() - .code(2) - .stdout(contains("https://github.com/mre/idiomatic-rust-doesnt-exist-man \ - (GitHub token not specified. To check GitHub links reliably, use `--github-token` flag / `GITHUB_TOKEN` env var.)")); - } - - #[tokio::test] - async fn test_stdin_input() { - let mut cmd = main_command(); - let mock_server = get_mock_server(http::StatusCode::OK).await; - - cmd.arg("-") - .write_stdin(mock_server.uri()) - .assert() - .success(); - } - - #[tokio::test] - async fn test_stdin_input_failure() { - let mut cmd = main_command(); - let mock_server = get_mock_server(http::StatusCode::INTERNAL_SERVER_ERROR).await; - - cmd.arg("-") - .write_stdin(mock_server.uri()) - .assert() - .failure() - .code(2); - } - - #[tokio::test] - async fn test_stdin_input_multiple() { - let mut cmd = main_command(); - let mock_server_a = get_mock_server(http::StatusCode::OK).await; - let mock_server_b = get_mock_server(http::StatusCode::OK).await; - - // this behavior (treating multiple `-` as separate inputs) is the same as most CLI tools - // that accept `-` as stdin, e.g. `cat`, `bat`, `grep` etc. - cmd.arg("-") - .arg("-") - .write_stdin(mock_server_a.uri()) - .write_stdin(mock_server_b.uri()) - .assert() - .success(); - } - - #[test] - fn test_missing_file_error() { - let mut cmd = main_command(); - let filename = format!("non-existing-file-{}", uuid::Uuid::new_v4().to_string()); - - cmd.arg(&filename) - .assert() - .failure() - .code(1) - .stderr(contains(format!( - "Error: Failed to read file: `{}`", - filename - ))); - } - - #[test] - fn test_missing_file_ok_if_skip_missing() { - let mut cmd = main_command(); - let filename = format!("non-existing-file-{}", uuid::Uuid::new_v4().to_string()); - - cmd.arg(&filename).arg("--skip-missing").assert().success(); - } - - #[tokio::test] - async fn test_glob() -> Result<()> { - // using Result to be able to use `?` - let mut cmd = main_command(); - - let dir = tempfile::tempdir()?; - let mock_server_a = get_mock_server(http::StatusCode::OK).await; - let mock_server_b = get_mock_server(http::StatusCode::OK).await; - let mut file_a = File::create(dir.path().join("a.md"))?; - let mut file_b = File::create(dir.path().join("b.md"))?; - - writeln!(file_a, "{}", mock_server_a.uri().as_str())?; - writeln!(file_b, "{}", mock_server_b.uri().as_str())?; - - cmd.arg(dir.path().join("*.md")) - .arg("--verbose") - .assert() - .success() - .stdout(contains("Total: 2")); - - Ok(()) - } - - #[cfg(target_os = "linux")] // MacOS and Windows have case-insensitive filesystems - #[tokio::test] - async fn test_glob_ignore_case() -> Result<()> { - let mut cmd = main_command(); - - let dir = tempfile::tempdir()?; - let mock_server_a = get_mock_server(http::StatusCode::OK).await; - let mock_server_b = get_mock_server(http::StatusCode::OK).await; - let mut file_a = File::create(dir.path().join("README.md"))?; - let mut file_b = File::create(dir.path().join("readme.md"))?; - - writeln!(file_a, "{}", mock_server_a.uri().as_str())?; - writeln!(file_b, "{}", mock_server_b.uri().as_str())?; - - cmd.arg(dir.path().join("[r]eadme.md")) - .arg("--verbose") - .arg("--glob-ignore-case") - .assert() - .success() - .stdout(contains("Total: 2")); - - Ok(()) - } - - #[tokio::test] - async fn test_glob_recursive() -> Result<()> { - let mut cmd = main_command(); - - let dir = tempfile::tempdir()?; - let subdir_level_1 = tempfile::tempdir_in(&dir)?; - let subdir_level_2 = tempfile::tempdir_in(&subdir_level_1)?; - - let mock_server = get_mock_server(http::StatusCode::OK).await; - let mut file = File::create(subdir_level_2.path().join("test.md"))?; - - writeln!(file, "{}", mock_server.uri().as_str())?; - - // ** should be a recursive glob - cmd.arg(dir.path().join("**/*.md")) - .arg("--verbose") - .assert() - .success() - .stdout(contains("Total: 1")); - - Ok(()) - } -} From 6384a76d30852df8e54800fa8567a9823626adc9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pawe=C5=82=20Romanowski?= Date: Tue, 1 Dec 2020 20:07:55 +0100 Subject: [PATCH 09/12] Fix accidental anyhow::Result import --- src/types.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/types.rs b/src/types.rs index 32c3417e80..2563874c91 100644 --- a/src/types.rs +++ b/src/types.rs @@ -1,5 +1,5 @@ use crate::options::Config; -use anyhow::{anyhow, Result}; +use anyhow::anyhow; use regex::RegexSet; use std::net::IpAddr; use std::{collections::HashSet, convert::TryFrom, fmt::Display}; From 55f900c0444d4d1f335dbbf2aa3471ac55484d63 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pawe=C5=82=20Romanowski?= Date: Tue, 1 Dec 2020 21:16:52 +0100 Subject: [PATCH 10/12] Fix cli tests (move back to tests dir) --- src/main.rs | 219 +--------------------------------------- src/test_utils.rs | 2 + tests/cli.rs | 247 ++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 250 insertions(+), 218 deletions(-) create mode 100644 tests/cli.rs diff --git a/src/main.rs b/src/main.rs index 28224ff0ab..89e356b595 100644 --- a/src/main.rs +++ b/src/main.rs @@ -20,7 +20,7 @@ mod stats; mod types; #[cfg(test)] -pub(crate) mod test_utils; +pub mod test_utils; use client::ClientBuilder; use client_pool::ClientPool; @@ -264,220 +264,3 @@ mod test { assert_eq!(expected, actual); } } - -#[cfg(test)] -mod cli_test { - use crate::test_utils::get_mock_server; - use anyhow::Result; - use assert_cmd::Command; - use predicates::str::contains; - use std::fs::File; - use std::io::Write; - use std::path::{Path, PathBuf}; - - fn main_command() -> Command { - // this gets the "main" binary name (e.g. `lychee`) - Command::cargo_bin(env!("CARGO_PKG_NAME")).expect("Couldn't get cargo package name") - } - - fn fixtures_path() -> PathBuf { - Path::new(module_path!()).parent().unwrap().join("fixtures") - } - - #[test] - fn test_exclude_all_private() { - let mut cmd = main_command(); - - let test_all_private_path = fixtures_path().join("TEST_ALL_PRIVATE.md"); - - // assert that the command runs OK, and that it excluded all the links - cmd.arg("--exclude-all-private") - .arg("--verbose") - .arg(test_all_private_path) - .assert() - .success() - .stdout(contains("Total: 7")) - .stdout(contains("Excluded: 7")) - .stdout(contains("Successful: 0")) - .stdout(contains("Errors: 0")); - } - - /// Test that a GitHub link can be checked without specifying the token. - #[test] - fn test_check_github_no_token() { - let mut cmd = main_command(); - let test_github_path = fixtures_path().join("TEST_GITHUB.md"); - - cmd.arg("--verbose") - .arg(test_github_path) - .assert() - .success() - .stdout(contains("Total: 1")) - .stdout(contains("Excluded: 0")) - .stdout(contains("Successful: 1")) - .stdout(contains("Errors: 0")); - } - - #[tokio::test] - async fn test_failure_404_link() { - let mut cmd = main_command(); - let mock_server = get_mock_server(http::StatusCode::NOT_FOUND).await; - let dir = tempfile::tempdir().expect("Failed to create tempdir"); - let file_path = dir.path().join("test.txt"); - let mut file = File::create(&file_path).expect("Failed to create tempfile"); - - writeln!(file, "{}", mock_server.uri()).expect("Failed to write to file"); - - cmd.arg(file_path) - .write_stdin(mock_server.uri()) - .assert() - .failure() - .code(2); - } - - #[test] - fn test_failure_github_404_no_token() { - let mut cmd = main_command(); - let test_github_404_path = fixtures_path().join("TEST_GITHUB_404.md"); - - cmd.arg(test_github_404_path) - .env_clear() - .assert() - .failure() - .code(2) - .stdout(contains("https://github.com/mre/idiomatic-rust-doesnt-exist-man \ - (GitHub token not specified. To check GitHub links reliably, use `--github-token` flag / `GITHUB_TOKEN` env var.)")); - } - - #[tokio::test] - async fn test_stdin_input() { - let mut cmd = main_command(); - let mock_server = get_mock_server(http::StatusCode::OK).await; - - cmd.arg("-") - .write_stdin(mock_server.uri()) - .assert() - .success(); - } - - #[tokio::test] - async fn test_stdin_input_failure() { - let mut cmd = main_command(); - let mock_server = get_mock_server(http::StatusCode::INTERNAL_SERVER_ERROR).await; - - cmd.arg("-") - .write_stdin(mock_server.uri()) - .assert() - .failure() - .code(2); - } - - #[tokio::test] - async fn test_stdin_input_multiple() { - let mut cmd = main_command(); - let mock_server_a = get_mock_server(http::StatusCode::OK).await; - let mock_server_b = get_mock_server(http::StatusCode::OK).await; - - // this behavior (treating multiple `-` as separate inputs) is the same as most CLI tools - // that accept `-` as stdin, e.g. `cat`, `bat`, `grep` etc. - cmd.arg("-") - .arg("-") - .write_stdin(mock_server_a.uri()) - .write_stdin(mock_server_b.uri()) - .assert() - .success(); - } - - #[test] - fn test_missing_file_error() { - let mut cmd = main_command(); - let filename = format!("non-existing-file-{}", uuid::Uuid::new_v4().to_string()); - - cmd.arg(&filename) - .assert() - .failure() - .code(1) - .stderr(contains(format!( - "Error: Failed to read file: `{}`", - filename - ))); - } - - #[test] - fn test_missing_file_ok_if_skip_missing() { - let mut cmd = main_command(); - let filename = format!("non-existing-file-{}", uuid::Uuid::new_v4().to_string()); - - cmd.arg(&filename).arg("--skip-missing").assert().success(); - } - - #[tokio::test] - async fn test_glob() -> Result<()> { - // using Result to be able to use `?` - let mut cmd = main_command(); - - let dir = tempfile::tempdir()?; - let mock_server_a = get_mock_server(http::StatusCode::OK).await; - let mock_server_b = get_mock_server(http::StatusCode::OK).await; - let mut file_a = File::create(dir.path().join("a.md"))?; - let mut file_b = File::create(dir.path().join("b.md"))?; - - writeln!(file_a, "{}", mock_server_a.uri().as_str())?; - writeln!(file_b, "{}", mock_server_b.uri().as_str())?; - - cmd.arg(dir.path().join("*.md")) - .arg("--verbose") - .assert() - .success() - .stdout(contains("Total: 2")); - - Ok(()) - } - - #[cfg(target_os = "linux")] // MacOS and Windows have case-insensitive filesystems - #[tokio::test] - async fn test_glob_ignore_case() -> Result<()> { - let mut cmd = main_command(); - - let dir = tempfile::tempdir()?; - let mock_server_a = get_mock_server(http::StatusCode::OK).await; - let mock_server_b = get_mock_server(http::StatusCode::OK).await; - let mut file_a = File::create(dir.path().join("README.md"))?; - let mut file_b = File::create(dir.path().join("readme.md"))?; - - writeln!(file_a, "{}", mock_server_a.uri().as_str())?; - writeln!(file_b, "{}", mock_server_b.uri().as_str())?; - - cmd.arg(dir.path().join("[r]eadme.md")) - .arg("--verbose") - .arg("--glob-ignore-case") - .assert() - .success() - .stdout(contains("Total: 2")); - - Ok(()) - } - - #[tokio::test] - async fn test_glob_recursive() -> Result<()> { - let mut cmd = main_command(); - - let dir = tempfile::tempdir()?; - let subdir_level_1 = tempfile::tempdir_in(&dir)?; - let subdir_level_2 = tempfile::tempdir_in(&subdir_level_1)?; - - let mock_server = get_mock_server(http::StatusCode::OK).await; - let mut file = File::create(subdir_level_2.path().join("test.md"))?; - - writeln!(file, "{}", mock_server.uri().as_str())?; - - // ** should be a recursive glob - cmd.arg(dir.path().join("**/*.md")) - .arg("--verbose") - .assert() - .success() - .stdout(contains("Total: 1")); - - Ok(()) - } -} diff --git a/src/test_utils.rs b/src/test_utils.rs index 3dd2400aad..743d286d4c 100644 --- a/src/test_utils.rs +++ b/src/test_utils.rs @@ -4,6 +4,8 @@ use http::StatusCode; use wiremock::matchers::path; use wiremock::{Mock, MockServer, ResponseTemplate}; +// TODO: used in cli tests (as duplicate) +#[allow(unused)] pub(crate) async fn get_mock_server(response_code: S) -> MockServer where S: Into, diff --git a/tests/cli.rs b/tests/cli.rs new file mode 100644 index 0000000000..f55614bd8a --- /dev/null +++ b/tests/cli.rs @@ -0,0 +1,247 @@ +#[cfg(test)] +mod cli { + use anyhow::Result; + use assert_cmd::Command; + use http::StatusCode; + use predicates::str::contains; + use std::fs::File; + use std::io::Write; + use std::path::{Path, PathBuf}; + use wiremock::matchers::path; + use wiremock::{Mock, MockServer, ResponseTemplate}; + + fn main_command() -> Command { + // this gets the "main" binary name (e.g. `lychee`) + Command::cargo_bin(env!("CARGO_PKG_NAME")).expect("Couldn't get cargo package name") + } + + fn fixtures_path() -> PathBuf { + Path::new(module_path!()).parent().unwrap().join("fixtures") + } + + // TODO: duplicate of test_utils + async fn get_mock_server(response_code: S) -> MockServer + where + S: Into, + { + get_mock_server_with_content(response_code, None).await + } + + async fn get_mock_server_with_content(response_code: S, content: Option<&str>) -> MockServer + where + S: Into, + { + let mock_server = MockServer::start().await; + let template = ResponseTemplate::new(response_code.into()); + + let template = if let Some(s) = content { + template.set_body_string(s) + } else { + template + }; + + Mock::given(path("/")) + .respond_with(template) + .mount(&mock_server) + .await; + + mock_server + } + + #[test] + fn test_exclude_all_private() { + let mut cmd = main_command(); + + let test_all_private_path = fixtures_path().join("TEST_ALL_PRIVATE.md"); + + // assert that the command runs OK, and that it excluded all the links + cmd.arg("--exclude-all-private") + .arg("--verbose") + .arg(test_all_private_path) + .assert() + .success() + .stdout(contains("Total: 7")) + .stdout(contains("Excluded: 7")) + .stdout(contains("Successful: 0")) + .stdout(contains("Errors: 0")); + } + + /// Test that a GitHub link can be checked without specifying the token. + #[test] + fn test_check_github_no_token() { + let mut cmd = main_command(); + let test_github_path = fixtures_path().join("TEST_GITHUB.md"); + + cmd.arg("--verbose") + .arg(test_github_path) + .assert() + .success() + .stdout(contains("Total: 1")) + .stdout(contains("Excluded: 0")) + .stdout(contains("Successful: 1")) + .stdout(contains("Errors: 0")); + } + + #[tokio::test] + async fn test_failure_404_link() { + let mut cmd = main_command(); + let mock_server = get_mock_server(http::StatusCode::NOT_FOUND).await; + let dir = tempfile::tempdir().expect("Failed to create tempdir"); + let file_path = dir.path().join("test.txt"); + let mut file = File::create(&file_path).expect("Failed to create tempfile"); + + writeln!(file, "{}", mock_server.uri()).expect("Failed to write to file"); + + cmd.arg(file_path) + .write_stdin(mock_server.uri()) + .assert() + .failure() + .code(2); + } + + #[test] + fn test_failure_github_404_no_token() { + let mut cmd = main_command(); + let test_github_404_path = fixtures_path().join("TEST_GITHUB_404.md"); + + cmd.arg(test_github_404_path) + .env_clear() + .assert() + .failure() + .code(2) + .stdout(contains("https://github.com/mre/idiomatic-rust-doesnt-exist-man \ + (GitHub token not specified. To check GitHub links reliably, use `--github-token` flag / `GITHUB_TOKEN` env var.)")); + } + + #[tokio::test] + async fn test_stdin_input() { + let mut cmd = main_command(); + let mock_server = get_mock_server(http::StatusCode::OK).await; + + cmd.arg("-") + .write_stdin(mock_server.uri()) + .assert() + .success(); + } + + #[tokio::test] + async fn test_stdin_input_failure() { + let mut cmd = main_command(); + let mock_server = get_mock_server(http::StatusCode::INTERNAL_SERVER_ERROR).await; + + cmd.arg("-") + .write_stdin(mock_server.uri()) + .assert() + .failure() + .code(2); + } + + #[tokio::test] + async fn test_stdin_input_multiple() { + let mut cmd = main_command(); + let mock_server_a = get_mock_server(http::StatusCode::OK).await; + let mock_server_b = get_mock_server(http::StatusCode::OK).await; + + // this behavior (treating multiple `-` as separate inputs) is the same as most CLI tools + // that accept `-` as stdin, e.g. `cat`, `bat`, `grep` etc. + cmd.arg("-") + .arg("-") + .write_stdin(mock_server_a.uri()) + .write_stdin(mock_server_b.uri()) + .assert() + .success(); + } + + #[test] + fn test_missing_file_error() { + let mut cmd = main_command(); + let filename = format!("non-existing-file-{}", uuid::Uuid::new_v4().to_string()); + + cmd.arg(&filename) + .assert() + .failure() + .code(1) + .stderr(contains(format!( + "Error: Failed to read file: `{}`", + filename + ))); + } + + #[test] + fn test_missing_file_ok_if_skip_missing() { + let mut cmd = main_command(); + let filename = format!("non-existing-file-{}", uuid::Uuid::new_v4().to_string()); + + cmd.arg(&filename).arg("--skip-missing").assert().success(); + } + + #[tokio::test] + async fn test_glob() -> Result<()> { + // using Result to be able to use `?` + let mut cmd = main_command(); + + let dir = tempfile::tempdir()?; + let mock_server_a = get_mock_server(http::StatusCode::OK).await; + let mock_server_b = get_mock_server(http::StatusCode::OK).await; + let mut file_a = File::create(dir.path().join("a.md"))?; + let mut file_b = File::create(dir.path().join("b.md"))?; + + writeln!(file_a, "{}", mock_server_a.uri().as_str())?; + writeln!(file_b, "{}", mock_server_b.uri().as_str())?; + + cmd.arg(dir.path().join("*.md")) + .arg("--verbose") + .assert() + .success() + .stdout(contains("Total: 2")); + + Ok(()) + } + + #[cfg(target_os = "linux")] // MacOS and Windows have case-insensitive filesystems + #[tokio::test] + async fn test_glob_ignore_case() -> Result<()> { + let mut cmd = main_command(); + + let dir = tempfile::tempdir()?; + let mock_server_a = get_mock_server(http::StatusCode::OK).await; + let mock_server_b = get_mock_server(http::StatusCode::OK).await; + let mut file_a = File::create(dir.path().join("README.md"))?; + let mut file_b = File::create(dir.path().join("readme.md"))?; + + writeln!(file_a, "{}", mock_server_a.uri().as_str())?; + writeln!(file_b, "{}", mock_server_b.uri().as_str())?; + + cmd.arg(dir.path().join("[r]eadme.md")) + .arg("--verbose") + .arg("--glob-ignore-case") + .assert() + .success() + .stdout(contains("Total: 2")); + + Ok(()) + } + + #[tokio::test] + async fn test_glob_recursive() -> Result<()> { + let mut cmd = main_command(); + + let dir = tempfile::tempdir()?; + let subdir_level_1 = tempfile::tempdir_in(&dir)?; + let subdir_level_2 = tempfile::tempdir_in(&subdir_level_1)?; + + let mock_server = get_mock_server(http::StatusCode::OK).await; + let mut file = File::create(subdir_level_2.path().join("test.md"))?; + + writeln!(file, "{}", mock_server.uri().as_str())?; + + // ** should be a recursive glob + cmd.arg(dir.path().join("**/*.md")) + .arg("--verbose") + .assert() + .success() + .stdout(contains("Total: 1")); + + Ok(()) + } +} From 3b0d81123c44498e98bb602649328b013dd0ef01 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pawe=C5=82=20Romanowski?= Date: Tue, 1 Dec 2020 21:19:35 +0100 Subject: [PATCH 11/12] Remove unnecessary pub mod --- src/main.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main.rs b/src/main.rs index 89e356b595..1b232da068 100644 --- a/src/main.rs +++ b/src/main.rs @@ -20,7 +20,7 @@ mod stats; mod types; #[cfg(test)] -pub mod test_utils; +mod test_utils; use client::ClientBuilder; use client_pool::ClientPool; From 5facb1d9ddf2eb416eba82626acbb4942151dee3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pawe=C5=82=20Romanowski?= Date: Wed, 2 Dec 2020 20:11:30 +0100 Subject: [PATCH 12/12] Fix some config values being String for no reason Also add max_concurrency to collect_links --- Cargo.lock | 1 + Cargo.toml | 1 + src/collector.rs | 5 +++-- src/extract.rs | 6 +++--- src/main.rs | 16 +++++++++++----- src/options.rs | 46 +++++++++++++++++++++++++++++----------------- 6 files changed, 48 insertions(+), 27 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index fb236de91d..1ee96e4518 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1834,6 +1834,7 @@ dependencies = [ "http", "hubcaps", "indicatif", + "lazy_static", "linkify", "log", "predicates", diff --git a/Cargo.toml b/Cargo.toml index 61d68c2b76..b6e10a4a8c 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -31,6 +31,7 @@ headers = "0.3.2" derive_builder = "0.9.0" deadpool = "0.6.0" shellexpand = "2.0" +lazy_static = "1.1" [dependencies.reqwest] features = ["gzip"] diff --git a/src/collector.rs b/src/collector.rs index 54a30ecf6b..f5fab3a0dd 100644 --- a/src/collector.rs +++ b/src/collector.rs @@ -178,13 +178,14 @@ pub(crate) async fn collect_links( inputs: &[Input], base_url: Option, skip_missing_inputs: bool, + max_concurrency: usize, ) -> Result> { let base_url = match base_url { Some(url) => Some(Url::parse(&url)?), _ => None, }; - let (contents_tx, mut contents_rx) = tokio::sync::mpsc::channel(32); + let (contents_tx, mut contents_rx) = tokio::sync::mpsc::channel(max_concurrency); // extract input contents for input in inputs.iter().cloned() { @@ -266,7 +267,7 @@ mod test { }, ]; - let links = collect_links(&inputs, None, false).await?; + let links = collect_links(&inputs, None, false, 8).await?; let mut expected_links = HashSet::new(); expected_links.insert(Uri::Website(Url::from_str(TEST_STRING)?)); diff --git a/src/extract.rs b/src/extract.rs index 7a6c14b0a5..57a1a716ac 100644 --- a/src/extract.rs +++ b/src/extract.rs @@ -25,9 +25,9 @@ impl> From

for FileType { fn from(p: P) -> FileType { let path = p.as_ref(); match path.extension() { - Some(ext) => match ext.to_str().expect("Couldn't convert OsStr to str") { - "md" => FileType::Markdown, - "html" | "htm" => FileType::HTML, + Some(ext) => match ext { + _ if ext == "md" => FileType::Markdown, + _ if (ext == "htm" || ext == "html") => FileType::HTML, _ => FileType::Plaintext, }, None => FileType::Plaintext, diff --git a/src/main.rs b/src/main.rs index 1b232da068..eb88e94cad 100644 --- a/src/main.rs +++ b/src/main.rs @@ -86,8 +86,8 @@ async fn run(cfg: &Config, inputs: Vec) -> Result { } let accepted = cfg.accept.clone().and_then(|a| parse_statuscodes(&a).ok()); - let timeout = parse_timeout(&cfg.timeout)?; - let max_concurrency = cfg.max_concurrency.parse()?; + let timeout = parse_timeout(cfg.timeout); + let max_concurrency = cfg.max_concurrency; let method: reqwest::Method = reqwest::Method::from_str(&cfg.method.to_uppercase())?; let includes = RegexSet::new(&cfg.include)?; let excludes = Excludes::from_options(&cfg); @@ -107,7 +107,13 @@ async fn run(cfg: &Config, inputs: Vec) -> Result { .accepted(accepted) .build()?; - let links = collector::collect_links(&inputs, cfg.base_url.clone(), cfg.skip_missing).await?; + let links = collector::collect_links( + &inputs, + cfg.base_url.clone(), + cfg.skip_missing, + max_concurrency, + ) + .await?; let pb = if cfg.progress { Some( ProgressBar::new(links.len() as u64) @@ -175,8 +181,8 @@ fn read_header(input: &str) -> Result<(String, String)> { Ok((elements[0].into(), elements[1].into())) } -fn parse_timeout>(timeout: S) -> Result { - Ok(Duration::from_secs(timeout.as_ref().parse::()?)) +fn parse_timeout(timeout: usize) -> Duration { + Duration::from_secs(timeout as u64) } fn parse_headers>(headers: &[T]) -> Result { diff --git a/src/options.rs b/src/options.rs index 716209f1ca..b881f164ce 100644 --- a/src/options.rs +++ b/src/options.rs @@ -1,14 +1,24 @@ use crate::collector::Input; use anyhow::{Error, Result}; +use lazy_static::lazy_static; use serde::Deserialize; use std::{fs, io::ErrorKind}; use structopt::{clap::crate_version, StructOpt}; pub(crate) const USER_AGENT: &str = concat!("lychee/", crate_version!()); const METHOD: &str = "get"; -const TIMEOUT: &str = "20"; -const MAX_CONCURRENCY: &str = "128"; +const TIMEOUT: usize = 20; +const MAX_CONCURRENCY: usize = 128; +const MAX_REDIRECTS: usize = 10; + +// this exists because structopt requires `&str` type values for defaults +// (we can't use e.g. `TIMEOUT` or `timeout()` which gets created for serde) +lazy_static! { + static ref TIMEOUT_STR: String = TIMEOUT.to_string(); + static ref MAX_CONCURRENCY_STR: String = MAX_CONCURRENCY.to_string(); + static ref MAX_REDIRECTS_STR: String = MAX_REDIRECTS.to_string(); +} // Macro for generating default functions to be used by serde macro_rules! default_function { @@ -21,6 +31,15 @@ macro_rules! default_function { }; } +// Generate the functions for serde defaults +default_function! { + max_redirects: usize = MAX_REDIRECTS; + max_concurrency: usize = MAX_CONCURRENCY; + user_agent: String = USER_AGENT.to_string(); + timeout: usize = TIMEOUT; + method: String = METHOD.to_string(); +} + // Macro for merging configuration values macro_rules! fold_in { ( $cli:ident , $toml:ident ; $( $key:ident : $default:expr; )* ) => { @@ -78,14 +97,14 @@ pub struct Config { pub progress: bool, /// Maximum number of allowed redirects - #[structopt(short, long, default_value = "10")] - #[serde(default)] + #[structopt(short, long, default_value = &MAX_REDIRECTS_STR)] + #[serde(default = "max_redirects")] pub max_redirects: usize, /// Maximum number of concurrent network requests - #[structopt(long, default_value = MAX_CONCURRENCY)] - #[serde(default)] - pub max_concurrency: String, + #[structopt(long, default_value = &MAX_CONCURRENCY_STR)] + #[serde(default = "max_concurrency")] + pub max_concurrency: usize, /// Number of threads to utilize. /// Defaults to number of cores available to the system @@ -150,9 +169,9 @@ pub struct Config { pub accept: Option, /// Website timeout from connect to response finished - #[structopt(short, long, default_value = TIMEOUT)] + #[structopt(short, long, default_value = &TIMEOUT_STR)] #[serde(default = "timeout")] - pub timeout: String, + pub timeout: usize, /// Request method // Using `-X` as a short param similar to curl @@ -215,7 +234,7 @@ impl Config { // Keys with defaults to assign verbose: false; progress: false; - max_redirects: 10; + max_redirects: MAX_REDIRECTS; max_concurrency: MAX_CONCURRENCY; threads: None; user_agent: USER_AGENT; @@ -239,10 +258,3 @@ impl Config { } } } - -// Generate the functions for serde defaults -default_function! { - user_agent: String = USER_AGENT.to_string(); - timeout: String = TIMEOUT.to_string(); - method: String = METHOD.to_string(); -}