From 701fbc9adad6a30847c46908cd46896be28a2f38 Mon Sep 17 00:00:00 2001 From: Matthias Endler Date: Wed, 16 Jun 2021 13:03:36 +0200 Subject: [PATCH 01/46] Add support for local files --- Cargo.lock | 1 + fixtures/TEST_RELATIVE.html | 1 + fixtures/TEST_RELATIVE_2.html | 1 + fixtures/TEST_RELATIVE_3.html | 1 + lychee-bin/src/main.rs | 5 +- lychee-bin/src/options.rs | 2 +- lychee-bin/tests/local_files.rs | 37 ++++++ lychee-lib/Cargo.toml | 1 + lychee-lib/src/collector.rs | 220 +------------------------------- lychee-lib/src/extract.rs | 75 ++++------- lychee-lib/src/fs_tree.rs | 135 ++++++++++++++++++++ lychee-lib/src/lib.rs | 10 +- lychee-lib/src/types/error.rs | 4 + lychee-lib/src/types/file.rs | 37 ++++++ lychee-lib/src/types/input.rs | 203 +++++++++++++++++++++++++++++ lychee-lib/src/types/mod.rs | 4 + 16 files changed, 462 insertions(+), 275 deletions(-) create mode 100644 fixtures/TEST_RELATIVE.html create mode 100644 fixtures/TEST_RELATIVE_2.html create mode 100644 fixtures/TEST_RELATIVE_3.html create mode 100644 lychee-bin/tests/local_files.rs create mode 100644 lychee-lib/src/fs_tree.rs create mode 100644 lychee-lib/src/types/file.rs create mode 100644 lychee-lib/src/types/input.rs diff --git a/Cargo.lock b/Cargo.lock index e6e8efefa1..1f7194a069 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1391,6 +1391,7 @@ dependencies = [ "http", "hubcaps", "linkify", + "log", "markup5ever_rcdom", "openssl-sys", "pretty_assertions", diff --git a/fixtures/TEST_RELATIVE.html b/fixtures/TEST_RELATIVE.html new file mode 100644 index 0000000000..be4b0e517c --- /dev/null +++ b/fixtures/TEST_RELATIVE.html @@ -0,0 +1 @@ +Foo \ No newline at end of file diff --git a/fixtures/TEST_RELATIVE_2.html b/fixtures/TEST_RELATIVE_2.html new file mode 100644 index 0000000000..89c3e73ade --- /dev/null +++ b/fixtures/TEST_RELATIVE_2.html @@ -0,0 +1 @@ +Bar \ No newline at end of file diff --git a/fixtures/TEST_RELATIVE_3.html b/fixtures/TEST_RELATIVE_3.html new file mode 100644 index 
0000000000..a1324d8465 --- /dev/null +++ b/fixtures/TEST_RELATIVE_3.html @@ -0,0 +1 @@ +Example link \ No newline at end of file diff --git a/lychee-bin/src/main.rs b/lychee-bin/src/main.rs index 3f64f8d761..535fa1a9f1 100644 --- a/lychee-bin/src/main.rs +++ b/lychee-bin/src/main.rs @@ -70,10 +70,7 @@ use anyhow::{anyhow, Context, Result}; use headers::{authorization::Basic, Authorization, HeaderMap, HeaderMapExt, HeaderName}; use http::StatusCode; use indicatif::{ProgressBar, ProgressStyle}; -use lychee_lib::{ - collector::{Collector, Input}, - ClientBuilder, ClientPool, Response, -}; +use lychee_lib::{ClientBuilder, ClientPool, Collector, Input, Response}; use openssl_sys as _; // required for vendored-openssl feature use regex::RegexSet; use ring as _; // required for apple silicon diff --git a/lychee-bin/src/options.rs b/lychee-bin/src/options.rs index be67e0b0d7..ab463df66b 100644 --- a/lychee-bin/src/options.rs +++ b/lychee-bin/src/options.rs @@ -2,7 +2,7 @@ use std::{fs, io::ErrorKind, path::PathBuf, str::FromStr}; use anyhow::{anyhow, Error, Result}; use lazy_static::lazy_static; -use lychee_lib::collector::Input; +use lychee_lib::Input; use reqwest::Url; use serde::Deserialize; use structopt::{clap::crate_version, StructOpt}; diff --git a/lychee-bin/tests/local_files.rs b/lychee-bin/tests/local_files.rs new file mode 100644 index 0000000000..ddd0ed25e1 --- /dev/null +++ b/lychee-bin/tests/local_files.rs @@ -0,0 +1,37 @@ +#[cfg(test)] +mod cli { + use std::{fs::File, io::Write}; + + use assert_cmd::Command; + use lychee_lib::Result; + use predicates::str::contains; + + fn main_command() -> Command { + // this gets the "main" binary name (e.g. 
`lychee`) + Command::cargo_bin(env!("CARGO_PKG_NAME")).expect("Couldn't get cargo package name") + } + + #[tokio::test] + async fn test_local_file() -> Result<()> { + let dir = tempfile::tempdir()?; + let index_path = dir.path().join("index.html"); + let mut index = File::create(&index_path)?; + writeln!(index, r#"Foo"#)?; + + let foo_path = dir.path().join("foo.html"); + let mut foo = File::create(&foo_path)?; + writeln!(foo, r#"example"#)?; + + let mut cmd = main_command(); + cmd.arg(index_path) + .arg("--no-progress") + .arg("--verbose") + .env_clear() + .assert() + .success() + .stdout(contains("Total............1")) + .stdout(contains("example.org")); + + Ok(()) + } +} diff --git a/lychee-lib/Cargo.toml b/lychee-lib/Cargo.toml index cc5dd6d670..ab2c29c303 100644 --- a/lychee-lib/Cargo.toml +++ b/lychee-lib/Cargo.toml @@ -40,6 +40,7 @@ shellexpand = "2.1.0" tokio = { version = "1.6.0", features = ["full"] } typed-builder = "0.9.1" url = { version = "2.2.2", features = ["serde"] } +log = "0.4.14" [dev-dependencies] doc-comment = "0.3.3" diff --git a/lychee-lib/src/collector.rs b/lychee-lib/src/collector.rs index 67fb090253..31416cc991 100644 --- a/lychee-lib/src/collector.rs +++ b/lychee-lib/src/collector.rs @@ -1,218 +1,6 @@ -use std::{ - collections::HashSet, - fmt::Display, - path::{Path, PathBuf}, -}; - -use glob::glob_with; +use crate::{extract::extract_links, uri::Uri, Input, Request, Result}; use reqwest::Url; -use serde::Serialize; -use shellexpand::tilde; -use tokio::{ - fs::read_to_string, - io::{stdin, AsyncReadExt}, -}; - -use crate::{ - extract::{extract_links, FileType}, - uri::Uri, - Request, Result, -}; - -const STDIN: &str = "-"; -#[derive(Debug, Clone, PartialEq, Eq, Hash)] -#[non_exhaustive] -/// An exhaustive list of input sources, which lychee accepts -pub enum Input { - /// URL (of HTTP/HTTPS scheme). - RemoteUrl(Box), - /// Unix shell-style glob pattern. 
- FsGlob { - /// The glob pattern matching all input files - pattern: String, - /// Don't be case sensitive when matching files against a glob - ignore_case: bool, - }, - /// File path. - FsPath(PathBuf), - /// Standard Input. - Stdin, - /// Raw string input. - String(String), -} - -impl Serialize for Input { - fn serialize(&self, serializer: S) -> std::result::Result - where - S: serde::Serializer, - { - serializer.collect_str(self) - } -} - -impl Display for Input { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - f.write_str(match self { - Input::RemoteUrl(url) => url.as_str(), - Input::FsGlob { pattern, .. } => pattern, - Input::FsPath(path) => path.to_str().unwrap_or_default(), - Input::Stdin => "stdin", - Input::String(_) => "raw input string", - }) - } -} - -#[derive(Debug)] -/// Encapsulates the content for a given input -pub struct InputContent { - /// Input source - pub input: Input, - /// File type of given input - pub file_type: FileType, - /// Raw UTF-8 string content - pub content: String, -} - -impl InputContent { - #[must_use] - /// Create an instance of `InputContent` from an input string - pub fn from_string(s: &str, file_type: FileType) -> Self { - // TODO: consider using Cow (to avoid one .clone() for String types) - Self { - input: Input::String(s.to_owned()), - file_type, - content: s.to_owned(), - } - } -} - -impl Input { - #[must_use] - /// Construct a new `Input` source. 
In case the input is a `glob` pattern, - /// `glob_ignore_case` decides whether matching files against the `glob` is - /// case-insensitive or not - pub fn new(value: &str, glob_ignore_case: bool) -> Self { - if value == STDIN { - Self::Stdin - } else if let Ok(url) = Url::parse(value) { - Self::RemoteUrl(Box::new(url)) - } else { - // this seems to be the only way to determine if this is a glob pattern - let is_glob = glob::Pattern::escape(value) != value; - - if is_glob { - Self::FsGlob { - pattern: value.to_owned(), - ignore_case: glob_ignore_case, - } - } else { - Self::FsPath(value.into()) - } - } - } - - #[allow(clippy::missing_panics_doc)] - /// Retrieve the contents from the input - /// - /// # Errors - /// - /// Returns an error if the contents can not be retrieved - /// because of an underlying I/O error (e.g. an error while making a - /// network request or retrieving the contents from the file system) - pub async fn get_contents( - &self, - file_type_hint: Option, - skip_missing: bool, - ) -> Result> { - match *self { - // TODO: should skip_missing also affect URLs? 
- Input::RemoteUrl(ref url) => Ok(vec![Self::url_contents(url).await?]), - Input::FsGlob { - ref pattern, - ignore_case, - } => Ok(Self::glob_contents(pattern, ignore_case).await?), - Input::FsPath(ref path) => { - let content = Self::path_content(path).await; - match content { - Ok(input_content) => Ok(vec![input_content]), - Err(_) if skip_missing => Ok(vec![]), - Err(e) => Err(e), - } - } - Input::Stdin => Ok(vec![Self::stdin_content(file_type_hint).await?]), - Input::String(ref s) => Ok(vec![Self::string_content(s, file_type_hint)]), - } - } - - async fn url_contents(url: &Url) -> Result { - // Assume HTML for default paths - let file_type = if url.path().is_empty() || url.path() == "/" { - FileType::Html - } else { - FileType::from(url.as_str()) - }; - - let res = reqwest::get(url.clone()).await?; - let input_content = InputContent { - input: Input::RemoteUrl(Box::new(url.clone())), - file_type, - content: res.text().await?, - }; - - Ok(input_content) - } - - async fn glob_contents(path_glob: &str, ignore_case: bool) -> Result> { - let mut contents = vec![]; - let glob_expanded = tilde(&path_glob); - let mut match_opts = glob::MatchOptions::new(); - - match_opts.case_sensitive = !ignore_case; - - for entry in glob_with(&glob_expanded, match_opts)? 
{ - match entry { - Ok(path) => { - let content = Self::path_content(&path).await?; - contents.push(content); - } - Err(e) => println!("{:?}", e), - } - } - - Ok(contents) - } - - async fn path_content + AsRef + Clone>(path: P) -> Result { - let content = read_to_string(&path) - .await - .map_err(|e| (path.clone().into(), e))?; - let input_content = InputContent { - file_type: FileType::from(path.as_ref()), - content, - input: Input::FsPath(path.into()), - }; - - Ok(input_content) - } - - async fn stdin_content(file_type_hint: Option) -> Result { - let mut content = String::new(); - let mut stdin = stdin(); - stdin.read_to_string(&mut content).await?; - - let input_content = InputContent { - input: Input::Stdin, - file_type: file_type_hint.unwrap_or_default(), - content, - }; - - Ok(input_content) - } - - fn string_content(s: &str, file_type_hint: Option) -> InputContent { - InputContent::from_string(s, file_type_hint.unwrap_or_default()) - } -} +use std::collections::HashSet; /// Collector keeps the state of link collection #[derive(Debug, Clone)] @@ -278,7 +66,7 @@ impl Collector { for handle in extract_links_handles { let new_links = handle.await?; - links.extend(new_links); + links.extend(new_links?); } // Filter out already cached links (duplicates) @@ -304,9 +92,9 @@ mod test { use super::*; use crate::{ - extract::FileType, mock_server, test_utils::{mail, website}, + types::{FileType, Input}, Result, Uri, }; diff --git a/lychee-lib/src/extract.rs b/lychee-lib/src/extract.rs index 32f24441f2..31c9e3cfab 100644 --- a/lychee-lib/src/extract.rs +++ b/lychee-lib/src/extract.rs @@ -1,51 +1,19 @@ -use std::{collections::HashSet, convert::TryFrom, path::Path}; +use std::{collections::HashSet, convert::TryFrom, path::PathBuf}; use html5ever::{ parse_document, tendril::{StrTendril, TendrilSink}, }; use linkify::LinkFinder; +use log::info; use markup5ever_rcdom::{Handle, NodeData, RcDom}; use pulldown_cmark::{Event as MDEvent, Parser, Tag}; use url::Url; -use 
crate::{collector::InputContent, Request, Uri}; - -#[derive(Copy, Clone, Debug, PartialEq, Eq)] -/// `FileType` defines which file types lychee can handle -pub enum FileType { - /// File in HTML format - Html, - /// File in Markdown format - Markdown, - /// Generic text file without syntax-specific parsing - Plaintext, -} - -impl Default for FileType { - fn default() -> Self { - Self::Plaintext - } -} - -impl> From

for FileType { - /// Detect if the given path points to a Markdown, HTML, or plaintext file. - fn from(p: P) -> FileType { - let path = p.as_ref(); - // Assume HTML in case of no extension. - // Note: this is only reasonable for URLs; not paths on disk. - // For example, `README` without an extension is more likely to be a plaintext file. - // A better solution would be to also implement `From for FileType`. - // Unfortunately that's not possible without refactoring, as - // `AsRef` could be implemented for `Url` in the future, which is why - // `From for FileType` is not allowed. - match path.extension().and_then(std::ffi::OsStr::to_str) { - Some("md" | "markdown") => FileType::Markdown, - Some("htm" | "html") | None => FileType::Html, - Some(_) => FileType::Plaintext, - } - } -} +use crate::{ + types::{FileType, InputContent}, + Input, Request, Result, Uri, +}; // Use LinkFinder here to offload the actual link searching in plaintext. fn find_links(input: &str) -> Vec { @@ -140,7 +108,7 @@ fn extract_links_from_plaintext(input: &str) -> Vec { pub(crate) fn extract_links( input_content: &InputContent, base_url: &Option, -) -> HashSet { +) -> Result> { let links = match input_content.file_type { FileType::Markdown => extract_links_from_markdown(&input_content.content), FileType::Html => extract_links_from_html(&input_content.content), @@ -153,16 +121,23 @@ pub(crate) fn extract_links( for link in links { if let Ok(uri) = Uri::try_from(link.as_str()) { requests.insert(Request::new(uri, input_content.input.clone())); - } else if !Path::new(&link).exists() { - if let Some(new_url) = base_url.as_ref().and_then(|u| u.join(&link).ok()) { - requests.insert(Request::new( - Uri { url: new_url }, - input_content.input.clone(), - )); + } else if let Some(new_url) = base_url.as_ref().and_then(|u| u.join(&link).ok()) { + requests.insert(Request::new( + Uri { url: new_url }, + input_content.input.clone(), + )); + } else if let Input::FsPath(root) = &input_content.input { + if let 
Ok(path) = crate::fs_tree::find(&root, &PathBuf::from(&link)) { + let input_content = Input::path_content(path)?; + requests.extend(extract_links(&input_content, base_url)?); + } else { + info!("Cannot find path to {} in filesystem", &link); } - }; + } else { + info!("Handling of {} not implemented yet", &link); + } } - requests + Ok(requests) } #[cfg(test)] @@ -180,10 +155,10 @@ mod test { use super::{ extract_links, extract_links_from_html, extract_links_from_markdown, - extract_links_from_plaintext, find_links, FileType, + extract_links_from_plaintext, find_links, }; + use crate::types::{FileType, InputContent}; use crate::{ - collector::InputContent, test_utils::{mail, website}, Uri, }; @@ -211,6 +186,8 @@ mod test { &InputContent::from_string(input, file_type), &base_url.map(|u| Url::parse(u).unwrap()), ) + // unwrap is fine here as this helper function is only used in tests + .unwrap() .into_iter() .map(|r| r.uri) .collect() diff --git a/lychee-lib/src/fs_tree.rs b/lychee-lib/src/fs_tree.rs new file mode 100644 index 0000000000..a1d9bd40d6 --- /dev/null +++ b/lychee-lib/src/fs_tree.rs @@ -0,0 +1,135 @@ +use crate::{ErrorKind, Result}; +use std::path::PathBuf; + +pub(crate) fn find(root: &PathBuf, dst: &PathBuf) -> Result { + if dst.exists() { + return Ok(dst.clone()); + } + if dst.is_dir() { + return Err(ErrorKind::FileNotFound(dst.clone())); + } + // Find `dst` in the `root` path + if let Some(parent) = root.parent() { + let rel = parent.join(dst); + if rel.exists() { + return Ok(rel); + } + } + return Err(ErrorKind::FileNotFound(dst.clone())); +} + +#[cfg(test)] +mod test_fs_tree { + use std::fs::File; + + use super::*; + use crate::Result; + + // dummy root + // /path/to/foo.html + #[test] + fn test_find_absolute() -> Result<()> { + let dummy = PathBuf::new(); + let dir = tempfile::tempdir()?; + let dst = dir.path().join("foo.html"); + File::create(&dst)?; + assert_eq!(find(&dummy, &dst)?, dst); + Ok(()) + } + + // index.html + // ./foo.html + #[test] + fn 
test_find_relative() -> Result<()> { + let root = PathBuf::from("index.html"); + let dir = tempfile::tempdir()?; + let dst = dir.path().join("./foo.html"); + File::create(&dst)?; + assert_eq!(find(&root, &dst)?, dst); + Ok(()) + } + + // ./index.html + // ./foo.html + #[test] + fn test_find_relative_index() -> Result<()> { + let root = PathBuf::from("./index.html"); + let dir = tempfile::tempdir()?; + let dst = dir.path().join("./foo.html"); + File::create(&dst)?; + assert_eq!(find(&root, &dst)?, dst); + Ok(()) + } + + #[test] + fn test_find_relative_nonexistent() -> Result<()> { + let root = PathBuf::from("index.html"); + // This file does not exist + let dst = PathBuf::from("./foo.html"); + assert!(find(&root, &dst).is_err()); + Ok(()) + } + + // /path/to/index.html + // ./foo.html + #[test] + fn test_find_relative_from_absolute() -> Result<()> { + let dir = tempfile::tempdir()?; + let root = dir.path().join("index.html"); + // We create the absolute path to foo.html, + // but we address it under its relative path + let dst = PathBuf::from("./foo.html"); + let dst_absolute = dir.path().join("./foo.html"); + File::create(&dst_absolute)?; + assert_eq!(find(&root, &dst)?, dst_absolute); + Ok(()) + } + + // /path/to/index.html + // ./foo.html (non-existent) + #[test] + fn test_find_relative_from_absolute_nonexistent() -> Result<()> { + let dir = tempfile::tempdir()?; + let root = dir.path().join("index.html"); + // We create the absolute path to foo.html, + // but we address it under its relative path + let dst = PathBuf::from("./foo.html"); + assert!(find(&root, &dst).is_err()); + Ok(()) + } + + // /path/to/index.html + // /other/path/to/foo.html + #[test] + fn test_find_absolute_from_absolute() -> Result<()> { + let root = PathBuf::from("/path/to/index.html"); + let dir = tempfile::tempdir()?; + let dst = dir.path().join("foo.html"); + File::create(&dst)?; + assert_eq!(find(&root, &dst)?, dst); + Ok(()) + } + + // /path/to + // /other/path/to/foo.html + #[test] + 
fn test_root_is_dir() -> Result<()> { + let root = PathBuf::from("/path/to/"); + let dir = tempfile::tempdir()?; + let dst = dir.path().join("foo.html"); + File::create(&dst)?; + assert_eq!(find(&root, &dst)?, dst); + Ok(()) + } + + // /path/to/index.html + // /other/path/to + #[test] + fn test_dst_is_dir() -> Result<()> { + let root = PathBuf::from("/path/to/"); + let dir = tempfile::tempdir()?; + File::create(&dir)?; + assert!(find(&root, &dir.into_path()).is_err()); + Ok(()) + } +} diff --git a/lychee-lib/src/lib.rs b/lychee-lib/src/lib.rs index 71ba9d6986..169b3d48eb 100644 --- a/lychee-lib/src/lib.rs +++ b/lychee-lib/src/lib.rs @@ -47,13 +47,13 @@ doc_comment::doctest!("../../README.md"); mod client; mod client_pool; +/// A pool of clients, to handle concurrent checks +pub mod collector; +mod fs_tree; mod quirks; mod types; mod uri; -/// A pool of clients, to handle concurrent checks -pub mod collector; - /// Functionality to extract URIs from inputs pub mod extract; @@ -75,8 +75,8 @@ use ring as _; // required for apple silicon pub use crate::{ client::{check, ClientBuilder}, client_pool::ClientPool, - collector::{Collector, Input}, + collector::Collector, filter::{Excludes, Filter, Includes}, - types::{ErrorKind, Request, Response, ResponseBody, Result, Status}, + types::{ErrorKind, Input, Request, Response, ResponseBody, Result, Status}, uri::Uri, }; diff --git a/lychee-lib/src/types/error.rs b/lychee-lib/src/types/error.rs index 0710f5ea0f..89c60c5912 100644 --- a/lychee-lib/src/types/error.rs +++ b/lychee-lib/src/types/error.rs @@ -25,6 +25,8 @@ pub enum ErrorKind { /// A possible error when converting a `HeaderValue` from a string or byte /// slice. InvalidHeader(InvalidHeaderValue), + /// Cannot find local file + FileNotFound(PathBuf), /// The given UNIX glob pattern is invalid InvalidGlobPattern(glob::PatternError), /// The Github API could not be called because of a missing Github token. 
@@ -63,6 +65,7 @@ impl Hash for ErrorKind { Self::IoError(p, e) => (p, e.kind()).hash(state), Self::ReqwestError(e) => e.to_string().hash(state), Self::HubcapsError(e) => e.to_string().hash(state), + Self::FileNotFound(e) => e.to_string_lossy().hash(state), Self::UrlParseError(s, e) => (s, e.type_id()).hash(state), Self::UnreachableEmailAddress(u) | Self::InsecureURL(u) => u.hash(state), Self::InvalidHeader(e) => e.to_string().hash(state), @@ -84,6 +87,7 @@ impl Display for ErrorKind { Self::IoError(None, e) => e.fmt(f), Self::ReqwestError(e) => e.fmt(f), Self::HubcapsError(e) => e.fmt(f), + Self::FileNotFound(e) => write!(f, "{}", e.to_string_lossy()), Self::UrlParseError(s, (url_err, Some(mail_err))) => { write!( f, diff --git a/lychee-lib/src/types/file.rs b/lychee-lib/src/types/file.rs new file mode 100644 index 0000000000..1afe52504f --- /dev/null +++ b/lychee-lib/src/types/file.rs @@ -0,0 +1,37 @@ +use std::path::Path; + +#[derive(Copy, Clone, Debug, PartialEq, Eq)] +/// `FileType` defines which file types lychee can handle +pub enum FileType { + /// File in HTML format + Html, + /// File in Markdown format + Markdown, + /// Generic text file without syntax-specific parsing + Plaintext, +} + +impl Default for FileType { + fn default() -> Self { + Self::Plaintext + } +} + +impl> From

for FileType { + /// Detect if the given path points to a Markdown, HTML, or plaintext file. + fn from(p: P) -> FileType { + let path = p.as_ref(); + // Assume HTML in case of no extension. + // Note: this is only reasonable for URLs; not paths on disk. + // For example, `README` without an extension is more likely to be a plaintext file. + // A better solution would be to also implement `From for FileType`. + // Unfortunately that's not possible without refactoring, as + // `AsRef` could be implemented for `Url` in the future, which is why + // `From for FileType` is not allowed. + match path.extension().and_then(std::ffi::OsStr::to_str) { + Some("md") | Some("markdown") => FileType::Markdown, + Some("htm") | Some("html") | None => FileType::Html, + Some(_) => FileType::Plaintext, + } + } +} diff --git a/lychee-lib/src/types/input.rs b/lychee-lib/src/types/input.rs new file mode 100644 index 0000000000..bc8178275b --- /dev/null +++ b/lychee-lib/src/types/input.rs @@ -0,0 +1,203 @@ +use crate::types::FileType; +use crate::Result; +use glob::glob_with; +use reqwest::Url; +use serde::Serialize; +use shellexpand::tilde; +use std::path::{Path, PathBuf}; +use std::{fmt::Display, fs::read_to_string}; +use tokio::io::{stdin, AsyncReadExt}; + +const STDIN: &str = "-"; +#[derive(Debug, Clone, PartialEq, Eq, Hash)] +#[non_exhaustive] +/// An exhaustive list of input sources, which lychee accepts +pub enum Input { + /// URL (of HTTP/HTTPS scheme). + RemoteUrl(Box), + /// Unix shell-style glob pattern. + FsGlob { + /// The glob pattern matching all input files + pattern: String, + /// Don't be case sensitive when matching files against a glob + ignore_case: bool, + }, + /// File path. + FsPath(PathBuf), + /// Standard Input. + Stdin, + /// Raw string input. 
+ String(String), +} + +impl Serialize for Input { + fn serialize(&self, serializer: S) -> std::result::Result + where + S: serde::Serializer, + { + serializer.collect_str(self) + } +} + +impl Display for Input { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.write_str(match self { + Input::RemoteUrl(url) => url.as_str(), + Input::FsGlob { pattern, .. } => pattern, + Input::FsPath(path) => path.to_str().unwrap_or_default(), + Input::Stdin => "stdin", + Input::String(_) => "raw input string", + }) + } +} + +#[derive(Debug)] +/// Encapsulates the content for a given input +pub struct InputContent { + /// Input source + pub input: Input, + /// File type of given input + pub file_type: FileType, + /// Raw UTF-8 string content + pub content: String, +} + +impl InputContent { + #[must_use] + /// Create an instance of `InputContent` from an input string + pub fn from_string(s: &str, file_type: FileType) -> Self { + // TODO: consider using Cow (to avoid one .clone() for String types) + Self { + input: Input::String(s.to_owned()), + file_type, + content: s.to_owned(), + } + } +} + +impl Input { + #[must_use] + /// Construct a new `Input` source. 
In case the input is a `glob` pattern, + /// `glob_ignore_case` decides whether matching files against the `glob` is + /// case-insensitive or not + pub fn new(value: &str, glob_ignore_case: bool) -> Self { + if value == STDIN { + Self::Stdin + } else if let Ok(url) = Url::parse(&value) { + Self::RemoteUrl(Box::new(url)) + } else { + // this seems to be the only way to determine if this is a glob pattern + let is_glob = glob::Pattern::escape(value) != value; + + if is_glob { + Self::FsGlob { + pattern: value.to_owned(), + ignore_case: glob_ignore_case, + } + } else { + Self::FsPath(value.into()) + } + } + } + + #[allow(clippy::missing_panics_doc)] + /// Retrieve the contents from the input + /// + /// # Errors + /// + /// Returns an error if the contents can not be retrieved + /// because of an underlying I/O error (e.g. an error while making a + /// network request or retrieving the contents from the file system) + pub async fn get_contents( + &self, + file_type_hint: Option, + skip_missing: bool, + ) -> Result> { + match *self { + // TODO: should skip_missing also affect URLs? 
+ Input::RemoteUrl(ref url) => Ok(vec![Self::url_contents(url).await?]), + Input::FsGlob { + ref pattern, + ignore_case, + } => Ok(Self::glob_contents(pattern, ignore_case).await?), + Input::FsPath(ref path) => { + let content = Self::path_content(path); + match content { + Ok(input_content) => Ok(vec![input_content]), + Err(_) if skip_missing => Ok(vec![]), + Err(e) => Err(e), + } + } + Input::Stdin => Ok(vec![Self::stdin_content(file_type_hint).await?]), + Input::String(ref s) => Ok(vec![Self::string_content(s, file_type_hint)]), + } + } + + async fn url_contents(url: &Url) -> Result { + // Assume HTML for default paths + let file_type = if url.path().is_empty() || url.path() == "/" { + FileType::Html + } else { + FileType::from(url.as_str()) + }; + + let res = reqwest::get(url.clone()).await?; + let input_content = InputContent { + input: Input::RemoteUrl(Box::new(url.clone())), + file_type, + content: res.text().await?, + }; + + Ok(input_content) + } + + async fn glob_contents(path_glob: &str, ignore_case: bool) -> Result> { + let mut contents = vec![]; + let glob_expanded = tilde(&path_glob); + let mut match_opts = glob::MatchOptions::new(); + + match_opts.case_sensitive = !ignore_case; + + for entry in glob_with(&glob_expanded, match_opts)? 
{ + match entry { + Ok(path) => { + let content = Self::path_content(&path)?; + contents.push(content); + } + Err(e) => println!("{:?}", e), + } + } + + Ok(contents) + } + + /// Get the input content of a given path + pub fn path_content + AsRef + Clone>(path: P) -> Result { + let content = read_to_string(&path).map_err(|e| (path.clone().into(), e))?; + let input_content = InputContent { + file_type: FileType::from(path.as_ref()), + content, + input: Input::FsPath(path.into()), + }; + + Ok(input_content) + } + + async fn stdin_content(file_type_hint: Option) -> Result { + let mut content = String::new(); + let mut stdin = stdin(); + stdin.read_to_string(&mut content).await?; + + let input_content = InputContent { + input: Input::Stdin, + file_type: file_type_hint.unwrap_or_default(), + content, + }; + + Ok(input_content) + } + + fn string_content(s: &str, file_type_hint: Option) -> InputContent { + InputContent::from_string(s, file_type_hint.unwrap_or_default()) + } +} diff --git a/lychee-lib/src/types/mod.rs b/lychee-lib/src/types/mod.rs index a48f7a90fc..552b87fc19 100644 --- a/lychee-lib/src/types/mod.rs +++ b/lychee-lib/src/types/mod.rs @@ -1,11 +1,15 @@ #![allow(unreachable_pub)] mod error; +mod file; +mod input; mod request; mod response; mod status; pub use error::ErrorKind; +pub use file::FileType; +pub use input::{Input, InputContent}; pub use request::Request; pub use response::{Response, ResponseBody}; pub use status::Status; From d5bb7ee7d7c50dea96d7363ac9f802f0e7c24876 Mon Sep 17 00:00:00 2001 From: Matthias Endler Date: Thu, 17 Jun 2021 18:12:07 +0200 Subject: [PATCH 02/46] Or Patterns (Rust 1.53) --- lychee-lib/src/extract.rs | 4 +--- lychee-lib/src/fs_tree.rs | 12 ++++++------ lychee-lib/src/lib.rs | 1 + lychee-lib/src/types/file.rs | 4 ++-- lychee-lib/src/types/input.rs | 3 +++ 5 files changed, 13 insertions(+), 11 deletions(-) diff --git a/lychee-lib/src/extract.rs b/lychee-lib/src/extract.rs index 31c9e3cfab..1dd310b1e9 100644 --- 
a/lychee-lib/src/extract.rs +++ b/lychee-lib/src/extract.rs @@ -26,9 +26,7 @@ fn extract_links_from_markdown(input: &str) -> Vec { let parser = Parser::new(input); parser .flat_map(|event| match event { - MDEvent::Start(Tag::Link(_, url, _) | Tag::Image(_, url, _)) => { - vec![url.to_string()] - } + MDEvent::Start(Tag::Link(_, url, _) | Tag::Image(_, url, _)) => vec![url.to_string()], MDEvent::Text(txt) => extract_links_from_plaintext(&txt.to_string()), MDEvent::Html(html) => extract_links_from_html(&html.to_string()), _ => vec![], diff --git a/lychee-lib/src/fs_tree.rs b/lychee-lib/src/fs_tree.rs index a1d9bd40d6..b3255ead22 100644 --- a/lychee-lib/src/fs_tree.rs +++ b/lychee-lib/src/fs_tree.rs @@ -1,21 +1,21 @@ use crate::{ErrorKind, Result}; -use std::path::PathBuf; +use std::path::{Path, PathBuf}; -pub(crate) fn find(root: &PathBuf, dst: &PathBuf) -> Result { +pub(crate) fn find(root: &Path, dst: &Path) -> Result { if dst.exists() { - return Ok(dst.clone()); + return Ok(dst.to_path_buf()); } if dst.is_dir() { - return Err(ErrorKind::FileNotFound(dst.clone())); + return Err(ErrorKind::FileNotFound(dst.into())); } // Find `dst` in the `root` path if let Some(parent) = root.parent() { - let rel = parent.join(dst); + let rel = parent.join(dst.to_path_buf()); if rel.exists() { return Ok(rel); } } - return Err(ErrorKind::FileNotFound(dst.clone())); + Err(ErrorKind::FileNotFound(dst.to_path_buf())) } #[cfg(test)] diff --git a/lychee-lib/src/lib.rs b/lychee-lib/src/lib.rs index 169b3d48eb..62257d5dc1 100644 --- a/lychee-lib/src/lib.rs +++ b/lychee-lib/src/lib.rs @@ -41,6 +41,7 @@ )] #![deny(anonymous_parameters, macro_use_extern_crate, pointer_structural_match)] #![deny(missing_docs)] +#![allow(clippy::module_name_repetitions)] #[cfg(doctest)] doc_comment::doctest!("../../README.md"); diff --git a/lychee-lib/src/types/file.rs b/lychee-lib/src/types/file.rs index 1afe52504f..d0d9510024 100644 --- a/lychee-lib/src/types/file.rs +++ b/lychee-lib/src/types/file.rs @@ -29,8 
+29,8 @@ impl> From

for FileType { // `AsRef` could be implemented for `Url` in the future, which is why // `From for FileType` is not allowed. match path.extension().and_then(std::ffi::OsStr::to_str) { - Some("md") | Some("markdown") => FileType::Markdown, - Some("htm") | Some("html") | None => FileType::Html, + Some("md" | "markdown") => FileType::Markdown, + Some("htm" | "html") | None => FileType::Html, Some(_) => FileType::Plaintext, } } diff --git a/lychee-lib/src/types/input.rs b/lychee-lib/src/types/input.rs index bc8178275b..20a9f2f9e9 100644 --- a/lychee-lib/src/types/input.rs +++ b/lychee-lib/src/types/input.rs @@ -172,6 +172,9 @@ impl Input { } /// Get the input content of a given path + /// # Errors + /// + /// Will return `Err` if file contents can't be read pub fn path_content + AsRef + Clone>(path: P) -> Result { let content = read_to_string(&path).map_err(|e| (path.clone().into(), e))?; let input_content = InputContent { From f9bf52ef10be804193bb394cddcdaa0952fb5c84 Mon Sep 17 00:00:00 2001 From: Matthias Date: Sun, 20 Jun 2021 18:58:20 +0200 Subject: [PATCH 03/46] Add support for base_dir --- examples/collect_links/collect_links.rs | 4 +- lychee-bin/src/main.rs | 13 ++++-- lychee-bin/src/options.rs | 7 ++- lychee-lib/src/collector.rs | 19 ++++++--- lychee-lib/src/extract.rs | 7 ++- lychee-lib/src/fs_tree.rs | 57 ++++++++++++++++++------- 6 files changed, 78 insertions(+), 29 deletions(-) diff --git a/examples/collect_links/collect_links.rs b/examples/collect_links/collect_links.rs index fbff5f5316..fc97cbd390 100644 --- a/examples/collect_links/collect_links.rs +++ b/examples/collect_links/collect_links.rs @@ -14,8 +14,8 @@ async fn main() -> Result<()> { ]; let links = Collector::new( - None, // base_url - false, // don't skip missing inputs + None, // base_url + None, false, // don't skip missing inputs 10, // max concurrency ) .collect_links( diff --git a/lychee-bin/src/main.rs b/lychee-bin/src/main.rs index 535fa1a9f1..585159f76f 100644 --- 
a/lychee-bin/src/main.rs +++ b/lychee-bin/src/main.rs @@ -197,10 +197,15 @@ async fn run(cfg: &Config, inputs: Vec) -> Result { .client() .map_err(|e| anyhow!(e))?; - let links = Collector::new(cfg.base_url.clone(), cfg.skip_missing, max_concurrency) - .collect_links(&inputs) - .await - .map_err(|e| anyhow!(e))?; + let links = Collector::new( + cfg.base_url.clone(), + cfg.base_dir.clone(), + cfg.skip_missing, + max_concurrency, + ) + .collect_links(&inputs) + .await + .map_err(|e| anyhow!(e))?; let pb = if cfg.no_progress { None diff --git a/lychee-bin/src/options.rs b/lychee-bin/src/options.rs index ab463df66b..07b62d2c23 100644 --- a/lychee-bin/src/options.rs +++ b/lychee-bin/src/options.rs @@ -218,7 +218,12 @@ pub(crate) struct Config { pub(crate) method: String, /// Base URL to check relative URLs - #[structopt(short, long, parse(try_from_str))] + #[structopt(long, parse(try_from_str))] + #[serde(default)] + pub(crate) base_dir: Option, + + /// Base URL to check relative URLs + #[structopt(long, parse(try_from_str))] #[serde(default)] pub(crate) base_url: Option, diff --git a/lychee-lib/src/collector.rs b/lychee-lib/src/collector.rs index 31416cc991..232ddb069f 100644 --- a/lychee-lib/src/collector.rs +++ b/lychee-lib/src/collector.rs @@ -1,11 +1,12 @@ use crate::{extract::extract_links, uri::Uri, Input, Request, Result}; use reqwest::Url; -use std::collections::HashSet; +use std::{collections::HashSet, path::PathBuf}; /// Collector keeps the state of link collection #[derive(Debug, Clone)] pub struct Collector { base_url: Option, + base_dir: Option, skip_missing_inputs: bool, max_concurrency: usize, cache: HashSet, @@ -14,9 +15,15 @@ pub struct Collector { impl Collector { /// Create a new collector with an empty cache #[must_use] - pub fn new(base_url: Option, skip_missing_inputs: bool, max_concurrency: usize) -> Self { + pub fn new( + base_url: Option, + base_dir: Option, + skip_missing_inputs: bool, + max_concurrency: usize, + ) -> Self { Collector { 
base_url, + base_dir, skip_missing_inputs, max_concurrency, cache: HashSet::new(), @@ -52,8 +59,10 @@ impl Collector { while let Some(result) = contents_rx.recv().await { for input_content in result? { let base_url = self.base_url.clone(); - let handle = - tokio::task::spawn_blocking(move || extract_links(&input_content, &base_url)); + let base_dir = self.base_dir.clone(); + let handle = tokio::task::spawn_blocking(move || { + extract_links(&input_content, &base_url, &base_dir) + }); extract_links_handles.push(handle); } } @@ -160,7 +169,7 @@ mod test { }, ]; - let responses = Collector::new(None, false, 8) + let responses = Collector::new(None, None, false, 8) .collect_links(&inputs) .await?; let mut links = responses.into_iter().map(|r| r.uri).collect::>(); diff --git a/lychee-lib/src/extract.rs b/lychee-lib/src/extract.rs index 1dd310b1e9..5bfafaeb60 100644 --- a/lychee-lib/src/extract.rs +++ b/lychee-lib/src/extract.rs @@ -11,6 +11,7 @@ use pulldown_cmark::{Event as MDEvent, Parser, Tag}; use url::Url; use crate::{ + fs_tree, types::{FileType, InputContent}, Input, Request, Result, Uri, }; @@ -106,6 +107,7 @@ fn extract_links_from_plaintext(input: &str) -> Vec { pub(crate) fn extract_links( input_content: &InputContent, base_url: &Option, + base_dir: &Option, ) -> Result> { let links = match input_content.file_type { FileType::Markdown => extract_links_from_markdown(&input_content.content), @@ -125,9 +127,9 @@ pub(crate) fn extract_links( input_content.input.clone(), )); } else if let Input::FsPath(root) = &input_content.input { - if let Ok(path) = crate::fs_tree::find(&root, &PathBuf::from(&link)) { + if let Ok(path) = fs_tree::find(&root, &PathBuf::from(&link), base_dir) { let input_content = Input::path_content(path)?; - requests.extend(extract_links(&input_content, base_url)?); + requests.extend(extract_links(&input_content, base_url, base_dir)?); } else { info!("Cannot find path to {} in filesystem", &link); } @@ -183,6 +185,7 @@ mod test { extract_links( 
&InputContent::from_string(input, file_type), &base_url.map(|u| Url::parse(u).unwrap()), + &None, ) // unwrap is fine here as this helper function is only used in tests .unwrap() diff --git a/lychee-lib/src/fs_tree.rs b/lychee-lib/src/fs_tree.rs index b3255ead22..44c9791356 100644 --- a/lychee-lib/src/fs_tree.rs +++ b/lychee-lib/src/fs_tree.rs @@ -1,18 +1,30 @@ use crate::{ErrorKind, Result}; use std::path::{Path, PathBuf}; -pub(crate) fn find(root: &Path, dst: &Path) -> Result { +pub(crate) fn find(src: &Path, dst: &Path, base_dir: &Option) -> Result { if dst.exists() { return Ok(dst.to_path_buf()); } if dst.is_dir() { return Err(ErrorKind::FileNotFound(dst.into())); } - // Find `dst` in the `root` path - if let Some(parent) = root.parent() { - let rel = parent.join(dst.to_path_buf()); - if rel.exists() { - return Ok(rel); + if dst.is_absolute() { + // Absolute local links (leading slash) require the base_url to + // define the document root. + if let Some(base_dir) = base_dir { + let absolute = base_dir.join(dst.to_path_buf()); + if absolute.exists() { + return Ok(absolute); + } + } + } + if dst.is_relative() { + // Find `dst` in the `root` path + if let Some(parent) = src.parent() { + let relative = parent.join(dst.to_path_buf()); + if relative.exists() { + return Ok(relative); + } } } Err(ErrorKind::FileNotFound(dst.to_path_buf())) @@ -33,7 +45,7 @@ mod test_fs_tree { let dir = tempfile::tempdir()?; let dst = dir.path().join("foo.html"); File::create(&dst)?; - assert_eq!(find(&dummy, &dst)?, dst); + assert_eq!(find(&dummy, &dst, &None)?, dst); Ok(()) } @@ -45,7 +57,7 @@ mod test_fs_tree { let dir = tempfile::tempdir()?; let dst = dir.path().join("./foo.html"); File::create(&dst)?; - assert_eq!(find(&root, &dst)?, dst); + assert_eq!(find(&root, &dst, &None)?, dst); Ok(()) } @@ -57,7 +69,7 @@ mod test_fs_tree { let dir = tempfile::tempdir()?; let dst = dir.path().join("./foo.html"); File::create(&dst)?; - assert_eq!(find(&root, &dst)?, dst); + 
assert_eq!(find(&root, &dst, &None)?, dst); Ok(()) } @@ -66,7 +78,7 @@ mod test_fs_tree { let root = PathBuf::from("index.html"); // This file does not exist let dst = PathBuf::from("./foo.html"); - assert!(find(&root, &dst).is_err()); + assert!(find(&root, &dst, &None).is_err()); Ok(()) } @@ -81,7 +93,22 @@ mod test_fs_tree { let dst = PathBuf::from("./foo.html"); let dst_absolute = dir.path().join("./foo.html"); File::create(&dst_absolute)?; - assert_eq!(find(&root, &dst)?, dst_absolute); + assert_eq!(find(&root, &dst, &None)?, dst_absolute); + Ok(()) + } + + // dummy + // ./foo.html + // valid base dir + #[test] + fn test_find_absolute_from_base_dir() -> Result<()> { + let dummy = PathBuf::new(); + let dir = tempfile::tempdir()?; + let dst = dir.path().join("foo.html"); + File::create(&dst)?; + let base_dir = dir.path().to_path_buf(); + let dst_absolute = base_dir.join(dst.to_path_buf()); + assert_eq!(find(&dummy, &dst, &Some(base_dir))?, dst_absolute); Ok(()) } @@ -94,7 +121,7 @@ mod test_fs_tree { // We create the absolute path to foo.html, // but we address it under its relative path let dst = PathBuf::from("./foo.html"); - assert!(find(&root, &dst).is_err()); + assert!(find(&root, &dst, &None).is_err()); Ok(()) } @@ -106,7 +133,7 @@ mod test_fs_tree { let dir = tempfile::tempdir()?; let dst = dir.path().join("foo.html"); File::create(&dst)?; - assert_eq!(find(&root, &dst)?, dst); + assert_eq!(find(&root, &dst, &None)?, dst); Ok(()) } @@ -118,7 +145,7 @@ mod test_fs_tree { let dir = tempfile::tempdir()?; let dst = dir.path().join("foo.html"); File::create(&dst)?; - assert_eq!(find(&root, &dst)?, dst); + assert_eq!(find(&root, &dst, &None)?, dst); Ok(()) } @@ -129,7 +156,7 @@ mod test_fs_tree { let root = PathBuf::from("/path/to/"); let dir = tempfile::tempdir()?; File::create(&dir)?; - assert!(find(&root, &dir.into_path()).is_err()); + assert!(find(&root, &dir.into_path(), &None).is_err()); Ok(()) } } From 4fbd337326f6a0d4651a20bd8bae55496c5fb433 Mon Sep 17 
00:00:00 2001 From: Matthias Date: Wed, 23 Jun 2021 00:14:11 +0200 Subject: [PATCH 04/46] Add install target and fix build phony --- Makefile | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/Makefile b/Makefile index f79461469a..a0b985d870 100644 --- a/Makefile +++ b/Makefile @@ -18,10 +18,14 @@ docker-run: ## Run Docker image docker-push: ## Push image to Docker Hub docker push $(IMAGE_NAME) -.PHONY: build-local +.PHONY: build build: ## Build Rust code locally cargo build +.PHONY: install +install: ## Install project locally + cargo install --path lychee-bin + .PHONY: run run: ## Run Rust code locally cargo run From 185645ac81fbaaf0ca646d41e3ac7e6ff5660006 Mon Sep 17 00:00:00 2001 From: Matthias Date: Wed, 23 Jun 2021 00:14:21 +0200 Subject: [PATCH 05/46] Update docs --- README.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 4d7be79241..326a00c22c 100644 --- a/README.md +++ b/README.md @@ -194,7 +194,8 @@ FLAGS: OPTIONS: -a, --accept Comma-separated list of accepted status codes for valid links - -b, --base-url Base URL to check relative URLs + -b, --base Base URL or website root directory to check relative URLs e.g. + https://example.org or `/path/to/public` --basic-auth Basic authentication support. E.g. `username:password` -c, --config Configuration file to use [default: ./lychee.toml] --exclude ... 
Exclude URLs from checking (supports regex) From bfa3b1b6a1cfb5bd52b081d3ccf626580d780bd6 Mon Sep 17 00:00:00 2001 From: Matthias Date: Wed, 23 Jun 2021 00:18:12 +0200 Subject: [PATCH 06/46] Introduce Base type, which can be a path or URL --- examples/collect_links/collect_links.rs | 6 +- lychee-bin/src/main.rs | 13 +-- lychee-bin/src/options.rs | 23 +++--- lychee-lib/src/collector.rs | 31 +++----- lychee-lib/src/extract.rs | 37 ++++----- lychee-lib/src/fs_tree.rs | 11 ++- lychee-lib/src/lib.rs | 2 +- lychee-lib/src/types/base.rs | 100 ++++++++++++++++++++++++ lychee-lib/src/types/error.rs | 4 + lychee-lib/src/types/mod.rs | 2 + 10 files changed, 162 insertions(+), 67 deletions(-) create mode 100644 lychee-lib/src/types/base.rs diff --git a/examples/collect_links/collect_links.rs b/examples/collect_links/collect_links.rs index fc97cbd390..60c37f9291 100644 --- a/examples/collect_links/collect_links.rs +++ b/examples/collect_links/collect_links.rs @@ -14,12 +14,12 @@ async fn main() -> Result<()> { ]; let links = Collector::new( - None, // base_url - None, false, // don't skip missing inputs + None, // base + false, // don't skip missing inputs 10, // max concurrency ) .collect_links( - inputs, // base_url + inputs, // base url or directory ) .await?; diff --git a/lychee-bin/src/main.rs b/lychee-bin/src/main.rs index 585159f76f..e1493389ab 100644 --- a/lychee-bin/src/main.rs +++ b/lychee-bin/src/main.rs @@ -197,15 +197,10 @@ async fn run(cfg: &Config, inputs: Vec) -> Result { .client() .map_err(|e| anyhow!(e))?; - let links = Collector::new( - cfg.base_url.clone(), - cfg.base_dir.clone(), - cfg.skip_missing, - max_concurrency, - ) - .collect_links(&inputs) - .await - .map_err(|e| anyhow!(e))?; + let links = Collector::new(cfg.base.clone(), cfg.skip_missing, max_concurrency) + .collect_links(&inputs) + .await + .map_err(|e| anyhow!(e))?; let pb = if cfg.no_progress { None diff --git a/lychee-bin/src/options.rs b/lychee-bin/src/options.rs index 07b62d2c23..b97236715c 
100644 --- a/lychee-bin/src/options.rs +++ b/lychee-bin/src/options.rs @@ -1,9 +1,8 @@ -use std::{fs, io::ErrorKind, path::PathBuf, str::FromStr}; +use std::{convert::TryFrom, fs, io::ErrorKind, path::PathBuf, str::FromStr}; use anyhow::{anyhow, Error, Result}; use lazy_static::lazy_static; -use lychee_lib::Input; -use reqwest::Url; +use lychee_lib::{Base, Input}; use serde::Deserialize; use structopt::{clap::crate_version, StructOpt}; @@ -76,6 +75,10 @@ macro_rules! fold_in { }; } +fn parse_base(src: &str) -> Result { + Base::try_from(src) +} + #[derive(Debug, StructOpt)] #[structopt( name = "lychee", @@ -217,15 +220,11 @@ pub(crate) struct Config { #[serde(default = "method")] pub(crate) method: String, - /// Base URL to check relative URLs - #[structopt(long, parse(try_from_str))] - #[serde(default)] - pub(crate) base_dir: Option, - - /// Base URL to check relative URLs - #[structopt(long, parse(try_from_str))] + /// Base URL or website root directory to check relative URLs + /// e.g. https://example.org or `/path/to/public` + #[structopt(short, long, parse(try_from_str = parse_base))] #[serde(default)] - pub(crate) base_url: Option, + pub(crate) base: Option, /// Basic authentication support. E.g. 
`username:password` #[structopt(long)] @@ -310,7 +309,7 @@ impl Config { accept: None; timeout: TIMEOUT; method: METHOD; - base_url: None; + base: None; basic_auth: None; github_token: None; skip_missing: false; diff --git a/lychee-lib/src/collector.rs b/lychee-lib/src/collector.rs index 232ddb069f..712dc090c5 100644 --- a/lychee-lib/src/collector.rs +++ b/lychee-lib/src/collector.rs @@ -1,12 +1,10 @@ -use crate::{extract::extract_links, uri::Uri, Input, Request, Result}; -use reqwest::Url; -use std::{collections::HashSet, path::PathBuf}; +use crate::{extract::extract_links, uri::Uri, Base, Input, Request, Result}; +use std::collections::HashSet; /// Collector keeps the state of link collection #[derive(Debug, Clone)] pub struct Collector { - base_url: Option, - base_dir: Option, + base: Option, skip_missing_inputs: bool, max_concurrency: usize, cache: HashSet, @@ -15,15 +13,9 @@ pub struct Collector { impl Collector { /// Create a new collector with an empty cache #[must_use] - pub fn new( - base_url: Option, - base_dir: Option, - skip_missing_inputs: bool, - max_concurrency: usize, - ) -> Self { + pub fn new(base: Option, skip_missing_inputs: bool, max_concurrency: usize) -> Self { Collector { - base_url, - base_dir, + base, skip_missing_inputs, max_concurrency, cache: HashSet::new(), @@ -31,7 +23,8 @@ impl Collector { } /// Fetch all unique links from a slice of inputs - /// All relative URLs get prefixed with `base_url` if given. + /// All relative URLs get prefixed with `base` if given. + /// (This can be a directory or a base URL) /// /// # Errors /// @@ -58,11 +51,9 @@ impl Collector { while let Some(result) = contents_rx.recv().await { for input_content in result? 
{ - let base_url = self.base_url.clone(); - let base_dir = self.base_dir.clone(); - let handle = tokio::task::spawn_blocking(move || { - extract_links(&input_content, &base_url, &base_dir) - }); + let base = self.base.clone(); + let handle = + tokio::task::spawn_blocking(move || extract_links(&input_content, &base)); extract_links_handles.push(handle); } } @@ -169,7 +160,7 @@ mod test { }, ]; - let responses = Collector::new(None, None, false, 8) + let responses = Collector::new(None, false, 8) .collect_links(&inputs) .await?; let mut links = responses.into_iter().map(|r| r.uri).collect::>(); diff --git a/lychee-lib/src/extract.rs b/lychee-lib/src/extract.rs index 5bfafaeb60..2f65c2625a 100644 --- a/lychee-lib/src/extract.rs +++ b/lychee-lib/src/extract.rs @@ -8,12 +8,11 @@ use linkify::LinkFinder; use log::info; use markup5ever_rcdom::{Handle, NodeData, RcDom}; use pulldown_cmark::{Event as MDEvent, Parser, Tag}; -use url::Url; use crate::{ fs_tree, types::{FileType, InputContent}, - Input, Request, Result, Uri, + Base, Input, Request, Result, Uri, }; // Use LinkFinder here to offload the actual link searching in plaintext. 
@@ -106,8 +105,7 @@ fn extract_links_from_plaintext(input: &str) -> Vec { pub(crate) fn extract_links( input_content: &InputContent, - base_url: &Option, - base_dir: &Option, + base: &Option, ) -> Result> { let links = match input_content.file_type { FileType::Markdown => extract_links_from_markdown(&input_content.content), @@ -121,15 +119,15 @@ pub(crate) fn extract_links( for link in links { if let Ok(uri) = Uri::try_from(link.as_str()) { requests.insert(Request::new(uri, input_content.input.clone())); - } else if let Some(new_url) = base_url.as_ref().and_then(|u| u.join(&link).ok()) { + } else if let Some(new_url) = base.as_ref().and_then(|u| u.join(&link)) { requests.insert(Request::new( Uri { url: new_url }, input_content.input.clone(), )); } else if let Input::FsPath(root) = &input_content.input { - if let Ok(path) = fs_tree::find(&root, &PathBuf::from(&link), base_dir) { + if let Ok(path) = fs_tree::find(&root, &PathBuf::from(&link), base) { let input_content = Input::path_content(path)?; - requests.extend(extract_links(&input_content, base_url, base_dir)?); + requests.extend(extract_links(&input_content, base)?); } else { info!("Cannot find path to {} in filesystem", &link); } @@ -157,11 +155,14 @@ mod test { extract_links, extract_links_from_html, extract_links_from_markdown, extract_links_from_plaintext, find_links, }; - use crate::types::{FileType, InputContent}; use crate::{ test_utils::{mail, website}, Uri, }; + use crate::{ + types::{FileType, InputContent}, + Base, + }; fn load_fixture(filename: &str) -> String { let fixture_path = Path::new(env!("CARGO_MANIFEST_DIR")) @@ -182,16 +183,16 @@ mod test { } fn extract_uris(input: &str, file_type: FileType, base_url: Option<&str>) -> HashSet { - extract_links( - &InputContent::from_string(input, file_type), - &base_url.map(|u| Url::parse(u).unwrap()), - &None, - ) - // unwrap is fine here as this helper function is only used in tests - .unwrap() - .into_iter() - .map(|r| r.uri) - .collect() + let base = 
match base_url { + Some(url) => Some(Base::Remote(Url::parse(url).unwrap())), + None => None, + }; + extract_links(&InputContent::from_string(input, file_type), &base) + // unwrap is fine here as this helper function is only used in tests + .unwrap() + .into_iter() + .map(|r| r.uri) + .collect() } #[test] diff --git a/lychee-lib/src/fs_tree.rs b/lychee-lib/src/fs_tree.rs index 44c9791356..344dd1a665 100644 --- a/lychee-lib/src/fs_tree.rs +++ b/lychee-lib/src/fs_tree.rs @@ -1,7 +1,7 @@ -use crate::{ErrorKind, Result}; +use crate::{Base, ErrorKind, Result}; use std::path::{Path, PathBuf}; -pub(crate) fn find(src: &Path, dst: &Path, base_dir: &Option) -> Result { +pub(crate) fn find(src: &Path, dst: &Path, base: &Option) -> Result { if dst.exists() { return Ok(dst.to_path_buf()); } @@ -11,7 +11,7 @@ pub(crate) fn find(src: &Path, dst: &Path, base_dir: &Option) -> Result if dst.is_absolute() { // Absolute local links (leading slash) require the base_url to // define the document root. - if let Some(base_dir) = base_dir { + if let Some(base_dir) = base.as_ref().and_then(|b| b.dir()) { let absolute = base_dir.join(dst.to_path_buf()); if absolute.exists() { return Ok(absolute); @@ -108,7 +108,10 @@ mod test_fs_tree { File::create(&dst)?; let base_dir = dir.path().to_path_buf(); let dst_absolute = base_dir.join(dst.to_path_buf()); - assert_eq!(find(&dummy, &dst, &Some(base_dir))?, dst_absolute); + assert_eq!( + find(&dummy, &dst, &Some(Base::Local(base_dir)))?, + dst_absolute + ); Ok(()) } diff --git a/lychee-lib/src/lib.rs b/lychee-lib/src/lib.rs index 62257d5dc1..3d57e5d72d 100644 --- a/lychee-lib/src/lib.rs +++ b/lychee-lib/src/lib.rs @@ -78,6 +78,6 @@ pub use crate::{ client_pool::ClientPool, collector::Collector, filter::{Excludes, Filter, Includes}, - types::{ErrorKind, Input, Request, Response, ResponseBody, Result, Status}, + types::{Base, ErrorKind, Input, Request, Response, ResponseBody, Result, Status}, uri::Uri, }; diff --git a/lychee-lib/src/types/base.rs 
b/lychee-lib/src/types/base.rs new file mode 100644 index 0000000000..a3db9b81b8 --- /dev/null +++ b/lychee-lib/src/types/base.rs @@ -0,0 +1,100 @@ +use reqwest::Url; +use serde::{Deserialize, Serialize}; +use std::{convert::TryFrom, path::PathBuf}; + +use crate::ErrorKind; + +/// When encountering links without a full domain in a document, +/// the base determines where this resource can be found. +/// Both, local and remote targets are supported. +#[derive(Debug, PartialEq, Eq, Serialize, Deserialize, Clone)] + +pub enum Base { + /// Local file path pointing to root directory + Local(PathBuf), + /// Remote URL pointing to a website homepage + Remote(Url), +} + +impl Base { + /// Join link with base url + pub fn join(&self, link: &str) -> Option { + match self { + Self::Remote(url) => url.join(link).ok(), + Self::Local(_) => None, + } + } + + /// Return the directory if the base is local + pub fn dir(&self) -> Option { + match self { + Self::Remote(_) => None, + Self::Local(d) => Some(d.to_path_buf()), + } + } +} + +impl TryFrom<&str> for Base { + type Error = ErrorKind; + + fn try_from(value: &str) -> Result { + if let Ok(url) = Url::parse(&value) { + if url.cannot_be_a_base() { + return Err(ErrorKind::InvalidBase( + value.to_string(), + "The given URL cannot be a base".to_string(), + )); + } + return Ok(Self::Remote(url)); + } + // Only accept existing directories as path + let path = PathBuf::from(&value); + if !path.is_dir() { + return Err(ErrorKind::InvalidBase( + value.to_string(), + "The given base path is not a directory".to_string(), + )); + } + if !path.exists() { + return Err(ErrorKind::InvalidBase( + value.to_string(), + "The given base directory does not exist".to_string(), + )); + } + Ok(Self::Local(path)) + } +} + +#[cfg(test)] +mod test_base { + use crate::Result; + + use super::*; + + #[test] + fn test_valid_remote() -> Result<()> { + let base = Base::try_from("https://endler.dev")?; + assert_eq!( + base, + 
Base::Remote(Url::parse("https://endler.dev").unwrap()) + ); + Ok(()) + } + + #[test] + fn test_invalid_url() { + assert!(Base::try_from("data:text/plain,Hello?World#").is_err()); + } + + #[test] + fn test_valid_local() -> Result<()> { + let dir = tempfile::tempdir()?; + Base::try_from(dir.as_ref().to_str().unwrap())?; + Ok(()) + } + + #[test] + fn test_invalid_local() { + assert!(Base::try_from("/asdfasdd20asdfljvvvzzcv/j2ofasd").is_err()); + } +} diff --git a/lychee-lib/src/types/error.rs b/lychee-lib/src/types/error.rs index 89c60c5912..9b187ccb06 100644 --- a/lychee-lib/src/types/error.rs +++ b/lychee-lib/src/types/error.rs @@ -25,6 +25,8 @@ pub enum ErrorKind { /// A possible error when converting a `HeaderValue` from a string or byte /// slice. InvalidHeader(InvalidHeaderValue), + /// The given string can not be parsed into a valid base URL or base directory + InvalidBase(String, String), /// Cannot find local file FileNotFound(PathBuf), /// The given UNIX glob pattern is invalid @@ -71,6 +73,7 @@ impl Hash for ErrorKind { Self::InvalidHeader(e) => e.to_string().hash(state), Self::InvalidGlobPattern(e) => e.to_string().hash(state), Self::MissingGitHubToken => std::mem::discriminant(self).hash(state), + ErrorKind::InvalidBase(base, e) => (base, e).hash(state), } } } @@ -110,6 +113,7 @@ impl Display for ErrorKind { "This URL is available in HTTPS protocol, but HTTP is provided, use '{}' instead", uri ), + Self::InvalidBase(base, e) => write!(f, "Error while base dir `{}` : {}", base, e), } } } diff --git a/lychee-lib/src/types/mod.rs b/lychee-lib/src/types/mod.rs index 552b87fc19..63fec726ce 100644 --- a/lychee-lib/src/types/mod.rs +++ b/lychee-lib/src/types/mod.rs @@ -1,5 +1,6 @@ #![allow(unreachable_pub)] +mod base; mod error; mod file; mod input; @@ -7,6 +8,7 @@ mod request; mod response; mod status; +pub use base::Base; pub use error::ErrorKind; pub use file::FileType; pub use input::{Input, InputContent}; From 887f1b9589ec3cc7ab23d00e74c4986e7033c02e Mon 
Sep 17 00:00:00 2001 From: Matthias Date: Thu, 1 Jul 2021 01:44:12 +0200 Subject: [PATCH 07/46] Split up file checking into file discovery and validation of path exists --- lychee-lib/src/client.rs | 14 ++++- lychee-lib/src/collector.rs | 2 +- lychee-lib/src/extract.rs | 40 ++++++------- lychee-lib/src/filter/mod.rs | 2 +- lychee-lib/src/{fs_tree.rs => fs.rs} | 90 ++++++++++++++++++---------- lychee-lib/src/lib.rs | 6 +- lychee-lib/src/types/error.rs | 21 ++++++- lychee-lib/src/types/mod.rs | 2 + 8 files changed, 114 insertions(+), 63 deletions(-) rename lychee-lib/src/{fs_tree.rs => fs.rs} (67%) diff --git a/lychee-lib/src/client.rs b/lychee-lib/src/client.rs index c1175ba697..a69167c7d2 100644 --- a/lychee-lib/src/client.rs +++ b/lychee-lib/src/client.rs @@ -20,8 +20,7 @@ use typed_builder::TypedBuilder; use crate::{ filter::{Excludes, Filter, Includes}, quirks::Quirks, - uri::Uri, - ErrorKind, Request, Response, Result, Status, + ErrorKind, Request, Response, Result, Status, Uri, }; const DEFAULT_MAX_REDIRECTS: usize = 5; @@ -178,6 +177,8 @@ impl Client { let Request { uri, source } = Request::try_from(request)?; let status = if self.filter.is_excluded(&uri) { Status::Excluded + } else if uri.is_file() { + self.check_file(&uri).await } else if uri.is_mail() { self.check_mail(&uri).await } else { @@ -250,6 +251,15 @@ impl Client { } } + pub async fn check_file(&self, uri: &Uri) -> Status { + if let Ok(path) = uri.inner.to_file_path() { + if path.exists() { + return Status::Ok(StatusCode::OK); + } + } + ErrorKind::InvalidFileUri(uri.clone()).into() + } + pub async fn check_mail(&self, uri: &Uri) -> Status { let input = CheckEmailInput::new(vec![uri.as_str().to_owned()]); let result = &(check_email(&input).await)[0]; diff --git a/lychee-lib/src/collector.rs b/lychee-lib/src/collector.rs index 712dc090c5..b5e69d96ba 100644 --- a/lychee-lib/src/collector.rs +++ b/lychee-lib/src/collector.rs @@ -1,4 +1,4 @@ -use crate::{extract::extract_links, uri::Uri, Base, Input, 
Request, Result}; +use crate::{extract::extract_links, Base, Input, Request, Result, Uri}; use std::collections::HashSet; /// Collector keeps the state of link collection diff --git a/lychee-lib/src/extract.rs b/lychee-lib/src/extract.rs index 2f65c2625a..65b6ee831e 100644 --- a/lychee-lib/src/extract.rs +++ b/lychee-lib/src/extract.rs @@ -8,11 +8,12 @@ use linkify::LinkFinder; use log::info; use markup5ever_rcdom::{Handle, NodeData, RcDom}; use pulldown_cmark::{Event as MDEvent, Parser, Tag}; +use reqwest::Url; use crate::{ - fs_tree, + fs, types::{FileType, InputContent}, - Base, Input, Request, Result, Uri, + Base, ErrorKind, Input, Request, Result, Uri, }; // Use LinkFinder here to offload the actual link searching in plaintext. @@ -113,27 +114,27 @@ pub(crate) fn extract_links( FileType::Plaintext => extract_links_from_plaintext(&input_content.content), }; - // Only keep legit URLs. This sorts out things like anchors. - // Silently ignore the parse failures for now. + // Only keep legit URLs. For example this filters out anchors. 
let mut requests: HashSet = HashSet::new(); for link in links { - if let Ok(uri) = Uri::try_from(link.as_str()) { - requests.insert(Request::new(uri, input_content.input.clone())); + let req = if let Ok(uri) = Uri::try_from(link.as_str()) { + Request::new(uri, input_content.input.clone()) } else if let Some(new_url) = base.as_ref().and_then(|u| u.join(&link)) { - requests.insert(Request::new( - Uri { url: new_url }, - input_content.input.clone(), - )); + Request::new(Uri { inner: new_url }, input_content.input.clone()) } else if let Input::FsPath(root) = &input_content.input { - if let Ok(path) = fs_tree::find(&root, &PathBuf::from(&link), base) { - let input_content = Input::path_content(path)?; - requests.extend(extract_links(&input_content, base)?); - } else { - info!("Cannot find path to {} in filesystem", &link); - } + let link = fs::sanitize(link); + let path = fs::resolve(&root, &PathBuf::from(&link), base)?; + Request::new( + Uri { + inner: Url::from_file_path(&path).map_err(|_e| ErrorKind::InvalidPath(path))?, + }, + input_content.input.clone(), + ) } else { info!("Handling of {} not implemented yet", &link); - } + continue; + }; + requests.insert(req); } Ok(requests) } @@ -151,10 +152,7 @@ mod test { use pretty_assertions::assert_eq; use url::Url; - use super::{ - extract_links, extract_links_from_html, extract_links_from_markdown, - extract_links_from_plaintext, find_links, - }; + use super::*; use crate::{ test_utils::{mail, website}, Uri, diff --git a/lychee-lib/src/filter/mod.rs b/lychee-lib/src/filter/mod.rs index f9daac8bfa..0726aa67d0 100644 --- a/lychee-lib/src/filter/mod.rs +++ b/lychee-lib/src/filter/mod.rs @@ -6,7 +6,7 @@ use std::{collections::HashSet, net::IpAddr}; pub use excludes::Excludes; pub use includes::Includes; -use crate::uri::Uri; +use crate::Uri; /// Pre-defined exclusions for known false-positives static FALSE_POSITIVE_PAT: &[&str] = &[r"http://www.w3.org/1999/xhtml"]; diff --git a/lychee-lib/src/fs_tree.rs b/lychee-lib/src/fs.rs 
similarity index 67% rename from lychee-lib/src/fs_tree.rs rename to lychee-lib/src/fs.rs index 344dd1a665..98255b18ae 100644 --- a/lychee-lib/src/fs_tree.rs +++ b/lychee-lib/src/fs.rs @@ -1,35 +1,49 @@ use crate::{Base, ErrorKind, Result}; use std::path::{Path, PathBuf}; -pub(crate) fn find(src: &Path, dst: &Path, base: &Option) -> Result { - if dst.exists() { - return Ok(dst.to_path_buf()); - } - if dst.is_dir() { - return Err(ErrorKind::FileNotFound(dst.into())); +// Returns the base if it is a valid `PathBuf` +fn get_base_dir(base: &Option) -> Option { + base.as_ref().and_then(|b| b.dir()) +} + +pub(crate) fn resolve(src: &Path, dst: &Path, base: &Option) -> Result { + if dst.is_relative() { + // Find `dst` in the parent directory of `src` + if let Some(parent) = src.parent() { + let rel_path = parent.join(dst.to_path_buf()); + return Ok(rel_path); + } } if dst.is_absolute() { // Absolute local links (leading slash) require the base_url to // define the document root. - if let Some(base_dir) = base.as_ref().and_then(|b| b.dir()) { - let absolute = base_dir.join(dst.to_path_buf()); - if absolute.exists() { - return Ok(absolute); - } - } - } - if dst.is_relative() { - // Find `dst` in the `root` path - if let Some(parent) = src.parent() { - let relative = parent.join(dst.to_path_buf()); - if relative.exists() { - return Ok(relative); - } + if let Some(base_dir) = get_base_dir(base) { + let abs_path = join(base_dir, dst); + return Ok(abs_path); } } Err(ErrorKind::FileNotFound(dst.to_path_buf())) } +// A cumbersome way to concatenate paths without checking their +// existence on disk. See https://github.com/rust-lang/rust/issues/16507 +fn join(base: PathBuf, dst: &Path) -> PathBuf { + let mut abs = base.into_os_string(); + let target_str = dst.as_os_str(); + abs.push(target_str); + PathBuf::from(abs) +} + +/// A little helper function to remove the get parameters from a URL link. +/// The link is not a URL but a String as that link may not have a base domain. 
+pub(crate) fn sanitize(link: String) -> String { + let path = match link.split_once('?') { + Some((path, _params)) => path, + None => link.as_str(), + }; + path.to_string() +} + #[cfg(test)] mod test_fs_tree { use std::fs::File; @@ -37,6 +51,31 @@ mod test_fs_tree { use super::*; use crate::Result; + #[test] + fn test_sanitize() { + assert_eq!(sanitize("/".to_string()), "/".to_string()); + assert_eq!( + sanitize("index.html?foo=bar".to_string()), + "index.html".to_string() + ); + assert_eq!( + sanitize("/index.html?foo=bar".to_string()), + "/index.html".to_string() + ); + assert_eq!( + sanitize("/index.html?foo=bar&baz=zorx?bla=blub".to_string()), + "/index.html".to_string() + ); + assert_eq!( + sanitize("https://example.org/index.html?foo=bar".to_string()), + "https://example.org/index.html".to_string() + ); + assert_eq!( + sanitize("test.png?foo=bar".to_string()), + "test.png".to_string() + ); + } + // dummy root // /path/to/foo.html #[test] @@ -151,15 +190,4 @@ mod test_fs_tree { assert_eq!(find(&root, &dst, &None)?, dst); Ok(()) } - - // /path/to/index.html - // /other/path/to - #[test] - fn test_dst_is_dir() -> Result<()> { - let root = PathBuf::from("/path/to/"); - let dir = tempfile::tempdir()?; - File::create(&dir)?; - assert!(find(&root, &dir.into_path(), &None).is_err()); - Ok(()) - } } diff --git a/lychee-lib/src/lib.rs b/lychee-lib/src/lib.rs index 3d57e5d72d..5af79c6c9a 100644 --- a/lychee-lib/src/lib.rs +++ b/lychee-lib/src/lib.rs @@ -50,10 +50,9 @@ mod client; mod client_pool; /// A pool of clients, to handle concurrent checks pub mod collector; -mod fs_tree; +mod fs; mod quirks; mod types; -mod uri; /// Functionality to extract URIs from inputs pub mod extract; @@ -78,6 +77,5 @@ pub use crate::{ client_pool::ClientPool, collector::Collector, filter::{Excludes, Filter, Includes}, - types::{Base, ErrorKind, Input, Request, Response, ResponseBody, Result, Status}, - uri::Uri, + types::{Base, ErrorKind, Input, Request, Response, ResponseBody, Result, 
Status, Uri}, }; diff --git a/lychee-lib/src/types/error.rs b/lychee-lib/src/types/error.rs index 9b187ccb06..60cafa073d 100644 --- a/lychee-lib/src/types/error.rs +++ b/lychee-lib/src/types/error.rs @@ -10,15 +10,20 @@ use crate::Uri; #[derive(Debug)] #[non_exhaustive] pub enum ErrorKind { - // TODO: maybe need to be splitted; currently first slot is Some only for reading files + // TODO: maybe needs to be split; currently first element is `Some` only for + // reading files /// Any form of I/O error occurred while reading from a given path. IoError(Option, std::io::Error), /// Network error when trying to connect to an endpoint via reqwest. ReqwestError(reqwest::Error), /// Network error when trying to connect to an endpoint via hubcaps. HubcapsError(hubcaps::Error), - /// The given string can not be parsed into a valid URL or e-mail address + /// The given string can not be parsed into a valid URL, e-mail address, or file path UrlParseError(String, (url::ParseError, Option)), + /// The given URI cannot be converted to a file path + InvalidFileUri(Uri), + /// The given path cannot be converted to a URI + InvalidPath(PathBuf), /// The given mail address is unreachable UnreachableEmailAddress(Uri), /// The given header could not be parsed. 
@@ -70,10 +75,12 @@ impl Hash for ErrorKind { Self::FileNotFound(e) => e.to_string_lossy().hash(state), Self::UrlParseError(s, e) => (s, e.type_id()).hash(state), Self::UnreachableEmailAddress(u) | Self::InsecureURL(u) => u.hash(state), + Self::InvalidFileUri(u) => u.hash(state), + Self::InvalidPath(p) => p.hash(state), + Self::UnreachableEmailAddress(u) => u.hash(state), Self::InvalidHeader(e) => e.to_string().hash(state), Self::InvalidGlobPattern(e) => e.to_string().hash(state), Self::MissingGitHubToken => std::mem::discriminant(self).hash(state), - ErrorKind::InvalidBase(base, e) => (base, e).hash(state), } } } @@ -101,6 +108,8 @@ impl Display for ErrorKind { Self::UrlParseError(s, (url_err, None)) => { write!(f, "Cannot parse {} as website url ({})", s, url_err) } + Self::InvalidFileUri(u) => write!(f, "Invalid file URI: {}", u), + Self::InvalidPath(p) => write!(f, "Invalid path: {}", p.display()), Self::UnreachableEmailAddress(uri) => write!(f, "Unreachable mail address: {}", uri), Self::InvalidHeader(e) => e.fmt(f), Self::InvalidGlobPattern(e) => e.fmt(f), @@ -157,6 +166,12 @@ impl From for ErrorKind { } } +impl From for ErrorKind { + fn from(e: url::ParseError) -> Self { + Self::UrlParseError("Cannot parse URL".to_string(), (e, None)) + } +} + impl From<(String, url::ParseError)> for ErrorKind { fn from(value: (String, url::ParseError)) -> Self { Self::UrlParseError(value.0, (value.1, None)) diff --git a/lychee-lib/src/types/mod.rs b/lychee-lib/src/types/mod.rs index 63fec726ce..9453d5ee9f 100644 --- a/lychee-lib/src/types/mod.rs +++ b/lychee-lib/src/types/mod.rs @@ -7,6 +7,7 @@ mod input; mod request; mod response; mod status; +mod uri; pub use base::Base; pub use error::ErrorKind; @@ -15,6 +16,7 @@ pub use input::{Input, InputContent}; pub use request::Request; pub use response::{Response, ResponseBody}; pub use status::Status; +pub use uri::Uri; /// The lychee `Result` type pub type Result = std::result::Result; From 
d51a49db461d51391b0555ab6a78befcc19352a2 Mon Sep 17 00:00:00 2001 From: Matthias Date: Thu, 1 Jul 2021 01:44:21 +0200 Subject: [PATCH 08/46] Move uri to types --- lychee-lib/src/{ => types}/uri.rs | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) rename lychee-lib/src/{ => types}/uri.rs (94%) diff --git a/lychee-lib/src/uri.rs b/lychee-lib/src/types/uri.rs similarity index 94% rename from lychee-lib/src/uri.rs rename to lychee-lib/src/types/uri.rs index a25aad395f..aaf7d0c0fe 100644 --- a/lychee-lib/src/uri.rs +++ b/lychee-lib/src/types/uri.rs @@ -6,14 +6,14 @@ use url::Url; use crate::{ErrorKind, Result}; -/// Lychee's own representation of a URI, which encapsulates all support formats. +/// Lychee's own representation of a URI, which encapsulates all supported formats. /// /// If the scheme is `mailto`, it's a mail address. /// Otherwise it's treated as a website URL. #[derive(Clone, Debug, PartialOrd, Ord, PartialEq, Eq, Hash, Serialize, Deserialize)] pub struct Uri { /// Website URL or mail address - pub(crate) url: Url, + pub(crate) inner: Url, } impl Uri { @@ -24,21 +24,21 @@ impl Uri { #[inline] #[must_use] pub fn as_str(&self) -> &str { - self.url.as_ref().trim_start_matches("mailto:") + self.inner.as_ref().trim_start_matches("mailto:") } #[inline] #[must_use] /// Returns the scheme of the URI (e.g. `http` or `mailto`) pub fn scheme(&self) -> &str { - self.url.scheme() + self.inner.scheme() } #[inline] #[must_use] /// Returns the domain of the URI (e.g. `example.org`) pub fn domain(&self) -> Option<&str> { - self.url.domain() + self.inner.domain() } #[inline] @@ -49,14 +49,14 @@ impl Uri { /// /// Return `None` for cannot-be-a-base URLs. pub fn path_segments(&self) -> Option> { - self.url.path_segments() + self.inner.path_segments() } #[must_use] /// Returns the IP address (either IPv4 or IPv6) of the URI, /// or `None` if it is a domain pub fn host_ip(&self) -> Option { - match self.url.host()? { + match self.inner.host()? 
{ url::Host::Domain(_) => None, url::Host::Ipv4(v4_addr) => Some(v4_addr.into()), url::Host::Ipv6(v6_addr) => Some(v6_addr.into()), @@ -85,6 +85,11 @@ impl Uri { pub(crate) fn is_mail(&self) -> bool { self.scheme() == "mailto" } + + #[inline] + pub(crate) fn is_file(&self) -> bool { + self.scheme() == "file" + } } impl AsRef for Uri { @@ -95,7 +100,7 @@ impl AsRef for Uri { impl From for Uri { fn from(url: Url) -> Self { - Self { url } + Self { inner: url } } } From d924c25669c272b9c16b40c838b902fb5802ec73 Mon Sep 17 00:00:00 2001 From: Matthias Date: Thu, 1 Jul 2021 01:44:52 +0200 Subject: [PATCH 09/46] Non-existing directories are fine for URI base for files --- lychee-lib/src/types/base.rs | 23 ++--------------------- 1 file changed, 2 insertions(+), 21 deletions(-) diff --git a/lychee-lib/src/types/base.rs b/lychee-lib/src/types/base.rs index a3db9b81b8..8b7f2f866a 100644 --- a/lychee-lib/src/types/base.rs +++ b/lychee-lib/src/types/base.rs @@ -8,7 +8,7 @@ use crate::ErrorKind; /// the base determines where this resource can be found. /// Both, local and remote targets are supported. 
#[derive(Debug, PartialEq, Eq, Serialize, Deserialize, Clone)] - +#[allow(variant_size_differences)] pub enum Base { /// Local file path pointing to root directory Local(PathBuf), @@ -47,21 +47,7 @@ impl TryFrom<&str> for Base { } return Ok(Self::Remote(url)); } - // Only accept existing directories as path - let path = PathBuf::from(&value); - if !path.is_dir() { - return Err(ErrorKind::InvalidBase( - value.to_string(), - "The given base path is not a directory".to_string(), - )); - } - if !path.exists() { - return Err(ErrorKind::InvalidBase( - value.to_string(), - "The given base directory does not exist".to_string(), - )); - } - Ok(Self::Local(path)) + Ok(Self::Local(PathBuf::from(value))) } } @@ -92,9 +78,4 @@ mod test_base { Base::try_from(dir.as_ref().to_str().unwrap())?; Ok(()) } - - #[test] - fn test_invalid_local() { - assert!(Base::try_from("/asdfasdd20asdfljvvvzzcv/j2ofasd").is_err()); - } } From f5ee472d930b244ebf67a57f0321b87964accf1e Mon Sep 17 00:00:00 2001 From: Matthias Date: Mon, 5 Jul 2021 01:33:32 +0200 Subject: [PATCH 10/46] explicit naming --- fixtures/TEST_SCHEMES.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fixtures/TEST_SCHEMES.txt b/fixtures/TEST_SCHEMES.txt index 29ab3b53d2..47a061e41e 100644 --- a/fixtures/TEST_SCHEMES.txt +++ b/fixtures/TEST_SCHEMES.txt @@ -1,3 +1,3 @@ slack://channel?id=123 -file://foo/bar +file:///test_folder/test_file https://example.org From ee70e13bf7f4337fd82b3cb94497374522e10d5c Mon Sep 17 00:00:00 2001 From: Matthias Date: Mon, 5 Jul 2021 01:34:35 +0200 Subject: [PATCH 11/46] Check real link to file --- fixtures/TEST.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fixtures/TEST.md b/fixtures/TEST.md index be6d5a0c9b..dc07cc6e0b 100644 --- a/fixtures/TEST.md +++ b/fixtures/TEST.md @@ -1,5 +1,5 @@ -This link should be ignored as it is not a fully qualified URL. 
-![Logo](awesome.png) +Check file link +![Logo](../assets/banner.svg) ![Anchors should be ignored](#awesome) From daa5be4c3ac77ed3385b2afedbbc625ec9a35b89 Mon Sep 17 00:00:00 2001 From: Matthias Date: Mon, 5 Jul 2021 01:35:36 +0200 Subject: [PATCH 12/46] Add/change file link tests --- lychee-bin/tests/cli.rs | 27 +++++++++++++++++---------- lychee-bin/tests/local_files.rs | 5 ++--- lychee-lib/src/client.rs | 17 ++++++++++++++++- 3 files changed, 35 insertions(+), 14 deletions(-) diff --git a/lychee-bin/tests/cli.rs b/lychee-bin/tests/cli.rs index b42f988a0b..6d0819b579 100644 --- a/lychee-bin/tests/cli.rs +++ b/lychee-bin/tests/cli.rs @@ -130,15 +130,22 @@ mod cli { /// Test unsupported URI schemes #[test] - fn test_unsupported_uri_schemes() -> Result<()> { - test_json_output!( - "TEST_SCHEMES.txt", - MockResponseStats { - total: 1, - successful: 1, - ..MockResponseStats::default() - } - ) + fn test_unsupported_uri_schemes() { + let mut cmd = main_command(); + let test_schemes_path = fixtures_path().join("TEST_SCHEMES.txt"); + + // Exclude file link because it doesn't exist on the filesystem. + // (File URIs are absolute paths, which we don't have.) + // Nevertheless, the `file` scheme should be recognized. 
+ cmd.arg(test_schemes_path) + .arg("--exclude") + .arg("file://") + .env_clear() + .assert() + .success() + .stdout(contains("Total............2")) + .stdout(contains("Successful.......1")) + .stdout(contains("Excluded.........1")); } #[test] @@ -364,7 +371,7 @@ mod cli { .assert() .success(); - let expected = r#"{"total":10,"successful":10,"failures":0,"timeouts":0,"redirects":0,"excludes":0,"errors":0,"fail_map":{}}"#; + let expected = r#"{"total":11,"successful":11,"failures":0,"timeouts":0,"redirects":0,"excludes":0,"errors":0,"fail_map":{}}"#; let output = fs::read_to_string(&outfile)?; assert_eq!(output.split_whitespace().collect::(), expected); fs::remove_file(outfile)?; diff --git a/lychee-bin/tests/local_files.rs b/lychee-bin/tests/local_files.rs index ddd0ed25e1..11574e172d 100644 --- a/lychee-bin/tests/local_files.rs +++ b/lychee-bin/tests/local_files.rs @@ -19,8 +19,7 @@ mod cli { writeln!(index, r#"Foo"#)?; let foo_path = dir.path().join("foo.html"); - let mut foo = File::create(&foo_path)?; - writeln!(foo, r#"example"#)?; + File::create(&foo_path)?; let mut cmd = main_command(); cmd.arg(index_path) @@ -30,7 +29,7 @@ mod cli { .assert() .success() .stdout(contains("Total............1")) - .stdout(contains("example.org")); + .stdout(contains("foo.html")); Ok(()) } diff --git a/lychee-lib/src/client.rs b/lychee-lib/src/client.rs index a69167c7d2..e8d477fd7f 100644 --- a/lychee-lib/src/client.rs +++ b/lychee-lib/src/client.rs @@ -287,10 +287,14 @@ where #[cfg(test)] mod test { - use std::time::{Duration, Instant}; + use std::{ + fs::File, + time::{Duration, Instant}, + }; use http::{header::HeaderMap, StatusCode}; use reqwest::header; + use tempfile::tempdir; use super::ClientBuilder; use crate::{mock_server, test_utils::get_mock_client_response}; @@ -375,6 +379,17 @@ mod test { assert!(res.status().is_success()); } + #[tokio::test] + async fn test_file() { + let dir = tempdir().unwrap(); + let file = dir.path().join("temp"); + 
File::create(file).unwrap(); + let uri = format!("file://{}", dir.path().join("temp").to_str().unwrap()); + + let res = get_mock_client_response(uri).await; + assert!(res.status().is_success()); + } + #[tokio::test] async fn test_custom_headers() { // See https://github.com/rust-lang/crates.io/issues/788 From a3fd85d923b7762d9d98687ff1f9b58f44f6b573 Mon Sep 17 00:00:00 2001 From: Matthias Date: Mon, 5 Jul 2021 01:36:43 +0200 Subject: [PATCH 13/46] Exclude anchor links --- lychee-lib/src/extract.rs | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/lychee-lib/src/extract.rs b/lychee-lib/src/extract.rs index 65b6ee831e..039c039425 100644 --- a/lychee-lib/src/extract.rs +++ b/lychee-lib/src/extract.rs @@ -123,13 +123,13 @@ pub(crate) fn extract_links( Request::new(Uri { inner: new_url }, input_content.input.clone()) } else if let Input::FsPath(root) = &input_content.input { let link = fs::sanitize(link); + if link.starts_with("#") { + // Silently ignore anchors for now. 
+ continue; + } let path = fs::resolve(&root, &PathBuf::from(&link), base)?; - Request::new( - Uri { - inner: Url::from_file_path(&path).map_err(|_e| ErrorKind::InvalidPath(path))?, - }, - input_content.input.clone(), - ) + let uri = Url::from_file_path(&path).map_err(|_e| ErrorKind::InvalidPath(path))?; + Request::new(Uri { inner: uri }, input_content.input.clone()) } else { info!("Handling of {} not implemented yet", &link); continue; From 1546d6ee38536570071653753dbe77244a4575c3 Mon Sep 17 00:00:00 2001 From: Matthias Date: Mon, 5 Jul 2021 01:38:14 +0200 Subject: [PATCH 14/46] Normalize path; fix tests --- lychee-lib/src/fs.rs | 167 ++++++++++++++++++++++--------------------- 1 file changed, 84 insertions(+), 83 deletions(-) diff --git a/lychee-lib/src/fs.rs b/lychee-lib/src/fs.rs index 98255b18ae..f7fd6a8680 100644 --- a/lychee-lib/src/fs.rs +++ b/lychee-lib/src/fs.rs @@ -1,26 +1,67 @@ use crate::{Base, ErrorKind, Result}; -use std::path::{Path, PathBuf}; +use std::path::{Component, Path, PathBuf}; // Returns the base if it is a valid `PathBuf` fn get_base_dir(base: &Option) -> Option { base.as_ref().and_then(|b| b.dir()) } +/// Normalize a path, removing things like `.` and `..`. +/// +/// CAUTION: This does not resolve symlinks (unlike +/// [`std::fs::canonicalize`]). This may cause incorrect or surprising +/// behavior at times. This should be used carefully. Unfortunately, +/// [`std::fs::canonicalize`] can be hard to use correctly, since it can often +/// fail, or on Windows returns annoying device paths. This is a problem Cargo +/// needs to improve on. 
+/// +/// Taken from https://github.com/rust-lang/cargo/blob/fede83ccf973457de319ba6fa0e36ead454d2e20/src/cargo/util/paths.rs#L61 +pub(crate) fn normalize(path: &Path) -> PathBuf { + let mut components = path.components().peekable(); + let mut ret = if let Some(c @ Component::Prefix(..)) = components.peek().cloned() { + components.next(); + PathBuf::from(c.as_os_str()) + } else { + PathBuf::new() + }; + + for component in components { + match component { + Component::Prefix(..) => unreachable!(), + Component::RootDir => { + ret.push(component.as_os_str()); + } + Component::CurDir => {} + Component::ParentDir => { + ret.pop(); + } + Component::Normal(c) => { + ret.push(c); + } + } + } + ret +} + pub(crate) fn resolve(src: &Path, dst: &Path, base: &Option) -> Result { if dst.is_relative() { // Find `dst` in the parent directory of `src` if let Some(parent) = src.parent() { let rel_path = parent.join(dst.to_path_buf()); - return Ok(rel_path); + return Ok(normalize(&rel_path)); } } if dst.is_absolute() { // Absolute local links (leading slash) require the base_url to // define the document root. 
- if let Some(base_dir) = get_base_dir(base) { - let abs_path = join(base_dir, dst); - return Ok(abs_path); - } + let base_dir = get_base_dir(base).unwrap_or( + src.to_path_buf() + .parent() + .map(|p| p.to_path_buf()) + .unwrap_or(PathBuf::new()), + ); + let abs_path = join(base_dir, dst); + return Ok(normalize(&abs_path)); } Err(ErrorKind::FileNotFound(dst.to_path_buf())) } @@ -46,8 +87,6 @@ pub(crate) fn sanitize(link: String) -> String { #[cfg(test)] mod test_fs_tree { - use std::fs::File; - use super::*; use crate::Result; @@ -79,115 +118,77 @@ mod test_fs_tree { // dummy root // /path/to/foo.html #[test] - fn test_find_absolute() -> Result<()> { + fn test_resolve_absolute() -> Result<()> { let dummy = PathBuf::new(); - let dir = tempfile::tempdir()?; - let dst = dir.path().join("foo.html"); - File::create(&dst)?; - assert_eq!(find(&dummy, &dst, &None)?, dst); + let abs_path = PathBuf::from("/absolute/path/to/foo.html"); + assert_eq!(resolve(&dummy, &abs_path, &None)?, abs_path); Ok(()) } // index.html // ./foo.html #[test] - fn test_find_relative() -> Result<()> { - let root = PathBuf::from("index.html"); - let dir = tempfile::tempdir()?; - let dst = dir.path().join("./foo.html"); - File::create(&dst)?; - assert_eq!(find(&root, &dst, &None)?, dst); + fn test_resolve_relative() -> Result<()> { + let dummy = PathBuf::from("index.html"); + let abs_path = PathBuf::from("./foo.html"); + assert_eq!( + resolve(&dummy, &abs_path, &None)?, + PathBuf::from("foo.html") + ); Ok(()) } // ./index.html // ./foo.html #[test] - fn test_find_relative_index() -> Result<()> { - let root = PathBuf::from("./index.html"); - let dir = tempfile::tempdir()?; - let dst = dir.path().join("./foo.html"); - File::create(&dst)?; - assert_eq!(find(&root, &dst, &None)?, dst); - Ok(()) - } - - #[test] - fn test_find_relative_nonexistent() -> Result<()> { - let root = PathBuf::from("index.html"); - // This file does not exist - let dst = PathBuf::from("./foo.html"); - assert!(find(&root, &dst, 
&None).is_err()); + fn test_resolve_relative_index() -> Result<()> { + let dummy = PathBuf::from("./index.html"); + let abs_path = PathBuf::from("./foo.html"); + assert_eq!( + resolve(&dummy, &abs_path, &None)?, + PathBuf::from("foo.html") + ); Ok(()) } // /path/to/index.html // ./foo.html #[test] - fn test_find_relative_from_absolute() -> Result<()> { - let dir = tempfile::tempdir()?; - let root = dir.path().join("index.html"); - // We create the absolute path to foo.html, - // but we address it under its relative path - let dst = PathBuf::from("./foo.html"); - let dst_absolute = dir.path().join("./foo.html"); - File::create(&dst_absolute)?; - assert_eq!(find(&root, &dst, &None)?, dst_absolute); + fn test_resolve_from_absolute() -> Result<()> { + let abs_index = PathBuf::from("/path/to/index.html"); + let abs_path = PathBuf::from("./foo.html"); + assert_eq!( + resolve(&abs_index, &abs_path, &None)?, + PathBuf::from("/path/to/foo.html") + ); Ok(()) } // dummy - // ./foo.html + // foo.html // valid base dir #[test] - fn test_find_absolute_from_base_dir() -> Result<()> { + fn test_resolve_absolute_from_base_dir() -> Result<()> { let dummy = PathBuf::new(); - let dir = tempfile::tempdir()?; - let dst = dir.path().join("foo.html"); - File::create(&dst)?; - let base_dir = dir.path().to_path_buf(); - let dst_absolute = base_dir.join(dst.to_path_buf()); + let abs_path = PathBuf::from("/foo.html"); + let base = Some(Base::Local(PathBuf::from("/some/absolute/base/dir"))); assert_eq!( - find(&dummy, &dst, &Some(Base::Local(base_dir)))?, - dst_absolute + resolve(&dummy, &abs_path, &base)?, + PathBuf::from("/some/absolute/base/dir/foo.html") ); Ok(()) } // /path/to/index.html - // ./foo.html (non-existent) - #[test] - fn test_find_relative_from_absolute_nonexistent() -> Result<()> { - let dir = tempfile::tempdir()?; - let root = dir.path().join("index.html"); - // We create the absolute path to foo.html, - // but we address it under its relative path - let dst = 
PathBuf::from("./foo.html"); - assert!(find(&root, &dst, &None).is_err()); - Ok(()) - } - - // /path/to/index.html - // /other/path/to/foo.html - #[test] - fn test_find_absolute_from_absolute() -> Result<()> { - let root = PathBuf::from("/path/to/index.html"); - let dir = tempfile::tempdir()?; - let dst = dir.path().join("foo.html"); - File::create(&dst)?; - assert_eq!(find(&root, &dst, &None)?, dst); - Ok(()) - } - - // /path/to // /other/path/to/foo.html #[test] - fn test_root_is_dir() -> Result<()> { - let root = PathBuf::from("/path/to/"); - let dir = tempfile::tempdir()?; - let dst = dir.path().join("foo.html"); - File::create(&dst)?; - assert_eq!(find(&root, &dst, &None)?, dst); + fn test_resolve_absolute_from_absolute() -> Result<()> { + let abs_index = PathBuf::from("/path/to/index.html"); + let abs_path = PathBuf::from("/other/path/to/foo.html"); + assert_eq!( + resolve(&abs_index, &abs_path, &None)?, + PathBuf::from("/path/to/other/path/to/foo.html") + ); Ok(()) } } From afdb721612f1ec5307bbc0a5eccd186baadb2d44 Mon Sep 17 00:00:00 2001 From: Matthias Date: Mon, 5 Jul 2021 02:03:00 +0200 Subject: [PATCH 15/46] Fix lints --- lychee-lib/src/extract.rs | 4 ++-- lychee-lib/src/fs.rs | 43 +++++++++++++---------------------- lychee-lib/src/types/base.rs | 4 +++- lychee-lib/src/types/error.rs | 1 + 4 files changed, 22 insertions(+), 30 deletions(-) diff --git a/lychee-lib/src/extract.rs b/lychee-lib/src/extract.rs index 039c039425..309e63b4c0 100644 --- a/lychee-lib/src/extract.rs +++ b/lychee-lib/src/extract.rs @@ -122,8 +122,8 @@ pub(crate) fn extract_links( } else if let Some(new_url) = base.as_ref().and_then(|u| u.join(&link)) { Request::new(Uri { inner: new_url }, input_content.input.clone()) } else if let Input::FsPath(root) = &input_content.input { - let link = fs::sanitize(link); - if link.starts_with("#") { + let link = fs::sanitize(&link); + if link.starts_with('#') { // Silently ignore anchors for now. 
continue; } diff --git a/lychee-lib/src/fs.rs b/lychee-lib/src/fs.rs index f7fd6a8680..a5c90fb5c2 100644 --- a/lychee-lib/src/fs.rs +++ b/lychee-lib/src/fs.rs @@ -3,7 +3,7 @@ use std::path::{Component, Path, PathBuf}; // Returns the base if it is a valid `PathBuf` fn get_base_dir(base: &Option) -> Option { - base.as_ref().and_then(|b| b.dir()) + base.as_ref().and_then(Base::dir) } /// Normalize a path, removing things like `.` and `..`. @@ -15,15 +15,14 @@ fn get_base_dir(base: &Option) -> Option { /// fail, or on Windows returns annoying device paths. This is a problem Cargo /// needs to improve on. /// -/// Taken from https://github.com/rust-lang/cargo/blob/fede83ccf973457de319ba6fa0e36ead454d2e20/src/cargo/util/paths.rs#L61 +/// Taken from [`cargo`](https://github.com/rust-lang/cargo/blob/fede83ccf973457de319ba6fa0e36ead454d2e20/src/cargo/util/paths.rs#L61) pub(crate) fn normalize(path: &Path) -> PathBuf { let mut components = path.components().peekable(); - let mut ret = if let Some(c @ Component::Prefix(..)) = components.peek().cloned() { + + let mut ret = components.peek().copied().map_or_else(PathBuf::new, |c| { components.next(); PathBuf::from(c.as_os_str()) - } else { - PathBuf::new() - }; + }); for component in components { match component { @@ -54,12 +53,11 @@ pub(crate) fn resolve(src: &Path, dst: &Path, base: &Option) -> Result PathBuf { /// A little helper function to remove the get parameters from a URL link. /// The link is not a URL but a String as that link may not have a base domain. 
-pub(crate) fn sanitize(link: String) -> String { +pub(crate) fn sanitize(link: &str) -> String { let path = match link.split_once('?') { Some((path, _params)) => path, - None => link.as_str(), + None => link, }; path.to_string() } @@ -92,27 +90,18 @@ mod test_fs_tree { #[test] fn test_sanitize() { - assert_eq!(sanitize("/".to_string()), "/".to_string()); - assert_eq!( - sanitize("index.html?foo=bar".to_string()), - "index.html".to_string() - ); + assert_eq!(sanitize("/"), "/".to_string()); + assert_eq!(sanitize("index.html?foo=bar"), "index.html".to_string()); + assert_eq!(sanitize("/index.html?foo=bar"), "/index.html".to_string()); assert_eq!( - sanitize("/index.html?foo=bar".to_string()), + sanitize("/index.html?foo=bar&baz=zorx?bla=blub"), "/index.html".to_string() ); assert_eq!( - sanitize("/index.html?foo=bar&baz=zorx?bla=blub".to_string()), - "/index.html".to_string() - ); - assert_eq!( - sanitize("https://example.org/index.html?foo=bar".to_string()), + sanitize("https://example.org/index.html?foo=bar"), "https://example.org/index.html".to_string() ); - assert_eq!( - sanitize("test.png?foo=bar".to_string()), - "test.png".to_string() - ); + assert_eq!(sanitize("test.png?foo=bar"), "test.png".to_string()); } // dummy root diff --git a/lychee-lib/src/types/base.rs b/lychee-lib/src/types/base.rs index 8b7f2f866a..f38ec29c87 100644 --- a/lychee-lib/src/types/base.rs +++ b/lychee-lib/src/types/base.rs @@ -18,6 +18,7 @@ pub enum Base { impl Base { /// Join link with base url + #[must_use] pub fn join(&self, link: &str) -> Option { match self { Self::Remote(url) => url.join(link).ok(), @@ -26,10 +27,11 @@ impl Base { } /// Return the directory if the base is local + #[must_use] pub fn dir(&self) -> Option { match self { Self::Remote(_) => None, - Self::Local(d) => Some(d.to_path_buf()), + Self::Local(d) => Some(d.clone()), } } } diff --git a/lychee-lib/src/types/error.rs b/lychee-lib/src/types/error.rs index 60cafa073d..7de4608106 100644 --- 
a/lychee-lib/src/types/error.rs +++ b/lychee-lib/src/types/error.rs @@ -76,6 +76,7 @@ impl Hash for ErrorKind { Self::UrlParseError(s, e) => (s, e.type_id()).hash(state), Self::UnreachableEmailAddress(u) | Self::InsecureURL(u) => u.hash(state), Self::InvalidFileUri(u) => u.hash(state), + Self::InvalidFileUri(f) => f.hash(state), Self::InvalidPath(p) => p.hash(state), Self::UnreachableEmailAddress(u) => u.hash(state), Self::InvalidHeader(e) => e.to_string().hash(state), From 4f9dc67bbd85cdf13927d27d15ea84f582e20ed9 Mon Sep 17 00:00:00 2001 From: Matthias Date: Mon, 5 Jul 2021 11:21:49 +0200 Subject: [PATCH 16/46] fix test --- lychee-lib/src/fs.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lychee-lib/src/fs.rs b/lychee-lib/src/fs.rs index a5c90fb5c2..806166e773 100644 --- a/lychee-lib/src/fs.rs +++ b/lychee-lib/src/fs.rs @@ -122,7 +122,7 @@ mod test_fs_tree { let abs_path = PathBuf::from("./foo.html"); assert_eq!( resolve(&dummy, &abs_path, &None)?, - PathBuf::from("foo.html") + PathBuf::from("./foo.html") ); Ok(()) } From 04bf838f9802a62495e91877bef5394f7581f879 Mon Sep 17 00:00:00 2001 From: Matthias Date: Mon, 5 Jul 2021 11:25:51 +0200 Subject: [PATCH 17/46] lint --- lychee-lib/src/extract.rs | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/lychee-lib/src/extract.rs b/lychee-lib/src/extract.rs index 309e63b4c0..a0aaf9cb35 100644 --- a/lychee-lib/src/extract.rs +++ b/lychee-lib/src/extract.rs @@ -181,10 +181,7 @@ mod test { } fn extract_uris(input: &str, file_type: FileType, base_url: Option<&str>) -> HashSet { - let base = match base_url { - Some(url) => Some(Base::Remote(Url::parse(url).unwrap())), - None => None, - }; + let base = base_url.map(|url| Base::Remote(Url::parse(url).unwrap())); extract_links(&InputContent::from_string(input, file_type), &base) // unwrap is fine here as this helper function is only used in tests .unwrap() From b06afb7252f4760ab7ec2bedd49b74d4d4ffe69f Mon Sep 17 00:00:00 2001 From: Matthias 
Date: Mon, 5 Jul 2021 11:52:47 +0200 Subject: [PATCH 18/46] fix test --- lychee-lib/src/fs.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lychee-lib/src/fs.rs b/lychee-lib/src/fs.rs index 806166e773..a854602fd5 100644 --- a/lychee-lib/src/fs.rs +++ b/lychee-lib/src/fs.rs @@ -135,7 +135,7 @@ mod test_fs_tree { let abs_path = PathBuf::from("./foo.html"); assert_eq!( resolve(&dummy, &abs_path, &None)?, - PathBuf::from("foo.html") + PathBuf::from("./foo.html") ); Ok(()) } From 495f856c612ef2078c2ea24ad9ac620291ceedc6 Mon Sep 17 00:00:00 2001 From: Matthias Date: Mon, 5 Jul 2021 21:41:44 +0200 Subject: [PATCH 19/46] cleanup --- fixtures/TEST_RELATIVE.html | 1 - fixtures/TEST_RELATIVE_2.html | 1 - fixtures/TEST_RELATIVE_3.html | 1 - 3 files changed, 3 deletions(-) delete mode 100644 fixtures/TEST_RELATIVE.html delete mode 100644 fixtures/TEST_RELATIVE_2.html delete mode 100644 fixtures/TEST_RELATIVE_3.html diff --git a/fixtures/TEST_RELATIVE.html b/fixtures/TEST_RELATIVE.html deleted file mode 100644 index be4b0e517c..0000000000 --- a/fixtures/TEST_RELATIVE.html +++ /dev/null @@ -1 +0,0 @@ -Foo \ No newline at end of file diff --git a/fixtures/TEST_RELATIVE_2.html b/fixtures/TEST_RELATIVE_2.html deleted file mode 100644 index 89c3e73ade..0000000000 --- a/fixtures/TEST_RELATIVE_2.html +++ /dev/null @@ -1 +0,0 @@ -Bar \ No newline at end of file diff --git a/fixtures/TEST_RELATIVE_3.html b/fixtures/TEST_RELATIVE_3.html deleted file mode 100644 index a1324d8465..0000000000 --- a/fixtures/TEST_RELATIVE_3.html +++ /dev/null @@ -1 +0,0 @@ -Example link \ No newline at end of file From 5a2e10799f289993573486bad7f125c7bdc172b0 Mon Sep 17 00:00:00 2001 From: Matthias Date: Wed, 1 Sep 2021 19:06:23 +0200 Subject: [PATCH 20/46] linting --- .github/workflows/release.yml | 6 +++--- .github/workflows/rust.yml | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 
bb1b4451c4..c56d92f3c0 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -43,17 +43,17 @@ jobs: fail-fast: false steps: - name: Install musl tools - if: contains(matrix.target, 'musl') + if: ${{ contains(matrix.target, 'musl') }} run: sudo apt-get install -y musl-tools - name: Install arm tools - if: contains(matrix.target, 'arm') + if: ${{ contains(matrix.target, 'arm') }} run: | echo "GNU_PREFIX=arm-linux-gnueabihf-" >> $GITHUB_ENV sudo apt-get install -y binutils-arm-linux-gnueabihf - name: Install aarch64 tools - if: contains(matrix.target, 'aarch64') + if: ${{ contains(matrix.target, 'aarch64') }} run: | echo "GNU_PREFIX=aarch64-linux-gnu-" >> $GITHUB_ENV sudo apt-get install -y binutils-aarch64-linux-gnu diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml index 7404bc17d9..4b7369f8c8 100644 --- a/.github/workflows/rust.yml +++ b/.github/workflows/rust.yml @@ -56,7 +56,7 @@ jobs: - run: cargo-publish-all --dry-run publish: - if: startsWith(github.ref, 'refs/tags/') + if: ${{ startsWith(github.ref, 'refs/tags/') }} needs: - test - lint From dd3205a87cf46382bf5e3aa0f8d7f82e31359d45 Mon Sep 17 00:00:00 2001 From: Matthias Date: Thu, 2 Sep 2021 23:10:46 +0200 Subject: [PATCH 21/46] wip --- lychee-bin/src/main.rs | 9 +- lychee-bin/src/options.rs | 5 ++ lychee-lib/src/extract.rs | 105 ++++++++++------------ lychee-lib/src/helpers/mod.rs | 2 + lychee-lib/src/{fs.rs => helpers/path.rs} | 56 +++++------- lychee-lib/src/helpers/url.rs | 68 ++++++++++++++ lychee-lib/src/lib.rs | 2 +- lychee-lib/src/types/error.rs | 1 + 8 files changed, 154 insertions(+), 94 deletions(-) create mode 100644 lychee-lib/src/helpers/mod.rs rename lychee-lib/src/{fs.rs => helpers/path.rs} (78%) create mode 100644 lychee-lib/src/helpers/url.rs diff --git a/lychee-bin/src/main.rs b/lychee-bin/src/main.rs index e1493389ab..f8ba1bc09a 100644 --- a/lychee-bin/src/main.rs +++ b/lychee-bin/src/main.rs @@ -175,6 +175,13 @@ async fn run(cfg: &Config, 
inputs: Vec) -> Result { let include = RegexSet::new(&cfg.include)?; let exclude = RegexSet::new(&cfg.exclude)?; + // Offline mode overrides the scheme + let schemes = if cfg.offline { + vec!["file".to_string()] + } else { + cfg.scheme.clone() + }; + let client = ClientBuilder::builder() .includes(include) .excludes(exclude) @@ -190,7 +197,7 @@ async fn run(cfg: &Config, inputs: Vec) -> Result { .method(method) .timeout(timeout) .github_token(cfg.github_token.clone()) - .schemes(HashSet::from_iter(cfg.scheme.clone())) + .schemes(HashSet::from_iter(schemes)) .accepted(accepted) .require_https(cfg.require_https) .build() diff --git a/lychee-bin/src/options.rs b/lychee-bin/src/options.rs index b97236715c..a6f66d15a1 100644 --- a/lychee-bin/src/options.rs +++ b/lychee-bin/src/options.rs @@ -158,6 +158,11 @@ pub(crate) struct Config { #[serde(default)] pub(crate) scheme: Vec, + /// Only check local files and block network requests. + #[structopt(long)] + #[serde(default)] + pub(crate) offline: bool, + /// URLs to check (supports regex). Has preference over all excludes. #[structopt(long)] #[serde(default)] diff --git a/lychee-lib/src/extract.rs b/lychee-lib/src/extract.rs index a0aaf9cb35..97d98eeeba 100644 --- a/lychee-lib/src/extract.rs +++ b/lychee-lib/src/extract.rs @@ -4,25 +4,53 @@ use html5ever::{ parse_document, tendril::{StrTendril, TendrilSink}, }; -use linkify::LinkFinder; use log::info; use markup5ever_rcdom::{Handle, NodeData, RcDom}; use pulldown_cmark::{Event as MDEvent, Parser, Tag}; use reqwest::Url; use crate::{ - fs, + helpers::{path, url}, types::{FileType, InputContent}, Base, ErrorKind, Input, Request, Result, Uri, }; -// Use LinkFinder here to offload the actual link searching in plaintext. 
-fn find_links(input: &str) -> Vec { - let finder = LinkFinder::new(); - finder.links(input).collect() +/// Main entrypoint for extracting links from various sources +/// (Markdown, HTML, and plaintext) +pub(crate) fn extract_links( + input_content: &InputContent, + base: &Option, +) -> Result> { + let links = match input_content.file_type { + FileType::Markdown => extract_links_from_markdown(&input_content.content), + FileType::Html => extract_links_from_html(&input_content.content), + FileType::Plaintext => extract_links_from_plaintext(&input_content.content), + }; + + // Only keep legit URLs. For example this filters out anchors. + let mut requests: HashSet = HashSet::new(); + for link in links { + let req = if let Ok(uri) = Uri::try_from(link.as_str()) { + Request::new(uri, input_content.input.clone()) + } else if let Some(new_url) = base.as_ref().and_then(|u| u.join(&link)) { + Request::new(Uri { inner: new_url }, input_content.input.clone()) + } else if let Input::FsPath(root) = &input_content.input { + if url::is_anchor(&link) { + // Silently ignore anchor links for now + continue; + } + let uri = create_uri(root, base, &link)?; + Request::new(Uri { inner: uri }, input_content.input.clone()) + } else { + info!("Handling of {} not implemented yet", &link); + continue; + }; + requests.insert(req); + } + Ok(requests) } -/// Extract unparsed URL strings from a markdown string. +/// Extract unparsed URL strings from a Markdown string. fn extract_links_from_markdown(input: &str) -> Vec { let parser = Parser::new(input); parser @@ -35,15 +63,15 @@ fn extract_links_from_markdown(input: &str) -> Vec { .collect() } -/// Extract unparsed URL strings from a HTML string. +/// Extract unparsed URL strings from an HTML string. 
fn extract_links_from_html(input: &str) -> Vec { let tendril = StrTendril::from(input); let rc_dom = parse_document(RcDom::default(), html5ever::ParseOpts::default()).one(tendril); let mut urls = Vec::new(); - // we pass mutable urls reference to avoid extra allocations in each - // recursive descent + // We pass mutable URL references here to avoid + // extra allocations in each recursive descent walk_html_links(&mut urls, &rc_dom.document); urls @@ -68,7 +96,7 @@ fn walk_html_links(mut urls: &mut Vec, node: &Handle) { for attr in attrs.borrow().iter() { let attr_value = attr.value.to_string(); - if elem_attr_is_link(attr.name.local.as_ref(), name.local.as_ref()) { + if url::elem_attr_is_link(attr.name.local.as_ref(), name.local.as_ref()) { urls.push(attr_value); } else { urls.append(&mut extract_links_from_plaintext(&attr_value)); @@ -80,63 +108,24 @@ fn walk_html_links(mut urls: &mut Vec, node: &Handle) { } // recursively traverse the document's nodes -- this doesn't need any extra - // exit conditions because the document is a tree + // exit conditions, because the document is a tree for child in node.children.borrow().iter() { walk_html_links(&mut urls, child); } } -/// Determine if element's attribute contains a link / URL. -fn elem_attr_is_link(attr_name: &str, elem_name: &str) -> bool { - // See a comprehensive list of attributes that might contain URLs/URIs - // over at: https://developer.mozilla.org/en-US/docs/Web/HTML/Attributes - matches!( - (attr_name, elem_name), - ("href" | "src" | "srcset" | "cite", _) | ("data", "object") | ("onhashchange", "body") - ) -} - -/// Extract unparsed URL strings from a plaintext. 
+/// Extract unparsed URL strings from plaintext fn extract_links_from_plaintext(input: &str) -> Vec { - find_links(input) + url::find_links(input) .iter() .map(|l| String::from(l.as_str())) .collect() } -pub(crate) fn extract_links( - input_content: &InputContent, - base: &Option, -) -> Result> { - let links = match input_content.file_type { - FileType::Markdown => extract_links_from_markdown(&input_content.content), - FileType::Html => extract_links_from_html(&input_content.content), - FileType::Plaintext => extract_links_from_plaintext(&input_content.content), - }; - - // Only keep legit URLs. For example this filters out anchors. - let mut requests: HashSet = HashSet::new(); - for link in links { - let req = if let Ok(uri) = Uri::try_from(link.as_str()) { - Request::new(uri, input_content.input.clone()) - } else if let Some(new_url) = base.as_ref().and_then(|u| u.join(&link)) { - Request::new(Uri { inner: new_url }, input_content.input.clone()) - } else if let Input::FsPath(root) = &input_content.input { - let link = fs::sanitize(&link); - if link.starts_with('#') { - // Silently ignore anchors for now. - continue; - } - let path = fs::resolve(&root, &PathBuf::from(&link), base)?; - let uri = Url::from_file_path(&path).map_err(|_e| ErrorKind::InvalidPath(path))?; - Request::new(Uri { inner: uri }, input_content.input.clone()) - } else { - info!("Handling of {} not implemented yet", &link); - continue; - }; - requests.insert(req); - } - Ok(requests) +fn create_uri(root: &PathBuf, base: &Option, link: &str) -> Result { + let link = url::remove_get_params(&link); + let path = path::resolve(root, &PathBuf::from(&link), base)?; + Ok(Url::from_file_path(&path).map_err(|_e| ErrorKind::InvalidPath(path))?) 
} #[cfg(test)] @@ -150,10 +139,10 @@ mod test { }; use pretty_assertions::assert_eq; - use url::Url; use super::*; use crate::{ + helpers::url::find_links, test_utils::{mail, website}, Uri, }; diff --git a/lychee-lib/src/helpers/mod.rs b/lychee-lib/src/helpers/mod.rs new file mode 100644 index 0000000000..94f2d21cf8 --- /dev/null +++ b/lychee-lib/src/helpers/mod.rs @@ -0,0 +1,2 @@ +pub(crate) mod path; +pub(crate) mod url; diff --git a/lychee-lib/src/fs.rs b/lychee-lib/src/helpers/path.rs similarity index 78% rename from lychee-lib/src/fs.rs rename to lychee-lib/src/helpers/path.rs index a854602fd5..6872b4cc00 100644 --- a/lychee-lib/src/fs.rs +++ b/lychee-lib/src/helpers/path.rs @@ -42,6 +42,18 @@ pub(crate) fn normalize(path: &Path) -> PathBuf { ret } +// Get the parent directory of a given `Path`. +fn dirname(src: &Path) -> PathBuf { + if src.is_file() { + src.to_path_buf() + .parent() + .map_or(PathBuf::new(), Path::to_path_buf) + } else { + src.to_path_buf() + } +} + +// Resolve `dst` that was linked to from within `src` pub(crate) fn resolve(src: &Path, dst: &Path, base: &Option) -> Result { if dst.is_relative() { // Find `dst` in the parent directory of `src` @@ -51,14 +63,16 @@ pub(crate) fn resolve(src: &Path, dst: &Path, base: &Option) -> Result".to_string(), + format!("Found absolute local link {:?} but no base directory was set. Set with `--base`.", dst) + .to_string(), + ) + })?; + let abs_path = join(dirname(&base), dst); return Ok(normalize(&abs_path)); } Err(ErrorKind::FileNotFound(dst.to_path_buf())) @@ -73,37 +87,11 @@ fn join(base: PathBuf, dst: &Path) -> PathBuf { PathBuf::from(abs) } -/// A little helper function to remove the get parameters from a URL link. -/// The link is not a URL but a String as that link may not have a base domain. 
-pub(crate) fn sanitize(link: &str) -> String { - let path = match link.split_once('?') { - Some((path, _params)) => path, - None => link, - }; - path.to_string() -} - #[cfg(test)] -mod test_fs_tree { +mod test_path { use super::*; use crate::Result; - #[test] - fn test_sanitize() { - assert_eq!(sanitize("/"), "/".to_string()); - assert_eq!(sanitize("index.html?foo=bar"), "index.html".to_string()); - assert_eq!(sanitize("/index.html?foo=bar"), "/index.html".to_string()); - assert_eq!( - sanitize("/index.html?foo=bar&baz=zorx?bla=blub"), - "/index.html".to_string() - ); - assert_eq!( - sanitize("https://example.org/index.html?foo=bar"), - "https://example.org/index.html".to_string() - ); - assert_eq!(sanitize("test.png?foo=bar"), "test.png".to_string()); - } - // dummy root // /path/to/foo.html #[test] diff --git a/lychee-lib/src/helpers/url.rs b/lychee-lib/src/helpers/url.rs new file mode 100644 index 0000000000..b00624d48e --- /dev/null +++ b/lychee-lib/src/helpers/url.rs @@ -0,0 +1,68 @@ +use linkify::LinkFinder; + +/// Remove all GET parameters from a URL. +/// The link is not a URL but a String as it may not have a base domain. +pub(crate) fn remove_get_params(url: &str) -> String { + let path = match url.split_once('?') { + Some((path, _params)) => path, + None => url, + }; + path.to_string() +} + +/// Determine if an element's attribute contains a link / URL. 
+pub(crate) fn elem_attr_is_link(attr_name: &str, elem_name: &str) -> bool { + // See a comprehensive list of attributes that might contain URLs/URIs + // over at: https://developer.mozilla.org/en-US/docs/Web/HTML/Attributes + matches!( + (attr_name, elem_name), + ("href" | "src" | "srcset" | "cite", _) | ("data", "object") | ("onhashchange", "body") + ) +} + +// Taken from https://github.com/getzola/zola/blob/master/components/link_checker/src/lib.rs +pub(crate) fn is_anchor(url: &str) -> bool { + url.starts_with('#') +} + +// Use `LinkFinder` to offload the raw link searching in plaintext +pub(crate) fn find_links(input: &str) -> Vec { + let finder = LinkFinder::new(); + finder.links(input).collect() +} + +#[cfg(test)] +mod test_fs_tree { + use super::*; + + #[test] + fn test_is_anchor() { + assert!(is_anchor("#anchor")); + assert!(!is_anchor("notan#anchor")); + } + + #[test] + fn test_remove_get_params() { + assert_eq!(remove_get_params("/"), "/".to_string()); + assert_eq!( + remove_get_params("index.html?foo=bar"), + "index.html".to_string() + ); + assert_eq!( + remove_get_params("/index.html?foo=bar"), + "/index.html".to_string() + ); + assert_eq!( + remove_get_params("/index.html?foo=bar&baz=zorx?bla=blub"), + "/index.html".to_string() + ); + assert_eq!( + remove_get_params("https://example.org/index.html?foo=bar"), + "https://example.org/index.html".to_string() + ); + assert_eq!( + remove_get_params("test.png?foo=bar"), + "test.png".to_string() + ); + } +} diff --git a/lychee-lib/src/lib.rs b/lychee-lib/src/lib.rs index 5af79c6c9a..22b76f8a22 100644 --- a/lychee-lib/src/lib.rs +++ b/lychee-lib/src/lib.rs @@ -50,7 +50,7 @@ mod client; mod client_pool; /// A pool of clients, to handle concurrent checks pub mod collector; -mod fs; +mod helpers; mod quirks; mod types; diff --git a/lychee-lib/src/types/error.rs b/lychee-lib/src/types/error.rs index 7de4608106..1dd68d901b 100644 --- a/lychee-lib/src/types/error.rs +++ b/lychee-lib/src/types/error.rs @@ -124,6 
+124,7 @@ impl Display for ErrorKind { uri ), Self::InvalidBase(base, e) => write!(f, "Error while base dir `{}` : {}", base, e), + Self::InvalidBase(base, e) => write!(f, "Error with base dir `{}` : {}", base, e), } } } From b7c129c43113999fedaa096c7fa31dc2069d0382 Mon Sep 17 00:00:00 2001 From: Matthias Date: Fri, 3 Sep 2021 01:42:57 +0200 Subject: [PATCH 22/46] Fix resolving absolute paths The previous solution didn't resolve to absolute paths and rather removed things like `.` and `..`. --- Cargo.lock | 7 +++++ README.md | 4 +++ lychee-lib/Cargo.toml | 1 + lychee-lib/src/extract.rs | 4 +-- lychee-lib/src/helpers/path.rs | 50 ++++++++++------------------------ 5 files changed, 29 insertions(+), 37 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 1f7194a069..a71f5915ac 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1394,6 +1394,7 @@ dependencies = [ "log", "markup5ever_rcdom", "openssl-sys", + "path-clean", "pretty_assertions", "pulldown-cmark", "regex", @@ -1718,6 +1719,12 @@ dependencies = [ "winapi", ] +[[package]] +name = "path-clean" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ecba01bf2678719532c5e3059e0b5f0811273d94b397088b82e3bd0a78c78fdd" + [[package]] name = "pem" version = "0.8.3" diff --git a/README.md b/README.md index 326a00c22c..e452c5b172 100644 --- a/README.md +++ b/README.md @@ -148,11 +148,15 @@ lychee ~/projects/*/README.md # check links in local files (lychee supports advanced globbing and ~ expansion): lychee "~/projects/big_project/**/README.*" + # ignore case when globbing and check result for each link: lychee --glob-ignore-case --verbose "~/projects/**/[r]eadme.*" # check links from epub file (requires atool: https://www.nongnu.org/atool) acat -F zip {file.epub} "*.xhtml" "*.html" | lychee - + +# check links in directory; block network requests +lychee --offline path/to/directory ``` ### GitHub token diff --git a/lychee-lib/Cargo.toml b/lychee-lib/Cargo.toml index 
ab2c29c303..2b165a0838 100644 --- a/lychee-lib/Cargo.toml +++ b/lychee-lib/Cargo.toml @@ -41,6 +41,7 @@ tokio = { version = "1.6.0", features = ["full"] } typed-builder = "0.9.1" url = { version = "2.2.2", features = ["serde"] } log = "0.4.14" +path-clean = "0.1.0" [dev-dependencies] doc-comment = "0.3.3" diff --git a/lychee-lib/src/extract.rs b/lychee-lib/src/extract.rs index 97d98eeeba..41b62e3278 100644 --- a/lychee-lib/src/extract.rs +++ b/lychee-lib/src/extract.rs @@ -39,7 +39,7 @@ pub(crate) fn extract_links( // Silently ignore anchor links for now continue; } - let uri = create_uri(root, base, &link)?; + let uri = create_uri_from_path(root, base, &link)?; Request::new(Uri { inner: uri }, input_content.input.clone()) } else { info!("Handling of {} not implemented yet", &link); @@ -122,7 +122,7 @@ fn extract_links_from_plaintext(input: &str) -> Vec { .collect() } -fn create_uri(root: &PathBuf, base: &Option, link: &str) -> Result { +fn create_uri_from_path(root: &PathBuf, base: &Option, link: &str) -> Result { let link = url::remove_get_params(&link); let path = path::resolve(root, &PathBuf::from(&link), base)?; Ok(Url::from_file_path(&path).map_err(|_e| ErrorKind::InvalidPath(path))?) diff --git a/lychee-lib/src/helpers/path.rs b/lychee-lib/src/helpers/path.rs index 6872b4cc00..8b875ab1d7 100644 --- a/lychee-lib/src/helpers/path.rs +++ b/lychee-lib/src/helpers/path.rs @@ -1,45 +1,25 @@ use crate::{Base, ErrorKind, Result}; -use std::path::{Component, Path, PathBuf}; +use path_clean::PathClean; +use std::env; +use std::path::{Path, PathBuf}; // Returns the base if it is a valid `PathBuf` fn get_base_dir(base: &Option) -> Option { base.as_ref().and_then(Base::dir) } -/// Normalize a path, removing things like `.` and `..`. -/// -/// CAUTION: This does not resolve symlinks (unlike -/// [`std::fs::canonicalize`]). This may cause incorrect or surprising -/// behavior at times. This should be used carefully. 
Unfortunately, -/// [`std::fs::canonicalize`] can be hard to use correctly, since it can often -/// fail, or on Windows returns annoying device paths. This is a problem Cargo -/// needs to improve on. -/// -/// Taken from [`cargo`](https://github.com/rust-lang/cargo/blob/fede83ccf973457de319ba6fa0e36ead454d2e20/src/cargo/util/paths.rs#L61) -pub(crate) fn normalize(path: &Path) -> PathBuf { - let mut components = path.components().peekable(); +// https://stackoverflow.com/a/54817755/270334 +pub(crate) fn absolute_path(path: impl AsRef) -> Result { + let path = path.as_ref(); - let mut ret = components.peek().copied().map_or_else(PathBuf::new, |c| { - components.next(); - PathBuf::from(c.as_os_str()) - }); - - for component in components { - match component { - Component::Prefix(..) => unreachable!(), - Component::RootDir => { - ret.push(component.as_os_str()); - } - Component::CurDir => {} - Component::ParentDir => { - ret.pop(); - } - Component::Normal(c) => { - ret.push(c); - } - } + let absolute_path = if path.is_absolute() { + path.to_path_buf() + } else { + env::current_dir()?.join(path) } - ret + .clean(); + + Ok(absolute_path) } // Get the parent directory of a given `Path`. 
@@ -59,7 +39,7 @@ pub(crate) fn resolve(src: &Path, dst: &Path, base: &Option) -> Result) -> Result Date: Fri, 3 Sep 2021 01:43:45 +0200 Subject: [PATCH 23/46] Add fixtures for offline testing --- fixtures/offline/about/index.html | 21 +++++++++++++++++++++ fixtures/offline/blog/post1/index.html | 21 +++++++++++++++++++++ fixtures/offline/blog/post2/index.html | 18 ++++++++++++++++++ fixtures/offline/index.html | 18 ++++++++++++++++++ 4 files changed, 78 insertions(+) create mode 100644 fixtures/offline/about/index.html create mode 100644 fixtures/offline/blog/post1/index.html create mode 100644 fixtures/offline/blog/post2/index.html create mode 100644 fixtures/offline/index.html diff --git a/fixtures/offline/about/index.html b/fixtures/offline/about/index.html new file mode 100644 index 0000000000..1121b0bfa2 --- /dev/null +++ b/fixtures/offline/about/index.html @@ -0,0 +1,21 @@ + + + About + + +

About

+

+

+

+ + \ No newline at end of file diff --git a/fixtures/offline/blog/post1/index.html b/fixtures/offline/blog/post1/index.html new file mode 100644 index 0000000000..91129223da --- /dev/null +++ b/fixtures/offline/blog/post1/index.html @@ -0,0 +1,21 @@ + + + Post 2 + + +

Post 2 Title

+

+

+

+ + \ No newline at end of file diff --git a/fixtures/offline/blog/post2/index.html b/fixtures/offline/blog/post2/index.html new file mode 100644 index 0000000000..514ac4eeec --- /dev/null +++ b/fixtures/offline/blog/post2/index.html @@ -0,0 +1,18 @@ + + + Post 1 + + +

Post 1 Title

+

+

+

+ + \ No newline at end of file diff --git a/fixtures/offline/index.html b/fixtures/offline/index.html new file mode 100644 index 0000000000..d0879ff786 --- /dev/null +++ b/fixtures/offline/index.html @@ -0,0 +1,18 @@ + + + Post 2 + + +

Post 2 Title

+

+

+

+ + \ No newline at end of file From 82652a69d5771203408670189042ab9c966bdd62 Mon Sep 17 00:00:00 2001 From: Matthias Date: Fri, 3 Sep 2021 01:48:50 +0200 Subject: [PATCH 24/46] Add test --- lychee-bin/tests/cli.rs | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/lychee-bin/tests/cli.rs b/lychee-bin/tests/cli.rs index 6d0819b579..a0681bf3aa 100644 --- a/lychee-bin/tests/cli.rs +++ b/lychee-bin/tests/cli.rs @@ -148,6 +148,25 @@ mod cli { .stdout(contains("Excluded.........1")); } + #[test] + fn test_resolve_paths() { + let mut cmd = main_command(); + let offline_dir = fixtures_path().join("offline"); + + // Exclude file link because it doesn't exist on the filesystem. + // (File URIs are absolute paths, which we don't have.) + // Nevertheless, the `file` scheme should be recognized. + cmd.arg("--offline") + .arg("--base") + .arg(&offline_dir) + .arg(&offline_dir.join("index.html")) + .env_clear() + .assert() + .success() + .stdout(contains("Total............2")) + .stdout(contains("Successful.......2")); + } + #[test] fn test_quirks() -> Result<()> { test_json_output!( From 87fd90f2fce97577bc50dfecd1e64000f4171b80 Mon Sep 17 00:00:00 2001 From: Matthias Date: Fri, 3 Sep 2021 02:09:30 +0200 Subject: [PATCH 25/46] cargo fmt --- lychee-bin/src/options.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lychee-bin/src/options.rs b/lychee-bin/src/options.rs index a6f66d15a1..c0ef15d2f8 100644 --- a/lychee-bin/src/options.rs +++ b/lychee-bin/src/options.rs @@ -158,7 +158,7 @@ pub(crate) struct Config { #[serde(default)] pub(crate) scheme: Vec, - /// Only check local files and block network requests. + /// Only check local files and block network requests. 
#[structopt(long)] #[serde(default)] pub(crate) offline: bool, From 9163066a6b6e0f492d1f4cc2d2cc0d55e4cdf764 Mon Sep 17 00:00:00 2001 From: Matthias Date: Fri, 3 Sep 2021 21:54:16 +0200 Subject: [PATCH 26/46] Reintegrate master --- README.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index e452c5b172..9c5e7ed7e4 100644 --- a/README.md +++ b/README.md @@ -300,7 +300,8 @@ Try one of these links to get started: - [good first issues](https://github.com/lycheeverse/lychee/issues?q=is%3Aissue+is%3Aopen+label%3A%22good+first+issue%22) - [help wanted](https://github.com/lycheeverse/lychee/issues?q=is%3Aissue+is%3Aopen+label%3A%22help+wanted%22) -Lychee is written in Rust. Install [rust-up](https://rustup.rs/) to get started. Begin my making sure the following commands succeed without errors. +Lychee is written in Rust. Install [rust-up](https://rustup.rs/) to get started. +Begin my making sure the following commands succeed without errors. ```bash cargo test # runs tests From 57af648ec94920e40965ebbd32820d4b49c9e5db Mon Sep 17 00:00:00 2001 From: Matthias Date: Fri, 3 Sep 2021 23:09:28 +0200 Subject: [PATCH 27/46] fix tests after making base dir mandatory --- lychee-lib/src/helpers/path.rs | 19 +++++-------------- 1 file changed, 5 insertions(+), 14 deletions(-) diff --git a/lychee-lib/src/helpers/path.rs b/lychee-lib/src/helpers/path.rs index 8b875ab1d7..fda4660d10 100644 --- a/lychee-lib/src/helpers/path.rs +++ b/lychee-lib/src/helpers/path.rs @@ -72,16 +72,6 @@ mod test_path { use super::*; use crate::Result; - // dummy root - // /path/to/foo.html - #[test] - fn test_resolve_absolute() -> Result<()> { - let dummy = PathBuf::new(); - let abs_path = PathBuf::from("/absolute/path/to/foo.html"); - assert_eq!(resolve(&dummy, &abs_path, &None)?, abs_path); - Ok(()) - } - // index.html // ./foo.html #[test] @@ -90,7 +80,7 @@ mod test_path { let abs_path = PathBuf::from("./foo.html"); assert_eq!( resolve(&dummy, &abs_path, &None)?, 
- PathBuf::from("./foo.html") + env::current_dir()?.join("./foo.html") ); Ok(()) } @@ -103,7 +93,7 @@ mod test_path { let abs_path = PathBuf::from("./foo.html"); assert_eq!( resolve(&dummy, &abs_path, &None)?, - PathBuf::from("./foo.html") + env::current_dir()?.join("./foo.html") ); Ok(()) } @@ -142,9 +132,10 @@ mod test_path { fn test_resolve_absolute_from_absolute() -> Result<()> { let abs_index = PathBuf::from("/path/to/index.html"); let abs_path = PathBuf::from("/other/path/to/foo.html"); + let base = Some(Base::Local(PathBuf::from("/some/absolute/base/dir"))); assert_eq!( - resolve(&abs_index, &abs_path, &None)?, - PathBuf::from("/path/to/other/path/to/foo.html") + resolve(&abs_index, &abs_path, &base)?, + PathBuf::from("/some/absolute/base/dir/other/path/to/foo.html") ); Ok(()) } From b3c5d122e712196d5a777a631068ace205736109 Mon Sep 17 00:00:00 2001 From: Matthias Date: Fri, 3 Sep 2021 23:21:24 +0200 Subject: [PATCH 28/46] Fix clippy lints --- lychee-lib/src/extract.rs | 8 ++++---- lychee-lib/src/helpers/path.rs | 5 ++--- lychee-lib/src/types/base.rs | 2 +- lychee-lib/src/types/input.rs | 2 +- 4 files changed, 8 insertions(+), 9 deletions(-) diff --git a/lychee-lib/src/extract.rs b/lychee-lib/src/extract.rs index 41b62e3278..0a48b335b5 100644 --- a/lychee-lib/src/extract.rs +++ b/lychee-lib/src/extract.rs @@ -1,4 +1,4 @@ -use std::{collections::HashSet, convert::TryFrom, path::PathBuf}; +use std::{collections::HashSet, convert::TryFrom, path::Path, path::PathBuf}; use html5ever::{ parse_document, @@ -122,10 +122,10 @@ fn extract_links_from_plaintext(input: &str) -> Vec { .collect() } -fn create_uri_from_path(root: &PathBuf, base: &Option, link: &str) -> Result { - let link = url::remove_get_params(&link); +fn create_uri_from_path(root: &Path, base: &Option, link: &str) -> Result { + let link = url::remove_get_params(link); let path = path::resolve(root, &PathBuf::from(&link), base)?; - Ok(Url::from_file_path(&path).map_err(|_e| ErrorKind::InvalidPath(path))?) 
+ Url::from_file_path(&path).map_err(|_e| ErrorKind::InvalidPath(path)) } #[cfg(test)] diff --git a/lychee-lib/src/helpers/path.rs b/lychee-lib/src/helpers/path.rs index fda4660d10..87445c3424 100644 --- a/lychee-lib/src/helpers/path.rs +++ b/lychee-lib/src/helpers/path.rs @@ -39,7 +39,7 @@ pub(crate) fn resolve(src: &Path, dst: &Path, base: &Option) -> Result) -> Result".to_string(), format!("Found absolute local link {:?} but no base directory was set. Set with `--base`.", dst) - .to_string(), ) })?; let abs_path = join(dirname(&base), dst); - return Ok(absolute_path(&abs_path)?); + return absolute_path(&abs_path); } Err(ErrorKind::FileNotFound(dst.to_path_buf())) } diff --git a/lychee-lib/src/types/base.rs b/lychee-lib/src/types/base.rs index f38ec29c87..affeacc8ba 100644 --- a/lychee-lib/src/types/base.rs +++ b/lychee-lib/src/types/base.rs @@ -40,7 +40,7 @@ impl TryFrom<&str> for Base { type Error = ErrorKind; fn try_from(value: &str) -> Result { - if let Ok(url) = Url::parse(&value) { + if let Ok(url) = Url::parse(value) { if url.cannot_be_a_base() { return Err(ErrorKind::InvalidBase( value.to_string(), diff --git a/lychee-lib/src/types/input.rs b/lychee-lib/src/types/input.rs index 20a9f2f9e9..ad5ed835c2 100644 --- a/lychee-lib/src/types/input.rs +++ b/lychee-lib/src/types/input.rs @@ -83,7 +83,7 @@ impl Input { pub fn new(value: &str, glob_ignore_case: bool) -> Self { if value == STDIN { Self::Stdin - } else if let Ok(url) = Url::parse(&value) { + } else if let Ok(url) = Url::parse(value) { Self::RemoteUrl(Box::new(url)) } else { // this seems to be the only way to determine if this is a glob pattern From f143087743c3086babae1c7d7dff726a08f79890 Mon Sep 17 00:00:00 2001 From: Matthias Date: Sat, 4 Sep 2021 00:24:39 +0200 Subject: [PATCH 29/46] Relative path not needed --- lychee-lib/src/helpers/path.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lychee-lib/src/helpers/path.rs b/lychee-lib/src/helpers/path.rs index 
87445c3424..b31d522477 100644 --- a/lychee-lib/src/helpers/path.rs +++ b/lychee-lib/src/helpers/path.rs @@ -79,7 +79,7 @@ mod test_path { let abs_path = PathBuf::from("./foo.html"); assert_eq!( resolve(&dummy, &abs_path, &None)?, - env::current_dir()?.join("./foo.html") + env::current_dir()?.join("foo.html") ); Ok(()) } @@ -92,7 +92,7 @@ mod test_path { let abs_path = PathBuf::from("./foo.html"); assert_eq!( resolve(&dummy, &abs_path, &None)?, - env::current_dir()?.join("./foo.html") + env::current_dir()?.join("foo.html") ); Ok(()) } From f47282093a9de71da52bbb66750251e9808d7476 Mon Sep 17 00:00:00 2001 From: Matthias Date: Sat, 4 Sep 2021 00:24:48 +0200 Subject: [PATCH 30/46] String allocation not needed --- lychee-lib/src/helpers/url.rs | 25 ++++++++----------------- 1 file changed, 8 insertions(+), 17 deletions(-) diff --git a/lychee-lib/src/helpers/url.rs b/lychee-lib/src/helpers/url.rs index b00624d48e..fe275f6cc8 100644 --- a/lychee-lib/src/helpers/url.rs +++ b/lychee-lib/src/helpers/url.rs @@ -2,12 +2,12 @@ use linkify::LinkFinder; /// Remove all GET parameters from a URL. /// The link is not a URL but a String as it may not have a base domain. -pub(crate) fn remove_get_params(url: &str) -> String { +pub(crate) fn remove_get_params(url: &str) -> &str { let path = match url.split_once('?') { Some((path, _params)) => path, None => url, }; - path.to_string() + path } /// Determine if an element's attribute contains a link / URL. 
@@ -43,26 +43,17 @@ mod test_fs_tree { #[test] fn test_remove_get_params() { - assert_eq!(remove_get_params("/"), "/".to_string()); - assert_eq!( - remove_get_params("index.html?foo=bar"), - "index.html".to_string() - ); - assert_eq!( - remove_get_params("/index.html?foo=bar"), - "/index.html".to_string() - ); + assert_eq!(remove_get_params("/"), "/"); + assert_eq!(remove_get_params("index.html?foo=bar"), "index.html"); + assert_eq!(remove_get_params("/index.html?foo=bar"), "/index.html"); assert_eq!( remove_get_params("/index.html?foo=bar&baz=zorx?bla=blub"), - "/index.html".to_string() + "/index.html" ); assert_eq!( remove_get_params("https://example.org/index.html?foo=bar"), - "https://example.org/index.html".to_string() - ); - assert_eq!( - remove_get_params("test.png?foo=bar"), - "test.png".to_string() + "https://example.org/index.html" ); + assert_eq!(remove_get_params("test.png?foo=bar"), "test.png"); } } From 00ddb6dfc8df2744b98908362f06384dac933d70 Mon Sep 17 00:00:00 2001 From: Matthias Date: Mon, 6 Sep 2021 00:35:11 +0200 Subject: [PATCH 31/46] Filter out directories with suffixes that look like extensions Directories can still have a suffix which looks like a file extension like `foo.html`. This can lead to unexpected behavior with glob patterns like `**/*.html`. Therefore filter these out. https://github.com/lycheeverse/lychee/pull/262#issuecomment-913226819 --- lychee-lib/src/types/input.rs | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/lychee-lib/src/types/input.rs b/lychee-lib/src/types/input.rs index ad5ed835c2..ad97355dd2 100644 --- a/lychee-lib/src/types/input.rs +++ b/lychee-lib/src/types/input.rs @@ -161,6 +161,14 @@ impl Input { for entry in glob_with(&glob_expanded, match_opts)? { match entry { Ok(path) => { + if path.is_dir() { + // Directories can still have a suffix which looks like + // a file extension like `foo.html`. This can lead to + // unexpected behavior with glob patterns like + // `**/*.html`.
Therefore filter these out. + // https://github.com/lycheeverse/lychee/pull/262#issuecomment-913226819 + continue; + } let content = Self::path_content(&path)?; contents.push(content); } From b2ce61357fda9d7544f564b583f9698be132aa8d Mon Sep 17 00:00:00 2001 From: Matthias Date: Mon, 6 Sep 2021 23:46:31 +0200 Subject: [PATCH 32/46] Fix build errors; cleanup code --- lychee-lib/src/client.rs | 4 ++-- lychee-lib/src/extract.rs | 8 ++++---- lychee-lib/src/types/error.rs | 6 ++---- lychee-lib/src/types/uri.rs | 16 ++++++++-------- 4 files changed, 16 insertions(+), 18 deletions(-) diff --git a/lychee-lib/src/client.rs b/lychee-lib/src/client.rs index e8d477fd7f..8fdb0a1c71 100644 --- a/lychee-lib/src/client.rs +++ b/lychee-lib/src/client.rs @@ -93,7 +93,7 @@ pub struct ClientBuilder { accepted: Option>, /// Response timeout per request timeout: Option, - /// Treat HTTP links as erros when HTTPS is available + /// Treat HTTP links as errors when HTTPS is available require_https: bool, } @@ -252,7 +252,7 @@ impl Client { } pub async fn check_file(&self, uri: &Uri) -> Status { - if let Ok(path) = uri.inner.to_file_path() { + if let Ok(path) = uri.url.to_file_path() { if path.exists() { return Status::Ok(StatusCode::OK); } diff --git a/lychee-lib/src/extract.rs b/lychee-lib/src/extract.rs index 0a48b335b5..672150f26c 100644 --- a/lychee-lib/src/extract.rs +++ b/lychee-lib/src/extract.rs @@ -32,15 +32,15 @@ pub(crate) fn extract_links( for link in links { let req = if let Ok(uri) = Uri::try_from(link.as_str()) { Request::new(uri, input_content.input.clone()) - } else if let Some(new_url) = base.as_ref().and_then(|u| u.join(&link)) { - Request::new(Uri { inner: new_url }, input_content.input.clone()) + } else if let Some(url) = base.as_ref().and_then(|u| u.join(&link)) { + Request::new(Uri { url }, input_content.input.clone()) } else if let Input::FsPath(root) = &input_content.input { if url::is_anchor(&link) { // Silently ignore anchor links for now continue; } - let uri = 
create_uri_from_path(root, base, &link)?; - Request::new(Uri { inner: uri }, input_content.input.clone()) + let url = create_uri_from_path(root, base, &link)?; + Request::new(Uri { url }, input_content.input.clone()) } else { info!("Handling of {} not implemented yet", &link); continue; diff --git a/lychee-lib/src/types/error.rs b/lychee-lib/src/types/error.rs index 1dd68d901b..208e0afea2 100644 --- a/lychee-lib/src/types/error.rs +++ b/lychee-lib/src/types/error.rs @@ -74,11 +74,10 @@ impl Hash for ErrorKind { Self::HubcapsError(e) => e.to_string().hash(state), Self::FileNotFound(e) => e.to_string_lossy().hash(state), Self::UrlParseError(s, e) => (s, e.type_id()).hash(state), - Self::UnreachableEmailAddress(u) | Self::InsecureURL(u) => u.hash(state), Self::InvalidFileUri(u) => u.hash(state), - Self::InvalidFileUri(f) => f.hash(state), Self::InvalidPath(p) => p.hash(state), - Self::UnreachableEmailAddress(u) => u.hash(state), + Self::UnreachableEmailAddress(u) | Self::InsecureURL(u) => u.hash(state), + Self::InvalidBase(base, e) => (base, e).hash(state), Self::InvalidHeader(e) => e.to_string().hash(state), Self::InvalidGlobPattern(e) => e.to_string().hash(state), Self::MissingGitHubToken => std::mem::discriminant(self).hash(state), @@ -123,7 +122,6 @@ impl Display for ErrorKind { "This URL is available in HTTPS protocol, but HTTP is provided, use '{}' instead", uri ), - Self::InvalidBase(base, e) => write!(f, "Error while base dir `{}` : {}", base, e), Self::InvalidBase(base, e) => write!(f, "Error with base dir `{}` : {}", base, e), } } diff --git a/lychee-lib/src/types/uri.rs b/lychee-lib/src/types/uri.rs index aaf7d0c0fe..edb6fc795a 100644 --- a/lychee-lib/src/types/uri.rs +++ b/lychee-lib/src/types/uri.rs @@ -6,14 +6,14 @@ use url::Url; use crate::{ErrorKind, Result}; -/// Lychee's own representation of a URI, which encapsulates all supported formats. +/// Lychee's own representation of a URI, which encapsulates all support formats. 
/// /// If the scheme is `mailto`, it's a mail address. /// Otherwise it's treated as a website URL. #[derive(Clone, Debug, PartialOrd, Ord, PartialEq, Eq, Hash, Serialize, Deserialize)] pub struct Uri { /// Website URL or mail address - pub(crate) inner: Url, + pub(crate) url: Url, } impl Uri { @@ -24,21 +24,21 @@ impl Uri { #[inline] #[must_use] pub fn as_str(&self) -> &str { - self.inner.as_ref().trim_start_matches("mailto:") + self.url.as_ref().trim_start_matches("mailto:") } #[inline] #[must_use] /// Returns the scheme of the URI (e.g. `http` or `mailto`) pub fn scheme(&self) -> &str { - self.inner.scheme() + self.url.scheme() } #[inline] #[must_use] /// Returns the domain of the URI (e.g. `example.org`) pub fn domain(&self) -> Option<&str> { - self.inner.domain() + self.url.domain() } #[inline] @@ -49,14 +49,14 @@ impl Uri { /// /// Return `None` for cannot-be-a-base URLs. pub fn path_segments(&self) -> Option> { - self.inner.path_segments() + self.url.path_segments() } #[must_use] /// Returns the IP address (either IPv4 or IPv6) of the URI, /// or `None` if it is a domain pub fn host_ip(&self) -> Option { - match self.inner.host()? { + match self.url.host()? 
{ url::Host::Domain(_) => None, url::Host::Ipv4(v4_addr) => Some(v4_addr.into()), url::Host::Ipv6(v6_addr) => Some(v6_addr.into()), @@ -100,7 +100,7 @@ impl AsRef for Uri { impl From for Uri { fn from(url: Url) -> Self { - Self { inner: url } + Self { url } } } From 5d0b95271d5693ac8c30155cf4186fa5e5a0aa59 Mon Sep 17 00:00:00 2001 From: Matthias Date: Tue, 7 Sep 2021 00:20:09 +0200 Subject: [PATCH 33/46] Remove anchor from file links --- fixtures/offline/index.html | 3 +++ lychee-lib/src/extract.rs | 2 +- lychee-lib/src/helpers/url.rs | 46 +++++++++++++++++++++++++++-------- 3 files changed, 40 insertions(+), 11 deletions(-) diff --git a/fixtures/offline/index.html b/fixtures/offline/index.html index d0879ff786..b7789c04c8 100644 --- a/fixtures/offline/index.html +++ b/fixtures/offline/index.html @@ -12,6 +12,9 @@

Post 2 Title

  • About
  • +
  • + About +
  • diff --git a/lychee-lib/src/extract.rs b/lychee-lib/src/extract.rs index 672150f26c..9c192bd424 100644 --- a/lychee-lib/src/extract.rs +++ b/lychee-lib/src/extract.rs @@ -123,7 +123,7 @@ fn extract_links_from_plaintext(input: &str) -> Vec { } fn create_uri_from_path(root: &Path, base: &Option, link: &str) -> Result { - let link = url::remove_get_params(link); + let link = url::remove_get_params_and_fragment(link); let path = path::resolve(root, &PathBuf::from(&link), base)?; Url::from_file_path(&path).map_err(|_e| ErrorKind::InvalidPath(path)) } diff --git a/lychee-lib/src/helpers/url.rs b/lychee-lib/src/helpers/url.rs index fe275f6cc8..12b822b817 100644 --- a/lychee-lib/src/helpers/url.rs +++ b/lychee-lib/src/helpers/url.rs @@ -2,11 +2,15 @@ use linkify::LinkFinder; /// Remove all GET parameters from a URL. /// The link is not a URL but a String as it may not have a base domain. -pub(crate) fn remove_get_params(url: &str) -> &str { - let path = match url.split_once('?') { - Some((path, _params)) => path, +pub(crate) fn remove_get_params_and_fragment(url: &str) -> &str { + let path = match url.split_once('#') { + Some((path_without_fragment, _fragment)) => path_without_fragment, None => url, }; + let path = match path.split_once('?') { + Some((path_without_params, _params)) => path_without_params, + None => path, + }; path } @@ -42,18 +46,40 @@ mod test_fs_tree { } #[test] - fn test_remove_get_params() { - assert_eq!(remove_get_params("/"), "/"); - assert_eq!(remove_get_params("index.html?foo=bar"), "index.html"); - assert_eq!(remove_get_params("/index.html?foo=bar"), "/index.html"); + fn test_remove_get_params_and_fragment() { + assert_eq!(remove_get_params_and_fragment("/"), "/"); + assert_eq!( + remove_get_params_and_fragment("index.html?foo=bar"), + "index.html" + ); assert_eq!( - remove_get_params("/index.html?foo=bar&baz=zorx?bla=blub"), + remove_get_params_and_fragment("/index.html?foo=bar"), "/index.html" ); assert_eq!( - 
remove_get_params("https://example.org/index.html?foo=bar"), + remove_get_params_and_fragment("/index.html?foo=bar&baz=zorx?bla=blub"), + "/index.html" + ); + assert_eq!( + remove_get_params_and_fragment("https://example.org/index.html?foo=bar"), "https://example.org/index.html" ); - assert_eq!(remove_get_params("test.png?foo=bar"), "test.png"); + assert_eq!( + remove_get_params_and_fragment("test.png?foo=bar"), + "test.png" + ); + + assert_eq!( + remove_get_params_and_fragment("https://example.org/index.html#anchor"), + "https://example.org/index.html" + ); + assert_eq!( + remove_get_params_and_fragment("https://example.org/index.html?foo=bar#anchor"), + "https://example.org/index.html" + ); + assert_eq!( + remove_get_params_and_fragment("test.png?foo=bar#anchor"), + "test.png" + ); } } From 4827ecf6bd863fcb3458f32264d982b17d9b462b Mon Sep 17 00:00:00 2001 From: Matthias Date: Tue, 7 Sep 2021 00:22:06 +0200 Subject: [PATCH 34/46] Fix clippy warnings --- lychee-lib/src/types/error.rs | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/lychee-lib/src/types/error.rs b/lychee-lib/src/types/error.rs index 208e0afea2..c6b3002c8e 100644 --- a/lychee-lib/src/types/error.rs +++ b/lychee-lib/src/types/error.rs @@ -74,9 +74,10 @@ impl Hash for ErrorKind { Self::HubcapsError(e) => e.to_string().hash(state), Self::FileNotFound(e) => e.to_string_lossy().hash(state), Self::UrlParseError(s, e) => (s, e.type_id()).hash(state), - Self::InvalidFileUri(u) => u.hash(state), Self::InvalidPath(p) => p.hash(state), - Self::UnreachableEmailAddress(u) | Self::InsecureURL(u) => u.hash(state), + Self::InvalidFileUri(u) | Self::UnreachableEmailAddress(u) | Self::InsecureURL(u) => { + u.hash(state); + } Self::InvalidBase(base, e) => (base, e).hash(state), Self::InvalidHeader(e) => e.to_string().hash(state), Self::InvalidGlobPattern(e) => e.to_string().hash(state), From a28f932fb276334cb88cf2c08b0e53fe6fb97109 Mon Sep 17 00:00:00 2001 From: Matthias Date: Tue, 7 Sep 2021 
00:41:07 +0200 Subject: [PATCH 35/46] Fix wildcard test --- lychee-bin/tests/cli.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lychee-bin/tests/cli.rs b/lychee-bin/tests/cli.rs index a0681bf3aa..62abd4ea42 100644 --- a/lychee-bin/tests/cli.rs +++ b/lychee-bin/tests/cli.rs @@ -408,7 +408,7 @@ mod cli { .arg(".*") .assert() .success() - .stdout(contains("Excluded........10")); + .stdout(contains("Excluded........11")); Ok(()) } From 8353ab1071963635f53e7153a0140f48a2cbc258 Mon Sep 17 00:00:00 2001 From: Matthias Date: Tue, 7 Sep 2021 00:53:42 +0200 Subject: [PATCH 36/46] Update docs --- README.md | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/README.md b/README.md index 9c5e7ed7e4..37d1c51a80 100644 --- a/README.md +++ b/README.md @@ -191,6 +191,7 @@ FLAGS: -i, --insecure Proceed for server connections considered insecure (invalid TLS) -n, --no-progress Do not show progress bar. This is recommended for non-interactive shells (e.g. for continuous integration) + --offline Only check local files and block network requests --require-https When HTTPS is available, treat HTTP links as errors --skip-missing Skip missing input files (default is to error if they don't exist) -V, --version Prints version information @@ -218,12 +219,6 @@ OPTIONS: the system -t, --timeout Website timeout from connect to response finished [default: 20] -u, --user-agent User agent [default: lychee/0.7.2] - -ARGS: - ... The inputs (where to get links to check from). These can be: files (e.g. `README.md`), file globs - (e.g. `"~/git/*/README.md"`), remote URLs (e.g. `https://example.org/README.md`) or standard - input (`-`). 
Prefix with `--` to separate inputs from options that allow multiple arguments - [default: README.md] ``` ### Exit codes From 0c5dcf3aa36c2f3b13fd4c24a15be4a419473b82 Mon Sep 17 00:00:00 2001 From: Matthias Date: Tue, 7 Sep 2021 01:05:41 +0200 Subject: [PATCH 37/46] whoops --- README.md | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/README.md b/README.md index 37d1c51a80..99afbd51cc 100644 --- a/README.md +++ b/README.md @@ -219,6 +219,12 @@ OPTIONS: the system -t, --timeout Website timeout from connect to response finished [default: 20] -u, --user-agent User agent [default: lychee/0.7.2] + +ARGS: + ... The inputs (where to get links to check from). These can be: files (e.g. `README.md`), file globs + (e.g. `"~/git/*/README.md"`), remote URLs (e.g. `https://example.org/README.md`) or standard + input (`-`). Prefix with `--` to separate inputs from options that allow multiple arguments + [default: README.md] ``` ### Exit codes From 67268ed59842662c5775d935481c3def974371a2 Mon Sep 17 00:00:00 2001 From: Matthias Date: Tue, 7 Sep 2021 13:02:39 +0200 Subject: [PATCH 38/46] Clean up params and fragment handling --- lychee-lib/src/helpers/url.rs | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/lychee-lib/src/helpers/url.rs b/lychee-lib/src/helpers/url.rs index 12b822b817..3a830f95b4 100644 --- a/lychee-lib/src/helpers/url.rs +++ b/lychee-lib/src/helpers/url.rs @@ -3,13 +3,12 @@ use linkify::LinkFinder; /// Remove all GET parameters from a URL. /// The link is not a URL but a String as it may not have a base domain. 
pub(crate) fn remove_get_params_and_fragment(url: &str) -> &str { - let path = match url.split_once('#') { - Some((path_without_fragment, _fragment)) => path_without_fragment, - None => url, - }; - let path = match path.split_once('?') { + let path = match url.split_once('?') { Some((path_without_params, _params)) => path_without_params, - None => path, + None => match url.split_once('#') { + Some((path_without_fragment, _fragment)) => path_without_fragment, + None => url, + }, }; path } From ffab0343fc2e588b6482f12c06ed1a6891980f9b Mon Sep 17 00:00:00 2001 From: Matthias Date: Wed, 8 Sep 2021 00:29:30 +0200 Subject: [PATCH 39/46] Revert refactor for removing params and fragments The refactored version was not equivalent. It could not handle fragments containing a question mark. See https://github.com/lycheeverse/lychee/pull/262/commits/67268ed59842662c5775d935481c3def974371a2#r703400238 --- lychee-lib/src/helpers/url.rs | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/lychee-lib/src/helpers/url.rs b/lychee-lib/src/helpers/url.rs index 3a830f95b4..712d6f8992 100644 --- a/lychee-lib/src/helpers/url.rs +++ b/lychee-lib/src/helpers/url.rs @@ -3,12 +3,13 @@ use linkify::LinkFinder; /// Remove all GET parameters from a URL. /// The link is not a URL but a String as it may not have a base domain. 
pub(crate) fn remove_get_params_and_fragment(url: &str) -> &str { - let path = match url.split_once('?') { + let path = match url.split_once('#') { + Some((path_without_fragment, _fragment)) => path_without_fragment, + None => url, + }; + let path = match path.split_once('?') { Some((path_without_params, _params)) => path_without_params, - None => match url.split_once('#') { - Some((path_without_fragment, _fragment)) => path_without_fragment, - None => url, - }, + None => path, }; path } @@ -80,5 +81,13 @@ mod test_fs_tree { remove_get_params_and_fragment("test.png?foo=bar#anchor"), "test.png" ); + assert_eq!( + remove_get_params_and_fragment("test.png#anchor?anchor!?"), + "test.png" + ); + assert_eq!( + remove_get_params_and_fragment("test.png?foo=bar#anchor?anchor!"), + "test.png" + ); } } From 24ea2482d34c8fd55ca904d1741a3565b905471f Mon Sep 17 00:00:00 2001 From: Matthias Date: Wed, 8 Sep 2021 01:08:59 +0200 Subject: [PATCH 40/46] Update docs --- lychee-bin/tests/cli.rs | 3 --- lychee-lib/src/types/uri.rs | 2 ++ 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/lychee-bin/tests/cli.rs b/lychee-bin/tests/cli.rs index 62abd4ea42..01d2d98d1e 100644 --- a/lychee-bin/tests/cli.rs +++ b/lychee-bin/tests/cli.rs @@ -153,9 +153,6 @@ mod cli { let mut cmd = main_command(); let offline_dir = fixtures_path().join("offline"); - // Exclude file link because it doesn't exist on the filesystem. - // (File URIs are absolute paths, which we don't have.) - // Nevertheless, the `file` scheme should be recognized. 
cmd.arg("--offline") .arg("--base") .arg(&offline_dir) diff --git a/lychee-lib/src/types/uri.rs b/lychee-lib/src/types/uri.rs index edb6fc795a..6ad126c3d4 100644 --- a/lychee-lib/src/types/uri.rs +++ b/lychee-lib/src/types/uri.rs @@ -82,11 +82,13 @@ impl Uri { } #[inline] + /// Check if the URI is a valid mail address pub(crate) fn is_mail(&self) -> bool { self.scheme() == "mailto" } #[inline] + /// Check if the URI is a file pub(crate) fn is_file(&self) -> bool { self.scheme() == "file" } From a75cae54b10d446ce2dd50c0c0f0d32342619849 Mon Sep 17 00:00:00 2001 From: Matthias Date: Thu, 9 Sep 2021 01:17:56 +0200 Subject: [PATCH 41/46] Add failing test --- fixtures/offline/404.html/.gitkeep | 0 fixtures/offline/another page/.gitkeep | 0 fixtures/offline/index.html | 10 ++++++++-- lychee-bin/tests/cli.rs | 4 ++-- 4 files changed, 10 insertions(+), 4 deletions(-) create mode 100644 fixtures/offline/404.html/.gitkeep create mode 100644 fixtures/offline/another page/.gitkeep diff --git a/fixtures/offline/404.html/.gitkeep b/fixtures/offline/404.html/.gitkeep new file mode 100644 index 0000000000..e69de29bb2 diff --git a/fixtures/offline/another page/.gitkeep b/fixtures/offline/another page/.gitkeep new file mode 100644 index 0000000000..e69de29bb2 diff --git a/fixtures/offline/index.html b/fixtures/offline/index.html index b7789c04c8..8594d3fdb5 100644 --- a/fixtures/offline/index.html +++ b/fixtures/offline/index.html @@ -1,9 +1,9 @@ - Post 2 + Index -

    Post 2 Title

    +

    Index Title

    diff --git a/lychee-bin/tests/cli.rs b/lychee-bin/tests/cli.rs index 01d2d98d1e..4ed0e2630e 100644 --- a/lychee-bin/tests/cli.rs +++ b/lychee-bin/tests/cli.rs @@ -160,8 +160,8 @@ mod cli { .env_clear() .assert() .success() - .stdout(contains("Total............2")) - .stdout(contains("Successful.......2")); + .stdout(contains("Total............4")) + .stdout(contains("Successful.......4")); } #[test] From 93948d73673e29418b0686ec4c4dcf4d089d5be7 Mon Sep 17 00:00:00 2001 From: Matthias Date: Thu, 9 Sep 2021 01:44:10 +0200 Subject: [PATCH 42/46] Avoid double-encoding already encoded destination paths E.g. `web%20site` becomes `web site`. That's because Url::from_file_path will encode the full URL in the end. This behavior cannot be configured. See https://github.com/lycheeverse/lychee/pull/262#issuecomment-915245411 --- Cargo.lock | 1 + lychee-bin/tests/cli.rs | 4 ++-- lychee-lib/Cargo.toml | 1 + lychee-lib/src/client.rs | 2 +- lychee-lib/src/extract.rs | 15 +++++++++++---- lychee-lib/src/types/error.rs | 22 ++++++++++++++++------ 6 files changed, 32 insertions(+), 13 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index e8cbd95b6e..10ba84181b 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1395,6 +1395,7 @@ dependencies = [ "markup5ever_rcdom", "openssl-sys", "path-clean", + "percent-encoding", "pretty_assertions", "pulldown-cmark", "regex", diff --git a/lychee-bin/tests/cli.rs b/lychee-bin/tests/cli.rs index 4ed0e2630e..94b0cef679 100644 --- a/lychee-bin/tests/cli.rs +++ b/lychee-bin/tests/cli.rs @@ -160,8 +160,8 @@ mod cli { .env_clear() .assert() .success() - .stdout(contains("Total............4")) - .stdout(contains("Successful.......4")); + .stdout(contains("Total............3")) + .stdout(contains("Successful.......3")); } #[test] diff --git a/lychee-lib/Cargo.toml b/lychee-lib/Cargo.toml index 2b165a0838..e31bd4a8dd 100644 --- a/lychee-lib/Cargo.toml +++ b/lychee-lib/Cargo.toml @@ -42,6 +42,7 @@ typed-builder = "0.9.1" url = { version = "2.2.2", features = 
["serde"] } log = "0.4.14" path-clean = "0.1.0" +percent-encoding = "2.1.0" [dev-dependencies] doc-comment = "0.3.3" diff --git a/lychee-lib/src/client.rs b/lychee-lib/src/client.rs index 0e43df7095..4c7d8ec856 100644 --- a/lychee-lib/src/client.rs +++ b/lychee-lib/src/client.rs @@ -262,7 +262,7 @@ impl Client { return Status::Ok(StatusCode::OK); } } - ErrorKind::InvalidFileUri(uri.clone()).into() + ErrorKind::InvalidFilePath(uri.clone()).into() } pub async fn check_mail(&self, uri: &Uri) -> Status { diff --git a/lychee-lib/src/extract.rs b/lychee-lib/src/extract.rs index 9c192bd424..d99792efa4 100644 --- a/lychee-lib/src/extract.rs +++ b/lychee-lib/src/extract.rs @@ -6,6 +6,7 @@ use html5ever::{ }; use log::info; use markup5ever_rcdom::{Handle, NodeData, RcDom}; +use percent_encoding::percent_decode_str; use pulldown_cmark::{Event as MDEvent, Parser, Tag}; use reqwest::Url; @@ -122,10 +123,16 @@ fn extract_links_from_plaintext(input: &str) -> Vec { .collect() } -fn create_uri_from_path(root: &Path, base: &Option, link: &str) -> Result { - let link = url::remove_get_params_and_fragment(link); - let path = path::resolve(root, &PathBuf::from(&link), base)?; - Url::from_file_path(&path).map_err(|_e| ErrorKind::InvalidPath(path)) +fn create_uri_from_path(src: &Path, base: &Option, dst: &str) -> Result { + let dst = url::remove_get_params_and_fragment(dst); + // Avoid double-encoding already encoded destination paths by removing any + // potential encoding (e.g. `web%20site` becomes `web site`). + // That's because Url::from_file_path will encode the full URL in the end. + // This behavior cannot be configured. 
+ // See https://github.com/lycheeverse/lychee/pull/262#issuecomment-915245411 + let decoded = percent_decode_str(dst).decode_utf8()?.to_string(); + let path = path::resolve(src, &PathBuf::from(decoded), base)?; + Url::from_file_path(&path).map_err(|_e| ErrorKind::InvalidUrl(path)) } #[cfg(test)] diff --git a/lychee-lib/src/types/error.rs b/lychee-lib/src/types/error.rs index c6b3002c8e..4a761416d3 100644 --- a/lychee-lib/src/types/error.rs +++ b/lychee-lib/src/types/error.rs @@ -14,6 +14,8 @@ pub enum ErrorKind { // reading files /// Any form of I/O error occurred while reading from a given path. IoError(Option, std::io::Error), + /// Errors which can occur when attempting to interpret a sequence of u8 as a string + Utf8Error(std::str::Utf8Error), /// Network error when trying to connect to an endpoint via reqwest. ReqwestError(reqwest::Error), /// Network error when trying to connect to an endpoint via hubcaps. @@ -21,9 +23,9 @@ pub enum ErrorKind { /// The given string can not be parsed into a valid URL, e-mail address, or file path UrlParseError(String, (url::ParseError, Option)), /// The given URI cannot be converted to a file path - InvalidFileUri(Uri), + InvalidFilePath(Uri), /// The given path cannot be converted to a URI - InvalidPath(PathBuf), + InvalidUrl(PathBuf), /// The given mail address is unreachable UnreachableEmailAddress(Uri), /// The given header could not be parsed. 
@@ -74,8 +76,9 @@ impl Hash for ErrorKind { Self::HubcapsError(e) => e.to_string().hash(state), Self::FileNotFound(e) => e.to_string_lossy().hash(state), Self::UrlParseError(s, e) => (s, e.type_id()).hash(state), - Self::InvalidPath(p) => p.hash(state), - Self::InvalidFileUri(u) | Self::UnreachableEmailAddress(u) | Self::InsecureURL(u) => { + Self::InvalidUrl(p) => p.hash(state), + Self::Utf8Error(e) => e.to_string().hash(state), + Self::InvalidFilePath(u) | Self::UnreachableEmailAddress(u) | Self::InsecureURL(u) => { u.hash(state); } Self::InvalidBase(base, e) => (base, e).hash(state), @@ -109,8 +112,8 @@ impl Display for ErrorKind { Self::UrlParseError(s, (url_err, None)) => { write!(f, "Cannot parse {} as website url ({})", s, url_err) } - Self::InvalidFileUri(u) => write!(f, "Invalid file URI: {}", u), - Self::InvalidPath(p) => write!(f, "Invalid path: {}", p.display()), + Self::InvalidFilePath(u) => write!(f, "Invalid file URI: {}", u), + Self::InvalidUrl(p) => write!(f, "Invalid path: {}", p.display()), Self::UnreachableEmailAddress(uri) => write!(f, "Unreachable mail address: {}", uri), Self::InvalidHeader(e) => e.fmt(f), Self::InvalidGlobPattern(e) => e.fmt(f), @@ -124,6 +127,7 @@ impl Display for ErrorKind { uri ), Self::InvalidBase(base, e) => write!(f, "Error with base dir `{}` : {}", base, e), + Self::Utf8Error(e) => e.fmt(f), } } } @@ -143,6 +147,12 @@ impl From<(PathBuf, std::io::Error)> for ErrorKind { } } +impl From for ErrorKind { + fn from(e: std::str::Utf8Error) -> Self { + Self::Utf8Error(e) + } +} + impl From for ErrorKind { fn from(e: std::io::Error) -> Self { Self::IoError(None, e) From a1acf7b0d0e3810c066b57fc58cf2eb6a66046c2 Mon Sep 17 00:00:00 2001 From: Matthias Date: Thu, 9 Sep 2021 01:49:25 +0200 Subject: [PATCH 43/46] Reintegrate master --- Cargo.lock | 34 ++++++++++++++++++++++++++++++---- README.md | 29 +++++++++++++++++++++-------- lychee-bin/Cargo.toml | 2 +- lychee-bin/src/stats.rs | 5 ++++- lychee-bin/tests/cli.rs | 5 ++++- 
lychee-lib/src/types/status.rs | 7 ++++++- 6 files changed, 66 insertions(+), 16 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 10ba84181b..d13478f557 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -55,13 +55,13 @@ checksum = "71938f30533e4d95a6d17aa530939da3842c2ab6f4f84b9dae68447e4129f74a" [[package]] name = "assert_cmd" -version = "1.0.4" +version = "2.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8f57fec1ac7e4de72dcc69811795f1a7172ed06012f80a5d1ee651b62484f588" +checksum = "b800c4403e8105d959595e1f88119e78bc12bc874c4336973658b648a746ba93" dependencies = [ "bstr", "doc-comment", - "predicates", + "predicates 2.0.2", "predicates-core", "predicates-tree", "wait-timeout", @@ -595,6 +595,12 @@ version = "2.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "524cbf6897b527295dff137cec09ecf3a05f4fddffd7dfcd1585403449e74198" +[[package]] +name = "difflib" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6184e33543162437515c2e2b48714794e37845ec9851711914eec9d308f6ebe8" + [[package]] name = "digest" version = "0.9.0" @@ -1237,6 +1243,15 @@ version = "2.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "47be2f14c678be2fdcab04ab1171db51b2762ce6f0a8ee87c8dd4a04ed216135" +[[package]] +name = "itertools" +version = "0.10.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "69ddb889f9d0d08a67338271fa9b62996bc788c7796a5c18cf057420aaed5eaf" +dependencies = [ + "either", +] + [[package]] name = "itoa" version = "0.4.7" @@ -1363,7 +1378,7 @@ dependencies = [ "lychee-lib", "openssl-sys", "pad", - "predicates", + "predicates 1.0.8", "pretty_assertions", "regex", "reqwest", @@ -1857,6 +1872,17 @@ dependencies = [ "regex", ] +[[package]] +name = "predicates" +version = "2.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"c143348f141cc87aab5b950021bac6145d0e5ae754b0591de23244cee42c9308" +dependencies = [ + "difflib", + "itertools", + "predicates-core", +] + [[package]] name = "predicates-core" version = "1.0.2" diff --git a/README.md b/README.md index d36875a8db..587ae39653 100644 --- a/README.md +++ b/README.md @@ -56,6 +56,19 @@ You can download them from the [releases page](https://github.com/lycheeverse/ly ### Cargo +#### Build dependencies + +On APT/dpkg-based Linux distros (e.g. Debian, Ubuntu, Linux Mint and Kali Linux) +the following commands will install all required build dependencies, including +the Rust toolchain and `cargo`: + +```sh +curl -sSf 'https://sh.rustup.rs' | sh +apt install gcc pkg-config libc6-dev libssl-dev +``` + +#### Compile and install lychee + ```sh cargo install lychee ``` @@ -65,16 +78,16 @@ cargo install lychee This comparison is made on a best-effort basis. Please create a PR to fix outdated information. -use + | | lychee | [awesome_bot] | [muffet] | [broken-link-checker] | [linkinator] | [linkchecker] | [markdown-link-check] | [fink] | | -------------------- | ------- | ------------- | -------- | --------------------- | ------------ | ------------- | --------------------- | ------ | | Language | Rust | Ruby | Go | JS | TypeScript | Python | JS | PHP | | Async/Parallel | ![yes] | ![yes] | ![yes] | ![yes] | ![yes] | ![yes] | ![yes] | ![yes] | -| JSON output | ![yes] | ![no] | ![yes] | ![yes] | ![yes] | ![maybe]1 | ![yes] | ![yes] | -| Static binary | ![yes] | ![no] | ![yes] | ![no] | ![no] | ️ ![no] | ![no] | ![no] | -| Markdown files | ![yes] | ![yes] | ![no] | ![no] | ![no] | ![yes] | ️ ![yes] | ![no] | -| HTML files | ![yes] | ![no] | ![no] | ![yes] | ![yes] | ![no] | ![yes] | ![no] | -| Text files | ![yes] | ![no] | ![no] | ![no] | ![no] | ![no] | ![no] | ![no] | +| JSON output | ![yes] | ![no] | ![yes] | ![yes] | ![yes] | ![maybe]1 | ![yes] | ![yes] | +| Static binary | ![yes] | ![no] | ![yes] | ![no] | ![no] | ️![no] | ![no] | ![no] | +| 
Markdown files | ![yes] | ![yes] | ![no] | ![no] | ![no] | ![yes] | ![yes] | ![no] | +| HTML files | ![yes] | ![no] | ![no] | ![yes] | ![yes] | ![no] | ![yes] | ![no] | +| Text files | ![yes] | ![no] | ![no] | ![no] | ![no] | ![no] | ![no] | ![no] | | Website support | ![yes] | ![no] | ![yes] | ![yes] | ![yes] | ![yes] | ![no] | ![yes] | | Chunked encodings | ![yes] | ![maybe] | ![maybe] | ![maybe] | ![maybe] | ![no] | ![yes] | ![yes] | | GZIP compression | ![yes] | ![maybe] | ![maybe] | ![yes] | ![maybe] | ![yes] | ![maybe] | ![no] | @@ -82,7 +95,7 @@ use | Custom user agent | ![yes] | ![no] | ![no] | ![yes] | ![no] | ![yes] | ![no] | ![no] | | Relative URLs | ![yes] | ![yes] | ![no] | ![yes] | ![yes] | ![yes] | ![yes] | ![yes] | | Skip relative URLs | ![yes] | ![no] | ![no] | ![maybe] | ![no] | ![no] | ![no] | ![no] | -| Include patterns | ![yes]️ | ![yes] | ![no] | ![yes] | ![no] | ![no] | ![no] | ![no] | +| Include patterns | ![yes]️ | ![yes] | ![no] | ![yes] | ![no] | ![no] | ![no] | ![no] | | Exclude patterns | ![yes] | ![no] | ![yes] | ![yes] | ![yes] | ![yes] | ![yes] | ![yes] | | Handle redirects | ![yes] | ![yes] | ![yes] | ![yes] | ![yes] | ![yes] | ![yes] | ![yes] | | Ignore insecure SSL | ![yes] | ![yes] | ![yes] | ![no] | ![no] | ![yes] | ![no] | ![yes] | @@ -101,7 +114,7 @@ use | [Use as library] | ![yes] | ![yes] | ![no] | ![yes] | ![yes] | ![no] | ![yes] | ![no] | | Quiet mode | ![yes] | ![no] | ![no] | ![no] | ![yes] | ![yes] | ![yes] | ![yes] | | [Config file] | ![yes] | ![no] | ![no] | ![no] | ![yes] | ![yes] | ![yes] | ![no] | -| Recursion | ![no] | ![no] | ![no] | ![yes] | ![yes] | ![yes] | ![yes] | ![no] | +| Recursion | ![no] | ![no] | ![no] | ![yes] | ![yes] | ![yes] | ![yes] | ![no] | | Amazing lychee logo | ![yes] | ![no] | ![no] | ![no] | ![no] | ![no] | ![no] | ![no] | [awesome_bot]: https://github.com/dkhamsing/awesome_bot diff --git a/lychee-bin/Cargo.toml b/lychee-bin/Cargo.toml index 83aa4bce40..8e78b3859d 100644 --- 
a/lychee-bin/Cargo.toml +++ b/lychee-bin/Cargo.toml @@ -40,7 +40,7 @@ tokio = { version = "1.6.0", features = ["full"] } toml = "0.5.8" [dev-dependencies] -assert_cmd = "1.0.4" +assert_cmd = "2.0.1" predicates = "1.0.8" pretty_assertions = "0.7.2" tempfile = "3.2.0" diff --git a/lychee-bin/src/stats.rs b/lychee-bin/src/stats.rs index 47c1949c4c..4a511d764d 100644 --- a/lychee-bin/src/stats.rs +++ b/lychee-bin/src/stats.rs @@ -16,7 +16,7 @@ pub(crate) fn color_response(response: &ResponseBody) -> String { Status::Ok(_) => style(response).green().bright(), Status::Excluded | Status::Unsupported(_) => style(response).dim(), Status::Redirected(_) => style(response), - Status::Timeout(_) => style(response).yellow().bright(), + Status::UnknownStatusCode(_) | Status::Timeout(_) => style(response).yellow().bright(), Status::Error(_) => style(response).red().bright(), }; out.to_string() @@ -27,6 +27,7 @@ pub(crate) struct ResponseStats { total: usize, successful: usize, failures: usize, + unknown: usize, timeouts: usize, redirects: usize, excludes: usize, @@ -52,6 +53,7 @@ impl ResponseStats { match status { Status::Ok(_) => self.successful += 1, Status::Error(_) => self.failures += 1, + Status::UnknownStatusCode(_) => self.unknown += 1, Status::Timeout(_) => self.timeouts += 1, Status::Redirected(_) => self.redirects += 1, Status::Excluded => self.excludes += 1, @@ -105,6 +107,7 @@ impl Display for ResponseStats { write_stat(f, "\u{23f3} Timeouts", self.timeouts, true)?; // ⏳ write_stat(f, "\u{1f500} Redirected", self.redirects, true)?; // 🔀 write_stat(f, "\u{1f47b} Excluded", self.excludes, true)?; // 👻 + write_stat(f, "\u{26a0} Unknown", self.unknown, true)?; // ⚠️ write_stat(f, "\u{1f6ab} Errors", self.errors + self.failures, false)?; // 🚫 for (input, responses) in &self.fail_map { diff --git a/lychee-bin/tests/cli.rs b/lychee-bin/tests/cli.rs index 94b0cef679..06d60621e2 100644 --- a/lychee-bin/tests/cli.rs +++ b/lychee-bin/tests/cli.rs @@ -40,6 +40,7 @@ mod cli { 
total: usize, successful: usize, failures: usize, + unknown: usize, timeouts: usize, redirects: usize, excludes: usize, @@ -53,6 +54,7 @@ mod cli { "total": {}, "successful": {}, "failures": {}, + "unknown": {}, "timeouts": {}, "redirects": {}, "excludes": {}, @@ -62,6 +64,7 @@ mod cli { self.total, self.successful, self.failures, + self.unknown, self.timeouts, self.redirects, self.excludes, @@ -387,7 +390,7 @@ mod cli { .assert() .success(); - let expected = r#"{"total":11,"successful":11,"failures":0,"timeouts":0,"redirects":0,"excludes":0,"errors":0,"fail_map":{}}"#; + let expected = r#"{"total":11,"successful":11,"failures":0,"unknown":0,"timeouts":0,"redirects":0,"excludes":0,"errors":0,"fail_map":{}}"#; let output = fs::read_to_string(&outfile)?; assert_eq!(output.split_whitespace().collect::(), expected); fs::remove_file(outfile)?; diff --git a/lychee-lib/src/types/status.rs b/lychee-lib/src/types/status.rs index dd3fffd304..c43786d5c5 100644 --- a/lychee-lib/src/types/status.rs +++ b/lychee-lib/src/types/status.rs @@ -10,6 +10,7 @@ const ICON_OK: &str = "\u{2714}"; // ✔ const ICON_REDIRECTED: &str = "\u{21c4}"; // ⇄ const ICON_EXCLUDED: &str = "\u{003f}"; // ? const ICON_UNSUPPORTED: &str = "\u{003f}"; // ? (using same icon, but under different name for explicitness) +const ICON_UNKNOWN: &str = "\u{003f}"; // ? 
const ICON_ERROR: &str = "\u{2717}"; // ✗ const ICON_TIMEOUT: &str = "\u{29d6}"; // ⧖ @@ -25,6 +26,8 @@ pub enum Status { Timeout(Option), /// Got redirected to different resource Redirected(StatusCode), + /// The given status code is not known by lychee + UnknownStatusCode(StatusCode), /// Resource was excluded from checking Excluded, /// The request type is currently not supported, @@ -38,6 +41,7 @@ impl Display for Status { match self { Status::Ok(c) => write!(f, "OK ({})", c), Status::Redirected(c) => write!(f, "Redirect ({})", c), + Status::UnknownStatusCode(c) => write!(f, "Unknown status: {}", c), Status::Excluded => f.write_str("Excluded"), Status::Timeout(Some(c)) => write!(f, "Timeout ({})", c), Status::Timeout(None) => f.write_str("Timeout"), @@ -69,8 +73,8 @@ impl Status { match response.error_for_status_ref() { Ok(_) if code.is_success() => Self::Ok(code), Ok(_) if code.is_redirection() => Self::Redirected(code), + Ok(_) => Self::UnknownStatusCode(code), Err(e) => e.into(), - Ok(_) => unreachable!(), } } } @@ -116,6 +120,7 @@ impl Status { match self { Status::Ok(_) => ICON_OK, Status::Redirected(_) => ICON_REDIRECTED, + Status::UnknownStatusCode(_) => ICON_UNKNOWN, Status::Excluded => ICON_EXCLUDED, Status::Error(_) => ICON_ERROR, Status::Timeout(_) => ICON_TIMEOUT, From 2a4170eade6c30adb900afeb0d6043903e2cd5f1 Mon Sep 17 00:00:00 2001 From: Matthias Date: Thu, 9 Sep 2021 14:42:09 +0200 Subject: [PATCH 44/46] Add test for `+` encoding --- lychee-lib/src/extract.rs | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/lychee-lib/src/extract.rs b/lychee-lib/src/extract.rs index d99792efa4..bbd79a096d 100644 --- a/lychee-lib/src/extract.rs +++ b/lychee-lib/src/extract.rs @@ -158,6 +158,12 @@ mod test { Base, }; + #[test] + fn test_create_uri_from_path() { + let result = create_uri_from_path(&PathBuf::from("/README.md"), &None, "test+encoding").unwrap(); + assert_eq!(result.as_str(), "file:///test+encoding"); + } + fn load_fixture(filename: &str) -> 
String { let fixture_path = Path::new(env!("CARGO_MANIFEST_DIR")) .parent() From d7436575eb988839db3e3191108b47688a2fe282 Mon Sep 17 00:00:00 2001 From: Matthias Date: Thu, 9 Sep 2021 14:43:40 +0200 Subject: [PATCH 45/46] formatting --- lychee-lib/src/extract.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/lychee-lib/src/extract.rs b/lychee-lib/src/extract.rs index bbd79a096d..1bbf05918a 100644 --- a/lychee-lib/src/extract.rs +++ b/lychee-lib/src/extract.rs @@ -160,7 +160,8 @@ mod test { #[test] fn test_create_uri_from_path() { - let result = create_uri_from_path(&PathBuf::from("/README.md"), &None, "test+encoding").unwrap(); + let result = + create_uri_from_path(&PathBuf::from("/README.md"), &None, "test+encoding").unwrap(); assert_eq!(result.as_str(), "file:///test+encoding"); } From de55fbd178ab90da724ea1032553157f7ec0f5dc Mon Sep 17 00:00:00 2001 From: Matthias Date: Thu, 9 Sep 2021 19:31:49 +0200 Subject: [PATCH 46/46] Add TODO for fixing URL encoding for paths --- lychee-lib/src/extract.rs | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/lychee-lib/src/extract.rs b/lychee-lib/src/extract.rs index 1bbf05918a..fcb4bcc008 100644 --- a/lychee-lib/src/extract.rs +++ b/lychee-lib/src/extract.rs @@ -130,6 +130,10 @@ fn create_uri_from_path(src: &Path, base: &Option, dst: &str) -> Result