From 701fbc9adad6a30847c46908cd46896be28a2f38 Mon Sep 17 00:00:00 2001
From: Matthias Endler
Date: Wed, 16 Jun 2021 13:03:36 +0200
Subject: [PATCH 01/46] Add support for local files
---
Cargo.lock | 1 +
fixtures/TEST_RELATIVE.html | 1 +
fixtures/TEST_RELATIVE_2.html | 1 +
fixtures/TEST_RELATIVE_3.html | 1 +
lychee-bin/src/main.rs | 5 +-
lychee-bin/src/options.rs | 2 +-
lychee-bin/tests/local_files.rs | 37 ++++++
lychee-lib/Cargo.toml | 1 +
lychee-lib/src/collector.rs | 220 +-------------------------------
lychee-lib/src/extract.rs | 75 ++++-------
lychee-lib/src/fs_tree.rs | 135 ++++++++++++++++++++
lychee-lib/src/lib.rs | 10 +-
lychee-lib/src/types/error.rs | 4 +
lychee-lib/src/types/file.rs | 37 ++++++
lychee-lib/src/types/input.rs | 203 +++++++++++++++++++++++++++++
lychee-lib/src/types/mod.rs | 4 +
16 files changed, 462 insertions(+), 275 deletions(-)
create mode 100644 fixtures/TEST_RELATIVE.html
create mode 100644 fixtures/TEST_RELATIVE_2.html
create mode 100644 fixtures/TEST_RELATIVE_3.html
create mode 100644 lychee-bin/tests/local_files.rs
create mode 100644 lychee-lib/src/fs_tree.rs
create mode 100644 lychee-lib/src/types/file.rs
create mode 100644 lychee-lib/src/types/input.rs
diff --git a/Cargo.lock b/Cargo.lock
index e6e8efefa1..1f7194a069 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1391,6 +1391,7 @@ dependencies = [
"http",
"hubcaps",
"linkify",
+ "log",
"markup5ever_rcdom",
"openssl-sys",
"pretty_assertions",
diff --git a/fixtures/TEST_RELATIVE.html b/fixtures/TEST_RELATIVE.html
new file mode 100644
index 0000000000..be4b0e517c
--- /dev/null
+++ b/fixtures/TEST_RELATIVE.html
@@ -0,0 +1 @@
+Foo
\ No newline at end of file
diff --git a/fixtures/TEST_RELATIVE_2.html b/fixtures/TEST_RELATIVE_2.html
new file mode 100644
index 0000000000..89c3e73ade
--- /dev/null
+++ b/fixtures/TEST_RELATIVE_2.html
@@ -0,0 +1 @@
+Bar
\ No newline at end of file
diff --git a/fixtures/TEST_RELATIVE_3.html b/fixtures/TEST_RELATIVE_3.html
new file mode 100644
index 0000000000..a1324d8465
--- /dev/null
+++ b/fixtures/TEST_RELATIVE_3.html
@@ -0,0 +1 @@
+Example link
\ No newline at end of file
diff --git a/lychee-bin/src/main.rs b/lychee-bin/src/main.rs
index 3f64f8d761..535fa1a9f1 100644
--- a/lychee-bin/src/main.rs
+++ b/lychee-bin/src/main.rs
@@ -70,10 +70,7 @@ use anyhow::{anyhow, Context, Result};
use headers::{authorization::Basic, Authorization, HeaderMap, HeaderMapExt, HeaderName};
use http::StatusCode;
use indicatif::{ProgressBar, ProgressStyle};
-use lychee_lib::{
- collector::{Collector, Input},
- ClientBuilder, ClientPool, Response,
-};
+use lychee_lib::{ClientBuilder, ClientPool, Collector, Input, Response};
use openssl_sys as _; // required for vendored-openssl feature
use regex::RegexSet;
use ring as _; // required for apple silicon
diff --git a/lychee-bin/src/options.rs b/lychee-bin/src/options.rs
index be67e0b0d7..ab463df66b 100644
--- a/lychee-bin/src/options.rs
+++ b/lychee-bin/src/options.rs
@@ -2,7 +2,7 @@ use std::{fs, io::ErrorKind, path::PathBuf, str::FromStr};
use anyhow::{anyhow, Error, Result};
use lazy_static::lazy_static;
-use lychee_lib::collector::Input;
+use lychee_lib::Input;
use reqwest::Url;
use serde::Deserialize;
use structopt::{clap::crate_version, StructOpt};
diff --git a/lychee-bin/tests/local_files.rs b/lychee-bin/tests/local_files.rs
new file mode 100644
index 0000000000..ddd0ed25e1
--- /dev/null
+++ b/lychee-bin/tests/local_files.rs
@@ -0,0 +1,37 @@
+#[cfg(test)]
+mod cli {
+ use std::{fs::File, io::Write};
+
+ use assert_cmd::Command;
+ use lychee_lib::Result;
+ use predicates::str::contains;
+
+ fn main_command() -> Command {
+ // this gets the "main" binary name (e.g. `lychee`)
+ Command::cargo_bin(env!("CARGO_PKG_NAME")).expect("Couldn't get cargo package name")
+ }
+
+ #[tokio::test]
+ async fn test_local_file() -> Result<()> {
+ let dir = tempfile::tempdir()?;
+ let index_path = dir.path().join("index.html");
+ let mut index = File::create(&index_path)?;
+ writeln!(index, r#"Foo "#)?;
+
+ let foo_path = dir.path().join("foo.html");
+ let mut foo = File::create(&foo_path)?;
+ writeln!(foo, r#"example "#)?;
+
+ let mut cmd = main_command();
+ cmd.arg(index_path)
+ .arg("--no-progress")
+ .arg("--verbose")
+ .env_clear()
+ .assert()
+ .success()
+ .stdout(contains("Total............1"))
+ .stdout(contains("example.org"));
+
+ Ok(())
+ }
+}
diff --git a/lychee-lib/Cargo.toml b/lychee-lib/Cargo.toml
index cc5dd6d670..ab2c29c303 100644
--- a/lychee-lib/Cargo.toml
+++ b/lychee-lib/Cargo.toml
@@ -40,6 +40,7 @@ shellexpand = "2.1.0"
tokio = { version = "1.6.0", features = ["full"] }
typed-builder = "0.9.1"
url = { version = "2.2.2", features = ["serde"] }
+log = "0.4.14"
[dev-dependencies]
doc-comment = "0.3.3"
diff --git a/lychee-lib/src/collector.rs b/lychee-lib/src/collector.rs
index 67fb090253..31416cc991 100644
--- a/lychee-lib/src/collector.rs
+++ b/lychee-lib/src/collector.rs
@@ -1,218 +1,6 @@
-use std::{
- collections::HashSet,
- fmt::Display,
- path::{Path, PathBuf},
-};
-
-use glob::glob_with;
+use crate::{extract::extract_links, uri::Uri, Input, Request, Result};
use reqwest::Url;
-use serde::Serialize;
-use shellexpand::tilde;
-use tokio::{
- fs::read_to_string,
- io::{stdin, AsyncReadExt},
-};
-
-use crate::{
- extract::{extract_links, FileType},
- uri::Uri,
- Request, Result,
-};
-
-const STDIN: &str = "-";
-#[derive(Debug, Clone, PartialEq, Eq, Hash)]
-#[non_exhaustive]
-/// An exhaustive list of input sources, which lychee accepts
-pub enum Input {
- /// URL (of HTTP/HTTPS scheme).
- RemoteUrl(Box),
- /// Unix shell-style glob pattern.
- FsGlob {
- /// The glob pattern matching all input files
- pattern: String,
- /// Don't be case sensitive when matching files against a glob
- ignore_case: bool,
- },
- /// File path.
- FsPath(PathBuf),
- /// Standard Input.
- Stdin,
- /// Raw string input.
- String(String),
-}
-
-impl Serialize for Input {
- fn serialize(&self, serializer: S) -> std::result::Result
- where
- S: serde::Serializer,
- {
- serializer.collect_str(self)
- }
-}
-
-impl Display for Input {
- fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
- f.write_str(match self {
- Input::RemoteUrl(url) => url.as_str(),
- Input::FsGlob { pattern, .. } => pattern,
- Input::FsPath(path) => path.to_str().unwrap_or_default(),
- Input::Stdin => "stdin",
- Input::String(_) => "raw input string",
- })
- }
-}
-
-#[derive(Debug)]
-/// Encapsulates the content for a given input
-pub struct InputContent {
- /// Input source
- pub input: Input,
- /// File type of given input
- pub file_type: FileType,
- /// Raw UTF-8 string content
- pub content: String,
-}
-
-impl InputContent {
- #[must_use]
- /// Create an instance of `InputContent` from an input string
- pub fn from_string(s: &str, file_type: FileType) -> Self {
- // TODO: consider using Cow (to avoid one .clone() for String types)
- Self {
- input: Input::String(s.to_owned()),
- file_type,
- content: s.to_owned(),
- }
- }
-}
-
-impl Input {
- #[must_use]
- /// Construct a new `Input` source. In case the input is a `glob` pattern,
- /// `glob_ignore_case` decides whether matching files against the `glob` is
- /// case-insensitive or not
- pub fn new(value: &str, glob_ignore_case: bool) -> Self {
- if value == STDIN {
- Self::Stdin
- } else if let Ok(url) = Url::parse(value) {
- Self::RemoteUrl(Box::new(url))
- } else {
- // this seems to be the only way to determine if this is a glob pattern
- let is_glob = glob::Pattern::escape(value) != value;
-
- if is_glob {
- Self::FsGlob {
- pattern: value.to_owned(),
- ignore_case: glob_ignore_case,
- }
- } else {
- Self::FsPath(value.into())
- }
- }
- }
-
- #[allow(clippy::missing_panics_doc)]
- /// Retrieve the contents from the input
- ///
- /// # Errors
- ///
- /// Returns an error if the contents can not be retrieved
- /// because of an underlying I/O error (e.g. an error while making a
- /// network request or retrieving the contents from the file system)
- pub async fn get_contents(
- &self,
- file_type_hint: Option,
- skip_missing: bool,
- ) -> Result> {
- match *self {
- // TODO: should skip_missing also affect URLs?
- Input::RemoteUrl(ref url) => Ok(vec![Self::url_contents(url).await?]),
- Input::FsGlob {
- ref pattern,
- ignore_case,
- } => Ok(Self::glob_contents(pattern, ignore_case).await?),
- Input::FsPath(ref path) => {
- let content = Self::path_content(path).await;
- match content {
- Ok(input_content) => Ok(vec![input_content]),
- Err(_) if skip_missing => Ok(vec![]),
- Err(e) => Err(e),
- }
- }
- Input::Stdin => Ok(vec![Self::stdin_content(file_type_hint).await?]),
- Input::String(ref s) => Ok(vec![Self::string_content(s, file_type_hint)]),
- }
- }
-
- async fn url_contents(url: &Url) -> Result {
- // Assume HTML for default paths
- let file_type = if url.path().is_empty() || url.path() == "/" {
- FileType::Html
- } else {
- FileType::from(url.as_str())
- };
-
- let res = reqwest::get(url.clone()).await?;
- let input_content = InputContent {
- input: Input::RemoteUrl(Box::new(url.clone())),
- file_type,
- content: res.text().await?,
- };
-
- Ok(input_content)
- }
-
- async fn glob_contents(path_glob: &str, ignore_case: bool) -> Result> {
- let mut contents = vec![];
- let glob_expanded = tilde(&path_glob);
- let mut match_opts = glob::MatchOptions::new();
-
- match_opts.case_sensitive = !ignore_case;
-
- for entry in glob_with(&glob_expanded, match_opts)? {
- match entry {
- Ok(path) => {
- let content = Self::path_content(&path).await?;
- contents.push(content);
- }
- Err(e) => println!("{:?}", e),
- }
- }
-
- Ok(contents)
- }
-
- async fn path_content + AsRef + Clone>(path: P) -> Result {
- let content = read_to_string(&path)
- .await
- .map_err(|e| (path.clone().into(), e))?;
- let input_content = InputContent {
- file_type: FileType::from(path.as_ref()),
- content,
- input: Input::FsPath(path.into()),
- };
-
- Ok(input_content)
- }
-
- async fn stdin_content(file_type_hint: Option) -> Result {
- let mut content = String::new();
- let mut stdin = stdin();
- stdin.read_to_string(&mut content).await?;
-
- let input_content = InputContent {
- input: Input::Stdin,
- file_type: file_type_hint.unwrap_or_default(),
- content,
- };
-
- Ok(input_content)
- }
-
- fn string_content(s: &str, file_type_hint: Option) -> InputContent {
- InputContent::from_string(s, file_type_hint.unwrap_or_default())
- }
-}
+use std::collections::HashSet;
/// Collector keeps the state of link collection
#[derive(Debug, Clone)]
@@ -278,7 +66,7 @@ impl Collector {
for handle in extract_links_handles {
let new_links = handle.await?;
- links.extend(new_links);
+ links.extend(new_links?);
}
// Filter out already cached links (duplicates)
@@ -304,9 +92,9 @@ mod test {
use super::*;
use crate::{
- extract::FileType,
mock_server,
test_utils::{mail, website},
+ types::{FileType, Input},
Result, Uri,
};
diff --git a/lychee-lib/src/extract.rs b/lychee-lib/src/extract.rs
index 32f24441f2..31c9e3cfab 100644
--- a/lychee-lib/src/extract.rs
+++ b/lychee-lib/src/extract.rs
@@ -1,51 +1,19 @@
-use std::{collections::HashSet, convert::TryFrom, path::Path};
+use std::{collections::HashSet, convert::TryFrom, path::PathBuf};
use html5ever::{
parse_document,
tendril::{StrTendril, TendrilSink},
};
use linkify::LinkFinder;
+use log::info;
use markup5ever_rcdom::{Handle, NodeData, RcDom};
use pulldown_cmark::{Event as MDEvent, Parser, Tag};
use url::Url;
-use crate::{collector::InputContent, Request, Uri};
-
-#[derive(Copy, Clone, Debug, PartialEq, Eq)]
-/// `FileType` defines which file types lychee can handle
-pub enum FileType {
- /// File in HTML format
- Html,
- /// File in Markdown format
- Markdown,
- /// Generic text file without syntax-specific parsing
- Plaintext,
-}
-
-impl Default for FileType {
- fn default() -> Self {
- Self::Plaintext
- }
-}
-
-impl> From for FileType {
- /// Detect if the given path points to a Markdown, HTML, or plaintext file.
- fn from(p: P) -> FileType {
- let path = p.as_ref();
- // Assume HTML in case of no extension.
- // Note: this is only reasonable for URLs; not paths on disk.
- // For example, `README` without an extension is more likely to be a plaintext file.
- // A better solution would be to also implement `From for FileType`.
- // Unfortunately that's not possible without refactoring, as
- // `AsRef` could be implemented for `Url` in the future, which is why
- // `From for FileType` is not allowed.
- match path.extension().and_then(std::ffi::OsStr::to_str) {
- Some("md" | "markdown") => FileType::Markdown,
- Some("htm" | "html") | None => FileType::Html,
- Some(_) => FileType::Plaintext,
- }
- }
-}
+use crate::{
+ types::{FileType, InputContent},
+ Input, Request, Result, Uri,
+};
// Use LinkFinder here to offload the actual link searching in plaintext.
fn find_links(input: &str) -> Vec {
@@ -140,7 +108,7 @@ fn extract_links_from_plaintext(input: &str) -> Vec {
pub(crate) fn extract_links(
input_content: &InputContent,
base_url: &Option,
-) -> HashSet {
+) -> Result> {
let links = match input_content.file_type {
FileType::Markdown => extract_links_from_markdown(&input_content.content),
FileType::Html => extract_links_from_html(&input_content.content),
@@ -153,16 +121,23 @@ pub(crate) fn extract_links(
for link in links {
if let Ok(uri) = Uri::try_from(link.as_str()) {
requests.insert(Request::new(uri, input_content.input.clone()));
- } else if !Path::new(&link).exists() {
- if let Some(new_url) = base_url.as_ref().and_then(|u| u.join(&link).ok()) {
- requests.insert(Request::new(
- Uri { url: new_url },
- input_content.input.clone(),
- ));
+ } else if let Some(new_url) = base_url.as_ref().and_then(|u| u.join(&link).ok()) {
+ requests.insert(Request::new(
+ Uri { url: new_url },
+ input_content.input.clone(),
+ ));
+ } else if let Input::FsPath(root) = &input_content.input {
+ if let Ok(path) = crate::fs_tree::find(&root, &PathBuf::from(&link)) {
+ let input_content = Input::path_content(path)?;
+ requests.extend(extract_links(&input_content, base_url)?);
+ } else {
+ info!("Cannot find path to {} in filesystem", &link);
}
- };
+ } else {
+ info!("Handling of {} not implemented yet", &link);
+ }
}
- requests
+ Ok(requests)
}
#[cfg(test)]
@@ -180,10 +155,10 @@ mod test {
use super::{
extract_links, extract_links_from_html, extract_links_from_markdown,
- extract_links_from_plaintext, find_links, FileType,
+ extract_links_from_plaintext, find_links,
};
+ use crate::types::{FileType, InputContent};
use crate::{
- collector::InputContent,
test_utils::{mail, website},
Uri,
};
@@ -211,6 +186,8 @@ mod test {
&InputContent::from_string(input, file_type),
&base_url.map(|u| Url::parse(u).unwrap()),
)
+ // unwrap is fine here as this helper function is only used in tests
+ .unwrap()
.into_iter()
.map(|r| r.uri)
.collect()
diff --git a/lychee-lib/src/fs_tree.rs b/lychee-lib/src/fs_tree.rs
new file mode 100644
index 0000000000..a1d9bd40d6
--- /dev/null
+++ b/lychee-lib/src/fs_tree.rs
@@ -0,0 +1,135 @@
+use crate::{ErrorKind, Result};
+use std::path::PathBuf;
+
+pub(crate) fn find(root: &PathBuf, dst: &PathBuf) -> Result {
+ if dst.exists() {
+ return Ok(dst.clone());
+ }
+ if dst.is_dir() {
+ return Err(ErrorKind::FileNotFound(dst.clone()));
+ }
+ // Find `dst` in the `root` path
+ if let Some(parent) = root.parent() {
+ let rel = parent.join(dst);
+ if rel.exists() {
+ return Ok(rel);
+ }
+ }
+ return Err(ErrorKind::FileNotFound(dst.clone()));
+}
+
+#[cfg(test)]
+mod test_fs_tree {
+ use std::fs::File;
+
+ use super::*;
+ use crate::Result;
+
+ // dummy root
+ // /path/to/foo.html
+ #[test]
+ fn test_find_absolute() -> Result<()> {
+ let dummy = PathBuf::new();
+ let dir = tempfile::tempdir()?;
+ let dst = dir.path().join("foo.html");
+ File::create(&dst)?;
+ assert_eq!(find(&dummy, &dst)?, dst);
+ Ok(())
+ }
+
+ // index.html
+ // ./foo.html
+ #[test]
+ fn test_find_relative() -> Result<()> {
+ let root = PathBuf::from("index.html");
+ let dir = tempfile::tempdir()?;
+ let dst = dir.path().join("./foo.html");
+ File::create(&dst)?;
+ assert_eq!(find(&root, &dst)?, dst);
+ Ok(())
+ }
+
+ // ./index.html
+ // ./foo.html
+ #[test]
+ fn test_find_relative_index() -> Result<()> {
+ let root = PathBuf::from("./index.html");
+ let dir = tempfile::tempdir()?;
+ let dst = dir.path().join("./foo.html");
+ File::create(&dst)?;
+ assert_eq!(find(&root, &dst)?, dst);
+ Ok(())
+ }
+
+ #[test]
+ fn test_find_relative_nonexistent() -> Result<()> {
+ let root = PathBuf::from("index.html");
+ // This file does not exist
+ let dst = PathBuf::from("./foo.html");
+ assert!(find(&root, &dst).is_err());
+ Ok(())
+ }
+
+ // /path/to/index.html
+ // ./foo.html
+ #[test]
+ fn test_find_relative_from_absolute() -> Result<()> {
+ let dir = tempfile::tempdir()?;
+ let root = dir.path().join("index.html");
+ // We create the absolute path to foo.html,
+ // but we address it under its relative path
+ let dst = PathBuf::from("./foo.html");
+ let dst_absolute = dir.path().join("./foo.html");
+ File::create(&dst_absolute)?;
+ assert_eq!(find(&root, &dst)?, dst_absolute);
+ Ok(())
+ }
+
+ // /path/to/index.html
+ // ./foo.html (non-existent)
+ #[test]
+ fn test_find_relative_from_absolute_nonexistent() -> Result<()> {
+ let dir = tempfile::tempdir()?;
+ let root = dir.path().join("index.html");
+ // foo.html is intentionally never created here,
+ // so resolving its relative path against root must fail
+ let dst = PathBuf::from("./foo.html");
+ assert!(find(&root, &dst).is_err());
+ Ok(())
+ }
+
+ // /path/to/index.html
+ // /other/path/to/foo.html
+ #[test]
+ fn test_find_absolute_from_absolute() -> Result<()> {
+ let root = PathBuf::from("/path/to/index.html");
+ let dir = tempfile::tempdir()?;
+ let dst = dir.path().join("foo.html");
+ File::create(&dst)?;
+ assert_eq!(find(&root, &dst)?, dst);
+ Ok(())
+ }
+
+ // /path/to
+ // /other/path/to/foo.html
+ #[test]
+ fn test_root_is_dir() -> Result<()> {
+ let root = PathBuf::from("/path/to/");
+ let dir = tempfile::tempdir()?;
+ let dst = dir.path().join("foo.html");
+ File::create(&dst)?;
+ assert_eq!(find(&root, &dst)?, dst);
+ Ok(())
+ }
+
+ // /path/to/index.html
+ // /other/path/to
+ #[test]
+ fn test_dst_is_dir() -> Result<()> {
+ let root = PathBuf::from("/path/to/");
+ let dir = tempfile::tempdir()?;
+ File::create(&dir)?;
+ assert!(find(&root, &dir.into_path()).is_err());
+ Ok(())
+ }
+}
diff --git a/lychee-lib/src/lib.rs b/lychee-lib/src/lib.rs
index 71ba9d6986..169b3d48eb 100644
--- a/lychee-lib/src/lib.rs
+++ b/lychee-lib/src/lib.rs
@@ -47,13 +47,13 @@ doc_comment::doctest!("../../README.md");
mod client;
mod client_pool;
+/// Collection of links from a set of inputs
+pub mod collector;
+mod fs_tree;
mod quirks;
mod types;
mod uri;
-/// A pool of clients, to handle concurrent checks
-pub mod collector;
-
/// Functionality to extract URIs from inputs
pub mod extract;
@@ -75,8 +75,8 @@ use ring as _; // required for apple silicon
pub use crate::{
client::{check, ClientBuilder},
client_pool::ClientPool,
- collector::{Collector, Input},
+ collector::Collector,
filter::{Excludes, Filter, Includes},
- types::{ErrorKind, Request, Response, ResponseBody, Result, Status},
+ types::{ErrorKind, Input, Request, Response, ResponseBody, Result, Status},
uri::Uri,
};
diff --git a/lychee-lib/src/types/error.rs b/lychee-lib/src/types/error.rs
index 0710f5ea0f..89c60c5912 100644
--- a/lychee-lib/src/types/error.rs
+++ b/lychee-lib/src/types/error.rs
@@ -25,6 +25,8 @@ pub enum ErrorKind {
/// A possible error when converting a `HeaderValue` from a string or byte
/// slice.
InvalidHeader(InvalidHeaderValue),
+ /// Cannot find local file
+ FileNotFound(PathBuf),
/// The given UNIX glob pattern is invalid
InvalidGlobPattern(glob::PatternError),
/// The Github API could not be called because of a missing Github token.
@@ -63,6 +65,7 @@ impl Hash for ErrorKind {
Self::IoError(p, e) => (p, e.kind()).hash(state),
Self::ReqwestError(e) => e.to_string().hash(state),
Self::HubcapsError(e) => e.to_string().hash(state),
+ Self::FileNotFound(e) => e.to_string_lossy().hash(state),
Self::UrlParseError(s, e) => (s, e.type_id()).hash(state),
Self::UnreachableEmailAddress(u) | Self::InsecureURL(u) => u.hash(state),
Self::InvalidHeader(e) => e.to_string().hash(state),
@@ -84,6 +87,7 @@ impl Display for ErrorKind {
Self::IoError(None, e) => e.fmt(f),
Self::ReqwestError(e) => e.fmt(f),
Self::HubcapsError(e) => e.fmt(f),
+ Self::FileNotFound(e) => write!(f, "{}", e.to_string_lossy()),
Self::UrlParseError(s, (url_err, Some(mail_err))) => {
write!(
f,
diff --git a/lychee-lib/src/types/file.rs b/lychee-lib/src/types/file.rs
new file mode 100644
index 0000000000..1afe52504f
--- /dev/null
+++ b/lychee-lib/src/types/file.rs
@@ -0,0 +1,37 @@
+use std::path::Path;
+
+#[derive(Copy, Clone, Debug, PartialEq, Eq)]
+/// `FileType` defines which file types lychee can handle
+pub enum FileType {
+ /// File in HTML format
+ Html,
+ /// File in Markdown format
+ Markdown,
+ /// Generic text file without syntax-specific parsing
+ Plaintext,
+}
+
+impl Default for FileType {
+ fn default() -> Self {
+ Self::Plaintext
+ }
+}
+
+impl> From for FileType {
+ /// Detect if the given path points to a Markdown, HTML, or plaintext file.
+ fn from(p: P) -> FileType {
+ let path = p.as_ref();
+ // Assume HTML in case of no extension.
+ // Note: this is only reasonable for URLs; not paths on disk.
+ // For example, `README` without an extension is more likely to be a plaintext file.
+ // A better solution would be to also implement `From for FileType`.
+ // Unfortunately that's not possible without refactoring, as
+ // `AsRef` could be implemented for `Url` in the future, which is why
+ // `From for FileType` is not allowed.
+ match path.extension().and_then(std::ffi::OsStr::to_str) {
+ Some("md") | Some("markdown") => FileType::Markdown,
+ Some("htm") | Some("html") | None => FileType::Html,
+ Some(_) => FileType::Plaintext,
+ }
+ }
+}
diff --git a/lychee-lib/src/types/input.rs b/lychee-lib/src/types/input.rs
new file mode 100644
index 0000000000..bc8178275b
--- /dev/null
+++ b/lychee-lib/src/types/input.rs
@@ -0,0 +1,203 @@
+use crate::types::FileType;
+use crate::Result;
+use glob::glob_with;
+use reqwest::Url;
+use serde::Serialize;
+use shellexpand::tilde;
+use std::path::{Path, PathBuf};
+use std::{fmt::Display, fs::read_to_string};
+use tokio::io::{stdin, AsyncReadExt};
+
+const STDIN: &str = "-";
+#[derive(Debug, Clone, PartialEq, Eq, Hash)]
+#[non_exhaustive]
+/// An exhaustive list of input sources, which lychee accepts
+pub enum Input {
+ /// URL (of HTTP/HTTPS scheme).
+ RemoteUrl(Box),
+ /// Unix shell-style glob pattern.
+ FsGlob {
+ /// The glob pattern matching all input files
+ pattern: String,
+ /// Don't be case sensitive when matching files against a glob
+ ignore_case: bool,
+ },
+ /// File path.
+ FsPath(PathBuf),
+ /// Standard Input.
+ Stdin,
+ /// Raw string input.
+ String(String),
+}
+
+impl Serialize for Input {
+ fn serialize(&self, serializer: S) -> std::result::Result
+ where
+ S: serde::Serializer,
+ {
+ serializer.collect_str(self)
+ }
+}
+
+impl Display for Input {
+ fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+ f.write_str(match self {
+ Input::RemoteUrl(url) => url.as_str(),
+ Input::FsGlob { pattern, .. } => pattern,
+ Input::FsPath(path) => path.to_str().unwrap_or_default(),
+ Input::Stdin => "stdin",
+ Input::String(_) => "raw input string",
+ })
+ }
+}
+
+#[derive(Debug)]
+/// Encapsulates the content for a given input
+pub struct InputContent {
+ /// Input source
+ pub input: Input,
+ /// File type of given input
+ pub file_type: FileType,
+ /// Raw UTF-8 string content
+ pub content: String,
+}
+
+impl InputContent {
+ #[must_use]
+ /// Create an instance of `InputContent` from an input string
+ pub fn from_string(s: &str, file_type: FileType) -> Self {
+ // TODO: consider using Cow (to avoid one .clone() for String types)
+ Self {
+ input: Input::String(s.to_owned()),
+ file_type,
+ content: s.to_owned(),
+ }
+ }
+}
+
+impl Input {
+ #[must_use]
+ /// Construct a new `Input` source. In case the input is a `glob` pattern,
+ /// `glob_ignore_case` decides whether matching files against the `glob` is
+ /// case-insensitive or not
+ pub fn new(value: &str, glob_ignore_case: bool) -> Self {
+ if value == STDIN {
+ Self::Stdin
+ } else if let Ok(url) = Url::parse(&value) {
+ Self::RemoteUrl(Box::new(url))
+ } else {
+ // this seems to be the only way to determine if this is a glob pattern
+ let is_glob = glob::Pattern::escape(value) != value;
+
+ if is_glob {
+ Self::FsGlob {
+ pattern: value.to_owned(),
+ ignore_case: glob_ignore_case,
+ }
+ } else {
+ Self::FsPath(value.into())
+ }
+ }
+ }
+
+ #[allow(clippy::missing_panics_doc)]
+ /// Retrieve the contents from the input
+ ///
+ /// # Errors
+ ///
+ /// Returns an error if the contents can not be retrieved
+ /// because of an underlying I/O error (e.g. an error while making a
+ /// network request or retrieving the contents from the file system)
+ pub async fn get_contents(
+ &self,
+ file_type_hint: Option,
+ skip_missing: bool,
+ ) -> Result> {
+ match *self {
+ // TODO: should skip_missing also affect URLs?
+ Input::RemoteUrl(ref url) => Ok(vec![Self::url_contents(url).await?]),
+ Input::FsGlob {
+ ref pattern,
+ ignore_case,
+ } => Ok(Self::glob_contents(pattern, ignore_case).await?),
+ Input::FsPath(ref path) => {
+ let content = Self::path_content(path);
+ match content {
+ Ok(input_content) => Ok(vec![input_content]),
+ Err(_) if skip_missing => Ok(vec![]),
+ Err(e) => Err(e),
+ }
+ }
+ Input::Stdin => Ok(vec![Self::stdin_content(file_type_hint).await?]),
+ Input::String(ref s) => Ok(vec![Self::string_content(s, file_type_hint)]),
+ }
+ }
+
+ async fn url_contents(url: &Url) -> Result {
+ // Assume HTML for default paths
+ let file_type = if url.path().is_empty() || url.path() == "/" {
+ FileType::Html
+ } else {
+ FileType::from(url.as_str())
+ };
+
+ let res = reqwest::get(url.clone()).await?;
+ let input_content = InputContent {
+ input: Input::RemoteUrl(Box::new(url.clone())),
+ file_type,
+ content: res.text().await?,
+ };
+
+ Ok(input_content)
+ }
+
+ async fn glob_contents(path_glob: &str, ignore_case: bool) -> Result> {
+ let mut contents = vec![];
+ let glob_expanded = tilde(&path_glob);
+ let mut match_opts = glob::MatchOptions::new();
+
+ match_opts.case_sensitive = !ignore_case;
+
+ for entry in glob_with(&glob_expanded, match_opts)? {
+ match entry {
+ Ok(path) => {
+ let content = Self::path_content(&path)?;
+ contents.push(content);
+ }
+ Err(e) => println!("{:?}", e),
+ }
+ }
+
+ Ok(contents)
+ }
+
+ /// Get the input content of a given path
+ pub fn path_content + AsRef + Clone>(path: P) -> Result {
+ let content = read_to_string(&path).map_err(|e| (path.clone().into(), e))?;
+ let input_content = InputContent {
+ file_type: FileType::from(path.as_ref()),
+ content,
+ input: Input::FsPath(path.into()),
+ };
+
+ Ok(input_content)
+ }
+
+ async fn stdin_content(file_type_hint: Option) -> Result {
+ let mut content = String::new();
+ let mut stdin = stdin();
+ stdin.read_to_string(&mut content).await?;
+
+ let input_content = InputContent {
+ input: Input::Stdin,
+ file_type: file_type_hint.unwrap_or_default(),
+ content,
+ };
+
+ Ok(input_content)
+ }
+
+ fn string_content(s: &str, file_type_hint: Option) -> InputContent {
+ InputContent::from_string(s, file_type_hint.unwrap_or_default())
+ }
+}
diff --git a/lychee-lib/src/types/mod.rs b/lychee-lib/src/types/mod.rs
index a48f7a90fc..552b87fc19 100644
--- a/lychee-lib/src/types/mod.rs
+++ b/lychee-lib/src/types/mod.rs
@@ -1,11 +1,15 @@
#![allow(unreachable_pub)]
mod error;
+mod file;
+mod input;
mod request;
mod response;
mod status;
pub use error::ErrorKind;
+pub use file::FileType;
+pub use input::{Input, InputContent};
pub use request::Request;
pub use response::{Response, ResponseBody};
pub use status::Status;
From d5bb7ee7d7c50dea96d7363ac9f802f0e7c24876 Mon Sep 17 00:00:00 2001
From: Matthias Endler
Date: Thu, 17 Jun 2021 18:12:07 +0200
Subject: [PATCH 02/46] Or Patterns (Rust 1.53)
---
lychee-lib/src/extract.rs | 4 +---
lychee-lib/src/fs_tree.rs | 12 ++++++------
lychee-lib/src/lib.rs | 1 +
lychee-lib/src/types/file.rs | 4 ++--
lychee-lib/src/types/input.rs | 3 +++
5 files changed, 13 insertions(+), 11 deletions(-)
diff --git a/lychee-lib/src/extract.rs b/lychee-lib/src/extract.rs
index 31c9e3cfab..1dd310b1e9 100644
--- a/lychee-lib/src/extract.rs
+++ b/lychee-lib/src/extract.rs
@@ -26,9 +26,7 @@ fn extract_links_from_markdown(input: &str) -> Vec {
let parser = Parser::new(input);
parser
.flat_map(|event| match event {
- MDEvent::Start(Tag::Link(_, url, _) | Tag::Image(_, url, _)) => {
- vec![url.to_string()]
- }
+ MDEvent::Start(Tag::Link(_, url, _) | Tag::Image(_, url, _)) => vec![url.to_string()],
MDEvent::Text(txt) => extract_links_from_plaintext(&txt.to_string()),
MDEvent::Html(html) => extract_links_from_html(&html.to_string()),
_ => vec![],
diff --git a/lychee-lib/src/fs_tree.rs b/lychee-lib/src/fs_tree.rs
index a1d9bd40d6..b3255ead22 100644
--- a/lychee-lib/src/fs_tree.rs
+++ b/lychee-lib/src/fs_tree.rs
@@ -1,21 +1,21 @@
use crate::{ErrorKind, Result};
-use std::path::PathBuf;
+use std::path::{Path, PathBuf};
-pub(crate) fn find(root: &PathBuf, dst: &PathBuf) -> Result {
+pub(crate) fn find(root: &Path, dst: &Path) -> Result {
if dst.exists() {
- return Ok(dst.clone());
+ return Ok(dst.to_path_buf());
}
if dst.is_dir() {
- return Err(ErrorKind::FileNotFound(dst.clone()));
+ return Err(ErrorKind::FileNotFound(dst.into()));
}
// Find `dst` in the `root` path
if let Some(parent) = root.parent() {
- let rel = parent.join(dst);
+ let rel = parent.join(dst.to_path_buf());
if rel.exists() {
return Ok(rel);
}
}
- return Err(ErrorKind::FileNotFound(dst.clone()));
+ Err(ErrorKind::FileNotFound(dst.to_path_buf()))
}
#[cfg(test)]
diff --git a/lychee-lib/src/lib.rs b/lychee-lib/src/lib.rs
index 169b3d48eb..62257d5dc1 100644
--- a/lychee-lib/src/lib.rs
+++ b/lychee-lib/src/lib.rs
@@ -41,6 +41,7 @@
)]
#![deny(anonymous_parameters, macro_use_extern_crate, pointer_structural_match)]
#![deny(missing_docs)]
+#![allow(clippy::module_name_repetitions)]
#[cfg(doctest)]
doc_comment::doctest!("../../README.md");
diff --git a/lychee-lib/src/types/file.rs b/lychee-lib/src/types/file.rs
index 1afe52504f..d0d9510024 100644
--- a/lychee-lib/src/types/file.rs
+++ b/lychee-lib/src/types/file.rs
@@ -29,8 +29,8 @@ impl> From for FileType {
// `AsRef` could be implemented for `Url` in the future, which is why
// `From for FileType` is not allowed.
match path.extension().and_then(std::ffi::OsStr::to_str) {
- Some("md") | Some("markdown") => FileType::Markdown,
- Some("htm") | Some("html") | None => FileType::Html,
+ Some("md" | "markdown") => FileType::Markdown,
+ Some("htm" | "html") | None => FileType::Html,
Some(_) => FileType::Plaintext,
}
}
diff --git a/lychee-lib/src/types/input.rs b/lychee-lib/src/types/input.rs
index bc8178275b..20a9f2f9e9 100644
--- a/lychee-lib/src/types/input.rs
+++ b/lychee-lib/src/types/input.rs
@@ -172,6 +172,9 @@ impl Input {
}
/// Get the input content of a given path
+ /// # Errors
+ ///
+ /// Will return `Err` if file contents can't be read
pub fn path_content + AsRef + Clone>(path: P) -> Result {
let content = read_to_string(&path).map_err(|e| (path.clone().into(), e))?;
let input_content = InputContent {
From f9bf52ef10be804193bb394cddcdaa0952fb5c84 Mon Sep 17 00:00:00 2001
From: Matthias
Date: Sun, 20 Jun 2021 18:58:20 +0200
Subject: [PATCH 03/46] Add support for base_dir
---
examples/collect_links/collect_links.rs | 4 +-
lychee-bin/src/main.rs | 13 ++++--
lychee-bin/src/options.rs | 7 ++-
lychee-lib/src/collector.rs | 19 ++++++---
lychee-lib/src/extract.rs | 7 ++-
lychee-lib/src/fs_tree.rs | 57 ++++++++++++++++++-------
6 files changed, 78 insertions(+), 29 deletions(-)
diff --git a/examples/collect_links/collect_links.rs b/examples/collect_links/collect_links.rs
index fbff5f5316..fc97cbd390 100644
--- a/examples/collect_links/collect_links.rs
+++ b/examples/collect_links/collect_links.rs
@@ -14,8 +14,8 @@ async fn main() -> Result<()> {
];
let links = Collector::new(
- None, // base_url
- false, // don't skip missing inputs
+ None, // base_url
+ None, false, // don't skip missing inputs
10, // max concurrency
)
.collect_links(
diff --git a/lychee-bin/src/main.rs b/lychee-bin/src/main.rs
index 535fa1a9f1..585159f76f 100644
--- a/lychee-bin/src/main.rs
+++ b/lychee-bin/src/main.rs
@@ -197,10 +197,15 @@ async fn run(cfg: &Config, inputs: Vec ) -> Result {
.client()
.map_err(|e| anyhow!(e))?;
- let links = Collector::new(cfg.base_url.clone(), cfg.skip_missing, max_concurrency)
- .collect_links(&inputs)
- .await
- .map_err(|e| anyhow!(e))?;
+ let links = Collector::new(
+ cfg.base_url.clone(),
+ cfg.base_dir.clone(),
+ cfg.skip_missing,
+ max_concurrency,
+ )
+ .collect_links(&inputs)
+ .await
+ .map_err(|e| anyhow!(e))?;
let pb = if cfg.no_progress {
None
diff --git a/lychee-bin/src/options.rs b/lychee-bin/src/options.rs
index ab463df66b..07b62d2c23 100644
--- a/lychee-bin/src/options.rs
+++ b/lychee-bin/src/options.rs
@@ -218,7 +218,12 @@ pub(crate) struct Config {
pub(crate) method: String,
/// Base URL to check relative URLs
- #[structopt(short, long, parse(try_from_str))]
+ #[structopt(long, parse(try_from_str))]
+ #[serde(default)]
+ pub(crate) base_dir: Option,
+
+ /// Base URL to check relative URLs
+ #[structopt(long, parse(try_from_str))]
#[serde(default)]
pub(crate) base_url: Option,
diff --git a/lychee-lib/src/collector.rs b/lychee-lib/src/collector.rs
index 31416cc991..232ddb069f 100644
--- a/lychee-lib/src/collector.rs
+++ b/lychee-lib/src/collector.rs
@@ -1,11 +1,12 @@
use crate::{extract::extract_links, uri::Uri, Input, Request, Result};
use reqwest::Url;
-use std::collections::HashSet;
+use std::{collections::HashSet, path::PathBuf};
/// Collector keeps the state of link collection
#[derive(Debug, Clone)]
pub struct Collector {
base_url: Option,
+ base_dir: Option,
skip_missing_inputs: bool,
max_concurrency: usize,
cache: HashSet,
@@ -14,9 +15,15 @@ pub struct Collector {
impl Collector {
/// Create a new collector with an empty cache
#[must_use]
- pub fn new(base_url: Option, skip_missing_inputs: bool, max_concurrency: usize) -> Self {
+ pub fn new(
+ base_url: Option,
+ base_dir: Option,
+ skip_missing_inputs: bool,
+ max_concurrency: usize,
+ ) -> Self {
Collector {
base_url,
+ base_dir,
skip_missing_inputs,
max_concurrency,
cache: HashSet::new(),
@@ -52,8 +59,10 @@ impl Collector {
while let Some(result) = contents_rx.recv().await {
for input_content in result? {
let base_url = self.base_url.clone();
- let handle =
- tokio::task::spawn_blocking(move || extract_links(&input_content, &base_url));
+ let base_dir = self.base_dir.clone();
+ let handle = tokio::task::spawn_blocking(move || {
+ extract_links(&input_content, &base_url, &base_dir)
+ });
extract_links_handles.push(handle);
}
}
@@ -160,7 +169,7 @@ mod test {
},
];
- let responses = Collector::new(None, false, 8)
+ let responses = Collector::new(None, None, false, 8)
.collect_links(&inputs)
.await?;
let mut links = responses.into_iter().map(|r| r.uri).collect::>();
diff --git a/lychee-lib/src/extract.rs b/lychee-lib/src/extract.rs
index 1dd310b1e9..5bfafaeb60 100644
--- a/lychee-lib/src/extract.rs
+++ b/lychee-lib/src/extract.rs
@@ -11,6 +11,7 @@ use pulldown_cmark::{Event as MDEvent, Parser, Tag};
use url::Url;
use crate::{
+ fs_tree,
types::{FileType, InputContent},
Input, Request, Result, Uri,
};
@@ -106,6 +107,7 @@ fn extract_links_from_plaintext(input: &str) -> Vec {
pub(crate) fn extract_links(
input_content: &InputContent,
base_url: &Option,
+ base_dir: &Option,
) -> Result> {
let links = match input_content.file_type {
FileType::Markdown => extract_links_from_markdown(&input_content.content),
@@ -125,9 +127,9 @@ pub(crate) fn extract_links(
input_content.input.clone(),
));
} else if let Input::FsPath(root) = &input_content.input {
- if let Ok(path) = crate::fs_tree::find(&root, &PathBuf::from(&link)) {
+ if let Ok(path) = fs_tree::find(&root, &PathBuf::from(&link), base_dir) {
let input_content = Input::path_content(path)?;
- requests.extend(extract_links(&input_content, base_url)?);
+ requests.extend(extract_links(&input_content, base_url, base_dir)?);
} else {
info!("Cannot find path to {} in filesystem", &link);
}
@@ -183,6 +185,7 @@ mod test {
extract_links(
&InputContent::from_string(input, file_type),
&base_url.map(|u| Url::parse(u).unwrap()),
+ &None,
)
// unwrap is fine here as this helper function is only used in tests
.unwrap()
diff --git a/lychee-lib/src/fs_tree.rs b/lychee-lib/src/fs_tree.rs
index b3255ead22..44c9791356 100644
--- a/lychee-lib/src/fs_tree.rs
+++ b/lychee-lib/src/fs_tree.rs
@@ -1,18 +1,30 @@
use crate::{ErrorKind, Result};
use std::path::{Path, PathBuf};
-pub(crate) fn find(root: &Path, dst: &Path) -> Result {
+pub(crate) fn find(src: &Path, dst: &Path, base_dir: &Option) -> Result {
if dst.exists() {
return Ok(dst.to_path_buf());
}
if dst.is_dir() {
return Err(ErrorKind::FileNotFound(dst.into()));
}
- // Find `dst` in the `root` path
- if let Some(parent) = root.parent() {
- let rel = parent.join(dst.to_path_buf());
- if rel.exists() {
- return Ok(rel);
+ if dst.is_absolute() {
+ // Absolute local links (leading slash) require the base_url to
+ // define the document root.
+ if let Some(base_dir) = base_dir {
+ let absolute = base_dir.join(dst.to_path_buf());
+ if absolute.exists() {
+ return Ok(absolute);
+ }
+ }
+ }
+ if dst.is_relative() {
+ // Find `dst` in the `root` path
+ if let Some(parent) = src.parent() {
+ let relative = parent.join(dst.to_path_buf());
+ if relative.exists() {
+ return Ok(relative);
+ }
}
}
Err(ErrorKind::FileNotFound(dst.to_path_buf()))
@@ -33,7 +45,7 @@ mod test_fs_tree {
let dir = tempfile::tempdir()?;
let dst = dir.path().join("foo.html");
File::create(&dst)?;
- assert_eq!(find(&dummy, &dst)?, dst);
+ assert_eq!(find(&dummy, &dst, &None)?, dst);
Ok(())
}
@@ -45,7 +57,7 @@ mod test_fs_tree {
let dir = tempfile::tempdir()?;
let dst = dir.path().join("./foo.html");
File::create(&dst)?;
- assert_eq!(find(&root, &dst)?, dst);
+ assert_eq!(find(&root, &dst, &None)?, dst);
Ok(())
}
@@ -57,7 +69,7 @@ mod test_fs_tree {
let dir = tempfile::tempdir()?;
let dst = dir.path().join("./foo.html");
File::create(&dst)?;
- assert_eq!(find(&root, &dst)?, dst);
+ assert_eq!(find(&root, &dst, &None)?, dst);
Ok(())
}
@@ -66,7 +78,7 @@ mod test_fs_tree {
let root = PathBuf::from("index.html");
// This file does not exist
let dst = PathBuf::from("./foo.html");
- assert!(find(&root, &dst).is_err());
+ assert!(find(&root, &dst, &None).is_err());
Ok(())
}
@@ -81,7 +93,22 @@ mod test_fs_tree {
let dst = PathBuf::from("./foo.html");
let dst_absolute = dir.path().join("./foo.html");
File::create(&dst_absolute)?;
- assert_eq!(find(&root, &dst)?, dst_absolute);
+ assert_eq!(find(&root, &dst, &None)?, dst_absolute);
+ Ok(())
+ }
+
+ // dummy
+ // /path/to/foo.html (absolute path inside the base dir)
+ // valid base dir
+ #[test]
+ fn test_find_absolute_from_base_dir() -> Result<()> {
+ let dummy = PathBuf::new();
+ let dir = tempfile::tempdir()?;
+ let dst = dir.path().join("foo.html");
+ File::create(&dst)?;
+ let base_dir = dir.path().to_path_buf();
+ let dst_absolute = base_dir.join(dst.to_path_buf());
+ assert_eq!(find(&dummy, &dst, &Some(base_dir))?, dst_absolute);
Ok(())
}
@@ -94,7 +121,7 @@ mod test_fs_tree {
// We create the absolute path to foo.html,
// but we address it under its relative path
let dst = PathBuf::from("./foo.html");
- assert!(find(&root, &dst).is_err());
+ assert!(find(&root, &dst, &None).is_err());
Ok(())
}
@@ -106,7 +133,7 @@ mod test_fs_tree {
let dir = tempfile::tempdir()?;
let dst = dir.path().join("foo.html");
File::create(&dst)?;
- assert_eq!(find(&root, &dst)?, dst);
+ assert_eq!(find(&root, &dst, &None)?, dst);
Ok(())
}
@@ -118,7 +145,7 @@ mod test_fs_tree {
let dir = tempfile::tempdir()?;
let dst = dir.path().join("foo.html");
File::create(&dst)?;
- assert_eq!(find(&root, &dst)?, dst);
+ assert_eq!(find(&root, &dst, &None)?, dst);
Ok(())
}
@@ -129,7 +156,7 @@ mod test_fs_tree {
let root = PathBuf::from("/path/to/");
let dir = tempfile::tempdir()?;
File::create(&dir)?;
- assert!(find(&root, &dir.into_path()).is_err());
+ assert!(find(&root, &dir.into_path(), &None).is_err());
Ok(())
}
}
From 4fbd337326f6a0d4651a20bd8bae55496c5fb433 Mon Sep 17 00:00:00 2001
From: Matthias
Date: Wed, 23 Jun 2021 00:14:11 +0200
Subject: [PATCH 04/46] Add install target and fix build phony
---
Makefile | 6 +++++-
1 file changed, 5 insertions(+), 1 deletion(-)
diff --git a/Makefile b/Makefile
index f79461469a..a0b985d870 100644
--- a/Makefile
+++ b/Makefile
@@ -18,10 +18,14 @@ docker-run: ## Run Docker image
docker-push: ## Push image to Docker Hub
docker push $(IMAGE_NAME)
-.PHONY: build-local
+.PHONY: build
build: ## Build Rust code locally
cargo build
+.PHONY: install
+install: ## Install project locally
+ cargo install --path lychee-bin
+
.PHONY: run
run: ## Run Rust code locally
cargo run
From 185645ac81fbaaf0ca646d41e3ac7e6ff5660006 Mon Sep 17 00:00:00 2001
From: Matthias
Date: Wed, 23 Jun 2021 00:14:21 +0200
Subject: [PATCH 05/46] Update docs
---
README.md | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/README.md b/README.md
index 4d7be79241..326a00c22c 100644
--- a/README.md
+++ b/README.md
@@ -194,7 +194,8 @@ FLAGS:
OPTIONS:
-a, --accept Comma-separated list of accepted status codes for valid links
- -b, --base-url Base URL to check relative URLs
+ -b, --base Base URL or website root directory to check relative URLs e.g.
+ https://example.org or `/path/to/public`
--basic-auth Basic authentication support. E.g. `username:password`
-c, --config Configuration file to use [default: ./lychee.toml]
--exclude ... Exclude URLs from checking (supports regex)
From bfa3b1b6a1cfb5bd52b081d3ccf626580d780bd6 Mon Sep 17 00:00:00 2001
From: Matthias
Date: Wed, 23 Jun 2021 00:18:12 +0200
Subject: [PATCH 06/46] Introduce Base type, which can be a path or URL
---
examples/collect_links/collect_links.rs | 6 +-
lychee-bin/src/main.rs | 13 +--
lychee-bin/src/options.rs | 23 +++---
lychee-lib/src/collector.rs | 31 +++-----
lychee-lib/src/extract.rs | 37 ++++-----
lychee-lib/src/fs_tree.rs | 11 ++-
lychee-lib/src/lib.rs | 2 +-
lychee-lib/src/types/base.rs | 100 ++++++++++++++++++++++++
lychee-lib/src/types/error.rs | 4 +
lychee-lib/src/types/mod.rs | 2 +
10 files changed, 162 insertions(+), 67 deletions(-)
create mode 100644 lychee-lib/src/types/base.rs
diff --git a/examples/collect_links/collect_links.rs b/examples/collect_links/collect_links.rs
index fc97cbd390..60c37f9291 100644
--- a/examples/collect_links/collect_links.rs
+++ b/examples/collect_links/collect_links.rs
@@ -14,12 +14,12 @@ async fn main() -> Result<()> {
];
let links = Collector::new(
- None, // base_url
- None, false, // don't skip missing inputs
+ None, // base
+ false, // don't skip missing inputs
10, // max concurrency
)
.collect_links(
- inputs, // base_url
+ inputs, // base url or directory
)
.await?;
diff --git a/lychee-bin/src/main.rs b/lychee-bin/src/main.rs
index 585159f76f..e1493389ab 100644
--- a/lychee-bin/src/main.rs
+++ b/lychee-bin/src/main.rs
@@ -197,15 +197,10 @@ async fn run(cfg: &Config, inputs: Vec ) -> Result {
.client()
.map_err(|e| anyhow!(e))?;
- let links = Collector::new(
- cfg.base_url.clone(),
- cfg.base_dir.clone(),
- cfg.skip_missing,
- max_concurrency,
- )
- .collect_links(&inputs)
- .await
- .map_err(|e| anyhow!(e))?;
+ let links = Collector::new(cfg.base.clone(), cfg.skip_missing, max_concurrency)
+ .collect_links(&inputs)
+ .await
+ .map_err(|e| anyhow!(e))?;
let pb = if cfg.no_progress {
None
diff --git a/lychee-bin/src/options.rs b/lychee-bin/src/options.rs
index 07b62d2c23..b97236715c 100644
--- a/lychee-bin/src/options.rs
+++ b/lychee-bin/src/options.rs
@@ -1,9 +1,8 @@
-use std::{fs, io::ErrorKind, path::PathBuf, str::FromStr};
+use std::{convert::TryFrom, fs, io::ErrorKind, path::PathBuf, str::FromStr};
use anyhow::{anyhow, Error, Result};
use lazy_static::lazy_static;
-use lychee_lib::Input;
-use reqwest::Url;
+use lychee_lib::{Base, Input};
use serde::Deserialize;
use structopt::{clap::crate_version, StructOpt};
@@ -76,6 +75,10 @@ macro_rules! fold_in {
};
}
+fn parse_base(src: &str) -> Result {
+ Base::try_from(src)
+}
+
#[derive(Debug, StructOpt)]
#[structopt(
name = "lychee",
@@ -217,15 +220,11 @@ pub(crate) struct Config {
#[serde(default = "method")]
pub(crate) method: String,
- /// Base URL to check relative URLs
- #[structopt(long, parse(try_from_str))]
- #[serde(default)]
- pub(crate) base_dir: Option,
-
- /// Base URL to check relative URLs
- #[structopt(long, parse(try_from_str))]
+ /// Base URL or website root directory to check relative URLs
+ /// e.g. https://example.org or `/path/to/public`
+ #[structopt(short, long, parse(try_from_str = parse_base))]
#[serde(default)]
- pub(crate) base_url: Option,
+ pub(crate) base: Option ,
/// Basic authentication support. E.g. `username:password`
#[structopt(long)]
@@ -310,7 +309,7 @@ impl Config {
accept: None;
timeout: TIMEOUT;
method: METHOD;
- base_url: None;
+ base: None;
basic_auth: None;
github_token: None;
skip_missing: false;
diff --git a/lychee-lib/src/collector.rs b/lychee-lib/src/collector.rs
index 232ddb069f..712dc090c5 100644
--- a/lychee-lib/src/collector.rs
+++ b/lychee-lib/src/collector.rs
@@ -1,12 +1,10 @@
-use crate::{extract::extract_links, uri::Uri, Input, Request, Result};
-use reqwest::Url;
-use std::{collections::HashSet, path::PathBuf};
+use crate::{extract::extract_links, uri::Uri, Base, Input, Request, Result};
+use std::collections::HashSet;
/// Collector keeps the state of link collection
#[derive(Debug, Clone)]
pub struct Collector {
- base_url: Option,
- base_dir: Option,
+ base: Option ,
skip_missing_inputs: bool,
max_concurrency: usize,
cache: HashSet,
@@ -15,15 +13,9 @@ pub struct Collector {
impl Collector {
/// Create a new collector with an empty cache
#[must_use]
- pub fn new(
- base_url: Option,
- base_dir: Option,
- skip_missing_inputs: bool,
- max_concurrency: usize,
- ) -> Self {
+ pub fn new(base: Option , skip_missing_inputs: bool, max_concurrency: usize) -> Self {
Collector {
- base_url,
- base_dir,
+ base,
skip_missing_inputs,
max_concurrency,
cache: HashSet::new(),
@@ -31,7 +23,8 @@ impl Collector {
}
/// Fetch all unique links from a slice of inputs
- /// All relative URLs get prefixed with `base_url` if given.
+ /// All relative URLs get prefixed with `base` if given.
+ /// (This can be a directory or a base URL)
///
/// # Errors
///
@@ -58,11 +51,9 @@ impl Collector {
while let Some(result) = contents_rx.recv().await {
for input_content in result? {
- let base_url = self.base_url.clone();
- let base_dir = self.base_dir.clone();
- let handle = tokio::task::spawn_blocking(move || {
- extract_links(&input_content, &base_url, &base_dir)
- });
+ let base = self.base.clone();
+ let handle =
+ tokio::task::spawn_blocking(move || extract_links(&input_content, &base));
extract_links_handles.push(handle);
}
}
@@ -169,7 +160,7 @@ mod test {
},
];
- let responses = Collector::new(None, None, false, 8)
+ let responses = Collector::new(None, false, 8)
.collect_links(&inputs)
.await?;
let mut links = responses.into_iter().map(|r| r.uri).collect::>();
diff --git a/lychee-lib/src/extract.rs b/lychee-lib/src/extract.rs
index 5bfafaeb60..2f65c2625a 100644
--- a/lychee-lib/src/extract.rs
+++ b/lychee-lib/src/extract.rs
@@ -8,12 +8,11 @@ use linkify::LinkFinder;
use log::info;
use markup5ever_rcdom::{Handle, NodeData, RcDom};
use pulldown_cmark::{Event as MDEvent, Parser, Tag};
-use url::Url;
use crate::{
fs_tree,
types::{FileType, InputContent},
- Input, Request, Result, Uri,
+ Base, Input, Request, Result, Uri,
};
// Use LinkFinder here to offload the actual link searching in plaintext.
@@ -106,8 +105,7 @@ fn extract_links_from_plaintext(input: &str) -> Vec {
pub(crate) fn extract_links(
input_content: &InputContent,
- base_url: &Option,
- base_dir: &Option,
+ base: &Option ,
) -> Result> {
let links = match input_content.file_type {
FileType::Markdown => extract_links_from_markdown(&input_content.content),
@@ -121,15 +119,15 @@ pub(crate) fn extract_links(
for link in links {
if let Ok(uri) = Uri::try_from(link.as_str()) {
requests.insert(Request::new(uri, input_content.input.clone()));
- } else if let Some(new_url) = base_url.as_ref().and_then(|u| u.join(&link).ok()) {
+ } else if let Some(new_url) = base.as_ref().and_then(|u| u.join(&link)) {
requests.insert(Request::new(
Uri { url: new_url },
input_content.input.clone(),
));
} else if let Input::FsPath(root) = &input_content.input {
- if let Ok(path) = fs_tree::find(&root, &PathBuf::from(&link), base_dir) {
+ if let Ok(path) = fs_tree::find(&root, &PathBuf::from(&link), base) {
let input_content = Input::path_content(path)?;
- requests.extend(extract_links(&input_content, base_url, base_dir)?);
+ requests.extend(extract_links(&input_content, base)?);
} else {
info!("Cannot find path to {} in filesystem", &link);
}
@@ -157,11 +155,14 @@ mod test {
extract_links, extract_links_from_html, extract_links_from_markdown,
extract_links_from_plaintext, find_links,
};
- use crate::types::{FileType, InputContent};
use crate::{
test_utils::{mail, website},
Uri,
};
+ use crate::{
+ types::{FileType, InputContent},
+ Base,
+ };
fn load_fixture(filename: &str) -> String {
let fixture_path = Path::new(env!("CARGO_MANIFEST_DIR"))
@@ -182,16 +183,16 @@ mod test {
}
fn extract_uris(input: &str, file_type: FileType, base_url: Option<&str>) -> HashSet {
- extract_links(
- &InputContent::from_string(input, file_type),
- &base_url.map(|u| Url::parse(u).unwrap()),
- &None,
- )
- // unwrap is fine here as this helper function is only used in tests
- .unwrap()
- .into_iter()
- .map(|r| r.uri)
- .collect()
+ let base = match base_url {
+ Some(url) => Some(Base::Remote(Url::parse(url).unwrap())),
+ None => None,
+ };
+ extract_links(&InputContent::from_string(input, file_type), &base)
+ // unwrap is fine here as this helper function is only used in tests
+ .unwrap()
+ .into_iter()
+ .map(|r| r.uri)
+ .collect()
}
#[test]
diff --git a/lychee-lib/src/fs_tree.rs b/lychee-lib/src/fs_tree.rs
index 44c9791356..344dd1a665 100644
--- a/lychee-lib/src/fs_tree.rs
+++ b/lychee-lib/src/fs_tree.rs
@@ -1,7 +1,7 @@
-use crate::{ErrorKind, Result};
+use crate::{Base, ErrorKind, Result};
use std::path::{Path, PathBuf};
-pub(crate) fn find(src: &Path, dst: &Path, base_dir: &Option) -> Result {
+pub(crate) fn find(src: &Path, dst: &Path, base: &Option ) -> Result {
if dst.exists() {
return Ok(dst.to_path_buf());
}
@@ -11,7 +11,7 @@ pub(crate) fn find(src: &Path, dst: &Path, base_dir: &Option) -> Result
if dst.is_absolute() {
// Absolute local links (leading slash) require the base_url to
// define the document root.
- if let Some(base_dir) = base_dir {
+ if let Some(base_dir) = base.as_ref().and_then(|b| b.dir()) {
let absolute = base_dir.join(dst.to_path_buf());
if absolute.exists() {
return Ok(absolute);
@@ -108,7 +108,10 @@ mod test_fs_tree {
File::create(&dst)?;
let base_dir = dir.path().to_path_buf();
let dst_absolute = base_dir.join(dst.to_path_buf());
- assert_eq!(find(&dummy, &dst, &Some(base_dir))?, dst_absolute);
+ assert_eq!(
+ find(&dummy, &dst, &Some(Base::Local(base_dir)))?,
+ dst_absolute
+ );
Ok(())
}
diff --git a/lychee-lib/src/lib.rs b/lychee-lib/src/lib.rs
index 62257d5dc1..3d57e5d72d 100644
--- a/lychee-lib/src/lib.rs
+++ b/lychee-lib/src/lib.rs
@@ -78,6 +78,6 @@ pub use crate::{
client_pool::ClientPool,
collector::Collector,
filter::{Excludes, Filter, Includes},
- types::{ErrorKind, Input, Request, Response, ResponseBody, Result, Status},
+ types::{Base, ErrorKind, Input, Request, Response, ResponseBody, Result, Status},
uri::Uri,
};
diff --git a/lychee-lib/src/types/base.rs b/lychee-lib/src/types/base.rs
new file mode 100644
index 0000000000..a3db9b81b8
--- /dev/null
+++ b/lychee-lib/src/types/base.rs
@@ -0,0 +1,100 @@
+use reqwest::Url;
+use serde::{Deserialize, Serialize};
+use std::{convert::TryFrom, path::PathBuf};
+
+use crate::ErrorKind;
+
+/// When encountering links without a full domain in a document,
+/// the base determines where this resource can be found.
+/// Both local and remote targets are supported.
+#[derive(Debug, PartialEq, Eq, Serialize, Deserialize, Clone)]
+
+pub enum Base {
+ /// Local file path pointing to root directory
+ Local(PathBuf),
+ /// Remote URL pointing to a website homepage
+ Remote(Url),
+}
+
+impl Base {
+ /// Join link with base url
+ pub fn join(&self, link: &str) -> Option {
+ match self {
+ Self::Remote(url) => url.join(link).ok(),
+ Self::Local(_) => None,
+ }
+ }
+
+ /// Return the directory if the base is local
+ pub fn dir(&self) -> Option {
+ match self {
+ Self::Remote(_) => None,
+ Self::Local(d) => Some(d.to_path_buf()),
+ }
+ }
+}
+
+impl TryFrom<&str> for Base {
+ type Error = ErrorKind;
+
+ fn try_from(value: &str) -> Result {
+ if let Ok(url) = Url::parse(&value) {
+ if url.cannot_be_a_base() {
+ return Err(ErrorKind::InvalidBase(
+ value.to_string(),
+ "The given URL cannot be a base".to_string(),
+ ));
+ }
+ return Ok(Self::Remote(url));
+ }
+ // Only accept existing directories as path
+ let path = PathBuf::from(&value);
+ if !path.is_dir() {
+ return Err(ErrorKind::InvalidBase(
+ value.to_string(),
+ "The given base path is not a directory".to_string(),
+ ));
+ }
+ if !path.exists() {
+ return Err(ErrorKind::InvalidBase(
+ value.to_string(),
+ "The given base directory does not exist".to_string(),
+ ));
+ }
+ Ok(Self::Local(path))
+ }
+}
+
+#[cfg(test)]
+mod test_base {
+ use crate::Result;
+
+ use super::*;
+
+ #[test]
+ fn test_valid_remote() -> Result<()> {
+ let base = Base::try_from("https://endler.dev")?;
+ assert_eq!(
+ base,
+ Base::Remote(Url::parse("https://endler.dev").unwrap())
+ );
+ Ok(())
+ }
+
+ #[test]
+ fn test_invalid_url() {
+ assert!(Base::try_from("data:text/plain,Hello?World#").is_err());
+ }
+
+ #[test]
+ fn test_valid_local() -> Result<()> {
+ let dir = tempfile::tempdir()?;
+ Base::try_from(dir.as_ref().to_str().unwrap())?;
+ Ok(())
+ }
+
+ #[test]
+ fn test_invalid_local() {
+ assert!(Base::try_from("/asdfasdd20asdfljvvvzzcv/j2ofasd").is_err());
+ }
+}
diff --git a/lychee-lib/src/types/error.rs b/lychee-lib/src/types/error.rs
index 89c60c5912..9b187ccb06 100644
--- a/lychee-lib/src/types/error.rs
+++ b/lychee-lib/src/types/error.rs
@@ -25,6 +25,8 @@ pub enum ErrorKind {
/// A possible error when converting a `HeaderValue` from a string or byte
/// slice.
InvalidHeader(InvalidHeaderValue),
+ /// The given string can not be parsed into a valid base URL or base directory
+ InvalidBase(String, String),
/// Cannot find local file
FileNotFound(PathBuf),
/// The given UNIX glob pattern is invalid
@@ -71,6 +73,7 @@ impl Hash for ErrorKind {
Self::InvalidHeader(e) => e.to_string().hash(state),
Self::InvalidGlobPattern(e) => e.to_string().hash(state),
Self::MissingGitHubToken => std::mem::discriminant(self).hash(state),
+ ErrorKind::InvalidBase(base, e) => (base, e).hash(state),
}
}
}
@@ -110,6 +113,7 @@ impl Display for ErrorKind {
"This URL is available in HTTPS protocol, but HTTP is provided, use '{}' instead",
uri
),
+ Self::InvalidBase(base, e) => write!(f, "Error while base dir `{}` : {}", base, e),
}
}
}
diff --git a/lychee-lib/src/types/mod.rs b/lychee-lib/src/types/mod.rs
index 552b87fc19..63fec726ce 100644
--- a/lychee-lib/src/types/mod.rs
+++ b/lychee-lib/src/types/mod.rs
@@ -1,5 +1,6 @@
#![allow(unreachable_pub)]
+mod base;
mod error;
mod file;
mod input;
@@ -7,6 +8,7 @@ mod request;
mod response;
mod status;
+pub use base::Base;
pub use error::ErrorKind;
pub use file::FileType;
pub use input::{Input, InputContent};
From 887f1b9589ec3cc7ab23d00e74c4986e7033c02e Mon Sep 17 00:00:00 2001
From: Matthias
Date: Thu, 1 Jul 2021 01:44:12 +0200
Subject: [PATCH 07/46] Split up file checking into file discovery and
validation of path exists
---
lychee-lib/src/client.rs | 14 ++++-
lychee-lib/src/collector.rs | 2 +-
lychee-lib/src/extract.rs | 40 ++++++-------
lychee-lib/src/filter/mod.rs | 2 +-
lychee-lib/src/{fs_tree.rs => fs.rs} | 90 ++++++++++++++++++----------
lychee-lib/src/lib.rs | 6 +-
lychee-lib/src/types/error.rs | 21 ++++++-
lychee-lib/src/types/mod.rs | 2 +
8 files changed, 114 insertions(+), 63 deletions(-)
rename lychee-lib/src/{fs_tree.rs => fs.rs} (67%)
diff --git a/lychee-lib/src/client.rs b/lychee-lib/src/client.rs
index c1175ba697..a69167c7d2 100644
--- a/lychee-lib/src/client.rs
+++ b/lychee-lib/src/client.rs
@@ -20,8 +20,7 @@ use typed_builder::TypedBuilder;
use crate::{
filter::{Excludes, Filter, Includes},
quirks::Quirks,
- uri::Uri,
- ErrorKind, Request, Response, Result, Status,
+ ErrorKind, Request, Response, Result, Status, Uri,
};
const DEFAULT_MAX_REDIRECTS: usize = 5;
@@ -178,6 +177,8 @@ impl Client {
let Request { uri, source } = Request::try_from(request)?;
let status = if self.filter.is_excluded(&uri) {
Status::Excluded
+ } else if uri.is_file() {
+ self.check_file(&uri).await
} else if uri.is_mail() {
self.check_mail(&uri).await
} else {
@@ -250,6 +251,15 @@ impl Client {
}
}
+ pub async fn check_file(&self, uri: &Uri) -> Status {
+ if let Ok(path) = uri.inner.to_file_path() {
+ if path.exists() {
+ return Status::Ok(StatusCode::OK);
+ }
+ }
+ ErrorKind::InvalidFileUri(uri.clone()).into()
+ }
+
pub async fn check_mail(&self, uri: &Uri) -> Status {
let input = CheckEmailInput::new(vec![uri.as_str().to_owned()]);
let result = &(check_email(&input).await)[0];
diff --git a/lychee-lib/src/collector.rs b/lychee-lib/src/collector.rs
index 712dc090c5..b5e69d96ba 100644
--- a/lychee-lib/src/collector.rs
+++ b/lychee-lib/src/collector.rs
@@ -1,4 +1,4 @@
-use crate::{extract::extract_links, uri::Uri, Base, Input, Request, Result};
+use crate::{extract::extract_links, Base, Input, Request, Result, Uri};
use std::collections::HashSet;
/// Collector keeps the state of link collection
diff --git a/lychee-lib/src/extract.rs b/lychee-lib/src/extract.rs
index 2f65c2625a..65b6ee831e 100644
--- a/lychee-lib/src/extract.rs
+++ b/lychee-lib/src/extract.rs
@@ -8,11 +8,12 @@ use linkify::LinkFinder;
use log::info;
use markup5ever_rcdom::{Handle, NodeData, RcDom};
use pulldown_cmark::{Event as MDEvent, Parser, Tag};
+use reqwest::Url;
use crate::{
- fs_tree,
+ fs,
types::{FileType, InputContent},
- Base, Input, Request, Result, Uri,
+ Base, ErrorKind, Input, Request, Result, Uri,
};
// Use LinkFinder here to offload the actual link searching in plaintext.
@@ -113,27 +114,27 @@ pub(crate) fn extract_links(
FileType::Plaintext => extract_links_from_plaintext(&input_content.content),
};
- // Only keep legit URLs. This sorts out things like anchors.
- // Silently ignore the parse failures for now.
+ // Only keep legit URLs. For example this filters out anchors.
let mut requests: HashSet = HashSet::new();
for link in links {
- if let Ok(uri) = Uri::try_from(link.as_str()) {
- requests.insert(Request::new(uri, input_content.input.clone()));
+ let req = if let Ok(uri) = Uri::try_from(link.as_str()) {
+ Request::new(uri, input_content.input.clone())
} else if let Some(new_url) = base.as_ref().and_then(|u| u.join(&link)) {
- requests.insert(Request::new(
- Uri { url: new_url },
- input_content.input.clone(),
- ));
+ Request::new(Uri { inner: new_url }, input_content.input.clone())
} else if let Input::FsPath(root) = &input_content.input {
- if let Ok(path) = fs_tree::find(&root, &PathBuf::from(&link), base) {
- let input_content = Input::path_content(path)?;
- requests.extend(extract_links(&input_content, base)?);
- } else {
- info!("Cannot find path to {} in filesystem", &link);
- }
+ let link = fs::sanitize(link);
+ let path = fs::resolve(&root, &PathBuf::from(&link), base)?;
+ Request::new(
+ Uri {
+ inner: Url::from_file_path(&path).map_err(|_e| ErrorKind::InvalidPath(path))?,
+ },
+ input_content.input.clone(),
+ )
} else {
info!("Handling of {} not implemented yet", &link);
- }
+ continue;
+ };
+ requests.insert(req);
}
Ok(requests)
}
@@ -151,10 +152,7 @@ mod test {
use pretty_assertions::assert_eq;
use url::Url;
- use super::{
- extract_links, extract_links_from_html, extract_links_from_markdown,
- extract_links_from_plaintext, find_links,
- };
+ use super::*;
use crate::{
test_utils::{mail, website},
Uri,
diff --git a/lychee-lib/src/filter/mod.rs b/lychee-lib/src/filter/mod.rs
index f9daac8bfa..0726aa67d0 100644
--- a/lychee-lib/src/filter/mod.rs
+++ b/lychee-lib/src/filter/mod.rs
@@ -6,7 +6,7 @@ use std::{collections::HashSet, net::IpAddr};
pub use excludes::Excludes;
pub use includes::Includes;
-use crate::uri::Uri;
+use crate::Uri;
/// Pre-defined exclusions for known false-positives
static FALSE_POSITIVE_PAT: &[&str] = &[r"http://www.w3.org/1999/xhtml"];
diff --git a/lychee-lib/src/fs_tree.rs b/lychee-lib/src/fs.rs
similarity index 67%
rename from lychee-lib/src/fs_tree.rs
rename to lychee-lib/src/fs.rs
index 344dd1a665..98255b18ae 100644
--- a/lychee-lib/src/fs_tree.rs
+++ b/lychee-lib/src/fs.rs
@@ -1,35 +1,49 @@
use crate::{Base, ErrorKind, Result};
use std::path::{Path, PathBuf};
-pub(crate) fn find(src: &Path, dst: &Path, base: &Option ) -> Result {
- if dst.exists() {
- return Ok(dst.to_path_buf());
- }
- if dst.is_dir() {
- return Err(ErrorKind::FileNotFound(dst.into()));
+// Returns the base directory if the base is a local path (`None` for a remote base or no base)
+fn get_base_dir(base: &Option ) -> Option {
+ base.as_ref().and_then(|b| b.dir())
+}
+
+pub(crate) fn resolve(src: &Path, dst: &Path, base: &Option ) -> Result {
+ if dst.is_relative() {
+ // Find `dst` in the parent directory of `src`
+ if let Some(parent) = src.parent() {
+ let rel_path = parent.join(dst.to_path_buf());
+ return Ok(rel_path);
+ }
}
if dst.is_absolute() {
// Absolute local links (leading slash) require the base_url to
// define the document root.
- if let Some(base_dir) = base.as_ref().and_then(|b| b.dir()) {
- let absolute = base_dir.join(dst.to_path_buf());
- if absolute.exists() {
- return Ok(absolute);
- }
- }
- }
- if dst.is_relative() {
- // Find `dst` in the `root` path
- if let Some(parent) = src.parent() {
- let relative = parent.join(dst.to_path_buf());
- if relative.exists() {
- return Ok(relative);
- }
+ if let Some(base_dir) = get_base_dir(base) {
+ let abs_path = join(base_dir, dst);
+ return Ok(abs_path);
}
}
Err(ErrorKind::FileNotFound(dst.to_path_buf()))
}
+// Concatenate `dst` onto `base` as raw OS strings; unlike `Path::join`, an
+// absolute `dst` does not replace `base`. See https://github.com/rust-lang/rust/issues/16507
+fn join(base: PathBuf, dst: &Path) -> PathBuf {
+ let mut abs = base.into_os_string();
+ let target_str = dst.as_os_str();
+ abs.push(target_str);
+ PathBuf::from(abs)
+}
+
+/// A little helper function to strip the query string (everything from the first `?`) from a link.
+/// The link is passed as a `String` rather than a URL because it may not have a base domain.
+pub(crate) fn sanitize(link: String) -> String {
+ let path = match link.split_once('?') {
+ Some((path, _params)) => path,
+ None => link.as_str(),
+ };
+ path.to_string()
+}
+
#[cfg(test)]
mod test_fs_tree {
use std::fs::File;
@@ -37,6 +51,31 @@ mod test_fs_tree {
use super::*;
use crate::Result;
+ #[test]
+ fn test_sanitize() {
+ assert_eq!(sanitize("/".to_string()), "/".to_string());
+ assert_eq!(
+ sanitize("index.html?foo=bar".to_string()),
+ "index.html".to_string()
+ );
+ assert_eq!(
+ sanitize("/index.html?foo=bar".to_string()),
+ "/index.html".to_string()
+ );
+ assert_eq!(
+ sanitize("/index.html?foo=bar&baz=zorx?bla=blub".to_string()),
+ "/index.html".to_string()
+ );
+ assert_eq!(
+ sanitize("https://example.org/index.html?foo=bar".to_string()),
+ "https://example.org/index.html".to_string()
+ );
+ assert_eq!(
+ sanitize("test.png?foo=bar".to_string()),
+ "test.png".to_string()
+ );
+ }
+
// dummy root
// /path/to/foo.html
#[test]
@@ -151,15 +190,4 @@ mod test_fs_tree {
assert_eq!(find(&root, &dst, &None)?, dst);
Ok(())
}
-
- // /path/to/index.html
- // /other/path/to
- #[test]
- fn test_dst_is_dir() -> Result<()> {
- let root = PathBuf::from("/path/to/");
- let dir = tempfile::tempdir()?;
- File::create(&dir)?;
- assert!(find(&root, &dir.into_path(), &None).is_err());
- Ok(())
- }
}
diff --git a/lychee-lib/src/lib.rs b/lychee-lib/src/lib.rs
index 3d57e5d72d..5af79c6c9a 100644
--- a/lychee-lib/src/lib.rs
+++ b/lychee-lib/src/lib.rs
@@ -50,10 +50,9 @@ mod client;
mod client_pool;
/// A pool of clients, to handle concurrent checks
pub mod collector;
-mod fs_tree;
+mod fs;
mod quirks;
mod types;
-mod uri;
/// Functionality to extract URIs from inputs
pub mod extract;
@@ -78,6 +77,5 @@ pub use crate::{
client_pool::ClientPool,
collector::Collector,
filter::{Excludes, Filter, Includes},
- types::{Base, ErrorKind, Input, Request, Response, ResponseBody, Result, Status},
- uri::Uri,
+ types::{Base, ErrorKind, Input, Request, Response, ResponseBody, Result, Status, Uri},
};
diff --git a/lychee-lib/src/types/error.rs b/lychee-lib/src/types/error.rs
index 9b187ccb06..60cafa073d 100644
--- a/lychee-lib/src/types/error.rs
+++ b/lychee-lib/src/types/error.rs
@@ -10,15 +10,20 @@ use crate::Uri;
#[derive(Debug)]
#[non_exhaustive]
pub enum ErrorKind {
- // TODO: maybe need to be splitted; currently first slot is Some only for reading files
+ // TODO: maybe needs to be split; currently first element is `Some` only for
+ // reading files
/// Any form of I/O error occurred while reading from a given path.
IoError(Option<PathBuf>, std::io::Error),
/// Network error when trying to connect to an endpoint via reqwest.
ReqwestError(reqwest::Error),
/// Network error when trying to connect to an endpoint via hubcaps.
HubcapsError(hubcaps::Error),
- /// The given string can not be parsed into a valid URL or e-mail address
+ /// The given string can not be parsed into a valid URL, e-mail address, or file path
UrlParseError(String, (url::ParseError, Option)),
+ /// The given URI cannot be converted to a file path
+ InvalidFileUri(Uri),
+ /// The given path cannot be converted to a URI
+ InvalidPath(PathBuf),
/// The given mail address is unreachable
UnreachableEmailAddress(Uri),
/// The given header could not be parsed.
@@ -70,10 +75,12 @@ impl Hash for ErrorKind {
Self::FileNotFound(e) => e.to_string_lossy().hash(state),
Self::UrlParseError(s, e) => (s, e.type_id()).hash(state),
Self::UnreachableEmailAddress(u) | Self::InsecureURL(u) => u.hash(state),
+ Self::InvalidFileUri(u) => u.hash(state),
+ Self::InvalidPath(p) => p.hash(state),
+ Self::UnreachableEmailAddress(u) => u.hash(state),
Self::InvalidHeader(e) => e.to_string().hash(state),
Self::InvalidGlobPattern(e) => e.to_string().hash(state),
Self::MissingGitHubToken => std::mem::discriminant(self).hash(state),
- ErrorKind::InvalidBase(base, e) => (base, e).hash(state),
}
}
}
@@ -101,6 +108,8 @@ impl Display for ErrorKind {
Self::UrlParseError(s, (url_err, None)) => {
write!(f, "Cannot parse {} as website url ({})", s, url_err)
}
+ Self::InvalidFileUri(u) => write!(f, "Invalid file URI: {}", u),
+ Self::InvalidPath(p) => write!(f, "Invalid path: {}", p.display()),
Self::UnreachableEmailAddress(uri) => write!(f, "Unreachable mail address: {}", uri),
Self::InvalidHeader(e) => e.fmt(f),
Self::InvalidGlobPattern(e) => e.fmt(f),
@@ -157,6 +166,12 @@ impl From for ErrorKind {
}
}
+impl From<url::ParseError> for ErrorKind {
+ fn from(e: url::ParseError) -> Self {
+ Self::UrlParseError("Cannot parse URL".to_string(), (e, None))
+ }
+}
+
impl From<(String, url::ParseError)> for ErrorKind {
fn from(value: (String, url::ParseError)) -> Self {
Self::UrlParseError(value.0, (value.1, None))
diff --git a/lychee-lib/src/types/mod.rs b/lychee-lib/src/types/mod.rs
index 63fec726ce..9453d5ee9f 100644
--- a/lychee-lib/src/types/mod.rs
+++ b/lychee-lib/src/types/mod.rs
@@ -7,6 +7,7 @@ mod input;
mod request;
mod response;
mod status;
+mod uri;
pub use base::Base;
pub use error::ErrorKind;
@@ -15,6 +16,7 @@ pub use input::{Input, InputContent};
pub use request::Request;
pub use response::{Response, ResponseBody};
pub use status::Status;
+pub use uri::Uri;
/// The lychee `Result` type
pub type Result<T> = std::result::Result<T, crate::ErrorKind>;
From d51a49db461d51391b0555ab6a78befcc19352a2 Mon Sep 17 00:00:00 2001
From: Matthias
Date: Thu, 1 Jul 2021 01:44:21 +0200
Subject: [PATCH 08/46] Move uri to types
---
lychee-lib/src/{ => types}/uri.rs | 21 +++++++++++++--------
1 file changed, 13 insertions(+), 8 deletions(-)
rename lychee-lib/src/{ => types}/uri.rs (94%)
diff --git a/lychee-lib/src/uri.rs b/lychee-lib/src/types/uri.rs
similarity index 94%
rename from lychee-lib/src/uri.rs
rename to lychee-lib/src/types/uri.rs
index a25aad395f..aaf7d0c0fe 100644
--- a/lychee-lib/src/uri.rs
+++ b/lychee-lib/src/types/uri.rs
@@ -6,14 +6,14 @@ use url::Url;
use crate::{ErrorKind, Result};
-/// Lychee's own representation of a URI, which encapsulates all support formats.
+/// Lychee's own representation of a URI, which encapsulates all supported formats.
///
/// If the scheme is `mailto`, it's a mail address.
/// Otherwise it's treated as a website URL.
#[derive(Clone, Debug, PartialOrd, Ord, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub struct Uri {
/// Website URL or mail address
- pub(crate) url: Url,
+ pub(crate) inner: Url,
}
impl Uri {
@@ -24,21 +24,21 @@ impl Uri {
#[inline]
#[must_use]
pub fn as_str(&self) -> &str {
- self.url.as_ref().trim_start_matches("mailto:")
+ self.inner.as_ref().trim_start_matches("mailto:")
}
#[inline]
#[must_use]
/// Returns the scheme of the URI (e.g. `http` or `mailto`)
pub fn scheme(&self) -> &str {
- self.url.scheme()
+ self.inner.scheme()
}
#[inline]
#[must_use]
/// Returns the domain of the URI (e.g. `example.org`)
pub fn domain(&self) -> Option<&str> {
- self.url.domain()
+ self.inner.domain()
}
#[inline]
@@ -49,14 +49,14 @@ impl Uri {
///
/// Return `None` for cannot-be-a-base URLs.
pub fn path_segments(&self) -> Option<std::str::Split<char>> {
- self.url.path_segments()
+ self.inner.path_segments()
}
#[must_use]
/// Returns the IP address (either IPv4 or IPv6) of the URI,
/// or `None` if it is a domain
pub fn host_ip(&self) -> Option<IpAddr> {
- match self.url.host()? {
+ match self.inner.host()? {
url::Host::Domain(_) => None,
url::Host::Ipv4(v4_addr) => Some(v4_addr.into()),
url::Host::Ipv6(v6_addr) => Some(v6_addr.into()),
@@ -85,6 +85,11 @@ impl Uri {
pub(crate) fn is_mail(&self) -> bool {
self.scheme() == "mailto"
}
+
+ #[inline]
+ pub(crate) fn is_file(&self) -> bool {
+ self.scheme() == "file"
+ }
}
impl AsRef for Uri {
@@ -95,7 +100,7 @@ impl AsRef for Uri {
impl From<Url> for Uri {
fn from(url: Url) -> Self {
- Self { url }
+ Self { inner: url }
}
}
From d924c25669c272b9c16b40c838b902fb5802ec73 Mon Sep 17 00:00:00 2001
From: Matthias
Date: Thu, 1 Jul 2021 01:44:52 +0200
Subject: [PATCH 09/46] Non-existing directories are fine for URI base for
files
---
lychee-lib/src/types/base.rs | 23 ++---------------------
1 file changed, 2 insertions(+), 21 deletions(-)
diff --git a/lychee-lib/src/types/base.rs b/lychee-lib/src/types/base.rs
index a3db9b81b8..8b7f2f866a 100644
--- a/lychee-lib/src/types/base.rs
+++ b/lychee-lib/src/types/base.rs
@@ -8,7 +8,7 @@ use crate::ErrorKind;
/// the base determines where this resource can be found.
/// Both, local and remote targets are supported.
#[derive(Debug, PartialEq, Eq, Serialize, Deserialize, Clone)]
-
+#[allow(variant_size_differences)]
pub enum Base {
/// Local file path pointing to root directory
Local(PathBuf),
@@ -47,21 +47,7 @@ impl TryFrom<&str> for Base {
}
return Ok(Self::Remote(url));
}
- // Only accept existing directories as path
- let path = PathBuf::from(&value);
- if !path.is_dir() {
- return Err(ErrorKind::InvalidBase(
- value.to_string(),
- "The given base path is not a directory".to_string(),
- ));
- }
- if !path.exists() {
- return Err(ErrorKind::InvalidBase(
- value.to_string(),
- "The given base directory does not exist".to_string(),
- ));
- }
- Ok(Self::Local(path))
+ Ok(Self::Local(PathBuf::from(value)))
}
}
@@ -92,9 +78,4 @@ mod test_base {
Base::try_from(dir.as_ref().to_str().unwrap())?;
Ok(())
}
-
- #[test]
- fn test_invalid_local() {
- assert!(Base::try_from("/asdfasdd20asdfljvvvzzcv/j2ofasd").is_err());
- }
}
From f5ee472d930b244ebf67a57f0321b87964accf1e Mon Sep 17 00:00:00 2001
From: Matthias
Date: Mon, 5 Jul 2021 01:33:32 +0200
Subject: [PATCH 10/46] explicit naming
---
fixtures/TEST_SCHEMES.txt | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/fixtures/TEST_SCHEMES.txt b/fixtures/TEST_SCHEMES.txt
index 29ab3b53d2..47a061e41e 100644
--- a/fixtures/TEST_SCHEMES.txt
+++ b/fixtures/TEST_SCHEMES.txt
@@ -1,3 +1,3 @@
slack://channel?id=123
-file://foo/bar
+file:///test_folder/test_file
https://example.org
From ee70e13bf7f4337fd82b3cb94497374522e10d5c Mon Sep 17 00:00:00 2001
From: Matthias
Date: Mon, 5 Jul 2021 01:34:35 +0200
Subject: [PATCH 11/46] Check real link to file
---
fixtures/TEST.md | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/fixtures/TEST.md b/fixtures/TEST.md
index be6d5a0c9b..dc07cc6e0b 100644
--- a/fixtures/TEST.md
+++ b/fixtures/TEST.md
@@ -1,5 +1,5 @@
-This link should be ignored as it is not a fully qualified URL.
-![Logo](awesome.png)
+Check file link
+![Logo](../assets/banner.svg)
![Anchors should be ignored](#awesome)
From daa5be4c3ac77ed3385b2afedbbc625ec9a35b89 Mon Sep 17 00:00:00 2001
From: Matthias
Date: Mon, 5 Jul 2021 01:35:36 +0200
Subject: [PATCH 12/46] Add/change file link tests
---
lychee-bin/tests/cli.rs | 27 +++++++++++++++++----------
lychee-bin/tests/local_files.rs | 5 ++---
lychee-lib/src/client.rs | 17 ++++++++++++++++-
3 files changed, 35 insertions(+), 14 deletions(-)
diff --git a/lychee-bin/tests/cli.rs b/lychee-bin/tests/cli.rs
index b42f988a0b..6d0819b579 100644
--- a/lychee-bin/tests/cli.rs
+++ b/lychee-bin/tests/cli.rs
@@ -130,15 +130,22 @@ mod cli {
/// Test unsupported URI schemes
#[test]
- fn test_unsupported_uri_schemes() -> Result<()> {
- test_json_output!(
- "TEST_SCHEMES.txt",
- MockResponseStats {
- total: 1,
- successful: 1,
- ..MockResponseStats::default()
- }
- )
+ fn test_unsupported_uri_schemes() {
+ let mut cmd = main_command();
+ let test_schemes_path = fixtures_path().join("TEST_SCHEMES.txt");
+
+ // Exclude file link because it doesn't exist on the filesystem.
+ // (File URIs are absolute paths, which we don't have.)
+ // Nevertheless, the `file` scheme should be recognized.
+ cmd.arg(test_schemes_path)
+ .arg("--exclude")
+ .arg("file://")
+ .env_clear()
+ .assert()
+ .success()
+ .stdout(contains("Total............2"))
+ .stdout(contains("Successful.......1"))
+ .stdout(contains("Excluded.........1"));
}
#[test]
@@ -364,7 +371,7 @@ mod cli {
.assert()
.success();
- let expected = r#"{"total":10,"successful":10,"failures":0,"timeouts":0,"redirects":0,"excludes":0,"errors":0,"fail_map":{}}"#;
+ let expected = r#"{"total":11,"successful":11,"failures":0,"timeouts":0,"redirects":0,"excludes":0,"errors":0,"fail_map":{}}"#;
let output = fs::read_to_string(&outfile)?;
assert_eq!(output.split_whitespace().collect::<String>(), expected);
fs::remove_file(outfile)?;
diff --git a/lychee-bin/tests/local_files.rs b/lychee-bin/tests/local_files.rs
index ddd0ed25e1..11574e172d 100644
--- a/lychee-bin/tests/local_files.rs
+++ b/lychee-bin/tests/local_files.rs
@@ -19,8 +19,7 @@ mod cli {
writeln!(index, r#"Foo "#)?;
let foo_path = dir.path().join("foo.html");
- let mut foo = File::create(&foo_path)?;
- writeln!(foo, r#"example "#)?;
+ File::create(&foo_path)?;
let mut cmd = main_command();
cmd.arg(index_path)
@@ -30,7 +29,7 @@ mod cli {
.assert()
.success()
.stdout(contains("Total............1"))
- .stdout(contains("example.org"));
+ .stdout(contains("foo.html"));
Ok(())
}
diff --git a/lychee-lib/src/client.rs b/lychee-lib/src/client.rs
index a69167c7d2..e8d477fd7f 100644
--- a/lychee-lib/src/client.rs
+++ b/lychee-lib/src/client.rs
@@ -287,10 +287,14 @@ where
#[cfg(test)]
mod test {
- use std::time::{Duration, Instant};
+ use std::{
+ fs::File,
+ time::{Duration, Instant},
+ };
use http::{header::HeaderMap, StatusCode};
use reqwest::header;
+ use tempfile::tempdir;
use super::ClientBuilder;
use crate::{mock_server, test_utils::get_mock_client_response};
@@ -375,6 +379,17 @@ mod test {
assert!(res.status().is_success());
}
+ #[tokio::test]
+ async fn test_file() {
+ let dir = tempdir().unwrap();
+ let file = dir.path().join("temp");
+ File::create(file).unwrap();
+ let uri = format!("file://{}", dir.path().join("temp").to_str().unwrap());
+
+ let res = get_mock_client_response(uri).await;
+ assert!(res.status().is_success());
+ }
+
#[tokio::test]
async fn test_custom_headers() {
// See https://github.com/rust-lang/crates.io/issues/788
From a3fd85d923b7762d9d98687ff1f9b58f44f6b573 Mon Sep 17 00:00:00 2001
From: Matthias
Date: Mon, 5 Jul 2021 01:36:43 +0200
Subject: [PATCH 13/46] Exclude anchor links
---
lychee-lib/src/extract.rs | 12 ++++++------
1 file changed, 6 insertions(+), 6 deletions(-)
diff --git a/lychee-lib/src/extract.rs b/lychee-lib/src/extract.rs
index 65b6ee831e..039c039425 100644
--- a/lychee-lib/src/extract.rs
+++ b/lychee-lib/src/extract.rs
@@ -123,13 +123,13 @@ pub(crate) fn extract_links(
Request::new(Uri { inner: new_url }, input_content.input.clone())
} else if let Input::FsPath(root) = &input_content.input {
let link = fs::sanitize(link);
+ if link.starts_with("#") {
+ // Silently ignore anchors for now.
+ continue;
+ }
let path = fs::resolve(&root, &PathBuf::from(&link), base)?;
- Request::new(
- Uri {
- inner: Url::from_file_path(&path).map_err(|_e| ErrorKind::InvalidPath(path))?,
- },
- input_content.input.clone(),
- )
+ let uri = Url::from_file_path(&path).map_err(|_e| ErrorKind::InvalidPath(path))?;
+ Request::new(Uri { inner: uri }, input_content.input.clone())
} else {
info!("Handling of {} not implemented yet", &link);
continue;
From 1546d6ee38536570071653753dbe77244a4575c3 Mon Sep 17 00:00:00 2001
From: Matthias
Date: Mon, 5 Jul 2021 01:38:14 +0200
Subject: [PATCH 14/46] Normalize path; fix tests
---
lychee-lib/src/fs.rs | 167 ++++++++++++++++++++++---------------------
1 file changed, 84 insertions(+), 83 deletions(-)
diff --git a/lychee-lib/src/fs.rs b/lychee-lib/src/fs.rs
index 98255b18ae..f7fd6a8680 100644
--- a/lychee-lib/src/fs.rs
+++ b/lychee-lib/src/fs.rs
@@ -1,26 +1,67 @@
use crate::{Base, ErrorKind, Result};
-use std::path::{Path, PathBuf};
+use std::path::{Component, Path, PathBuf};
// Returns the base if it is a valid `PathBuf`
fn get_base_dir(base: &Option<Base>) -> Option<PathBuf> {
base.as_ref().and_then(|b| b.dir())
}
+/// Normalize a path, removing things like `.` and `..`.
+///
+/// CAUTION: This does not resolve symlinks (unlike
+/// [`std::fs::canonicalize`]). This may cause incorrect or surprising
+/// behavior at times. This should be used carefully. Unfortunately,
+/// [`std::fs::canonicalize`] can be hard to use correctly, since it can often
+/// fail, or on Windows returns annoying device paths. This is a problem Cargo
+/// needs to improve on.
+///
+/// Taken from https://github.com/rust-lang/cargo/blob/fede83ccf973457de319ba6fa0e36ead454d2e20/src/cargo/util/paths.rs#L61
+pub(crate) fn normalize(path: &Path) -> PathBuf {
+ let mut components = path.components().peekable();
+ let mut ret = if let Some(c @ Component::Prefix(..)) = components.peek().cloned() {
+ components.next();
+ PathBuf::from(c.as_os_str())
+ } else {
+ PathBuf::new()
+ };
+
+ for component in components {
+ match component {
+ Component::Prefix(..) => unreachable!(),
+ Component::RootDir => {
+ ret.push(component.as_os_str());
+ }
+ Component::CurDir => {}
+ Component::ParentDir => {
+ ret.pop();
+ }
+ Component::Normal(c) => {
+ ret.push(c);
+ }
+ }
+ }
+ ret
+}
+
pub(crate) fn resolve(src: &Path, dst: &Path, base: &Option<Base>) -> Result<PathBuf> {
if dst.is_relative() {
// Find `dst` in the parent directory of `src`
if let Some(parent) = src.parent() {
let rel_path = parent.join(dst.to_path_buf());
- return Ok(rel_path);
+ return Ok(normalize(&rel_path));
}
}
if dst.is_absolute() {
// Absolute local links (leading slash) require the base_url to
// define the document root.
- if let Some(base_dir) = get_base_dir(base) {
- let abs_path = join(base_dir, dst);
- return Ok(abs_path);
- }
+ let base_dir = get_base_dir(base).unwrap_or(
+ src.to_path_buf()
+ .parent()
+ .map(|p| p.to_path_buf())
+ .unwrap_or(PathBuf::new()),
+ );
+ let abs_path = join(base_dir, dst);
+ return Ok(normalize(&abs_path));
}
Err(ErrorKind::FileNotFound(dst.to_path_buf()))
}
@@ -46,8 +87,6 @@ pub(crate) fn sanitize(link: String) -> String {
#[cfg(test)]
mod test_fs_tree {
- use std::fs::File;
-
use super::*;
use crate::Result;
@@ -79,115 +118,77 @@ mod test_fs_tree {
// dummy root
// /path/to/foo.html
#[test]
- fn test_find_absolute() -> Result<()> {
+ fn test_resolve_absolute() -> Result<()> {
let dummy = PathBuf::new();
- let dir = tempfile::tempdir()?;
- let dst = dir.path().join("foo.html");
- File::create(&dst)?;
- assert_eq!(find(&dummy, &dst, &None)?, dst);
+ let abs_path = PathBuf::from("/absolute/path/to/foo.html");
+ assert_eq!(resolve(&dummy, &abs_path, &None)?, abs_path);
Ok(())
}
// index.html
// ./foo.html
#[test]
- fn test_find_relative() -> Result<()> {
- let root = PathBuf::from("index.html");
- let dir = tempfile::tempdir()?;
- let dst = dir.path().join("./foo.html");
- File::create(&dst)?;
- assert_eq!(find(&root, &dst, &None)?, dst);
+ fn test_resolve_relative() -> Result<()> {
+ let dummy = PathBuf::from("index.html");
+ let abs_path = PathBuf::from("./foo.html");
+ assert_eq!(
+ resolve(&dummy, &abs_path, &None)?,
+ PathBuf::from("foo.html")
+ );
Ok(())
}
// ./index.html
// ./foo.html
#[test]
- fn test_find_relative_index() -> Result<()> {
- let root = PathBuf::from("./index.html");
- let dir = tempfile::tempdir()?;
- let dst = dir.path().join("./foo.html");
- File::create(&dst)?;
- assert_eq!(find(&root, &dst, &None)?, dst);
- Ok(())
- }
-
- #[test]
- fn test_find_relative_nonexistent() -> Result<()> {
- let root = PathBuf::from("index.html");
- // This file does not exist
- let dst = PathBuf::from("./foo.html");
- assert!(find(&root, &dst, &None).is_err());
+ fn test_resolve_relative_index() -> Result<()> {
+ let dummy = PathBuf::from("./index.html");
+ let abs_path = PathBuf::from("./foo.html");
+ assert_eq!(
+ resolve(&dummy, &abs_path, &None)?,
+ PathBuf::from("foo.html")
+ );
Ok(())
}
// /path/to/index.html
// ./foo.html
#[test]
- fn test_find_relative_from_absolute() -> Result<()> {
- let dir = tempfile::tempdir()?;
- let root = dir.path().join("index.html");
- // We create the absolute path to foo.html,
- // but we address it under its relative path
- let dst = PathBuf::from("./foo.html");
- let dst_absolute = dir.path().join("./foo.html");
- File::create(&dst_absolute)?;
- assert_eq!(find(&root, &dst, &None)?, dst_absolute);
+ fn test_resolve_from_absolute() -> Result<()> {
+ let abs_index = PathBuf::from("/path/to/index.html");
+ let abs_path = PathBuf::from("./foo.html");
+ assert_eq!(
+ resolve(&abs_index, &abs_path, &None)?,
+ PathBuf::from("/path/to/foo.html")
+ );
Ok(())
}
// dummy
- // ./foo.html
+ // foo.html
// valid base dir
#[test]
- fn test_find_absolute_from_base_dir() -> Result<()> {
+ fn test_resolve_absolute_from_base_dir() -> Result<()> {
let dummy = PathBuf::new();
- let dir = tempfile::tempdir()?;
- let dst = dir.path().join("foo.html");
- File::create(&dst)?;
- let base_dir = dir.path().to_path_buf();
- let dst_absolute = base_dir.join(dst.to_path_buf());
+ let abs_path = PathBuf::from("/foo.html");
+ let base = Some(Base::Local(PathBuf::from("/some/absolute/base/dir")));
assert_eq!(
- find(&dummy, &dst, &Some(Base::Local(base_dir)))?,
- dst_absolute
+ resolve(&dummy, &abs_path, &base)?,
+ PathBuf::from("/some/absolute/base/dir/foo.html")
);
Ok(())
}
// /path/to/index.html
- // ./foo.html (non-existent)
- #[test]
- fn test_find_relative_from_absolute_nonexistent() -> Result<()> {
- let dir = tempfile::tempdir()?;
- let root = dir.path().join("index.html");
- // We create the absolute path to foo.html,
- // but we address it under its relative path
- let dst = PathBuf::from("./foo.html");
- assert!(find(&root, &dst, &None).is_err());
- Ok(())
- }
-
- // /path/to/index.html
- // /other/path/to/foo.html
- #[test]
- fn test_find_absolute_from_absolute() -> Result<()> {
- let root = PathBuf::from("/path/to/index.html");
- let dir = tempfile::tempdir()?;
- let dst = dir.path().join("foo.html");
- File::create(&dst)?;
- assert_eq!(find(&root, &dst, &None)?, dst);
- Ok(())
- }
-
- // /path/to
// /other/path/to/foo.html
#[test]
- fn test_root_is_dir() -> Result<()> {
- let root = PathBuf::from("/path/to/");
- let dir = tempfile::tempdir()?;
- let dst = dir.path().join("foo.html");
- File::create(&dst)?;
- assert_eq!(find(&root, &dst, &None)?, dst);
+ fn test_resolve_absolute_from_absolute() -> Result<()> {
+ let abs_index = PathBuf::from("/path/to/index.html");
+ let abs_path = PathBuf::from("/other/path/to/foo.html");
+ assert_eq!(
+ resolve(&abs_index, &abs_path, &None)?,
+ PathBuf::from("/path/to/other/path/to/foo.html")
+ );
Ok(())
}
}
From afdb721612f1ec5307bbc0a5eccd186baadb2d44 Mon Sep 17 00:00:00 2001
From: Matthias
Date: Mon, 5 Jul 2021 02:03:00 +0200
Subject: [PATCH 15/46] Fix lints
---
lychee-lib/src/extract.rs | 4 ++--
lychee-lib/src/fs.rs | 43 +++++++++++++----------------------
lychee-lib/src/types/base.rs | 4 +++-
lychee-lib/src/types/error.rs | 1 +
4 files changed, 22 insertions(+), 30 deletions(-)
diff --git a/lychee-lib/src/extract.rs b/lychee-lib/src/extract.rs
index 039c039425..309e63b4c0 100644
--- a/lychee-lib/src/extract.rs
+++ b/lychee-lib/src/extract.rs
@@ -122,8 +122,8 @@ pub(crate) fn extract_links(
} else if let Some(new_url) = base.as_ref().and_then(|u| u.join(&link)) {
Request::new(Uri { inner: new_url }, input_content.input.clone())
} else if let Input::FsPath(root) = &input_content.input {
- let link = fs::sanitize(link);
- if link.starts_with("#") {
+ let link = fs::sanitize(&link);
+ if link.starts_with('#') {
// Silently ignore anchors for now.
continue;
}
diff --git a/lychee-lib/src/fs.rs b/lychee-lib/src/fs.rs
index f7fd6a8680..a5c90fb5c2 100644
--- a/lychee-lib/src/fs.rs
+++ b/lychee-lib/src/fs.rs
@@ -3,7 +3,7 @@ use std::path::{Component, Path, PathBuf};
// Returns the base if it is a valid `PathBuf`
fn get_base_dir(base: &Option<Base>) -> Option<PathBuf> {
- base.as_ref().and_then(|b| b.dir())
+ base.as_ref().and_then(Base::dir)
}
/// Normalize a path, removing things like `.` and `..`.
@@ -15,15 +15,14 @@ fn get_base_dir(base: &Option ) -> Option {
/// fail, or on Windows returns annoying device paths. This is a problem Cargo
/// needs to improve on.
///
-/// Taken from https://github.com/rust-lang/cargo/blob/fede83ccf973457de319ba6fa0e36ead454d2e20/src/cargo/util/paths.rs#L61
+/// Taken from [`cargo`](https://github.com/rust-lang/cargo/blob/fede83ccf973457de319ba6fa0e36ead454d2e20/src/cargo/util/paths.rs#L61)
pub(crate) fn normalize(path: &Path) -> PathBuf {
let mut components = path.components().peekable();
- let mut ret = if let Some(c @ Component::Prefix(..)) = components.peek().cloned() {
+
+ let mut ret = components.peek().copied().map_or_else(PathBuf::new, |c| {
components.next();
PathBuf::from(c.as_os_str())
- } else {
- PathBuf::new()
- };
+ });
for component in components {
match component {
@@ -54,12 +53,11 @@ pub(crate) fn resolve(src: &Path, dst: &Path, base: &Option ) -> Result PathBuf {
/// A little helper function to remove the get parameters from a URL link.
/// The link is not a URL but a String as that link may not have a base domain.
-pub(crate) fn sanitize(link: String) -> String {
+pub(crate) fn sanitize(link: &str) -> String {
let path = match link.split_once('?') {
Some((path, _params)) => path,
- None => link.as_str(),
+ None => link,
};
path.to_string()
}
@@ -92,27 +90,18 @@ mod test_fs_tree {
#[test]
fn test_sanitize() {
- assert_eq!(sanitize("/".to_string()), "/".to_string());
- assert_eq!(
- sanitize("index.html?foo=bar".to_string()),
- "index.html".to_string()
- );
+ assert_eq!(sanitize("/"), "/".to_string());
+ assert_eq!(sanitize("index.html?foo=bar"), "index.html".to_string());
+ assert_eq!(sanitize("/index.html?foo=bar"), "/index.html".to_string());
assert_eq!(
- sanitize("/index.html?foo=bar".to_string()),
+ sanitize("/index.html?foo=bar&baz=zorx?bla=blub"),
"/index.html".to_string()
);
assert_eq!(
- sanitize("/index.html?foo=bar&baz=zorx?bla=blub".to_string()),
- "/index.html".to_string()
- );
- assert_eq!(
- sanitize("https://example.org/index.html?foo=bar".to_string()),
+ sanitize("https://example.org/index.html?foo=bar"),
"https://example.org/index.html".to_string()
);
- assert_eq!(
- sanitize("test.png?foo=bar".to_string()),
- "test.png".to_string()
- );
+ assert_eq!(sanitize("test.png?foo=bar"), "test.png".to_string());
}
// dummy root
diff --git a/lychee-lib/src/types/base.rs b/lychee-lib/src/types/base.rs
index 8b7f2f866a..f38ec29c87 100644
--- a/lychee-lib/src/types/base.rs
+++ b/lychee-lib/src/types/base.rs
@@ -18,6 +18,7 @@ pub enum Base {
impl Base {
/// Join link with base url
+ #[must_use]
pub fn join(&self, link: &str) -> Option<Url> {
match self {
Self::Remote(url) => url.join(link).ok(),
@@ -26,10 +27,11 @@ impl Base {
}
/// Return the directory if the base is local
+ #[must_use]
pub fn dir(&self) -> Option<PathBuf> {
match self {
Self::Remote(_) => None,
- Self::Local(d) => Some(d.to_path_buf()),
+ Self::Local(d) => Some(d.clone()),
}
}
}
diff --git a/lychee-lib/src/types/error.rs b/lychee-lib/src/types/error.rs
index 60cafa073d..7de4608106 100644
--- a/lychee-lib/src/types/error.rs
+++ b/lychee-lib/src/types/error.rs
@@ -76,6 +76,7 @@ impl Hash for ErrorKind {
Self::UrlParseError(s, e) => (s, e.type_id()).hash(state),
Self::UnreachableEmailAddress(u) | Self::InsecureURL(u) => u.hash(state),
Self::InvalidFileUri(u) => u.hash(state),
+ Self::InvalidFileUri(f) => f.hash(state),
Self::InvalidPath(p) => p.hash(state),
Self::UnreachableEmailAddress(u) => u.hash(state),
Self::InvalidHeader(e) => e.to_string().hash(state),
From 4f9dc67bbd85cdf13927d27d15ea84f582e20ed9 Mon Sep 17 00:00:00 2001
From: Matthias
Date: Mon, 5 Jul 2021 11:21:49 +0200
Subject: [PATCH 16/46] fix test
---
lychee-lib/src/fs.rs | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/lychee-lib/src/fs.rs b/lychee-lib/src/fs.rs
index a5c90fb5c2..806166e773 100644
--- a/lychee-lib/src/fs.rs
+++ b/lychee-lib/src/fs.rs
@@ -122,7 +122,7 @@ mod test_fs_tree {
let abs_path = PathBuf::from("./foo.html");
assert_eq!(
resolve(&dummy, &abs_path, &None)?,
- PathBuf::from("foo.html")
+ PathBuf::from("./foo.html")
);
Ok(())
}
From 04bf838f9802a62495e91877bef5394f7581f879 Mon Sep 17 00:00:00 2001
From: Matthias
Date: Mon, 5 Jul 2021 11:25:51 +0200
Subject: [PATCH 17/46] lint
---
lychee-lib/src/extract.rs | 5 +----
1 file changed, 1 insertion(+), 4 deletions(-)
diff --git a/lychee-lib/src/extract.rs b/lychee-lib/src/extract.rs
index 309e63b4c0..a0aaf9cb35 100644
--- a/lychee-lib/src/extract.rs
+++ b/lychee-lib/src/extract.rs
@@ -181,10 +181,7 @@ mod test {
}
fn extract_uris(input: &str, file_type: FileType, base_url: Option<&str>) -> HashSet<Uri> {
- let base = match base_url {
- Some(url) => Some(Base::Remote(Url::parse(url).unwrap())),
- None => None,
- };
+ let base = base_url.map(|url| Base::Remote(Url::parse(url).unwrap()));
extract_links(&InputContent::from_string(input, file_type), &base)
// unwrap is fine here as this helper function is only used in tests
.unwrap()
From b06afb7252f4760ab7ec2bedd49b74d4d4ffe69f Mon Sep 17 00:00:00 2001
From: Matthias
Date: Mon, 5 Jul 2021 11:52:47 +0200
Subject: [PATCH 18/46] fix test
---
lychee-lib/src/fs.rs | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/lychee-lib/src/fs.rs b/lychee-lib/src/fs.rs
index 806166e773..a854602fd5 100644
--- a/lychee-lib/src/fs.rs
+++ b/lychee-lib/src/fs.rs
@@ -135,7 +135,7 @@ mod test_fs_tree {
let abs_path = PathBuf::from("./foo.html");
assert_eq!(
resolve(&dummy, &abs_path, &None)?,
- PathBuf::from("foo.html")
+ PathBuf::from("./foo.html")
);
Ok(())
}
From 495f856c612ef2078c2ea24ad9ac620291ceedc6 Mon Sep 17 00:00:00 2001
From: Matthias
Date: Mon, 5 Jul 2021 21:41:44 +0200
Subject: [PATCH 19/46] cleanup
---
fixtures/TEST_RELATIVE.html | 1 -
fixtures/TEST_RELATIVE_2.html | 1 -
fixtures/TEST_RELATIVE_3.html | 1 -
3 files changed, 3 deletions(-)
delete mode 100644 fixtures/TEST_RELATIVE.html
delete mode 100644 fixtures/TEST_RELATIVE_2.html
delete mode 100644 fixtures/TEST_RELATIVE_3.html
diff --git a/fixtures/TEST_RELATIVE.html b/fixtures/TEST_RELATIVE.html
deleted file mode 100644
index be4b0e517c..0000000000
--- a/fixtures/TEST_RELATIVE.html
+++ /dev/null
@@ -1 +0,0 @@
-Foo
\ No newline at end of file
diff --git a/fixtures/TEST_RELATIVE_2.html b/fixtures/TEST_RELATIVE_2.html
deleted file mode 100644
index 89c3e73ade..0000000000
--- a/fixtures/TEST_RELATIVE_2.html
+++ /dev/null
@@ -1 +0,0 @@
-Bar
\ No newline at end of file
diff --git a/fixtures/TEST_RELATIVE_3.html b/fixtures/TEST_RELATIVE_3.html
deleted file mode 100644
index a1324d8465..0000000000
--- a/fixtures/TEST_RELATIVE_3.html
+++ /dev/null
@@ -1 +0,0 @@
-Example link
\ No newline at end of file
From 5a2e10799f289993573486bad7f125c7bdc172b0 Mon Sep 17 00:00:00 2001
From: Matthias
Date: Wed, 1 Sep 2021 19:06:23 +0200
Subject: [PATCH 20/46] linting
---
.github/workflows/release.yml | 6 +++---
.github/workflows/rust.yml | 2 +-
2 files changed, 4 insertions(+), 4 deletions(-)
diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
index bb1b4451c4..c56d92f3c0 100644
--- a/.github/workflows/release.yml
+++ b/.github/workflows/release.yml
@@ -43,17 +43,17 @@ jobs:
fail-fast: false
steps:
- name: Install musl tools
- if: contains(matrix.target, 'musl')
+ if: ${{ contains(matrix.target, 'musl') }}
run: sudo apt-get install -y musl-tools
- name: Install arm tools
- if: contains(matrix.target, 'arm')
+ if: ${{ contains(matrix.target, 'arm') }}
run: |
echo "GNU_PREFIX=arm-linux-gnueabihf-" >> $GITHUB_ENV
sudo apt-get install -y binutils-arm-linux-gnueabihf
- name: Install aarch64 tools
- if: contains(matrix.target, 'aarch64')
+ if: ${{ contains(matrix.target, 'aarch64') }}
run: |
echo "GNU_PREFIX=aarch64-linux-gnu-" >> $GITHUB_ENV
sudo apt-get install -y binutils-aarch64-linux-gnu
diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml
index 7404bc17d9..4b7369f8c8 100644
--- a/.github/workflows/rust.yml
+++ b/.github/workflows/rust.yml
@@ -56,7 +56,7 @@ jobs:
- run: cargo-publish-all --dry-run
publish:
- if: startsWith(github.ref, 'refs/tags/')
+ if: ${{ startsWith(github.ref, 'refs/tags/') }}
needs:
- test
- lint
From dd3205a87cf46382bf5e3aa0f8d7f82e31359d45 Mon Sep 17 00:00:00 2001
From: Matthias
Date: Thu, 2 Sep 2021 23:10:46 +0200
Subject: [PATCH 21/46] wip
---
lychee-bin/src/main.rs | 9 +-
lychee-bin/src/options.rs | 5 ++
lychee-lib/src/extract.rs | 105 ++++++++++------------
lychee-lib/src/helpers/mod.rs | 2 +
lychee-lib/src/{fs.rs => helpers/path.rs} | 56 +++++-------
lychee-lib/src/helpers/url.rs | 68 ++++++++++++++
lychee-lib/src/lib.rs | 2 +-
lychee-lib/src/types/error.rs | 1 +
8 files changed, 154 insertions(+), 94 deletions(-)
create mode 100644 lychee-lib/src/helpers/mod.rs
rename lychee-lib/src/{fs.rs => helpers/path.rs} (78%)
create mode 100644 lychee-lib/src/helpers/url.rs
diff --git a/lychee-bin/src/main.rs b/lychee-bin/src/main.rs
index e1493389ab..f8ba1bc09a 100644
--- a/lychee-bin/src/main.rs
+++ b/lychee-bin/src/main.rs
@@ -175,6 +175,13 @@ async fn run(cfg: &Config, inputs: Vec ) -> Result {
let include = RegexSet::new(&cfg.include)?;
let exclude = RegexSet::new(&cfg.exclude)?;
+ // Offline mode overrides the scheme
+ let schemes = if cfg.offline {
+ vec!["file".to_string()]
+ } else {
+ cfg.scheme.clone()
+ };
+
let client = ClientBuilder::builder()
.includes(include)
.excludes(exclude)
@@ -190,7 +197,7 @@ async fn run(cfg: &Config, inputs: Vec ) -> Result {
.method(method)
.timeout(timeout)
.github_token(cfg.github_token.clone())
- .schemes(HashSet::from_iter(cfg.scheme.clone()))
+ .schemes(HashSet::from_iter(schemes))
.accepted(accepted)
.require_https(cfg.require_https)
.build()
diff --git a/lychee-bin/src/options.rs b/lychee-bin/src/options.rs
index b97236715c..a6f66d15a1 100644
--- a/lychee-bin/src/options.rs
+++ b/lychee-bin/src/options.rs
@@ -158,6 +158,11 @@ pub(crate) struct Config {
#[serde(default)]
pub(crate) scheme: Vec,
+ /// Only check local files and block network requests.
+ #[structopt(long)]
+ #[serde(default)]
+ pub(crate) offline: bool,
+
/// URLs to check (supports regex). Has preference over all excludes.
#[structopt(long)]
#[serde(default)]
diff --git a/lychee-lib/src/extract.rs b/lychee-lib/src/extract.rs
index a0aaf9cb35..97d98eeeba 100644
--- a/lychee-lib/src/extract.rs
+++ b/lychee-lib/src/extract.rs
@@ -4,25 +4,53 @@ use html5ever::{
parse_document,
tendril::{StrTendril, TendrilSink},
};
-use linkify::LinkFinder;
use log::info;
use markup5ever_rcdom::{Handle, NodeData, RcDom};
use pulldown_cmark::{Event as MDEvent, Parser, Tag};
use reqwest::Url;
use crate::{
- fs,
+ helpers::{path, url},
types::{FileType, InputContent},
Base, ErrorKind, Input, Request, Result, Uri,
};
-// Use LinkFinder here to offload the actual link searching in plaintext.
-fn find_links(input: &str) -> Vec {
- let finder = LinkFinder::new();
- finder.links(input).collect()
+/// Main entrypoint for extracting links from various sources
+/// (Markdown, HTML, and plaintext)
+pub(crate) fn extract_links(
+ input_content: &InputContent,
+ base: &Option ,
+) -> Result> {
+ let links = match input_content.file_type {
+ FileType::Markdown => extract_links_from_markdown(&input_content.content),
+ FileType::Html => extract_links_from_html(&input_content.content),
+ FileType::Plaintext => extract_links_from_plaintext(&input_content.content),
+ };
+
+ // Only keep legit URLs. For example this filters out anchors.
+ let mut requests: HashSet = HashSet::new();
+ for link in links {
+ let req = if let Ok(uri) = Uri::try_from(link.as_str()) {
+ Request::new(uri, input_content.input.clone())
+ } else if let Some(new_url) = base.as_ref().and_then(|u| u.join(&link)) {
+ Request::new(Uri { inner: new_url }, input_content.input.clone())
+ } else if let Input::FsPath(root) = &input_content.input {
+ if url::is_anchor(&link) {
+ // Silently ignore anchor links for now
+ continue;
+ }
+ let uri = create_uri(root, base, &link)?;
+ Request::new(Uri { inner: uri }, input_content.input.clone())
+ } else {
+ info!("Handling of {} not implemented yet", &link);
+ continue;
+ };
+ requests.insert(req);
+ }
+ Ok(requests)
}
-/// Extract unparsed URL strings from a markdown string.
+/// Extract unparsed URL strings from a Markdown string.
fn extract_links_from_markdown(input: &str) -> Vec {
let parser = Parser::new(input);
parser
@@ -35,15 +63,15 @@ fn extract_links_from_markdown(input: &str) -> Vec {
.collect()
}
-/// Extract unparsed URL strings from a HTML string.
+/// Extract unparsed URL strings from an HTML string.
fn extract_links_from_html(input: &str) -> Vec {
let tendril = StrTendril::from(input);
let rc_dom = parse_document(RcDom::default(), html5ever::ParseOpts::default()).one(tendril);
let mut urls = Vec::new();
- // we pass mutable urls reference to avoid extra allocations in each
- // recursive descent
+ // We pass mutable URL references here to avoid
+ // extra allocations in each recursive descent
walk_html_links(&mut urls, &rc_dom.document);
urls
@@ -68,7 +96,7 @@ fn walk_html_links(mut urls: &mut Vec, node: &Handle) {
for attr in attrs.borrow().iter() {
let attr_value = attr.value.to_string();
- if elem_attr_is_link(attr.name.local.as_ref(), name.local.as_ref()) {
+ if url::elem_attr_is_link(attr.name.local.as_ref(), name.local.as_ref()) {
urls.push(attr_value);
} else {
urls.append(&mut extract_links_from_plaintext(&attr_value));
@@ -80,63 +108,24 @@ fn walk_html_links(mut urls: &mut Vec, node: &Handle) {
}
// recursively traverse the document's nodes -- this doesn't need any extra
- // exit conditions because the document is a tree
+ // exit conditions, because the document is a tree
for child in node.children.borrow().iter() {
walk_html_links(&mut urls, child);
}
}
-/// Determine if element's attribute contains a link / URL.
-fn elem_attr_is_link(attr_name: &str, elem_name: &str) -> bool {
- // See a comprehensive list of attributes that might contain URLs/URIs
- // over at: https://developer.mozilla.org/en-US/docs/Web/HTML/Attributes
- matches!(
- (attr_name, elem_name),
- ("href" | "src" | "srcset" | "cite", _) | ("data", "object") | ("onhashchange", "body")
- )
-}
-
-/// Extract unparsed URL strings from a plaintext.
+/// Extract unparsed URL strings from plaintext
fn extract_links_from_plaintext(input: &str) -> Vec {
- find_links(input)
+ url::find_links(input)
.iter()
.map(|l| String::from(l.as_str()))
.collect()
}
-pub(crate) fn extract_links(
- input_content: &InputContent,
- base: &Option ,
-) -> Result> {
- let links = match input_content.file_type {
- FileType::Markdown => extract_links_from_markdown(&input_content.content),
- FileType::Html => extract_links_from_html(&input_content.content),
- FileType::Plaintext => extract_links_from_plaintext(&input_content.content),
- };
-
- // Only keep legit URLs. For example this filters out anchors.
- let mut requests: HashSet = HashSet::new();
- for link in links {
- let req = if let Ok(uri) = Uri::try_from(link.as_str()) {
- Request::new(uri, input_content.input.clone())
- } else if let Some(new_url) = base.as_ref().and_then(|u| u.join(&link)) {
- Request::new(Uri { inner: new_url }, input_content.input.clone())
- } else if let Input::FsPath(root) = &input_content.input {
- let link = fs::sanitize(&link);
- if link.starts_with('#') {
- // Silently ignore anchors for now.
- continue;
- }
- let path = fs::resolve(&root, &PathBuf::from(&link), base)?;
- let uri = Url::from_file_path(&path).map_err(|_e| ErrorKind::InvalidPath(path))?;
- Request::new(Uri { inner: uri }, input_content.input.clone())
- } else {
- info!("Handling of {} not implemented yet", &link);
- continue;
- };
- requests.insert(req);
- }
- Ok(requests)
+fn create_uri(root: &PathBuf, base: &Option , link: &str) -> Result {
+ let link = url::remove_get_params(&link);
+ let path = path::resolve(root, &PathBuf::from(&link), base)?;
+ Ok(Url::from_file_path(&path).map_err(|_e| ErrorKind::InvalidPath(path))?)
}
#[cfg(test)]
@@ -150,10 +139,10 @@ mod test {
};
use pretty_assertions::assert_eq;
- use url::Url;
use super::*;
use crate::{
+ helpers::url::find_links,
test_utils::{mail, website},
Uri,
};
diff --git a/lychee-lib/src/helpers/mod.rs b/lychee-lib/src/helpers/mod.rs
new file mode 100644
index 0000000000..94f2d21cf8
--- /dev/null
+++ b/lychee-lib/src/helpers/mod.rs
@@ -0,0 +1,2 @@
+pub(crate) mod path;
+pub(crate) mod url;
diff --git a/lychee-lib/src/fs.rs b/lychee-lib/src/helpers/path.rs
similarity index 78%
rename from lychee-lib/src/fs.rs
rename to lychee-lib/src/helpers/path.rs
index a854602fd5..6872b4cc00 100644
--- a/lychee-lib/src/fs.rs
+++ b/lychee-lib/src/helpers/path.rs
@@ -42,6 +42,18 @@ pub(crate) fn normalize(path: &Path) -> PathBuf {
ret
}
+// Get the parent directory of a given `Path`.
+fn dirname(src: &Path) -> PathBuf {
+ if src.is_file() {
+ src.to_path_buf()
+ .parent()
+ .map_or(PathBuf::new(), Path::to_path_buf)
+ } else {
+ src.to_path_buf()
+ }
+}
+
+// Resolve `dst` that was linked to from within `src`
pub(crate) fn resolve(src: &Path, dst: &Path, base: &Option ) -> Result {
if dst.is_relative() {
// Find `dst` in the parent directory of `src`
@@ -51,14 +63,16 @@ pub(crate) fn resolve(src: &Path, dst: &Path, base: &Option ) -> Result".to_string(),
+ format!("Found absolute local link {:?} but no base directory was set. Set with `--base`.", dst)
+ .to_string(),
+ )
+ })?;
+ let abs_path = join(dirname(&base), dst);
return Ok(normalize(&abs_path));
}
Err(ErrorKind::FileNotFound(dst.to_path_buf()))
@@ -73,37 +87,11 @@ fn join(base: PathBuf, dst: &Path) -> PathBuf {
PathBuf::from(abs)
}
-/// A little helper function to remove the get parameters from a URL link.
-/// The link is not a URL but a String as that link may not have a base domain.
-pub(crate) fn sanitize(link: &str) -> String {
- let path = match link.split_once('?') {
- Some((path, _params)) => path,
- None => link,
- };
- path.to_string()
-}
-
#[cfg(test)]
-mod test_fs_tree {
+mod test_path {
use super::*;
use crate::Result;
- #[test]
- fn test_sanitize() {
- assert_eq!(sanitize("/"), "/".to_string());
- assert_eq!(sanitize("index.html?foo=bar"), "index.html".to_string());
- assert_eq!(sanitize("/index.html?foo=bar"), "/index.html".to_string());
- assert_eq!(
- sanitize("/index.html?foo=bar&baz=zorx?bla=blub"),
- "/index.html".to_string()
- );
- assert_eq!(
- sanitize("https://example.org/index.html?foo=bar"),
- "https://example.org/index.html".to_string()
- );
- assert_eq!(sanitize("test.png?foo=bar"), "test.png".to_string());
- }
-
// dummy root
// /path/to/foo.html
#[test]
diff --git a/lychee-lib/src/helpers/url.rs b/lychee-lib/src/helpers/url.rs
new file mode 100644
index 0000000000..b00624d48e
--- /dev/null
+++ b/lychee-lib/src/helpers/url.rs
@@ -0,0 +1,68 @@
+use linkify::LinkFinder;
+
+/// Remove all GET parameters from a URL.
+/// The link is not a URL but a String as it may not have a base domain.
+pub(crate) fn remove_get_params(url: &str) -> String {
+ let path = match url.split_once('?') {
+ Some((path, _params)) => path,
+ None => url,
+ };
+ path.to_string()
+}
+
+/// Determine if an element's attribute contains a link / URL.
+pub(crate) fn elem_attr_is_link(attr_name: &str, elem_name: &str) -> bool {
+ // See a comprehensive list of attributes that might contain URLs/URIs
+ // over at: https://developer.mozilla.org/en-US/docs/Web/HTML/Attributes
+ matches!(
+ (attr_name, elem_name),
+ ("href" | "src" | "srcset" | "cite", _) | ("data", "object") | ("onhashchange", "body")
+ )
+}
+
+// Taken from https://github.com/getzola/zola/blob/master/components/link_checker/src/lib.rs
+pub(crate) fn is_anchor(url: &str) -> bool {
+ url.starts_with('#')
+}
+
+// Use `LinkFinder` to offload the raw link searching in plaintext
+pub(crate) fn find_links(input: &str) -> Vec {
+ let finder = LinkFinder::new();
+ finder.links(input).collect()
+}
+
+#[cfg(test)]
+mod test_fs_tree {
+ use super::*;
+
+ #[test]
+ fn test_is_anchor() {
+ assert!(is_anchor("#anchor"));
+ assert!(!is_anchor("notan#anchor"));
+ }
+
+ #[test]
+ fn test_remove_get_params() {
+ assert_eq!(remove_get_params("/"), "/".to_string());
+ assert_eq!(
+ remove_get_params("index.html?foo=bar"),
+ "index.html".to_string()
+ );
+ assert_eq!(
+ remove_get_params("/index.html?foo=bar"),
+ "/index.html".to_string()
+ );
+ assert_eq!(
+ remove_get_params("/index.html?foo=bar&baz=zorx?bla=blub"),
+ "/index.html".to_string()
+ );
+ assert_eq!(
+ remove_get_params("https://example.org/index.html?foo=bar"),
+ "https://example.org/index.html".to_string()
+ );
+ assert_eq!(
+ remove_get_params("test.png?foo=bar"),
+ "test.png".to_string()
+ );
+ }
+}
diff --git a/lychee-lib/src/lib.rs b/lychee-lib/src/lib.rs
index 5af79c6c9a..22b76f8a22 100644
--- a/lychee-lib/src/lib.rs
+++ b/lychee-lib/src/lib.rs
@@ -50,7 +50,7 @@ mod client;
mod client_pool;
/// A pool of clients, to handle concurrent checks
pub mod collector;
-mod fs;
+mod helpers;
mod quirks;
mod types;
diff --git a/lychee-lib/src/types/error.rs b/lychee-lib/src/types/error.rs
index 7de4608106..1dd68d901b 100644
--- a/lychee-lib/src/types/error.rs
+++ b/lychee-lib/src/types/error.rs
@@ -124,6 +124,7 @@ impl Display for ErrorKind {
uri
),
Self::InvalidBase(base, e) => write!(f, "Error while base dir `{}` : {}", base, e),
+ Self::InvalidBase(base, e) => write!(f, "Error with base dir `{}` : {}", base, e),
}
}
}
From b7c129c43113999fedaa096c7fa31dc2069d0382 Mon Sep 17 00:00:00 2001
From: Matthias
Date: Fri, 3 Sep 2021 01:42:57 +0200
Subject: [PATCH 22/46] Fix resolving absolute paths
The previous solution didn't resolve to absolute paths;
it only removed things like `.` and `..`.
---
Cargo.lock | 7 +++++
README.md | 4 +++
lychee-lib/Cargo.toml | 1 +
lychee-lib/src/extract.rs | 4 +--
lychee-lib/src/helpers/path.rs | 50 ++++++++++------------------------
5 files changed, 29 insertions(+), 37 deletions(-)
diff --git a/Cargo.lock b/Cargo.lock
index 1f7194a069..a71f5915ac 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1394,6 +1394,7 @@ dependencies = [
"log",
"markup5ever_rcdom",
"openssl-sys",
+ "path-clean",
"pretty_assertions",
"pulldown-cmark",
"regex",
@@ -1718,6 +1719,12 @@ dependencies = [
"winapi",
]
+[[package]]
+name = "path-clean"
+version = "0.1.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ecba01bf2678719532c5e3059e0b5f0811273d94b397088b82e3bd0a78c78fdd"
+
[[package]]
name = "pem"
version = "0.8.3"
diff --git a/README.md b/README.md
index 326a00c22c..e452c5b172 100644
--- a/README.md
+++ b/README.md
@@ -148,11 +148,15 @@ lychee ~/projects/*/README.md
# check links in local files (lychee supports advanced globbing and ~ expansion):
lychee "~/projects/big_project/**/README.*"
+
# ignore case when globbing and check result for each link:
lychee --glob-ignore-case --verbose "~/projects/**/[r]eadme.*"
# check links from epub file (requires atool: https://www.nongnu.org/atool)
acat -F zip {file.epub} "*.xhtml" "*.html" | lychee -
+
+# check links in directory; block network requests
+lychee --offline path/to/directory
```
### GitHub token
diff --git a/lychee-lib/Cargo.toml b/lychee-lib/Cargo.toml
index ab2c29c303..2b165a0838 100644
--- a/lychee-lib/Cargo.toml
+++ b/lychee-lib/Cargo.toml
@@ -41,6 +41,7 @@ tokio = { version = "1.6.0", features = ["full"] }
typed-builder = "0.9.1"
url = { version = "2.2.2", features = ["serde"] }
log = "0.4.14"
+path-clean = "0.1.0"
[dev-dependencies]
doc-comment = "0.3.3"
diff --git a/lychee-lib/src/extract.rs b/lychee-lib/src/extract.rs
index 97d98eeeba..41b62e3278 100644
--- a/lychee-lib/src/extract.rs
+++ b/lychee-lib/src/extract.rs
@@ -39,7 +39,7 @@ pub(crate) fn extract_links(
// Silently ignore anchor links for now
continue;
}
- let uri = create_uri(root, base, &link)?;
+ let uri = create_uri_from_path(root, base, &link)?;
Request::new(Uri { inner: uri }, input_content.input.clone())
} else {
info!("Handling of {} not implemented yet", &link);
@@ -122,7 +122,7 @@ fn extract_links_from_plaintext(input: &str) -> Vec {
.collect()
}
-fn create_uri(root: &PathBuf, base: &Option , link: &str) -> Result {
+fn create_uri_from_path(root: &PathBuf, base: &Option , link: &str) -> Result {
let link = url::remove_get_params(&link);
let path = path::resolve(root, &PathBuf::from(&link), base)?;
Ok(Url::from_file_path(&path).map_err(|_e| ErrorKind::InvalidPath(path))?)
diff --git a/lychee-lib/src/helpers/path.rs b/lychee-lib/src/helpers/path.rs
index 6872b4cc00..8b875ab1d7 100644
--- a/lychee-lib/src/helpers/path.rs
+++ b/lychee-lib/src/helpers/path.rs
@@ -1,45 +1,25 @@
use crate::{Base, ErrorKind, Result};
-use std::path::{Component, Path, PathBuf};
+use path_clean::PathClean;
+use std::env;
+use std::path::{Path, PathBuf};
// Returns the base if it is a valid `PathBuf`
fn get_base_dir(base: &Option ) -> Option {
base.as_ref().and_then(Base::dir)
}
-/// Normalize a path, removing things like `.` and `..`.
-///
-/// CAUTION: This does not resolve symlinks (unlike
-/// [`std::fs::canonicalize`]). This may cause incorrect or surprising
-/// behavior at times. This should be used carefully. Unfortunately,
-/// [`std::fs::canonicalize`] can be hard to use correctly, since it can often
-/// fail, or on Windows returns annoying device paths. This is a problem Cargo
-/// needs to improve on.
-///
-/// Taken from [`cargo`](https://github.com/rust-lang/cargo/blob/fede83ccf973457de319ba6fa0e36ead454d2e20/src/cargo/util/paths.rs#L61)
-pub(crate) fn normalize(path: &Path) -> PathBuf {
- let mut components = path.components().peekable();
+// https://stackoverflow.com/a/54817755/270334
+pub(crate) fn absolute_path(path: impl AsRef) -> Result {
+ let path = path.as_ref();
- let mut ret = components.peek().copied().map_or_else(PathBuf::new, |c| {
- components.next();
- PathBuf::from(c.as_os_str())
- });
-
- for component in components {
- match component {
- Component::Prefix(..) => unreachable!(),
- Component::RootDir => {
- ret.push(component.as_os_str());
- }
- Component::CurDir => {}
- Component::ParentDir => {
- ret.pop();
- }
- Component::Normal(c) => {
- ret.push(c);
- }
- }
+ let absolute_path = if path.is_absolute() {
+ path.to_path_buf()
+ } else {
+ env::current_dir()?.join(path)
}
- ret
+ .clean();
+
+ Ok(absolute_path)
}
// Get the parent directory of a given `Path`.
@@ -59,7 +39,7 @@ pub(crate) fn resolve(src: &Path, dst: &Path, base: &Option ) -> Result) -> Result
Date: Fri, 3 Sep 2021 01:43:45 +0200
Subject: [PATCH 23/46] Add fixtures for offline testing
---
fixtures/offline/about/index.html | 21 +++++++++++++++++++++
fixtures/offline/blog/post1/index.html | 21 +++++++++++++++++++++
fixtures/offline/blog/post2/index.html | 18 ++++++++++++++++++
fixtures/offline/index.html | 18 ++++++++++++++++++
4 files changed, 78 insertions(+)
create mode 100644 fixtures/offline/about/index.html
create mode 100644 fixtures/offline/blog/post1/index.html
create mode 100644 fixtures/offline/blog/post2/index.html
create mode 100644 fixtures/offline/index.html
diff --git a/fixtures/offline/about/index.html b/fixtures/offline/about/index.html
new file mode 100644
index 0000000000..1121b0bfa2
--- /dev/null
+++ b/fixtures/offline/about/index.html
@@ -0,0 +1,21 @@
+
+
+ About
+
+
+ About
+
+
+
+
+
\ No newline at end of file
diff --git a/fixtures/offline/blog/post1/index.html b/fixtures/offline/blog/post1/index.html
new file mode 100644
index 0000000000..91129223da
--- /dev/null
+++ b/fixtures/offline/blog/post1/index.html
@@ -0,0 +1,21 @@
+
+
+ Post 2
+
+
+ Post 2 Title
+
+
+
+
+
\ No newline at end of file
diff --git a/fixtures/offline/blog/post2/index.html b/fixtures/offline/blog/post2/index.html
new file mode 100644
index 0000000000..514ac4eeec
--- /dev/null
+++ b/fixtures/offline/blog/post2/index.html
@@ -0,0 +1,18 @@
+
+
+ Post 1
+
+
+ Post 1 Title
+
+
+
+
+
\ No newline at end of file
diff --git a/fixtures/offline/index.html b/fixtures/offline/index.html
new file mode 100644
index 0000000000..d0879ff786
--- /dev/null
+++ b/fixtures/offline/index.html
@@ -0,0 +1,18 @@
+
+
+ Post 2
+
+
+ Post 2 Title
+
+
+
+
+
\ No newline at end of file
From 82652a69d5771203408670189042ab9c966bdd62 Mon Sep 17 00:00:00 2001
From: Matthias
Date: Fri, 3 Sep 2021 01:48:50 +0200
Subject: [PATCH 24/46] Add test
---
lychee-bin/tests/cli.rs | 19 +++++++++++++++++++
1 file changed, 19 insertions(+)
diff --git a/lychee-bin/tests/cli.rs b/lychee-bin/tests/cli.rs
index 6d0819b579..a0681bf3aa 100644
--- a/lychee-bin/tests/cli.rs
+++ b/lychee-bin/tests/cli.rs
@@ -148,6 +148,25 @@ mod cli {
.stdout(contains("Excluded.........1"));
}
+ #[test]
+ fn test_resolve_paths() {
+ let mut cmd = main_command();
+ let offline_dir = fixtures_path().join("offline");
+
+ // Exclude file link because it doesn't exist on the filesystem.
+ // (File URIs are absolute paths, which we don't have.)
+ // Nevertheless, the `file` scheme should be recognized.
+ cmd.arg("--offline")
+ .arg("--base")
+ .arg(&offline_dir)
+ .arg(&offline_dir.join("index.html"))
+ .env_clear()
+ .assert()
+ .success()
+ .stdout(contains("Total............2"))
+ .stdout(contains("Successful.......2"));
+ }
+
#[test]
fn test_quirks() -> Result<()> {
test_json_output!(
From 87fd90f2fce97577bc50dfecd1e64000f4171b80 Mon Sep 17 00:00:00 2001
From: Matthias
Date: Fri, 3 Sep 2021 02:09:30 +0200
Subject: [PATCH 25/46] cargo fmt
---
lychee-bin/src/options.rs | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/lychee-bin/src/options.rs b/lychee-bin/src/options.rs
index a6f66d15a1..c0ef15d2f8 100644
--- a/lychee-bin/src/options.rs
+++ b/lychee-bin/src/options.rs
@@ -158,7 +158,7 @@ pub(crate) struct Config {
#[serde(default)]
pub(crate) scheme: Vec,
- /// Only check local files and block network requests.
+ /// Only check local files and block network requests.
#[structopt(long)]
#[serde(default)]
pub(crate) offline: bool,
From 9163066a6b6e0f492d1f4cc2d2cc0d55e4cdf764 Mon Sep 17 00:00:00 2001
From: Matthias
Date: Fri, 3 Sep 2021 21:54:16 +0200
Subject: [PATCH 26/46] Reintegrate master
---
README.md | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/README.md b/README.md
index e452c5b172..9c5e7ed7e4 100644
--- a/README.md
+++ b/README.md
@@ -300,7 +300,8 @@ Try one of these links to get started:
- [good first issues](https://github.com/lycheeverse/lychee/issues?q=is%3Aissue+is%3Aopen+label%3A%22good+first+issue%22)
- [help wanted](https://github.com/lycheeverse/lychee/issues?q=is%3Aissue+is%3Aopen+label%3A%22help+wanted%22)
-Lychee is written in Rust. Install [rust-up](https://rustup.rs/) to get started. Begin my making sure the following commands succeed without errors.
+Lychee is written in Rust. Install [rust-up](https://rustup.rs/) to get started.
+Begin my making sure the following commands succeed without errors.
```bash
cargo test # runs tests
From 57af648ec94920e40965ebbd32820d4b49c9e5db Mon Sep 17 00:00:00 2001
From: Matthias
Date: Fri, 3 Sep 2021 23:09:28 +0200
Subject: [PATCH 27/46] fix tests after making base dir mandatory
---
lychee-lib/src/helpers/path.rs | 19 +++++--------------
1 file changed, 5 insertions(+), 14 deletions(-)
diff --git a/lychee-lib/src/helpers/path.rs b/lychee-lib/src/helpers/path.rs
index 8b875ab1d7..fda4660d10 100644
--- a/lychee-lib/src/helpers/path.rs
+++ b/lychee-lib/src/helpers/path.rs
@@ -72,16 +72,6 @@ mod test_path {
use super::*;
use crate::Result;
- // dummy root
- // /path/to/foo.html
- #[test]
- fn test_resolve_absolute() -> Result<()> {
- let dummy = PathBuf::new();
- let abs_path = PathBuf::from("/absolute/path/to/foo.html");
- assert_eq!(resolve(&dummy, &abs_path, &None)?, abs_path);
- Ok(())
- }
-
// index.html
// ./foo.html
#[test]
@@ -90,7 +80,7 @@ mod test_path {
let abs_path = PathBuf::from("./foo.html");
assert_eq!(
resolve(&dummy, &abs_path, &None)?,
- PathBuf::from("./foo.html")
+ env::current_dir()?.join("./foo.html")
);
Ok(())
}
@@ -103,7 +93,7 @@ mod test_path {
let abs_path = PathBuf::from("./foo.html");
assert_eq!(
resolve(&dummy, &abs_path, &None)?,
- PathBuf::from("./foo.html")
+ env::current_dir()?.join("./foo.html")
);
Ok(())
}
@@ -142,9 +132,10 @@ mod test_path {
fn test_resolve_absolute_from_absolute() -> Result<()> {
let abs_index = PathBuf::from("/path/to/index.html");
let abs_path = PathBuf::from("/other/path/to/foo.html");
+ let base = Some(Base::Local(PathBuf::from("/some/absolute/base/dir")));
assert_eq!(
- resolve(&abs_index, &abs_path, &None)?,
- PathBuf::from("/path/to/other/path/to/foo.html")
+ resolve(&abs_index, &abs_path, &base)?,
+ PathBuf::from("/some/absolute/base/dir/other/path/to/foo.html")
);
Ok(())
}
From b3c5d122e712196d5a777a631068ace205736109 Mon Sep 17 00:00:00 2001
From: Matthias
Date: Fri, 3 Sep 2021 23:21:24 +0200
Subject: [PATCH 28/46] Fix clippy lints
---
lychee-lib/src/extract.rs | 8 ++++----
lychee-lib/src/helpers/path.rs | 5 ++---
lychee-lib/src/types/base.rs | 2 +-
lychee-lib/src/types/input.rs | 2 +-
4 files changed, 8 insertions(+), 9 deletions(-)
diff --git a/lychee-lib/src/extract.rs b/lychee-lib/src/extract.rs
index 41b62e3278..0a48b335b5 100644
--- a/lychee-lib/src/extract.rs
+++ b/lychee-lib/src/extract.rs
@@ -1,4 +1,4 @@
-use std::{collections::HashSet, convert::TryFrom, path::PathBuf};
+use std::{collections::HashSet, convert::TryFrom, path::Path, path::PathBuf};
use html5ever::{
parse_document,
@@ -122,10 +122,10 @@ fn extract_links_from_plaintext(input: &str) -> Vec {
.collect()
}
-fn create_uri_from_path(root: &PathBuf, base: &Option , link: &str) -> Result {
- let link = url::remove_get_params(&link);
+fn create_uri_from_path(root: &Path, base: &Option , link: &str) -> Result {
+ let link = url::remove_get_params(link);
let path = path::resolve(root, &PathBuf::from(&link), base)?;
- Ok(Url::from_file_path(&path).map_err(|_e| ErrorKind::InvalidPath(path))?)
+ Url::from_file_path(&path).map_err(|_e| ErrorKind::InvalidPath(path))
}
#[cfg(test)]
diff --git a/lychee-lib/src/helpers/path.rs b/lychee-lib/src/helpers/path.rs
index fda4660d10..87445c3424 100644
--- a/lychee-lib/src/helpers/path.rs
+++ b/lychee-lib/src/helpers/path.rs
@@ -39,7 +39,7 @@ pub(crate) fn resolve(src: &Path, dst: &Path, base: &Option ) -> Result) -> Result".to_string(),
format!("Found absolute local link {:?} but no base directory was set. Set with `--base`.", dst)
- .to_string(),
)
})?;
let abs_path = join(dirname(&base), dst);
- return Ok(absolute_path(&abs_path)?);
+ return absolute_path(&abs_path);
}
Err(ErrorKind::FileNotFound(dst.to_path_buf()))
}
diff --git a/lychee-lib/src/types/base.rs b/lychee-lib/src/types/base.rs
index f38ec29c87..affeacc8ba 100644
--- a/lychee-lib/src/types/base.rs
+++ b/lychee-lib/src/types/base.rs
@@ -40,7 +40,7 @@ impl TryFrom<&str> for Base {
type Error = ErrorKind;
fn try_from(value: &str) -> Result {
- if let Ok(url) = Url::parse(&value) {
+ if let Ok(url) = Url::parse(value) {
if url.cannot_be_a_base() {
return Err(ErrorKind::InvalidBase(
value.to_string(),
diff --git a/lychee-lib/src/types/input.rs b/lychee-lib/src/types/input.rs
index 20a9f2f9e9..ad5ed835c2 100644
--- a/lychee-lib/src/types/input.rs
+++ b/lychee-lib/src/types/input.rs
@@ -83,7 +83,7 @@ impl Input {
pub fn new(value: &str, glob_ignore_case: bool) -> Self {
if value == STDIN {
Self::Stdin
- } else if let Ok(url) = Url::parse(&value) {
+ } else if let Ok(url) = Url::parse(value) {
Self::RemoteUrl(Box::new(url))
} else {
// this seems to be the only way to determine if this is a glob pattern
From f143087743c3086babae1c7d7dff726a08f79890 Mon Sep 17 00:00:00 2001
From: Matthias
Date: Sat, 4 Sep 2021 00:24:39 +0200
Subject: [PATCH 29/46] Relative path not needed
---
lychee-lib/src/helpers/path.rs | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/lychee-lib/src/helpers/path.rs b/lychee-lib/src/helpers/path.rs
index 87445c3424..b31d522477 100644
--- a/lychee-lib/src/helpers/path.rs
+++ b/lychee-lib/src/helpers/path.rs
@@ -79,7 +79,7 @@ mod test_path {
let abs_path = PathBuf::from("./foo.html");
assert_eq!(
resolve(&dummy, &abs_path, &None)?,
- env::current_dir()?.join("./foo.html")
+ env::current_dir()?.join("foo.html")
);
Ok(())
}
@@ -92,7 +92,7 @@ mod test_path {
let abs_path = PathBuf::from("./foo.html");
assert_eq!(
resolve(&dummy, &abs_path, &None)?,
- env::current_dir()?.join("./foo.html")
+ env::current_dir()?.join("foo.html")
);
Ok(())
}
From f47282093a9de71da52bbb66750251e9808d7476 Mon Sep 17 00:00:00 2001
From: Matthias
Date: Sat, 4 Sep 2021 00:24:48 +0200
Subject: [PATCH 30/46] String allocation not needed
---
lychee-lib/src/helpers/url.rs | 25 ++++++++-----------------
1 file changed, 8 insertions(+), 17 deletions(-)
diff --git a/lychee-lib/src/helpers/url.rs b/lychee-lib/src/helpers/url.rs
index b00624d48e..fe275f6cc8 100644
--- a/lychee-lib/src/helpers/url.rs
+++ b/lychee-lib/src/helpers/url.rs
@@ -2,12 +2,12 @@ use linkify::LinkFinder;
/// Remove all GET parameters from a URL.
/// The link is not a URL but a String as it may not have a base domain.
-pub(crate) fn remove_get_params(url: &str) -> String {
+pub(crate) fn remove_get_params(url: &str) -> &str {
let path = match url.split_once('?') {
Some((path, _params)) => path,
None => url,
};
- path.to_string()
+ path
}
/// Determine if an element's attribute contains a link / URL.
@@ -43,26 +43,17 @@ mod test_fs_tree {
#[test]
fn test_remove_get_params() {
- assert_eq!(remove_get_params("/"), "/".to_string());
- assert_eq!(
- remove_get_params("index.html?foo=bar"),
- "index.html".to_string()
- );
- assert_eq!(
- remove_get_params("/index.html?foo=bar"),
- "/index.html".to_string()
- );
+ assert_eq!(remove_get_params("/"), "/");
+ assert_eq!(remove_get_params("index.html?foo=bar"), "index.html");
+ assert_eq!(remove_get_params("/index.html?foo=bar"), "/index.html");
assert_eq!(
remove_get_params("/index.html?foo=bar&baz=zorx?bla=blub"),
- "/index.html".to_string()
+ "/index.html"
);
assert_eq!(
remove_get_params("https://example.org/index.html?foo=bar"),
- "https://example.org/index.html".to_string()
- );
- assert_eq!(
- remove_get_params("test.png?foo=bar"),
- "test.png".to_string()
+ "https://example.org/index.html"
);
+ assert_eq!(remove_get_params("test.png?foo=bar"), "test.png");
}
}
From 00ddb6dfc8df2744b98908362f06384dac933d70 Mon Sep 17 00:00:00 2001
From: Matthias
Date: Mon, 6 Sep 2021 00:35:11 +0200
Subject: [PATCH 31/46] Filter out directories with suffixes that look like
extensions
Directories can still have a suffix which looks like
a file extension like `foo.html`. This can lead to
unexpected behavior with glob patterns like
`**/*.html`. Therefore filter these out.
https://github.com/lycheeverse/lychee/pull/262#issuecomment-913226819
---
lychee-lib/src/types/input.rs | 8 ++++++++
1 file changed, 8 insertions(+)
diff --git a/lychee-lib/src/types/input.rs b/lychee-lib/src/types/input.rs
index ad5ed835c2..ad97355dd2 100644
--- a/lychee-lib/src/types/input.rs
+++ b/lychee-lib/src/types/input.rs
@@ -161,6 +161,14 @@ impl Input {
for entry in glob_with(&glob_expanded, match_opts)? {
match entry {
Ok(path) => {
+ if path.is_dir() {
+ // Directories can still have a suffix which looks like
+ // a file extension like `foo.html`. This can lead to
+ // unexpected behavior with glob patterns like
+ // `**/*.html`. Therefore filter these out.
+ // https://github.com/lycheeverse/lychee/pull/262#issuecomment-913226819
+ continue;
+ }
let content = Self::path_content(&path)?;
contents.push(content);
}
From b2ce61357fda9d7544f564b583f9698be132aa8d Mon Sep 17 00:00:00 2001
From: Matthias
Date: Mon, 6 Sep 2021 23:46:31 +0200
Subject: [PATCH 32/46] Fix build errors; cleanup code
---
lychee-lib/src/client.rs | 4 ++--
lychee-lib/src/extract.rs | 8 ++++----
lychee-lib/src/types/error.rs | 6 ++----
lychee-lib/src/types/uri.rs | 16 ++++++++--------
4 files changed, 16 insertions(+), 18 deletions(-)
diff --git a/lychee-lib/src/client.rs b/lychee-lib/src/client.rs
index e8d477fd7f..8fdb0a1c71 100644
--- a/lychee-lib/src/client.rs
+++ b/lychee-lib/src/client.rs
@@ -93,7 +93,7 @@ pub struct ClientBuilder {
accepted: Option>,
/// Response timeout per request
timeout: Option,
- /// Treat HTTP links as erros when HTTPS is available
+ /// Treat HTTP links as errors when HTTPS is available
require_https: bool,
}
@@ -252,7 +252,7 @@ impl Client {
}
pub async fn check_file(&self, uri: &Uri) -> Status {
- if let Ok(path) = uri.inner.to_file_path() {
+ if let Ok(path) = uri.url.to_file_path() {
if path.exists() {
return Status::Ok(StatusCode::OK);
}
diff --git a/lychee-lib/src/extract.rs b/lychee-lib/src/extract.rs
index 0a48b335b5..672150f26c 100644
--- a/lychee-lib/src/extract.rs
+++ b/lychee-lib/src/extract.rs
@@ -32,15 +32,15 @@ pub(crate) fn extract_links(
for link in links {
let req = if let Ok(uri) = Uri::try_from(link.as_str()) {
Request::new(uri, input_content.input.clone())
- } else if let Some(new_url) = base.as_ref().and_then(|u| u.join(&link)) {
- Request::new(Uri { inner: new_url }, input_content.input.clone())
+ } else if let Some(url) = base.as_ref().and_then(|u| u.join(&link)) {
+ Request::new(Uri { url }, input_content.input.clone())
} else if let Input::FsPath(root) = &input_content.input {
if url::is_anchor(&link) {
// Silently ignore anchor links for now
continue;
}
- let uri = create_uri_from_path(root, base, &link)?;
- Request::new(Uri { inner: uri }, input_content.input.clone())
+ let url = create_uri_from_path(root, base, &link)?;
+ Request::new(Uri { url }, input_content.input.clone())
} else {
info!("Handling of {} not implemented yet", &link);
continue;
diff --git a/lychee-lib/src/types/error.rs b/lychee-lib/src/types/error.rs
index 1dd68d901b..208e0afea2 100644
--- a/lychee-lib/src/types/error.rs
+++ b/lychee-lib/src/types/error.rs
@@ -74,11 +74,10 @@ impl Hash for ErrorKind {
Self::HubcapsError(e) => e.to_string().hash(state),
Self::FileNotFound(e) => e.to_string_lossy().hash(state),
Self::UrlParseError(s, e) => (s, e.type_id()).hash(state),
- Self::UnreachableEmailAddress(u) | Self::InsecureURL(u) => u.hash(state),
Self::InvalidFileUri(u) => u.hash(state),
- Self::InvalidFileUri(f) => f.hash(state),
Self::InvalidPath(p) => p.hash(state),
- Self::UnreachableEmailAddress(u) => u.hash(state),
+ Self::UnreachableEmailAddress(u) | Self::InsecureURL(u) => u.hash(state),
+ Self::InvalidBase(base, e) => (base, e).hash(state),
Self::InvalidHeader(e) => e.to_string().hash(state),
Self::InvalidGlobPattern(e) => e.to_string().hash(state),
Self::MissingGitHubToken => std::mem::discriminant(self).hash(state),
@@ -123,7 +122,6 @@ impl Display for ErrorKind {
"This URL is available in HTTPS protocol, but HTTP is provided, use '{}' instead",
uri
),
- Self::InvalidBase(base, e) => write!(f, "Error while base dir `{}` : {}", base, e),
Self::InvalidBase(base, e) => write!(f, "Error with base dir `{}` : {}", base, e),
}
}
diff --git a/lychee-lib/src/types/uri.rs b/lychee-lib/src/types/uri.rs
index aaf7d0c0fe..edb6fc795a 100644
--- a/lychee-lib/src/types/uri.rs
+++ b/lychee-lib/src/types/uri.rs
@@ -6,14 +6,14 @@ use url::Url;
use crate::{ErrorKind, Result};
-/// Lychee's own representation of a URI, which encapsulates all supported formats.
+/// Lychee's own representation of a URI, which encapsulates all supported formats.
///
/// If the scheme is `mailto`, it's a mail address.
/// Otherwise it's treated as a website URL.
#[derive(Clone, Debug, PartialOrd, Ord, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub struct Uri {
/// Website URL or mail address
- pub(crate) inner: Url,
+ pub(crate) url: Url,
}
impl Uri {
@@ -24,21 +24,21 @@ impl Uri {
#[inline]
#[must_use]
pub fn as_str(&self) -> &str {
- self.inner.as_ref().trim_start_matches("mailto:")
+ self.url.as_ref().trim_start_matches("mailto:")
}
#[inline]
#[must_use]
/// Returns the scheme of the URI (e.g. `http` or `mailto`)
pub fn scheme(&self) -> &str {
- self.inner.scheme()
+ self.url.scheme()
}
#[inline]
#[must_use]
/// Returns the domain of the URI (e.g. `example.org`)
pub fn domain(&self) -> Option<&str> {
- self.inner.domain()
+ self.url.domain()
}
#[inline]
@@ -49,14 +49,14 @@ impl Uri {
///
/// Return `None` for cannot-be-a-base URLs.
pub fn path_segments(&self) -> Option<std::str::Split<char>> {
- self.inner.path_segments()
+ self.url.path_segments()
}
#[must_use]
/// Returns the IP address (either IPv4 or IPv6) of the URI,
/// or `None` if it is a domain
pub fn host_ip(&self) -> Option<IpAddr> {
- match self.inner.host()? {
+ match self.url.host()? {
url::Host::Domain(_) => None,
url::Host::Ipv4(v4_addr) => Some(v4_addr.into()),
url::Host::Ipv6(v6_addr) => Some(v6_addr.into()),
@@ -100,7 +100,7 @@ impl AsRef<str> for Uri {
impl From<Url> for Uri {
fn from(url: Url) -> Self {
- Self { inner: url }
+ Self { url }
}
}
From 5d0b95271d5693ac8c30155cf4186fa5e5a0aa59 Mon Sep 17 00:00:00 2001
From: Matthias
Date: Tue, 7 Sep 2021 00:20:09 +0200
Subject: [PATCH 33/46] Remove anchor from file links
---
fixtures/offline/index.html | 3 +++
lychee-lib/src/extract.rs | 2 +-
lychee-lib/src/helpers/url.rs | 46 +++++++++++++++++++++++++++--------
3 files changed, 40 insertions(+), 11 deletions(-)
diff --git a/fixtures/offline/index.html b/fixtures/offline/index.html
index d0879ff786..b7789c04c8 100644
--- a/fixtures/offline/index.html
+++ b/fixtures/offline/index.html
@@ -12,6 +12,9 @@ Post 2 Title
About
+
+ About
+