From 3ac68b8aa41a9397bd248009a804f39aff3aaa13 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pawe=C5=82=20Romanowski?= Date: Wed, 25 Nov 2020 19:44:19 +0100 Subject: [PATCH] Add skip_missing flag, add Input enum --- src/extract.rs | 15 +++++++ src/main.rs | 5 ++- src/options.rs | 13 ++++-- src/types.rs | 118 ++++++++++++++++++++++++++++++++++++++++++++++++- 4 files changed, 145 insertions(+), 6 deletions(-) diff --git a/src/extract.rs b/src/extract.rs index feaa8dd946..4814b8f778 100644 --- a/src/extract.rs +++ b/src/extract.rs @@ -13,6 +13,21 @@ pub(crate) enum FileType { Plaintext, } +impl> From

for FileType { + /// Detect if the given path points to a Markdown, HTML, or plaintext file. + fn from(p: P) -> FileType { + let path = p.as_ref(); + match path.extension() { + Some(ext) => match ext.to_str().unwrap() { + "md" => FileType::Markdown, + "html" | "htm" => FileType::HTML, + _ => FileType::Plaintext, + }, + None => FileType::Plaintext, + } + } +} + // Use LinkFinder here to offload the actual link searching fn find_links(input: &str) -> Vec { let finder = LinkFinder::new(); diff --git a/src/main.rs b/src/main.rs index 66ac79bf42..c0d1fe4836 100644 --- a/src/main.rs +++ b/src/main.rs @@ -59,7 +59,10 @@ fn main() -> Result<()> { } None => tokio::runtime::Runtime::new()?, }; - let errorcode = runtime.block_on(run(cfg, opts.inputs))?; + let errorcode = runtime.block_on(run( + cfg, + opts.inputs.iter().map(|i| i.to_string()).collect(), + ))?; std::process::exit(errorcode); } diff --git a/src/options.rs b/src/options.rs index dece34f918..8f860aeba3 100644 --- a/src/options.rs +++ b/src/options.rs @@ -1,7 +1,9 @@ +use crate::types::Input; use anyhow::{Error, Result}; use serde::Deserialize; use std::{fs, io::ErrorKind}; use structopt::{clap::crate_version, StructOpt}; +use url::Url; pub(crate) const USER_AGENT: &str = concat!("lychee/", crate_version!()); const METHOD: &str = "get"; @@ -33,9 +35,9 @@ macro_rules! fold_in { #[derive(Debug, StructOpt)] #[structopt(name = "lychee", about = "A glorious link checker")] pub(crate) struct LycheeOptions { - /// Input files - #[structopt(default_value = "README.md")] - pub inputs: Vec, + /// TODO: Inputs + #[structopt(default_value = "README.md", parse(from_str = Input::from))] + pub inputs: Vec, /// Configuration file to use #[structopt(short, long = "config", default_value = "./lychee.toml")] @@ -52,6 +54,11 @@ pub struct Config { #[serde(default)] pub verbose: bool, + /// TODO: Skip missing input files + #[structopt(long)] + #[serde(default)] + pub skip_missing: bool, + /// Show progress #[structopt(short, long)] #[serde(default)] diff --git a/src/types.rs b/src/types.rs index d2e86e84c2..b4f939b31b 100644 --- a/src/types.rs +++ b/src/types.rs @@ -1,10 +1,17 @@ +use crate::extract::FileType; use crate::options::Config; -use anyhow::anyhow; +use anyhow::{anyhow, Result}; +use glob::glob; use regex::RegexSet; use std::net::IpAddr; +use std::path::{Path, PathBuf}; use std::{collections::HashSet, convert::TryFrom, fmt::Display}; +use tokio::fs::read_to_string; +use tokio::io::{stdin, AsyncReadExt}; use url::Url; +const STDIN: &str = "-"; + #[derive(Clone, Debug, PartialEq, Eq, Hash)] pub enum Uri { Website(Url), @@ -119,7 +126,7 @@ impl From for Status { } /// Exclude configuration for the link checker. -/// You can ignore links based on +/// You can ignore links based on regex patterns or pre-defined IP ranges. #[derive(Clone, Debug)] pub struct Excludes { pub regex: Option, @@ -158,6 +165,113 @@ impl Default for Excludes { } } +#[derive(Debug)] +#[non_exhaustive] +pub(crate) enum Input { + RemoteUrl(Url), + FsGlob(String), + FsPath(PathBuf), + Stdin, +} + +impl ToString for Input { + fn to_string(&self) -> String { + match self { + Self::RemoteUrl(url) => url.to_string(), + Self::FsGlob(s) => s.clone(), + Self::FsPath(p) => p.to_str().unwrap_or_default().to_owned(), + Self::Stdin => STDIN.to_owned(), + } + } +} + +#[derive(Debug)] +pub(crate) struct InputContent { + input: Input, + file_type: FileType, + content: String, +} + +impl From<&str> for Input { + fn from(value: &str) -> Self { + if value == STDIN { + Self::Stdin + } else { + match Url::parse(&value) { + Ok(url) => Self::RemoteUrl(url), + Err(_) => Self::FsGlob(value.to_owned()), + } + } + } +} + +impl Input { + async fn get_contents(self) -> Result> { + use Input::*; + + let contents = match self { + RemoteUrl(url) => vec![Self::url_contents(url).await?], + FsGlob(path_glob) => Self::glob_contents(path_glob).await?, + FsPath(path) => vec![Self::path_content(&path).await?], + Stdin => vec![Self::stdin_content().await?], + }; + + Ok(contents) + } + + async fn url_contents(url: Url) -> Result { + let res = reqwest::get(url.clone()).await?; + let content = res.text().await?; + let input_content = InputContent { + file_type: FileType::from(&url.as_str()), + input: Input::RemoteUrl(url), + content, + }; + + Ok(input_content) + } + + async fn glob_contents(path_glob: String) -> Result> { + let mut contents = vec![]; + + for entry in glob(&path_glob)? { + match entry { + Ok(path) => { + let content = Self::path_content(&path).await?; + contents.push(content); + } + Err(e) => println!("{:?}", e), + } + } + + Ok(contents) + } + + async fn path_content + AsRef>(path: P) -> Result { + let input_content = InputContent { + file_type: FileType::from(path.as_ref()), + content: read_to_string(&path).await?, + input: Input::FsPath(path.into()), + }; + + Ok(input_content) + } + + async fn stdin_content() -> Result { + let mut content = String::new(); + let mut stdin = stdin(); + stdin.read_to_string(&mut content).await?; + + let input_content = InputContent { + input: Input::Stdin, + content, + file_type: FileType::Plaintext, + }; + + Ok(input_content) + } +} + #[cfg(test)] mod test { use super::*;