From b07b49b3e6006f1080cc0a176e58246a343aecc2 Mon Sep 17 00:00:00 2001 From: BrightShard Date: Fri, 25 Oct 2024 11:59:30 -0400 Subject: [PATCH] Initial commit --- .gitignore | 1 + Cargo.lock | 23 ++ Cargo.toml | 12 + README.md | 103 +++++++ src/compiler.rs | 121 ++++++++ src/lib.rs | 71 +++++ src/main.rs | 99 +++++++ src/minifier.rs | 5 + src/minifier/css.rs | 79 +++++ src/minifier/html.rs | 589 +++++++++++++++++++++++++++++++++++++ src/translator.rs | 4 + src/translator/gemtext.rs | 119 ++++++++ src/translator/markdown.rs | 0 tests/gemtext.rs | 21 ++ tests/gemtext/header.gmi | 3 + tests/gemtext/header.html | 1 + tests/gemtext/link.gmi | 4 + tests/gemtext/link.html | 1 + tests/gemtext/list.gmi | 3 + tests/gemtext/list.html | 1 + tests/gemtext/text.gmi | 1 + tests/gemtext/text.html | 1 + 22 files changed, 1262 insertions(+) create mode 100644 .gitignore create mode 100644 Cargo.lock create mode 100644 Cargo.toml create mode 100644 README.md create mode 100644 src/compiler.rs create mode 100644 src/lib.rs create mode 100644 src/main.rs create mode 100644 src/minifier.rs create mode 100644 src/minifier/css.rs create mode 100644 src/minifier/html.rs create mode 100644 src/translator.rs create mode 100644 src/translator/gemtext.rs create mode 100644 src/translator/markdown.rs create mode 100644 tests/gemtext.rs create mode 100644 tests/gemtext/header.gmi create mode 100644 tests/gemtext/header.html create mode 100644 tests/gemtext/link.gmi create mode 100644 tests/gemtext/link.html create mode 100644 tests/gemtext/list.gmi create mode 100644 tests/gemtext/list.html create mode 100644 tests/gemtext/text.gmi create mode 100644 tests/gemtext/text.html diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..ea8c4bf --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +/target diff --git a/Cargo.lock b/Cargo.lock new file mode 100644 index 0000000..fe1a8f8 --- /dev/null +++ b/Cargo.lock @@ -0,0 +1,23 @@ +# This file is automatically @generated by Cargo. +# It is not intended for manual editing. +version = 4 + +[[package]] +name = "base64" +version = "0.22.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "72b3254f16251a8381aa12e40e3c4d2f0199f8c6508fbecb9d91f575e0fbb8c6" + +[[package]] +name = "boml" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "85fdb93f04c73bff54305fa437ffea5449c41edcaadfe882f35836206b166ac5" + +[[package]] +name = "webby" +version = "0.1.0" +dependencies = [ + "base64", + "boml", +] diff --git a/Cargo.toml b/Cargo.toml new file mode 100644 index 0000000..a3a24b8 --- /dev/null +++ b/Cargo.toml @@ -0,0 +1,12 @@ +[package] +name = "webby" +version = "0.1.0" +edition = "2021" + +[dependencies] +base64 = "0.22.1" +boml = "0.3.1" + +[features] +default = [] +log = [] diff --git a/README.md b/README.md new file mode 100644 index 0000000..9063717 --- /dev/null +++ b/README.md @@ -0,0 +1,103 @@ +# webby + +> The smol web compiler + +As seen in [my website](https://github.com/bright-shard/website). + +**webby** is a small and efficient compiler for making static sites. It adds macros, minifiers, and translators to compile your project into a tiny static site. + +> Note: Webby is WIP. The above is a summary of what I want it to do when it's finished. For the current project status, see [todo](#todo). + +# macros + +webby adds a few simple macros to make writing HTML simpler. 
Macros open with `#!`, followed by the macro name, followed by arguments in parentheses, like so: + +``` +#!MACRO_NAME(args) +``` + +Macros can be combined, like this: + +``` +#!MACRO1(#!MACRO2(args)) +``` + +- `#!INCLUDE(path/to/file)`: Webby will compile the given file, then embed it at the macro's location. The file must contain valid UTF-8 text. +- `#!BASE64(text)`: Base64-encode the given text. +- `#!INCLUDE_BASE64(path/to/file)`: Base64-encode the given file. This differs from `#!BASE64(#!INCLUDE(path/to/file))` because it can also base64-encode binary files. + +# minifiers + +webby will automatically strip comments and unneeded whitespace from your code to make it as small as possible. + +# translators + +Translators cross-compile between languages - for example, Markdown to HTML, or Gemtext to HTML. + + + +# usage + +webby projects have a `webby.toml` in the root of their project, just like Rust projects have a `Cargo.toml` in the root of theirs. The format of `webby.toml` is given in [config](#config). + +To install webby, just install it with Cargo: + +```sh +cargo install --git https://github.com/bright-shard/webby +``` + +Then just run `webby` in your webby project. + +# config + +In its simplest form, the `webby.toml` file will look like this: + +```toml +# For every file you want to compile with webby, add a `[[target]]` section +[[target]] +# The path to the file to compile +path = "index.html" + +[[target]] +path = "blog.html" +``` + +However, webby allows customising more if you need it: + +```toml +# (Optional) the directory to put the output files at +# If this isn't specified it defaults to `webby` +# The path is relative to the webby.toml file +output = "my/custom/build/dir" + +[[target]] +# The path to the file, relative to the webby.toml file +# If you list a folder instead of a file, webby will compile all of the files +# in that folder +path = "path/to/file.html" +# (Optional) Where to put the compiled file +# If this isn't specified it defaults to the name of the file given in path +# The path is relative to the output directory +output = "file.out.html" +# (Optional) The compilation mode +# This can be "compile", "copy", or "link". Compile will compile the file. Copy +# will just copy the file as-is and will not compile it at all. Link is the same +# as copy, but it creates a hard link (not a symlink) to the file instead of +# copying it. +# If this isn't specified, webby will infer if it should compile or copy the +# file based on the file's ending. +mode = "compile" +``` + +# todo + +- [x] Macros + - [x] INCLUDE + - [x] BASE64 + - [x] BASE64_INCLUDE +- [x] HTML minifier +- [x] CSS minifier +- [ ] JS minifier +- [x] Gemtext translator +- [ ] Markdown translator +- [ ] Redo macro compiler... it's old and has bugs diff --git a/src/compiler.rs b/src/compiler.rs new file mode 100644 index 0000000..6c14d41 --- /dev/null +++ b/src/compiler.rs @@ -0,0 +1,121 @@ +use { + crate::{line_number_of_offset, Cow}, + base64::{engine::general_purpose::STANDARD, Engine}, + std::{fs, path::Path}, +}; + +pub fn compile_macros<'a>(original: &'a str, source_path: &'a Path) -> Cow<'a> { + let mut output = String::default(); + let mut offset = 0; + + while let Some(start_idx) = original[offset..].find("#!") { + if original[offset..] 
+ .as_bytes() + .get(start_idx.saturating_sub(1)) + .copied() + == Some(b'\\') + { + if !output.is_empty() { + output += &original[offset..offset + start_idx + 1] + } + offset += start_idx + 1; + continue; + } + + output += &original[offset..offset + start_idx]; + offset += start_idx; + + let macro_src = &original[offset..]; + let paren_open = macro_src.find('(').unwrap_or_else(|| { + panic!( + "Expected ( in macro invocation at {source_path:?}:{}", + line_number_of_offset(original, offset) + ) + }); + let mut paren_close = macro_src.find(')').unwrap_or_else(|| { + panic!( + "Expected ) to end macro invocation at {source_path:?}:{}", + line_number_of_offset(original, offset) + ) + }); + while macro_src.as_bytes().get(paren_close + 1).copied() == Some(b')') { + paren_close += 1; + } + + let macro_name = ¯o_src[2..paren_open]; + let macro_args = ¯o_src[paren_open + 1..paren_close]; + let macro_args = compile_macros(macro_args, source_path); + let macro_args = macro_args.as_ref(); + + match macro_name { + "INCLUDE" => { + let path = source_path.parent().unwrap().join(macro_args); + let src = fs::read_to_string(&path).unwrap_or_else(|err| { + panic!( + "Error in INCLUDE macro at {source_path:?}:{}: {err}", + line_number_of_offset(original, offset) + ) + }); + let compiled = compile_macros(&src, &path); + output += compiled.as_ref(); + } + "BASE64" => { + output += STANDARD.encode(macro_args).as_str(); + } + "INCLUDE_BASE64" => { + let path = source_path.parent().unwrap().join(macro_args); + let src = fs::read(&path).unwrap_or_else(|err| { + panic!( + "Error in INCLUDE_BASE64 macro at {source_path:?}:{}: {err}", + line_number_of_offset(original, offset) + ) + }); + output += STANDARD.encode(&src).as_str(); + } + other => panic!( + "Unknown macro '{other}' in macro invocation at {source_path:?}:{}", + line_number_of_offset(original, offset) + ), + } + + offset += paren_close + 1; + } + + if output.is_empty() { + Cow::Borrowed(original) + } else { + output += &original[offset..]; + Cow::Owned(output) + } +} + +pub fn copy_batch_target(src: &Path, dest: &Path) { + if dest.is_file() { + fs::remove_file(dest).unwrap_or_else(|err| { + panic!("Failed to copy batch target {src:?}. There was already a file where its output should go ({dest:?}), which couldn't be removed: {err}"); + }); + } + if !dest.exists() { + fs::create_dir_all(dest).unwrap_or_else(|err| { + panic!("Failed to copy batch target {src:?}. Couldn't create its output folder at {dest:?} because: {err}"); + }); + } + + let src = src.read_dir().unwrap_or_else(|err| { + panic!( + "Failed to copy batch target {dest:?}. Couldn't open its source directory because: {err}" + ); + }); + + for dir_entry in src.filter_map(|dir_entry| dir_entry.ok()) { + let dir_entry = &dir_entry.path(); + + if dir_entry.is_file() { + fs::copy(dir_entry, dest.join(dir_entry.file_name().unwrap())).unwrap_or_else(|err| { + panic!("Failed to copy batch target {dest:?}. 
Couldn't copy file at {dir_entry:?} because: {err}"); + }); + } else { + copy_batch_target(dir_entry, &dest.join(dir_entry.file_name().unwrap())); + } + } +} diff --git a/src/lib.rs b/src/lib.rs new file mode 100644 index 0000000..797d9ff --- /dev/null +++ b/src/lib.rs @@ -0,0 +1,71 @@ +pub mod compiler; +pub mod minifier; +pub mod translator; + +use std::{fs, path::PathBuf}; + +type Cow<'a> = std::borrow::Cow<'a, str>; + +pub enum Mode { + Compile, + Copy, + Link, +} + +pub struct Target { + pub path: PathBuf, + pub output: PathBuf, + pub mode: Mode, +} + +pub fn build_target(target: Target) -> Result<(), Cow<'static>> { + match target.mode { + Mode::Copy => { + if target.path.is_file() | target.path.is_symlink() { + fs::copy(target.path, target.output).unwrap(); + } else { + compiler::copy_batch_target(&target.path, &target.output); + } + } + Mode::Link => { + if target.output.exists() { + fs::remove_file(&target.output) + .unwrap_or_else(|err| panic!("Failed to link target {:?}: {err}", &target.path)) + } + fs::hard_link(&target.path, target.output) + .unwrap_or_else(|err| panic!("Failed to link target {:?}: {err}", &target.path)); + } + Mode::Compile => { + let original = fs::read_to_string(&target.path).unwrap_or_else(|err| { + panic!( + "Failed to compile target {:?}: Error occurred while reading the source file: {err}", + &target.path + ) + }); + let compiled_macros = compiler::compile_macros(&original, &target.path); + + let output = match target.path.extension().and_then(|val| val.to_str()) { + Some("gmi") => Cow::Owned(translator::translate_gemtext( + &target.path, + compiled_macros.as_ref(), + )?), + Some("html") => Cow::Owned(minifier::minify_html( + target.path.to_str().unwrap(), + &compiled_macros, + &original, + )?), + Some("css") => Cow::Owned(minifier::minify_css(&compiled_macros)), + _ => compiled_macros, + }; + + fs::write(&target.output, output.as_ref()) + .unwrap_or_else(|err| panic!("Failed to compile target {:?}: Error occured while writing the compiled file: {err}", &target.path)); + } + } + + Ok(()) +} + +fn line_number_of_offset(src: &str, offset: usize) -> usize { + src[..offset].bytes().filter(|byte| *byte == b'\n').count() +} diff --git a/src/main.rs b/src/main.rs new file mode 100644 index 0000000..ea0a3c5 --- /dev/null +++ b/src/main.rs @@ -0,0 +1,99 @@ +use { + boml::{table::TomlGetError, Toml}, + std::{borrow::Cow, env, fs, thread}, + webby::{build_target, Mode, Target}, +}; + +pub fn main() -> Result<(), Cow<'static, str>> { + let cwd = env::current_dir().expect("Failed to find current directory"); + let mut root = cwd.as_path(); + + while !root + .read_dir() + .expect("Failed to list files in current folder") + .any(|file| { + if let Ok(ref file) = file { + if let Some(name) = file.file_name().to_str() { + if name == "webby.toml" && file.path().is_file() { + return true; + } + } + } + + false + }) + { + let Some(parent) = root.parent() else { + return Err("Failed to find webby.toml".into()); + }; + root = parent; + } + + let cfg = fs::read_to_string(root.join("webby.toml")).expect("Failed to read webby.toml"); + let toml = Toml::parse(&cfg).unwrap(); + + let output_dir = if let Ok(output) = toml.get_string("output") { + root.join(output) + } else { + root.join("webby") + }; + + if !output_dir.exists() { + fs::create_dir(&output_dir).expect("Failed to create output directory"); + } + + let mut tasks = Vec::default(); + + match toml.get_array("target") { + Ok(targets) => { + for target in targets { + let Some(table) = target.table() else { + return Err("All 
target entries in webby.toml must be a TOML table.".into()); + }; + let Ok(path) = table.get_string("path") else { + return Err("Target in webby.toml didn't have a path".into()); + }; + let path = root.join(path); + let mode = if let Ok(mode) = table.get_string("mode") { + match mode { + "compile" => Mode::Compile, + "copy" => Mode::Copy, + "link" => Mode::Link, + other => panic!("Unknown mode: {other} for target: {path:?}"), + } + } else { + match path.extension().and_then(|osstr| osstr.to_str()) { + Some("gmi" | "html" | "svg" | "md" | "css") => Mode::Compile, + _ => Mode::Copy, + } + }; + let output = if let Ok(output_name) = table.get_string("output") { + output_dir.join(output_name) + } else { + output_dir.join(path.file_name().unwrap()) + }; + + let target = Target { path, output, mode }; + let worker = thread::spawn(move || build_target(target)); + tasks.push(worker); + } + } + Err(e) => match e { + TomlGetError::InvalidKey => { + return Err("No targets specified. See the GitHub for an example on setting up a webby project: https://github.com/bright-shard/webby".into()); + } + TomlGetError::TypeMismatch(_, _) => { + return Err("The 'target' entry has to an array in webby.toml".into()); + } + }, + } + + for task in tasks { + match task.join().unwrap() { + Ok(()) => {} + Err(err) => println!("{err}"), + } + } + + Ok(()) +} diff --git a/src/minifier.rs b/src/minifier.rs new file mode 100644 index 0000000..17107a1 --- /dev/null +++ b/src/minifier.rs @@ -0,0 +1,5 @@ +mod css; +mod html; + +pub use css::minify_css; +pub use html::minify_html; diff --git a/src/minifier/css.rs b/src/minifier/css.rs new file mode 100644 index 0000000..e7203ae --- /dev/null +++ b/src/minifier/css.rs @@ -0,0 +1,79 @@ +pub fn minify_css(source: &str) -> String { + let mut out = String::new(); + let mut chars = source.chars().peekable(); + + let mut function_depth = 0; + let mut maybe_in_rule = false; + + while let Some(char) = chars.next() { + match char { + '/' if chars.peek().copied() == Some('*') => { + chars.next(); + + while let Some(char) = chars.next() { + if char == '*' && chars.peek().copied() == Some('/') { + chars.next(); + break; + } + } + continue; + } + '\'' | '"' => { + out.push(char); + + while let Some(subchar) = chars.next() { + if subchar == char { + out.push(char); + break; + } else if subchar == '\\' { + if let Some(char) = chars.next() { + out.push(char); + } + } + + out.push(subchar); + } + + continue; + } + '\n' => { + while chars.peek().map(|c| c.is_whitespace()).unwrap_or(false) { + chars.next(); + } + + if function_depth > 0 || maybe_in_rule { + out.push(' '); + } + continue; + } + '(' => { + function_depth += 1; + out.truncate(out.trim_end().len()); + while chars.peek().map(|c| c.is_whitespace()).unwrap_or(false) { + chars.next(); + } + } + ')' => { + function_depth -= 1; + out.truncate(out.trim_end().len()); + } + '{' | '}' | ',' => { + out.truncate(out.trim_end().len()); + while chars.peek().map(|c| c.is_whitespace()).unwrap_or(false) { + chars.next(); + } + } + ':' => { + while chars.peek().map(|c| c.is_whitespace()).unwrap_or(false) { + chars.next(); + } + maybe_in_rule = true; + } + ';' => maybe_in_rule = false, + _ => {} + } + out.push(char); + } + + out +} diff --git a/src/minifier/html.rs b/src/minifier/html.rs new file mode 100644 index 0000000..5080d7d --- /dev/null +++ b/src/minifier/html.rs @@ -0,0 +1,589 @@ +use crate::{line_number_of_offset, minifier, Cow}; + +macro_rules! 
log {
+    ($($t:tt)*) => {
+        #[cfg(any(test, feature = "log"))]
+        println!($($t)*);
+    };
+}
+
+pub fn minify_html(
+    source_path: &str,
+    source: &str,
+    original: &str,
+) -> Result<String, Cow<'static>> {
+    let mut result = String::new();
+    let mut handled_bytes = 0;
+
+    while handled_bytes < source.len() {
+        let (tag, bytes) = handle_tag(
+            source_path,
+            &source[handled_bytes..],
+            (original, handled_bytes),
+        )?;
+        result += &tag;
+        handled_bytes += bytes + 1;
+    }
+
+    Ok(result)
+}
+
+/// Parses an individual HTML tag and minifies it.
+///
+/// This is a rough overview of the strategy this function uses to parse HTML,
+/// handle its edge cases, and then minify it:
+///
+/// DISCLAIMER: The following is what this minifier will do when it's complete.
+/// See the TODO at the bottom for what's not yet implemented.
+///
+/// 1. Tags begin with a <, then have the tag type. If there is whitespace after
+///    the <, it's not considered a tag.
+/// 2. If the tag's type is `!--`, it is a comment and will be removed from
+///    the resulting HTML.
+/// 3. The tag may have properties in the format `name=value`, with optional
+///    whitespace around the `=` and optional quotes around the value. The tag
+///    may also have properties in the format `name`. Properties are never
+///    minimised except to remove whitespace around the `=`.
+/// 4. The tag may be closed with either a `/>`, a closing tag, or may not be
+///    closed properly at all.
+/// 5. If the tag is closed with a closing tag, this function will classify the
+///    tag as either a *text* tag or a *content* tag. Text tags store text (`p`,
+///    `a`, `h1`, etc), while content tags store other HTML elements (`head`,
+///    `body`, etc). If the tag is not closed, this function just returns the
+///    tag. If the tag is a style tag, it will be run through the CSS minifier.
+///    Script tags will only be minified with the `js-minify` feature enabled.
+/// 6. If the tag is a text tag, the only minification that will occur is
+///    removing newlines around tags inside it. Any content tags inside the text
+///    tag will be minified as normal for a content tag.
+/// 7. If the tag is a content tag, newlines will be stripped from it.
+///    Whitespace that isn't in a tag property's value will also be stripped.
+///    Any nested tags inside that content tag will be re-run through this
+///    minifier.
+///
+/// # TODO
+/// - Find a decent JS minifier, add it as a dep, and feature flag it. JS is
+///   too complicated to write a minifier for, when I don't even use it.
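+// NOTE (editor's illustration, not from the original source): assuming the
+// strategy described above, a content tag wrapping a textual tag such as
+//
+//     <body>
+//         <p>
+//             hello   world
+//         </p>
+//     </body>
+//
+// should minify to roughly `<body><p>hello world</p></body>` - the newlines
+// and indentation inside `body` are stripped entirely, while runs of
+// whitespace inside the textual `p` tag collapse to single spaces.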
+fn handle_tag<'a>( + source_path: &'a str, + source: &'a str, + error_meta: (&'a str, usize), +) -> Result<(Cow<'a>, usize), Cow<'static>> { + if source + .chars() + .next() + .map(|char| char.is_whitespace()) + .unwrap_or(false) + { + return Ok((Cow::Borrowed("<"), 1)); + } else if source.starts_with("") else { + return Err(Cow::Owned(format!( + "HTML error: Unclosed HTML comment at {source_path}:{}", + line_number_of_offset(error_meta.0, error_meta.1) + ))); + }; + return Ok((Cow::Borrowed(""), ending + 2)); + } + + let mut output = String::from("<"); + let mut chars = source.char_indices().peekable(); + chars.next(); // discard opening < + + let tag_name_end; + let tag_closed; + loop { + let Some((byte_idx, char)) = chars.next() else { + return Ok((Cow::Owned(output), source.len() - 1)); + }; + + if char == '/' && chars.peek().map(|(_, char)| *char) == Some('>') { + output += "/>"; + chars.next(); + let mut end = byte_idx + 1; + if chars.peek().map(|(_, char)| *char) == Some('\n') { + while chars + .peek() + .map(|(_, c)| c.is_whitespace()) + .unwrap_or(false) + { + end = chars.next().unwrap().0; + } + } + + return Ok((Cow::Owned(output), end)); + } else if char == '>' { + tag_name_end = byte_idx; + tag_closed = true; + break; + } else if char.is_whitespace() { + tag_name_end = byte_idx; + tag_closed = false; + + while chars + .peek() + .map(|(_, c)| c.is_whitespace()) + .unwrap_or(false) + { + chars.next(); + } + + output.push(' '); + break; + } + + output.push(char); + } + + let tag_name = &source[1..tag_name_end]; + log!("Parsing tag `{tag_name}`"); + + if tag_closed { + log!(" Opening tag closed w/o properties"); + output.push('>'); + } else { + // Each loop parses 1 property + 'parse_properties: loop { + let Some((byte_idx, char)) = chars.next() else { + return Ok((Cow::Owned(output), source.len() - 1)); + }; + log!(" 'parse_properties: Found `{char}`"); + + if char == '/' && chars.peek().map(|(_, char)| *char) == Some('>') { + output += "/>"; + chars.next(); + let mut end = byte_idx + 1; + if chars.peek().map(|(_, char)| *char) == Some('\n') { + while chars + .peek() + .map(|(_, c)| c.is_whitespace()) + .unwrap_or(false) + { + end = chars.next().unwrap().0; + } + } + + return Ok((Cow::Owned(output), end)); + } else if char == '>' { + log!(" Opening tag closed in 'parsed_properties"); + output.push('>'); + break 'parse_properties; + } else if char == '\n' { + continue 'parse_properties; + } else if char.is_whitespace() { + while chars + .peek() + .map(|(_, c)| c.is_whitespace()) + .unwrap_or(false) + { + chars.next(); + } + output.push(' '); + continue 'parse_properties; + } + + // Parse property's name + output.push(char); + 'parse_property_name: loop { + let Some((byte_idx, char)) = chars.next() else { + return Ok((Cow::Owned(output), source.len() - 1)); + }; + + if char == '=' { + break 'parse_property_name; + } else if char == '/' && chars.peek().map(|(_, char)| *char) == Some('>') { + output += "/>"; + chars.next(); + let mut end = byte_idx + 1; + if chars.peek().map(|(_, char)| *char) == Some('\n') { + while chars + .peek() + .map(|(_, c)| c.is_whitespace()) + .unwrap_or(false) + { + end = chars.next().unwrap().0; + } + } + + return Ok((Cow::Owned(output), end)); + } else if char == '>' { + output += ">"; + break 'parse_properties; + } else if char.is_whitespace() { + while chars + .peek() + .map(|(_, c)| c.is_whitespace()) + .unwrap_or(false) + { + chars.next(); + } + + if chars.peek().map(|(_, char)| *char) == Some('=') { + // Whitespace followed by an `=` - should break 
name + // parsing and start parsing the value + chars.next(); + break 'parse_property_name; + } else { + // Whitespace followed by other characters - this + // property didn't have a value and we should go parse + // the next one + output.push(' '); + continue 'parse_properties; + } + } + + output.push(char); + } + + // If we get to this point, the property has a value. The chars + // iterator will pick up after the =. + output.push('='); + + while chars + .peek() + .map(|(_, c)| c.is_whitespace()) + .unwrap_or(false) + { + chars.next(); + } + + // Parse property's value + let Some((idx, char)) = chars.next() else { + return Ok((Cow::Owned(output), source.len() - 1)); + }; + match char { + '\'' | '"' => { + output.push(char); + loop { + let Some((_, next)) = chars.next() else { + return Err(Cow::Owned(format!( + "Unclosed quotation in HTML property at {source_path}:{}", + line_number_of_offset(error_meta.0, error_meta.1 + idx) + ))); + }; + if next == '\\' { + if let Some((_, next)) = chars.next() { + output.push(next); + } + continue; + } + + output.push(next); + + if next == char { + break; + } + } + } + _ => { + output.push(char); + loop { + let Some((idx, char)) = chars.next() else { + return Err(Cow::Owned(format!( + "Unclosed quotation in HTML property at {source_path}:{}", + line_number_of_offset(error_meta.0, error_meta.1 + idx) + ))); + }; + + if char == '/' && chars.peek().map(|(_, char)| *char) == Some('>') { + output += "/>"; + chars.next(); + let mut end = idx + 1; + if chars.peek().map(|(_, char)| *char) == Some('\n') { + while chars + .peek() + .map(|(_, c)| c.is_whitespace()) + .unwrap_or(false) + { + end = chars.next().unwrap().0; + } + } + + return Ok((Cow::Owned(output), end)); + } else if char == '>' { + output += ">"; + break 'parse_properties; + } else if char.is_whitespace() { + while chars + .peek() + .map(|(_, c)| c.is_whitespace()) + .unwrap_or(false) + { + chars.next(); + } + output.push(char); + + continue 'parse_properties; + } else { + output.push(char); + } + } + } + } + } + } + + // By now the > of the opening tag has been reached + // We need to find the closing tag, then minify the contents of the tag + // as needed + debug_assert!(output.ends_with('>'), "output is: `{output}`"); + + if chars.peek().map(|(_, char)| *char) == Some('\n') { + while chars + .peek() + .map(|(_, char)| char.is_whitespace()) + .unwrap_or(false) + { + chars.next(); + } + } + + if tag_name == "script" || tag_name == "style" { + // TODO: Actually parse and minify JS + let closing_tag = if tag_name == "script" { + "" + } else { + "" + }; + let Some(search_start_idx) = chars.next().map(|(idx, _)| idx) else { + return Ok((Cow::Owned(output), source.len() - 1)); + }; + + let Some(closing) = source[search_start_idx..].find(closing_tag) else { + return Err(Cow::Owned(format!( + "Unclosed style or script tag at {source_path}:{}", + line_number_of_offset(error_meta.0, error_meta.1 + search_start_idx) + ))); + }; + let closing = search_start_idx + closing + closing_tag.len(); + + if tag_name == "style" { + output += &minifier::minify_css(&source[search_start_idx..closing]); + } else { + output += &source[search_start_idx..closing]; + } + + return Ok((Cow::Owned(output), closing)); + } + + let textual_tag = matches!( + tag_name, + "a" | "abbr" + | "acronym" + | "aside" + | "b" + | "bdi" + | "bdo" + | "big" + | "blockquote" + | "button" + | "caption" + | "cite" + | "code" + | "dd" + | "del" + | "details" + | "dfn" + | "dt" + | "em" + | "figcaption" + | "h1" + | "h2" + | "h3" + | "h4" + | "h5" 
+ | "h6" + | "i" + | "ins" + | "kbd" + | "label" + | "legend" + | "li" + | "mark" + | "marquee" + | "meter" + | "nobr" + | "option" + | "output" + | "p" + | "pre" + | "progress" + | "q" + | "rb" + | "rp" + | "rt" + | "s" + | "sample" + | "small" + | "span" + | "strong" + | "sub" + | "summary" + | "sup" + | "td" + | "textarea" + | "th" + | "time" + | "title" + | "u" + | "var" + ); + let preformatted = tag_name == "pre"; + log!(" Textual tag? {textual_tag}"); + + while let Some((byte_idx, char)) = chars.next() { + if char == '<' { + let Some((next_idx, next_char)) = chars.peek().copied() else { + return Ok((Cow::Owned(output), source.len() - 1)); + }; + + if next_char == '/' { + chars.next(); + + if !preformatted { + let mut trim = false; + let mut chars_rev = source[..byte_idx].chars(); + while let Some(char) = chars_rev.next_back() { + if !char.is_whitespace() { + break; + } else if char == '\n' { + trim = true; + } + } + if trim { + output = output.trim_end().to_string(); + } + } + + output += "' { + let mut end = idx; + if !preformatted && chars.peek().map(|(_, char)| *char) == Some('\n') { + while chars + .peek() + .map(|(_, c)| c.is_whitespace()) + .unwrap_or(false) + { + end = chars.next().unwrap().0; + } + } + + if output.ends_with(tag_name) { + output.push('>'); + return Ok((Cow::Owned(output), end)); + } else { + output.push('>'); + break; + } + } + + if !char.is_whitespace() { + output.push(char); + } + } + } else if !next_char.is_whitespace() { + let (subtag, used) = handle_tag(source_path, &source[byte_idx..], error_meta)?; + log!(" Found subtag `{subtag}`. Ends at {used}, current char is {next_idx}."); + let used = used + byte_idx; + + loop { + let (next_idx, _) = chars.next().unwrap(); + if next_idx == used { + break; + } + } + + output += &subtag; + } else { + output.push('<'); + } + } else if !preformatted { + match char { + '\n' => {} + _ if char.is_whitespace() => { + while chars + .peek() + .map(|(_, char)| char.is_whitespace()) + .unwrap_or(false) + { + chars.next(); + } + if textual_tag { + output.push(' '); + } + } + _ => output.push(char), + } + } else { + output.push(char); + } + } + + Ok((Cow::Owned(output), source.len() - 1)) +} + +#[cfg(test)] +mod tests { + use crate::minifier::minify_html; + + struct Tester { + name: &'static str, + source: &'static str, + expected: &'static str, + } + impl Tester { + fn test(self) { + log!("\nSTARTING TEST '{}'", self.name); + let result = minify_html("test/path", self.source, self.source).unwrap(); + assert_eq!(&result, self.expected, "Test name: {}", self.name); + } + } + + #[test] + fn test() { + let cases = [ + Tester { + name: "Trim whitespace between tags", + source: "

hi

", + expected: "

hi

", + }, + Tester { + name: "Trim comments", + source: "

hi

", + expected: "

hi

", + }, + Tester { + name: "Includes whitespace in textual comments", + source: "

This has weird whitespace!!!\n

", + expected: "

This has weird whitespace!!!\n

", + }, + Tester { + name: "Element properties", + source: "

hewwo

", + expected: "

hewwo

", + }, + Tester { + name: "Element properties 2", + source: "

hewwo

", + expected: "

hewwo

", + }, + Tester { + name: "Unclosed Elements", + source: "

hello

\n
", + expected: "

hello


" + } + ]; + + for case in cases { + case.test(); + } + } +} diff --git a/src/translator.rs b/src/translator.rs new file mode 100644 index 0000000..d80002b --- /dev/null +++ b/src/translator.rs @@ -0,0 +1,4 @@ +mod gemtext; +mod markdown; + +pub use gemtext::translate_gemtext; diff --git a/src/translator/gemtext.rs b/src/translator/gemtext.rs new file mode 100644 index 0000000..1e92c42 --- /dev/null +++ b/src/translator/gemtext.rs @@ -0,0 +1,119 @@ +use { + crate::Cow, + std::{fmt::Write, path::Path}, +}; + +#[derive(PartialEq, Eq)] +enum ParserState { + Text, + List, + Preformatted, +} + +/// Escapes characters from an input string so valid Gemtext doesn't get +/// misinterpreted as HTML. +// +// This should prevent any form of HTML injection... but other programs filter +// more characters than are being filtered here, which should be looked into... +// +// Cases covered by Canvas LMS: +// '&' => *out += "&", +// '<' => *out += "<", +// '>' => *out += ">", +// '"' => *out += """, +// '\'' => *out += "'", +// '/' => *out += "/", +// '`' => *out += "`", +// '=' => *out += "=", +// From https://github.com/instructure/canvas-lms/blob/master/packages/html-escape/index.js#L85 +fn html_escape_into(input: &str, out: &mut String) { + for char in input.chars() { + match char { + '<' => *out += "<", + '>' => *out += ">", + '"' => *out += """, + '&' => *out += "&", + other => out.push(other), + } + } +} + +pub fn translate_gemtext(source_path: &Path, source: &str) -> Result> { + let mut output = String::new(); + let mut state = ParserState::Text; + output += "

"; + + for (line_num, line) in source.lines().enumerate() { + if state == ParserState::Preformatted { + if line.starts_with("```") { + state = ParserState::Text; + output += ""; + continue; + } + + html_escape_into(line, &mut output); + continue; + } + + if let Some(list_line) = line.strip_prefix("* ") { + if state != ParserState::List { + state = ParserState::List; + output += "

    "; + } + output += "
  • "; + html_escape_into(list_line, &mut output); + output += "
  • "; + continue; + } else if state == ParserState::List { + state = ParserState::Text; + output += "
"; + } + + if let Some(link_line) = line.strip_prefix("=>") { + let mut line = link_line.split_whitespace(); + let link = line.next().ok_or(Cow::Owned(format!( + "Expected URL in link at {source_path:?}:{line_num}" + )))?; + + output += ""; + + if let Some(link_text) = line.next() { + html_escape_into(link_text, &mut output); + } else { + html_escape_into(link, &mut output); + } + + output += "
"; + } else if let Some(alt) = line.strip_prefix("```") { + output += "
";
+            state = ParserState::Preformatted;
+        } else if let Some(quote) = line.strip_prefix("> ") {
+            output += "

"; + html_escape_into(quote, &mut output); + output += "

"; + } else if line.starts_with('#') { + let mut chars = line.bytes(); + let mut level = 0; + while chars.next() == Some(b'#') { + level += 1; + } + + write!(output, "").unwrap(); + html_escape_into(line[level..].trim_start(), &mut output); + write!(output, "").unwrap(); + } else { + output += line; + } + } + + if state == ParserState::List { + output += ""; + } + + output += "

"; + Ok(output) +} diff --git a/src/translator/markdown.rs b/src/translator/markdown.rs new file mode 100644 index 0000000..e69de29 diff --git a/tests/gemtext.rs b/tests/gemtext.rs new file mode 100644 index 0000000..cfbe520 --- /dev/null +++ b/tests/gemtext.rs @@ -0,0 +1,21 @@ +use { + std::{fs, path::PathBuf}, + webby::translator, +}; + +#[test] +fn test() { + let tests = ["link", "header", "text", "list"]; + + for test in tests { + let gmi_path = PathBuf::from(format!("tests/gemtext/{test}.gmi")); + let html_path = PathBuf::from(format!("tests/gemtext/{test}.html")); + let html = + translator::translate_gemtext(&gmi_path, &fs::read_to_string(&gmi_path).unwrap()) + .unwrap(); + assert_eq!( + html, + format!("

{}

", fs::read_to_string(&html_path).unwrap()) + ) + } +} diff --git a/tests/gemtext/header.gmi b/tests/gemtext/header.gmi new file mode 100644 index 0000000..5da5539 --- /dev/null +++ b/tests/gemtext/header.gmi @@ -0,0 +1,3 @@ +# header1 +## header2 +### header3 diff --git a/tests/gemtext/header.html b/tests/gemtext/header.html new file mode 100644 index 0000000..ff686c4 --- /dev/null +++ b/tests/gemtext/header.html @@ -0,0 +1 @@ +

header1

header2

header3

\ No newline at end of file diff --git a/tests/gemtext/link.gmi b/tests/gemtext/link.gmi new file mode 100644 index 0000000..e02069a --- /dev/null +++ b/tests/gemtext/link.gmi @@ -0,0 +1,4 @@ +=> https://google.com +=> https://google.com google +=> https://google.com +=> https://google.com google diff --git a/tests/gemtext/link.html b/tests/gemtext/link.html new file mode 100644 index 0000000..9e84ed5 --- /dev/null +++ b/tests/gemtext/link.html @@ -0,0 +1 @@ +https://google.com
google
https://google.com
google
\ No newline at end of file diff --git a/tests/gemtext/list.gmi b/tests/gemtext/list.gmi new file mode 100644 index 0000000..e61e1d4 --- /dev/null +++ b/tests/gemtext/list.gmi @@ -0,0 +1,3 @@ +* one +* two +* three diff --git a/tests/gemtext/list.html b/tests/gemtext/list.html new file mode 100644 index 0000000..a66f58a --- /dev/null +++ b/tests/gemtext/list.html @@ -0,0 +1 @@ +
  • one
  • two
  • three
\ No newline at end of file diff --git a/tests/gemtext/text.gmi b/tests/gemtext/text.gmi new file mode 100644 index 0000000..840b569 --- /dev/null +++ b/tests/gemtext/text.gmi @@ -0,0 +1 @@ +ipsum lorem I don't speak latinum diff --git a/tests/gemtext/text.html b/tests/gemtext/text.html new file mode 100644 index 0000000..02ee347 --- /dev/null +++ b/tests/gemtext/text.html @@ -0,0 +1 @@ +ipsum lorem I don't speak latinum \ No newline at end of file
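For reference, a minimal sketch of driving the compiler through the library API this patch adds in `src/lib.rs` (`Target`, `Mode`, and `build_target`) rather than through the `webby` binary. This usage is an assumption based only on the exported items, not something the patch itself demonstrates, and the file paths are placeholders.

```rust
use std::{fs, path::PathBuf};
use webby::{build_target, Mode, Target};

fn main() {
    // As in webby's own main.rs, the output directory has to exist before
    // a target is built into it.
    fs::create_dir_all("webby").expect("failed to create output directory");

    // Placeholder paths: compile ./index.html into ./webby/index.html.
    let target = Target {
        path: PathBuf::from("index.html"),
        output: PathBuf::from("webby/index.html"),
        mode: Mode::Compile,
    };

    // Translator/minifier errors come back as Err; most I/O failures panic
    // inside build_target, mirroring the binary's behaviour.
    if let Err(err) = build_target(target) {
        eprintln!("{err}");
    }
}
```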