From b07b49b3e6006f1080cc0a176e58246a343aecc2 Mon Sep 17 00:00:00 2001
From: BrightShard
Date: Fri, 25 Oct 2024 11:59:30 -0400
Subject: [PATCH] Initial commit
---
.gitignore | 1 +
Cargo.lock | 23 ++
Cargo.toml | 12 +
README.md | 103 +++++++
src/compiler.rs | 121 ++++++++
src/lib.rs | 71 +++++
src/main.rs | 99 +++++++
src/minifier.rs | 5 +
src/minifier/css.rs | 79 +++++
src/minifier/html.rs | 589 +++++++++++++++++++++++++++++++++++++
src/translator.rs | 4 +
src/translator/gemtext.rs | 119 ++++++++
src/translator/markdown.rs | 0
tests/gemtext.rs | 21 ++
tests/gemtext/header.gmi | 3 +
tests/gemtext/header.html | 1 +
tests/gemtext/link.gmi | 4 +
tests/gemtext/link.html | 1 +
tests/gemtext/list.gmi | 3 +
tests/gemtext/list.html | 1 +
tests/gemtext/text.gmi | 1 +
tests/gemtext/text.html | 1 +
22 files changed, 1262 insertions(+)
create mode 100644 .gitignore
create mode 100644 Cargo.lock
create mode 100644 Cargo.toml
create mode 100644 README.md
create mode 100644 src/compiler.rs
create mode 100644 src/lib.rs
create mode 100644 src/main.rs
create mode 100644 src/minifier.rs
create mode 100644 src/minifier/css.rs
create mode 100644 src/minifier/html.rs
create mode 100644 src/translator.rs
create mode 100644 src/translator/gemtext.rs
create mode 100644 src/translator/markdown.rs
create mode 100644 tests/gemtext.rs
create mode 100644 tests/gemtext/header.gmi
create mode 100644 tests/gemtext/header.html
create mode 100644 tests/gemtext/link.gmi
create mode 100644 tests/gemtext/link.html
create mode 100644 tests/gemtext/list.gmi
create mode 100644 tests/gemtext/list.html
create mode 100644 tests/gemtext/text.gmi
create mode 100644 tests/gemtext/text.html
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..ea8c4bf
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1 @@
+/target
diff --git a/Cargo.lock b/Cargo.lock
new file mode 100644
index 0000000..fe1a8f8
--- /dev/null
+++ b/Cargo.lock
@@ -0,0 +1,23 @@
+# This file is automatically @generated by Cargo.
+# It is not intended for manual editing.
+version = 4
+
+[[package]]
+name = "base64"
+version = "0.22.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "72b3254f16251a8381aa12e40e3c4d2f0199f8c6508fbecb9d91f575e0fbb8c6"
+
+[[package]]
+name = "boml"
+version = "0.3.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "85fdb93f04c73bff54305fa437ffea5449c41edcaadfe882f35836206b166ac5"
+
+[[package]]
+name = "webby"
+version = "0.1.0"
+dependencies = [
+ "base64",
+ "boml",
+]
diff --git a/Cargo.toml b/Cargo.toml
new file mode 100644
index 0000000..a3a24b8
--- /dev/null
+++ b/Cargo.toml
@@ -0,0 +1,12 @@
+[package]
+name = "webby"
+version = "0.1.0"
+edition = "2021"
+
+[dependencies]
+base64 = "0.22.1"
+boml = "0.3.1"
+
+[features]
+default = []
+log = []
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..9063717
--- /dev/null
+++ b/README.md
@@ -0,0 +1,103 @@
+# webby
+
+> The smol web compiler
+
+As seen in [my website](https://github.com/bright-shard/website).
+
+**webby** is a small and efficient compiler for making static sites. It adds macros, minifiers, and translators to compile your project into a tiny static site.
+
+> Note: Webby is WIP. The above is a summary of what I want it to do when it's finished. For the current project status, see [todo](#todo).
+
+# macros
+
+webby adds a few macros to make writing HTML simpler. Macros open with `#!`, followed by the macro name and its arguments in parentheses, like so:
+
+```
+#!MACRO_NAME(args)
+```
+
+Macros can also be nested, like this:
+
+```
+#!MACRO1(#!MACRO2(args))
+```
+
+- `#!INCLUDE(path/to/file)`: Webby will compile the given file, then embed it at the macro's location. The file must contain valid UTF-8 text.
+- `#!BASE64(text)`: Base64-encode the given text.
+- `#!INCLUDE_BASE64(path/to/file)`: Base64-encode the given file. This differs from `#!BASE64(#!INCLUDE(path/to/file))` because it can also base64-encode binary files.
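+
+For example, a page might inline a stylesheet and embed an image as a base64 data URI (the paths here are just placeholders):
+
+```
+<style>#!INCLUDE(styles/main.css)</style>
+<img src="data:image/png;base64,#!INCLUDE_BASE64(img/logo.png)">
+```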
+
+# minifiers
+
+webby will automatically strip comments and unneeded whitespace from your code to make it as small as possible.
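+
+For instance, HTML roughly like this (an illustrative snippet):
+
+```
+<nav>
+    <!-- navigation -->
+    <a href="/">home</a>
+</nav>
+```
+
+comes out closer to:
+
+```
+<nav><a href="/">home</a></nav>
+```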
+
+# translators
+
+Translators cross-compile between languages - for example, Markdown to HTML, or Gemtext to HTML.
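+
+For example, a Gemtext page like:
+
+```
+# my blog
+=> https://example.com homepage
+* a list item
+```
+
+is translated into HTML along the lines of `<h1>my blog</h1>`, `<a href="https://example.com">homepage</a>`, and `<ul><li>a list item</li></ul>` (the exact markup is up to the translator).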
+
+
+
+# usage
+
+webby projects have a `webby.toml` at their root, just like Rust projects have a `Cargo.toml` at theirs. The format of `webby.toml` is described in [config](#config).
+
+Install webby with Cargo:
+
+```sh
+cargo install --git https://github.com/bright-shard/webby
+```
+
+Then just run `webby` in your webby project.
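+
+webby looks for `webby.toml` in the current directory and then walks up through parent directories, so it can be run from anywhere inside the project (the directory name below is just an example):
+
+```sh
+cd my-site
+webby
+# compiled files land in my-site/webby/ unless `output` is set in webby.toml
+```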
+
+# config
+
+In its simplest form, the `webby.toml` file will look like this:
+
+```toml
+# For every file you want to compile with webby, add a `[[target]]` section
+[[target]]
+# The path to the file to compile
+path = "index.html"
+
+[[target]]
+path = "blog.html"
+```
+
+However, webby allows customising more if you need it:
+
+```toml
+# (Optional) The directory to put the output files in
+# If this isn't specified it defaults to `webby`
+# The path is relative to the webby.toml file
+output = "my/custom/build/dir"
+
+[[target]]
+# The path to the file, relative to the webby.toml file
+# If you list a folder instead of a file, webby will compile all of the files
+# in that folder
+path = "path/to/file.html"
+# (Optional) Where to put the compiled file
+# If this isn't specified it defaults to the name of the file given in path
+# The path is relative to the output directory
+output = "file.out.html"
+# (Optional) The compilation mode
+# This can be "compile", "copy", or "link". Compile will compile the file. Copy
+# will just copy the file as-is and will not compile it at all. Link is the same
+# as copy, but it creates a hard link (not a symlink) to the file instead of
+# copying it.
+# If this isn't specified, webby will infer if it should compile or copy the
+# file based on the file's extension.
+mode = "compile"
+```
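+
+For instance, a project might compile its pages normally but copy or link static assets as-is (paths here are illustrative):
+
+```toml
+# Copy a whole folder of static assets without compiling it
+[[target]]
+path = "assets"
+mode = "copy"
+
+# Hard-link a large file into the output instead of copying it
+[[target]]
+path = "downloads/big-file.zip"
+mode = "link"
+```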
+
+# todo
+
+- [x] Macros
+ - [x] INCLUDE
+ - [x] BASE64
+ - [x] INCLUDE_BASE64
+- [x] HTML minifier
+- [x] CSS minifier
+- [ ] JS minifier
+- [x] Gemtext translator
+- [ ] Markdown translator
+- [ ] Redo macro compiler... it's old and has bugs
diff --git a/src/compiler.rs b/src/compiler.rs
new file mode 100644
index 0000000..6c14d41
--- /dev/null
+++ b/src/compiler.rs
@@ -0,0 +1,121 @@
+use {
+ crate::{line_number_of_offset, Cow},
+ base64::{engine::general_purpose::STANDARD, Engine},
+ std::{fs, path::Path},
+};
+
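+/// Expands webby macros (`#!INCLUDE`, `#!BASE64`, `#!INCLUDE_BASE64`) found in
+/// `original`. Relative paths inside macros are resolved against the parent
+/// directory of `source_path`. Returns the input unchanged if it contains no
+/// macros.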
+pub fn compile_macros<'a>(original: &'a str, source_path: &'a Path) -> Cow<'a> {
+ let mut output = String::default();
+ let mut offset = 0;
+
+ while let Some(start_idx) = original[offset..].find("#!") {
+ if original[offset..]
+ .as_bytes()
+ .get(start_idx.saturating_sub(1))
+ .copied()
+ == Some(b'\\')
+ {
+ if !output.is_empty() {
+ output += &original[offset..offset + start_idx + 1]
+ }
+ offset += start_idx + 1;
+ continue;
+ }
+
+ output += &original[offset..offset + start_idx];
+ offset += start_idx;
+
+ let macro_src = &original[offset..];
+ let paren_open = macro_src.find('(').unwrap_or_else(|| {
+ panic!(
+ "Expected ( in macro invocation at {source_path:?}:{}",
+ line_number_of_offset(original, offset)
+ )
+ });
+ let mut paren_close = macro_src.find(')').unwrap_or_else(|| {
+ panic!(
+ "Expected ) to end macro invocation at {source_path:?}:{}",
+ line_number_of_offset(original, offset)
+ )
+ });
+ while macro_src.as_bytes().get(paren_close + 1).copied() == Some(b')') {
+ paren_close += 1;
+ }
+
+ let macro_name = &macro_src[2..paren_open];
+ let macro_args = &macro_src[paren_open + 1..paren_close];
+ let macro_args = compile_macros(macro_args, source_path);
+ let macro_args = macro_args.as_ref();
+
+ match macro_name {
+ "INCLUDE" => {
+ let path = source_path.parent().unwrap().join(macro_args);
+ let src = fs::read_to_string(&path).unwrap_or_else(|err| {
+ panic!(
+ "Error in INCLUDE macro at {source_path:?}:{}: {err}",
+ line_number_of_offset(original, offset)
+ )
+ });
+ let compiled = compile_macros(&src, &path);
+ output += compiled.as_ref();
+ }
+ "BASE64" => {
+ output += STANDARD.encode(macro_args).as_str();
+ }
+ "INCLUDE_BASE64" => {
+ let path = source_path.parent().unwrap().join(macro_args);
+ let src = fs::read(&path).unwrap_or_else(|err| {
+ panic!(
+ "Error in INCLUDE_BASE64 macro at {source_path:?}:{}: {err}",
+ line_number_of_offset(original, offset)
+ )
+ });
+ output += STANDARD.encode(&src).as_str();
+ }
+ other => panic!(
+ "Unknown macro '{other}' in macro invocation at {source_path:?}:{}",
+ line_number_of_offset(original, offset)
+ ),
+ }
+
+ offset += paren_close + 1;
+ }
+
+ if output.is_empty() {
+ Cow::Borrowed(original)
+ } else {
+ output += &original[offset..];
+ Cow::Owned(output)
+ }
+}
+
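+/// Recursively copies the directory at `src` into `dest`, creating `dest` if
+/// it doesn't exist yet. Used for copy-mode targets that point at a folder
+/// instead of a single file.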
+pub fn copy_batch_target(src: &Path, dest: &Path) {
+ if dest.is_file() {
+ fs::remove_file(dest).unwrap_or_else(|err| {
+ panic!("Failed to copy batch target {src:?}. There was already a file where its output should go ({dest:?}), which couldn't be removed: {err}");
+ });
+ }
+ if !dest.exists() {
+ fs::create_dir_all(dest).unwrap_or_else(|err| {
+ panic!("Failed to copy batch target {src:?}. Couldn't create its output folder at {dest:?} because: {err}");
+ });
+ }
+
+ let src = src.read_dir().unwrap_or_else(|err| {
+ panic!(
+ "Failed to copy batch target {dest:?}. Couldn't open its source directory because: {err}"
+ );
+ });
+
+ for dir_entry in src.filter_map(|dir_entry| dir_entry.ok()) {
+ let dir_entry = &dir_entry.path();
+
+ if dir_entry.is_file() {
+ fs::copy(dir_entry, dest.join(dir_entry.file_name().unwrap())).unwrap_or_else(|err| {
+ panic!("Failed to copy batch target {dest:?}. Couldn't copy file at {dir_entry:?} because: {err}");
+ });
+ } else {
+ copy_batch_target(dir_entry, &dest.join(dir_entry.file_name().unwrap()));
+ }
+ }
+}
diff --git a/src/lib.rs b/src/lib.rs
new file mode 100644
index 0000000..797d9ff
--- /dev/null
+++ b/src/lib.rs
@@ -0,0 +1,71 @@
+pub mod compiler;
+pub mod minifier;
+pub mod translator;
+
+use std::{fs, path::PathBuf};
+
+type Cow<'a> = std::borrow::Cow<'a, str>;
+
+pub enum Mode {
+ Compile,
+ Copy,
+ Link,
+}
+
+pub struct Target {
+ pub path: PathBuf,
+ pub output: PathBuf,
+ pub mode: Mode,
+}
+
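+/// Builds a single target: copies it, hard-links it, or compiles it depending
+/// on its `Mode`. Compiled targets have their macros expanded and are then
+/// minified or translated based on their file extension.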
+pub fn build_target(target: Target) -> Result<(), Cow<'static>> {
+ match target.mode {
+ Mode::Copy => {
+ if target.path.is_file() || target.path.is_symlink() {
+ fs::copy(target.path, target.output).unwrap();
+ } else {
+ compiler::copy_batch_target(&target.path, &target.output);
+ }
+ }
+ Mode::Link => {
+ if target.output.exists() {
+ fs::remove_file(&target.output)
+ .unwrap_or_else(|err| panic!("Failed to link target {:?}: {err}", &target.path))
+ }
+ fs::hard_link(&target.path, target.output)
+ .unwrap_or_else(|err| panic!("Failed to link target {:?}: {err}", &target.path));
+ }
+ Mode::Compile => {
+ let original = fs::read_to_string(&target.path).unwrap_or_else(|err| {
+ panic!(
+ "Failed to compile target {:?}: Error occurred while reading the source file: {err}",
+ &target.path
+ )
+ });
+ let compiled_macros = compiler::compile_macros(&original, &target.path);
+
+ let output = match target.path.extension().and_then(|val| val.to_str()) {
+ Some("gmi") => Cow::Owned(translator::translate_gemtext(
+ &target.path,
+ compiled_macros.as_ref(),
+ )?),
+ Some("html") => Cow::Owned(minifier::minify_html(
+ target.path.to_str().unwrap(),
+ &compiled_macros,
+ &original,
+ )?),
+ Some("css") => Cow::Owned(minifier::minify_css(&compiled_macros)),
+ _ => compiled_macros,
+ };
+
+ fs::write(&target.output, output.as_ref())
+ .unwrap_or_else(|err| panic!("Failed to compile target {:?}: Error occured while writing the compiled file: {err}", &target.path));
+ }
+ }
+
+ Ok(())
+}
+
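+/// Returns the number of newlines before `offset` in `src`, used as a
+/// (zero-based) line number in error messages.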
+fn line_number_of_offset(src: &str, offset: usize) -> usize {
+ src[..offset].bytes().filter(|byte| *byte == b'\n').count()
+}
diff --git a/src/main.rs b/src/main.rs
new file mode 100644
index 0000000..ea0a3c5
--- /dev/null
+++ b/src/main.rs
@@ -0,0 +1,99 @@
+use {
+ boml::{table::TomlGetError, Toml},
+ std::{borrow::Cow, env, fs, thread},
+ webby::{build_target, Mode, Target},
+};
+
+pub fn main() -> Result<(), Cow<'static, str>> {
+ let cwd = env::current_dir().expect("Failed to find current directory");
+ let mut root = cwd.as_path();
+
+ while !root
+ .read_dir()
+ .expect("Failed to list files in current folder")
+ .any(|file| {
+ if let Ok(ref file) = file {
+ if let Some(name) = file.file_name().to_str() {
+ if name == "webby.toml" && file.path().is_file() {
+ return true;
+ }
+ }
+ }
+
+ false
+ })
+ {
+ let Some(parent) = root.parent() else {
+ return Err("Failed to find webby.toml".into());
+ };
+ root = parent;
+ }
+
+ let cfg = fs::read_to_string(root.join("webby.toml")).expect("Failed to read webby.toml");
+ let toml = Toml::parse(&cfg).unwrap();
+
+ let output_dir = if let Ok(output) = toml.get_string("output") {
+ root.join(output)
+ } else {
+ root.join("webby")
+ };
+
+ if !output_dir.exists() {
+ fs::create_dir(&output_dir).expect("Failed to create output directory");
+ }
+
+ let mut tasks = Vec::default();
+
+ match toml.get_array("target") {
+ Ok(targets) => {
+ for target in targets {
+ let Some(table) = target.table() else {
+ return Err("All target entries in webby.toml must be a TOML table.".into());
+ };
+ let Ok(path) = table.get_string("path") else {
+ return Err("Target in webby.toml didn't have a path".into());
+ };
+ let path = root.join(path);
+ let mode = if let Ok(mode) = table.get_string("mode") {
+ match mode {
+ "compile" => Mode::Compile,
+ "copy" => Mode::Copy,
+ "link" => Mode::Link,
+ other => panic!("Unknown mode: {other} for target: {path:?}"),
+ }
+ } else {
+ match path.extension().and_then(|osstr| osstr.to_str()) {
+ Some("gmi" | "html" | "svg" | "md" | "css") => Mode::Compile,
+ _ => Mode::Copy,
+ }
+ };
+ let output = if let Ok(output_name) = table.get_string("output") {
+ output_dir.join(output_name)
+ } else {
+ output_dir.join(path.file_name().unwrap())
+ };
+
+ let target = Target { path, output, mode };
+ let worker = thread::spawn(move || build_target(target));
+ tasks.push(worker);
+ }
+ }
+ Err(e) => match e {
+ TomlGetError::InvalidKey => {
+ return Err("No targets specified. See the GitHub for an example on setting up a webby project: https://github.com/bright-shard/webby".into());
+ }
+ TomlGetError::TypeMismatch(_, _) => {
+ return Err("The 'target' entry has to an array in webby.toml".into());
+ }
+ },
+ }
+
+ for task in tasks {
+ match task.join().unwrap() {
+ Ok(()) => {}
+ Err(err) => println!("{err}"),
+ }
+ }
+
+ Ok(())
+}
diff --git a/src/minifier.rs b/src/minifier.rs
new file mode 100644
index 0000000..17107a1
--- /dev/null
+++ b/src/minifier.rs
@@ -0,0 +1,5 @@
+mod css;
+mod html;
+
+pub use css::minify_css;
+pub use html::minify_html;
diff --git a/src/minifier/css.rs b/src/minifier/css.rs
new file mode 100644
index 0000000..e7203ae
--- /dev/null
+++ b/src/minifier/css.rs
@@ -0,0 +1,79 @@
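+/// Minifies a CSS source string: strips `/* */` comments and collapses
+/// whitespace around braces, parentheses, commas, and colons, while leaving
+/// quoted strings intact.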
+pub fn minify_css(source: &str) -> String {
+ let mut out = String::new();
+ let mut chars = source.chars().peekable();
+
+ let mut function_depth = 0;
+ let mut maybe_in_rule = false;
+
+ while let Some(char) = chars.next() {
+ match char {
+ '/' if chars.peek().copied() == Some('*') => {
+ chars.next();
+
+ while let Some(char) = chars.next() {
+ if char == '*' && chars.peek().copied() == Some('/') {
+ chars.next();
+ break;
+ }
+ }
+ continue;
+ }
+ '\'' | '"' => {
+ out.push(char);
+
+ while let Some(subchar) = chars.next() {
+ out.push(subchar);
+
+ if subchar == char {
+ break;
+ } else if subchar == '\\' {
+ // Push the escaped character too so escapes survive minification
+ if let Some(escaped) = chars.next() {
+ out.push(escaped);
+ }
+ }
+ }
+
+ continue;
+ }
+ '\n' => {
+ while chars.peek().map(|c| c.is_whitespace()).unwrap_or(false) {
+ chars.next();
+ }
+
+ if function_depth > 0 || maybe_in_rule {
+ out.push(' ');
+ }
+ continue;
+ }
+ '(' => {
+ function_depth += 1;
+ out.truncate(out.trim_end().len());
+ while chars.peek().map(|c| c.is_whitespace()).unwrap_or(false) {
+ chars.next();
+ }
+ }
+ ')' => {
+ function_depth -= 1;
+ out.truncate(out.trim_end().len());
+ }
+ '{' | '}' | ',' => {
+ out.truncate(out.trim_end().len());
+ while chars.peek().map(|c| c.is_whitespace()).unwrap_or(false) {
+ chars.next();
+ }
+ }
+ ':' => {
+ while chars.peek().map(|c| c.is_whitespace()).unwrap_or(false) {
+ chars.next();
+ }
+ maybe_in_rule = true;
+ }
+ ';' => maybe_in_rule = false,
+ _ => {}
+ }
+ out.push(char);
+ }
+
+ out
+}
diff --git a/src/minifier/html.rs b/src/minifier/html.rs
new file mode 100644
index 0000000..5080d7d
--- /dev/null
+++ b/src/minifier/html.rs
@@ -0,0 +1,589 @@
+use crate::{line_number_of_offset, minifier, Cow};
+
+macro_rules! log {
+ ($($t:tt)*) => {
+ #[cfg(any(test, feature = "log"))]
+ println!($($t)*);
+ };
+}
+
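+/// Minifies an HTML document by parsing and rewriting one top-level tag at a
+/// time via [`handle_tag`]. `source` is the macro-expanded document to minify;
+/// `original` is the pre-expansion source, used only to report line numbers in
+/// errors.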
+pub fn minify_html(
+ source_path: &str,
+ source: &str,
+ original: &str,
+) -> Result<String, Cow<'static>> {
+ let mut result = String::new();
+ let mut handled_bytes = 0;
+
+ while handled_bytes < source.len() {
+ let (tag, bytes) = handle_tag(
+ source_path,
+ &source[handled_bytes..],
+ (original, handled_bytes),
+ )?;
+ result += &tag;
+ handled_bytes += bytes + 1;
+ }
+
+ Ok(result)
+}
+
+/// Parses an individual HTML tag and minifies it.
+///
+/// This is a rough overview of the strategy this function uses to parse HTML,
+/// handle its edge cases, and then minify it:
+///
+/// DISCLAIMER: The following is what this minifier will do when it's complete. See
+/// the TODO at the bottom for what's not yet implemented.
+///
+/// 1. Tags begin with a <, then have the tag type. If there is whitespace after
+/// the <, it's not considered a tag.
+/// 2. If the tag's type is `!--`, it is a comment and will be removed from
+/// the resulting HTML.
+/// 3. The tag may have properties in the format `name=value`, with optional
+/// whitespace around the `=` and optional quotes around the value. The tag
+/// may also have properties in the format `name`. Properties are never
+/// minimised except to remove whitespace around the `=`.
+/// 4. The tag may be closed with either a `/>`, a closing tag, or may not be
+/// closed properly at all.
+/// 5. If the tag is closed with a closing tag, this function will classify the
+/// tag as either a *text* tag or a *content* tag. Text tags store text (`p`,
+/// `a`, `h1`, etc), while content tags store other HTML elements (`head`,
+/// `body`, etc). If the tag is not closed, this function just returns the
+/// tag. If the tag is a style tag, it will be run through the CSS minifier.
+/// Script tags will only be minified with the `js-minify` feature enabled.
+/// 6. If the tag is a text tag, the only minification that will occur is
+/// removing newlines around tags inside it. Any content tags inside the text
+/// tag will be minified as normal for a content tag.
+/// 7. If the tag is a content tag, newlines will be stripped from it.
+/// Whitespace that isn't in a tag property's value will also be stripped.
+/// Any nested tags inside that content tag will be re-run through this
+/// minifier.
+///
+/// # TODO
+/// - Find a decent JS minifier, add it as a dep, and feature flag it. JS is
+/// too complicated to write a minifier for, when I don't even use it.
+fn handle_tag<'a>(
+ source_path: &'a str,
+ source: &'a str,
+ error_meta: (&'a str, usize),
+) -> Result<(Cow<'a>, usize), Cow<'static>> {
+ if source
+ .chars()
+ .next()
+ .map(|char| char.is_whitespace())
+ .unwrap_or(false)
+ {
+ return Ok((Cow::Borrowed("<"), 1));
+ } else if source.starts_with("<!--") {
+ let Some(ending) = source.find("-->") else {
+ return Err(Cow::Owned(format!(
+ "HTML error: Unclosed HTML comment at {source_path}:{}",
+ line_number_of_offset(error_meta.0, error_meta.1)
+ )));
+ };
+ return Ok((Cow::Borrowed(""), ending + 2));
+ }
+
+ let mut output = String::from("<");
+ let mut chars = source.char_indices().peekable();
+ chars.next(); // discard opening <
+
+ let tag_name_end;
+ let tag_closed;
+ loop {
+ let Some((byte_idx, char)) = chars.next() else {
+ return Ok((Cow::Owned(output), source.len() - 1));
+ };
+
+ if char == '/' && chars.peek().map(|(_, char)| *char) == Some('>') {
+ output += "/>";
+ chars.next();
+ let mut end = byte_idx + 1;
+ if chars.peek().map(|(_, char)| *char) == Some('\n') {
+ while chars
+ .peek()
+ .map(|(_, c)| c.is_whitespace())
+ .unwrap_or(false)
+ {
+ end = chars.next().unwrap().0;
+ }
+ }
+
+ return Ok((Cow::Owned(output), end));
+ } else if char == '>' {
+ tag_name_end = byte_idx;
+ tag_closed = true;
+ break;
+ } else if char.is_whitespace() {
+ tag_name_end = byte_idx;
+ tag_closed = false;
+
+ while chars
+ .peek()
+ .map(|(_, c)| c.is_whitespace())
+ .unwrap_or(false)
+ {
+ chars.next();
+ }
+
+ output.push(' ');
+ break;
+ }
+
+ output.push(char);
+ }
+
+ let tag_name = &source[1..tag_name_end];
+ log!("Parsing tag `{tag_name}`");
+
+ if tag_closed {
+ log!(" Opening tag closed w/o properties");
+ output.push('>');
+ } else {
+ // Each loop parses 1 property
+ 'parse_properties: loop {
+ let Some((byte_idx, char)) = chars.next() else {
+ return Ok((Cow::Owned(output), source.len() - 1));
+ };
+ log!(" 'parse_properties: Found `{char}`");
+
+ if char == '/' && chars.peek().map(|(_, char)| *char) == Some('>') {
+ output += "/>";
+ chars.next();
+ let mut end = byte_idx + 1;
+ if chars.peek().map(|(_, char)| *char) == Some('\n') {
+ while chars
+ .peek()
+ .map(|(_, c)| c.is_whitespace())
+ .unwrap_or(false)
+ {
+ end = chars.next().unwrap().0;
+ }
+ }
+
+ return Ok((Cow::Owned(output), end));
+ } else if char == '>' {
+ log!(" Opening tag closed in 'parsed_properties");
+ output.push('>');
+ break 'parse_properties;
+ } else if char == '\n' {
+ continue 'parse_properties;
+ } else if char.is_whitespace() {
+ while chars
+ .peek()
+ .map(|(_, c)| c.is_whitespace())
+ .unwrap_or(false)
+ {
+ chars.next();
+ }
+ output.push(' ');
+ continue 'parse_properties;
+ }
+
+ // Parse property's name
+ output.push(char);
+ 'parse_property_name: loop {
+ let Some((byte_idx, char)) = chars.next() else {
+ return Ok((Cow::Owned(output), source.len() - 1));
+ };
+
+ if char == '=' {
+ break 'parse_property_name;
+ } else if char == '/' && chars.peek().map(|(_, char)| *char) == Some('>') {
+ output += "/>";
+ chars.next();
+ let mut end = byte_idx + 1;
+ if chars.peek().map(|(_, char)| *char) == Some('\n') {
+ while chars
+ .peek()
+ .map(|(_, c)| c.is_whitespace())
+ .unwrap_or(false)
+ {
+ end = chars.next().unwrap().0;
+ }
+ }
+
+ return Ok((Cow::Owned(output), end));
+ } else if char == '>' {
+ output += ">";
+ break 'parse_properties;
+ } else if char.is_whitespace() {
+ while chars
+ .peek()
+ .map(|(_, c)| c.is_whitespace())
+ .unwrap_or(false)
+ {
+ chars.next();
+ }
+
+ if chars.peek().map(|(_, char)| *char) == Some('=') {
+ // Whitespace followed by an `=` - should break name
+ // parsing and start parsing the value
+ chars.next();
+ break 'parse_property_name;
+ } else {
+ // Whitespace followed by other characters - this
+ // property didn't have a value and we should go parse
+ // the next one
+ output.push(' ');
+ continue 'parse_properties;
+ }
+ }
+
+ output.push(char);
+ }
+
+ // If we get to this point, the property has a value. The chars
+ // iterator will pick up after the =.
+ output.push('=');
+
+ while chars
+ .peek()
+ .map(|(_, c)| c.is_whitespace())
+ .unwrap_or(false)
+ {
+ chars.next();
+ }
+
+ // Parse property's value
+ let Some((idx, char)) = chars.next() else {
+ return Ok((Cow::Owned(output), source.len() - 1));
+ };
+ match char {
+ '\'' | '"' => {
+ output.push(char);
+ loop {
+ let Some((_, next)) = chars.next() else {
+ return Err(Cow::Owned(format!(
+ "Unclosed quotation in HTML property at {source_path}:{}",
+ line_number_of_offset(error_meta.0, error_meta.1 + idx)
+ )));
+ };
+ if next == '\\' {
+ if let Some((_, next)) = chars.next() {
+ output.push(next);
+ }
+ continue;
+ }
+
+ output.push(next);
+
+ if next == char {
+ break;
+ }
+ }
+ }
+ _ => {
+ output.push(char);
+ loop {
+ let Some((idx, char)) = chars.next() else {
+ return Err(Cow::Owned(format!(
+ "Unclosed quotation in HTML property at {source_path}:{}",
+ line_number_of_offset(error_meta.0, error_meta.1 + idx)
+ )));
+ };
+
+ if char == '/' && chars.peek().map(|(_, char)| *char) == Some('>') {
+ output += "/>";
+ chars.next();
+ let mut end = idx + 1;
+ if chars.peek().map(|(_, char)| *char) == Some('\n') {
+ while chars
+ .peek()
+ .map(|(_, c)| c.is_whitespace())
+ .unwrap_or(false)
+ {
+ end = chars.next().unwrap().0;
+ }
+ }
+
+ return Ok((Cow::Owned(output), end));
+ } else if char == '>' {
+ output += ">";
+ break 'parse_properties;
+ } else if char.is_whitespace() {
+ while chars
+ .peek()
+ .map(|(_, c)| c.is_whitespace())
+ .unwrap_or(false)
+ {
+ chars.next();
+ }
+ output.push(char);
+
+ continue 'parse_properties;
+ } else {
+ output.push(char);
+ }
+ }
+ }
+ }
+ }
+ }
+
+ // By now the > of the opening tag has been reached
+ // We need to find the closing tag, then minify the contents of the tag
+ // as needed
+ debug_assert!(output.ends_with('>'), "output is: `{output}`");
+
+ if chars.peek().map(|(_, char)| *char) == Some('\n') {
+ while chars
+ .peek()
+ .map(|(_, char)| char.is_whitespace())
+ .unwrap_or(false)
+ {
+ chars.next();
+ }
+ }
+
+ if tag_name == "script" || tag_name == "style" {
+ // TODO: Actually parse and minify JS
+ let closing_tag = if tag_name == "script" {
+ ""
+ } else {
+ ""
+ };
+ let Some(search_start_idx) = chars.next().map(|(idx, _)| idx) else {
+ return Ok((Cow::Owned(output), source.len() - 1));
+ };
+
+ let Some(closing) = source[search_start_idx..].find(closing_tag) else {
+ return Err(Cow::Owned(format!(
+ "Unclosed style or script tag at {source_path}:{}",
+ line_number_of_offset(error_meta.0, error_meta.1 + search_start_idx)
+ )));
+ };
+ let closing = search_start_idx + closing + closing_tag.len();
+
+ if tag_name == "style" {
+ output += &minifier::minify_css(&source[search_start_idx..closing]);
+ } else {
+ output += &source[search_start_idx..closing];
+ }
+
+ return Ok((Cow::Owned(output), closing));
+ }
+
+ let textual_tag = matches!(
+ tag_name,
+ "a" | "abbr"
+ | "acronym"
+ | "aside"
+ | "b"
+ | "bdi"
+ | "bdo"
+ | "big"
+ | "blockquote"
+ | "button"
+ | "caption"
+ | "cite"
+ | "code"
+ | "dd"
+ | "del"
+ | "details"
+ | "dfn"
+ | "dt"
+ | "em"
+ | "figcaption"
+ | "h1"
+ | "h2"
+ | "h3"
+ | "h4"
+ | "h5"
+ | "h6"
+ | "i"
+ | "ins"
+ | "kbd"
+ | "label"
+ | "legend"
+ | "li"
+ | "mark"
+ | "marquee"
+ | "meter"
+ | "nobr"
+ | "option"
+ | "output"
+ | "p"
+ | "pre"
+ | "progress"
+ | "q"
+ | "rb"
+ | "rp"
+ | "rt"
+ | "s"
+ | "sample"
+ | "small"
+ | "span"
+ | "strong"
+ | "sub"
+ | "summary"
+ | "sup"
+ | "td"
+ | "textarea"
+ | "th"
+ | "time"
+ | "title"
+ | "u"
+ | "var"
+ );
+ let preformatted = tag_name == "pre";
+ log!(" Textual tag? {textual_tag}");
+
+ while let Some((byte_idx, char)) = chars.next() {
+ if char == '<' {
+ let Some((next_idx, next_char)) = chars.peek().copied() else {
+ return Ok((Cow::Owned(output), source.len() - 1));
+ };
+
+ if next_char == '/' {
+ chars.next();
+
+ if !preformatted {
+ let mut trim = false;
+ let mut chars_rev = source[..byte_idx].chars();
+ while let Some(char) = chars_rev.next_back() {
+ if !char.is_whitespace() {
+ break;
+ } else if char == '\n' {
+ trim = true;
+ }
+ }
+ if trim {
+ output = output.trim_end().to_string();
+ }
+ }
+
+ output += "";
+
+ loop {
+ let Some((idx, char)) = chars.next() else {
+ return Err(Cow::Owned(format!(
+ "Unclosed HTML closing tag at {source_path}:{}",
+ line_number_of_offset(error_meta.0, error_meta.1 + next_idx)
+ )));
+ };
+
+ if char == '>' {
+ let mut end = idx;
+ if !preformatted && chars.peek().map(|(_, char)| *char) == Some('\n') {
+ while chars
+ .peek()
+ .map(|(_, c)| c.is_whitespace())
+ .unwrap_or(false)
+ {
+ end = chars.next().unwrap().0;
+ }
+ }
+
+ if output.ends_with(tag_name) {
+ output.push('>');
+ return Ok((Cow::Owned(output), end));
+ } else {
+ output.push('>');
+ break;
+ }
+ }
+
+ if !char.is_whitespace() {
+ output.push(char);
+ }
+ }
+ } else if !next_char.is_whitespace() {
+ let (subtag, used) = handle_tag(source_path, &source[byte_idx..], error_meta)?;
+ log!(" Found subtag `{subtag}`. Ends at {used}, current char is {next_idx}.");
+ let used = used + byte_idx;
+
+ loop {
+ let (next_idx, _) = chars.next().unwrap();
+ if next_idx == used {
+ break;
+ }
+ }
+
+ output += &subtag;
+ } else {
+ output.push('<');
+ }
+ } else if !preformatted {
+ match char {
+ '\n' => {}
+ _ if char.is_whitespace() => {
+ while chars
+ .peek()
+ .map(|(_, char)| char.is_whitespace())
+ .unwrap_or(false)
+ {
+ chars.next();
+ }
+ if textual_tag {
+ output.push(' ');
+ }
+ }
+ _ => output.push(char),
+ }
+ } else {
+ output.push(char);
+ }
+ }
+
+ Ok((Cow::Owned(output), source.len() - 1))
+}
+
+#[cfg(test)]
+mod tests {
+ use crate::minifier::minify_html;
+
+ struct Tester {
+ name: &'static str,
+ source: &'static str,
+ expected: &'static str,
+ }
+ impl Tester {
+ fn test(self) {
+ log!("\nSTARTING TEST '{}'", self.name);
+ let result = minify_html("test/path", self.source, self.source).unwrap();
+ assert_eq!(&result, self.expected, "Test name: {}", self.name);
+ }
+ }
+
+ #[test]
+ fn test() {
+ let cases = [
+ Tester {
+ name: "Trim whitespace between tags",
+ source: "<body> hi </body>",
+ expected: "<body>hi</body>",
+ },
+ Tester {
+ name: "Trim comments",
+ source: "<body>hi<!-- a comment --></body>",
+ expected: "<body>hi</body>",
+ },
+ Tester {
+ name: "Includes whitespace in textual comments",
+ source: "<pre> This has weird whitespace!!!\n</pre>",
+ expected: "<pre> This has weird whitespace!!!\n</pre>",
+ },
+ Tester {
+ name: "Element properties",
+ source: "<p id = \"uwu\">hewwo</p>",
+ expected: "<p id=\"uwu\">hewwo</p>",
+ },
+ Tester {
+ name: "Element properties 2",
+ source: "<p hidden  class = owo>hewwo</p>",
+ expected: "<p hidden class=owo>hewwo</p>",
+ },
+ Tester {
+ name: "Unclosed Elements",
+ source: "<div> hello<br> \n<br>",
+ expected: "<div>hello<br><br>"
+ }
+ ];
+
+ for case in cases {
+ case.test();
+ }
+ }
+}
diff --git a/src/translator.rs b/src/translator.rs
new file mode 100644
index 0000000..d80002b
--- /dev/null
+++ b/src/translator.rs
@@ -0,0 +1,4 @@
+mod gemtext;
+mod markdown;
+
+pub use gemtext::translate_gemtext;
diff --git a/src/translator/gemtext.rs b/src/translator/gemtext.rs
new file mode 100644
index 0000000..1e92c42
--- /dev/null
+++ b/src/translator/gemtext.rs
@@ -0,0 +1,119 @@
+use {
+ crate::Cow,
+ std::{fmt::Write, path::Path},
+};
+
+#[derive(PartialEq, Eq)]
+enum ParserState {
+ Text,
+ List,
+ Preformatted,
+}
+
+/// Escapes characters from an input string so valid Gemtext doesn't get
+/// misinterpreted as HTML.
+//
+// This should prevent any form of HTML injection... but other programs filter
+// more characters than are being filtered here, which should be looked into...
+//
+// Cases covered by Canvas LMS:
+// '&' => *out += "&amp;",
+// '<' => *out += "&lt;",
+// '>' => *out += "&gt;",
+// '"' => *out += "&quot;",
+// '\'' => *out += "&#x27;",
+// '/' => *out += "&#x2F;",
+// '`' => *out += "&#x60;",
+// '=' => *out += "&#x3D;",
+// From https://github.com/instructure/canvas-lms/blob/master/packages/html-escape/index.js#L85
+fn html_escape_into(input: &str, out: &mut String) {
+ for char in input.chars() {
+ match char {
+ '<' => *out += "&lt;",
+ '>' => *out += "&gt;",
+ '"' => *out += "&quot;",
+ '&' => *out += "&amp;",
+ other => out.push(other),
+ }
+ }
+}
+
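+/// Translates a Gemtext document into HTML. Headings, lists, links, quotes,
+/// and preformatted blocks are converted to their HTML equivalents, with their
+/// contents HTML-escaped.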
+pub fn translate_gemtext(source_path: &Path, source: &str) -> Result<String, Cow<'static>> {
+ let mut output = String::new();
+ let mut state = ParserState::Text;
+ output += "";
+
+ for (line_num, line) in source.lines().enumerate() {
+ if state == ParserState::Preformatted {
+ if line.starts_with("```") {
+ state = ParserState::Text;
+ output += "";
+ continue;
+ }
+
+ html_escape_into(line, &mut output);
+ continue;
+ }
+
+ if let Some(list_line) = line.strip_prefix("* ") {
+ if state != ParserState::List {
+ state = ParserState::List;
+ output += "
";
+ }
+ output += "- ";
+ html_escape_into(list_line, &mut output);
+ output += "
";
+ continue;
+ } else if state == ParserState::List {
+ state = ParserState::Text;
+ output += "
";
+ }
+
+ if let Some(link_line) = line.strip_prefix("=>") {
+ let mut line = link_line.split_whitespace();
+ let link = line.next().ok_or(Cow::Owned(format!(
+ "Expected URL in link at {source_path:?}:{line_num}"
+ )))?;
+
+ output += "";
+
+ if let Some(link_text) = line.next() {
+ html_escape_into(link_text, &mut output);
+ } else {
+ html_escape_into(link, &mut output);
+ }
+
+ output += "
";
+ } else if let Some(alt) = line.strip_prefix("```") {
+ output += "";
+ state = ParserState::Preformatted;
+ } else if let Some(quote) = line.strip_prefix("> ") {
+ output += "";
+ html_escape_into(quote, &mut output);
+ output += "
";
+ } else if line.starts_with('#') {
+ let mut chars = line.bytes();
+ let mut level = 0;
+ while chars.next() == Some(b'#') {
+ level += 1;
+ }
+
+ write!(output, "").unwrap();
+ html_escape_into(line[level..].trim_start(), &mut output);
+ write!(output, "").unwrap();
+ } else {
+ output += line;
+ }
+ }
+
+ if state == ParserState::List {
+ output += "";
+ }
+
+ output += "
";
+ Ok(output)
+}
diff --git a/src/translator/markdown.rs b/src/translator/markdown.rs
new file mode 100644
index 0000000..e69de29
diff --git a/tests/gemtext.rs b/tests/gemtext.rs
new file mode 100644
index 0000000..cfbe520
--- /dev/null
+++ b/tests/gemtext.rs
@@ -0,0 +1,21 @@
+use {
+ std::{fs, path::PathBuf},
+ webby::translator,
+};
+
+#[test]
+fn test() {
+ let tests = ["link", "header", "text", "list"];
+
+ for test in tests {
+ let gmi_path = PathBuf::from(format!("tests/gemtext/{test}.gmi"));
+ let html_path = PathBuf::from(format!("tests/gemtext/{test}.html"));
+ let html =
+ translator::translate_gemtext(&gmi_path, &fs::read_to_string(&gmi_path).unwrap())
+ .unwrap();
+ assert_eq!(
+ html,
+ format!("{}
", fs::read_to_string(&html_path).unwrap())
+ )
+ }
+}
diff --git a/tests/gemtext/header.gmi b/tests/gemtext/header.gmi
new file mode 100644
index 0000000..5da5539
--- /dev/null
+++ b/tests/gemtext/header.gmi
@@ -0,0 +1,3 @@
+# header1
+## header2
+### header3
diff --git a/tests/gemtext/header.html b/tests/gemtext/header.html
new file mode 100644
index 0000000..ff686c4
--- /dev/null
+++ b/tests/gemtext/header.html
@@ -0,0 +1 @@
+<h1>header1</h1><h2>header2</h2><h3>header3</h3>
\ No newline at end of file
diff --git a/tests/gemtext/link.gmi b/tests/gemtext/link.gmi
new file mode 100644
index 0000000..e02069a
--- /dev/null
+++ b/tests/gemtext/link.gmi
@@ -0,0 +1,4 @@
+=> https://google.com
+=> https://google.com google
+=> https://google.com
+=> https://google.com google
diff --git a/tests/gemtext/link.html b/tests/gemtext/link.html
new file mode 100644
index 0000000..9e84ed5
--- /dev/null
+++ b/tests/gemtext/link.html
@@ -0,0 +1 @@
+<a href="https://google.com">https://google.com</a><br/><a href="https://google.com">google</a><br/><a href="https://google.com">https://google.com</a><br/><a href="https://google.com">google</a><br/>
\ No newline at end of file
diff --git a/tests/gemtext/list.gmi b/tests/gemtext/list.gmi
new file mode 100644
index 0000000..e61e1d4
--- /dev/null
+++ b/tests/gemtext/list.gmi
@@ -0,0 +1,3 @@
+* one
+* two
+* three
diff --git a/tests/gemtext/list.html b/tests/gemtext/list.html
new file mode 100644
index 0000000..a66f58a
--- /dev/null
+++ b/tests/gemtext/list.html
@@ -0,0 +1 @@
+<ul><li>one</li><li>two</li><li>three</li></ul>
\ No newline at end of file
diff --git a/tests/gemtext/text.gmi b/tests/gemtext/text.gmi
new file mode 100644
index 0000000..840b569
--- /dev/null
+++ b/tests/gemtext/text.gmi
@@ -0,0 +1 @@
+ipsum lorem I don't speak latinum
diff --git a/tests/gemtext/text.html b/tests/gemtext/text.html
new file mode 100644
index 0000000..02ee347
--- /dev/null
+++ b/tests/gemtext/text.html
@@ -0,0 +1 @@
+ipsum lorem I don't speak latinum
\ No newline at end of file