From d4271b0e01143de4199b579b84dd2003e8a58afd Mon Sep 17 00:00:00 2001 From: Felix Zeller Date: Mon, 18 Mar 2024 21:26:20 -0400 Subject: [PATCH] completion refactoring and implementation --- Cargo.lock | 5 +- Cargo.toml | 2 +- TestFiles/A | 1 + TestFiles/Another Test.md | 3 +- TestFiles/folder/Fiile.md | 6 +- matcher/.gitignore | 1 - matcher/Cargo.lock | 82 -- matcher/Cargo.toml | 23 - matcher/LICENSE | 1 - matcher/fuzz.sh | 3 - matcher/fuzz/.gitignore | 4 - matcher/fuzz/Cargo.toml | 29 - matcher/fuzz/fuzz_targets/fuzz_target_1.rs | 78 -- matcher/generate_case_fold_table.sh | 13 - matcher/src/chars.rs | 200 ----- matcher/src/chars/case_fold.rs | 347 --------- matcher/src/chars/normalize.rs | 526 ------------- matcher/src/config.rs | 70 -- matcher/src/debug.rs | 32 - matcher/src/exact.rs | 274 ------- matcher/src/fuzzy_greedy.rs | 51 -- matcher/src/fuzzy_optimal.rs | 348 --------- matcher/src/lib.rs | 759 ------------------ matcher/src/matrix.rs | 198 ----- matcher/src/pattern.rs | 568 -------------- matcher/src/pattern/tests.rs | 114 --- matcher/src/prefilter.rs | 95 --- matcher/src/score.rs | 158 ---- matcher/src/tests.rs | 728 ------------------ matcher/src/utf32_str.rs | 355 --------- src/completion.rs | 847 --------------------- src/completion/link_completer.rs | 382 ++++++++++ src/completion/matcher.rs | 91 +++ src/completion/mod.rs | 774 +++++++++++++++++++ src/main.rs | 63 +- src/vault/mod.rs | 11 +- src/vault/referenceable.rs | 0 37 files changed, 1308 insertions(+), 5934 deletions(-) create mode 100644 TestFiles/A delete mode 100644 matcher/.gitignore delete mode 100644 matcher/Cargo.lock delete mode 100644 matcher/Cargo.toml delete mode 120000 matcher/LICENSE delete mode 100755 matcher/fuzz.sh delete mode 100644 matcher/fuzz/.gitignore delete mode 100644 matcher/fuzz/Cargo.toml delete mode 100644 matcher/fuzz/fuzz_targets/fuzz_target_1.rs delete mode 100755 matcher/generate_case_fold_table.sh delete mode 100644 matcher/src/chars.rs delete mode 100644 matcher/src/chars/case_fold.rs delete mode 100644 matcher/src/chars/normalize.rs delete mode 100644 matcher/src/config.rs delete mode 100644 matcher/src/debug.rs delete mode 100644 matcher/src/exact.rs delete mode 100644 matcher/src/fuzzy_greedy.rs delete mode 100644 matcher/src/fuzzy_optimal.rs delete mode 100644 matcher/src/lib.rs delete mode 100644 matcher/src/matrix.rs delete mode 100644 matcher/src/pattern.rs delete mode 100644 matcher/src/pattern/tests.rs delete mode 100644 matcher/src/prefilter.rs delete mode 100644 matcher/src/score.rs delete mode 100644 matcher/src/tests.rs delete mode 100644 matcher/src/utf32_str.rs delete mode 100644 src/completion.rs create mode 100644 src/completion/link_completer.rs create mode 100644 src/completion/matcher.rs create mode 100644 src/completion/mod.rs create mode 100644 src/vault/referenceable.rs diff --git a/Cargo.lock b/Cargo.lock index 7e9a54a2..15117396 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -406,10 +406,11 @@ dependencies = [ [[package]] name = "nucleo-matcher" -version = "0.3.0" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bf33f538733d1a5a3494b836ba913207f14d9d4a1d3cd67030c5061bdd2cac85" dependencies = [ "memchr", - "rayon", "unicode-segmentation", ] diff --git a/Cargo.toml b/Cargo.toml index ba23ea0a..4eab073e 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -9,7 +9,7 @@ edition = "2021" anyhow = "1.0.80" itertools = "0.10.5" nanoid = "0.4.0" -nucleo-matcher = { path = "./matcher" } +nucleo-matcher = "0.3.1" once_cell = "1.18.0" pathdiff = "0.2.1" rayon = "1.7.0" diff --git a/TestFiles/A b/TestFiles/A new file mode 100644 index 00000000..8b137891 --- /dev/null +++ b/TestFiles/A @@ -0,0 +1 @@ + diff --git a/TestFiles/Another Test.md b/TestFiles/Another Test.md index 00ffdd49..977d0fd2 100644 --- a/TestFiles/Another Test.md +++ b/TestFiles/Another Test.md @@ -1,8 +1,7 @@ xxf jdsklfjdsklf dskljfj dsklf jslkd jfklsd [[Another Test]] fjskadlf jkdsl - - +f jslk []() fj dalkfdj diff --git a/TestFiles/folder/Fiile.md b/TestFiles/folder/Fiile.md index 6295ff8e..a4ea7f98 100644 --- a/TestFiles/folder/Fiile.md +++ b/TestFiles/folder/Fiile.md @@ -1,5 +1,8 @@ -[[Another Test]] + +jf ldjfla sfkjl d [[Another Test]] [Another]() + + # Heading @@ -14,4 +17,3 @@ more footntoe[^2] [^2]: This is a different footntoe -[[folder/Fiile#ZxY7P]] diff --git a/matcher/.gitignore b/matcher/.gitignore deleted file mode 100644 index eb5a316c..00000000 --- a/matcher/.gitignore +++ /dev/null @@ -1 +0,0 @@ -target diff --git a/matcher/Cargo.lock b/matcher/Cargo.lock deleted file mode 100644 index 073a0bd3..00000000 --- a/matcher/Cargo.lock +++ /dev/null @@ -1,82 +0,0 @@ -# This file is automatically @generated by Cargo. -# It is not intended for manual editing. -version = 3 - -[[package]] -name = "cov-mark" -version = "1.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9ffa3d3e0138386cd4361f63537765cac7ee40698028844635a54495a92f67f3" - -[[package]] -name = "crossbeam-deque" -version = "0.8.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "613f8cc01fe9cf1a3eb3d7f488fd2fa8388403e97039e2f73692932e291a770d" -dependencies = [ - "crossbeam-epoch", - "crossbeam-utils", -] - -[[package]] -name = "crossbeam-epoch" -version = "0.9.18" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5b82ac4a3c2ca9c3460964f020e1402edd5753411d7737aa39c3714ad1b5420e" -dependencies = [ - "crossbeam-utils", -] - -[[package]] -name = "crossbeam-utils" -version = "0.8.19" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "248e3bacc7dc6baa3b21e405ee045c3047101a49145e7e9eca583ab4c2ca5345" - -[[package]] -name = "either" -version = "1.10.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "11157ac094ffbdde99aa67b23417ebdd801842852b500e395a45a9c0aac03e4a" - -[[package]] -name = "memchr" -version = "2.7.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "523dc4f511e55ab87b694dc30d0f820d60906ef06413f93d4d7a1385599cc149" - -[[package]] -name = "nucleo-matcher" -version = "0.3.0" -dependencies = [ - "cov-mark", - "memchr", - "rayon", - "unicode-segmentation", -] - -[[package]] -name = "rayon" -version = "1.8.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fa7237101a77a10773db45d62004a272517633fbcc3df19d96455ede1122e051" -dependencies = [ - "either", - "rayon-core", -] - -[[package]] -name = "rayon-core" -version = "1.12.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1465873a3dfdaa8ae7cb14b4383657caab0b3e8a0aa9ae8e04b044854c8dfce2" -dependencies = [ - "crossbeam-deque", - "crossbeam-utils", -] - -[[package]] -name = "unicode-segmentation" -version = "1.11.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d4c87d22b6e3f4a18d4d40ef354e97c90fcb14dd91d7dc0aa9d8a1172ebf7202" diff --git a/matcher/Cargo.toml b/matcher/Cargo.toml deleted file mode 100644 index a9b6fd97..00000000 --- a/matcher/Cargo.toml +++ /dev/null @@ -1,23 +0,0 @@ -[package] -name = "nucleo-matcher" -description = "plug and play high performance fuzzy matcher" -authors = ["Pascal Kuthe "] -version = "0.3.0" -edition = "2021" -license = "MPL-2.0" -repository = "https://github.com/helix-editor/nucleo" -readme = "../README.md" - -[dependencies] -memchr = "2.5.0" -rayon = "*" -unicode-segmentation = { version = "1.10", optional = true } - -[features] -default = ["unicode-normalization", "unicode-casefold", "unicode-segmentation"] -unicode-normalization = [] -unicode-casefold = [] -unicode-segmentation = ["dep:unicode-segmentation"] - -[dev-dependencies] -cov-mark = { version = "1.1.0", default-features = true } diff --git a/matcher/LICENSE b/matcher/LICENSE deleted file mode 120000 index ea5b6064..00000000 --- a/matcher/LICENSE +++ /dev/null @@ -1 +0,0 @@ -../LICENSE \ No newline at end of file diff --git a/matcher/fuzz.sh b/matcher/fuzz.sh deleted file mode 100755 index d3ffa2c9..00000000 --- a/matcher/fuzz.sh +++ /dev/null @@ -1,3 +0,0 @@ -#!/usr/bin/env bash - -cargo +nightly fuzz "${1}" fuzz_target_1 "${@:2:99}" diff --git a/matcher/fuzz/.gitignore b/matcher/fuzz/.gitignore deleted file mode 100644 index 1a45eee7..00000000 --- a/matcher/fuzz/.gitignore +++ /dev/null @@ -1,4 +0,0 @@ -target -corpus -artifacts -coverage diff --git a/matcher/fuzz/Cargo.toml b/matcher/fuzz/Cargo.toml deleted file mode 100644 index 1b9d8a7f..00000000 --- a/matcher/fuzz/Cargo.toml +++ /dev/null @@ -1,29 +0,0 @@ -[package] -name = "fzf_oxide-fuzz" -version = "0.0.0" -publish = false -edition = "2021" - -[package.metadata] -cargo-fuzz = true - -[dependencies] -libfuzzer-sys = "0.4" -arbitrary = { version = "1", features = ["derive"] } - -[dependencies.fzf_oxide] -path = ".." - -# Prevent this from interfering with workspaces -[workspace] -members = ["."] - -[profile.release] -debug = 1 - -[[bin]] -name = "fuzz_target_1" -path = "fuzz_targets/fuzz_target_1.rs" -test = false -doc = false - diff --git a/matcher/fuzz/fuzz_targets/fuzz_target_1.rs b/matcher/fuzz/fuzz_targets/fuzz_target_1.rs deleted file mode 100644 index d9df7d36..00000000 --- a/matcher/fuzz/fuzz_targets/fuzz_target_1.rs +++ /dev/null @@ -1,78 +0,0 @@ -#![no_main] - -use fzf_oxide::{chars, Matcher, MatcherConfig, Utf32Str}; -use libfuzzer_sys::arbitrary::Arbitrary; -use libfuzzer_sys::fuzz_target; - -#[derive(Arbitrary, Debug)] -pub struct Input<'a> { - haystack: &'a str, - needle: &'a str, - ignore_case: bool, - normalize: bool, -} - -fuzz_target!(|data: Input<'_>| { - let mut data = data; - let mut config = MatcherConfig::DEFAULT; - config.ignore_case = data.ignore_case; - config.normalize = data.normalize; - let mut matcher = Matcher::new(config); - let mut indices_optimal = Vec::new(); - let mut indices_greedy = Vec::new(); - let mut needle_buf = Vec::new(); - let mut haystack_buf = Vec::new(); - let normalize = |mut c: char| { - if config.normalize { - c = chars::normalize(c); - } - if config.ignore_case { - c = chars::to_lower_case(c); - } - c - }; - let needle: String = data.needle.chars().map(normalize).collect(); - let needle_chars: Vec<_> = needle.chars().collect(); - let needle = Utf32Str::new(&needle, &mut needle_buf); - let haystack = Utf32Str::new(data.haystack, &mut haystack_buf); - - let greedy_score = matcher.fuzzy_indices_greedy(haystack, needle, &mut indices_greedy); - if greedy_score.is_some() { - let match_chars: Vec<_> = indices_greedy - .iter() - .map(|&i| normalize(haystack.get(i))) - .collect(); - assert_eq!( - match_chars, needle_chars, - "failed match, found {indices_greedy:?} {match_chars:?} (greedy)" - ); - } - let optimal_score = matcher.fuzzy_indices(haystack, needle, &mut indices_optimal); - if optimal_score.is_some() { - let match_chars: Vec<_> = indices_optimal - .iter() - .map(|&i| normalize(haystack.get(i))) - .collect(); - assert_eq!( - match_chars, needle_chars, - "failed match, found {indices_optimal:?} {match_chars:?}" - ); - } - match (greedy_score, optimal_score) { - (None, Some(score)) => unreachable!("optimal matched {score} but greedy did not match"), - (Some(score), None) => unreachable!("greedy matched {score} but optimal did not match"), - (Some(greedy), Some(optimal)) => { - assert!( - greedy <= optimal, - "optimal score must be atleast the same as greedy score {greedy} {optimal}" - ); - if indices_greedy == indices_optimal { - assert_eq!( - greedy, optimal, - "if matching same char greedy and optimal score should be identical" - ) - } - } - (None, None) => (), - } -}); diff --git a/matcher/generate_case_fold_table.sh b/matcher/generate_case_fold_table.sh deleted file mode 100755 index 32a26697..00000000 --- a/matcher/generate_case_fold_table.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/usr/bin/env bash -set -e - -dir=$(pwd) -mkdir /tmp/ucd-15.0.0 -cd /tmp/ucd-15.0.0 -curl -LO https://www.unicode.org/Public/zipped/15.0.0/UCD.zip -unzip UCD.zip - -cd "${dir}" -cargo install ucd-generate -ucd-generate case-folding-simple /tmp/ucd-15.0.0 --chars > src/chars/case_fold.rs -rm -rf /tmp/ucd-15.0.0 diff --git a/matcher/src/chars.rs b/matcher/src/chars.rs deleted file mode 100644 index 53555f50..00000000 --- a/matcher/src/chars.rs +++ /dev/null @@ -1,200 +0,0 @@ -//! Utilities for working with (unicode) characters/codepoints - -use std::fmt::{self, Debug, Display}; - -#[cfg(feature = "unicode-casefold")] -use crate::chars::case_fold::CASE_FOLDING_SIMPLE; -use crate::Config; - -//autogenerated by generate-ucd -#[allow(warnings)] -#[rustfmt::skip] -#[cfg(feature = "unicode-casefold")] -mod case_fold; -#[cfg(feature = "unicode-normalization")] -mod normalize; - -pub(crate) trait Char: Copy + Eq + Ord + fmt::Display { - const ASCII: bool; - fn char_class(self, config: &Config) -> CharClass; - fn char_class_and_normalize(self, config: &Config) -> (Self, CharClass); - fn normalize(self, config: &Config) -> Self; -} - -/// repr tansparent wrapper around u8 with better formatting and `PartialEq` implementation -#[repr(transparent)] -#[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Copy)] -pub(crate) struct AsciiChar(pub u8); - -impl AsciiChar { - pub fn cast(bytes: &[u8]) -> &[AsciiChar] { - unsafe { &*(bytes as *const [u8] as *const [AsciiChar]) } - } -} - -impl fmt::Display for AsciiChar { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - Display::fmt(&(self.0 as char), f) - } -} - -impl PartialEq for char { - fn eq(&self, other: &AsciiChar) -> bool { - other.0 as char == *self - } -} - -impl Char for AsciiChar { - const ASCII: bool = true; - #[inline] - fn char_class(self, config: &Config) -> CharClass { - let c = self.0; - // using manual if conditions instead optimizes better - if c >= b'a' && c <= b'z' { - CharClass::Lower - } else if c >= b'A' && c <= b'Z' { - CharClass::Upper - } else if c >= b'0' && c <= b'9' { - CharClass::Number - } else if c.is_ascii_whitespace() { - CharClass::Whitespace - } else if config.delimiter_chars.contains(&c) { - CharClass::Delimiter - } else { - CharClass::NonWord - } - } - - #[inline(always)] - fn char_class_and_normalize(mut self, config: &Config) -> (Self, CharClass) { - let char_class = self.char_class(config); - if config.ignore_case && char_class == CharClass::Upper { - self.0 += 32 - } - (self, char_class) - } - - #[inline(always)] - fn normalize(mut self, config: &Config) -> Self { - if config.ignore_case && self.0 >= b'A' && self.0 <= b'Z' { - self.0 += 32 - } - self - } -} -fn char_class_non_ascii(c: char) -> CharClass { - if c.is_lowercase() { - CharClass::Lower - } else if is_upper_case(c) { - CharClass::Upper - } else if c.is_numeric() { - CharClass::Number - } else if c.is_alphabetic() { - CharClass::Letter - } else if c.is_whitespace() { - CharClass::Whitespace - } else { - CharClass::NonWord - } -} -impl Char for char { - const ASCII: bool = false; - #[inline(always)] - fn char_class(self, config: &Config) -> CharClass { - if self.is_ascii() { - return AsciiChar(self as u8).char_class(config); - } - char_class_non_ascii(self) - } - - #[inline(always)] - fn char_class_and_normalize(mut self, config: &Config) -> (Self, CharClass) { - if self.is_ascii() { - let (c, class) = AsciiChar(self as u8).char_class_and_normalize(config); - return (c.0 as char, class); - } - let char_class = char_class_non_ascii(self); - #[cfg(feature = "unicode-casefold")] - let mut case_fold = char_class == CharClass::Upper; - #[cfg(feature = "unicode-normalization")] - if config.normalize { - self = normalize::normalize(self); - case_fold = true - } - #[cfg(feature = "unicode-casefold")] - if case_fold && config.ignore_case { - self = CASE_FOLDING_SIMPLE - .binary_search_by_key(&self, |(upper, _)| *upper) - .map_or(self, |idx| CASE_FOLDING_SIMPLE[idx].1) - } - (self, char_class) - } - - #[inline(always)] - fn normalize(mut self, config: &Config) -> Self { - #[cfg(feature = "unicode-normalization")] - if config.normalize { - self = normalize::normalize(self); - } - #[cfg(feature = "unicode-casefold")] - if config.ignore_case { - self = to_lower_case(self) - } - self - } -} - -#[cfg(feature = "unicode-normalization")] -pub use normalize::normalize; -#[cfg(feature = "unicode-segmentation")] -use unicode_segmentation::UnicodeSegmentation; - -/// Converts a character to lower case using simple unicode case folding -#[cfg(feature = "unicode-casefold")] -#[inline(always)] -pub fn to_lower_case(c: char) -> char { - CASE_FOLDING_SIMPLE - .binary_search_by_key(&c, |(upper, _)| *upper) - .map_or(c, |idx| CASE_FOLDING_SIMPLE[idx].1) -} - -/// Checks if a character is upper case according to simple unicode case folding. -/// if the `unicode-casefold` feature is disable the equivalent std function is used -#[inline(always)] -pub fn is_upper_case(c: char) -> bool { - #[cfg(feature = "unicode-casefold")] - let val = CASE_FOLDING_SIMPLE - .binary_search_by_key(&c, |(upper, _)| *upper) - .is_ok(); - #[cfg(not(feature = "unicode-casefold"))] - let val = c.is_uppercase(); - val -} - -#[derive(Debug, Eq, PartialEq, PartialOrd, Ord, Copy, Clone, Hash)] -pub(crate) enum CharClass { - Whitespace, - NonWord, - Delimiter, - Lower, - Upper, - Letter, - Number, -} - -/// Nucleo cannot match graphemes as single units. To work around -/// that we only use the first codepoint of each grapheme. This -/// iterator returns the first character of each unicode grapheme -/// in a string and is used for constructing `Utf32Str(ing)`. -pub fn graphemes(text: &str) -> impl Iterator + '_ { - #[cfg(feature = "unicode-segmentation")] - let res = text.graphemes(true).map(|grapheme| { - grapheme - .chars() - .next() - .expect("graphemes must be non-empty") - }); - #[cfg(not(feature = "unicode-segmentation"))] - let res = text.chars(); - res -} diff --git a/matcher/src/chars/case_fold.rs b/matcher/src/chars/case_fold.rs deleted file mode 100644 index aacbe461..00000000 --- a/matcher/src/chars/case_fold.rs +++ /dev/null @@ -1,347 +0,0 @@ -// DO NOT EDIT THIS FILE. IT WAS AUTOMATICALLY GENERATED BY: -// -// ucd-generate case-folding-simple /tmp/ucd-15.0.0 --chars -// -// Unicode version: 15.0.0. -// -// ucd-generate 0.3.0 is available on crates.io. - -pub const CASE_FOLDING_SIMPLE: &'static [(char, char)] = &[ - ('A', 'a'), ('B', 'b'), ('C', 'c'), ('D', 'd'), ('E', 'e'), ('F', 'f'), - ('G', 'g'), ('H', 'h'), ('I', 'i'), ('J', 'j'), ('K', 'k'), ('L', 'l'), - ('M', 'm'), ('N', 'n'), ('O', 'o'), ('P', 'p'), ('Q', 'q'), ('R', 'r'), - ('S', 's'), ('T', 't'), ('U', 'u'), ('V', 'v'), ('W', 'w'), ('X', 'x'), - ('Y', 'y'), ('Z', 'z'), ('µ', 'μ'), ('À', 'à'), ('Á', 'á'), - ('Â', 'â'), ('Ã', 'ã'), ('Ä', 'ä'), ('Å', 'å'), ('Æ', 'æ'), - ('Ç', 'ç'), ('È', 'è'), ('É', 'é'), ('Ê', 'ê'), ('Ë', 'ë'), - ('Ì', 'ì'), ('Í', 'í'), ('Î', 'î'), ('Ï', 'ï'), ('Ð', 'ð'), - ('Ñ', 'ñ'), ('Ò', 'ò'), ('Ó', 'ó'), ('Ô', 'ô'), ('Õ', 'õ'), - ('Ö', 'ö'), ('Ø', 'ø'), ('Ù', 'ù'), ('Ú', 'ú'), ('Û', 'û'), - ('Ü', 'ü'), ('Ý', 'ý'), ('Þ', 'þ'), ('Ā', 'ā'), ('Ă', 'ă'), - ('Ą', 'ą'), ('Ć', 'ć'), ('Ĉ', 'ĉ'), ('Ċ', 'ċ'), ('Č', 'č'), - ('Ď', 'ď'), ('Đ', 'đ'), ('Ē', 'ē'), ('Ĕ', 'ĕ'), ('Ė', 'ė'), - ('Ę', 'ę'), ('Ě', 'ě'), ('Ĝ', 'ĝ'), ('Ğ', 'ğ'), ('Ġ', 'ġ'), - ('Ģ', 'ģ'), ('Ĥ', 'ĥ'), ('Ħ', 'ħ'), ('Ĩ', 'ĩ'), ('Ī', 'ī'), - ('Ĭ', 'ĭ'), ('Į', 'į'), ('IJ', 'ij'), ('Ĵ', 'ĵ'), ('Ķ', 'ķ'), - ('Ĺ', 'ĺ'), ('Ļ', 'ļ'), ('Ľ', 'ľ'), ('Ŀ', 'ŀ'), ('Ł', 'ł'), - ('Ń', 'ń'), ('Ņ', 'ņ'), ('Ň', 'ň'), ('Ŋ', 'ŋ'), ('Ō', 'ō'), - ('Ŏ', 'ŏ'), ('Ő', 'ő'), ('Œ', 'œ'), ('Ŕ', 'ŕ'), ('Ŗ', 'ŗ'), - ('Ř', 'ř'), ('Ś', 'ś'), ('Ŝ', 'ŝ'), ('Ş', 'ş'), ('Š', 'š'), - ('Ţ', 'ţ'), ('Ť', 'ť'), ('Ŧ', 'ŧ'), ('Ũ', 'ũ'), ('Ū', 'ū'), - ('Ŭ', 'ŭ'), ('Ů', 'ů'), ('Ű', 'ű'), ('Ų', 'ų'), ('Ŵ', 'ŵ'), - ('Ŷ', 'ŷ'), ('Ÿ', 'ÿ'), ('Ź', 'ź'), ('Ż', 'ż'), ('Ž', 'ž'), - ('ſ', 's'), ('Ɓ', 'ɓ'), ('Ƃ', 'ƃ'), ('Ƅ', 'ƅ'), ('Ɔ', 'ɔ'), - ('Ƈ', 'ƈ'), ('Ɖ', 'ɖ'), ('Ɗ', 'ɗ'), ('Ƌ', 'ƌ'), ('Ǝ', 'ǝ'), - ('Ə', 'ə'), ('Ɛ', 'ɛ'), ('Ƒ', 'ƒ'), ('Ɠ', 'ɠ'), ('Ɣ', 'ɣ'), - ('Ɩ', 'ɩ'), ('Ɨ', 'ɨ'), ('Ƙ', 'ƙ'), ('Ɯ', 'ɯ'), ('Ɲ', 'ɲ'), - ('Ɵ', 'ɵ'), ('Ơ', 'ơ'), ('Ƣ', 'ƣ'), ('Ƥ', 'ƥ'), ('Ʀ', 'ʀ'), - ('Ƨ', 'ƨ'), ('Ʃ', 'ʃ'), ('Ƭ', 'ƭ'), ('Ʈ', 'ʈ'), ('Ư', 'ư'), - ('Ʊ', 'ʊ'), ('Ʋ', 'ʋ'), ('Ƴ', 'ƴ'), ('Ƶ', 'ƶ'), ('Ʒ', 'ʒ'), - ('Ƹ', 'ƹ'), ('Ƽ', 'ƽ'), ('DŽ', 'dž'), ('Dž', 'dž'), ('LJ', 'lj'), - ('Lj', 'lj'), ('NJ', 'nj'), ('Nj', 'nj'), ('Ǎ', 'ǎ'), ('Ǐ', 'ǐ'), - ('Ǒ', 'ǒ'), ('Ǔ', 'ǔ'), ('Ǖ', 'ǖ'), ('Ǘ', 'ǘ'), ('Ǚ', 'ǚ'), - ('Ǜ', 'ǜ'), ('Ǟ', 'ǟ'), ('Ǡ', 'ǡ'), ('Ǣ', 'ǣ'), ('Ǥ', 'ǥ'), - ('Ǧ', 'ǧ'), ('Ǩ', 'ǩ'), ('Ǫ', 'ǫ'), ('Ǭ', 'ǭ'), ('Ǯ', 'ǯ'), - ('DZ', 'dz'), ('Dz', 'dz'), ('Ǵ', 'ǵ'), ('Ƕ', 'ƕ'), ('Ƿ', 'ƿ'), - ('Ǹ', 'ǹ'), ('Ǻ', 'ǻ'), ('Ǽ', 'ǽ'), ('Ǿ', 'ǿ'), ('Ȁ', 'ȁ'), - ('Ȃ', 'ȃ'), ('Ȅ', 'ȅ'), ('Ȇ', 'ȇ'), ('Ȉ', 'ȉ'), ('Ȋ', 'ȋ'), - ('Ȍ', 'ȍ'), ('Ȏ', 'ȏ'), ('Ȑ', 'ȑ'), ('Ȓ', 'ȓ'), ('Ȕ', 'ȕ'), - ('Ȗ', 'ȗ'), ('Ș', 'ș'), ('Ț', 'ț'), ('Ȝ', 'ȝ'), ('Ȟ', 'ȟ'), - ('Ƞ', 'ƞ'), ('Ȣ', 'ȣ'), ('Ȥ', 'ȥ'), ('Ȧ', 'ȧ'), ('Ȩ', 'ȩ'), - ('Ȫ', 'ȫ'), ('Ȭ', 'ȭ'), ('Ȯ', 'ȯ'), ('Ȱ', 'ȱ'), ('Ȳ', 'ȳ'), - ('Ⱥ', 'ⱥ'), ('Ȼ', 'ȼ'), ('Ƚ', 'ƚ'), ('Ⱦ', 'ⱦ'), ('Ɂ', 'ɂ'), - ('Ƀ', 'ƀ'), ('Ʉ', 'ʉ'), ('Ʌ', 'ʌ'), ('Ɇ', 'ɇ'), ('Ɉ', 'ɉ'), - ('Ɋ', 'ɋ'), ('Ɍ', 'ɍ'), ('Ɏ', 'ɏ'), ('\u{345}', 'ι'), ('Ͱ', 'ͱ'), - ('Ͳ', 'ͳ'), ('Ͷ', 'ͷ'), ('Ϳ', 'ϳ'), ('Ά', 'ά'), ('Έ', 'έ'), - ('Ή', 'ή'), ('Ί', 'ί'), ('Ό', 'ό'), ('Ύ', 'ύ'), ('Ώ', 'ώ'), - ('Α', 'α'), ('Β', 'β'), ('Γ', 'γ'), ('Δ', 'δ'), ('Ε', 'ε'), - ('Ζ', 'ζ'), ('Η', 'η'), ('Θ', 'θ'), ('Ι', 'ι'), ('Κ', 'κ'), - ('Λ', 'λ'), ('Μ', 'μ'), ('Ν', 'ν'), ('Ξ', 'ξ'), ('Ο', 'ο'), - ('Π', 'π'), ('Ρ', 'ρ'), ('Σ', 'σ'), ('Τ', 'τ'), ('Υ', 'υ'), - ('Φ', 'φ'), ('Χ', 'χ'), ('Ψ', 'ψ'), ('Ω', 'ω'), ('Ϊ', 'ϊ'), - ('Ϋ', 'ϋ'), ('ς', 'σ'), ('Ϗ', 'ϗ'), ('ϐ', 'β'), ('ϑ', 'θ'), - ('ϕ', 'φ'), ('ϖ', 'π'), ('Ϙ', 'ϙ'), ('Ϛ', 'ϛ'), ('Ϝ', 'ϝ'), - ('Ϟ', 'ϟ'), ('Ϡ', 'ϡ'), ('Ϣ', 'ϣ'), ('Ϥ', 'ϥ'), ('Ϧ', 'ϧ'), - ('Ϩ', 'ϩ'), ('Ϫ', 'ϫ'), ('Ϭ', 'ϭ'), ('Ϯ', 'ϯ'), ('ϰ', 'κ'), - ('ϱ', 'ρ'), ('ϴ', 'θ'), ('ϵ', 'ε'), ('Ϸ', 'ϸ'), ('Ϲ', 'ϲ'), - ('Ϻ', 'ϻ'), ('Ͻ', 'ͻ'), ('Ͼ', 'ͼ'), ('Ͽ', 'ͽ'), ('Ѐ', 'ѐ'), - ('Ё', 'ё'), ('Ђ', 'ђ'), ('Ѓ', 'ѓ'), ('Є', 'є'), ('Ѕ', 'ѕ'), - ('І', 'і'), ('Ї', 'ї'), ('Ј', 'ј'), ('Љ', 'љ'), ('Њ', 'њ'), - ('Ћ', 'ћ'), ('Ќ', 'ќ'), ('Ѝ', 'ѝ'), ('Ў', 'ў'), ('Џ', 'џ'), - ('А', 'а'), ('Б', 'б'), ('В', 'в'), ('Г', 'г'), ('Д', 'д'), - ('Е', 'е'), ('Ж', 'ж'), ('З', 'з'), ('И', 'и'), ('Й', 'й'), - ('К', 'к'), ('Л', 'л'), ('М', 'м'), ('Н', 'н'), ('О', 'о'), - ('П', 'п'), ('Р', 'р'), ('С', 'с'), ('Т', 'т'), ('У', 'у'), - ('Ф', 'ф'), ('Х', 'х'), ('Ц', 'ц'), ('Ч', 'ч'), ('Ш', 'ш'), - ('Щ', 'щ'), ('Ъ', 'ъ'), ('Ы', 'ы'), ('Ь', 'ь'), ('Э', 'э'), - ('Ю', 'ю'), ('Я', 'я'), ('Ѡ', 'ѡ'), ('Ѣ', 'ѣ'), ('Ѥ', 'ѥ'), - ('Ѧ', 'ѧ'), ('Ѩ', 'ѩ'), ('Ѫ', 'ѫ'), ('Ѭ', 'ѭ'), ('Ѯ', 'ѯ'), - ('Ѱ', 'ѱ'), ('Ѳ', 'ѳ'), ('Ѵ', 'ѵ'), ('Ѷ', 'ѷ'), ('Ѹ', 'ѹ'), - ('Ѻ', 'ѻ'), ('Ѽ', 'ѽ'), ('Ѿ', 'ѿ'), ('Ҁ', 'ҁ'), ('Ҋ', 'ҋ'), - ('Ҍ', 'ҍ'), ('Ҏ', 'ҏ'), ('Ґ', 'ґ'), ('Ғ', 'ғ'), ('Ҕ', 'ҕ'), - ('Җ', 'җ'), ('Ҙ', 'ҙ'), ('Қ', 'қ'), ('Ҝ', 'ҝ'), ('Ҟ', 'ҟ'), - ('Ҡ', 'ҡ'), ('Ң', 'ң'), ('Ҥ', 'ҥ'), ('Ҧ', 'ҧ'), ('Ҩ', 'ҩ'), - ('Ҫ', 'ҫ'), ('Ҭ', 'ҭ'), ('Ү', 'ү'), ('Ұ', 'ұ'), ('Ҳ', 'ҳ'), - ('Ҵ', 'ҵ'), ('Ҷ', 'ҷ'), ('Ҹ', 'ҹ'), ('Һ', 'һ'), ('Ҽ', 'ҽ'), - ('Ҿ', 'ҿ'), ('Ӏ', 'ӏ'), ('Ӂ', 'ӂ'), ('Ӄ', 'ӄ'), ('Ӆ', 'ӆ'), - ('Ӈ', 'ӈ'), ('Ӊ', 'ӊ'), ('Ӌ', 'ӌ'), ('Ӎ', 'ӎ'), ('Ӑ', 'ӑ'), - ('Ӓ', 'ӓ'), ('Ӕ', 'ӕ'), ('Ӗ', 'ӗ'), ('Ә', 'ә'), ('Ӛ', 'ӛ'), - ('Ӝ', 'ӝ'), ('Ӟ', 'ӟ'), ('Ӡ', 'ӡ'), ('Ӣ', 'ӣ'), ('Ӥ', 'ӥ'), - ('Ӧ', 'ӧ'), ('Ө', 'ө'), ('Ӫ', 'ӫ'), ('Ӭ', 'ӭ'), ('Ӯ', 'ӯ'), - ('Ӱ', 'ӱ'), ('Ӳ', 'ӳ'), ('Ӵ', 'ӵ'), ('Ӷ', 'ӷ'), ('Ӹ', 'ӹ'), - ('Ӻ', 'ӻ'), ('Ӽ', 'ӽ'), ('Ӿ', 'ӿ'), ('Ԁ', 'ԁ'), ('Ԃ', 'ԃ'), - ('Ԅ', 'ԅ'), ('Ԇ', 'ԇ'), ('Ԉ', 'ԉ'), ('Ԋ', 'ԋ'), ('Ԍ', 'ԍ'), - ('Ԏ', 'ԏ'), ('Ԑ', 'ԑ'), ('Ԓ', 'ԓ'), ('Ԕ', 'ԕ'), ('Ԗ', 'ԗ'), - ('Ԙ', 'ԙ'), ('Ԛ', 'ԛ'), ('Ԝ', 'ԝ'), ('Ԟ', 'ԟ'), ('Ԡ', 'ԡ'), - ('Ԣ', 'ԣ'), ('Ԥ', 'ԥ'), ('Ԧ', 'ԧ'), ('Ԩ', 'ԩ'), ('Ԫ', 'ԫ'), - ('Ԭ', 'ԭ'), ('Ԯ', 'ԯ'), ('Ա', 'ա'), ('Բ', 'բ'), ('Գ', 'գ'), - ('Դ', 'դ'), ('Ե', 'ե'), ('Զ', 'զ'), ('Է', 'է'), ('Ը', 'ը'), - ('Թ', 'թ'), ('Ժ', 'ժ'), ('Ի', 'ի'), ('Լ', 'լ'), ('Խ', 'խ'), - ('Ծ', 'ծ'), ('Կ', 'կ'), ('Հ', 'հ'), ('Ձ', 'ձ'), ('Ղ', 'ղ'), - ('Ճ', 'ճ'), ('Մ', 'մ'), ('Յ', 'յ'), ('Ն', 'ն'), ('Շ', 'շ'), - ('Ո', 'ո'), ('Չ', 'չ'), ('Պ', 'պ'), ('Ջ', 'ջ'), ('Ռ', 'ռ'), - ('Ս', 'ս'), ('Վ', 'վ'), ('Տ', 'տ'), ('Ր', 'ր'), ('Ց', 'ց'), - ('Ւ', 'ւ'), ('Փ', 'փ'), ('Ք', 'ք'), ('Օ', 'օ'), ('Ֆ', 'ֆ'), - ('Ⴀ', 'ⴀ'), ('Ⴁ', 'ⴁ'), ('Ⴂ', 'ⴂ'), ('Ⴃ', 'ⴃ'), - ('Ⴄ', 'ⴄ'), ('Ⴅ', 'ⴅ'), ('Ⴆ', 'ⴆ'), ('Ⴇ', 'ⴇ'), - ('Ⴈ', 'ⴈ'), ('Ⴉ', 'ⴉ'), ('Ⴊ', 'ⴊ'), ('Ⴋ', 'ⴋ'), - ('Ⴌ', 'ⴌ'), ('Ⴍ', 'ⴍ'), ('Ⴎ', 'ⴎ'), ('Ⴏ', 'ⴏ'), - ('Ⴐ', 'ⴐ'), ('Ⴑ', 'ⴑ'), ('Ⴒ', 'ⴒ'), ('Ⴓ', 'ⴓ'), - ('Ⴔ', 'ⴔ'), ('Ⴕ', 'ⴕ'), ('Ⴖ', 'ⴖ'), ('Ⴗ', 'ⴗ'), - ('Ⴘ', 'ⴘ'), ('Ⴙ', 'ⴙ'), ('Ⴚ', 'ⴚ'), ('Ⴛ', 'ⴛ'), - ('Ⴜ', 'ⴜ'), ('Ⴝ', 'ⴝ'), ('Ⴞ', 'ⴞ'), ('Ⴟ', 'ⴟ'), - ('Ⴠ', 'ⴠ'), ('Ⴡ', 'ⴡ'), ('Ⴢ', 'ⴢ'), ('Ⴣ', 'ⴣ'), - ('Ⴤ', 'ⴤ'), ('Ⴥ', 'ⴥ'), ('Ⴧ', 'ⴧ'), ('Ⴭ', 'ⴭ'), - ('ᏸ', 'Ᏸ'), ('ᏹ', 'Ᏹ'), ('ᏺ', 'Ᏺ'), ('ᏻ', 'Ᏻ'), - ('ᏼ', 'Ᏼ'), ('ᏽ', 'Ᏽ'), ('ᲀ', 'в'), ('ᲁ', 'д'), ('ᲂ', 'о'), - ('ᲃ', 'с'), ('ᲄ', 'т'), ('ᲅ', 'т'), ('ᲆ', 'ъ'), ('ᲇ', 'ѣ'), - ('ᲈ', 'ꙋ'), ('Ა', 'ა'), ('Ბ', 'ბ'), ('Გ', 'გ'), - ('Დ', 'დ'), ('Ე', 'ე'), ('Ვ', 'ვ'), ('Ზ', 'ზ'), - ('Თ', 'თ'), ('Ი', 'ი'), ('Კ', 'კ'), ('Ლ', 'ლ'), - ('Მ', 'მ'), ('Ნ', 'ნ'), ('Ო', 'ო'), ('Პ', 'პ'), - ('Ჟ', 'ჟ'), ('Რ', 'რ'), ('Ს', 'ს'), ('Ტ', 'ტ'), - ('Უ', 'უ'), ('Ფ', 'ფ'), ('Ქ', 'ქ'), ('Ღ', 'ღ'), - ('Ყ', 'ყ'), ('Შ', 'შ'), ('Ჩ', 'ჩ'), ('Ც', 'ც'), - ('Ძ', 'ძ'), ('Წ', 'წ'), ('Ჭ', 'ჭ'), ('Ხ', 'ხ'), - ('Ჯ', 'ჯ'), ('Ჰ', 'ჰ'), ('Ჱ', 'ჱ'), ('Ჲ', 'ჲ'), - ('Ჳ', 'ჳ'), ('Ჴ', 'ჴ'), ('Ჵ', 'ჵ'), ('Ჶ', 'ჶ'), - ('Ჷ', 'ჷ'), ('Ჸ', 'ჸ'), ('Ჹ', 'ჹ'), ('Ჺ', 'ჺ'), - ('Ჽ', 'ჽ'), ('Ჾ', 'ჾ'), ('Ჿ', 'ჿ'), ('Ḁ', 'ḁ'), - ('Ḃ', 'ḃ'), ('Ḅ', 'ḅ'), ('Ḇ', 'ḇ'), ('Ḉ', 'ḉ'), - ('Ḋ', 'ḋ'), ('Ḍ', 'ḍ'), ('Ḏ', 'ḏ'), ('Ḑ', 'ḑ'), - ('Ḓ', 'ḓ'), ('Ḕ', 'ḕ'), ('Ḗ', 'ḗ'), ('Ḙ', 'ḙ'), - ('Ḛ', 'ḛ'), ('Ḝ', 'ḝ'), ('Ḟ', 'ḟ'), ('Ḡ', 'ḡ'), - ('Ḣ', 'ḣ'), ('Ḥ', 'ḥ'), ('Ḧ', 'ḧ'), ('Ḩ', 'ḩ'), - ('Ḫ', 'ḫ'), ('Ḭ', 'ḭ'), ('Ḯ', 'ḯ'), ('Ḱ', 'ḱ'), - ('Ḳ', 'ḳ'), ('Ḵ', 'ḵ'), ('Ḷ', 'ḷ'), ('Ḹ', 'ḹ'), - ('Ḻ', 'ḻ'), ('Ḽ', 'ḽ'), ('Ḿ', 'ḿ'), ('Ṁ', 'ṁ'), - ('Ṃ', 'ṃ'), ('Ṅ', 'ṅ'), ('Ṇ', 'ṇ'), ('Ṉ', 'ṉ'), - ('Ṋ', 'ṋ'), ('Ṍ', 'ṍ'), ('Ṏ', 'ṏ'), ('Ṑ', 'ṑ'), - ('Ṓ', 'ṓ'), ('Ṕ', 'ṕ'), ('Ṗ', 'ṗ'), ('Ṙ', 'ṙ'), - ('Ṛ', 'ṛ'), ('Ṝ', 'ṝ'), ('Ṟ', 'ṟ'), ('Ṡ', 'ṡ'), - ('Ṣ', 'ṣ'), ('Ṥ', 'ṥ'), ('Ṧ', 'ṧ'), ('Ṩ', 'ṩ'), - ('Ṫ', 'ṫ'), ('Ṭ', 'ṭ'), ('Ṯ', 'ṯ'), ('Ṱ', 'ṱ'), - ('Ṳ', 'ṳ'), ('Ṵ', 'ṵ'), ('Ṷ', 'ṷ'), ('Ṹ', 'ṹ'), - ('Ṻ', 'ṻ'), ('Ṽ', 'ṽ'), ('Ṿ', 'ṿ'), ('Ẁ', 'ẁ'), - ('Ẃ', 'ẃ'), ('Ẅ', 'ẅ'), ('Ẇ', 'ẇ'), ('Ẉ', 'ẉ'), - ('Ẋ', 'ẋ'), ('Ẍ', 'ẍ'), ('Ẏ', 'ẏ'), ('Ẑ', 'ẑ'), - ('Ẓ', 'ẓ'), ('Ẕ', 'ẕ'), ('ẛ', 'ṡ'), ('ẞ', 'ß'), - ('Ạ', 'ạ'), ('Ả', 'ả'), ('Ấ', 'ấ'), ('Ầ', 'ầ'), - ('Ẩ', 'ẩ'), ('Ẫ', 'ẫ'), ('Ậ', 'ậ'), ('Ắ', 'ắ'), - ('Ằ', 'ằ'), ('Ẳ', 'ẳ'), ('Ẵ', 'ẵ'), ('Ặ', 'ặ'), - ('Ẹ', 'ẹ'), ('Ẻ', 'ẻ'), ('Ẽ', 'ẽ'), ('Ế', 'ế'), - ('Ề', 'ề'), ('Ể', 'ể'), ('Ễ', 'ễ'), ('Ệ', 'ệ'), - ('Ỉ', 'ỉ'), ('Ị', 'ị'), ('Ọ', 'ọ'), ('Ỏ', 'ỏ'), - ('Ố', 'ố'), ('Ồ', 'ồ'), ('Ổ', 'ổ'), ('Ỗ', 'ỗ'), - ('Ộ', 'ộ'), ('Ớ', 'ớ'), ('Ờ', 'ờ'), ('Ở', 'ở'), - ('Ỡ', 'ỡ'), ('Ợ', 'ợ'), ('Ụ', 'ụ'), ('Ủ', 'ủ'), - ('Ứ', 'ứ'), ('Ừ', 'ừ'), ('Ử', 'ử'), ('Ữ', 'ữ'), - ('Ự', 'ự'), ('Ỳ', 'ỳ'), ('Ỵ', 'ỵ'), ('Ỷ', 'ỷ'), - ('Ỹ', 'ỹ'), ('Ỻ', 'ỻ'), ('Ỽ', 'ỽ'), ('Ỿ', 'ỿ'), - ('Ἀ', 'ἀ'), ('Ἁ', 'ἁ'), ('Ἂ', 'ἂ'), ('Ἃ', 'ἃ'), - ('Ἄ', 'ἄ'), ('Ἅ', 'ἅ'), ('Ἆ', 'ἆ'), ('Ἇ', 'ἇ'), - ('Ἐ', 'ἐ'), ('Ἑ', 'ἑ'), ('Ἒ', 'ἒ'), ('Ἓ', 'ἓ'), - ('Ἔ', 'ἔ'), ('Ἕ', 'ἕ'), ('Ἠ', 'ἠ'), ('Ἡ', 'ἡ'), - ('Ἢ', 'ἢ'), ('Ἣ', 'ἣ'), ('Ἤ', 'ἤ'), ('Ἥ', 'ἥ'), - ('Ἦ', 'ἦ'), ('Ἧ', 'ἧ'), ('Ἰ', 'ἰ'), ('Ἱ', 'ἱ'), - ('Ἲ', 'ἲ'), ('Ἳ', 'ἳ'), ('Ἴ', 'ἴ'), ('Ἵ', 'ἵ'), - ('Ἶ', 'ἶ'), ('Ἷ', 'ἷ'), ('Ὀ', 'ὀ'), ('Ὁ', 'ὁ'), - ('Ὂ', 'ὂ'), ('Ὃ', 'ὃ'), ('Ὄ', 'ὄ'), ('Ὅ', 'ὅ'), - ('Ὑ', 'ὑ'), ('Ὓ', 'ὓ'), ('Ὕ', 'ὕ'), ('Ὗ', 'ὗ'), - ('Ὠ', 'ὠ'), ('Ὡ', 'ὡ'), ('Ὢ', 'ὢ'), ('Ὣ', 'ὣ'), - ('Ὤ', 'ὤ'), ('Ὥ', 'ὥ'), ('Ὦ', 'ὦ'), ('Ὧ', 'ὧ'), - ('ᾈ', 'ᾀ'), ('ᾉ', 'ᾁ'), ('ᾊ', 'ᾂ'), ('ᾋ', 'ᾃ'), - ('ᾌ', 'ᾄ'), ('ᾍ', 'ᾅ'), ('ᾎ', 'ᾆ'), ('ᾏ', 'ᾇ'), - ('ᾘ', 'ᾐ'), ('ᾙ', 'ᾑ'), ('ᾚ', 'ᾒ'), ('ᾛ', 'ᾓ'), - ('ᾜ', 'ᾔ'), ('ᾝ', 'ᾕ'), ('ᾞ', 'ᾖ'), ('ᾟ', 'ᾗ'), - ('ᾨ', 'ᾠ'), ('ᾩ', 'ᾡ'), ('ᾪ', 'ᾢ'), ('ᾫ', 'ᾣ'), - ('ᾬ', 'ᾤ'), ('ᾭ', 'ᾥ'), ('ᾮ', 'ᾦ'), ('ᾯ', 'ᾧ'), - ('Ᾰ', 'ᾰ'), ('Ᾱ', 'ᾱ'), ('Ὰ', 'ὰ'), ('Ά', 'ά'), - ('ᾼ', 'ᾳ'), ('ι', 'ι'), ('Ὲ', 'ὲ'), ('Έ', 'έ'), - ('Ὴ', 'ὴ'), ('Ή', 'ή'), ('ῌ', 'ῃ'), ('Ῐ', 'ῐ'), - ('Ῑ', 'ῑ'), ('Ὶ', 'ὶ'), ('Ί', 'ί'), ('Ῠ', 'ῠ'), - ('Ῡ', 'ῡ'), ('Ὺ', 'ὺ'), ('Ύ', 'ύ'), ('Ῥ', 'ῥ'), - ('Ὸ', 'ὸ'), ('Ό', 'ό'), ('Ὼ', 'ὼ'), ('Ώ', 'ώ'), - ('ῼ', 'ῳ'), ('Ω', 'ω'), ('K', 'k'), ('Å', 'å'), ('Ⅎ', 'ⅎ'), - ('Ⅰ', 'ⅰ'), ('Ⅱ', 'ⅱ'), ('Ⅲ', 'ⅲ'), ('Ⅳ', 'ⅳ'), - ('Ⅴ', 'ⅴ'), ('Ⅵ', 'ⅵ'), ('Ⅶ', 'ⅶ'), ('Ⅷ', 'ⅷ'), - ('Ⅸ', 'ⅸ'), ('Ⅹ', 'ⅹ'), ('Ⅺ', 'ⅺ'), ('Ⅻ', 'ⅻ'), - ('Ⅼ', 'ⅼ'), ('Ⅽ', 'ⅽ'), ('Ⅾ', 'ⅾ'), ('Ⅿ', 'ⅿ'), - ('Ↄ', 'ↄ'), ('Ⓐ', 'ⓐ'), ('Ⓑ', 'ⓑ'), ('Ⓒ', 'ⓒ'), - ('Ⓓ', 'ⓓ'), ('Ⓔ', 'ⓔ'), ('Ⓕ', 'ⓕ'), ('Ⓖ', 'ⓖ'), - ('Ⓗ', 'ⓗ'), ('Ⓘ', 'ⓘ'), ('Ⓙ', 'ⓙ'), ('Ⓚ', 'ⓚ'), - ('Ⓛ', 'ⓛ'), ('Ⓜ', 'ⓜ'), ('Ⓝ', 'ⓝ'), ('Ⓞ', 'ⓞ'), - ('Ⓟ', 'ⓟ'), ('Ⓠ', 'ⓠ'), ('Ⓡ', 'ⓡ'), ('Ⓢ', 'ⓢ'), - ('Ⓣ', 'ⓣ'), ('Ⓤ', 'ⓤ'), ('Ⓥ', 'ⓥ'), ('Ⓦ', 'ⓦ'), - ('Ⓧ', 'ⓧ'), ('Ⓨ', 'ⓨ'), ('Ⓩ', 'ⓩ'), ('Ⰰ', 'ⰰ'), - ('Ⰱ', 'ⰱ'), ('Ⰲ', 'ⰲ'), ('Ⰳ', 'ⰳ'), ('Ⰴ', 'ⰴ'), - ('Ⰵ', 'ⰵ'), ('Ⰶ', 'ⰶ'), ('Ⰷ', 'ⰷ'), ('Ⰸ', 'ⰸ'), - ('Ⰹ', 'ⰹ'), ('Ⰺ', 'ⰺ'), ('Ⰻ', 'ⰻ'), ('Ⰼ', 'ⰼ'), - ('Ⰽ', 'ⰽ'), ('Ⰾ', 'ⰾ'), ('Ⰿ', 'ⰿ'), ('Ⱀ', 'ⱀ'), - ('Ⱁ', 'ⱁ'), ('Ⱂ', 'ⱂ'), ('Ⱃ', 'ⱃ'), ('Ⱄ', 'ⱄ'), - ('Ⱅ', 'ⱅ'), ('Ⱆ', 'ⱆ'), ('Ⱇ', 'ⱇ'), ('Ⱈ', 'ⱈ'), - ('Ⱉ', 'ⱉ'), ('Ⱊ', 'ⱊ'), ('Ⱋ', 'ⱋ'), ('Ⱌ', 'ⱌ'), - ('Ⱍ', 'ⱍ'), ('Ⱎ', 'ⱎ'), ('Ⱏ', 'ⱏ'), ('Ⱐ', 'ⱐ'), - ('Ⱑ', 'ⱑ'), ('Ⱒ', 'ⱒ'), ('Ⱓ', 'ⱓ'), ('Ⱔ', 'ⱔ'), - ('Ⱕ', 'ⱕ'), ('Ⱖ', 'ⱖ'), ('Ⱗ', 'ⱗ'), ('Ⱘ', 'ⱘ'), - ('Ⱙ', 'ⱙ'), ('Ⱚ', 'ⱚ'), ('Ⱛ', 'ⱛ'), ('Ⱜ', 'ⱜ'), - ('Ⱝ', 'ⱝ'), ('Ⱞ', 'ⱞ'), ('Ⱟ', 'ⱟ'), ('Ⱡ', 'ⱡ'), - ('Ɫ', 'ɫ'), ('Ᵽ', 'ᵽ'), ('Ɽ', 'ɽ'), ('Ⱨ', 'ⱨ'), - ('Ⱪ', 'ⱪ'), ('Ⱬ', 'ⱬ'), ('Ɑ', 'ɑ'), ('Ɱ', 'ɱ'), ('Ɐ', 'ɐ'), - ('Ɒ', 'ɒ'), ('Ⱳ', 'ⱳ'), ('Ⱶ', 'ⱶ'), ('Ȿ', 'ȿ'), ('Ɀ', 'ɀ'), - ('Ⲁ', 'ⲁ'), ('Ⲃ', 'ⲃ'), ('Ⲅ', 'ⲅ'), ('Ⲇ', 'ⲇ'), - ('Ⲉ', 'ⲉ'), ('Ⲋ', 'ⲋ'), ('Ⲍ', 'ⲍ'), ('Ⲏ', 'ⲏ'), - ('Ⲑ', 'ⲑ'), ('Ⲓ', 'ⲓ'), ('Ⲕ', 'ⲕ'), ('Ⲗ', 'ⲗ'), - ('Ⲙ', 'ⲙ'), ('Ⲛ', 'ⲛ'), ('Ⲝ', 'ⲝ'), ('Ⲟ', 'ⲟ'), - ('Ⲡ', 'ⲡ'), ('Ⲣ', 'ⲣ'), ('Ⲥ', 'ⲥ'), ('Ⲧ', 'ⲧ'), - ('Ⲩ', 'ⲩ'), ('Ⲫ', 'ⲫ'), ('Ⲭ', 'ⲭ'), ('Ⲯ', 'ⲯ'), - ('Ⲱ', 'ⲱ'), ('Ⲳ', 'ⲳ'), ('Ⲵ', 'ⲵ'), ('Ⲷ', 'ⲷ'), - ('Ⲹ', 'ⲹ'), ('Ⲻ', 'ⲻ'), ('Ⲽ', 'ⲽ'), ('Ⲿ', 'ⲿ'), - ('Ⳁ', 'ⳁ'), ('Ⳃ', 'ⳃ'), ('Ⳅ', 'ⳅ'), ('Ⳇ', 'ⳇ'), - ('Ⳉ', 'ⳉ'), ('Ⳋ', 'ⳋ'), ('Ⳍ', 'ⳍ'), ('Ⳏ', 'ⳏ'), - ('Ⳑ', 'ⳑ'), ('Ⳓ', 'ⳓ'), ('Ⳕ', 'ⳕ'), ('Ⳗ', 'ⳗ'), - ('Ⳙ', 'ⳙ'), ('Ⳛ', 'ⳛ'), ('Ⳝ', 'ⳝ'), ('Ⳟ', 'ⳟ'), - ('Ⳡ', 'ⳡ'), ('Ⳣ', 'ⳣ'), ('Ⳬ', 'ⳬ'), ('Ⳮ', 'ⳮ'), - ('Ⳳ', 'ⳳ'), ('Ꙁ', 'ꙁ'), ('Ꙃ', 'ꙃ'), ('Ꙅ', 'ꙅ'), - ('Ꙇ', 'ꙇ'), ('Ꙉ', 'ꙉ'), ('Ꙋ', 'ꙋ'), ('Ꙍ', 'ꙍ'), - ('Ꙏ', 'ꙏ'), ('Ꙑ', 'ꙑ'), ('Ꙓ', 'ꙓ'), ('Ꙕ', 'ꙕ'), - ('Ꙗ', 'ꙗ'), ('Ꙙ', 'ꙙ'), ('Ꙛ', 'ꙛ'), ('Ꙝ', 'ꙝ'), - ('Ꙟ', 'ꙟ'), ('Ꙡ', 'ꙡ'), ('Ꙣ', 'ꙣ'), ('Ꙥ', 'ꙥ'), - ('Ꙧ', 'ꙧ'), ('Ꙩ', 'ꙩ'), ('Ꙫ', 'ꙫ'), ('Ꙭ', 'ꙭ'), - ('Ꚁ', 'ꚁ'), ('Ꚃ', 'ꚃ'), ('Ꚅ', 'ꚅ'), ('Ꚇ', 'ꚇ'), - ('Ꚉ', 'ꚉ'), ('Ꚋ', 'ꚋ'), ('Ꚍ', 'ꚍ'), ('Ꚏ', 'ꚏ'), - ('Ꚑ', 'ꚑ'), ('Ꚓ', 'ꚓ'), ('Ꚕ', 'ꚕ'), ('Ꚗ', 'ꚗ'), - ('Ꚙ', 'ꚙ'), ('Ꚛ', 'ꚛ'), ('Ꜣ', 'ꜣ'), ('Ꜥ', 'ꜥ'), - ('Ꜧ', 'ꜧ'), ('Ꜩ', 'ꜩ'), ('Ꜫ', 'ꜫ'), ('Ꜭ', 'ꜭ'), - ('Ꜯ', 'ꜯ'), ('Ꜳ', 'ꜳ'), ('Ꜵ', 'ꜵ'), ('Ꜷ', 'ꜷ'), - ('Ꜹ', 'ꜹ'), ('Ꜻ', 'ꜻ'), ('Ꜽ', 'ꜽ'), ('Ꜿ', 'ꜿ'), - ('Ꝁ', 'ꝁ'), ('Ꝃ', 'ꝃ'), ('Ꝅ', 'ꝅ'), ('Ꝇ', 'ꝇ'), - ('Ꝉ', 'ꝉ'), ('Ꝋ', 'ꝋ'), ('Ꝍ', 'ꝍ'), ('Ꝏ', 'ꝏ'), - ('Ꝑ', 'ꝑ'), ('Ꝓ', 'ꝓ'), ('Ꝕ', 'ꝕ'), ('Ꝗ', 'ꝗ'), - ('Ꝙ', 'ꝙ'), ('Ꝛ', 'ꝛ'), ('Ꝝ', 'ꝝ'), ('Ꝟ', 'ꝟ'), - ('Ꝡ', 'ꝡ'), ('Ꝣ', 'ꝣ'), ('Ꝥ', 'ꝥ'), ('Ꝧ', 'ꝧ'), - ('Ꝩ', 'ꝩ'), ('Ꝫ', 'ꝫ'), ('Ꝭ', 'ꝭ'), ('Ꝯ', 'ꝯ'), - ('Ꝺ', 'ꝺ'), ('Ꝼ', 'ꝼ'), ('Ᵹ', 'ᵹ'), ('Ꝿ', 'ꝿ'), - ('Ꞁ', 'ꞁ'), ('Ꞃ', 'ꞃ'), ('Ꞅ', 'ꞅ'), ('Ꞇ', 'ꞇ'), - ('Ꞌ', 'ꞌ'), ('Ɥ', 'ɥ'), ('Ꞑ', 'ꞑ'), ('Ꞓ', 'ꞓ'), - ('Ꞗ', 'ꞗ'), ('Ꞙ', 'ꞙ'), ('Ꞛ', 'ꞛ'), ('Ꞝ', 'ꞝ'), - ('Ꞟ', 'ꞟ'), ('Ꞡ', 'ꞡ'), ('Ꞣ', 'ꞣ'), ('Ꞥ', 'ꞥ'), - ('Ꞧ', 'ꞧ'), ('Ꞩ', 'ꞩ'), ('Ɦ', 'ɦ'), ('Ɜ', 'ɜ'), ('Ɡ', 'ɡ'), - ('Ɬ', 'ɬ'), ('Ɪ', 'ɪ'), ('Ʞ', 'ʞ'), ('Ʇ', 'ʇ'), ('Ʝ', 'ʝ'), - ('Ꭓ', 'ꭓ'), ('Ꞵ', 'ꞵ'), ('Ꞷ', 'ꞷ'), ('Ꞹ', 'ꞹ'), - ('Ꞻ', 'ꞻ'), ('Ꞽ', 'ꞽ'), ('Ꞿ', 'ꞿ'), ('Ꟁ', 'ꟁ'), - ('Ꟃ', 'ꟃ'), ('Ꞔ', 'ꞔ'), ('Ʂ', 'ʂ'), ('Ᶎ', 'ᶎ'), - ('Ꟈ', 'ꟈ'), ('Ꟊ', 'ꟊ'), ('Ꟑ', 'ꟑ'), ('Ꟗ', 'ꟗ'), - ('Ꟙ', 'ꟙ'), ('Ꟶ', 'ꟶ'), ('ꭰ', 'Ꭰ'), ('ꭱ', 'Ꭱ'), - ('ꭲ', 'Ꭲ'), ('ꭳ', 'Ꭳ'), ('ꭴ', 'Ꭴ'), ('ꭵ', 'Ꭵ'), - ('ꭶ', 'Ꭶ'), ('ꭷ', 'Ꭷ'), ('ꭸ', 'Ꭸ'), ('ꭹ', 'Ꭹ'), - ('ꭺ', 'Ꭺ'), ('ꭻ', 'Ꭻ'), ('ꭼ', 'Ꭼ'), ('ꭽ', 'Ꭽ'), - ('ꭾ', 'Ꭾ'), ('ꭿ', 'Ꭿ'), ('ꮀ', 'Ꮀ'), ('ꮁ', 'Ꮁ'), - ('ꮂ', 'Ꮂ'), ('ꮃ', 'Ꮃ'), ('ꮄ', 'Ꮄ'), ('ꮅ', 'Ꮅ'), - ('ꮆ', 'Ꮆ'), ('ꮇ', 'Ꮇ'), ('ꮈ', 'Ꮈ'), ('ꮉ', 'Ꮉ'), - ('ꮊ', 'Ꮊ'), ('ꮋ', 'Ꮋ'), ('ꮌ', 'Ꮌ'), ('ꮍ', 'Ꮍ'), - ('ꮎ', 'Ꮎ'), ('ꮏ', 'Ꮏ'), ('ꮐ', 'Ꮐ'), ('ꮑ', 'Ꮑ'), - ('ꮒ', 'Ꮒ'), ('ꮓ', 'Ꮓ'), ('ꮔ', 'Ꮔ'), ('ꮕ', 'Ꮕ'), - ('ꮖ', 'Ꮖ'), ('ꮗ', 'Ꮗ'), ('ꮘ', 'Ꮘ'), ('ꮙ', 'Ꮙ'), - ('ꮚ', 'Ꮚ'), ('ꮛ', 'Ꮛ'), ('ꮜ', 'Ꮜ'), ('ꮝ', 'Ꮝ'), - ('ꮞ', 'Ꮞ'), ('ꮟ', 'Ꮟ'), ('ꮠ', 'Ꮠ'), ('ꮡ', 'Ꮡ'), - ('ꮢ', 'Ꮢ'), ('ꮣ', 'Ꮣ'), ('ꮤ', 'Ꮤ'), ('ꮥ', 'Ꮥ'), - ('ꮦ', 'Ꮦ'), ('ꮧ', 'Ꮧ'), ('ꮨ', 'Ꮨ'), ('ꮩ', 'Ꮩ'), - ('ꮪ', 'Ꮪ'), ('ꮫ', 'Ꮫ'), ('ꮬ', 'Ꮬ'), ('ꮭ', 'Ꮭ'), - ('ꮮ', 'Ꮮ'), ('ꮯ', 'Ꮯ'), ('ꮰ', 'Ꮰ'), ('ꮱ', 'Ꮱ'), - ('ꮲ', 'Ꮲ'), ('ꮳ', 'Ꮳ'), ('ꮴ', 'Ꮴ'), ('ꮵ', 'Ꮵ'), - ('ꮶ', 'Ꮶ'), ('ꮷ', 'Ꮷ'), ('ꮸ', 'Ꮸ'), ('ꮹ', 'Ꮹ'), - ('ꮺ', 'Ꮺ'), ('ꮻ', 'Ꮻ'), ('ꮼ', 'Ꮼ'), ('ꮽ', 'Ꮽ'), - ('ꮾ', 'Ꮾ'), ('ꮿ', 'Ꮿ'), ('A', 'a'), ('B', 'b'), - ('C', 'c'), ('D', 'd'), ('E', 'e'), ('F', 'f'), - ('G', 'g'), ('H', 'h'), ('I', 'i'), ('J', 'j'), - ('K', 'k'), ('L', 'l'), ('M', 'm'), ('N', 'n'), - ('O', 'o'), ('P', 'p'), ('Q', 'q'), ('R', 'r'), - ('S', 's'), ('T', 't'), ('U', 'u'), ('V', 'v'), - ('W', 'w'), ('X', 'x'), ('Y', 'y'), ('Z', 'z'), - ('𐐀', '𐐨'), ('𐐁', '𐐩'), ('𐐂', '𐐪'), ('𐐃', '𐐫'), - ('𐐄', '𐐬'), ('𐐅', '𐐭'), ('𐐆', '𐐮'), ('𐐇', '𐐯'), - ('𐐈', '𐐰'), ('𐐉', '𐐱'), ('𐐊', '𐐲'), ('𐐋', '𐐳'), - ('𐐌', '𐐴'), ('𐐍', '𐐵'), ('𐐎', '𐐶'), ('𐐏', '𐐷'), - ('𐐐', '𐐸'), ('𐐑', '𐐹'), ('𐐒', '𐐺'), ('𐐓', '𐐻'), - ('𐐔', '𐐼'), ('𐐕', '𐐽'), ('𐐖', '𐐾'), ('𐐗', '𐐿'), - ('𐐘', '𐑀'), ('𐐙', '𐑁'), ('𐐚', '𐑂'), ('𐐛', '𐑃'), - ('𐐜', '𐑄'), ('𐐝', '𐑅'), ('𐐞', '𐑆'), ('𐐟', '𐑇'), - ('𐐠', '𐑈'), ('𐐡', '𐑉'), ('𐐢', '𐑊'), ('𐐣', '𐑋'), - ('𐐤', '𐑌'), ('𐐥', '𐑍'), ('𐐦', '𐑎'), ('𐐧', '𐑏'), - ('𐒰', '𐓘'), ('𐒱', '𐓙'), ('𐒲', '𐓚'), ('𐒳', '𐓛'), - ('𐒴', '𐓜'), ('𐒵', '𐓝'), ('𐒶', '𐓞'), ('𐒷', '𐓟'), - ('𐒸', '𐓠'), ('𐒹', '𐓡'), ('𐒺', '𐓢'), ('𐒻', '𐓣'), - ('𐒼', '𐓤'), ('𐒽', '𐓥'), ('𐒾', '𐓦'), ('𐒿', '𐓧'), - ('𐓀', '𐓨'), ('𐓁', '𐓩'), ('𐓂', '𐓪'), ('𐓃', '𐓫'), - ('𐓄', '𐓬'), ('𐓅', '𐓭'), ('𐓆', '𐓮'), ('𐓇', '𐓯'), - ('𐓈', '𐓰'), ('𐓉', '𐓱'), ('𐓊', '𐓲'), ('𐓋', '𐓳'), - ('𐓌', '𐓴'), ('𐓍', '𐓵'), ('𐓎', '𐓶'), ('𐓏', '𐓷'), - ('𐓐', '𐓸'), ('𐓑', '𐓹'), ('𐓒', '𐓺'), ('𐓓', '𐓻'), - ('𐕰', '𐖗'), ('𐕱', '𐖘'), ('𐕲', '𐖙'), ('𐕳', '𐖚'), - ('𐕴', '𐖛'), ('𐕵', '𐖜'), ('𐕶', '𐖝'), ('𐕷', '𐖞'), - ('𐕸', '𐖟'), ('𐕹', '𐖠'), ('𐕺', '𐖡'), ('𐕼', '𐖣'), - ('𐕽', '𐖤'), ('𐕾', '𐖥'), ('𐕿', '𐖦'), ('𐖀', '𐖧'), - ('𐖁', '𐖨'), ('𐖂', '𐖩'), ('𐖃', '𐖪'), ('𐖄', '𐖫'), - ('𐖅', '𐖬'), ('𐖆', '𐖭'), ('𐖇', '𐖮'), ('𐖈', '𐖯'), - ('𐖉', '𐖰'), ('𐖊', '𐖱'), ('𐖌', '𐖳'), ('𐖍', '𐖴'), - ('𐖎', '𐖵'), ('𐖏', '𐖶'), ('𐖐', '𐖷'), ('𐖑', '𐖸'), - ('𐖒', '𐖹'), ('𐖔', '𐖻'), ('𐖕', '𐖼'), ('𐲀', '𐳀'), - ('𐲁', '𐳁'), ('𐲂', '𐳂'), ('𐲃', '𐳃'), ('𐲄', '𐳄'), - ('𐲅', '𐳅'), ('𐲆', '𐳆'), ('𐲇', '𐳇'), ('𐲈', '𐳈'), - ('𐲉', '𐳉'), ('𐲊', '𐳊'), ('𐲋', '𐳋'), ('𐲌', '𐳌'), - ('𐲍', '𐳍'), ('𐲎', '𐳎'), ('𐲏', '𐳏'), ('𐲐', '𐳐'), - ('𐲑', '𐳑'), ('𐲒', '𐳒'), ('𐲓', '𐳓'), ('𐲔', '𐳔'), - ('𐲕', '𐳕'), ('𐲖', '𐳖'), ('𐲗', '𐳗'), ('𐲘', '𐳘'), - ('𐲙', '𐳙'), ('𐲚', '𐳚'), ('𐲛', '𐳛'), ('𐲜', '𐳜'), - ('𐲝', '𐳝'), ('𐲞', '𐳞'), ('𐲟', '𐳟'), ('𐲠', '𐳠'), - ('𐲡', '𐳡'), ('𐲢', '𐳢'), ('𐲣', '𐳣'), ('𐲤', '𐳤'), - ('𐲥', '𐳥'), ('𐲦', '𐳦'), ('𐲧', '𐳧'), ('𐲨', '𐳨'), - ('𐲩', '𐳩'), ('𐲪', '𐳪'), ('𐲫', '𐳫'), ('𐲬', '𐳬'), - ('𐲭', '𐳭'), ('𐲮', '𐳮'), ('𐲯', '𐳯'), ('𐲰', '𐳰'), - ('𐲱', '𐳱'), ('𐲲', '𐳲'), ('𑢠', '𑣀'), ('𑢡', '𑣁'), - ('𑢢', '𑣂'), ('𑢣', '𑣃'), ('𑢤', '𑣄'), ('𑢥', '𑣅'), - ('𑢦', '𑣆'), ('𑢧', '𑣇'), ('𑢨', '𑣈'), ('𑢩', '𑣉'), - ('𑢪', '𑣊'), ('𑢫', '𑣋'), ('𑢬', '𑣌'), ('𑢭', '𑣍'), - ('𑢮', '𑣎'), ('𑢯', '𑣏'), ('𑢰', '𑣐'), ('𑢱', '𑣑'), - ('𑢲', '𑣒'), ('𑢳', '𑣓'), ('𑢴', '𑣔'), ('𑢵', '𑣕'), - ('𑢶', '𑣖'), ('𑢷', '𑣗'), ('𑢸', '𑣘'), ('𑢹', '𑣙'), - ('𑢺', '𑣚'), ('𑢻', '𑣛'), ('𑢼', '𑣜'), ('𑢽', '𑣝'), - ('𑢾', '𑣞'), ('𑢿', '𑣟'), ('𖹀', '𖹠'), ('𖹁', '𖹡'), - ('𖹂', '𖹢'), ('𖹃', '𖹣'), ('𖹄', '𖹤'), ('𖹅', '𖹥'), - ('𖹆', '𖹦'), ('𖹇', '𖹧'), ('𖹈', '𖹨'), ('𖹉', '𖹩'), - ('𖹊', '𖹪'), ('𖹋', '𖹫'), ('𖹌', '𖹬'), ('𖹍', '𖹭'), - ('𖹎', '𖹮'), ('𖹏', '𖹯'), ('𖹐', '𖹰'), ('𖹑', '𖹱'), - ('𖹒', '𖹲'), ('𖹓', '𖹳'), ('𖹔', '𖹴'), ('𖹕', '𖹵'), - ('𖹖', '𖹶'), ('𖹗', '𖹷'), ('𖹘', '𖹸'), ('𖹙', '𖹹'), - ('𖹚', '𖹺'), ('𖹛', '𖹻'), ('𖹜', '𖹼'), ('𖹝', '𖹽'), - ('𖹞', '𖹾'), ('𖹟', '𖹿'), ('𞤀', '𞤢'), ('𞤁', '𞤣'), - ('𞤂', '𞤤'), ('𞤃', '𞤥'), ('𞤄', '𞤦'), ('𞤅', '𞤧'), - ('𞤆', '𞤨'), ('𞤇', '𞤩'), ('𞤈', '𞤪'), ('𞤉', '𞤫'), - ('𞤊', '𞤬'), ('𞤋', '𞤭'), ('𞤌', '𞤮'), ('𞤍', '𞤯'), - ('𞤎', '𞤰'), ('𞤏', '𞤱'), ('𞤐', '𞤲'), ('𞤑', '𞤳'), - ('𞤒', '𞤴'), ('𞤓', '𞤵'), ('𞤔', '𞤶'), ('𞤕', '𞤷'), - ('𞤖', '𞤸'), ('𞤗', '𞤹'), ('𞤘', '𞤺'), ('𞤙', '𞤻'), - ('𞤚', '𞤼'), ('𞤛', '𞤽'), ('𞤜', '𞤾'), ('𞤝', '𞤿'), - ('𞤞', '𞥀'), ('𞤟', '𞥁'), ('𞤠', '𞥂'), ('𞤡', '𞥃'), -]; diff --git a/matcher/src/chars/normalize.rs b/matcher/src/chars/normalize.rs deleted file mode 100644 index d3df40ef..00000000 --- a/matcher/src/chars/normalize.rs +++ /dev/null @@ -1,526 +0,0 @@ -use std::mem::transmute; - -const DATA1: [(char, char); 277] = [ - ('\u{00C0}', 'A'), // WITH GRAVE, LATIN CAPITAL LETTER - ('\u{00C1}', 'A'), // WITH ACUTE, LATIN CAPITAL LETTER - ('\u{00C2}', 'A'), // WITH CIRCUMFLEX, LATIN CAPITAL LETTER - ('\u{00C3}', 'A'), // WITH TILDE, LATIN CAPITAL LETTER - ('\u{00C4}', 'A'), // WITH DIAERESIS, LATIN CAPITAL LETTER - ('\u{00C5}', 'A'), // WITH RING ABOVE, LATIN CAPITAL LETTER - ('\u{00C7}', 'C'), // WITH CEDILLA, LATIN CAPITAL LETTER - ('\u{00C8}', 'E'), // WITH GRAVE, LATIN CAPITAL LETTER - ('\u{00C9}', 'E'), // WITH ACUTE, LATIN CAPITAL LETTER - ('\u{00CA}', 'E'), // WITH CIRCUMFLEX, LATIN CAPITAL LETTER - ('\u{00CB}', 'E'), // WITH DIAERESIS, LATIN CAPITAL LETTER - ('\u{00CC}', 'I'), // WITH GRAVE, LATIN CAPITAL LETTER - ('\u{00CD}', 'I'), // WITH ACUTE, LATIN CAPITAL LETTER - ('\u{00CE}', 'I'), // WITH CIRCUMFLEX, LATIN CAPITAL LETTER - ('\u{00CF}', 'I'), // WITH DIAERESIS, LATIN CAPITAL LETTER - ('\u{00D1}', 'N'), // WITH TILDE, LATIN CAPITAL LETTER - ('\u{00D2}', 'O'), // WITH GRAVE, LATIN CAPITAL LETTER - ('\u{00D3}', 'O'), // WITH ACUTE, LATIN CAPITAL LETTER - ('\u{00D4}', 'O'), // WITH CIRCUMFLEX, LATIN CAPITAL LETTER - ('\u{00D5}', 'O'), // WITH TILDE, LATIN CAPITAL LETTER - ('\u{00D6}', 'O'), // WITH DIAERESIS, LATIN CAPITAL LETTER - ('\u{00D8}', 'O'), // WITH STROKE, LATIN CAPITAL LETTER - ('\u{00D9}', 'U'), // WITH GRAVE, LATIN CAPITAL LETTER - ('\u{00DA}', 'U'), // WITH ACUTE, LATIN CAPITAL LETTER - ('\u{00DB}', 'U'), // WITH CIRCUMFLEX, LATIN CAPITAL LETTER - ('\u{00DC}', 'U'), // WITH DIAERESIS, LATIN CAPITAL LETTER - ('\u{00DD}', 'Y'), // WITH ACUTE, LATIN CAPITAL LETTER - ('\u{00DF}', 's'), // , LATIN SMALL LETTER SHARP - ('\u{00E0}', 'a'), // WITH GRAVE, LATIN SMALL LETTER - ('\u{00E1}', 'a'), // WITH ACUTE, LATIN SMALL LETTER - ('\u{00E2}', 'a'), // WITH CIRCUMFLEX, LATIN SMALL LETTER - ('\u{00E3}', 'a'), // WITH TILDE, LATIN SMALL LETTER - ('\u{00E4}', 'a'), // WITH DIAERESIS, LATIN SMALL LETTER - ('\u{00E5}', 'a'), // WITH RING ABOVE, LATIN SMALL LETTER - ('\u{00E7}', 'c'), // WITH CEDILLA, LATIN SMALL LETTER - ('\u{00E8}', 'e'), // WITH GRAVE, LATIN SMALL LETTER - ('\u{00E9}', 'e'), // WITH ACUTE, LATIN SMALL LETTER - ('\u{00EA}', 'e'), // WITH CIRCUMFLEX, LATIN SMALL LETTER - ('\u{00EB}', 'e'), // WITH DIAERESIS, LATIN SMALL LETTER - ('\u{00EC}', 'i'), // WITH GRAVE, LATIN SMALL LETTER - ('\u{00ED}', 'i'), // WITH ACUTE, LATIN SMALL LETTER - ('\u{00EE}', 'i'), // WITH CIRCUMFLEX, LATIN SMALL LETTER - ('\u{00EF}', 'i'), // WITH DIAERESIS, LATIN SMALL LETTER - ('\u{00F1}', 'n'), // WITH TILDE, LATIN SMALL LETTER - ('\u{00F2}', 'o'), // WITH GRAVE, LATIN SMALL LETTER - ('\u{00F3}', 'o'), // WITH ACUTE, LATIN SMALL LETTER - ('\u{00F4}', 'o'), // WITH CIRCUMFLEX, LATIN SMALL LETTER - ('\u{00F5}', 'o'), // WITH TILDE, LATIN SMALL LETTER - ('\u{00F6}', 'o'), // WITH DIAERESIS, LATIN SMALL LETTER - ('\u{00F8}', 'o'), // WITH STROKE, LATIN SMALL LETTER - ('\u{00F9}', 'u'), // WITH GRAVE, LATIN SMALL LETTER - ('\u{00FA}', 'u'), // WITH ACUTE, LATIN SMALL LETTER - ('\u{00FB}', 'u'), // WITH CIRCUMFLEX, LATIN SMALL LETTER - ('\u{00FC}', 'u'), // WITH DIAERESIS, LATIN SMALL LETTER - ('\u{00FD}', 'y'), // WITH ACUTE, LATIN SMALL LETTER - ('\u{00FF}', 'y'), // WITH DIAERESIS, LATIN SMALL LETTER - ('\u{0101}', 'a'), // WITH MACRON, LATIN SMALL LETTER - ('\u{0103}', 'a'), // WITH BREVE, LATIN SMALL LETTER - ('\u{0105}', 'a'), // WITH OGONEK, LATIN SMALL LETTER - ('\u{0107}', 'c'), // WITH ACUTE, LATIN SMALL LETTER - ('\u{0109}', 'c'), // WITH CIRCUMFLEX, LATIN SMALL LETTER - ('\u{010B}', 'c'), // WITH DOT ABOVE, LATIN SMALL LETTER - ('\u{010D}', 'c'), // WITH CARON, LATIN SMALL LETTER - ('\u{010F}', 'd'), // WITH CARON, LATIN SMALL LETTER - ('\u{0111}', 'd'), // WITH STROKE, LATIN SMALL LETTER - ('\u{0113}', 'e'), // WITH MACRON, LATIN SMALL LETTER - ('\u{0115}', 'e'), // WITH BREVE, LATIN SMALL LETTER - ('\u{0117}', 'e'), // WITH DOT ABOVE, LATIN SMALL LETTER - ('\u{0119}', 'e'), // WITH OGONEK, LATIN SMALL LETTER - ('\u{011B}', 'e'), // WITH CARON, LATIN SMALL LETTER - ('\u{011D}', 'g'), // WITH CIRCUMFLEX, LATIN SMALL LETTER - ('\u{011F}', 'g'), // WITH BREVE, LATIN SMALL LETTER - ('\u{0121}', 'g'), // WITH DOT ABOVE, LATIN SMALL LETTER - ('\u{0123}', 'g'), // WITH CEDILLA, LATIN SMALL LETTER - ('\u{0125}', 'h'), // WITH CIRCUMFLEX, LATIN SMALL LETTER - ('\u{0127}', 'h'), // WITH STROKE, LATIN SMALL LETTER - ('\u{0129}', 'i'), // WITH TILDE, LATIN SMALL LETTER - ('\u{012B}', 'i'), // WITH MACRON, LATIN SMALL LETTER - ('\u{012D}', 'i'), // WITH BREVE, LATIN SMALL LETTER - ('\u{012F}', 'i'), // WITH OGONEK, LATIN SMALL LETTER - ('\u{0130}', 'I'), // WITH DOT ABOVE, LATIN CAPITAL LETTER - ('\u{0131}', 'i'), // , LATIN SMALL LETTER DOTLESS - ('\u{0135}', 'j'), // WITH CIRCUMFLEX, LATIN SMALL LETTER - ('\u{0137}', 'k'), // WITH CEDILLA, LATIN SMALL LETTER - ('\u{013A}', 'l'), // WITH ACUTE, LATIN SMALL LETTER - ('\u{013C}', 'l'), // WITH CEDILLA, LATIN SMALL LETTER - ('\u{013E}', 'l'), // WITH CARON, LATIN SMALL LETTER - ('\u{0140}', 'l'), // WITH MIDDLE DOT, LATIN SMALL LETTER - ('\u{0142}', 'l'), // WITH STROKE, LATIN SMALL LETTER - ('\u{0144}', 'n'), // WITH ACUTE, LATIN SMALL LETTER - ('\u{0146}', 'n'), // WITH CEDILLA, LATIN SMALL LETTER - ('\u{0148}', 'n'), // WITH CARON, LATIN SMALL LETTER - ('\u{014D}', 'o'), // WITH MACRON, LATIN SMALL LETTER - ('\u{014F}', 'o'), // WITH BREVE, LATIN SMALL LETTER - ('\u{0151}', 'o'), // WITH DOUBLE ACUTE, LATIN SMALL LETTER - ('\u{0155}', 'r'), // WITH ACUTE, LATIN SMALL LETTER - ('\u{0157}', 'r'), // WITH CEDILLA, LATIN SMALL LETTER - ('\u{0159}', 'r'), // WITH CARON, LATIN SMALL LETTER - ('\u{015B}', 's'), // WITH ACUTE, LATIN SMALL LETTER - ('\u{015D}', 's'), // WITH CIRCUMFLEX, LATIN SMALL LETTER - ('\u{015F}', 's'), // WITH CEDILLA, LATIN SMALL LETTER - ('\u{0161}', 's'), // WITH CARON, LATIN SMALL LETTER - ('\u{0163}', 't'), // WITH CEDILLA, LATIN SMALL LETTER - ('\u{0165}', 't'), // WITH CARON, LATIN SMALL LETTER - ('\u{0167}', 't'), // WITH STROKE, LATIN SMALL LETTER - ('\u{0169}', 'u'), // WITH TILDE, LATIN SMALL LETTER - ('\u{016B}', 'u'), // WITH MACRON, LATIN SMALL LETTER - ('\u{016D}', 'u'), // WITH BREVE, LATIN SMALL LETTER - ('\u{016F}', 'u'), // WITH RING ABOVE, LATIN SMALL LETTER - ('\u{0171}', 'u'), // WITH DOUBLE ACUTE, LATIN SMALL LETTER - ('\u{0173}', 'u'), // WITH OGONEK, LATIN SMALL LETTER - ('\u{0175}', 'w'), // WITH CIRCUMFLEX, LATIN SMALL LETTER - ('\u{0177}', 'y'), // WITH CIRCUMFLEX, LATIN SMALL LETTER - ('\u{0178}', 'Y'), // WITH DIAERESIS, LATIN CAPITAL LETTER - ('\u{017A}', 'z'), // WITH ACUTE, LATIN SMALL LETTER - ('\u{017C}', 'z'), // WITH DOT ABOVE, LATIN SMALL LETTER - ('\u{017E}', 'z'), // WITH CARON, LATIN SMALL LETTER - ('\u{017F}', 's'), // , LATIN SMALL LETTER LONG - ('\u{0180}', 'b'), // WITH STROKE, LATIN SMALL LETTER - ('\u{0181}', 'B'), // WITH HOOK, LATIN CAPITAL LETTER - ('\u{0183}', 'b'), // WITH TOPBAR, LATIN SMALL LETTER - ('\u{0186}', 'O'), // , LATIN CAPITAL LETTER OPEN - ('\u{0188}', 'c'), // WITH HOOK, LATIN SMALL LETTER - ('\u{0189}', 'D'), // , LATIN CAPITAL LETTER AFRICAN - ('\u{018A}', 'D'), // WITH HOOK, LATIN CAPITAL LETTER - ('\u{018C}', 'd'), // WITH TOPBAR, LATIN SMALL LETTER - ('\u{018E}', 'E'), // , LATIN CAPITAL LETTER REVERSED - ('\u{0190}', 'E'), // , LATIN CAPITAL LETTER OPEN - ('\u{0192}', 'f'), // WITH HOOK, LATIN SMALL LETTER - ('\u{0193}', 'G'), // WITH HOOK, LATIN CAPITAL LETTER - ('\u{0197}', 'I'), // WITH STROKE, LATIN CAPITAL LETTER - ('\u{0199}', 'k'), // WITH HOOK, LATIN SMALL LETTER - ('\u{019A}', 'l'), // WITH BAR, LATIN SMALL LETTER - ('\u{019C}', 'M'), // , LATIN CAPITAL LETTER TURNED - ('\u{019D}', 'N'), // WITH LEFT HOOK, LATIN CAPITAL LETTER - ('\u{019E}', 'n'), // WITH LONG RIGHT LEG, LATIN SMALL LETTER - ('\u{019F}', 'O'), // WITH MIDDLE TILDE, LATIN CAPITAL LETTER - ('\u{01A1}', 'o'), // WITH HORN, LATIN SMALL LETTER - ('\u{01A5}', 'p'), // WITH HOOK, LATIN SMALL LETTER - ('\u{01AB}', 't'), // WITH PALATAL HOOK, LATIN SMALL LETTER - ('\u{01AD}', 't'), // WITH HOOK, LATIN SMALL LETTER - ('\u{01AE}', 'T'), // WITH RETROFLEX HOOK, LATIN CAPITAL LETTER - ('\u{01B0}', 'u'), // WITH HORN, LATIN SMALL LETTER - ('\u{01B2}', 'V'), // WITH HOOK, LATIN CAPITAL LETTER - ('\u{01B4}', 'y'), // WITH HOOK, LATIN SMALL LETTER - ('\u{01B6}', 'z'), // WITH STROKE, LATIN SMALL LETTER - ('\u{01CE}', 'a'), // WITH CARON, LATIN SMALL LETTER - ('\u{01D0}', 'i'), // WITH CARON, LATIN SMALL LETTER - ('\u{01D2}', 'o'), // WITH CARON, LATIN SMALL LETTER - ('\u{01D4}', 'u'), // WITH CARON, LATIN SMALL LETTER - ('\u{01DD}', 'e'), // , LATIN SMALL LETTER TURNED - ('\u{01E5}', 'g'), // WITH STROKE, LATIN SMALL LETTER - ('\u{01E7}', 'g'), // WITH CARON, LATIN SMALL LETTER - ('\u{01E9}', 'k'), // WITH CARON, LATIN SMALL LETTER - ('\u{01EB}', 'o'), // WITH OGONEK, LATIN SMALL LETTER - ('\u{01F0}', 'j'), // WITH CARON, LATIN SMALL LETTER - ('\u{01F5}', 'g'), // WITH ACUTE, LATIN SMALL LETTER - ('\u{01F9}', 'n'), // WITH GRAVE, LATIN SMALL LETTER - ('\u{0201}', 'a'), // WITH DOUBLE GRAVE, LATIN SMALL LETTER - ('\u{0203}', 'a'), // WITH INVERTED BREVE, LATIN SMALL LETTER - ('\u{0205}', 'e'), // WITH DOUBLE GRAVE, LATIN SMALL LETTER - ('\u{0207}', 'e'), // WITH INVERTED BREVE, LATIN SMALL LETTER - ('\u{0209}', 'i'), // WITH DOUBLE GRAVE, LATIN SMALL LETTER - ('\u{020B}', 'i'), // WITH INVERTED BREVE, LATIN SMALL LETTER - ('\u{020D}', 'o'), // WITH DOUBLE GRAVE, LATIN SMALL LETTER - ('\u{020F}', 'o'), // WITH INVERTED BREVE, LATIN SMALL LETTER - ('\u{0211}', 'r'), // WITH DOUBLE GRAVE, LATIN SMALL LETTER - ('\u{0213}', 'r'), // WITH INVERTED BREVE, LATIN SMALL LETTER - ('\u{0215}', 'u'), // WITH DOUBLE GRAVE, LATIN SMALL LETTER - ('\u{0217}', 'u'), // WITH INVERTED BREVE, LATIN SMALL LETTER - ('\u{0219}', 's'), // WITH COMMA BELOW, LATIN SMALL LETTER - ('\u{021B}', 't'), // WITH COMMA BELOW, LATIN SMALL LETTER - ('\u{021F}', 'h'), // WITH CARON, LATIN SMALL LETTER - ('\u{0220}', 'N'), // WITH LONG RIGHT LEG, LATIN CAPITAL LETTER - ('\u{0221}', 'd'), // WITH CURL, LATIN SMALL LETTER - ('\u{0225}', 'z'), // WITH HOOK, LATIN SMALL LETTER - ('\u{0227}', 'a'), // WITH DOT ABOVE, LATIN SMALL LETTER - ('\u{0229}', 'e'), // WITH CEDILLA, LATIN SMALL LETTER - ('\u{022F}', 'o'), // WITH DOT ABOVE, LATIN SMALL LETTER - ('\u{0233}', 'y'), // WITH MACRON, LATIN SMALL LETTER - ('\u{0234}', 'l'), // WITH CURL, LATIN SMALL LETTER - ('\u{0235}', 'n'), // WITH CURL, LATIN SMALL LETTER - ('\u{0236}', 't'), // WITH CURL, LATIN SMALL LETTER - ('\u{0237}', 'j'), // , LATIN SMALL LETTER DOTLESS - ('\u{023A}', 'A'), // WITH STROKE, LATIN CAPITAL LETTER - ('\u{023B}', 'C'), // WITH STROKE, LATIN CAPITAL LETTER - ('\u{023C}', 'c'), // WITH STROKE, LATIN SMALL LETTER - ('\u{023D}', 'L'), // WITH BAR, LATIN CAPITAL LETTER - ('\u{023E}', 'T'), // WITH DIAGONAL STROKE, LATIN CAPITAL LETTER - ('\u{023F}', 's'), // WITH SWASH TAIL, LATIN SMALL LETTER - ('\u{0240}', 'z'), // WITH SWASH TAIL, LATIN SMALL LETTER - ('\u{0243}', 'B'), // WITH STROKE, LATIN CAPITAL LETTER - ('\u{0244}', 'U'), // BAR, LATIN CAPITAL LETTER - ('\u{0245}', 'V'), // , LATIN CAPITAL LETTER TURNED - ('\u{0246}', 'E'), // WITH STROKE, LATIN CAPITAL LETTER - ('\u{0247}', 'e'), // WITH STROKE, LATIN SMALL LETTER - ('\u{0248}', 'J'), // WITH STROKE, LATIN CAPITAL LETTER - ('\u{0249}', 'j'), // WITH STROKE, LATIN SMALL LETTER - ('\u{024A}', 'Q'), // WITH HOOK TAIL, LATIN CAPITAL LETTER SMALL - ('\u{024B}', 'q'), // WITH HOOK TAIL, LATIN SMALL LETTER - ('\u{024C}', 'R'), // WITH STROKE, LATIN CAPITAL LETTER - ('\u{024D}', 'r'), // WITH STROKE, LATIN SMALL LETTER - ('\u{024E}', 'Y'), // WITH STROKE, LATIN CAPITAL LETTER - ('\u{024F}', 'y'), // WITH STROKE, LATIN SMALL LETTER - ('\u{0250}', 'a'), // , LATIN SMALL LETTER TURNED - ('\u{0251}', 'a'), // , latin small letter script - ('\u{0253}', 'b'), // WITH HOOK, LATIN SMALL LETTER - ('\u{0254}', 'o'), // , LATIN SMALL LETTER OPEN - ('\u{0255}', 'c'), // WITH CURL, LATIN SMALL LETTER - ('\u{0256}', 'd'), // WITH TAIL, LATIN SMALL LETTER - ('\u{0257}', 'd'), // WITH HOOK, LATIN SMALL LETTER - ('\u{0258}', 'e'), // , LATIN SMALL LETTER REVERSED - ('\u{025B}', 'e'), // , LATIN SMALL LETTER OPEN - ('\u{025C}', 'e'), // , LATIN SMALL LETTER REVERSED OPEN - ('\u{025D}', 'e'), // WITH HOOK, LATIN SMALL LETTER REVERSED OPEN - ('\u{025E}', 'e'), // , LATIN SMALL LETTER CLOSED REVERSED OPEN - ('\u{025F}', 'j'), // WITH STROKE, LATIN SMALL LETTER DOTLESS - ('\u{0260}', 'g'), // WITH HOOK, LATIN SMALL LETTER - ('\u{0261}', 'g'), // , LATIN SMALL LETTER SCRIPT - ('\u{0262}', 'G'), // , LATIN LETTER SMALL CAPITAL - ('\u{0265}', 'h'), // , LATIN SMALL LETTER TURNED - ('\u{0266}', 'h'), // WITH HOOK, LATIN SMALL LETTER - ('\u{0268}', 'i'), // WITH STROKE, LATIN SMALL LETTER - ('\u{026A}', 'I'), // , LATIN LETTER SMALL CAPITAL - ('\u{026B}', 'l'), // WITH MIDDLE TILDE, LATIN SMALL LETTER - ('\u{026C}', 'l'), // WITH BELT, LATIN SMALL LETTER - ('\u{026D}', 'l'), // WITH RETROFLEX HOOK, LATIN SMALL LETTER - ('\u{026F}', 'm'), // , LATIN SMALL LETTER TURNED - ('\u{0270}', 'm'), // WITH LONG LEG, LATIN SMALL LETTER TURNED - ('\u{0271}', 'm'), // WITH HOOK, LATIN SMALL LETTER - ('\u{0272}', 'n'), // WITH LEFT HOOK, LATIN SMALL LETTER - ('\u{0273}', 'n'), // WITH RETROFLEX HOOK, LATIN SMALL LETTER - ('\u{0274}', 'N'), // , LATIN LETTER SMALL CAPITAL - ('\u{0275}', 'o'), // , LATIN SMALL LETTER BARRED - ('\u{0279}', 'r'), // , LATIN SMALL LETTER TURNED - ('\u{027A}', 'r'), // WITH LONG LEG, LATIN SMALL LETTER TURNED - ('\u{027B}', 'r'), // WITH HOOK, LATIN SMALL LETTER TURNED - ('\u{027C}', 'r'), // WITH LONG LEG, LATIN SMALL LETTER - ('\u{027D}', 'r'), // WITH TAIL, LATIN SMALL LETTER - ('\u{027E}', 'r'), // WITH FISHHOOK, LATIN SMALL LETTER - ('\u{027F}', 'r'), // WITH FISHHOOK, LATIN SMALL LETTER REVERSED - ('\u{0280}', 'R'), // , LATIN LETTER SMALL CAPITAL - ('\u{0281}', 'R'), // , LATIN LETTER SMALL CAPITAL INVERTED - ('\u{0282}', 's'), // WITH HOOK, LATIN SMALL LETTER - ('\u{0287}', 't'), // , LATIN SMALL LETTER TURNED - ('\u{0288}', 't'), // WITH RETROFLEX HOOK, LATIN SMALL LETTER - ('\u{0289}', 'u'), // BAR, LATIN SMALL LETTER - ('\u{028B}', 'v'), // WITH HOOK, LATIN SMALL LETTER - ('\u{028C}', 'v'), // , LATIN SMALL LETTER TURNED - ('\u{028D}', 'w'), // , LATIN SMALL LETTER TURNED - ('\u{028E}', 'y'), // , LATIN SMALL LETTER TURNED - ('\u{028F}', 'Y'), // , LATIN LETTER SMALL CAPITAL - ('\u{0290}', 'z'), // WITH RETROFLEX HOOK, LATIN SMALL LETTER - ('\u{0291}', 'z'), // WITH CURL, LATIN SMALL LETTER - ('\u{0297}', 'c'), // , LATIN LETTER STRETCHED - ('\u{0299}', 'B'), // , LATIN LETTER SMALL CAPITAL - ('\u{029A}', 'e'), // , LATIN SMALL LETTER CLOSED OPEN - ('\u{029B}', 'G'), // WITH HOOK, LATIN LETTER SMALL CAPITAL - ('\u{029C}', 'H'), // , LATIN LETTER SMALL CAPITAL - ('\u{029D}', 'j'), // WITH CROSSED-TAIL, LATIN SMALL LETTER - ('\u{029E}', 'k'), // , LATIN SMALL LETTER TURNED - ('\u{029F}', 'L'), // , LATIN LETTER SMALL CAPITAL - ('\u{02A0}', 'q'), // WITH HOOK, LATIN SMALL LETTER - ('\u{02AE}', 'h'), // WITH FISHHOOK, LATIN SMALL LETTER TURNED - ('\u{0363}', 'a'), // , COMBINING LATIN SMALL LETTER - ('\u{0364}', 'e'), // , COMBINING LATIN SMALL LETTER - ('\u{0365}', 'i'), // , COMBINING LATIN SMALL LETTER - ('\u{0366}', 'o'), // , COMBINING LATIN SMALL LETTER - ('\u{0367}', 'u'), // , COMBINING LATIN SMALL LETTER - ('\u{0368}', 'c'), // , COMBINING LATIN SMALL LETTER - ('\u{0369}', 'd'), // , COMBINING LATIN SMALL LETTER - ('\u{036A}', 'h'), // , COMBINING LATIN SMALL LETTER - ('\u{036B}', 'm'), // , COMBINING LATIN SMALL LETTER - ('\u{036C}', 'r'), // , COMBINING LATIN SMALL LETTER - ('\u{036D}', 't'), // , COMBINING LATIN SMALL LETTER - ('\u{036E}', 'v'), // , COMBINING LATIN SMALL LETTER - ('\u{036F}', 'x'), // , COMBINING LATIN SMALL LETTER -]; - -const DATA2: [(char, char); 167] = [ - ('\u{1D00}', 'A'), // , LATIN LETTER SMALL CAPITAL - ('\u{1D03}', 'B'), // , LATIN LETTER SMALL CAPITAL BARRED - ('\u{1D04}', 'C'), // , LATIN LETTER SMALL CAPITAL - ('\u{1D05}', 'D'), // , LATIN LETTER SMALL CAPITAL - ('\u{1D07}', 'E'), // , LATIN LETTER SMALL CAPITAL - ('\u{1D08}', 'e'), // , LATIN SMALL LETTER TURNED OPEN - ('\u{1D09}', 'i'), // , LATIN SMALL LETTER TURNED - ('\u{1D0A}', 'J'), // , LATIN LETTER SMALL CAPITAL - ('\u{1D0B}', 'K'), // , LATIN LETTER SMALL CAPITAL - ('\u{1D0C}', 'L'), // WITH STROKE, LATIN LETTER SMALL CAPITAL - ('\u{1D0D}', 'M'), // , LATIN LETTER SMALL CAPITAL - ('\u{1D0E}', 'N'), // , LATIN LETTER SMALL CAPITAL REVERSED - ('\u{1D0F}', 'O'), // , LATIN LETTER SMALL CAPITAL - ('\u{1D10}', 'O'), // , LATIN LETTER SMALL CAPITAL OPEN - ('\u{1D11}', 'o'), // , LATIN SMALL LETTER SIDEWAYS - ('\u{1D12}', 'o'), // , LATIN SMALL LETTER SIDEWAYS OPEN - ('\u{1D13}', 'o'), // WITH STROKE, LATIN SMALL LETTER SIDEWAYS - ('\u{1D16}', 'o'), // , LATIN SMALL LETTER TOP HALF - ('\u{1D17}', 'o'), // , LATIN SMALL LETTER BOTTOM HALF - ('\u{1D18}', 'P'), // , LATIN LETTER SMALL CAPITAL - ('\u{1D19}', 'R'), // , LATIN LETTER SMALL CAPITAL REVERSED - ('\u{1D1A}', 'R'), // , LATIN LETTER SMALL CAPITAL TURNED - ('\u{1D1B}', 'T'), // , LATIN LETTER SMALL CAPITAL - ('\u{1D1C}', 'U'), // , LATIN LETTER SMALL CAPITAL - ('\u{1D1D}', 'u'), // , LATIN SMALL LETTER SIDEWAYS - ('\u{1D1E}', 'u'), // , LATIN SMALL LETTER SIDEWAYS DIAERESIZED - ('\u{1D1F}', 'm'), // , LATIN SMALL LETTER SIDEWAYS TURNED - ('\u{1D20}', 'V'), // , LATIN LETTER SMALL CAPITAL - ('\u{1D21}', 'W'), // , LATIN LETTER SMALL CAPITAL - ('\u{1D22}', 'Z'), // , LATIN LETTER SMALL CAPITAL - ('\u{1D62}', 'i'), // , LATIN SUBSCRIPT SMALL LETTER - ('\u{1D63}', 'r'), // , LATIN SUBSCRIPT SMALL LETTER - ('\u{1D64}', 'u'), // , LATIN SUBSCRIPT SMALL LETTER - ('\u{1D65}', 'v'), // , LATIN SUBSCRIPT SMALL LETTER - ('\u{1E01}', 'a'), // WITH RING BELOW, LATIN SMALL LETTER - ('\u{1E03}', 'b'), // WITH DOT ABOVE, LATIN SMALL LETTER - ('\u{1E05}', 'b'), // WITH DOT BELOW, LATIN SMALL LETTER - ('\u{1E07}', 'b'), // WITH LINE BELOW, LATIN SMALL LETTER - ('\u{1E0B}', 'd'), // WITH DOT ABOVE, LATIN SMALL LETTER - ('\u{1E0D}', 'd'), // WITH DOT BELOW, LATIN SMALL LETTER - ('\u{1E0F}', 'd'), // WITH LINE BELOW, LATIN SMALL LETTER - ('\u{1E11}', 'd'), // WITH CEDILLA, LATIN SMALL LETTER - ('\u{1E13}', 'd'), // WITH CIRCUMFLEX BELOW, LATIN SMALL LETTER - ('\u{1E19}', 'e'), // WITH CIRCUMFLEX BELOW, LATIN SMALL LETTER - ('\u{1E1B}', 'e'), // WITH TILDE BELOW, LATIN SMALL LETTER - ('\u{1E1F}', 'f'), // WITH DOT ABOVE, LATIN SMALL LETTER - ('\u{1E21}', 'g'), // WITH MACRON, LATIN SMALL LETTER - ('\u{1E23}', 'h'), // WITH DOT ABOVE, LATIN SMALL LETTER - ('\u{1E25}', 'h'), // WITH DOT BELOW, LATIN SMALL LETTER - ('\u{1E27}', 'h'), // WITH DIAERESIS, LATIN SMALL LETTER - ('\u{1E29}', 'h'), // WITH CEDILLA, LATIN SMALL LETTER - ('\u{1E2B}', 'h'), // WITH BREVE BELOW, LATIN SMALL LETTER - ('\u{1E2D}', 'i'), // WITH TILDE BELOW, LATIN SMALL LETTER - ('\u{1E31}', 'k'), // WITH ACUTE, LATIN SMALL LETTER - ('\u{1E33}', 'k'), // WITH DOT BELOW, LATIN SMALL LETTER - ('\u{1E35}', 'k'), // WITH LINE BELOW, LATIN SMALL LETTER - ('\u{1E37}', 'l'), // WITH DOT BELOW, LATIN SMALL LETTER ('\u{1E3B}', 'l'), // WITH LINE BELOW, LATIN SMALL LETTER - ('\u{1E3D}', 'l'), // WITH CIRCUMFLEX BELOW, LATIN SMALL LETTER - ('\u{1E3F}', 'm'), // WITH ACUTE, LATIN SMALL LETTER - ('\u{1E41}', 'm'), // WITH DOT ABOVE, LATIN SMALL LETTER - ('\u{1E43}', 'm'), // WITH DOT BELOW, LATIN SMALL LETTER - ('\u{1E45}', 'n'), // WITH DOT ABOVE, LATIN SMALL LETTER - ('\u{1E47}', 'n'), // WITH DOT BELOW, LATIN SMALL LETTER - ('\u{1E49}', 'n'), // WITH LINE BELOW, LATIN SMALL LETTER - ('\u{1E4B}', 'n'), // WITH CIRCUMFLEX BELOW, LATIN SMALL LETTER - ('\u{1E55}', 'p'), // WITH ACUTE, LATIN SMALL LETTER - ('\u{1E57}', 'p'), // WITH DOT ABOVE, LATIN SMALL LETTER ('\u{1E59}', 'r'), // WITH DOT ABOVE, LATIN SMALL LETTER - ('\u{1E5B}', 'r'), // WITH DOT BELOW, LATIN SMALL LETTER - ('\u{1E5F}', 'r'), // WITH LINE BELOW, LATIN SMALL LETTER - ('\u{1E61}', 's'), // WITH DOT ABOVE, LATIN SMALL LETTER - ('\u{1E63}', 's'), // WITH DOT BELOW, LATIN SMALL LETTER - ('\u{1E6B}', 't'), // WITH DOT ABOVE, LATIN SMALL LETTER - ('\u{1E6D}', 't'), // WITH DOT BELOW, LATIN SMALL LETTER - ('\u{1E6F}', 't'), // WITH LINE BELOW, LATIN SMALL LETTER - ('\u{1E71}', 't'), // WITH CIRCUMFLEX BELOW, LATIN SMALL LETTER - ('\u{1E73}', 'u'), // WITH DIAERESIS BELOW, LATIN SMALL LETTER - ('\u{1E75}', 'u'), // WITH TILDE BELOW, LATIN SMALL LETTER ('\u{1E77}', 'u'), // WITH CIRCUMFLEX BELOW, LATIN SMALL LETTER - ('\u{1E7D}', 'v'), // WITH TILDE, LATIN SMALL LETTER - ('\u{1E7F}', 'v'), // WITH DOT BELOW, LATIN SMALL LETTER - ('\u{1E81}', 'w'), // WITH GRAVE, LATIN SMALL LETTER - ('\u{1E83}', 'w'), // WITH ACUTE, LATIN SMALL LETTER - ('\u{1E85}', 'w'), // WITH DIAERESIS, LATIN SMALL LETTER ('\u{1E87}', 'w'), // WITH DOT ABOVE, LATIN SMALL LETTER - ('\u{1E89}', 'w'), // WITH DOT BELOW, LATIN SMALL LETTER - ('\u{1E8B}', 'x'), // WITH DOT ABOVE, LATIN SMALL LETTER - ('\u{1E8D}', 'x'), // WITH DIAERESIS, LATIN SMALL LETTER - ('\u{1E8F}', 'y'), // WITH DOT ABOVE, LATIN SMALL LETTER - ('\u{1E91}', 'z'), // WITH CIRCUMFLEX, LATIN SMALL LETTER ('\u{1E93}', 'z'), // WITH DOT BELOW, LATIN SMALL LETTER - ('\u{1E95}', 'z'), // WITH LINE BELOW, LATIN SMALL LETTER - ('\u{1E96}', 'h'), // WITH LINE BELOW, LATIN SMALL LETTER - ('\u{1E97}', 't'), // WITH DIAERESIS, LATIN SMALL LETTER - ('\u{1E98}', 'w'), // WITH RING ABOVE, LATIN SMALL LETTER - ('\u{1E99}', 'y'), // WITH RING ABOVE, LATIN SMALL LETTER - ('\u{1E9A}', 'a'), // WITH RIGHT HALF RING, LATIN SMALL LETTER - ('\u{1E9B}', 's'), // WITH DOT ABOVE, LATIN SMALL LETTER LONG - ('\u{1EA1}', 'a'), // WITH DOT BELOW, LATIN SMALL LETTER - ('\u{1EA3}', 'a'), // WITH HOOK ABOVE, LATIN SMALL LETTER - ('\u{1EB9}', 'e'), // WITH DOT BELOW, LATIN SMALL LETTER ('\u{1EBB}', 'e'), // WITH HOOK ABOVE, LATIN SMALL LETTER - ('\u{1EBD}', 'e'), // WITH TILDE, LATIN SMALL LETTER - ('\u{1EC9}', 'i'), // WITH HOOK ABOVE, LATIN SMALL LETTER - ('\u{1ECB}', 'i'), // WITH DOT BELOW, LATIN SMALL LETTER - ('\u{1ECD}', 'o'), // WITH DOT BELOW, LATIN SMALL LETTER - ('\u{1ECF}', 'o'), // WITH HOOK ABOVE, LATIN SMALL LETTER - ('\u{1EE5}', 'u'), // WITH DOT BELOW, LATIN SMALL LETTER - ('\u{1EE7}', 'u'), // WITH HOOK ABOVE, LATIN SMALL LETTER - ('\u{1EF3}', 'y'), // WITH GRAVE, LATIN SMALL LETTER - ('\u{1EF5}', 'y'), // WITH DOT BELOW, LATIN SMALL LETTER - ('\u{1EF7}', 'y'), // WITH HOOK ABOVE, LATIN SMALL LETTER ('\u{1EF9}', 'y'), // WITH TILDE, LATIN SMALL LETTER - ('\u{1ea4}', 'A'), - ('\u{1ea5}', 'a'), - ('\u{1ea6}', 'A'), - ('\u{1ea7}', 'a'), - ('\u{1ea8}', 'A'), - ('\u{1ea9}', 'a'), - ('\u{1eaa}', 'A'), - ('\u{1eab}', 'a'), - ('\u{1eac}', 'A'), - ('\u{1ead}', 'a'), - ('\u{1eae}', 'A'), - ('\u{1eaf}', 'a'), - ('\u{1eb0}', 'A'), - ('\u{1eb1}', 'a'), - ('\u{1eb2}', 'A'), - ('\u{1eb3}', 'a'), - ('\u{1eb4}', 'A'), - ('\u{1eb5}', 'a'), - ('\u{1eb6}', 'A'), - ('\u{1eb7}', 'a'), - ('\u{1ebe}', 'E'), - ('\u{1ebf}', 'e'), - ('\u{1ec0}', 'E'), - ('\u{1ec1}', 'e'), - ('\u{1ec2}', 'E'), - ('\u{1ec3}', 'e'), - ('\u{1ec4}', 'E'), - ('\u{1ec5}', 'e'), - ('\u{1ec6}', 'E'), - ('\u{1ec7}', 'e'), - ('\u{1ed0}', 'O'), - ('\u{1ed1}', 'o'), - ('\u{1ed2}', 'O'), - ('\u{1ed3}', 'o'), - ('\u{1ed4}', 'O'), - ('\u{1ed5}', 'o'), - ('\u{1ed6}', 'O'), - ('\u{1ed7}', 'o'), - ('\u{1ed8}', 'O'), - ('\u{1ed9}', 'o'), - ('\u{1eda}', 'O'), - ('\u{1edb}', 'o'), - ('\u{1edc}', 'O'), - ('\u{1edd}', 'o'), - ('\u{1ede}', 'O'), - ('\u{1edf}', 'o'), - ('\u{1ee0}', 'O'), - ('\u{1ee1}', 'o'), - ('\u{1ee2}', 'O'), - ('\u{1ee3}', 'o'), - ('\u{1ee8}', 'U'), - ('\u{1ee9}', 'u'), - ('\u{1eea}', 'U'), - ('\u{1eeb}', 'u'), - ('\u{1eec}', 'U'), - ('\u{1eed}', 'u'), - ('\u{1eee}', 'U'), - ('\u{1eef}', 'u'), - ('\u{1ef0}', 'U'), - ('\u{1ef1}', 'u'), -]; - -const DATA3: [(char, char); 9] = [ - ('\u{2071}', 'i'), // , SUPERSCRIPT LATIN SMALL LETTER - ('\u{2095}', 'h'), // , LATIN SUBSCRIPT SMALL LETTER - ('\u{2096}', 'k'), // , LATIN SUBSCRIPT SMALL LETTER - ('\u{2097}', 'l'), // , LATIN SUBSCRIPT SMALL LETTER - ('\u{2098}', 'm'), // , LATIN SUBSCRIPT SMALL LETTER0x2099: 'n', // , LATIN SUBSCRIPT SMALL LETTER - ('\u{209A}', 'p'), // , LATIN SUBSCRIPT SMALL LETTER - ('\u{209B}', 's'), // , LATIN SUBSCRIPT SMALL LETTER - ('\u{209C}', 't'), // , LATIN SUBSCRIPT SMALL LETTER - ('\u{2184}', 'c'), // , LATIN SMALL LETTER REVERSED -]; - -const DATA1_START: u32 = DATA1[0].0 as u32; -const DATA1_END: u32 = DATA1[DATA1.len() - 1].0 as u32 + 1; -const LEN1: usize = (DATA1_END - DATA1_START) as usize; -static TABLE1: [char; LEN1] = generate_table(&DATA1); - -const fn generate_table(sparse_data: &[(char, char)]) -> [char; LEN] { - let mut table: [char; LEN] = ['\0'; LEN]; - let start = sparse_data[0].0 as u32; - let mut i = 0u32; - let mut j = 0; - while i < table.len() as u32 { - let key = unsafe { transmute(start + i) }; - if sparse_data[j].0 == key { - table[i as usize] = DATA1[j].1; - j += 1; - } else { - //identity - table[i as usize] = key; - } - i += 1; - } - table -} -const DATA2_START: u32 = DATA2[0].0 as u32; -const DATA2_END: u32 = DATA2[DATA2.len() - 1].0 as u32 + 1; -const LEN2: usize = (DATA2_END - DATA2_START) as usize; -static TABLE2: [char; LEN2] = generate_table(&DATA2); - -const DATA3_START: u32 = DATA3[0].0 as u32; -const DATA3_END: u32 = DATA3[DATA3.len() - 1].0 as u32 + 1; -const LEN3: usize = (DATA3_END - DATA3_START) as usize; -static TABLE3: [char; LEN3] = generate_table(&DATA3); - -/// Normalizes a unicode character by converting latin characters -/// which are variants of ASCII characters to their latin equivant. -/// -/// # Example -/// -/// ``` rust -/// # use nucleo_matcher::chars::normalize; -/// -/// assert_eq!(normalize('ä'), 'a'); -/// ``` -pub fn normalize(c: char) -> char { - let i = c as u32; - if i < DATA1_START || i >= DATA3_END { - return c; - } - if i < DATA1_END { - return TABLE1[(i - DATA1_START) as usize]; - } - if i < DATA2_START { - return c; - } - if i < DATA2_END { - return TABLE2[(i - DATA2_START) as usize]; - } - if i < DATA3_START { - return c; - } - TABLE3[(i - DATA3_START) as usize] -} diff --git a/matcher/src/config.rs b/matcher/src/config.rs deleted file mode 100644 index eca7ae38..00000000 --- a/matcher/src/config.rs +++ /dev/null @@ -1,70 +0,0 @@ -use crate::chars::CharClass; -use crate::score::BONUS_BOUNDARY; - -/// Configuration data that controls how a matcher behaves -#[non_exhaustive] -#[derive(PartialEq, Eq, Debug, Clone)] -pub struct Config { - /// Characters that act as delimiters and provide bonus - /// for matching the following char - pub(crate) delimiter_chars: &'static [u8], - /// Extra bonus for word boundary after whitespace character or beginning of the string - pub(crate) bonus_boundary_white: u16, - /// Extra bonus for word boundary after slash, colon, semi-colon, and comma - pub(crate) bonus_boundary_delimiter: u16, - pub(crate) initial_char_class: CharClass, - - /// Whether to normalize latin script characters to ASCII (enabled by default) - pub normalize: bool, - /// whether to ignore casing - pub ignore_case: bool, - /// Whether to provide a bonus to matches by their distance from the start - /// of the haystack. The bonus is fairly small compared to the normal gap - /// penalty to avoid messing with the normal score heuristic. This setting - /// is not turned on by default and only recommended for autocompletion - /// usecases where the expectation is that the user is typing the entire - /// match. For a full fzf-like fuzzy matcher/picker word segmentation and - /// explicit prefix literals should be used instead. - pub prefer_prefix: bool, -} - -impl Config { - /// The default config for nucleo, implemented as a constant since - /// Default::default can not be called in a const context - pub const DEFAULT: Self = { - Config { - delimiter_chars: b"/,:;|", - bonus_boundary_white: BONUS_BOUNDARY + 2, - bonus_boundary_delimiter: BONUS_BOUNDARY + 1, - initial_char_class: CharClass::Whitespace, - normalize: true, - ignore_case: true, - prefer_prefix: false, - } - }; -} - -impl Config { - /// Configures the matcher with bonuses appropriate for matching file paths. - pub fn set_match_paths(&mut self) { - if cfg!(windows) { - self.delimiter_chars = b"/:\\"; - } else { - self.delimiter_chars = b"/:"; - } - self.bonus_boundary_white = BONUS_BOUNDARY; - self.initial_char_class = CharClass::Delimiter; - } - - /// Configures the matcher with bonuses appropriate for matching file paths. - pub const fn match_paths(mut self) -> Self { - if cfg!(windows) { - self.delimiter_chars = b"/\\"; - } else { - self.delimiter_chars = b"/"; - } - self.bonus_boundary_white = BONUS_BOUNDARY; - self.initial_char_class = CharClass::Delimiter; - self - } -} diff --git a/matcher/src/debug.rs b/matcher/src/debug.rs deleted file mode 100644 index 364676cb..00000000 --- a/matcher/src/debug.rs +++ /dev/null @@ -1,32 +0,0 @@ -use crate::matrix::{MatrixCell, ScoreCell}; -use std::fmt::{Debug, Formatter, Result}; - -// impl MatcherData<'_, C> { -// pub fn rows(&self) -> impl Iterator + ExactSizeIterator + Clone + Sized { -// let mut cells = &*self.cells; -// self.row_offs.iter().map(move |&off| { -// let len = self.haystack.len() - off as usize; -// let (row, tmp) = cells.split_at(len); -// cells = tmp; -// MatrixRow { off, cells: row } -// }) -// } - -// pub fn haystack( -// &self, -// ) -> impl Iterator> + ExactSizeIterator + '_ + Clone { -// haystack(self.haystack, self.bonus, 0) -// } -// } - -impl Debug for ScoreCell { - fn fmt(&self, f: &mut Formatter<'_>) -> Result { - write!(f, "({}, {})", self.score, self.matched) - } -} - -impl Debug for MatrixCell { - fn fmt(&self, f: &mut Formatter<'_>) -> Result { - write!(f, "({}, {})", (self.0 & 1) != 0, (self.0 & 2) != 0) - } -} diff --git a/matcher/src/exact.rs b/matcher/src/exact.rs deleted file mode 100644 index 49388fd5..00000000 --- a/matcher/src/exact.rs +++ /dev/null @@ -1,274 +0,0 @@ -use memchr::memmem; -use memchr::{Memchr, Memchr2}; - -use crate::chars::{AsciiChar, Char}; -use crate::score::{BONUS_FIRST_CHAR_MULTIPLIER, SCORE_MATCH}; -use crate::Matcher; - -impl Matcher { - pub(crate) fn substring_match_1_ascii( - &mut self, - haystack: &[u8], - c: u8, - indices: &mut Vec, - ) -> Option { - let mut max_score = 0; - let mut max_pos = 0; - if self.config.ignore_case && c >= b'a' && c <= b'z' { - for i in Memchr2::new(c, c - 32, haystack) { - let prev_char_class = i - .checked_sub(1) - .map(|i| AsciiChar(haystack[i]).char_class(&self.config)) - .unwrap_or(self.config.initial_char_class); - let char_class = AsciiChar(haystack[i]).char_class(&self.config); - let bonus = self.config.bonus_for(prev_char_class, char_class); - let score = bonus * BONUS_FIRST_CHAR_MULTIPLIER + SCORE_MATCH; - if score > max_score { - max_pos = i as u32; - max_score = score; - // can't get better than this - if bonus >= self.config.bonus_boundary_white { - break; - } - } - } - } else { - let char_class = AsciiChar(c).char_class(&self.config); - for i in Memchr::new(c, haystack) { - let prev_char_class = i - .checked_sub(1) - .map(|i| AsciiChar(haystack[i]).char_class(&self.config)) - .unwrap_or(self.config.initial_char_class); - let bonus = self.config.bonus_for(prev_char_class, char_class); - let score = bonus * BONUS_FIRST_CHAR_MULTIPLIER + SCORE_MATCH; - if score > max_score { - max_pos = i as u32; - max_score = score; - // can't get better than this - if bonus >= self.config.bonus_boundary_white { - break; - } - } - } - } - if max_score == 0 { - return None; - } - - if INDICES { - indices.push(max_pos); - } - Some(max_score) - } - - pub(crate) fn substring_match_ascii_with_prefilter( - &mut self, - haystack: &[u8], - needle: &[u8], - prefilter_len: usize, - prefilter: impl Iterator, - ) -> (u16, usize) { - let needle_without_prefilter = &needle[prefilter_len..]; - let mut max_score = 0; - let mut max_pos = 0; - for i in prefilter { - let prev_char_class = i - .checked_sub(1) - .map(|i| AsciiChar(haystack[i]).char_class(&self.config)) - .unwrap_or(self.config.initial_char_class); - let char_class = AsciiChar(haystack[i]).char_class(&self.config); - let bonus = self.config.bonus_for(prev_char_class, char_class); - let score = bonus * BONUS_FIRST_CHAR_MULTIPLIER + SCORE_MATCH; - if score > max_score - && haystack[i + prefilter_len..(i + needle.len()).min(haystack.len())] - .iter() - .map(|&c| AsciiChar(c).normalize(&self.config).0) - .eq(needle_without_prefilter.iter().copied()) - { - max_pos = i; - max_score = score; - // can't get better than this - if bonus >= self.config.bonus_boundary_white { - break; - } - } - } - (max_score, max_pos) - } - - pub(crate) fn substring_match_ascii( - &mut self, - haystack: &[u8], - needle: &[u8], - indices: &mut Vec, - ) -> Option { - let mut max_score = 0; - let mut max_pos = 0; - if self.config.ignore_case { - match needle.iter().position(|&c| c >= b'a' && c <= b'z') { - // start with char do case insensitive search - Some(0) => { - (max_score, max_pos) = self.substring_match_ascii_with_prefilter( - haystack, - needle, - 1, - Memchr2::new( - needle[0], - needle[0] - 32, - &haystack[..haystack.len() - needle.len() + 1], - ), - ); - if max_score == 0 { - return None; - } - } - Some(1) => { - (max_score, max_pos) = self.substring_match_ascii_with_prefilter( - haystack, - needle, - 1, - Memchr::new(needle[0], &haystack[..haystack.len() - needle.len() + 1]), - ); - if max_score == 0 { - return None; - } - } - Some(len) => { - (max_score, max_pos) = self.substring_match_ascii_with_prefilter( - haystack, - needle, - 1, - memmem::find_iter(&haystack[..haystack.len() - needle.len() + len], needle), - ); - if max_score == 0 { - return None; - } - } - // in case we don't have any letter in the needle - // we can treat the search as case sensitive and use memmem directly which is way faster - None => (), - } - } - - if max_score == 0 { - let char_class = AsciiChar(needle[0]).char_class(&self.config); - for i in memmem::find_iter(haystack, needle) { - let prev_char_class = i - .checked_sub(1) - .map(|i| AsciiChar(haystack[i]).char_class(&self.config)) - .unwrap_or(self.config.initial_char_class); - let bonus = self.config.bonus_for(prev_char_class, char_class); - let score = bonus * BONUS_FIRST_CHAR_MULTIPLIER + SCORE_MATCH; - if score > max_score { - max_pos = i; - max_score = score; - // can't get better than this - if bonus >= self.config.bonus_boundary_white { - break; - } - } - } - if max_score == 0 { - return None; - } - } - let score = self.calculate_score::( - AsciiChar::cast(haystack), - AsciiChar::cast(needle), - max_pos, - max_pos + needle.len(), - indices, - ); - Some(score) - } - - pub(crate) fn substring_match_1_non_ascii( - &mut self, - haystack: &[char], - needle: char, - start: usize, - indices: &mut Vec, - ) -> u16 { - let mut max_score = 0; - let mut max_pos = 0; - let mut prev_class = start - .checked_sub(1) - .map(|i| haystack[i].char_class(&self.config)) - .unwrap_or(self.config.initial_char_class); - for (i, &c) in haystack[start..].iter().enumerate() { - let (c, char_class) = c.char_class_and_normalize(&self.config); - if c != needle { - continue; - } - let bonus = self.config.bonus_for(prev_class, char_class); - prev_class = char_class; - let score = bonus * BONUS_FIRST_CHAR_MULTIPLIER + SCORE_MATCH; - if score > max_score { - max_pos = i as u32; - max_score = score; - // can't get better than this - if bonus >= self.config.bonus_boundary_white { - break; - } - } - } - - if INDICES { - indices.push(max_pos + start as u32); - } - max_score - } - - pub(crate) fn substring_match_non_ascii( - &mut self, - haystack: &[char], - needle: &[N], - start: usize, - indices: &mut Vec, - ) -> Option - where - N: Char, - char: PartialEq, - { - let mut max_score = 0; - let mut max_pos = 0; - let mut prev_class = start - .checked_sub(1) - .map(|i| haystack[i].char_class(&self.config)) - .unwrap_or(self.config.initial_char_class); - for (i, &c) in haystack[start..].iter().enumerate() { - let (c, char_class) = c.char_class_and_normalize(&self.config); - if c != needle[0] { - continue; - } - let bonus = self.config.bonus_for(prev_class, char_class); - prev_class = char_class; - let score = bonus * BONUS_FIRST_CHAR_MULTIPLIER + SCORE_MATCH; - if score > max_score - && haystack[start + i + 1..start + i + needle.len()] - .iter() - .map(|c| c.normalize(&self.config)) - .eq(needle[1..].iter().copied()) - { - max_pos = i; - max_score = score; - // can't get better than this - if bonus >= self.config.bonus_boundary_white { - break; - } - } - } - if max_score == 0 { - return None; - } - - let score = self.calculate_score::( - haystack, - needle, - max_pos, - max_pos + needle.len(), - indices, - ); - Some(score) - } -} diff --git a/matcher/src/fuzzy_greedy.rs b/matcher/src/fuzzy_greedy.rs deleted file mode 100644 index 8215bf31..00000000 --- a/matcher/src/fuzzy_greedy.rs +++ /dev/null @@ -1,51 +0,0 @@ -use crate::chars::Char; -use crate::Matcher; - -impl Matcher { - /// greedy fallback algorithm, much faster (linear time) but reported scores/indicies - /// might not be the best match - pub(crate) fn fuzzy_match_greedy_, N: Char>( - &mut self, - haystack: &[H], - needle: &[N], - mut start: usize, - mut end: usize, - indices: &mut Vec, - ) -> Option { - let first_char_end = if H::ASCII && N::ASCII { start + 1 } else { end }; - 'nonascii: { - if !H::ASCII || !N::ASCII { - let mut needle_iter = needle[1..].iter().copied(); - if let Some(mut needle_char) = needle_iter.next() { - for (i, &c) in haystack[first_char_end..].iter().enumerate() { - if c.normalize(&self.config) == needle_char { - let Some(next_needle_char) = needle_iter.next() else { - // we found a match so we are now in the same state - // as the prefilter would produce - end = first_char_end + i + 1; - break 'nonascii; - }; - needle_char = next_needle_char; - } - } - // some needle chars were not matched bail out - return None; - } - } - } // minimize the greedly match by greedy matching in reverse - - let mut needle_iter = needle.iter().rev().copied(); - let mut needle_char = needle_iter.next().unwrap(); - for (i, &c) in haystack[start..end].iter().enumerate().rev() { - let c = c.normalize(&self.config); - if c == needle_char { - let Some(next_needle_char) = needle_iter.next() else { - start += i; - break; - }; - needle_char = next_needle_char; - } - } - Some(self.calculate_score::(haystack, needle, start, end, indices)) - } -} diff --git a/matcher/src/fuzzy_optimal.rs b/matcher/src/fuzzy_optimal.rs deleted file mode 100644 index aba7bbe3..00000000 --- a/matcher/src/fuzzy_optimal.rs +++ /dev/null @@ -1,348 +0,0 @@ -use std::cmp::max; - -use crate::chars::{Char, CharClass}; -use crate::matrix::{MatcherDataView, MatrixCell, ScoreCell}; -use crate::score::{ - BONUS_BOUNDARY, BONUS_CONSECUTIVE, BONUS_FIRST_CHAR_MULTIPLIER, MAX_PREFIX_BONUS, - PENALTY_GAP_EXTENSION, PENALTY_GAP_START, PREFIX_BONUS_SCALE, SCORE_MATCH, -}; -use crate::{Config, Matcher}; - -impl Matcher { - pub(crate) fn fuzzy_match_optimal, N: Char>( - &mut self, - haystack: &[H], - needle: &[N], - start: usize, - greedy_end: usize, - end: usize, - indices: &mut Vec, - ) -> Option { - // construct a matrix (and copy the haystack), the matrix and haystack size are bounded - // to avoid the slow O(mn) time complexity for large inputs. Furthermore, it allows - // us to treat needle indices as u16 - let Some(mut matrix) = self.slab.alloc(&haystack[start..end], needle.len()) else { - return self.fuzzy_match_greedy_::( - haystack, needle, start, greedy_end, indices, - ); - }; - - let prev_class = start - .checked_sub(1) - .map(|i| haystack[i].char_class(&self.config)) - .unwrap_or(self.config.initial_char_class); - let matched = matrix.setup::(needle, prev_class, &self.config, start as u32); - // this only happened with unicode haystacks, for ASCII the prefilter handles all rejects - if !matched { - assert!( - !N::ASCII || !H::ASCII, - "should have been caught by prefilter" - ); - return None; - } - - // populate the matrix and find the best score - let matrix_len = matrix.populate_matrix::(needle); - let last_row_off = matrix.row_offs[needle.len() - 1]; - let relative_last_row_off = last_row_off as usize + 1 - needle.len(); - let (match_end, match_score_cell) = matrix.current_row[relative_last_row_off..] - .iter() - .enumerate() - .max_by_key(|(_, cell)| cell.score) - .expect("there must be atleast one match"); - if INDICES { - matrix.reconstruct_optimal_path(match_end as u16, indices, matrix_len, start as u32); - } - Some(match_score_cell.score) - } -} - -const UNMATCHED: ScoreCell = ScoreCell { - score: 0, - // if matched is true then the consecutive bonus - // is always atleast BONUS_CONSECUTIVE so - // this constant can never occur naturally - consecutive_bonus: 0, - matched: true, -}; - -fn next_m_cell(p_score: u16, bonus: u16, m_cell: ScoreCell) -> ScoreCell { - if m_cell == UNMATCHED { - return ScoreCell { - score: p_score + bonus + SCORE_MATCH, - matched: false, - consecutive_bonus: bonus as u8, - }; - } - - let mut consecutive_bonus = max(m_cell.consecutive_bonus as u16, BONUS_CONSECUTIVE); - if bonus >= BONUS_BOUNDARY && bonus > consecutive_bonus { - consecutive_bonus = bonus - } - - let score_match = m_cell.score + max(consecutive_bonus, bonus); - let score_skip = p_score + bonus; - if score_match > score_skip { - ScoreCell { - score: score_match + SCORE_MATCH, - matched: true, - consecutive_bonus: consecutive_bonus as u8, - } - } else { - ScoreCell { - score: score_skip + SCORE_MATCH, - matched: false, - consecutive_bonus: bonus as u8, - } - } -} - -fn p_score(prev_p_score: u16, prev_m_score: u16) -> (u16, bool) { - let score_match = prev_m_score.saturating_sub(PENALTY_GAP_START); - let score_skip = prev_p_score.saturating_sub(PENALTY_GAP_EXTENSION); - if score_match > score_skip { - (score_match, true) - } else { - (score_skip, false) - } -} - -impl MatcherDataView<'_, H> { - fn setup( - &mut self, - needle: &[N], - mut prev_class: CharClass, - config: &Config, - start: u32, - ) -> bool - where - H: PartialEq, - { - let mut row_iter = needle.iter().copied().zip(self.row_offs.iter_mut()); - let (mut needle_char, mut row_start) = row_iter.next().unwrap(); - - let col_iter = self - .haystack - .iter_mut() - .zip(self.bonus.iter_mut()) - .enumerate(); - - let mut matched = false; - for (i, (c_, bonus_)) in col_iter { - let (c, class) = c_.char_class_and_normalize(config); - *c_ = c; - - let bonus = config.bonus_for(prev_class, class); - // save bonus for later so we don't have to recompute it each time - *bonus_ = bonus as u8; - prev_class = class; - - let i = i as u16; - if c == needle_char { - // save the first idx of each char - if let Some(next) = row_iter.next() { - *row_start = i; - (needle_char, row_start) = next; - } else if !matched { - *row_start = i; - // we have atleast one match - matched = true; - } - } - } - if !matched { - return false; - } - debug_assert_eq!(self.row_offs[0], 0); - Self::score_row::( - self.current_row, - self.matrix_cells, - self.haystack, - self.bonus, - 0, - self.row_offs[1], - 0, - needle[0], - needle[1], - if config.prefer_prefix { - if start == 0 { - MAX_PREFIX_BONUS * PREFIX_BONUS_SCALE - } else { - (MAX_PREFIX_BONUS * PREFIX_BONUS_SCALE - PENALTY_GAP_START).saturating_sub( - (start - 1).min(u16::MAX as u32) as u16 * PENALTY_GAP_EXTENSION, - ) - } - } else { - 0 - }, - ); - true - } - - #[allow(clippy::too_many_arguments)] - fn score_row( - current_row: &mut [ScoreCell], - matrix_cells: &mut [MatrixCell], - haystack: &[H], - bonus: &[u8], - row_off: u16, - mut next_row_off: u16, - needle_idx: u16, - needle_char: N, - next_needle_char: N, - mut prefix_bonus: u16, - ) where - H: PartialEq, - { - next_row_off -= 1; - let relative_row_off = row_off - needle_idx; - let next_relative_row_off = next_row_off - needle_idx; - let skipped_col_iter = haystack[row_off as usize..next_row_off as usize] - .iter() - .zip(bonus[row_off as usize..next_row_off as usize].iter()) - .zip(current_row[relative_row_off as usize..next_relative_row_off as usize].iter_mut()) - .zip(matrix_cells.iter_mut()); - let mut prev_p_score = 0; - let mut prev_m_score = 0; - for (((&c, bonus), score_cell), matrix_cell) in skipped_col_iter { - let (p_score, p_matched) = p_score(prev_p_score, prev_m_score); - let m_cell = if FIRST_ROW { - let cell = if c == needle_char { - ScoreCell { - score: *bonus as u16 * BONUS_FIRST_CHAR_MULTIPLIER - + SCORE_MATCH - + prefix_bonus / PREFIX_BONUS_SCALE, - matched: false, - consecutive_bonus: *bonus, - } - } else { - UNMATCHED - }; - prefix_bonus = prefix_bonus.saturating_sub(PENALTY_GAP_EXTENSION); - cell - } else { - *score_cell - }; - if INDICES { - matrix_cell.set(p_matched, m_cell.matched); - } - prev_p_score = p_score; - prev_m_score = m_cell.score; - } - let col_iter = haystack[next_row_off as usize..] - .windows(2) - .zip(bonus[next_row_off as usize..].windows(2)) - .zip(current_row[next_relative_row_off as usize..].iter_mut()) - .zip(matrix_cells[(next_relative_row_off - relative_row_off) as usize..].iter_mut()); - for (((c, bonus), score_cell), matrix_cell) in col_iter { - let (p_score, p_matched) = p_score(prev_p_score, prev_m_score); - let m_cell = if FIRST_ROW { - let cell = if c[0] == needle_char { - ScoreCell { - score: bonus[0] as u16 * BONUS_FIRST_CHAR_MULTIPLIER - + SCORE_MATCH - + prefix_bonus / PREFIX_BONUS_SCALE, - matched: false, - consecutive_bonus: bonus[0], - } - } else { - UNMATCHED - }; - prefix_bonus = prefix_bonus.saturating_sub(PENALTY_GAP_EXTENSION); - cell - } else { - *score_cell - }; - *score_cell = if c[1] == next_needle_char { - next_m_cell(p_score, bonus[1] as u16, m_cell) - } else { - UNMATCHED - }; - if INDICES { - matrix_cell.set(p_matched, m_cell.matched); - } - prev_p_score = p_score; - prev_m_score = m_cell.score; - } - } - - fn populate_matrix(&mut self, needle: &[N]) -> usize - where - H: PartialEq, - { - let mut matrix_cells = &mut self.matrix_cells[self.current_row.len()..]; - let mut row_iter = needle[1..] - .iter() - .copied() - .zip(self.row_offs[1..].iter().copied()) - .enumerate(); - let (mut needle_idx, (mut needle_char, mut row_off)) = row_iter.next().unwrap(); - for (next_needle_idx, (next_needle_char, next_row_off)) in row_iter { - Self::score_row::( - self.current_row, - matrix_cells, - self.haystack, - self.bonus, - row_off, - next_row_off, - needle_idx as u16 + 1, - needle_char, - next_needle_char, - 0, - ); - let len = self.current_row.len() + needle_idx + 1 - row_off as usize; - matrix_cells = &mut matrix_cells[len..]; - (needle_idx, needle_char, row_off) = (next_needle_idx, next_needle_char, next_row_off); - } - matrix_cells.as_ptr() as usize - self.matrix_cells.as_ptr() as usize - } - - fn reconstruct_optimal_path( - &self, - max_score_end: u16, - indices: &mut Vec, - matrix_len: usize, - start: u32, - ) { - let indices_start = indices.len(); - indices.resize(indices_start + self.row_offs.len(), 0); - let indices = &mut indices[indices_start..]; - let last_row_off = *self.row_offs.last().unwrap(); - indices[self.row_offs.len() - 1] = start + max_score_end as u32 + last_row_off as u32; - - let mut matrix_cells = &self.matrix_cells[..matrix_len]; - let width = self.current_row.len(); - let mut row_iter = self.row_offs[..self.row_offs.len() - 1] - .iter() - .copied() - .enumerate() - .rev() - .map(|(i, off)| { - let relative_off = off as usize - i; - let row; - (matrix_cells, row) = - matrix_cells.split_at(matrix_cells.len() - (width - relative_off)); - (i, off, row) - }); - let (mut row_idx, mut row_off, mut row) = row_iter.next().unwrap(); - let mut col = max_score_end; - let relative_last_row_off = last_row_off as usize + 1 - self.row_offs.len(); - let mut matched = self.current_row[col as usize + relative_last_row_off].matched; - col += last_row_off - row_off - 1; - loop { - if matched { - indices[row_idx] = start + col as u32 + row_off as u32; - } - let next_matched = row[col as usize].get(matched); - if matched { - let Some((next_row_idx, next_row_off, next_row)) = row_iter.next() else { - break; - }; - col += row_off - next_row_off; - (row_idx, row_off, row) = (next_row_idx, next_row_off, next_row) - } - col -= 1; - matched = next_matched; - } - } -} diff --git a/matcher/src/lib.rs b/matcher/src/lib.rs deleted file mode 100644 index e175dd6b..00000000 --- a/matcher/src/lib.rs +++ /dev/null @@ -1,759 +0,0 @@ -/*! -`nucleo_matcher` is a low level crate that contains the matcher implementation -used by the high level `nucleo` crate. - -The matcher is hightly optimized and can significantly outperform `fzf` and -`skim` (the `fuzzy-matcher` crate). However some of these optimizations require -a slightly less convenient API. Be sure to carefully read the documentation of -the [`Matcher`] to avoid unexpected behaviour. -# Examples - -For almost all usecases the [`pattern`] API should be used instead of calling -the matcher methods directly. [`Pattern::parse`](pattern::Pattern::parse) will -construct a single Atom (a single match operation) for each word. The pattern -can contain special characters to control what kind of match is performed (see -[`AtomKind`](crate::pattern::AtomKind)). - -``` -# use nucleo_matcher::{Matcher, Config}; -# use nucleo_matcher::pattern::{Pattern, Normalization, CaseMatching}; -let paths = ["foo/bar", "bar/foo", "foobar"]; -let mut matcher = Matcher::new(Config::DEFAULT.match_paths()); -let matches = Pattern::parse("foo bar", CaseMatching::Ignore, Normalization::Smart).match_list(paths, &mut matcher); -assert_eq!(matches, vec![("foo/bar", 168), ("bar/foo", 168), ("foobar", 140)]); -let matches = Pattern::parse("^foo bar", CaseMatching::Ignore, Normalization::Smart).match_list(paths, &mut matcher); -assert_eq!(matches, vec![("foo/bar", 168), ("foobar", 140)]); -``` - -If the pattern should be matched literally (without this special parsing) -[`Pattern::new`](pattern::Pattern::new) can be used instead. - -``` -# use nucleo_matcher::{Matcher, Config}; -# use nucleo_matcher::pattern::{Pattern, CaseMatching, AtomKind, Normalization}; -let paths = ["foo/bar", "bar/foo", "foobar"]; -let mut matcher = Matcher::new(Config::DEFAULT.match_paths()); -let matches = Pattern::new("foo bar", CaseMatching::Ignore, Normalization::Smart, AtomKind::Fuzzy).match_list(paths, &mut matcher); -assert_eq!(matches, vec![("foo/bar", 168), ("bar/foo", 168), ("foobar", 140)]); -let paths = ["^foo/bar", "bar/^foo", "foobar"]; -let matches = Pattern::new("^foo bar", CaseMatching::Ignore, Normalization::Smart, AtomKind::Fuzzy).match_list(paths, &mut matcher); -assert_eq!(matches, vec![("^foo/bar", 188), ("bar/^foo", 188)]); -``` - -If word segmentation is also not desired, a single `Atom` can be constructed directly. - -``` -# use nucleo_matcher::{Matcher, Config}; -# use nucleo_matcher::pattern::{Pattern, Atom, CaseMatching, Normalization, AtomKind}; -let paths = ["foobar", "foo bar"]; -let mut matcher = Matcher::new(Config::DEFAULT); -let matches = Atom::new("foo bar", CaseMatching::Ignore, Normalization::Smart, AtomKind::Fuzzy, false).match_list(paths, &mut matcher); -assert_eq!(matches, vec![("foo bar", 192)]); -``` - - -# Status - -Nucleo is used in the helix-editor and therefore has a large user base with lots or real world testing. The core matcher implementation is considered complete and is unlikely to see major changes. The `nucleo-matcher` crate is finished and ready for widespread use, breaking changes should be very rare (a 1.0 release should not be far away). - -*/ - -// sadly ranges don't optmimzie well -#![allow(clippy::manual_range_contains)] -#![warn(missing_docs)] - -pub mod chars; -mod config; -#[cfg(test)] -mod debug; -mod exact; -mod fuzzy_greedy; -mod fuzzy_optimal; -mod matrix; -pub mod pattern; -mod prefilter; -mod score; -mod utf32_str; - -#[cfg(test)] -mod tests; - -pub use crate::config::Config; -pub use crate::utf32_str::{Utf32Str, Utf32String}; - -use crate::chars::{AsciiChar, Char}; -use crate::matrix::MatrixSlab; - -/// A matcher engine that can execute (fuzzy) matches. -/// -/// A matches contains **heap allocated** scratch memory that is reused during -/// matching. This scratch memory allows the matcher to guarantee that it will -/// **never allocate** during matching (with the exception of pushing to the -/// `indices` vector if there isn't enough capacity). However this scratch -/// memory is fairly large (around 135KB) so creating a matcher is expensive. -/// -/// All `.._match` functions will not compute the indices of the matched -/// characters. These should be used to prefilter to filter and rank all -/// matches. All `.._indices` functions will also compute the indices of the -/// matched characters but are slower compared to the `..match` variant. These -/// should be used when rendering the best N matches. Note that the `indices` -/// argument is **never cleared**. This allows running multiple different -/// matches on the same haystack and merging the indices by sorting and -/// deduplicating the vector. -/// -/// The `needle` argument for each function must always be normalized by the -/// caller (unicode normalization and case folding). Otherwise, the matcher -/// may fail to produce a match. The [`pattern`] modules provides utilities -/// to preprocess needles and **should usually be preferred over invoking the -/// matcher directly**. Additionally it's recommend to perform separate matches -/// for each word in the needle. Consider the folloling example: -/// -/// If `foo bar` is used as the needle it matches both `foo test baaar` and -/// `foo hello-world bar`. However, `foo test baaar` will receive a higher -/// score than `foo hello-world bar`. `baaar` contains a 2 character gap which -/// will receive a penalty and therefore the user will likely expect it to rank -/// lower. However, if `foo bar` is matched as a single query `hello-world` and -/// `test` are both considered gaps too. As `hello-world` is a much longer gap -/// then `test` the extra penalty for `baaar` is canceled out. If both words -/// are matched individually the interspersed words do not receive a penalty and -/// `foo hello-world bar` ranks higher. -/// -/// In general nucleo is a **substring matching tool** (except for the prefix/ -/// postfix matching modes) with no penalty assigned to matches that start -/// later within the same pattern (which enables matching words individually -/// as shown above). If patterns show a large variety in length and the syntax -/// described above is not used it may be preferable to give preference to -/// matches closer to the start of a haystack. To accommodate that usecase the -/// [`prefer_prefix`](Config::prefer_prefix) option can be set to true. -/// -/// Matching is limited to 2^32-1 codepoints, if the haystack is longer than -/// that the matcher **will panic**. The caller must decide whether it wants to -/// filter out long haystacks or truncate them. -pub struct Matcher { - #[allow(missing_docs)] - pub config: Config, - slab: MatrixSlab, -} - -// this is just here for convenience not sure if we should implement this -impl Clone for Matcher { - fn clone(&self) -> Self { - Matcher { - config: self.config.clone(), - slab: MatrixSlab::new(), - } - } -} - -impl std::fmt::Debug for Matcher { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - f.debug_struct("Matcher") - .field("config", &self.config) - .finish_non_exhaustive() - } -} - -impl Default for Matcher { - fn default() -> Self { - Matcher { - config: Config::DEFAULT, - slab: MatrixSlab::new(), - } - } -} - -impl Matcher { - /// Creates a new matcher instance, note that this will eagerly allocate a - /// fairly large chunk of heap memory (around 135KB currently but subject to - /// change) so matchers should be reused if called often (like in a loop). - pub fn new(config: Config) -> Self { - Self { - config, - slab: MatrixSlab::new(), - } - } - - /// Find the fuzzy match with the highest score in the `haystack`. - /// - /// This functions has `O(mn)` time complexity for short inputs. - /// To avoid slowdowns it automatically falls back to - /// [greedy matching](crate::Matcher::fuzzy_match_greedy) for large - /// needles and haystacks. - /// - /// See the [matcher documentation](crate::Matcher) for more details. - pub fn fuzzy_match(&mut self, haystack: Utf32Str<'_>, needle: Utf32Str<'_>) -> Option { - assert!(haystack.len() <= u32::MAX as usize); - self.fuzzy_matcher_impl::(haystack, needle, &mut Vec::new()) - } - - /// Find the fuzzy match with the higehest score in the `haystack` and - /// compute its indices. - /// - /// This functions has `O(mn)` time complexity for short inputs. To - /// avoid slowdowns it automatically falls back to [greedy matching] - /// (crate::Matcher::fuzzy_match_greedy) for large needles and haystacks - /// - /// See the [matcher documentation](crate::Matcher) for more details. - pub fn fuzzy_indices( - &mut self, - haystack: Utf32Str<'_>, - needle: Utf32Str<'_>, - indices: &mut Vec, - ) -> Option { - assert!(haystack.len() <= u32::MAX as usize); - self.fuzzy_matcher_impl::(haystack, needle, indices) - } - - fn fuzzy_matcher_impl( - &mut self, - haystack_: Utf32Str<'_>, - needle_: Utf32Str<'_>, - indices: &mut Vec, - ) -> Option { - if needle_.len() > haystack_.len() { - return None; - } - if needle_.is_empty() { - return Some(0); - } - if needle_.len() == haystack_.len() { - return self.exact_match_impl::( - haystack_, - needle_, - 0, - haystack_.len(), - indices, - ); - } - assert!( - haystack_.len() <= u32::MAX as usize, - "fuzzy matching is only support for up to 2^32-1 codepoints" - ); - match (haystack_, needle_) { - (Utf32Str::Ascii(haystack), Utf32Str::Ascii(needle)) => { - if let &[needle] = needle { - return self.substring_match_1_ascii::(haystack, needle, indices); - } - let (start, greedy_end, end) = self.prefilter_ascii(haystack, needle, false)?; - if needle_.len() == end - start { - return Some(self.calculate_score::( - AsciiChar::cast(haystack), - AsciiChar::cast(needle), - start, - greedy_end, - indices, - )); - } - self.fuzzy_match_optimal::( - AsciiChar::cast(haystack), - AsciiChar::cast(needle), - start, - greedy_end, - end, - indices, - ) - } - (Utf32Str::Ascii(_), Utf32Str::Unicode(_)) => { - // a purely ascii haystack can never be transformed to match - // a needle that contains non-ascii chars since we don't allow gaps - None - } - (Utf32Str::Unicode(haystack), Utf32Str::Ascii(needle)) => { - if let &[needle] = needle { - let (start, _) = self.prefilter_non_ascii(haystack, needle_, true)?; - let res = self.substring_match_1_non_ascii::( - haystack, - needle as char, - start, - indices, - ); - return Some(res); - } - let (start, end) = self.prefilter_non_ascii(haystack, needle_, false)?; - if needle_.len() == end - start { - return self - .exact_match_impl::(haystack_, needle_, start, end, indices); - } - self.fuzzy_match_optimal::( - haystack, - AsciiChar::cast(needle), - start, - start + 1, - end, - indices, - ) - } - (Utf32Str::Unicode(haystack), Utf32Str::Unicode(needle)) => { - if let &[needle] = needle { - let (start, _) = self.prefilter_non_ascii(haystack, needle_, true)?; - let res = self - .substring_match_1_non_ascii::(haystack, needle, start, indices); - return Some(res); - } - let (start, end) = self.prefilter_non_ascii(haystack, needle_, false)?; - if needle_.len() == end - start { - return self - .exact_match_impl::(haystack_, needle_, start, end, indices); - } - self.fuzzy_match_optimal::( - haystack, - needle, - start, - start + 1, - end, - indices, - ) - } - } - } - - /// Greedly find a fuzzy match in the `haystack`. - /// - /// This functions has `O(n)` time complexity but may provide unintutive (non-optimal) - /// indices and scores. Usually [fuzzy_match](crate::Matcher::fuzzy_match) should - /// be preferred. - /// - /// See the [matcher documentation](crate::Matcher) for more details. - pub fn fuzzy_match_greedy( - &mut self, - haystack: Utf32Str<'_>, - needle: Utf32Str<'_>, - ) -> Option { - assert!(haystack.len() <= u32::MAX as usize); - self.fuzzy_match_greedy_impl::(haystack, needle, &mut Vec::new()) - } - - /// Greedly find a fuzzy match in the `haystack` and compute its indices. - /// - /// This functions has `O(n)` time complexity but may provide unintuitive (non-optimal) - /// indices and scores. Usually [fuzzy_indices](crate::Matcher::fuzzy_indices) should - /// be preferred. - /// - /// See the [matcher documentation](crate::Matcher) for more details. - pub fn fuzzy_indices_greedy( - &mut self, - haystack: Utf32Str<'_>, - needle: Utf32Str<'_>, - indices: &mut Vec, - ) -> Option { - assert!(haystack.len() <= u32::MAX as usize); - self.fuzzy_match_greedy_impl::(haystack, needle, indices) - } - - fn fuzzy_match_greedy_impl( - &mut self, - haystack: Utf32Str<'_>, - needle_: Utf32Str<'_>, - indices: &mut Vec, - ) -> Option { - if needle_.len() > haystack.len() { - return None; - } - if needle_.is_empty() { - return Some(0); - } - if needle_.len() == haystack.len() { - return self.exact_match_impl::(haystack, needle_, 0, haystack.len(), indices); - } - assert!( - haystack.len() <= u32::MAX as usize, - "matching is only support for up to 2^32-1 codepoints" - ); - match (haystack, needle_) { - (Utf32Str::Ascii(haystack), Utf32Str::Ascii(needle)) => { - let (start, greedy_end, _) = self.prefilter_ascii(haystack, needle, true)?; - if needle_.len() == greedy_end - start { - return Some(self.calculate_score::( - AsciiChar::cast(haystack), - AsciiChar::cast(needle), - start, - greedy_end, - indices, - )); - } - self.fuzzy_match_greedy_::( - AsciiChar::cast(haystack), - AsciiChar::cast(needle), - start, - greedy_end, - indices, - ) - } - (Utf32Str::Ascii(_), Utf32Str::Unicode(_)) => { - // a purely ascii haystack can never be transformed to match - // a needle that contains non-ascii chars since we don't allow gaps - None - } - (Utf32Str::Unicode(haystack), Utf32Str::Ascii(needle)) => { - let (start, _) = self.prefilter_non_ascii(haystack, needle_, true)?; - self.fuzzy_match_greedy_::( - haystack, - AsciiChar::cast(needle), - start, - start + 1, - indices, - ) - } - (Utf32Str::Unicode(haystack), Utf32Str::Unicode(needle)) => { - let (start, _) = self.prefilter_non_ascii(haystack, needle_, true)?; - self.fuzzy_match_greedy_::( - haystack, - needle, - start, - start + 1, - indices, - ) - } - } - } - - /// Finds the substring match with the highest score in the `haystack`. - /// - /// This functions has `O(nm)` time complexity. However many cases can - /// be significantly accelerated using prefilters so it's usually very fast - /// in practice. - /// - /// See the [matcher documentation](crate::Matcher) for more details. - pub fn substring_match( - &mut self, - haystack: Utf32Str<'_>, - needle_: Utf32Str<'_>, - ) -> Option { - self.substring_match_impl::(haystack, needle_, &mut Vec::new()) - } - - /// Finds the substring match with the highest score in the `haystack` and - /// compute its indices. - /// - /// This functions has `O(nm)` time complexity. However many cases can - /// be significantly accelerated using prefilters so it's usually fast - /// in practice. - /// - /// See the [matcher documentation](crate::Matcher) for more details. - pub fn substring_indices( - &mut self, - haystack: Utf32Str<'_>, - needle_: Utf32Str<'_>, - indices: &mut Vec, - ) -> Option { - self.substring_match_impl::(haystack, needle_, indices) - } - - fn substring_match_impl( - &mut self, - haystack: Utf32Str<'_>, - needle_: Utf32Str<'_>, - indices: &mut Vec, - ) -> Option { - if needle_.len() > haystack.len() { - return None; - } - if needle_.is_empty() { - return Some(0); - } - if needle_.len() == haystack.len() { - return self.exact_match_impl::(haystack, needle_, 0, haystack.len(), indices); - } - assert!( - haystack.len() <= u32::MAX as usize, - "matching is only support for up to 2^32-1 codepoints" - ); - match (haystack, needle_) { - (Utf32Str::Ascii(haystack), Utf32Str::Ascii(needle)) => { - if let &[needle] = needle { - return self.substring_match_1_ascii::(haystack, needle, indices); - } - self.substring_match_ascii::(haystack, needle, indices) - } - (Utf32Str::Ascii(_), Utf32Str::Unicode(_)) => { - // a purely ascii haystack can never be transformed to match - // a needle that contains non-ascii chars since we don't allow gaps - None - } - (Utf32Str::Unicode(haystack), Utf32Str::Ascii(needle)) => { - if let &[needle] = needle { - let (start, _) = self.prefilter_non_ascii(haystack, needle_, true)?; - let res = self.substring_match_1_non_ascii::( - haystack, - needle as char, - start, - indices, - ); - return Some(res); - } - let (start, _) = self.prefilter_non_ascii(haystack, needle_, false)?; - self.substring_match_non_ascii::( - haystack, - AsciiChar::cast(needle), - start, - indices, - ) - } - (Utf32Str::Unicode(haystack), Utf32Str::Unicode(needle)) => { - if let &[needle] = needle { - let (start, _) = self.prefilter_non_ascii(haystack, needle_, true)?; - let res = self - .substring_match_1_non_ascii::(haystack, needle, start, indices); - return Some(res); - } - let (start, _) = self.prefilter_non_ascii(haystack, needle_, false)?; - self.substring_match_non_ascii::(haystack, needle, start, indices) - } - } - } - - /// Checks whether needle and haystack match exactly. - /// - /// This functions has `O(n)` time complexity. - /// - /// See the [matcher documentation](crate::Matcher) for more details. - pub fn exact_match(&mut self, haystack: Utf32Str<'_>, needle: Utf32Str<'_>) -> Option { - if needle.is_empty() { - return Some(0); - } - let mut leading_space = 0; - let mut trailing_space = 0; - if !needle.first().is_whitespace() { - leading_space = haystack.leading_white_space() - } - if !needle.last().is_whitespace() { - trailing_space = haystack.trailing_white_space() - } - // avoid wraparound in size check - if trailing_space == haystack.len() { - return None; - } - self.exact_match_impl::( - haystack, - needle, - leading_space, - haystack.len() - trailing_space, - &mut Vec::new(), - ) - } - - /// Checks whether needle and haystack match exactly and compute the matches indices. - /// - /// This functions has `O(n)` time complexity. - /// - /// See the [matcher documentation](crate::Matcher) for more details. - pub fn exact_indices( - &mut self, - haystack: Utf32Str<'_>, - needle: Utf32Str<'_>, - indices: &mut Vec, - ) -> Option { - if needle.is_empty() { - return Some(0); - } - let mut leading_space = 0; - let mut trailing_space = 0; - if !needle.first().is_whitespace() { - leading_space = haystack.leading_white_space() - } - if !needle.last().is_whitespace() { - trailing_space = haystack.trailing_white_space() - } - // avoid wraparound in size check - if trailing_space == haystack.len() { - return None; - } - self.exact_match_impl::( - haystack, - needle, - leading_space, - haystack.len() - trailing_space, - indices, - ) - } - - /// Checks whether needle is a prefix of the haystack. - /// - /// This functions has `O(n)` time complexity. - /// - /// See the [matcher documentation](crate::Matcher) for more details. - pub fn prefix_match(&mut self, haystack: Utf32Str<'_>, needle: Utf32Str<'_>) -> Option { - if needle.is_empty() { - return Some(0); - } - let mut leading_space = 0; - if !needle.first().is_whitespace() { - leading_space = haystack.leading_white_space() - } - if haystack.len() - leading_space < needle.len() { - None - } else { - self.exact_match_impl::( - haystack, - needle, - leading_space, - needle.len() + leading_space, - &mut Vec::new(), - ) - } - } - - /// Checks whether needle is a prefix of the haystack and compute the matches indices. - /// - /// This functions has `O(n)` time complexity. - /// - /// See the [matcher documentation](crate::Matcher) for more details. - pub fn prefix_indices( - &mut self, - haystack: Utf32Str<'_>, - needle: Utf32Str<'_>, - indices: &mut Vec, - ) -> Option { - if needle.is_empty() { - return Some(0); - } - let mut leading_space = 0; - if !needle.first().is_whitespace() { - leading_space = haystack.leading_white_space() - } - if haystack.len() - leading_space < needle.len() { - None - } else { - self.exact_match_impl::( - haystack, - needle, - leading_space, - needle.len() + leading_space, - indices, - ) - } - } - - /// Checks whether needle is a postfix of the haystack. - /// - /// This functions has `O(n)` time complexity. - /// - /// See the [matcher documentation](crate::Matcher) for more details. - pub fn postfix_match(&mut self, haystack: Utf32Str<'_>, needle: Utf32Str<'_>) -> Option { - if needle.is_empty() { - return Some(0); - } - let mut trailing_spaces = 0; - if !needle.last().is_whitespace() { - trailing_spaces = haystack.trailing_white_space() - } - if haystack.len() - trailing_spaces < needle.len() { - None - } else { - self.exact_match_impl::( - haystack, - needle, - haystack.len() - needle.len() - trailing_spaces, - haystack.len() - trailing_spaces, - &mut Vec::new(), - ) - } - } - - /// Checks whether needle is a postfix of the haystack and compute the matches indices. - /// - /// This functions has `O(n)` time complexity. - /// - /// See the [matcher documentation](crate::Matcher) for more details. - pub fn postfix_indices( - &mut self, - haystack: Utf32Str<'_>, - needle: Utf32Str<'_>, - indices: &mut Vec, - ) -> Option { - if needle.is_empty() { - return Some(0); - } - let mut trailing_spaces = 0; - if !needle.last().is_whitespace() { - trailing_spaces = haystack.trailing_white_space() - } - if haystack.len() - trailing_spaces < needle.len() { - None - } else { - self.exact_match_impl::( - haystack, - needle, - haystack.len() - needle.len() - trailing_spaces, - haystack.len() - trailing_spaces, - indices, - ) - } - } - - fn exact_match_impl( - &mut self, - haystack: Utf32Str<'_>, - needle_: Utf32Str<'_>, - start: usize, - end: usize, - indices: &mut Vec, - ) -> Option { - if needle_.len() != end - start { - return None; - } - assert!( - haystack.len() <= u32::MAX as usize, - "matching is only support for up to 2^32-1 codepoints" - ); - let score = match (haystack, needle_) { - (Utf32Str::Ascii(haystack), Utf32Str::Ascii(needle)) => { - let matched = if self.config.ignore_case { - AsciiChar::cast(haystack)[start..end] - .iter() - .map(|c| c.normalize(&self.config)) - .eq(AsciiChar::cast(needle) - .iter() - .map(|c| c.normalize(&self.config))) - } else { - haystack == needle - }; - if !matched { - return None; - } - self.calculate_score::( - AsciiChar::cast(haystack), - AsciiChar::cast(needle), - start, - end, - indices, - ) - } - (Utf32Str::Ascii(_), Utf32Str::Unicode(_)) => { - // a purely ascii haystack can never be transformed to match - // a needle that contains non-ascii chars since we don't allow gaps - return None; - } - (Utf32Str::Unicode(haystack), Utf32Str::Ascii(needle)) => { - let matched = haystack[start..end] - .iter() - .map(|c| c.normalize(&self.config)) - .eq(AsciiChar::cast(needle) - .iter() - .map(|c| c.normalize(&self.config))); - if !matched { - return None; - } - - self.calculate_score::( - haystack, - AsciiChar::cast(needle), - start, - end, - indices, - ) - } - (Utf32Str::Unicode(haystack), Utf32Str::Unicode(needle)) => { - let matched = haystack[start..end] - .iter() - .map(|c| c.normalize(&self.config)) - .eq(needle.iter().map(|c| c.normalize(&self.config))); - if !matched { - return None; - } - self.calculate_score::(haystack, needle, start, end, indices) - } - }; - Some(score) - } -} diff --git a/matcher/src/matrix.rs b/matcher/src/matrix.rs deleted file mode 100644 index a91ed95f..00000000 --- a/matcher/src/matrix.rs +++ /dev/null @@ -1,198 +0,0 @@ -use std::alloc::{alloc_zeroed, dealloc, handle_alloc_error, Layout}; -use std::marker::PhantomData; -use std::mem::size_of; -use std::panic::{RefUnwindSafe, UnwindSafe}; -use std::ptr::{slice_from_raw_parts_mut, NonNull}; - -use crate::chars::Char; - -const MAX_MATRIX_SIZE: usize = 100 * 1024; // 100*1024 = 100KB - -// these two aren't hard maxima, instead we simply allow whatever will fit into memory -const MAX_HAYSTACK_LEN: usize = 2048; // 64KB -const MAX_NEEDLE_LEN: usize = 2048; // 64KB - -struct MatrixLayout { - haystack_len: usize, - needle_len: usize, - layout: Layout, - haystack_off: usize, - bonus_off: usize, - rows_off: usize, - score_off: usize, - matrix_off: usize, - _phantom: PhantomData, -} -impl MatrixLayout { - fn new(haystack_len: usize, needle_len: usize) -> MatrixLayout { - assert!(haystack_len >= needle_len); - assert!(haystack_len <= u32::MAX as usize); - let mut layout = Layout::from_size_align(0, 1).unwrap(); - let haystack_layout = Layout::array::(haystack_len).unwrap(); - let bonus_layout = Layout::array::(haystack_len).unwrap(); - let rows_layout = Layout::array::(needle_len).unwrap(); - let score_layout = Layout::array::(haystack_len + 1 - needle_len).unwrap(); - let matrix_layout = - Layout::array::((haystack_len + 1 - needle_len) * needle_len).unwrap(); - - let haystack_off; - (layout, haystack_off) = layout.extend(haystack_layout).unwrap(); - let bonus_off; - (layout, bonus_off) = layout.extend(bonus_layout).unwrap(); - let rows_off; - (layout, rows_off) = layout.extend(rows_layout).unwrap(); - let score_off; - (layout, score_off) = layout.extend(score_layout).unwrap(); - let matrix_off; - (layout, matrix_off) = layout.extend(matrix_layout).unwrap(); - MatrixLayout { - haystack_len, - needle_len, - layout, - haystack_off, - bonus_off, - rows_off, - score_off, - matrix_off, - _phantom: PhantomData, - } - } - /// # Safety - /// - /// `ptr` must point at an allocated with MARTIX_ALLOC_LAYOUT - #[allow(clippy::type_complexity)] - unsafe fn fieds_from_ptr( - &self, - ptr: NonNull, - ) -> ( - *mut [C], - *mut [u8], - *mut [u16], - *mut [ScoreCell], - *mut [MatrixCell], - ) { - let base = ptr.as_ptr(); - let haystack = base.add(self.haystack_off) as *mut C; - let haystack = slice_from_raw_parts_mut(haystack, self.haystack_len); - let bonus = base.add(self.bonus_off); - let bonus = slice_from_raw_parts_mut(bonus, self.haystack_len); - let rows = base.add(self.rows_off) as *mut u16; - let rows = slice_from_raw_parts_mut(rows, self.needle_len); - let cells = base.add(self.score_off) as *mut ScoreCell; - let cells = slice_from_raw_parts_mut(cells, self.haystack_len + 1 - self.needle_len); - let matrix = base.add(self.matrix_off) as *mut MatrixCell; - let matrix = slice_from_raw_parts_mut( - matrix, - (self.haystack_len + 1 - self.needle_len) * self.haystack_len, - ); - (haystack, bonus, rows, cells, matrix) - } -} - -const _SIZE_CHECK: () = { - if size_of::() != 8 { - panic!() - } -}; - -// make this act like a u64 -#[repr(align(8))] -#[derive(Clone, Copy, PartialEq, Eq)] -pub(crate) struct ScoreCell { - pub score: u16, - pub consecutive_bonus: u8, - pub matched: bool, -} - -pub(crate) struct MatcherDataView<'a, C: Char> { - pub haystack: &'a mut [C], - // stored as a separate array instead of struct - // to avoid padding since char is too large and u8 too small :/ - pub bonus: &'a mut [u8], - pub current_row: &'a mut [ScoreCell], - pub row_offs: &'a mut [u16], - pub matrix_cells: &'a mut [MatrixCell], -} -#[repr(transparent)] -pub struct MatrixCell(pub(crate) u8); - -impl MatrixCell { - pub fn set(&mut self, p_match: bool, m_match: bool) { - self.0 = p_match as u8 | ((m_match as u8) << 1); - } - - pub fn get(&self, m_matrix: bool) -> bool { - let mask = m_matrix as u8 + 1; - (self.0 & mask) != 0 - } -} - -// we only use this to construct the layout for the slab allocation -#[allow(unused)] -struct MatcherData { - haystack: [char; MAX_HAYSTACK_LEN], - bonus: [u8; MAX_HAYSTACK_LEN], - row_offs: [u16; MAX_NEEDLE_LEN], - scratch_space: [ScoreCell; MAX_HAYSTACK_LEN], - matrix: [u8; MAX_MATRIX_SIZE], -} - -pub(crate) struct MatrixSlab(NonNull); -unsafe impl Sync for MatrixSlab {} -unsafe impl Send for MatrixSlab {} -impl UnwindSafe for MatrixSlab {} -impl RefUnwindSafe for MatrixSlab {} - -impl MatrixSlab { - pub fn new() -> Self { - let layout = Layout::new::(); - // safety: the matrix is never zero sized (hardcoded constants) - let ptr = unsafe { alloc_zeroed(layout) }; - let Some(ptr) = NonNull::new(ptr) else { - handle_alloc_error(layout) - }; - MatrixSlab(ptr.cast()) - } - - pub(crate) fn alloc( - &mut self, - haystack_: &[C], - needle_len: usize, - ) -> Option> { - let cells = haystack_.len() * needle_len; - if cells > MAX_MATRIX_SIZE - || haystack_.len() > u16::MAX as usize - // ensures that scores never overflow - || needle_len > MAX_NEEDLE_LEN - { - return None; - } - let matrix_layout = MatrixLayout::::new(haystack_.len(), needle_len); - if matrix_layout.layout.size() > size_of::() { - return None; - } - unsafe { - // safely: this allocation is valid for MATRIX_ALLOC_LAYOUT - let (haystack, bonus, rows, current_row, matrix_cells) = - matrix_layout.fieds_from_ptr(self.0); - // copy haystack before creating references to ensure we don't create - // references to invalid chars (which may or may not be UB) - haystack_ - .as_ptr() - .copy_to_nonoverlapping(haystack as *mut _, haystack_.len()); - Some(MatcherDataView { - haystack: &mut *haystack, - row_offs: &mut *rows, - bonus: &mut *bonus, - current_row: &mut *current_row, - matrix_cells: &mut *matrix_cells, - }) - } - } -} - -impl Drop for MatrixSlab { - fn drop(&mut self) { - unsafe { dealloc(self.0.as_ptr(), Layout::new::()) }; - } -} diff --git a/matcher/src/pattern.rs b/matcher/src/pattern.rs deleted file mode 100644 index b2028311..00000000 --- a/matcher/src/pattern.rs +++ /dev/null @@ -1,568 +0,0 @@ -//! This module provides a slightly higher level API for matching strings. - -use std::cmp::Reverse; - -use crate::{chars, Matcher, Utf32Str}; - -#[cfg(test)] -mod tests; - -use crate::Utf32String; - -#[derive(Clone, Copy, Debug, PartialEq, Eq, Default)] -#[non_exhaustive] -/// How to treat a case mismatch between two characters. -pub enum CaseMatching { - /// Characters never match their case folded version (`a != A`). - Respect, - /// Characters always match their case folded version (`a == A`). - #[cfg(feature = "unicode-casefold")] - Ignore, - /// Acts like [`Ignore`](CaseMatching::Ignore) if all characters in a pattern atom are - /// lowercase and like [`Respect`](CaseMatching::Respect) otherwise. - #[default] - #[cfg(feature = "unicode-casefold")] - Smart, -} - -#[derive(Clone, Copy, Debug, PartialEq, Eq, Default)] -#[non_exhaustive] -/// How to handle unicode normalization, -pub enum Normalization { - /// Characters never match their normalized version (`a != ä`). - Never, - /// Acts like [`Never`](Normalization::Never) if any character in a pattern atom - /// would need to be normalized. Otherwise normalization occurs (`a == ä` but `ä != a`). - #[default] - #[cfg(feature = "unicode-normalization")] - Smart, -} - -#[derive(Debug, PartialEq, Eq, Clone, Copy)] -#[non_exhaustive] -/// The kind of matching algorithm to run for an atom. -pub enum AtomKind { - /// Fuzzy matching where the needle must match any haystack characters - /// (match can contain gaps). This atom kind is used by default if no - /// special syntax is used. There is no negated fuzzy matching (too - /// many false positives). - /// - /// See also [`Matcher::fuzzy_match`](crate::Matcher::fuzzy_match). - Fuzzy, - /// The needle must match a contiguous sequence of haystack characters - /// without gaps. This atom kind is parsed from the following syntax: - /// `'foo` and `!foo` (negated). - /// - /// See also [`Matcher::substring_match`](crate::Matcher::substring_match). - Substring, - /// The needle must match all leading haystack characters without gaps or - /// prefix. This atom kind is parsed from the following syntax: `^foo` and - /// `!^foo` (negated). - /// - /// See also [`Matcher::prefix_match`](crate::Matcher::prefix_match). - Prefix, - /// The needle must match all trailing haystack characters without gaps or - /// postfix. This atom kind is parsed from the following syntax: `foo$` and - /// `!foo$` (negated). - /// - /// See also [`Matcher::postfix_match`](crate::Matcher::postfix_match). - Postfix, - /// The needle must match all haystack characters without gaps or prefix. - /// This atom kind is parsed from the following syntax: `^foo$` and `!^foo$` - /// (negated). - /// - /// See also [`Matcher::exact_match`](crate::Matcher::exact_match). - Exact, -} - -/// A single pattern component that is matched with a single [`Matcher`] function -#[derive(Debug, PartialEq, Eq, Clone)] -pub struct Atom { - /// Whether this pattern atom is a negative match. - /// A negative pattern atom will prevent haystacks matching it from - /// being matchend. It does not contribute to scoring/indices - pub negative: bool, - /// The kind of match that this pattern performs - pub kind: AtomKind, - needle: Utf32String, - ignore_case: bool, - normalize: bool, -} - -impl Atom { - /// Creates a single [`Atom`] from a string by performing unicode - /// normalization and case folding (if necessary). Optionally `\ ` can - /// be escaped to ` `. - pub fn new( - needle: &str, - case: CaseMatching, - normalize: Normalization, - kind: AtomKind, - escape_whitespace: bool, - ) -> Atom { - Atom::new_inner(needle, case, normalize, kind, escape_whitespace, false) - } - - fn new_inner( - needle: &str, - case: CaseMatching, - normalization: Normalization, - kind: AtomKind, - escape_whitespace: bool, - append_dollar: bool, - ) -> Atom { - let mut ignore_case; - let mut normalize; - #[cfg(feature = "unicode-normalization")] - { - normalize = matches!(normalization, Normalization::Smart); - } - #[cfg(not(feature = "unicode-normalization"))] - { - normalize = false; - } - let needle = if needle.is_ascii() { - let mut needle = if escape_whitespace { - if let Some((start, rem)) = needle.split_once("\\ ") { - let mut needle = start.to_owned(); - for rem in rem.split("\\ ") { - needle.push(' '); - needle.push_str(rem); - } - needle - } else { - needle.to_owned() - } - } else { - needle.to_owned() - }; - - match case { - #[cfg(feature = "unicode-casefold")] - CaseMatching::Ignore => { - ignore_case = true; - needle.make_ascii_lowercase() - } - #[cfg(feature = "unicode-casefold")] - CaseMatching::Smart => { - ignore_case = !needle.bytes().any(|b| b.is_ascii_uppercase()) - } - CaseMatching::Respect => ignore_case = false, - } - if append_dollar { - needle.push('$'); - } - Utf32String::Ascii(needle.into_boxed_str()) - } else { - let mut needle_ = Vec::with_capacity(needle.len()); - #[cfg(feature = "unicode-casefold")] - { - ignore_case = matches!(case, CaseMatching::Ignore | CaseMatching::Smart); - } - #[cfg(not(feature = "unicode-casefold"))] - { - ignore_case = false; - } - #[cfg(feature = "unicode-normalization")] - { - normalize = matches!(normalization, Normalization::Smart); - } - if escape_whitespace { - let mut saw_backslash = false; - for mut c in chars::graphemes(needle) { - if saw_backslash { - if c == ' ' { - needle_.push(' '); - saw_backslash = false; - continue; - } else { - needle_.push('\\'); - } - } - saw_backslash = c == '\\'; - match case { - #[cfg(feature = "unicode-casefold")] - CaseMatching::Ignore => c = chars::to_lower_case(c), - #[cfg(feature = "unicode-casefold")] - CaseMatching::Smart => { - ignore_case = ignore_case && !chars::is_upper_case(c) - } - CaseMatching::Respect => (), - } - match normalization { - #[cfg(feature = "unicode-normalization")] - Normalization::Smart => { - normalize = normalize && chars::normalize(c) == c; - } - Normalization::Never => (), - } - needle_.push(c); - } - } else { - let chars = chars::graphemes(needle).map(|mut c| { - match case { - #[cfg(feature = "unicode-casefold")] - CaseMatching::Ignore => c = chars::to_lower_case(c), - #[cfg(feature = "unicode-casefold")] - CaseMatching::Smart => { - ignore_case = ignore_case && !chars::is_upper_case(c); - } - CaseMatching::Respect => (), - } - match normalization { - #[cfg(feature = "unicode-normalization")] - Normalization::Smart => { - normalize = normalize && chars::normalize(c) == c; - } - Normalization::Never => (), - } - c - }); - needle_.extend(chars); - }; - if append_dollar { - needle_.push('$'); - } - Utf32String::Unicode(needle_.into_boxed_slice()) - }; - Atom { - kind, - needle, - negative: false, - ignore_case, - normalize, - } - } - - /// Parse a pattern atom from a string. Some special trailing and leading - /// characters can be used to control the atom kind. See [`AtomKind`] for - /// details. - pub fn parse(raw: &str, case: CaseMatching, normalize: Normalization) -> Atom { - let mut atom = raw; - let invert = match atom.as_bytes() { - [b'!', ..] => { - atom = &atom[1..]; - true - } - [b'\\', b'!', ..] => { - atom = &atom[1..]; - false - } - _ => false, - }; - - let mut kind = match atom.as_bytes() { - [b'^', ..] => { - atom = &atom[1..]; - AtomKind::Prefix - } - [b'\'', ..] => { - atom = &atom[1..]; - AtomKind::Substring - } - [b'\\', b'^' | b'\'', ..] => { - atom = &atom[1..]; - AtomKind::Fuzzy - } - _ => AtomKind::Fuzzy, - }; - - let mut append_dollar = false; - match atom.as_bytes() { - [.., b'\\', b'$'] => { - append_dollar = true; - atom = &atom[..atom.len() - 2] - } - [.., b'$'] => { - kind = if kind == AtomKind::Fuzzy { - AtomKind::Postfix - } else { - AtomKind::Exact - }; - atom = &atom[..atom.len() - 1] - } - _ => (), - } - - if invert && kind == AtomKind::Fuzzy { - kind = AtomKind::Substring - } - - let mut pattern = Atom::new_inner(atom, case, normalize, kind, true, append_dollar); - pattern.negative = invert; - pattern - } - - /// Matches this pattern against `haystack` (using the allocation and configuration - /// from `matcher`) and calculates a ranking score. See the [`Matcher`]. - /// Documentation for more details. - /// - /// *Note:* The `ignore_case` setting is overwritten to match the casing of - /// each pattern atom. - pub fn score(&self, haystack: Utf32Str<'_>, matcher: &mut Matcher) -> Option { - matcher.config.ignore_case = self.ignore_case; - matcher.config.normalize = self.normalize; - let pattern_score = match self.kind { - AtomKind::Exact => matcher.exact_match(haystack, self.needle.slice(..)), - AtomKind::Fuzzy => matcher.fuzzy_match(haystack, self.needle.slice(..)), - AtomKind::Substring => matcher.substring_match(haystack, self.needle.slice(..)), - AtomKind::Prefix => matcher.prefix_match(haystack, self.needle.slice(..)), - AtomKind::Postfix => matcher.postfix_match(haystack, self.needle.slice(..)), - }; - if self.negative { - if pattern_score.is_some() { - return None; - } - Some(0) - } else { - pattern_score - } - } - - /// Matches this pattern against `haystack` (using the allocation and - /// configuration from `matcher`), calculates a ranking score and the match - /// indices. See the [`Matcher`]. Documentation for more - /// details. - /// - /// *Note:* The `ignore_case` setting is overwritten to match the casing of - /// each pattern atom. - /// - /// *Note:* The `indices` vector is not cleared by this function. - pub fn indices( - &self, - haystack: Utf32Str<'_>, - matcher: &mut Matcher, - indices: &mut Vec, - ) -> Option { - matcher.config.ignore_case = self.ignore_case; - matcher.config.normalize = self.normalize; - if self.negative { - let pattern_score = match self.kind { - AtomKind::Exact => matcher.exact_match(haystack, self.needle.slice(..)), - AtomKind::Fuzzy => matcher.fuzzy_match(haystack, self.needle.slice(..)), - AtomKind::Substring => matcher.substring_match(haystack, self.needle.slice(..)), - AtomKind::Prefix => matcher.prefix_match(haystack, self.needle.slice(..)), - AtomKind::Postfix => matcher.postfix_match(haystack, self.needle.slice(..)), - }; - pattern_score.is_none().then_some(0) - } else { - match self.kind { - AtomKind::Exact => matcher.exact_indices(haystack, self.needle.slice(..), indices), - AtomKind::Fuzzy => matcher.fuzzy_indices(haystack, self.needle.slice(..), indices), - AtomKind::Substring => { - matcher.substring_indices(haystack, self.needle.slice(..), indices) - } - AtomKind::Prefix => { - matcher.prefix_indices(haystack, self.needle.slice(..), indices) - } - AtomKind::Postfix => { - matcher.postfix_indices(haystack, self.needle.slice(..), indices) - } - } - } - } - - /// Returns the needle text that is passed to the matcher. All indices - /// produced by the `indices` functions produce char indices used to index - /// this text - pub fn needle_text(&self) -> Utf32Str<'_> { - self.needle.slice(..) - } - /// Convenience function to easily match (and sort) a (relatively small) - /// list of inputs. - /// - /// *Note* This function is not recommended for building a full fuzzy - /// matching application that can match large numbers of matches (like all - /// files in a directory) as all matching is done on the current thread, - /// effectively blocking the UI. For such applications the high level - /// `nucleo` crate can be used instead. - pub fn match_list>( - &self, - items: impl IntoIterator, - matcher: &mut Matcher, - ) -> Vec<(T, u16)> { - if self.needle.is_empty() { - return items.into_iter().map(|item| (item, 0)).collect(); - } - let mut buf = Vec::new(); - let mut items: Vec<_> = items - .into_iter() - .filter_map(|item| { - self.score(Utf32Str::new(item.as_ref(), &mut buf), matcher) - .map(|score| (item, score)) - }) - .collect(); - items.sort_by_key(|(_, score)| Reverse(*score)); - items - } -} - -fn pattern_atoms(pattern: &str) -> impl Iterator + '_ { - let mut saw_backslash = false; - pattern.split(move |c| { - saw_backslash = match c { - ' ' if !saw_backslash => return true, - '\\' => true, - _ => false, - }; - false - }) -} - -#[derive(Debug, Default)] -/// A text pattern made up of (potentially multiple) [atoms](crate::pattern::Atom). -#[non_exhaustive] -pub struct Pattern { - /// The individual pattern (words) in this pattern - pub atoms: Vec, -} - -impl Pattern { - /// Creates a pattern where each word is matched individually (whitespaces - /// can be escaped with `\`). Otherwise no parsing is performed (so $, !, ' - /// and ^ don't receive special treatment). If you want to match the entire - /// pattern as a single needle use a single [`Atom`] instead. - pub fn new( - pattern: &str, - case_matching: CaseMatching, - normalize: Normalization, - kind: AtomKind, - ) -> Pattern { - let atoms = pattern_atoms(pattern) - .filter_map(|pat| { - let pat = Atom::new(pat, case_matching, normalize, kind, true); - (!pat.needle.is_empty()).then_some(pat) - }) - .collect(); - Pattern { atoms } - } - /// Creates a pattern where each word is matched individually (whitespaces - /// can be escaped with `\`). And $, !, ' and ^ at word boundaries will - /// cause different matching behaviour (see [`AtomKind`]). These can be - /// escaped with backslash. - pub fn parse(pattern: &str, case_matching: CaseMatching, normalize: Normalization) -> Pattern { - let atoms = pattern_atoms(pattern) - .filter_map(|pat| { - let pat = Atom::parse(pat, case_matching, normalize); - (!pat.needle.is_empty()).then_some(pat) - }) - .collect(); - Pattern { atoms } - } - - /// Convenience function to easily match (and sort) a (relatively small) - /// list of inputs. - /// - /// *Note* This function is not recommended for building a full fuzzy - /// matching application that can match large numbers of matches (like all - /// files in a directory) as all matching is done on the current thread, - /// effectively blocking the UI. For such applications the high level - /// `nucleo` crate can be used instead. - pub fn match_list( - &self, - items: impl IntoIterator, - matcher: &mut Matcher, - ) -> Vec<(T, u32)> { - if self.atoms.is_empty() { - return items.into_iter().map(|item| (item, 0)).collect(); - } - let mut buf = Vec::new(); - let mut items: Vec<_> = items - .into_iter() - .filter_map(|item| { - self.score(Utf32Str::new(item.string(), &mut buf), matcher) - .map(|score| (item, score)) - }) - .collect(); - items.sort_by_key(|(_, score)| Reverse(*score)); - items - } - - /// Matches this pattern against `haystack` (using the allocation and configuration - /// from `matcher`) and calculates a ranking score. See the [`Matcher`]. - /// Documentation for more details. - /// - /// *Note:* The `ignore_case` setting is overwritten to match the casing of - /// each pattern atom. - pub fn score(&self, haystack: Utf32Str<'_>, matcher: &mut Matcher) -> Option { - if self.atoms.is_empty() { - return Some(0); - } - let mut score = 0; - for pattern in &self.atoms { - score += pattern.score(haystack, matcher)? as u32; - } - Some(score) - } - - /// Matches this pattern against `haystack` (using the allocation and - /// configuration from `matcher`), calculates a ranking score and the match - /// indices. See the [`Matcher`]. Documentation for more - /// details. - /// - /// *Note:* The `ignore_case` setting is overwritten to match the casing of - /// each pattern atom. - /// - /// *Note:* The indices for each pattern are calculated individually - /// and simply appended to the `indices` vector and not deduplicated/sorted. - /// This allows associating the match indices to their source pattern. If - /// required (like for highlighting) unique/sorted indices can be obtained - /// as follows: - /// - /// ``` - /// # let mut indices: Vec = Vec::new(); - /// indices.sort_unstable(); - /// indices.dedup(); - /// ``` - pub fn indices( - &self, - haystack: Utf32Str<'_>, - matcher: &mut Matcher, - indices: &mut Vec, - ) -> Option { - if self.atoms.is_empty() { - return Some(0); - } - let mut score = 0; - for pattern in &self.atoms { - score += pattern.indices(haystack, matcher, indices)? as u32; - } - Some(score) - } - - /// Refreshes this pattern by reparsing it from a string. This is mostly - /// equivalent to just constructing a new pattern using [`Pattern::parse`] - /// but is slightly more efficient by reusing some allocations - pub fn reparse( - &mut self, - pattern: &str, - case_matching: CaseMatching, - normalize: Normalization, - ) { - self.atoms.clear(); - let atoms = pattern_atoms(pattern).filter_map(|atom| { - let atom = Atom::parse(atom, case_matching, normalize); - if atom.needle.is_empty() { - return None; - } - Some(atom) - }); - self.atoms.extend(atoms); - } -} - -impl Clone for Pattern { - fn clone(&self) -> Self { - Self { - atoms: self.atoms.clone(), - } - } - - fn clone_from(&mut self, source: &Self) { - self.atoms.clone_from(&source.atoms); - } -} - -pub trait Matchable { - fn string(&self) -> &str; -} diff --git a/matcher/src/pattern/tests.rs b/matcher/src/pattern/tests.rs deleted file mode 100644 index 246c2dbb..00000000 --- a/matcher/src/pattern/tests.rs +++ /dev/null @@ -1,114 +0,0 @@ -use crate::pattern::{Atom, AtomKind, CaseMatching, Normalization}; - -#[test] -fn negative() { - let pat = Atom::parse("!foo", CaseMatching::Smart, Normalization::Smart); - assert!(pat.negative); - assert_eq!(pat.kind, AtomKind::Substring); - assert_eq!(pat.needle.to_string(), "foo"); - let pat = Atom::parse("!^foo", CaseMatching::Smart, Normalization::Smart); - assert!(pat.negative); - assert_eq!(pat.kind, AtomKind::Prefix); - assert_eq!(pat.needle.to_string(), "foo"); - let pat = Atom::parse("!foo$", CaseMatching::Smart, Normalization::Smart); - assert!(pat.negative); - assert_eq!(pat.kind, AtomKind::Postfix); - assert_eq!(pat.needle.to_string(), "foo"); - let pat = Atom::parse("!^foo$", CaseMatching::Smart, Normalization::Smart); - assert!(pat.negative); - assert_eq!(pat.kind, AtomKind::Exact); - assert_eq!(pat.needle.to_string(), "foo"); -} - -#[test] -fn pattern_kinds() { - let pat = Atom::parse("foo", CaseMatching::Smart, Normalization::Smart); - assert!(!pat.negative); - assert_eq!(pat.kind, AtomKind::Fuzzy); - assert_eq!(pat.needle.to_string(), "foo"); - let pat = Atom::parse("'foo", CaseMatching::Smart, Normalization::Smart); - assert!(!pat.negative); - assert_eq!(pat.kind, AtomKind::Substring); - assert_eq!(pat.needle.to_string(), "foo"); - let pat = Atom::parse("^foo", CaseMatching::Smart, Normalization::Smart); - assert!(!pat.negative); - assert_eq!(pat.kind, AtomKind::Prefix); - assert_eq!(pat.needle.to_string(), "foo"); - let pat = Atom::parse("foo$", CaseMatching::Smart, Normalization::Smart); - assert!(!pat.negative); - assert_eq!(pat.kind, AtomKind::Postfix); - assert_eq!(pat.needle.to_string(), "foo"); - let pat = Atom::parse("^foo$", CaseMatching::Smart, Normalization::Smart); - assert!(!pat.negative); - assert_eq!(pat.kind, AtomKind::Exact); - assert_eq!(pat.needle.to_string(), "foo"); -} - -#[test] -fn case_matching() { - let pat = Atom::parse("foo", CaseMatching::Smart, Normalization::Smart); - assert!(pat.ignore_case); - assert_eq!(pat.needle.to_string(), "foo"); - let pat = Atom::parse("Foo", CaseMatching::Smart, Normalization::Smart); - assert!(!pat.ignore_case); - assert_eq!(pat.needle.to_string(), "Foo"); - let pat = Atom::parse("Foo", CaseMatching::Ignore, Normalization::Smart); - assert!(pat.ignore_case); - assert_eq!(pat.needle.to_string(), "foo"); - let pat = Atom::parse("Foo", CaseMatching::Respect, Normalization::Smart); - assert!(!pat.ignore_case); - assert_eq!(pat.needle.to_string(), "Foo"); - let pat = Atom::parse("Foo", CaseMatching::Respect, Normalization::Smart); - assert!(!pat.ignore_case); - assert_eq!(pat.needle.to_string(), "Foo"); - let pat = Atom::parse("Äxx", CaseMatching::Ignore, Normalization::Smart); - assert!(pat.ignore_case); - assert_eq!(pat.needle.to_string(), "äxx"); - let pat = Atom::parse("Äxx", CaseMatching::Respect, Normalization::Smart); - assert!(!pat.ignore_case); - let pat = Atom::parse("Axx", CaseMatching::Smart, Normalization::Smart); - assert!(!pat.ignore_case); - assert_eq!(pat.needle.to_string(), "Axx"); - let pat = Atom::parse("你xx", CaseMatching::Smart, Normalization::Smart); - assert!(pat.ignore_case); - assert_eq!(pat.needle.to_string(), "你xx"); - let pat = Atom::parse("你xx", CaseMatching::Ignore, Normalization::Smart); - assert!(pat.ignore_case); - assert_eq!(pat.needle.to_string(), "你xx"); - let pat = Atom::parse("Ⲽxx", CaseMatching::Smart, Normalization::Smart); - assert!(!pat.ignore_case); - assert_eq!(pat.needle.to_string(), "Ⲽxx"); - let pat = Atom::parse("Ⲽxx", CaseMatching::Ignore, Normalization::Smart); - assert!(pat.ignore_case); - assert_eq!(pat.needle.to_string(), "ⲽxx"); -} - -#[test] -fn escape() { - let pat = Atom::parse("foo\\ bar", CaseMatching::Smart, Normalization::Smart); - assert_eq!(pat.needle.to_string(), "foo bar"); - let pat = Atom::parse("\\!foo", CaseMatching::Smart, Normalization::Smart); - assert_eq!(pat.needle.to_string(), "!foo"); - assert_eq!(pat.kind, AtomKind::Fuzzy); - let pat = Atom::parse("\\'foo", CaseMatching::Smart, Normalization::Smart); - assert_eq!(pat.needle.to_string(), "'foo"); - assert_eq!(pat.kind, AtomKind::Fuzzy); - let pat = Atom::parse("\\^foo", CaseMatching::Smart, Normalization::Smart); - assert_eq!(pat.needle.to_string(), "^foo"); - assert_eq!(pat.kind, AtomKind::Fuzzy); - let pat = Atom::parse("foo\\$", CaseMatching::Smart, Normalization::Smart); - assert_eq!(pat.needle.to_string(), "foo$"); - assert_eq!(pat.kind, AtomKind::Fuzzy); - let pat = Atom::parse("^foo\\$", CaseMatching::Smart, Normalization::Smart); - assert_eq!(pat.needle.to_string(), "foo$"); - assert_eq!(pat.kind, AtomKind::Prefix); - let pat = Atom::parse("\\^foo\\$", CaseMatching::Smart, Normalization::Smart); - assert_eq!(pat.needle.to_string(), "^foo$"); - assert_eq!(pat.kind, AtomKind::Fuzzy); - let pat = Atom::parse("\\!^foo\\$", CaseMatching::Smart, Normalization::Smart); - assert_eq!(pat.needle.to_string(), "!^foo$"); - assert_eq!(pat.kind, AtomKind::Fuzzy); - let pat = Atom::parse("!\\^foo\\$", CaseMatching::Smart, Normalization::Smart); - assert_eq!(pat.needle.to_string(), "^foo$"); - assert_eq!(pat.kind, AtomKind::Substring); -} diff --git a/matcher/src/prefilter.rs b/matcher/src/prefilter.rs deleted file mode 100644 index 7e4de945..00000000 --- a/matcher/src/prefilter.rs +++ /dev/null @@ -1,95 +0,0 @@ -use ::memchr::{memchr, memchr2, memrchr, memrchr2}; - -use crate::chars::Char; -use crate::utf32_str::Utf32Str; -use crate::Matcher; - -#[inline(always)] -fn find_ascii_ignore_case(c: u8, haystack: &[u8]) -> Option { - if c >= b'a' && c <= b'z' { - memchr2(c, c - 32, haystack) - } else { - memchr(c, haystack) - } -} - -#[inline(always)] -fn find_ascii_ignore_case_rev(c: u8, haystack: &[u8]) -> Option { - if c >= b'a' && c <= b'z' { - memrchr2(c, c - 32, haystack) - } else { - memrchr(c, haystack) - } -} - -impl Matcher { - pub(crate) fn prefilter_ascii( - &self, - mut haystack: &[u8], - needle: &[u8], - only_greedy: bool, - ) -> Option<(usize, usize, usize)> { - if self.config.ignore_case { - let start = - find_ascii_ignore_case(needle[0], &haystack[..haystack.len() - needle.len() + 1])?; - let mut greedy_end = start + 1; - haystack = &haystack[greedy_end..]; - for &c in &needle[1..] { - let idx = find_ascii_ignore_case(c, haystack)? + 1; - greedy_end += idx; - haystack = &haystack[idx..]; - } - if only_greedy { - Some((start, greedy_end, greedy_end)) - } else { - let end = greedy_end - + find_ascii_ignore_case_rev(*needle.last().unwrap(), haystack) - .map_or(0, |i| i + 1); - Some((start, greedy_end, end)) - } - } else { - let start = memchr(needle[0], &haystack[..haystack.len() - needle.len() + 1])?; - let mut greedy_end = start + 1; - haystack = &haystack[greedy_end..]; - for &c in &needle[1..] { - let idx = memchr(c, haystack)? + 1; - greedy_end += idx; - haystack = &haystack[idx..]; - } - if only_greedy { - Some((start, greedy_end, greedy_end)) - } else { - let end = - greedy_end + memrchr(*needle.last().unwrap(), haystack).map_or(0, |i| i + 1); - Some((start, greedy_end, end)) - } - } - } - - pub(crate) fn prefilter_non_ascii( - &self, - haystack: &[char], - needle: Utf32Str<'_>, - only_greedy: bool, - ) -> Option<(usize, usize)> { - let needle_char = needle.get(0); - let start = haystack[..haystack.len() - needle.len() + 1] - .iter() - .position(|c| c.normalize(&self.config) == needle_char)?; - let needle_char = needle.last(); - if only_greedy { - Some((start, start + 1)) - } else { - let end = haystack.len() - - haystack[start + 1..] - .iter() - .rev() - .position(|c| c.normalize(&self.config) == needle_char)?; - if end - start < needle.len() { - return None; - } - - Some((start, end)) - } - } -} diff --git a/matcher/src/score.rs b/matcher/src/score.rs deleted file mode 100644 index c934a8ef..00000000 --- a/matcher/src/score.rs +++ /dev/null @@ -1,158 +0,0 @@ -use std::cmp::max; - -use crate::chars::{Char, CharClass}; -use crate::{Config, Matcher}; - -pub(crate) const SCORE_MATCH: u16 = 16; -pub(crate) const PENALTY_GAP_START: u16 = 3; -pub(crate) const PENALTY_GAP_EXTENSION: u16 = 1; -/// If the prefer_prefix option is enabled we want to penalize -/// the initial gap. The prefix should not be too much -pub(crate) const PREFIX_BONUS_SCALE: u16 = 2; -pub(crate) const MAX_PREFIX_BONUS: u16 = BONUS_BOUNDARY; - -// We prefer matches at the beginning of a word, but the bonus should not be -// too great to prevent the longer acronym matches from always winning over -// shorter fuzzy matches. The bonus point here was specifically chosen that -// the bonus is cancelled when the gap between the acronyms grows over -// 8 characters, which is approximately the average length of the words found -// in web2 dictionary and my file system. -pub(crate) const BONUS_BOUNDARY: u16 = SCORE_MATCH / 2; - -// Edge-triggered bonus for matches in camelCase words. -// Their value should be BONUS_BOUNDARY - PENALTY_GAP_EXTENSION = 7. -// However, this priporitzes camel case over non-camel case. -// In fzf/skim this is not a problem since they score off the max -// consecutive bonus. However, we don't do that (because its incorrect) -// so to avoids prioritizing camel we use a lower bonus. I think that's fine -// usually camel case is wekaer boundary than actual wourd boundaries anyway -// This also has the nice sideeffect of perfectly balancing out -// camel case, snake case and the consecutive version of the word -pub(crate) const BONUS_CAMEL123: u16 = BONUS_BOUNDARY - PENALTY_GAP_START; - -/// Although bonus point for non-word characters is non-contextual, we need it -/// for computing bonus points for consecutive chunks starting with a non-word -/// character. -pub(crate) const BONUS_NON_WORD: u16 = BONUS_BOUNDARY; - -// Minimum bonus point given to characters in consecutive chunks. -// Note that bonus points for consecutive matches shouldn't have needed if we -// used fixed match score as in the original algorithm. -pub(crate) const BONUS_CONSECUTIVE: u16 = PENALTY_GAP_START + PENALTY_GAP_EXTENSION; - -// The first character in the typed pattern usually has more significance -// than the rest so it's important that it appears at special positions where -// bonus points are given, e.g. "to-go" vs. "ongoing" on "og" or on "ogo". -// The amount of the extra bonus should be limited so that the gap penalty is -// still respected. -pub(crate) const BONUS_FIRST_CHAR_MULTIPLIER: u16 = 2; - -impl Config { - #[inline] - pub(crate) fn bonus_for(&self, prev_class: CharClass, class: CharClass) -> u16 { - if class > CharClass::Delimiter { - // transition from non word to word - match prev_class { - CharClass::Whitespace => return self.bonus_boundary_white, - CharClass::Delimiter => return self.bonus_boundary_delimiter, - CharClass::NonWord => return BONUS_BOUNDARY, - _ => (), - } - } - if prev_class == CharClass::Lower && class == CharClass::Upper - || prev_class != CharClass::Number && class == CharClass::Number - { - // camelCase letter123 - BONUS_CAMEL123 - } else if class == CharClass::Whitespace { - self.bonus_boundary_white - } else if class == CharClass::NonWord { - return BONUS_NON_WORD; - } else { - 0 - } - } -} -impl Matcher { - #[inline(always)] - pub(crate) fn bonus_for(&self, prev_class: CharClass, class: CharClass) -> u16 { - self.config.bonus_for(prev_class, class) - } - - pub(crate) fn calculate_score, N: Char>( - &mut self, - haystack: &[H], - needle: &[N], - start: usize, - end: usize, - indices: &mut Vec, - ) -> u16 { - if INDICES { - indices.reserve(needle.len()); - } - - let mut prev_class = start - .checked_sub(1) - .map(|i| haystack[i].char_class(&self.config)) - .unwrap_or(self.config.initial_char_class); - let mut needle_iter = needle.iter(); - let mut needle_char = *needle_iter.next().unwrap(); - - let mut in_gap = false; - let mut consecutive = 1; - - // unrolled the first iteration to make applying the first char multiplier less awkward - if INDICES { - indices.push(start as u32) - } - let class = haystack[start].char_class(&self.config); - let mut first_bonus = self.bonus_for(prev_class, class); - let mut score = SCORE_MATCH + first_bonus * BONUS_FIRST_CHAR_MULTIPLIER; - prev_class = class; - needle_char = *needle_iter.next().unwrap_or(&needle_char); - - for (i, c) in haystack[start + 1..end].iter().enumerate() { - let (c, class) = c.char_class_and_normalize(&self.config); - if c == needle_char { - if INDICES { - indices.push(i as u32 + start as u32 + 1) - } - let mut bonus = self.bonus_for(prev_class, class); - if consecutive != 0 { - if bonus >= BONUS_BOUNDARY && bonus > first_bonus { - first_bonus = bonus - } - bonus = max(max(bonus, first_bonus), BONUS_CONSECUTIVE); - } else { - first_bonus = bonus; - } - score += SCORE_MATCH + bonus; - in_gap = false; - consecutive += 1; - if let Some(&next) = needle_iter.next() { - needle_char = next; - } - } else { - let penalty = if in_gap { - PENALTY_GAP_EXTENSION - } else { - PENALTY_GAP_START - }; - score = score.saturating_sub(penalty); - in_gap = true; - consecutive = 0; - } - prev_class = class; - } - if self.config.prefer_prefix { - if start != 0 { - let penalty = PENALTY_GAP_START - + PENALTY_GAP_START * (start - 1).min(u16::MAX as usize) as u16; - score += MAX_PREFIX_BONUS.saturating_sub(penalty / PREFIX_BONUS_SCALE); - } else { - score += MAX_PREFIX_BONUS; - } - } - score - } -} diff --git a/matcher/src/tests.rs b/matcher/src/tests.rs deleted file mode 100644 index abae3e69..00000000 --- a/matcher/src/tests.rs +++ /dev/null @@ -1,728 +0,0 @@ -use crate::chars::Char; -use crate::pattern::{CaseMatching, Normalization, Pattern}; -use crate::score::{ - BONUS_BOUNDARY, BONUS_CAMEL123, BONUS_CONSECUTIVE, BONUS_FIRST_CHAR_MULTIPLIER, BONUS_NON_WORD, - MAX_PREFIX_BONUS, PENALTY_GAP_EXTENSION, PENALTY_GAP_START, SCORE_MATCH, -}; -use crate::utf32_str::Utf32Str; -use crate::{Config, Matcher}; - -use Algorithm::*; - -#[derive(Debug)] -enum Algorithm { - FuzzyOptimal, - FuzzyGreedy, - Substring, - Prefix, - Postfix, - Exact, -} - -fn assert_matches( - algorithm: &[Algorithm], - normalize: bool, - case_sensitive: bool, - path: bool, - prefer_prefix: bool, - cases: &[(&str, &str, &[u32], u16)], -) { - let mut config = Config { - normalize, - ignore_case: !case_sensitive, - prefer_prefix, - ..Config::DEFAULT - }; - if path { - config.set_match_paths(); - } - let mut matcher = Matcher::new(config); - let mut matched_indices = Vec::new(); - let mut needle_buf = Vec::new(); - let mut haystack_buf = Vec::new(); - for &(haystack, needle, indices, mut score) in cases { - let needle = if !case_sensitive { - needle.to_lowercase() - } else { - needle.to_owned() - }; - let needle = Utf32Str::new(&needle, &mut needle_buf); - let haystack = Utf32Str::new(haystack, &mut haystack_buf); - score += needle.len() as u16 * SCORE_MATCH; - for algo in algorithm { - println!("xx {matched_indices:?} {algo:?}"); - matched_indices.clear(); - let res = match algo { - FuzzyOptimal => matcher.fuzzy_indices(haystack, needle, &mut matched_indices), - FuzzyGreedy => matcher.fuzzy_indices_greedy(haystack, needle, &mut matched_indices), - Substring => matcher.substring_indices(haystack, needle, &mut matched_indices), - Prefix => matcher.prefix_indices(haystack, needle, &mut matched_indices), - Postfix => matcher.postfix_indices(haystack, needle, &mut matched_indices), - Exact => matcher.exact_indices(haystack, needle, &mut matched_indices), - }; - println!("{matched_indices:?}"); - let match_chars: Vec<_> = matched_indices - .iter() - .map(|&i| haystack.get(i).normalize(&matcher.config)) - .collect(); - let needle_chars: Vec<_> = needle.chars().collect(); - - assert_eq!( - res, - Some(score), - "{needle:?} did not match {haystack:?}: matched {match_chars:?} {matched_indices:?} {algo:?}" - ); - assert_eq!( - matched_indices, indices, - "{needle:?} match {haystack:?} {algo:?}" - ); - assert_eq!( - match_chars, needle_chars, - "{needle:?} match {haystack:?} indices are incorrect {matched_indices:?} {algo:?}" - ); - } - } -} - -pub fn assert_not_matches( - normalize: bool, - case_sensitive: bool, - path: bool, - cases: &[(&str, &str)], -) { - let mut config = Config { - normalize, - ignore_case: !case_sensitive, - ..Config::DEFAULT - }; - if path { - config.set_match_paths(); - } - let mut matcher = Matcher::new(config); - let mut needle_buf = Vec::new(); - let mut haystack_buf = Vec::new(); - for &(haystack, needle) in cases { - let needle = if !case_sensitive { - needle.to_lowercase() - } else { - needle.to_owned() - }; - let needle = Utf32Str::new(&needle, &mut needle_buf); - let haystack = Utf32Str::new(haystack, &mut haystack_buf); - - let res = matcher.fuzzy_match(haystack, needle); - assert_eq!(res, None, "{needle:?} should not match {haystack:?}"); - let res = matcher.fuzzy_match_greedy(haystack, needle); - assert_eq!( - res, None, - "{needle:?} should not match {haystack:?} (greedy)" - ); - let res = matcher.substring_match(haystack, needle); - assert_eq!( - res, None, - "{needle:?} should not match {haystack:?} (substring)" - ); - let res = matcher.prefix_match(haystack, needle); - assert_eq!( - res, None, - "{needle:?} should not match {haystack:?} (prefix)" - ); - let res = matcher.postfix_match(haystack, needle); - assert_eq!( - res, None, - "{needle:?} should not match {haystack:?} (postfix)" - ); - } -} - -const BONUS_BOUNDARY_WHITE: u16 = Config::DEFAULT.bonus_boundary_white; -const BONUS_BOUNDARY_DELIMITER: u16 = Config::DEFAULT.bonus_boundary_delimiter; - -#[test] -fn test_fuzzy() { - assert_matches( - &[FuzzyGreedy, FuzzyOptimal], - false, - false, - false, - false, - &[ - ( - "fooBarbaz1", - "obr", - &[2, 3, 5], - BONUS_CAMEL123 - PENALTY_GAP_START, - ), - ( - "/usr/share/doc/at/ChangeLog", - "changelog", - &[18, 19, 20, 21, 22, 23, 24, 25, 26], - (BONUS_FIRST_CHAR_MULTIPLIER + 8) * BONUS_BOUNDARY_DELIMITER, - ), - ( - "fooBarbaz1", - "br", - &[3, 5], - BONUS_CAMEL123 * BONUS_FIRST_CHAR_MULTIPLIER - PENALTY_GAP_START, - ), - ( - "foo bar baz", - "fbb", - &[0, 4, 8], - BONUS_BOUNDARY_WHITE * BONUS_FIRST_CHAR_MULTIPLIER + BONUS_BOUNDARY_WHITE * 2 - - 2 * PENALTY_GAP_START - - 4 * PENALTY_GAP_EXTENSION, - ), - ( - "/AutomatorDocument.icns", - "rdoc", - &[9, 10, 11, 12], - BONUS_CAMEL123 + 2 * BONUS_CONSECUTIVE, - ), - ( - "/man1/zshcompctl.1", - "zshc", - &[6, 7, 8, 9], - BONUS_BOUNDARY_DELIMITER * (BONUS_FIRST_CHAR_MULTIPLIER + 3), - ), - ( - "/.oh-my-zsh/cache", - "zshc", - &[8, 9, 10, 12], - BONUS_BOUNDARY * (BONUS_FIRST_CHAR_MULTIPLIER + 2) - PENALTY_GAP_START - + BONUS_BOUNDARY_DELIMITER, - ), - ( - "ab0123 456", - "12356", - &[3, 4, 5, 8, 9], - BONUS_CONSECUTIVE * 3 - PENALTY_GAP_START - PENALTY_GAP_EXTENSION, - ), - ( - "abc123 456", - "12356", - &[3, 4, 5, 8, 9], - BONUS_CAMEL123 * (BONUS_FIRST_CHAR_MULTIPLIER + 2) - - PENALTY_GAP_START - - PENALTY_GAP_EXTENSION - + BONUS_CONSECUTIVE, - ), - ( - "foo/bar/baz", - "fbb", - &[0, 4, 8], - BONUS_BOUNDARY_WHITE * BONUS_FIRST_CHAR_MULTIPLIER + BONUS_BOUNDARY_DELIMITER * 2 - - 2 * PENALTY_GAP_START - - 4 * PENALTY_GAP_EXTENSION, - ), - ( - "fooBarBaz", - "fbb", - &[0, 3, 6], - BONUS_BOUNDARY_WHITE * BONUS_FIRST_CHAR_MULTIPLIER + BONUS_CAMEL123 * 2 - - 2 * PENALTY_GAP_START - - 2 * PENALTY_GAP_EXTENSION, - ), - ( - "foo barbaz", - "fbb", - &[0, 4, 7], - BONUS_BOUNDARY_WHITE * BONUS_FIRST_CHAR_MULTIPLIER + BONUS_BOUNDARY_WHITE - - PENALTY_GAP_START * 2 - - PENALTY_GAP_EXTENSION * 3, - ), - ( - "fooBar Baz", - "foob", - &[0, 1, 2, 3], - BONUS_BOUNDARY_WHITE * (BONUS_FIRST_CHAR_MULTIPLIER + 3), - ), - ( - "xFoo-Bar Baz", - "foo-b", - &[1, 2, 3, 4, 5], - BONUS_CAMEL123 * (BONUS_FIRST_CHAR_MULTIPLIER + 2) + 2 * BONUS_NON_WORD, - ), - ], - ); -} - -#[test] -fn empty_needle() { - assert_matches( - &[Substring, Prefix, Postfix, FuzzyGreedy, FuzzyOptimal, Exact], - false, - false, - false, - false, - &[("foo bar baz", "", &[], 0)], - ); -} - -#[test] -fn test_substring() { - assert_matches( - &[Substring, Prefix], - false, - false, - false, - false, - &[ - ( - "foo bar baz", - "foo", - &[0, 1, 2], - BONUS_BOUNDARY_WHITE * (BONUS_FIRST_CHAR_MULTIPLIER + 2), - ), - ( - " foo bar baz", - "FOO", - &[1, 2, 3], - BONUS_BOUNDARY_WHITE * (BONUS_FIRST_CHAR_MULTIPLIER + 2), - ), - ( - " foo bar baz", - " FOO", - &[0, 1, 2, 3], - BONUS_BOUNDARY_WHITE * (BONUS_FIRST_CHAR_MULTIPLIER + 3), - ), - ], - ); - assert_matches( - &[Substring, Postfix], - false, - false, - false, - false, - &[ - ( - "foo bar baz", - "baz", - &[8, 9, 10], - BONUS_BOUNDARY_WHITE * (BONUS_FIRST_CHAR_MULTIPLIER + 2), - ), - ( - "foo bar baz ", - "baz", - &[8, 9, 10], - BONUS_BOUNDARY_WHITE * (BONUS_FIRST_CHAR_MULTIPLIER + 2), - ), - ( - "foo bar baz ", - "baz ", - &[8, 9, 10, 11], - BONUS_BOUNDARY_WHITE * (BONUS_FIRST_CHAR_MULTIPLIER + 3), - ), - ], - ); - assert_matches( - &[Substring, Prefix, Postfix, Exact, FuzzyGreedy, FuzzyOptimal], - false, - false, - false, - false, - &[ - ( - "foo", - "foo", - &[0, 1, 2], - BONUS_BOUNDARY_WHITE * (BONUS_FIRST_CHAR_MULTIPLIER + 2), - ), - ( - " foo", - "foo", - &[1, 2, 3], - BONUS_BOUNDARY_WHITE * (BONUS_FIRST_CHAR_MULTIPLIER + 2), - ), - ( - " foo", - " foo", - &[0, 1, 2, 3], - BONUS_BOUNDARY_WHITE * (BONUS_FIRST_CHAR_MULTIPLIER + 3), - ), - ], - ); - assert_matches( - &[Substring], - false, - false, - false, - false, - &[ - ( - "fooBarbaz1", - "oba", - &[2, 3, 4], - BONUS_CAMEL123 + BONUS_CONSECUTIVE, - ), - ( - "/AutomatorDocument.icns", - "rdoc", - &[9, 10, 11, 12], - BONUS_CAMEL123 + 2 * BONUS_CONSECUTIVE, - ), - ( - "/man1/zshcompctl.1", - "zshc", - &[6, 7, 8, 9], - BONUS_BOUNDARY_DELIMITER * (BONUS_FIRST_CHAR_MULTIPLIER + 3), - ), - ( - "/.oh-my-zsh/cache", - "zsh/c", - &[8, 9, 10, 11, 12], - BONUS_BOUNDARY * (BONUS_FIRST_CHAR_MULTIPLIER + 2) - + BONUS_NON_WORD - + BONUS_BOUNDARY_DELIMITER, - ), - ], - ); -} - -#[test] -fn test_fuzzy_case_sensitive() { - assert_matches( - &[FuzzyGreedy, FuzzyOptimal], - false, - true, - false, - false, - &[ - ( - "fooBarbaz1", - "oBr", - &[2, 3, 5], - BONUS_CAMEL123 - PENALTY_GAP_START, - ), - ( - "Foo/Bar/Baz", - "FBB", - &[0, 4, 8], - BONUS_BOUNDARY_WHITE * BONUS_FIRST_CHAR_MULTIPLIER + BONUS_BOUNDARY_DELIMITER * 2 - - 2 * PENALTY_GAP_START - - 4 * PENALTY_GAP_EXTENSION, - ), - ( - "FooBarBaz", - "FBB", - &[0, 3, 6], - BONUS_BOUNDARY_WHITE * BONUS_FIRST_CHAR_MULTIPLIER + BONUS_CAMEL123 * 2 - - 2 * PENALTY_GAP_START - - 2 * PENALTY_GAP_EXTENSION, - ), - ( - "FooBar Baz", - "FooB", - &[0, 1, 2, 3], - BONUS_BOUNDARY_WHITE * (BONUS_FIRST_CHAR_MULTIPLIER + 3), - ), - ("foo-bar", "o-ba", &[2, 3, 4, 5], BONUS_NON_WORD * 3), - ], - ); -} - -#[test] -fn test_normalize() { - assert_matches( - &[FuzzyGreedy, FuzzyOptimal], - true, - false, - false, - false, - &[ - ( - "Só Danço Samba", - "So", - &[0, 1], - BONUS_BOUNDARY_WHITE * (BONUS_FIRST_CHAR_MULTIPLIER + 1), - ), - ( - "Só Danço Samba", - "sodc", - &[0, 1, 3, 6], - BONUS_BOUNDARY_WHITE * (BONUS_FIRST_CHAR_MULTIPLIER + 1) - PENALTY_GAP_START - + BONUS_BOUNDARY_WHITE - - PENALTY_GAP_START - - PENALTY_GAP_EXTENSION, - ), - ( - "Danço", - "danco", - &[0, 1, 2, 3, 4], - BONUS_BOUNDARY_WHITE * (BONUS_FIRST_CHAR_MULTIPLIER + 4), - ), - ( - "DanÇo", - "danco", - &[0, 1, 2, 3, 4], - BONUS_BOUNDARY_WHITE * (BONUS_FIRST_CHAR_MULTIPLIER + 4), - ), - ( - "xÇando", - "cando", - &[1, 2, 3, 4, 5], - BONUS_CAMEL123 * (BONUS_FIRST_CHAR_MULTIPLIER + 4), - ), - ("ۂ(GCGɴCG", "n", &[5], 0), - ], - ) -} - -#[test] -fn test_unicode() { - assert_matches( - &[FuzzyGreedy, FuzzyOptimal, Substring], - true, - false, - false, - false, - &[( - "你好世界", - "你好", - &[0, 1], - BONUS_BOUNDARY_WHITE * (BONUS_FIRST_CHAR_MULTIPLIER + 1), - )], - ); - assert_matches( - &[FuzzyGreedy, FuzzyOptimal], - true, - false, - false, - false, - &[( - "你好世界", - "你世", - &[0, 2], - BONUS_BOUNDARY_WHITE * BONUS_FIRST_CHAR_MULTIPLIER - PENALTY_GAP_START, - )], - ); - assert_not_matches( - false, - false, - false, - &[("Flibbertigibbet / イタズラっ子たち", "lying")], - ); -} - -#[test] -fn test_long_str() { - assert_matches( - &[FuzzyGreedy, FuzzyOptimal], - false, - false, - false, - false, - &[( - &"x".repeat(u16::MAX as usize + 1), - "xx", - &[0, 1], - BONUS_BOUNDARY_WHITE * (BONUS_FIRST_CHAR_MULTIPLIER + 1), - )], - ); -} - -#[test] -fn test_casing() { - assert_matches( - &[FuzzyGreedy, FuzzyOptimal], - false, - false, - false, - false, - &[ - // these two have the same score - ( - "fooBar", - "foobar", - &[0, 1, 2, 3, 4, 5], - BONUS_BOUNDARY_WHITE * (BONUS_FIRST_CHAR_MULTIPLIER + 5), - ), - ( - "foobar", - "foobar", - &[0, 1, 2, 3, 4, 5], - BONUS_BOUNDARY_WHITE * (BONUS_FIRST_CHAR_MULTIPLIER + 5), - ), - // these two have the same score (slightly lower than the other two: 60 instead of 70) - ( - "foo-bar", - "foobar", - &[0, 1, 2, 4, 5, 6], - BONUS_BOUNDARY_WHITE * (BONUS_FIRST_CHAR_MULTIPLIER + 2) - PENALTY_GAP_START - + BONUS_BOUNDARY * 3, - ), - ( - "foo_bar", - "foobar", - &[0, 1, 2, 4, 5, 6], - BONUS_BOUNDARY_WHITE * (BONUS_FIRST_CHAR_MULTIPLIER + 2) - PENALTY_GAP_START - + BONUS_BOUNDARY * 3, - ), - ], - ) -} - -#[test] -fn test_optimal() { - assert_matches( - &[FuzzyOptimal], - false, - false, - false, - false, - &[ - ( - "axxx xx ", - "xx", - &[5, 6], - BONUS_BOUNDARY_WHITE * (BONUS_FIRST_CHAR_MULTIPLIER + 1), - ), - ( - "SS!H", - "S!", - &[0, 2], - BONUS_BOUNDARY_WHITE * BONUS_FIRST_CHAR_MULTIPLIER - PENALTY_GAP_START - + BONUS_NON_WORD, - ), - // this case is a cool example of why our algorithm is more than fzf - // we handle this corretly detect that it's better to match - // the second f instead of the third yielding a higher score - // (despite using the same scoring function!) - ( - "xf.foo", - "xfoo", - &[0, 3, 4, 5], - BONUS_BOUNDARY_WHITE * BONUS_FIRST_CHAR_MULTIPLIER - - PENALTY_GAP_START - - PENALTY_GAP_EXTENSION - + BONUS_BOUNDARY * 3, - ), - ( - "xf fo", - "xfo", - &[0, 3, 4], - BONUS_BOUNDARY_WHITE * (BONUS_FIRST_CHAR_MULTIPLIER + 2) - - PENALTY_GAP_START - - PENALTY_GAP_EXTENSION, - ), - ], - ); -} - -#[test] -fn test_reject() { - assert_not_matches( - true, - false, - false, - &[ - ("你好界", "abc"), - ("你好界", "a"), - ("你好世界", "富"), - ("Só Danço Samba", "sox"), - ("fooBarbaz", "fooBarbazz"), - ("fooBarbaz", "c"), - ], - ); - assert_not_matches( - true, - true, - false, - &[ - ("你好界", "abc"), - ("abc", "你"), - ("abc", "A"), - ("abc", "d"), - ("你好世界", "富"), - ("Só Danço Samba", "sox"), - ("fooBarbaz", "oBZ"), - ("Foo Bar Baz", "fbb"), - ("fooBarbaz", "fooBarbazz"), - ], - ); - assert_not_matches( - false, - true, - false, - &[ - ("Só Danço Samba", "sod"), - ("Só Danço Samba", "soc"), - ("Só Danç", "So"), - ], - ); - assert_not_matches(false, false, false, &[("ۂۂfoۂۂ", "foo")]); -} - -#[test] -fn test_prefer_prefix() { - assert_matches( - &[FuzzyOptimal, FuzzyGreedy], - false, - false, - false, - true, - &[ - ( - "Moby Dick", - "md", - &[0, 5], - BONUS_BOUNDARY_WHITE * (BONUS_FIRST_CHAR_MULTIPLIER + 1) + MAX_PREFIX_BONUS - - PENALTY_GAP_START - - 3 * PENALTY_GAP_EXTENSION, - ), - ( - "Though I cannot tell why it was exactly that those stage managers, the Fates, put me down for this shabby part of a whaling voyage", - "md", - &[82, 85], - BONUS_BOUNDARY_WHITE * (BONUS_FIRST_CHAR_MULTIPLIER + 1) - - PENALTY_GAP_START - - PENALTY_GAP_EXTENSION, - ), - ], - ); -} - -#[test] -fn test_single_char_needle() { - assert_matches( - &[FuzzyOptimal], - false, - false, - false, - false, - &[( - "foO", - "o", - &[2], - BONUS_FIRST_CHAR_MULTIPLIER * BONUS_CAMEL123, - )], - ); - assert_matches( - &[FuzzyOptimal], - false, - false, - false, - false, - &[( - "föÖ", - "ö", - &[2], - BONUS_FIRST_CHAR_MULTIPLIER * BONUS_CAMEL123, - )], - ); -} - -#[test] -fn umlaut() { - let paths = ["be", "bë"]; - let mut matcher = Matcher::new(Config::DEFAULT); - let matches = Pattern::parse("ë", CaseMatching::Ignore, Normalization::Smart) - .match_list(paths, &mut matcher); - assert_eq!(matches.len(), 1); - let matches = Pattern::parse("e", CaseMatching::Ignore, Normalization::Never) - .match_list(paths, &mut matcher); - assert_eq!(matches.len(), 1); - let matches = Pattern::parse("e", CaseMatching::Ignore, Normalization::Smart) - .match_list(paths, &mut matcher); - assert_eq!(matches.len(), 2); -} diff --git a/matcher/src/utf32_str.rs b/matcher/src/utf32_str.rs deleted file mode 100644 index 1821b46c..00000000 --- a/matcher/src/utf32_str.rs +++ /dev/null @@ -1,355 +0,0 @@ -use std::borrow::Cow; -use std::ops::{Bound, RangeBounds}; -use std::{fmt, slice}; - -use crate::chars; - -/// A UTF32 encoded (char array) string that is used as an input to (fuzzy) matching. -/// -/// Usually rusts' utf8 encoded strings are great. However during fuzzy matching -/// operates on codepoints (it should operate on graphemes but that's too much -/// hassle to deal with). We want to quickly iterate these codepoints between -/// (up to 5 times) during matching. -/// -/// Doing codepoint segmentation on the fly not only blows trough the cache -/// (lookuptables and Icache) but also has nontrivial runtime compared to the -/// matching itself. Furthermore there are a lot of exta optimizations available -/// for ascii only text (but checking during each match has too much overhead). -/// -/// Ofcourse this comes at exta memory cost as we usually still need the ut8 -/// encoded variant for rendering. In the (dominant) case of ascii-only text -/// we don't require a copy. Furthermore fuzzy matching usually is applied while -/// the user is typing on the fly so the same item is potentially matched many -/// times (making the the upfront cost more worth it). That means that its -/// basically always worth it to presegment the string. -/// -/// For usecases that only match (a lot of) strings once its possible to keep -/// char buffer around that is filled with the presegmented chars -/// -/// Another advantage of this approach is that the matcher will naturally -/// produce char indices (instead of utf8 offsets) anyway. With a -/// codepoint basic representation like this the indices can be used -/// directly -#[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Copy, Hash)] -pub enum Utf32Str<'a> { - /// A string represented as ASCII encoded bytes. - /// Correctness invariant: must only contain valid ASCII (<=127) - Ascii(&'a [u8]), - /// A string represented as an array of unicode codepoints (basically UTF-32). - Unicode(&'a [char]), -} - -impl<'a> Utf32Str<'a> { - /// Convenience method to construct a `Utf32Str` from a normal utf8 str - pub fn new(str: &'a str, buf: &'a mut Vec) -> Self { - if str.is_ascii() { - Utf32Str::Ascii(str.as_bytes()) - } else { - buf.clear(); - buf.extend(crate::chars::graphemes(str)); - if buf.iter().all(|c| c.is_ascii()) { - return Utf32Str::Ascii(str.as_bytes()); - } - Utf32Str::Unicode(&*buf) - } - } - - /// Returns the number of characters in this string. - #[inline] - pub fn len(self) -> usize { - match self { - Utf32Str::Unicode(codepoints) => codepoints.len(), - Utf32Str::Ascii(ascii_bytes) => ascii_bytes.len(), - } - } - - /// Returns whether this string is empty. - #[inline] - pub fn is_empty(self) -> bool { - match self { - Utf32Str::Unicode(codepoints) => codepoints.is_empty(), - Utf32Str::Ascii(ascii_bytes) => ascii_bytes.is_empty(), - } - } - - /// Creates a slice with a string that contains the characters in - /// the specified **character range**. - #[inline] - pub fn slice(self, range: impl RangeBounds) -> Utf32Str<'a> { - let start = match range.start_bound() { - Bound::Included(&start) => start, - Bound::Excluded(&start) => start + 1, - Bound::Unbounded => 0, - }; - let end = match range.end_bound() { - Bound::Included(&end) => end + 1, - Bound::Excluded(&end) => end, - Bound::Unbounded => self.len(), - }; - match self { - Utf32Str::Ascii(bytes) => Utf32Str::Ascii(&bytes[start..end]), - Utf32Str::Unicode(codepoints) => Utf32Str::Unicode(&codepoints[start..end]), - } - } - - /// Returns the number of leading whitespaces in this string - #[inline] - pub(crate) fn leading_white_space(self) -> usize { - match self { - Utf32Str::Ascii(bytes) => bytes - .iter() - .position(|b| !b.is_ascii_whitespace()) - .unwrap_or(0), - Utf32Str::Unicode(codepoints) => codepoints - .iter() - .position(|c| !c.is_whitespace()) - .unwrap_or(0), - } - } - - /// Returns the number of leading whitespaces in this string - #[inline] - pub(crate) fn trailing_white_space(self) -> usize { - match self { - Utf32Str::Ascii(bytes) => bytes - .iter() - .rev() - .position(|b| !b.is_ascii_whitespace()) - .unwrap_or(0), - Utf32Str::Unicode(codepoints) => codepoints - .iter() - .rev() - .position(|c| !c.is_whitespace()) - .unwrap_or(0), - } - } - - /// Same as `slice` but accepts a u32 range for convenience since - /// those are the indices returned by the matcher. - #[inline] - pub fn slice_u32(self, range: impl RangeBounds) -> Utf32Str<'a> { - let start = match range.start_bound() { - Bound::Included(&start) => start as usize, - Bound::Excluded(&start) => start as usize + 1, - Bound::Unbounded => 0, - }; - let end = match range.end_bound() { - Bound::Included(&end) => end as usize + 1, - Bound::Excluded(&end) => end as usize, - Bound::Unbounded => self.len(), - }; - match self { - Utf32Str::Ascii(bytes) => Utf32Str::Ascii(&bytes[start..end]), - Utf32Str::Unicode(codepoints) => Utf32Str::Unicode(&codepoints[start..end]), - } - } - - /// Returns whether this string only contains ascii text. - pub fn is_ascii(self) -> bool { - matches!(self, Utf32Str::Ascii(_)) - } - - /// Returns the `n`th character in this string. - pub fn get(self, n: u32) -> char { - match self { - Utf32Str::Ascii(bytes) => bytes[n as usize] as char, - Utf32Str::Unicode(codepoints) => codepoints[n as usize], - } - } - pub(crate) fn last(self) -> char { - match self { - Utf32Str::Ascii(bytes) => bytes[bytes.len() - 1] as char, - Utf32Str::Unicode(codepoints) => codepoints[codepoints.len() - 1], - } - } - - pub(crate) fn first(self) -> char { - match self { - Utf32Str::Ascii(bytes) => bytes[0] as char, - Utf32Str::Unicode(codepoints) => codepoints[0], - } - } - - /// Returns an iterator over the characters in this string - pub fn chars(self) -> Chars<'a> { - match self { - Utf32Str::Ascii(bytes) => Chars::Ascii(bytes.iter()), - Utf32Str::Unicode(codepoints) => Chars::Unicode(codepoints.iter()), - } - } -} - -impl fmt::Debug for Utf32Str<'_> { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - write!(f, "\"")?; - for c in self.chars() { - for c in c.escape_debug() { - write!(f, "{c}")? - } - } - write!(f, "\"") - } -} - -impl fmt::Display for Utf32Str<'_> { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - for c in self.chars() { - write!(f, "{c}")? - } - Ok(()) - } -} - -pub enum Chars<'a> { - Ascii(slice::Iter<'a, u8>), - Unicode(slice::Iter<'a, char>), -} -impl<'a> Iterator for Chars<'a> { - type Item = char; - - fn next(&mut self) -> Option { - match self { - Chars::Ascii(iter) => iter.next().map(|&c| c as char), - Chars::Unicode(iter) => iter.next().copied(), - } - } -} - -impl DoubleEndedIterator for Chars<'_> { - fn next_back(&mut self) -> Option { - match self { - Chars::Ascii(iter) => iter.next_back().map(|&c| c as char), - Chars::Unicode(iter) => iter.next_back().copied(), - } - } -} - -#[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Hash)] -/// An owned version of [`Utf32Str`]. -pub enum Utf32String { - /// A string represented as ASCII encoded bytes. - /// Correctness invariant: must only contain valid ASCII (<=127) - Ascii(Box), - /// A string represented as an array of unicode codepoints (basically UTF-32). - Unicode(Box<[char]>), -} - -impl Default for Utf32String { - fn default() -> Self { - Self::Ascii(String::new().into_boxed_str()) - } -} - -impl Utf32String { - /// Returns the number of characters in this string. - #[inline] - pub fn len(&self) -> usize { - match self { - Utf32String::Unicode(codepoints) => codepoints.len(), - Utf32String::Ascii(ascii_bytes) => ascii_bytes.len(), - } - } - - /// Returns whether this string is empty. - #[inline] - pub fn is_empty(&self) -> bool { - match self { - Utf32String::Unicode(codepoints) => codepoints.is_empty(), - Utf32String::Ascii(ascii_bytes) => ascii_bytes.is_empty(), - } - } - - /// Creates a slice with a string that contains the characters in - /// the specified **character range**. - #[inline] - pub fn slice(&self, range: impl RangeBounds) -> Utf32Str { - let start = match range.start_bound() { - Bound::Included(&start) => start, - Bound::Excluded(&start) => start + 1, - Bound::Unbounded => 0, - }; - let end = match range.end_bound() { - Bound::Included(&end) => end + 1, - Bound::Excluded(&end) => end, - Bound::Unbounded => self.len(), - }; - match self { - Utf32String::Ascii(bytes) => Utf32Str::Ascii(&bytes.as_bytes()[start..end]), - Utf32String::Unicode(codepoints) => Utf32Str::Unicode(&codepoints[start..end]), - } - } - - /// Same as `slice` but accepts a u32 range for convenience since - /// those are the indices returned by the matcher. - #[inline] - pub fn slice_u32(&self, range: impl RangeBounds) -> Utf32Str { - let start = match range.start_bound() { - Bound::Included(&start) => start, - Bound::Excluded(&start) => start + 1, - Bound::Unbounded => 0, - }; - let end = match range.end_bound() { - Bound::Included(&end) => end + 1, - Bound::Excluded(&end) => end, - Bound::Unbounded => self.len() as u32, - }; - match self { - Utf32String::Ascii(bytes) => { - Utf32Str::Ascii(&bytes.as_bytes()[start as usize..end as usize]) - } - Utf32String::Unicode(codepoints) => { - Utf32Str::Unicode(&codepoints[start as usize..end as usize]) - } - } - } -} - -impl From<&str> for Utf32String { - #[inline] - fn from(value: &str) -> Self { - if value.is_ascii() { - Self::Ascii(value.to_owned().into_boxed_str()) - } else { - Self::Unicode(chars::graphemes(value).collect()) - } - } -} - -impl From> for Utf32String { - fn from(value: Box) -> Self { - if value.is_ascii() { - Self::Ascii(value) - } else { - Self::Unicode(chars::graphemes(&value).collect()) - } - } -} - -impl From for Utf32String { - #[inline] - fn from(value: String) -> Self { - value.into_boxed_str().into() - } -} - -impl<'a> From> for Utf32String { - #[inline] - fn from(value: Cow<'a, str>) -> Self { - match value { - Cow::Borrowed(value) => value.into(), - Cow::Owned(value) => value.into(), - } - } -} - -impl fmt::Debug for Utf32String { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - write!(f, "{:?}", self.slice(..)) - } -} - -impl fmt::Display for Utf32String { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - write!(f, "{}", self.slice(..)) - } -} diff --git a/src/completion.rs b/src/completion.rs deleted file mode 100644 index ebbbcebd..00000000 --- a/src/completion.rs +++ /dev/null @@ -1,847 +0,0 @@ -use std::{path::{Path, PathBuf}, time::SystemTime}; - -use itertools::Itertools; -use nanoid::nanoid; - -use nucleo_matcher::{ - pattern::{self, Matchable, Normalization}, - Matcher, -}; -use once_cell::sync::Lazy; -use rayon::prelude::*; - -use regex::Regex; -use tower_lsp::lsp_types::{ - Command, CompletionItem, CompletionItemKind, CompletionItemLabelDetails, CompletionList, - CompletionParams, CompletionResponse, CompletionTextEdit, Documentation, InsertTextFormat, - MarkupContent, MarkupKind, Position, Range, TextEdit, Url, -}; - -use crate::{ - ui::preview_referenceable, - vault::{ - get_obsidian_ref_path, Block, MyRange, Preview, Reference, Referenceable, Refname, Vault, MDTag, - }, -}; - -fn get_wikilink_index(line: &Vec, cursor_character: usize) -> Option { - let index = line.get(0..=cursor_character)? // select only the characters up to the cursor - .iter() - .enumerate() // attach indexes - .tuple_windows() // window into pairs of characters - .collect::>() - .into_iter() - .rev() // search from the cursor back - .find(|((_, &c1), (_, &c2))| c1 == '[' && c2 == '[') - .map(|(_, (i, _))| i); // only take the index; using map because find returns an option - - index.and_then(|index| { - if line.get(index..cursor_character)?.into_iter().contains(&']') { - return None - } else { - return Some(index) - } - }) -} - -/// Range indexes for one line of the file; NOT THE WHOLE FILE -type LineRange = std::ops::Range; - -#[derive(Debug, PartialEq, Eq)] -struct CompletableMDLink { - display: (String, LineRange), - path: (String, LineRange), - infile_ref: Option<(String, LineRange)>, - partial: (String, LineRange), - full_range: LineRange, -} - -fn get_completable_mdlink(line: &Vec, cursor_character: usize) -> Option { - let line_to_cursor = line.get(0..cursor_character)?; - - static PARTIAL_MDLINK_REGEX: Lazy = Lazy::new(|| { - Regex::new(r"\[(?[^\[\]\(\)]*)\]\((?[^\[\]\(\)\#]*)(\#(?[^\[\]\(\)]*))?$").unwrap() - }); // [display](relativePath) - - let string_to_char = String::from_iter(line_to_cursor); - - let captures = PARTIAL_MDLINK_REGEX.captures(&string_to_char)?; - - let (full, display, reftext, infileref) = ( - captures.get(0)?, - captures.name("display")?, - captures.name("path")?, - captures.name("infileref"), - ); - - let reference_under_cursor = - Reference::new(&String::from_iter(line)) - .into_iter() - .find(|reference| { - reference.range.start.character <= cursor_character as u32 - && reference.range.end.character >= cursor_character as u32 - }); - - let full_range = match reference_under_cursor { - Some( - reference @ (Reference::MDFileLink(..) - | Reference::MDHeadingLink(..) - | Reference::MDIndexedBlockLink(..)), - ) => reference.range.start.character as usize..reference.range.end.character as usize, - None if line.get(cursor_character) == Some(&')') => { - full.range().start..full.range().end + 1 - } - _ => full.range(), - }; - - let partial = Some(CompletableMDLink { - path: (reftext.as_str().to_string(), reftext.range()), - display: (display.as_str().to_string(), display.range()), - infile_ref: infileref - .map(|infile_ref| (infile_ref.as_str().to_string(), infile_ref.range())), - partial: (full.as_str().to_string(), full.range()), - full_range, - }); - - return partial; -} - -#[derive(Debug, PartialEq, Eq)] -pub struct CompletableTag { - full_range: LineRange, - /// Tag name and range not including the '#' - inputted_tag: (String, LineRange) -} - -fn get_completable_tag(line: &Vec, cursor_character: usize) -> Option { - static PARTIAL_TAG_REGEX: Lazy = Lazy::new(|| { - Regex::new(r"\#(?[a-zA-Z0-9\/]*)").unwrap() - }); - - let line_string = String::from_iter(line); - - let captures_iter = PARTIAL_TAG_REGEX.captures_iter(&line_string); - - return captures_iter - .flat_map(|captures| { - - let (full, tag_text) = ( - captures.get(0)?, - captures.name("text")?, - ); - - // check if the cursor is in the tag - let preceding_character = cursor_character - 1; // User is inserting into the position after the character they are looking at; "#tag|" cursor is a position 4; I want pos 3; the end of the tag - if preceding_character >= full.range().start && preceding_character < full.range().end { // end is exclusive - return Some(CompletableTag { - full_range: full.range(), - inputted_tag: (tag_text.as_str().to_string(), tag_text.range()) - }) - } else { - return None - } - - }) - .next() - - - -} - -pub fn get_completions( - vault: &Vault, - initial_completion_files: &[PathBuf], - params: &CompletionParams, - _path: &Path, -) -> Option { - let Ok(path) = params - .text_document_position - .text_document - .uri - .to_file_path() - else { - return None; - }; - - let line = params.text_document_position.position.line as usize; - let character = params.text_document_position.position.character as usize; - - let selected_line = vault.select_line(&path.to_path_buf(), line as isize)?; - - if let Some(index) = get_wikilink_index(&selected_line, character) { - - // completions for wikilinks `[[text|` where | is the cursor - let range = Range { - start: Position { - line: line as u32, - character: index as u32 + 1, - }, - end: Position { - line: line as u32, - character: character as u32, - }, - }; - - let cmp_text = selected_line.get(index + 1..character)?; - - return match *cmp_text { - [] => Some(CompletionResponse::List(CompletionList { - items: initial_completion_files - .iter() - .map(|path| { - match std::fs::metadata(path).and_then(|meta| meta.modified()) { - Ok(modified) => (path, modified), - Err(_) => (path, SystemTime::UNIX_EPOCH), - } - }) - .sorted_by_key(|(_, modified)| *modified) - .take(5) - .filter_map(|(path_i, _)| { - Some( - vault - .select_referenceable_nodes(Some(path_i)) - .into_iter() - .filter(|referenceable| { - if initial_completion_files.len() > 1 { - if *path_i != path { - !matches!(referenceable, Referenceable::Tag(_, _)) - && !matches!( - referenceable, - Referenceable::Footnote(_, _) - ) - } else { - false - } - } else { - !matches!(referenceable, Referenceable::Tag(_, _)) - && !matches!( - referenceable, - Referenceable::Footnote(_, _) - ) - } - }) - .collect_vec(), - ) - }) - .flatten() - .filter_map(|referenceable| { - default_completion_item(vault, &referenceable, None) - }) - .collect::>(), - is_incomplete: true, - })), - [' ', ref text @ ..] if !text.contains(&']') => { - let blocks = vault.select_blocks(); - - let matches = fuzzy_match(&String::from_iter(text), blocks); - - let rand_id = nanoid!( - 5, - &[ - 'a', 'b', 'c', 'd', 'e', 'f', 'g', '1', '2', '3', '4', '5', '6', '7', '8', - '9' - ] - ); - - return Some(CompletionResponse::List(CompletionList { - is_incomplete: true, - items: matches - .into_par_iter() - .take(50) - .filter(|(block, _)| { - String::from_iter(selected_line.clone()).trim() != block.text - }) - .flat_map(|(block, rank)| { - let path_ref = get_obsidian_ref_path(vault.root_dir(), &block.file)?; - let file_name = block.file.file_stem()?.to_str()?; - - let url = Url::from_file_path(&block.file).ok()?; - Some(CompletionItem { - label: block.text.clone(), - sort_text: Some(rank.to_string()), - documentation: Some(Documentation::MarkupContent(MarkupContent { - kind: MarkupKind::Markdown, - value: (block.range.start.line as isize - 5 - ..=block.range.start.line as isize + 5) - .flat_map(|i| Some((vault.select_line(&block.file, i)?, i))) - .map(|(iter, ln)| { - if ln == block.range.start.line as isize { - format!("**{}**\n", String::from_iter(iter).trim()) - // highlight the block to be references - } else { - String::from_iter(iter) - } - }) - .join(""), - })), - filter_text: Some(format!(" {}", block.text)), - text_edit: Some(CompletionTextEdit::Edit(TextEdit { - range, - new_text: format!("{}#^{}", file_name, rand_id), - })), - command: Some(Command { - title: "Insert Block Reference Into File".into(), - command: "apply_edits".into(), - arguments: Some(vec![serde_json::to_value( - tower_lsp::lsp_types::WorkspaceEdit { - changes: Some( - vec![( - url, - vec![TextEdit { - range: Range { - start: Position { - line: block.range.end.line, - character: block - .range - .end - .character - - 1, - }, - end: Position { - line: block.range.end.line, - character: block - .range - .end - .character - - 1, - }, - }, - new_text: format!(" ^{}", rand_id), - }], - )] - .into_iter() - .collect(), - ), - change_annotations: None, - document_changes: None, - }, - ) - .ok()?]), - }), - ..Default::default() - }) - }) - .collect(), - })); - } - ref filter_text @ [..] if !filter_text.contains(&']') => { - let all_links = MatchableReferenceable::from_vault(vault); - let matches = fuzzy_match(&String::from_iter(filter_text), all_links); - - return Some(CompletionResponse::List(CompletionList { - is_incomplete: true, - items: matches - .into_iter() - .take(30) - .filter(|(MatchableReferenceable(r, name), _)| { - !(*name == String::from_iter(filter_text) && matches!(r, Referenceable::UnresovledFile(..) | Referenceable::UnresolvedHeading(..) | Referenceable::UnresovledIndexedBlock(..))) - }) - .filter_map(|(MatchableReferenceable(referenceable, _), rank)| { - default_completion_item( - vault, - &referenceable, - Some(CompletionTextEdit::Edit(TextEdit { - range, - new_text: referenceable.get_refname(&vault.root_dir())?.file_refname()? - })), - ) - .and_then(|item| Some(CompletionItem { - sort_text: Some(rank.to_string()), - filter_text: Some(referenceable.get_refname(&vault.root_dir())?.to_string()), - ..item - })) - }) - .collect::>(), - })); - } - _ => None, - }; - } else if let Some(partialmdlink) = get_completable_mdlink(&selected_line, character) { - match partialmdlink { - CompletableMDLink { - path, - infile_ref, - full_range, - display, - partial, - } => { - let inputted_refname = format!( - "{}{}", - path.0, - infile_ref - .clone() - .map(|(string, _)| format!("#{}", string)) - .unwrap_or("".to_string()) - ); - - - let all_links = MatchableReferenceable::from_vault(vault); - - let matches = fuzzy_match(&inputted_refname, all_links); - - return Some(CompletionResponse::List(CompletionList { - is_incomplete: true, - items: matches - .into_iter() - .take(50) - .filter(|(MatchableReferenceable(r, name), _)| - !(*name == inputted_refname && matches!(r, Referenceable::UnresovledFile(..) | Referenceable::UnresolvedHeading(..) | Referenceable::UnresovledIndexedBlock(..))) - ) - .flat_map(|(MatchableReferenceable(referenceable, _), rank)| { - default_completion_item( - vault, - &referenceable, - Some(CompletionTextEdit::Edit(TextEdit { - range: Range { - start: Position { - line: line as u32, - character: full_range.start as u32, - }, - end: Position { - line: line as u32, - character: full_range.end as u32, - }, - }, - new_text: format!( - "[${{1:{}}}]({}{}{}{})", - match ( - display.0.as_str(), - referenceable.get_refname(vault.root_dir())?.infile_ref - ) { - ("", Some(infile_ref_text)) => infile_ref_text.clone(), - ("", None) => { - match referenceable { - Referenceable::File(_, mdfile) => { - match mdfile.headings.first() { - Some(heading) => { - heading.heading_text.clone() - } - None => "".to_string(), - } - } - - _ => "".to_string(), - } - } - (display_text, _) => display_text.to_string(), - }, - if referenceable - .get_refname(vault.root_dir())? - .path? - .contains(" ") - { - "<" - } else { - "" - }, - referenceable - .get_refname(vault.root_dir())? - .link_file_key()?, - match referenceable - .get_refname(vault.root_dir())? - .infile_ref - { - Some(string) => format!("#{}", string), - None => "".to_string(), - }, - if referenceable - .get_refname(vault.root_dir())? - .path? - .contains(" ") - { - ">" - } else { - "" - }, - ), - })), - ) - .and_then(|item| { - Some(CompletionItem { - label: format!("{}{}", - referenceable.get_refname(vault.root_dir())?.link_file_key()?, - referenceable.get_refname(vault.root_dir())?.infile_ref.map(|thing| format!("#{}", thing)).unwrap_or("".into()) - ), - sort_text: Some(rank.to_string()), - insert_text_format: Some(InsertTextFormat::SNIPPET), - filter_text: Some(format!( - "[{}]({}", - display.0, - referenceable.get_refname(vault.root_dir())?.to_string() - )), - ..item - }) - }) - }) - .collect::>(), - })); - } - } - } else if let Some(CompletableTag{ full_range, inputted_tag: (completable_tag_name, tag_name_range) }) = get_completable_tag(&selected_line, character) { - // Initial Tag completion - let tag_refereneables = - vault - .select_referenceable_nodes(None) - .into_iter() - .flat_map(|referenceable| match referenceable { - tag @ Referenceable::Tag(_, _) => Some(tag), - _ => None, - }) - .flat_map(|tag| Some(MatchableReferenceable(tag.clone(), tag.get_refname(&vault.root_dir())?.path?))) - .collect_vec(); - - let matches = fuzzy_match(&completable_tag_name, tag_refereneables); - - return Some(CompletionResponse::List(CompletionList { - is_incomplete: true, - items: matches - .into_iter() - .take(20) - .filter(|(MatchableReferenceable(_, tag_name), _)| *tag_name != completable_tag_name) - .flat_map(|(MatchableReferenceable(tag, tag_name), ranking)| { - default_completion_item(vault, &tag, Some(CompletionTextEdit::Edit(TextEdit { - new_text: format!("#{}", tag_name.clone()), - range: Range { - start: Position { - line: line as u32, - character: full_range.start as u32, - }, - end: Position { - line: line as u32, - character: full_range.end as u32, - }, - } - }))) - .map(|item| CompletionItem { - label: tag_name.clone(), - sort_text: Some(ranking.to_string()), - filter_text: Some(format!("#{}", tag_name)), - ..item - }) - }) - .unique_by(|c| c.label.to_owned()) - .collect_vec(), - })); - } else if character - .checked_sub(1) - .and_then(|start| selected_line.get(start..character)) - == Some(&['[']) - { - let footnote_referenceables = vault - .select_referenceable_nodes(Some(&path)) - .into_iter() - .flat_map(|referenceable| match referenceable { - Referenceable::Footnote(footnote_path, _) - if footnote_path.as_path() == path.as_path() => - { - Some(referenceable) - } - _ => None, - }); - - return Some(CompletionResponse::Array( - footnote_referenceables - .filter_map(|footnote| { - footnote - .get_refname(vault.root_dir()) - .map(|root| CompletionItem { - kind: Some(CompletionItemKind::REFERENCE), - label: root.clone(), - documentation: preview_referenceable(vault, &footnote) - .map(Documentation::MarkupContent), - filter_text: vault - .select_referenceable_preview(&footnote) - .and_then(|preview| match preview { - Preview::Text(string) => Some(string), - Preview::Empty => None, - }) - .map(|preview_string| format!("{}{}", *root, &preview_string)), - ..Default::default() - }) - }) - .unique_by(|c| c.label.to_owned()) - .collect_vec(), - )); - } else { - return None; - } -} - -fn default_completion_item( - vault: &Vault, - referenceable: &Referenceable, - text_edit: Option, -) -> Option { - let refname = referenceable.get_refname(vault.root_dir())?; - let completion = CompletionItem { - kind: match &referenceable { - Referenceable::File(..) => Some(CompletionItemKind::FILE), - Referenceable::Heading(..) - | Referenceable::IndexedBlock(..) - | Referenceable::Footnote(..) - | Referenceable::LinkRefDef(..) - => Some(CompletionItemKind::REFERENCE), - Referenceable::UnresovledFile(..) - | Referenceable::UnresolvedHeading(..) - | Referenceable::UnresovledIndexedBlock(..) - => Some(CompletionItemKind::KEYWORD), - Referenceable::Tag(..) => Some(CompletionItemKind::CONSTANT), - }, - label: refname.file_refname()?, - label_details: match referenceable.is_unresolved() { - true => Some(CompletionItemLabelDetails { - detail: Some("Unresolved".into()), - description: None, - }), - false => None, - }, - text_edit, - documentation: preview_referenceable(vault, referenceable) - .map(Documentation::MarkupContent), - ..Default::default() - }; - - Some(completion) -} - -struct MatchableReferenceable<'a>(Referenceable<'a>, String); - -impl MatchableReferenceable<'_> { - fn from_vault<'a>(vault: &'a Vault) -> Vec> { - let all_links = vault - .select_referenceable_nodes(None) - .into_par_iter() - .filter(|referenceable| { - !matches!(referenceable, Referenceable::Tag(..)) - && !matches!(referenceable, Referenceable::Footnote(..)) - }) - .filter_map(|referenceable| { - referenceable - .get_refname(vault.root_dir()) - .map(|string| MatchableReferenceable(referenceable, string.to_string())) - }) - .collect::>(); - - all_links - } -} - -impl Matchable for MatchableReferenceable<'_> { - fn string(&self) -> &str { - &self.1 - } -} - -impl Matchable for Block { - fn string(&self) -> &str { - &self.text - } -} - -fn fuzzy_match( - filter_text: &str, - items: impl IntoIterator, -) -> Vec<(T, u32)> { - let mut matcher = Matcher::new(nucleo_matcher::Config::DEFAULT); - let matches = pattern::Pattern::parse( - filter_text, - pattern::CaseMatching::Ignore, - Normalization::Smart, - ) - .match_list(items, &mut matcher); - - return matches; -} - -#[cfg(test)] -mod tests { - use itertools::Itertools; - - use super::{get_wikilink_index, CompletableMDLink, CompletableTag, get_completable_tag}; - - #[test] - fn test_index() { - let s = "test [[linjfkdfjds]]"; - - let expected = 6; - - let actual = get_wikilink_index(&s.chars().collect(), 10); - - assert_eq!(Some(expected), actual); - - assert_eq!(Some("lin"), s.get(expected + 1..10)); - } - - #[test] - fn test_partial_mdlink() { - let line = "This is line [display](partialpa"; // (th) - - let expected = Some(CompletableMDLink { - partial: ("[display](partialpa".to_string(), 13..32), - display: ("display".to_string(), 14..21), - path: ("partialpa".to_string(), 23..32), - infile_ref: None, - full_range: 13..32, - }); - - let actual = super::get_completable_mdlink(&line.chars().collect(), 32); - - assert_eq!(actual, expected); - - let line = "This is line [display](partialpath)"; // (th) - - let expected = Some(CompletableMDLink { - partial: ("[display](partialpa".to_string(), 13..32), - display: ("display".to_string(), 14..21), - path: ("partialpa".to_string(), 23..32), - infile_ref: None, - full_range: 13..35, - }); - - let actual = super::get_completable_mdlink(&line.chars().collect(), 32); - - assert_eq!(actual, expected); - - let line = "[disp](pp) This is line [display](partialpath)"; // (th) - - let expected = Some(CompletableMDLink { - partial: ("[display](partialpa".to_string(), 24..43), - display: ("display".to_string(), 25..32), - path: ("partialpa".to_string(), 34..43), - infile_ref: None, - full_range: 24..46, - }); - - let actual = super::get_completable_mdlink(&line.chars().collect(), 43); - - assert_eq!(actual, expected); - - let line = "[disp](pp) This is line [display](partialpath)"; // (th) - - let expected = Some(CompletableMDLink { - partial: ("[display](partialpath".to_string(), 24..45), - display: ("display".to_string(), 25..32), - path: ("partialpath".to_string(), 34..45), - infile_ref: None, - full_range: 24..46, - }); - - let actual = super::get_completable_mdlink(&line.chars().collect(), 45); - - assert_eq!(actual, expected); - } - - #[test] - fn test_partial_mdlink_infile_refs() { - let line = "This is line [display](partialpa#"; // (th) - - let expected = Some(CompletableMDLink { - partial: ("[display](partialpa#".to_string(), 13..33), - display: ("display".to_string(), 14..21), - path: ("partialpa".to_string(), 23..32), - infile_ref: Some(("".to_string(), 33..33)), - full_range: 13..33, - }); - - let actual = super::get_completable_mdlink(&line.chars().collect(), 33); - - assert_eq!(actual, expected); - - let line = "[disp](pp) This is line [display](partialpath#Display)"; // (th) - - let expected = Some(CompletableMDLink { - partial: ("[display](partialpath#Display".to_string(), 24..53), - display: ("display".to_string(), 25..32), - path: ("partialpath".to_string(), 34..45), - infile_ref: Some(("Display".to_string(), 46..53)), - full_range: 24..54, - }); - - let actual = super::get_completable_mdlink(&line.chars().collect(), 53); - - assert_eq!(actual, expected); - - let line = "[disp](pp) This is line [display](partialpath#Display)"; // (th) - - let expected = Some(CompletableMDLink { - partial: ("[display](partialpath#Disp".to_string(), 24..50), - display: ("display".to_string(), 25..32), - path: ("partialpath".to_string(), 34..45), - infile_ref: Some(("Disp".to_string(), 46..50)), - full_range: 24..54, - }); - - let actual = super::get_completable_mdlink(&line.chars().collect(), 50); - - assert_eq!(actual, expected); - } - - #[test] - fn test_completable_tag_parsing() { - // 0 1 2 - // 01234567890123456789012345678 - let text = "text over here #tag more text"; - - let insert_position = 19; - - let expected = CompletableTag { - full_range: 15..19, - inputted_tag: ("tag".to_string(), 16..19) // not inclusive - }; - - let actual = get_completable_tag(&text.chars().collect_vec(), insert_position); - - - assert_eq!(Some(expected), actual); - - - - // 0 1 2 - // 01234567890123456789012345678 - let text = "text over here #tag more text"; - - let insert_position = 20; - - let actual = get_completable_tag(&text.chars().collect_vec(), insert_position); - - - assert_eq!(None, actual); - - - // 0 1 2 - // 01234567890123456789012345678 - let text = "text over here # more text"; - - let insert_position = 16; - - let actual = get_completable_tag(&text.chars().collect_vec(), insert_position); - - let expected = Some(CompletableTag { - full_range: 15..16, - inputted_tag: ("".to_string(), 16..16) - }); - - - assert_eq!(expected, actual); - - - // 0 1 2 - // 01234567890123456789012345678 - let text = "text over here #tag mor #tag "; - - let insert_position = 28; - - let expected = CompletableTag { - full_range: 24..28, - inputted_tag: ("tag".to_string(), 25..28) // not inclusive - }; - - let actual = get_completable_tag(&text.chars().collect_vec(), insert_position); - - - assert_eq!(Some(expected), actual); - - - } -} - diff --git a/src/completion/link_completer.rs b/src/completion/link_completer.rs new file mode 100644 index 00000000..3865df31 --- /dev/null +++ b/src/completion/link_completer.rs @@ -0,0 +1,382 @@ +use std::{path::PathBuf, time::SystemTime}; + +use itertools::Itertools; +use once_cell::sync::Lazy; +use rayon::prelude::*; +use regex::Regex; +use tower_lsp::lsp_types::{CompletionItem, CompletionItemKind, CompletionItemLabelDetails, CompletionTextEdit, Position, Range, TextEdit}; + +use crate::vault::{get_obsidian_ref_path, MDFile, MDHeading, MDIndexedBlock, Reference, Referenceable, Vault}; + +use super::{matcher::{fuzzy_match, fuzzy_match_completions, Matchable, OrderedCompletion}, Completable, Completer, Context}; + +/// Range on a single line; assumes that the line number is known. +type LineRange = std::ops::Range; + +pub struct MarkdownLinkCompleter<'a> { + /// The display text of a link to be completed + display: (String, LineRange), + /// the filepath of the markdown link to be completed + path: (String, LineRange), + /// the infile ref; the range is the whole span of the infile ref. (including the ^ for Block refs) + infile_ref: Option<(PartialInfileRef, LineRange)>, + + partial_link: (String, LineRange), + full_range: LineRange, + line_nr: usize, + file_path: std::path::PathBuf, + vault: &'a Vault +} + +impl<'a> Completer<'a> for MarkdownLinkCompleter<'a> { + + fn construct(context: Context<'a>, path: &std::path::Path, line: usize, character: usize) -> Option + where Self: Sized { + + let Context { vault, opened_files: _ } = context; + + let line_chars = vault.select_line(path, line as isize)?; + let line_to_cursor = line_chars.get(0..character)?; + + static PARTIAL_MDLINK_REGEX: Lazy = Lazy::new(|| { + Regex::new(r"\[(?[^\[\]\(\)]*)\]\((?[^\[\]\(\)\#]*)(\#(?[^\[\]\(\)]*))?$").unwrap() + }); // [display](relativePath) + + let line_string = String::from_iter(line_to_cursor); + + let captures = PARTIAL_MDLINK_REGEX.captures(&line_string)?; + + let (full, display, reftext, infileref) = ( + captures.get(0)?, + captures.name("display")?, + captures.name("path")?, + captures.name("infileref"), + ); + + let reference_under_cursor = + Reference::new(&line_string) + .into_iter() + .find(|reference| { + reference.range.start.character <= character as u32 + && reference.range.end.character >= character as u32 + }); + + let full_range = match reference_under_cursor { + Some( reference @ (Reference::MDFileLink(..) + | Reference::MDHeadingLink(..) + | Reference::MDIndexedBlockLink(..)), + ) => reference.range.start.character as usize..reference.range.end.character as usize, + None if line_to_cursor.get(character) == Some(&')') => { + full.range().start..full.range().end + 1 + } + _ => full.range(), + }; + + + let partial_infileref = infileref.map(|infileref| { + + let chars = infileref.as_str().chars().collect::>(); + + let range = infileref.range(); + + match chars.as_slice() { + ['^', rest @ ..] => (PartialInfileRef::BlockRef(String::from_iter(rest)), range), + [rest @ ..] => (PartialInfileRef::HeadingRef(String::from_iter(rest)), range), + } + + }); + + let partial = Some(MarkdownLinkCompleter { + path: (reftext.as_str().to_string(), reftext.range()), + display: (display.as_str().to_string(), display.range()), + infile_ref: partial_infileref, + partial_link: (full.as_str().to_string(), full.range()), + full_range, + line_nr: line, + file_path: path.to_path_buf(), + vault + }); + + partial + } + + fn completions(&self) -> Vec>> { + + let filter_text = format!( + "{}{}", + self.path.0, + self.infile_ref + .clone() + .map(|(infile, _)| format!("#{}", infile.completion_string())) + .unwrap_or("".to_string()) + ); + + + let referenceables = self.vault.select_referenceable_nodes(None); + + // Get and filter referenceables + let completions = referenceables + .into_par_iter() + .flat_map(|referenceable| LinkCompletion::new(referenceable.clone())) + .collect::>(); + + let filtered = fuzzy_match_completions(&filter_text, completions); + + filtered + + } +} + +#[derive(Debug, Clone)] +enum PartialInfileRef { + HeadingRef(String), + /// The partial reference to a block, not including the ^ index + BlockRef(String) +} + +impl PartialInfileRef { + fn completion_string(&self) -> String { + match self { + PartialInfileRef::HeadingRef(s) => s.to_string(), + PartialInfileRef::BlockRef(s) => format!("^{}", s), + } + } +} + + + + + +#[derive(Debug, Clone)] +enum LinkCompletion<'a> { + File { + mdfile: &'a MDFile, + match_string: String, + }, + Heading { + heading: &'a MDHeading, + match_string: String, + }, + Block { + indexed: &'a MDIndexedBlock, + match_string: String, + }, + Unresolved { + match_string: String, + /// Infile ref includes all after #, including ^ + infile_ref: Option, + }, +} + +use LinkCompletion::*; + +impl LinkCompletion<'_> { + fn new<'a>(referenceable: Referenceable<'a>) -> Option> { + match referenceable { + Referenceable::File(_, mdfile) => Some(File { mdfile, match_string: mdfile.path.file_stem()?.to_str()?.to_string() }), + Referenceable::Heading(path, mdheading) => Some(Heading {heading: mdheading, match_string: format!("{}#{}", path.file_stem()?.to_str()?, mdheading.heading_text)}), + Referenceable::IndexedBlock(path, indexed) => Some(Block{ indexed, match_string: format!("{}#^{}", path.file_stem()?.to_str()?, indexed.index)}), + Referenceable::UnresovledFile(_, file) => Some(Unresolved { match_string: file.clone(), infile_ref: None }), + Referenceable::UnresolvedHeading(_, s1, s2) => Some(Unresolved { match_string: format!("{}#{}", s1, s2), infile_ref: Some(s2.clone()) }), + Referenceable::UnresovledIndexedBlock(_, s1, s2) => Some(Unresolved { match_string: format!("{}#^{}", s1, s2), infile_ref: Some(format!("^{}", s2)) }), + _ => None + } + } +} + + +impl<'a> Completable<'a, MarkdownLinkCompleter<'a>> for LinkCompletion<'a> { + fn completion(&self, markdown_link_completer: &MarkdownLinkCompleter) -> CompletionItem { + + let label = self.match_string(); + + let MarkdownLinkCompleter { display, path: _, infile_ref: _, partial_link: _, full_range, line_nr, file_path: _, vault: _ } = markdown_link_completer; + + let link_infile_ref = match self { + File { mdfile: _, match_string: _ } => None, + Self::Block { indexed, match_string: _ } => Some(format!("#^{}", indexed.index)), + Self::Heading { heading, match_string: _ } => Some(format!("#{}", heading.heading_text)), + Self::Unresolved { match_string: _, infile_ref } => infile_ref.clone() + }; + + let binding = (display.0.as_str(), link_infile_ref); + let link_display_text = match binding { + ("", Some(ref infile)) => &infile, + // Get the first heading of the file, if possible. + ("", None) => match self { + Self::File { mdfile, match_string: _ } => mdfile.headings.get(0).map(|heading| heading.heading_text.as_str()).unwrap_or(""), + _ => "" + } + (display, _) => display, + }; + + + let link_ref_text = match label.contains(' ') { + true => format!("<{}>", label), + false => label.to_owned() + }; + + let link_text = format!( + "[${{1:{}}}]({})", + link_display_text, + link_ref_text + ); + + + let text_edit = CompletionTextEdit::Edit(TextEdit { + range: Range { + start: Position { + line: *line_nr as u32, + character: full_range.start as u32, + }, + end: Position { + line: *line_nr as u32, + character: full_range.end as u32, + }, + }, + new_text: link_text, + }); + + CompletionItem { + label: label.to_string(), + kind: Some(match self { + Self::File { mdfile: _, match_string: _ } => CompletionItemKind::FILE, + Self::Heading { heading: _, match_string: _ } | Self::Block { indexed: _, match_string: _ } => CompletionItemKind::REFERENCE, + Self::Unresolved { match_string: _, infile_ref: _ } => CompletionItemKind::KEYWORD + }), + label_details: match self { + Self::Unresolved { match_string: _, infile_ref: _ } => Some(CompletionItemLabelDetails{ + detail: Some("Unresolved".into()), + description: None + }), + _ => None + }, + text_edit: Some(text_edit), + ..Default::default() + } + + } +} + + +impl<'a> Completable<'a, WikiLinkCompleter<'a>> for LinkCompletion<'a> { + fn completion(&self, completer: &WikiLinkCompleter<'a>) -> CompletionItem { + todo!() + } +} + + +impl Matchable for LinkCompletion<'_> { + fn match_string(&self) -> &str { + match self { + File{mdfile: _, match_string} + | Heading { heading: _, match_string } + | Block { indexed: _, match_string } + | Unresolved { match_string, infile_ref: _ } + => &match_string, + } + } +} + + +pub struct WikiLinkCompleter<'a> { + vault: &'a Vault, + cmp_text: Vec, + files: &'a [PathBuf] +} + +impl<'a> Completer<'a> for WikiLinkCompleter<'a> { + + + fn construct(context: Context<'a>, path: &std::path::Path, line: usize, character: usize) -> Option + where Self: Sized { + + let Context { vault, opened_files } = context; + + let line_chars = vault.select_line(path, line as isize)?; + + let index = line_chars.get(0..=character)? // select only the characters up to the cursor + .iter() + .enumerate() // attach indexes + .tuple_windows() // window into pairs of characters + .collect::>() + .into_iter() + .rev() // search from the cursor back + .find(|((_, &c1), (_, &c2))| c1 == '[' && c2 == '[') + .map(|(_, (i, _))| i); // only take the index; using map because find returns an option + + let index = index.and_then(|index| { + if line_chars.get(index..character)?.into_iter().contains(&']') { + None + } else { + Some(index) + } + }); + + index.and_then(|index| { + let cmp_text = line_chars.get(index+1..character)?; + + Some(WikiLinkCompleter{ + vault, + cmp_text: cmp_text.to_vec(), + files: opened_files + }) + }) + } + + fn completions(&self) -> Vec> where Self: Sized { + let WikiLinkCompleter { vault, cmp_text, files } = self; + + match *self.cmp_text { + // Give recent referenceables; TODO: improve this; + [] => { + files + .iter() + .map(|path| { + match std::fs::metadata(path).and_then(|meta| meta.modified()) { + Ok(modified) => (path, modified), + Err(_) => (path, SystemTime::UNIX_EPOCH), + } + }) + .sorted_by_key(|(_, modified)| *modified) + .flat_map(|(path, modified)| { + + let referenceables = vault.select_referenceable_nodes(Some(&path)); + + let modified_string = modified.duration_since(SystemTime::UNIX_EPOCH).ok()?.as_secs().to_string(); + + Some(referenceables.into_iter() + .flat_map(move |referenceable| Some( + OrderedCompletion::::new( + LinkCompletion::new(referenceable)?, + modified_string.clone() + )) + )) + + }) + .flatten() + .collect_vec() + }, + ref filter_text @ [..] if !filter_text.contains(&']') => { + let filter_text = &self.cmp_text; + + + let referenceables = self.vault.select_referenceable_nodes(None); + + // Get and filter referenceables + let completions = referenceables + .into_par_iter() + .flat_map(|referenceable| LinkCompletion::new(referenceable.clone())) + .collect::>(); + + let filtered = fuzzy_match_completions(&String::from_iter(filter_text), completions); + + filtered + }, + _ => vec![] + } + } +} + + + diff --git a/src/completion/matcher.rs b/src/completion/matcher.rs new file mode 100644 index 00000000..ddfa9baa --- /dev/null +++ b/src/completion/matcher.rs @@ -0,0 +1,91 @@ +use std::ops::Deref; + +use nucleo_matcher::{pattern::{self, Normalization}, Matcher}; +use tower_lsp::lsp_types::CompletionItem; + +use crate::vault::Reference; + +use super::{Completable, Completer}; + +pub trait Matchable { + fn match_string(&self) -> &str; +} + +struct NucleoMatchable(T); +impl Deref for NucleoMatchable { + type Target = T; + fn deref(&self) -> &Self::Target { + &self.0 + } +} + +impl AsRef for NucleoMatchable { + fn as_ref(&self) -> &str { + &self.match_string() + } +} + + +pub struct OrderedCompletion<'a, C, T> where C: Completer<'a>, T: Completable<'a, C> { + completable: T, + rank: String, + __phantom: std::marker::PhantomData<&'a T>, + __phantom2: std::marker::PhantomData +} + +impl<'a, C: Completer<'a>, T: Completable<'a, C>> OrderedCompletion<'a, C, T> { + pub fn new(completable: T, rank: String) -> Self { + Self { + completable, + rank, + __phantom: std::marker::PhantomData, + __phantom2: std::marker::PhantomData, + } + } +} + +impl<'a, C: Completer<'a>, T: Completable<'a, C>> Completable<'a, C> for OrderedCompletion<'a, C, T> { + fn completion(&self, completer: &C) -> tower_lsp::lsp_types::CompletionItem { + let completion = self.completable.completion(completer); + + CompletionItem { + sort_text: Some(self.rank.to_string()), + ..completion + } + } +} + +pub fn fuzzy_match_completions<'a, 'b, C: Completer<'a>, T: Matchable + Completable<'a, C>>( + filter_text: &'b str, + items: impl IntoIterator +) -> Vec> { + + let normal_fuzzy_match = fuzzy_match(filter_text, items); + + normal_fuzzy_match + .into_iter() + .map(|(item, score)| OrderedCompletion::new(item, score.to_string())) + .collect::>() + +} + +pub fn fuzzy_match<'a, T: Matchable>( + filter_text: &str, + items: impl IntoIterator, +) -> Vec<(T, u32)> { + + let items = items.into_iter().map(NucleoMatchable); + + let mut matcher = Matcher::new(nucleo_matcher::Config::DEFAULT); + let matches = pattern::Pattern::parse( + filter_text, + pattern::CaseMatching::Ignore, + Normalization::Smart, + ) + .match_list(items, &mut matcher); + + matches + .into_iter() + .map(|(item, score)| (item.0, score)) + .collect() +} diff --git a/src/completion/mod.rs b/src/completion/mod.rs new file mode 100644 index 00000000..5ffca9d5 --- /dev/null +++ b/src/completion/mod.rs @@ -0,0 +1,774 @@ +use std::{path::{Path, PathBuf}, time::SystemTime}; + +use itertools::Itertools; +use nanoid::nanoid; + +use nucleo_matcher::{ + pattern::{self, Normalization}, + Matcher, +}; +use once_cell::sync::Lazy; +use rayon::prelude::*; + +use regex::Regex; +use tower_lsp::lsp_types::{ + Command, CompletionItem, CompletionItemKind, CompletionItemLabelDetails, CompletionList, CompletionParams, CompletionResponse, CompletionTextEdit, Documentation, InsertTextFormat, InsertTextMode, MarkupContent, MarkupKind, Position, Range, TextEdit, Url +}; + +use crate::{ + ui::preview_referenceable, + vault::{ + get_obsidian_ref_path, Block, MyRange, Preview, Reference, Referenceable, Refname, Vault, MDTag, + }, +}; + +use self::{link_completer::{MarkdownLinkCompleter, WikiLinkCompleter}, matcher::fuzzy_match}; + +mod link_completer; +mod matcher; + +#[derive(Clone, Copy)] +pub struct Context<'a>{ + vault: &'a Vault, + opened_files: &'a [PathBuf], +} + +trait Completer<'a> { + fn construct(context: Context<'a>, path: &Path, line: usize, character: usize) -> Option + where Self: Sized; + + fn completions(&self) -> Vec> where Self: Sized; + + // fn compeltion_resolve(&self, vault: &Vault, resolve_item: CompletionItem) -> Option; +} + + +trait Completable<'a, T: Completer<'a>> { + fn completion(&self, completer: &T) -> CompletionItem; +} + +/// Range indexes for one line of the file; NOT THE WHOLE FILE +type LineRange = std::ops::Range; + +#[derive(Debug, PartialEq, Eq)] +pub struct CompletableTag { + full_range: LineRange, + /// Tag name and range not including the '#' + inputted_tag: (String, LineRange) +} + +fn get_completable_tag(line: &Vec, cursor_character: usize) -> Option { + static PARTIAL_TAG_REGEX: Lazy = Lazy::new(|| { + Regex::new(r"\#(?[a-zA-Z0-9\/]*)").unwrap() + }); + + let line_string = String::from_iter(line); + + let captures_iter = PARTIAL_TAG_REGEX.captures_iter(&line_string); + + return captures_iter + .flat_map(|captures| { + + let (full, tag_text) = ( + captures.get(0)?, + captures.name("text")?, + ); + + // check if the cursor is in the tag + let preceding_character = cursor_character - 1; // User is inserting into the position after the character they are looking at; "#tag|" cursor is a position 4; I want pos 3; the end of the tag + if preceding_character >= full.range().start && preceding_character < full.range().end { // end is exclusive + return Some(CompletableTag { + full_range: full.range(), + inputted_tag: (tag_text.as_str().to_string(), tag_text.range()) + }) + } else { + return None + } + + }) + .next() + + + +} + +pub fn get_completions( + vault: &Vault, + initial_completion_files: &[PathBuf], + params: &CompletionParams, + _path: &Path, +) -> Option { + let Ok(path) = params + .text_document_position + .text_document + .uri + .to_file_path() + else { + return None; + }; + + let completion_context = Context { + vault, + opened_files: initial_completion_files + }; + + run_completer::(completion_context, &path, params.text_document_position.position.line, params.text_document_position.position.character) + .or_else(|| run_completer::(completion_context, &path, params.text_document_position.position.line, params.text_document_position.position.character)) + +} +// +// if let Some(index) = get_wikilink_index(&selected_line, character) { +// +// // completions for wikilinks `[[text|` where | is the cursor +// let range = Range { +// start: Position { +// line: line as u32, +// character: index as u32 + 1, +// }, +// end: Position { +// line: line as u32, +// character: character as u32, +// }, +// }; +// +// let cmp_text = selected_line.get(index + 1..character)?; +// +// return match *cmp_text { +// [] => Some(CompletionResponse::List(CompletionList { +// items: initial_completion_files +// .iter() +// .map(|path| { +// match std::fs::metadata(path).and_then(|meta| meta.modified()) { +// Ok(modified) => (path, modified), +// Err(_) => (path, SystemTime::UNIX_EPOCH), +// } +// }) +// .sorted_by_key(|(_, modified)| *modified) +// .take(15) +// .filter_map(|(path_i, _)| { +// Some( +// vault +// .select_referenceable_nodes(Some(path_i)) +// .into_iter() +// .filter(|referenceable| { +// !matches!(referenceable, Referenceable::Tag(_, _)) +// && !matches!( +// referenceable, +// Referenceable::Footnote(_, _) +// ) +// }) +// .collect_vec(), +// ) +// }) +// .flatten() +// .filter_map(|referenceable| { +// default_completion_item(vault, &referenceable, None) +// }) +// .collect::>(), +// is_incomplete: true, +// })), +// [' ', ref text @ ..] if !text.contains(&']') => { +// let blocks = vault.select_blocks(); +// +// let matches = fuzzy_match(&String::from_iter(text), blocks); +// +// let rand_id = nanoid!( +// 5, +// &[ +// 'a', 'b', 'c', 'd', 'e', 'f', 'g', '1', '2', '3', '4', '5', '6', '7', '8', +// '9' +// ] +// ); +// +// return Some(CompletionResponse::List(CompletionList { +// is_incomplete: true, +// items: matches +// .into_par_iter() +// .take(50) +// .filter(|(block, _)| { +// String::from_iter(selected_line.clone()).trim() != block.text +// }) +// .flat_map(|(block, rank)| { +// let path_ref = get_obsidian_ref_path(vault.root_dir(), &block.file)?; +// let file_name = block.file.file_stem()?.to_str()?; +// +// let url = Url::from_file_path(&block.file).ok()?; +// Some(CompletionItem { +// label: block.text.clone(), +// sort_text: Some(rank.to_string()), +// documentation: Some(Documentation::MarkupContent(MarkupContent { +// kind: MarkupKind::Markdown, +// value: (block.range.start.line as isize - 5 +// ..=block.range.start.line as isize + 5) +// .flat_map(|i| Some((vault.select_line(&block.file, i)?, i))) +// .map(|(iter, ln)| { +// if ln == block.range.start.line as isize { +// format!("**{}**\n", String::from_iter(iter).trim()) +// // highlight the block to be references +// } else { +// String::from_iter(iter) +// } +// }) +// .join(""), +// })), +// filter_text: Some(format!(" {}", block.text)), +// text_edit: Some(CompletionTextEdit::Edit(TextEdit { +// range, +// new_text: format!("{}#^{}", file_name, rand_id), +// })), +// command: Some(Command { +// title: "Insert Block Reference Into File".into(), +// command: "apply_edits".into(), +// arguments: Some(vec![serde_json::to_value( +// tower_lsp::lsp_types::WorkspaceEdit { +// changes: Some( +// vec![( +// url, +// vec![TextEdit { +// range: Range { +// start: Position { +// line: block.range.end.line, +// character: block +// .range +// .end +// .character +// - 1, +// }, +// end: Position { +// line: block.range.end.line, +// character: block +// .range +// .end +// .character +// - 1, +// }, +// }, +// new_text: format!(" ^{}", rand_id), +// }], +// )] +// .into_iter() +// .collect(), +// ), +// change_annotations: None, +// document_changes: None, +// }, +// ) +// .ok()?]), +// }), +// ..Default::default() +// }) +// }) +// .collect(), +// })); +// } +// ref filter_text @ [..] if !filter_text.contains(&']') => { +// let all_links = MatchableReferenceable::from_vault(vault); +// let matches = fuzzy_match(&String::from_iter(filter_text), all_links); +// +// return Some(CompletionResponse::List(CompletionList { +// is_incomplete: true, +// items: matches +// .into_iter() +// .take(30) +// .filter(|(MatchableReferenceable(r, name), _)| { +// !(*name == String::from_iter(filter_text) && matches!(r, Referenceable::UnresovledFile(..) | Referenceable::UnresolvedHeading(..) | Referenceable::UnresovledIndexedBlock(..))) +// }) +// .filter_map(|(MatchableReferenceable(referenceable, _), rank)| { +// default_completion_item( +// vault, +// &referenceable, +// Some(CompletionTextEdit::Edit(TextEdit { +// range, +// new_text: referenceable.get_refname(&vault.root_dir())?.file_refname()? +// })), +// ) +// .and_then(|item| Some(CompletionItem { +// sort_text: Some(rank.to_string()), +// filter_text: Some(referenceable.get_refname(&vault.root_dir())?.to_string()), +// ..item +// })) +// }) +// .collect::>(), +// })); +// } +// _ => None, +// }; +// } else if let Some(partialmdlink) = get_completable_mdlink(&selected_line, character) { +// match partialmdlink { +// CompletableMDLink { +// path, +// infile_ref, +// full_range, +// display, +// partial, +// } => { +// let inputted_refname = format!( +// "{}{}", +// path.0, +// infile_ref +// .clone() +// .map(|(string, _)| format!("#{}", string)) +// .unwrap_or("".to_string()) +// ); +// +// +// let all_links = MatchableReferenceable::from_vault(vault); +// +// let matches = fuzzy_match(&inputted_refname, all_links); +// +// return Some(CompletionResponse::List(CompletionList { +// is_incomplete: true, +// items: matches +// .into_iter() +// .take(50) +// .filter(|(MatchableReferenceable(r, name), _)| +// !(*name == inputted_refname && matches!(r, Referenceable::UnresovledFile(..) | Referenceable::UnresolvedHeading(..) | Referenceable::UnresovledIndexedBlock(..))) +// ) +// .flat_map(|(MatchableReferenceable(referenceable, _), rank)| { +// default_completion_item( +// vault, +// &referenceable, +// Some(CompletionTextEdit::Edit(TextEdit { +// range: Range { +// start: Position { +// line: line as u32, +// character: full_range.start as u32, +// }, +// end: Position { +// line: line as u32, +// character: full_range.end as u32, +// }, +// }, +// new_text: format!( +// "[${{1:{}}}]({}{}{}{})", +// match ( +// display.0.as_str(), +// referenceable.get_refname(vault.root_dir())?.infile_ref +// ) { +// ("", Some(infile_ref_text)) => infile_ref_text.clone(), +// ("", None) => { +// match referenceable { +// Referenceable::File(_, mdfile) => { +// match mdfile.headings.first() { +// Some(heading) => { +// heading.heading_text.clone() +// } +// None => "".to_string(), +// } +// } +// +// _ => "".to_string(), +// } +// } +// (display_text, _) => display_text.to_string(), +// }, +// if referenceable +// .get_refname(vault.root_dir())? +// .path? +// .contains(" ") +// { +// "<" +// } else { +// "" +// }, +// referenceable +// .get_refname(vault.root_dir())? +// .link_file_key()?, +// match referenceable +// .get_refname(vault.root_dir())? +// .infile_ref +// { +// Some(string) => format!("#{}", string), +// None => "".to_string(), +// }, +// if referenceable +// .get_refname(vault.root_dir())? +// .path? +// .contains(" ") +// { +// ">" +// } else { +// "" +// }, +// ), +// })), +// ) +// .and_then(|item| { +// Some(CompletionItem { +// label: format!("{}{}", +// referenceable.get_refname(vault.root_dir())?.link_file_key()?, +// referenceable.get_refname(vault.root_dir())?.infile_ref.map(|thing| format!("#{}", thing)).unwrap_or("".into()) +// ), +// sort_text: Some(rank.to_string()), +// insert_text_format: Some(InsertTextFormat::SNIPPET), +// filter_text: Some(format!( +// "[{}]({}", +// display.0, +// referenceable.get_refname(vault.root_dir())?.to_string() +// )), +// ..item +// }) +// }) +// }) +// .collect::>(), +// })); +// } +// } +// } else if let Some(CompletableTag{ full_range, inputted_tag: (completable_tag_name, tag_name_range) }) = get_completable_tag(&selected_line, character) { +// // Initial Tag completion +// let tag_refereneables = +// vault +// .select_referenceable_nodes(None) +// .into_iter() +// .flat_map(|referenceable| match referenceable { +// tag @ Referenceable::Tag(_, _) => Some(tag), +// _ => None, +// }) +// .flat_map(|tag| Some(MatchableReferenceable(tag.clone(), tag.get_refname(&vault.root_dir())?.path?))) +// .collect_vec(); +// +// let matches = fuzzy_match(&completable_tag_name, tag_refereneables); +// +// return Some(CompletionResponse::List(CompletionList { +// is_incomplete: true, +// items: matches +// .into_iter() +// .take(20) +// .filter(|(MatchableReferenceable(_, tag_name), _)| *tag_name != completable_tag_name) +// .flat_map(|(MatchableReferenceable(tag, tag_name), ranking)| { +// default_completion_item(vault, &tag, Some(CompletionTextEdit::Edit(TextEdit { +// new_text: format!("#{}", tag_name.clone()), +// range: Range { +// start: Position { +// line: line as u32, +// character: full_range.start as u32, +// }, +// end: Position { +// line: line as u32, +// character: full_range.end as u32, +// }, +// } +// }))) +// .map(|item| CompletionItem { +// label: tag_name.clone(), +// sort_text: Some(ranking.to_string()), +// filter_text: Some(format!("#{}", tag_name)), +// ..item +// }) +// }) +// .unique_by(|c| c.label.to_owned()) +// .collect_vec(), +// })); +// } else if character +// .checked_sub(1) +// .and_then(|start| selected_line.get(start..character)) +// == Some(&['[']) +// { +// let footnote_referenceables = vault +// .select_referenceable_nodes(Some(&path)) +// .into_iter() +// .flat_map(|referenceable| match referenceable { +// Referenceable::Footnote(footnote_path, _) +// if footnote_path.as_path() == path.as_path() => +// { +// Some(referenceable) +// } +// _ => None, +// }); +// +// return Some(CompletionResponse::Array( +// footnote_referenceables +// .filter_map(|footnote| { +// footnote +// .get_refname(vault.root_dir()) +// .map(|root| CompletionItem { +// kind: Some(CompletionItemKind::REFERENCE), +// label: root.clone(), +// documentation: preview_referenceable(vault, &footnote) +// .map(Documentation::MarkupContent), +// filter_text: vault +// .select_referenceable_preview(&footnote) +// .and_then(|preview| match preview { +// Preview::Text(string) => Some(string), +// Preview::Empty => None, +// }) +// .map(|preview_string| format!("{}{}", *root, &preview_string)), +// ..Default::default() +// }) +// }) +// .unique_by(|c| c.label.to_owned()) +// .collect_vec(), +// )); +// } else { +// return None; +// } +// } +// +// fn default_completion_item( +// vault: &Vault, +// referenceable: &Referenceable, +// text_edit: Option, +// ) -> Option { +// let refname = referenceable.get_refname(vault.root_dir())?; +// let completion = CompletionItem { +// kind: match &referenceable { +// Referenceable::File(..) => Some(CompletionItemKind::FILE), +// Referenceable::Heading(..) +// | Referenceable::IndexedBlock(..) +// | Referenceable::Footnote(..) +// | Referenceable::LinkRefDef(..) +// => Some(CompletionItemKind::REFERENCE), +// Referenceable::UnresovledFile(..) +// | Referenceable::UnresolvedHeading(..) +// | Referenceable::UnresovledIndexedBlock(..) +// => Some(CompletionItemKind::KEYWORD), +// Referenceable::Tag(..) => Some(CompletionItemKind::CONSTANT), +// }, +// label: refname.file_refname()?, +// label_details: match referenceable.is_unresolved() { +// true => Some(CompletionItemLabelDetails { +// detail: Some("Unresolved".into()), +// description: None, +// }), +// false => None, +// }, +// text_edit, +// documentation: preview_referenceable(vault, referenceable) +// .map(Documentation::MarkupContent), +// ..Default::default() +// }; +// +// Some(completion) +// } +// +// struct MatchableReferenceable<'a>(Referenceable<'a>, String); +// +// impl MatchableReferenceable<'_> { +// fn from_vault<'a>(vault: &'a Vault) -> Vec> { +// let all_links = vault +// .select_referenceable_nodes(None) +// .into_par_iter() +// .filter(|referenceable| { +// !matches!(referenceable, Referenceable::Tag(..)) +// && !matches!(referenceable, Referenceable::Footnote(..)) +// }) +// .filter_map(|referenceable| { +// referenceable +// .get_refname(vault.root_dir()) +// .map(|string| MatchableReferenceable(referenceable, string.to_string())) +// }) +// .collect::>(); +// +// all_links +// } +// } +// +// +// #[cfg(test)] +// mod tests { +// use itertools::Itertools; +// +// use super::{get_wikilink_index, CompletableMDLink, CompletableTag, get_completable_tag}; +// +// #[test] +// fn test_index() { +// let s = "test [[linjfkdfjds]]"; +// +// let expected = 6; +// +// let actual = get_wikilink_index(&s.chars().collect(), 10); +// +// assert_eq!(Some(expected), actual); +// +// assert_eq!(Some("lin"), s.get(expected + 1..10)); +// } +// +// #[test] +// fn test_partial_mdlink() { +// let line = "This is line [display](partialpa"; // (th) +// +// let expected = Some(CompletableMDLink { +// partial: ("[display](partialpa".to_string(), 13..32), +// display: ("display".to_string(), 14..21), +// path: ("partialpa".to_string(), 23..32), +// infile_ref: None, +// full_range: 13..32, +// }); +// +// let actual = super::get_completable_mdlink(&line.chars().collect(), 32); +// +// assert_eq!(actual, expected); +// +// let line = "This is line [display](partialpath)"; // (th) +// +// let expected = Some(CompletableMDLink { +// partial: ("[display](partialpa".to_string(), 13..32), +// display: ("display".to_string(), 14..21), +// path: ("partialpa".to_string(), 23..32), +// infile_ref: None, +// full_range: 13..35, +// }); +// +// let actual = super::get_completable_mdlink(&line.chars().collect(), 32); +// +// assert_eq!(actual, expected); +// +// let line = "[disp](pp) This is line [display](partialpath)"; // (th) +// +// let expected = Some(CompletableMDLink { +// partial: ("[display](partialpa".to_string(), 24..43), +// display: ("display".to_string(), 25..32), +// path: ("partialpa".to_string(), 34..43), +// infile_ref: None, +// full_range: 24..46, +// }); +// +// let actual = super::get_completable_mdlink(&line.chars().collect(), 43); +// +// assert_eq!(actual, expected); +// +// let line = "[disp](pp) This is line [display](partialpath)"; // (th) +// +// let expected = Some(CompletableMDLink { +// partial: ("[display](partialpath".to_string(), 24..45), +// display: ("display".to_string(), 25..32), +// path: ("partialpath".to_string(), 34..45), +// infile_ref: None, +// full_range: 24..46, +// }); +// +// let actual = super::get_completable_mdlink(&line.chars().collect(), 45); +// +// assert_eq!(actual, expected); +// } +// +// #[test] +// fn test_partial_mdlink_infile_refs() { +// let line = "This is line [display](partialpa#"; // (th) +// +// let expected = Some(CompletableMDLink { +// partial: ("[display](partialpa#".to_string(), 13..33), +// display: ("display".to_string(), 14..21), +// path: ("partialpa".to_string(), 23..32), +// infile_ref: Some(("".to_string(), 33..33)), +// full_range: 13..33, +// }); +// +// let actual = super::get_completable_mdlink(&line.chars().collect(), 33); +// +// assert_eq!(actual, expected); +// +// let line = "[disp](pp) This is line [display](partialpath#Display)"; // (th) +// +// let expected = Some(CompletableMDLink { +// partial: ("[display](partialpath#Display".to_string(), 24..53), +// display: ("display".to_string(), 25..32), +// path: ("partialpath".to_string(), 34..45), +// infile_ref: Some(("Display".to_string(), 46..53)), +// full_range: 24..54, +// }); +// +// let actual = super::get_completable_mdlink(&line.chars().collect(), 53); +// +// assert_eq!(actual, expected); +// +// let line = "[disp](pp) This is line [display](partialpath#Display)"; // (th) +// +// let expected = Some(CompletableMDLink { +// partial: ("[display](partialpath#Disp".to_string(), 24..50), +// display: ("display".to_string(), 25..32), +// path: ("partialpath".to_string(), 34..45), +// infile_ref: Some(("Disp".to_string(), 46..50)), +// full_range: 24..54, +// }); +// +// let actual = super::get_completable_mdlink(&line.chars().collect(), 50); +// +// assert_eq!(actual, expected); +// } +// +// #[test] +// fn test_completable_tag_parsing() { +// // 0 1 2 +// // 01234567890123456789012345678 +// let text = "text over here #tag more text"; +// +// let insert_position = 19; +// +// let expected = CompletableTag { +// full_range: 15..19, +// inputted_tag: ("tag".to_string(), 16..19) // not inclusive +// }; +// +// let actual = get_completable_tag(&text.chars().collect_vec(), insert_position); +// +// +// assert_eq!(Some(expected), actual); +// +// +// +// // 0 1 2 +// // 01234567890123456789012345678 +// let text = "text over here #tag more text"; +// +// let insert_position = 20; +// +// let actual = get_completable_tag(&text.chars().collect_vec(), insert_position); +// +// +// assert_eq!(None, actual); +// +// +// // 0 1 2 +// // 01234567890123456789012345678 +// let text = "text over here # more text"; +// +// let insert_position = 16; +// +// let actual = get_completable_tag(&text.chars().collect_vec(), insert_position); +// +// let expected = Some(CompletableTag { +// full_range: 15..16, +// inputted_tag: ("".to_string(), 16..16) +// }); +// +// +// assert_eq!(expected, actual); +// +// +// // 0 1 2 +// // 01234567890123456789012345678 +// let text = "text over here #tag mor #tag "; +// +// let insert_position = 28; +// +// let expected = CompletableTag { +// full_range: 24..28, +// inputted_tag: ("tag".to_string(), 25..28) // not inclusive +// }; +// +// let actual = get_completable_tag(&text.chars().collect_vec(), insert_position); +// +// +// assert_eq!(Some(expected), actual); +// +// +// } +// } + + +fn run_completer<'a, T: Completer<'a>>(context: Context<'a>, path: &Path, line: u32, character: u32) -> Option { + + let completer = T::construct(context, path, line as usize, character as usize)?; + + let completions = completer.completions() + .into_iter() + .take(50) + .map(|completable| completable.completion(&completer)) + .collect_vec(); + + Some(CompletionResponse::List(CompletionList { is_incomplete: true, items: completions })) + + +} + diff --git a/src/main.rs b/src/main.rs index 9a245047..88609de5 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1,3 +1,6 @@ +#![feature(closure_lifetime_binder)] +#![feature(non_lifetime_binders)] + use std::collections::HashSet; use std::ops::{Deref, DerefMut}; use std::path::{Path, PathBuf}; @@ -48,24 +51,26 @@ struct TextDocumentItem { impl Backend { async fn update_vault(&self, params: TextDocumentItem) { + self.client.log_message(MessageType::WARNING, "Update Vault Started").await; + + let Ok(path) = params.uri.to_file_path() else { + self.client + .log_message(MessageType::ERROR, "Failed to parse URI path") + .await; + return; + }; + { - let Some(ref mut vault) = *self.vault.write().await else { - self.client - .log_message(MessageType::ERROR, "Vault is not initialized") - .await; + let _ = self.bind_vault_mut(|vault| { + let text = ¶ms.text; + Vault::update_vault(vault, (&path, text)); + + Ok(()) + }).await; - return; - }; + } // close the lock - let Ok(path) = params.uri.to_file_path() else { - self.client - .log_message(MessageType::ERROR, "Failed to parse URI path") - .await; - return; - }; - let text = ¶ms.text; - Vault::update_vault(vault, (&path, text)); - } // must close the write lock before publishing diagnostics; I don't really like how imperative this is + self.client.log_message(MessageType::WARNING, "Update Vault Done").await; match self.publish_diagnostics().await { Ok(_) => (), @@ -135,6 +140,8 @@ impl Backend { } async fn publish_diagnostics(&self) -> Result<()> { + self.client.log_message(MessageType::WARNING, "Diagnostics Started").await; + let uris = self.bind_opened_files(|files| { Ok(files .into_par_iter() @@ -159,6 +166,9 @@ impl Backend { self.client.publish_diagnostics(uri, diags, None).await; } + + self.client.log_message(MessageType::WARNING, "Diagnostics Done").await; + Ok(()) } @@ -181,6 +191,12 @@ impl Backend { } async fn bind_vault_mut(&self, callback: impl Fn(&mut Vault) -> Result) -> Result { + if let Err(e) = self.vault.try_write() { + self.client.log_message(MessageType::ERROR, format!("Failed to get VAULT lock for write {:?}", e)).await; + } else { + self.client.log_message(MessageType::ERROR, "VAULT Lock is good").await; + } + let mut guard = self.vault.write().await; let Some(ref mut vault) = *guard else { return Err(Error::new(ErrorCode::ServerError(0))); @@ -201,6 +217,14 @@ impl Backend { &self, callback: impl Fn(&mut HashSet) -> Result, ) -> Result { + + + if let Err(e) = self.opened_files.try_write() { + self.client.log_message(MessageType::ERROR, format!("Failed to get FILES lock for write {:?}", e)).await; + } else { + self.client.log_message(MessageType::ERROR, "FILES Lock is good").await; + } + let mut opened_files = self.opened_files.write().await; callback(opened_files.deref_mut()) } @@ -415,6 +439,8 @@ impl LanguageServer for Backend { } async fn completion(&self, params: CompletionParams) -> Result> { + self.client.log_message(MessageType::WARNING, "Completions Started").await; + let timer = std::time::Instant::now(); let path = params_position_path!(params)?; @@ -432,12 +458,7 @@ impl LanguageServer for Backend { let elapsed = timer.elapsed(); - self.client - .log_message( - MessageType::WARNING, - format!("Completion Calculation took {}ms", elapsed.as_millis()), - ) - .await; + self.client.log_message(MessageType::WARNING, format!("Completions Done took {}ms", elapsed.as_millis())).await; res } diff --git a/src/vault/mod.rs b/src/vault/mod.rs index 328a17c0..c08c3b36 100644 --- a/src/vault/mod.rs +++ b/src/vault/mod.rs @@ -18,6 +18,11 @@ use serde::{Deserialize, Serialize}; use tower_lsp::lsp_types::Position; use walkdir::WalkDir; + + +mod referenceable; + + impl Vault { pub fn construct_vault(root_dir: &Path) -> Result { let md_file_paths = WalkDir::new(root_dir) @@ -993,8 +998,9 @@ impl MDHeading { #[derive(Debug, PartialEq, Eq, Clone)] pub struct MDIndexedBlock { - index: String, - range: MyRange, + /// THe index of the block; does not include '^' + pub index: String, + pub range: MyRange, } impl Hash for MDIndexedBlock { @@ -1147,6 +1153,7 @@ pub enum Referenceable<'a> { Footnote(&'a PathBuf, &'a MDFootnote), UnresovledFile(PathBuf, &'a String), UnresolvedHeading(PathBuf, &'a String, &'a String), + /// full path, link path, index (without ^) UnresovledIndexedBlock(PathBuf, &'a String, &'a String), LinkRefDef(&'a PathBuf, &'a MDLinkReferenceDefinition), } diff --git a/src/vault/referenceable.rs b/src/vault/referenceable.rs new file mode 100644 index 00000000..e69de29b