feat: Add markdown link/image support
ppodolsky committed Nov 24, 2023
1 parent 33db316 commit 2a41766
Showing 1 changed file with 152 additions and 7 deletions:
summa-core/src/components/tokenizers/mmd_tokenizer.rs
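A minimal sketch of the rule this commit introduces, inferred from the new test cases further down. The function below is a hypothetical, standalone approximation, not the actual streaming MmdTokenStream: after a link's closing `]`, a balanced `(...)` target and an optional `{...}` attribute block are dropped, so only the human-readable link text reaches the tokenizer.

// Illustration only (hypothetical helper, not part of this commit):
fn strip_link_targets(input: &str) -> String {
    let mut out = String::new();
    let mut chars = input.chars().peekable();
    while let Some(c) = chars.next() {
        if c == '[' || c == ']' {
            // Keep the bracketed link text, drop the brackets themselves.
            if c == ']' && chars.peek() == Some(&'(') {
                // Skip the balanced round-bracket link target.
                let mut depth = 0usize;
                for c in chars.by_ref() {
                    if c == '(' { depth += 1; }
                    if c == ')' { depth -= 1; if depth == 0 { break; } }
                }
                // Skip an optional `{...}` attribute block, e.g. `{width=1}`.
                if chars.peek() == Some(&'{') {
                    let mut depth = 0usize;
                    for c in chars.by_ref() {
                        if c == '{' { depth += 1; }
                        if c == '}' { depth -= 1; if depth == 0 { break; } }
                    }
                }
            }
        } else {
            out.push(c);
        }
    }
    out
}

fn main() {
    // Mirrors the new tests: the URL and `{width=1}` are dropped; a space
    // between `]` and `(` means the parentheses stay ordinary text.
    assert_eq!(
        strip_link_targets("![image text](https://example.com/image.jpg){width=1}"),
        "!image text"
    );
    assert_eq!(strip_link_targets("[ref] (author)"), "ref (author)");
}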
@@ -95,6 +95,7 @@ pub struct MmdTokenStream<'a> {
drop_commands: &'a HashSet<&'static str>,
known_commands: &'a HashSet<&'static str>,
base_offset: usize,
maybe_link: bool,
}

#[inline]
@@ -125,6 +126,7 @@ impl<'a> MmdTokenStream<'a> {
drop_commands,
known_commands,
base_offset: 0,
maybe_link: false,
}
}

@@ -153,6 +155,7 @@ impl<'a> MmdTokenStream<'a> {
drop_commands,
known_commands,
base_offset: offset,
maybe_link: false,
}
}

@@ -172,6 +175,10 @@ impl<'a> MmdTokenStream<'a> {
self.token.offset_from = usize::MAX;
let mut is_command = false;
let mut spec_counter = 0;
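// Link-skipping state: after a link's closing ']', a balanced '(...)'
// target and then an optional '{...}' attribute block are consumed
// without emitting tokens.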
let mut start_skipping_round_bracket = false;
let mut skipped_round_bracket = 0;
let mut start_skipping_figure_bracket = false;
let mut skipped_figure_bracket = 0;

if let Some((stacked_char, stacked_offset)) = self.stacked_char.take() {
accept_char(&mut self.token, stacked_char, self.base_offset + stacked_offset);
@@ -181,10 +188,15 @@
if stacked_char == '\\' {
is_command = true;
}
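// A stacked '[' may open a link; remember it across the token boundary.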
if stacked_char == '[' {
self.maybe_link = true;
}
}

for (offset, c) in &mut self.chars {
let real_offset = self.base_offset + offset;

if let Some(skip_list) = &self.skip_list {
while self.skip_iter < skip_list.len() && skip_list[self.skip_iter].1 <= real_offset {
self.skip_iter += 1;
@@ -194,6 +206,35 @@
}
}

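// Phase 1: swallow the balanced '(...)' link target that follows ']'.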
if start_skipping_round_bracket || skipped_round_bracket > 0 {
start_skipping_round_bracket = false;
if c == '(' {
skipped_round_bracket += 1;
continue;
} else if c == ')' {
skipped_round_bracket -= 1;
if skipped_round_bracket == 0 {
start_skipping_figure_bracket = true;
}
continue;
} else if skipped_round_bracket > 0 {
continue;
}
}

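// Phase 2: swallow an optional '{...}' attribute block, e.g. '{width=1}'.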
if start_skipping_figure_bracket || skipped_figure_bracket > 0 {
start_skipping_figure_bracket = false;
if c == '{' {
skipped_figure_bracket += 1;
continue;
} else if c == '}' {
skipped_figure_bracket -= 1;
continue;
} else if skipped_figure_bracket > 0 {
continue;
}
}

if is_cjk(&c) {
if !self.token.text.is_empty() {
self.stacked_char = Some((c, offset));
@@ -202,20 +243,27 @@
accept_char(&mut self.token, c, real_offset);
return true;
}

if c == '\\' {
if !self.token.text.is_empty() {
self.stacked_char = Some((c, offset));
return true;
}
is_command = true;
accept_char(&mut self.token, c, real_offset);
continue;
} else if c == '[' && !is_command {
if !self.token.text.is_empty() {
self.stacked_char = Some((c, offset));
return true;
}
self.maybe_link = true;
} else if c == ']' && self.maybe_link && !is_command {
self.maybe_link = false;
start_skipping_round_bracket = true;
} else if c == '^' || c == '~' {
self.token.offset_to += 1;
continue;
} else if c == '*' || c == '_' {
spec_counter += 1;
continue;
} else if c.is_alphanumeric() || c == '#' || c == '+' {
if spec_counter == 1 {
self.stacked_char = Some((c, offset));
@@ -225,11 +273,10 @@
spec_counter = 0;
};
accept_char(&mut self.token, c, real_offset);
continue;
} else if is_command && (c == '(' || c == ')' || c == '[' || c == ']') && self.token.text.len() == 1 {
accept_char(&mut self.token, c, real_offset);
break;
} else if (c == '{' || c == '}') && is_command {
} else if is_command && (c == '{' || c == '}') {
if self.drop_commands.contains(&self.token.text.as_str()) {
is_command = false;
self.token.text.clear();
@@ -242,7 +289,6 @@
if c == '}' {
break;
}
continue;
} else if !self.token.text.is_empty() {
break;
}
@@ -284,7 +330,11 @@ impl<'a> tantivy::tokenizer::TokenStream for MmdTokenStream<'a> {
self.token.offset_from += 1;
self.token.text = self.token.text[1..].to_string()
}
break;
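// A leftover token that is just ']', '}' or ')' is residue of skipped
// link syntax; advance again instead of emitting it.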
if self.token.text == "]" || self.token.text == "}" || self.token.text == ")" {
result = self.advance_token(false);
} else {
break;
}
}
}
result
@@ -615,6 +665,101 @@ pub mod tests {
},
],
);
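// New cases below: '(...)' link targets and '{...}' attribute blocks
// must not produce tokens; an empty '![]()' yields nothing.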
assert_tokenization(&mut tokenizer, "![]()", &[]);
assert_tokenization(
&mut tokenizer,
"![image text](https://example.com/image.jpg){width=1}",
&[
Token {
offset_from: 2,
offset_to: 7,
position: 0,
text: "image".to_string(),
position_length: 1,
},
Token {
offset_from: 8,
offset_to: 12,
position: 1,
text: "text".to_string(),
position_length: 1,
},
],
);
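// A space between ']' and '(' means there is no link target: 'author'
// is still tokenized.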
assert_tokenization(
&mut tokenizer,
"[ref] (author)",
&[
Token {
offset_from: 1,
offset_to: 4,
position: 0,
text: "ref".to_string(),
position_length: 1,
},
Token {
offset_from: 7,
offset_to: 13,
position: 1,
text: "author".to_string(),
position_length: 1,
},
],
);
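// '[ref]test' merges across the brackets; in '[ref](l)test' the target
// '(l)' is dropped and the token still merges to 'reftest'.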
assert_tokenization(
&mut tokenizer,
"[ref]test [ref](l)test",
&[
Token {
offset_from: 1,
offset_to: 9,
position: 0,
text: "reftest".to_string(),
position_length: 1,
},
Token {
offset_from: 11,
offset_to: 22,
position: 1,
text: "reftest".to_string(),
position_length: 1,
},
],
);
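// Escaped brackets are commands, not link syntax: '\[34\]' yields no
// token here, while the plain text around them is kept.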
assert_tokenization(
&mut tokenizer,
"![ref](hehe)-abc{} \\[34\\] \\] \\) \\} 1 ### abc \\(",
&[
Token {
offset_from: 2,
offset_to: 5,
position: 0,
text: "ref".to_string(),
position_length: 1,
},
Token {
offset_from: 13,
offset_to: 16,
position: 1,
text: "abc".to_string(),
position_length: 1,
},
Token {
offset_from: 35,
offset_to: 36,
position: 2,
text: "1".to_string(),
position_length: 1,
},
Token {
offset_from: 41,
offset_to: 44,
position: 3,
text: "abc".to_string(),
position_length: 1,
},
],
);
}

#[test]
