Skip to content

Commit

Permalink
Change Lexing/Parsing of embedded docs to not eagerly validate (#507)
Browse files Browse the repository at this point in the history
This changes the lexer and parser to pass through strings enclosed in backticks un-parsed. (At current, these documents are parsed during lowering).

Since embedded documents may themselves contain backticks, beginning and ending delimiters consist of an arbitrary odd number of backticks (e.g., `` ` ``, `` ``` ``, `` ````` ``, etc.) that must be paired (e.g., `` `$ion_data_here::[]` ``, `` ```$ion_data_here::[ $string_with_embedded_backtick:"`" ]``` ``, etc.).

As opening and closing delimiters are required to consist of an odd number of backticks, a contiguous run containing an even number of backticks is interpreted as an empty document.
  • Loading branch information
jpschorr committed Oct 22, 2024
1 parent a7993de commit 5d4efaa
Show file tree
Hide file tree
Showing 13 changed files with 214 additions and 225 deletions.
2 changes: 1 addition & 1 deletion extension/partiql-extension-visualize/src/ast_to_dot.rs
Original file line number Diff line number Diff line change
Expand Up @@ -185,7 +185,7 @@ fn lit_to_str(ast: &ast::Lit) -> String {
Lit::FloatLit(l) => l.to_string(),
Lit::DoubleLit(l) => l.to_string(),
Lit::BoolLit(l) => (if *l { "TRUE" } else { "FALSE" }).to_string(),
Lit::IonStringLit(l) => format!("`{}`", l),
Lit::EmbeddedDocLit(l) => format!("`{}`", l),
Lit::CharStringLit(l) => format!("'{}'", l),
Lit::NationalCharStringLit(l) => format!("'{}'", l),
Lit::BitStringLit(l) => format!("b'{}'", l),
Expand Down
2 changes: 1 addition & 1 deletion partiql-ast/src/ast.rs
Original file line number Diff line number Diff line change
Expand Up @@ -444,7 +444,7 @@ pub enum Lit {
#[visit(skip)]
BoolLit(bool),
#[visit(skip)]
IonStringLit(String),
EmbeddedDocLit(String),
#[visit(skip)]
CharStringLit(String),
#[visit(skip)]
Expand Down
2 changes: 1 addition & 1 deletion partiql-ast/src/pretty.rs
Original file line number Diff line number Diff line change
Expand Up @@ -394,7 +394,7 @@ impl PrettyDoc for Lit {
Lit::FloatLit(inner) => arena.text(inner.to_string()),
Lit::DoubleLit(inner) => arena.text(inner.to_string()),
Lit::BoolLit(inner) => arena.text(inner.to_string()),
Lit::IonStringLit(inner) => inner.pretty_doc(arena),
Lit::EmbeddedDocLit(inner) => inner.pretty_doc(arena), // TODO better pretty for embedded doc: https://github.com/partiql/partiql-lang-rust/issues/508
Lit::CharStringLit(inner) => inner.pretty_doc(arena),
Lit::NationalCharStringLit(inner) => inner.pretty_doc(arena),
Lit::BitStringLit(inner) => inner.pretty_doc(arena),
Expand Down
3 changes: 2 additions & 1 deletion partiql-logical-planner/src/lower.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1933,7 +1933,7 @@ fn lit_to_value(lit: &Lit) -> Result<Value, AstTransformError> {
Lit::FloatLit(f) => Value::Real(OrderedFloat::from(f64::from(*f))),
Lit::DoubleLit(f) => Value::Real(OrderedFloat::from(*f)),
Lit::BoolLit(b) => Value::Boolean(*b),
Lit::IonStringLit(s) => parse_embedded_ion_str(s)?,
Lit::EmbeddedDocLit(s) => parse_embedded_ion_str(s)?,
Lit::CharStringLit(s) => Value::String(Box::new(s.clone())),
Lit::NationalCharStringLit(s) => Value::String(Box::new(s.clone())),
Lit::BitStringLit(_) => {
Expand Down Expand Up @@ -1978,6 +1978,7 @@ fn lit_to_value(lit: &Lit) -> Result<Value, AstTransformError> {
Ok(val)
}

// TODO: generalize embedded-document parsing beyond Ion (lowering currently assumes Ion content)
fn parse_embedded_ion_str(contents: &str) -> Result<Value, AstTransformError> {
fn lit_err(literal: &str, err: impl std::error::Error) -> AstTransformError {
AstTransformError::Literal {
Expand Down
1 change: 1 addition & 0 deletions partiql-parser/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@ serde = { version = "1", features = ["derive"], optional = true }

[dev-dependencies]
criterion = "0.5"
assert_matches = "1"

[features]
default = []
Expand Down
4 changes: 2 additions & 2 deletions partiql-parser/src/error.rs
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,8 @@ pub enum LexError<'input> {
#[error("Lexing error: invalid input `{}`", .0)]
InvalidInput(Cow<'input, str>),
/// Embedded Ion value is not properly terminated.
#[error("Lexing error: unterminated ion literal")]
UnterminatedIonLiteral,
#[error("Lexing error: unterminated embedded document literal")]
UnterminatedDocLiteral,
/// Comment is not properly terminated.
#[error("Lexing error: unterminated comment")]
UnterminatedComment,
Expand Down
112 changes: 112 additions & 0 deletions partiql-parser/src/lexer/embedded_doc.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,112 @@
use crate::error::LexError;
use crate::lexer::SpannedResult;
use logos::{Logos, Span};
use partiql_common::syntax::line_offset_tracker::LineOffsetTracker;
use partiql_common::syntax::location::ByteOffset;

/// An embedded Doc string (e.g. `[{a: 1}, {b: 2}]`) with [`ByteOffset`] span
/// relative to lexed source.
///
/// Note:
/// - The lexer parses the embedded Doc value enclosed in backticks.
/// - The returned string *does not* include the backticks
/// - The returned `ByteOffset` span *does* include the backticks
///
/// On success the payload is `(start_offset, doc_contents, end_offset)`;
/// on failure it is `(start_offset, LexError, end_offset)`.
type EmbeddedDocStringResult<'input> = SpannedResult<&'input str, ByteOffset, LexError<'input>>;

/// Tokens used to parse Doc literals embedded in backticks (\`)
///
/// Only two kinds of token are produced: newlines (so line offsets can be
/// tracked) and runs of backticks (the open/close delimiters). Everything
/// else is either skipped by the `skip` regex below or surfaces as a logos
/// lexing error, which the caller consumes and ignores.
#[derive(Logos, Debug, Clone, PartialEq)]
// Skip any run of characters that cannot start a token of interest.
// NOTE(review): the skip set also excludes `/`, `*`, `'`, and `"` (comment/
// string starter characters), but no token rules below match them — they
// fall out as error tokens that `next_internal` consumes. Confirm this is
// sufficient to avoid closing on a backtick inside a string or comment.
#[logos(skip r#"[^/*'"`\r\n\u0085\u2028\u2029]+"#)] // skip things that aren't newlines or backticks
enum EmbeddedDocToken {
    // Skip newlines, but record their position.
    // For line break recommendations,
    // see https://www.unicode.org/standard/reports/tr13/tr13-5.html
    #[regex(r"(([\r])?[\n])|\u0085|\u2028|\u2029")]
    Newline,

    // An embed open/close tag is a (greedily-captured) odd-number of backticks
    // (the regex matches one backtick plus zero or more backtick *pairs*).
    #[regex(r"`(``)*")]
    Embed,
}

/// A Lexer for Doc literals embedded in backticks (\`) that returns the parsed [`EmbeddedDocString`]
///
/// Parses just enough Doc to make sure not to include a backtick that is inside a string or comment.
pub struct EmbeddedDocLexer<'input, 'tracker> {
    /// Wrap a logos-generated lexer
    lexer: logos::Lexer<'input, EmbeddedDocToken>,
    // Shared line-offset tracker, updated as newlines are encountered so that
    // byte offsets can later be resolved to line/column positions.
    tracker: &'tracker mut LineOffsetTracker,
}

impl<'input, 'tracker> EmbeddedDocLexer<'input, 'tracker> {
    /// Creates a new embedded Doc lexer over `input` text.
    #[inline]
    pub fn new(input: &'input str, tracker: &'tracker mut LineOffsetTracker) -> Self {
        EmbeddedDocLexer {
            lexer: EmbeddedDocToken::lexer(input),
            tracker,
        }
    }

    /// Parses a single embedded Doc value, quoted between backticks (`), and returns it
    ///
    /// Returns:
    /// - `Some(Ok((start, contents, end)))` for a delimited document; `contents`
    ///   excludes the backticks, while `start`/`end` include them.
    /// - `Some(Err(..))` with [`LexError::UnterminatedDocLiteral`] if input ends
    ///   before a closing delimiter of sufficient length is found.
    /// - `None` if the input does not begin with an `Embed` token (including EOF).
    fn next_internal(&mut self) -> Option<EmbeddedDocStringResult<'input>> {
        let next_token = self.lexer.next();
        match next_token {
            Some(Ok(EmbeddedDocToken::Embed)) => {
                // Opening delimiter: an odd-length run of backticks.
                let Span {
                    start: b_start,
                    end: b_end,
                } = self.lexer.span();
                // Number of backticks in the opening delimiter; the closing
                // delimiter must contain at least this many.
                let start_quote_len = b_end - b_start;
                loop {
                    let next_tok = self.lexer.next();
                    match next_tok {
                        Some(Ok(EmbeddedDocToken::Newline)) => {
                            // track the newline, and keep accumulating
                            self.tracker.record(self.lexer.span().end.into());
                        }
                        Some(Ok(EmbeddedDocToken::Embed)) => {
                            let Span {
                                start: e_start,
                                end: e_end,
                            } = self.lexer.span();
                            let end_quote_len = e_end - e_start;
                            if end_quote_len >= start_quote_len {
                                // The closing run may be longer than the opening
                                // one; `backup` is the surplus backtick count.
                                // The returned end offset backs up over the
                                // surplus so the outer lexer can re-lex it.
                                let backup = end_quote_len - start_quote_len;
                                // Contents span from just after the opening
                                // backticks to just before the closing run
                                // (e_end - end_quote_len == e_start).
                                let (str_start, str_end) =
                                    (b_start + start_quote_len, e_end - end_quote_len);
                                let doc_value = &self.lexer.source()[str_start..str_end];

                                return Some(Ok((
                                    b_start.into(),
                                    doc_value,
                                    (e_end - backup).into(),
                                )));
                            }
                            // Closing run shorter than the opening delimiter:
                            // it is part of the document body; keep scanning.
                        }
                        Some(_) => {
                            // just consume all other tokens
                        }
                        None => {
                            // EOF before a valid closing delimiter.
                            let Span { end, .. } = self.lexer.span();
                            return Some(Err((
                                b_start.into(),
                                LexError::UnterminatedDocLiteral,
                                end.into(),
                            )));
                        }
                    }
                }
            }
            // Not positioned at an opening delimiter (or at EOF): no doc here.
            _ => None,
        }
    }
}

/// Yields at most one embedded-doc result per call by delegating to
/// `next_internal`; returns `None` once the input is exhausted or does not
/// start with a backtick delimiter.
impl<'input, 'tracker> Iterator for EmbeddedDocLexer<'input, 'tracker> {
    type Item = EmbeddedDocStringResult<'input>;

    #[inline(always)]
    fn next(&mut self) -> Option<Self::Item> {
        self.next_internal()
    }
}
135 changes: 0 additions & 135 deletions partiql-parser/src/lexer/embedded_ion.rs

This file was deleted.

Loading

0 comments on commit 5d4efaa

Please sign in to comment.