Skip to content

Commit

Permalink
Add range_qname and range_value methods to Attribute.
Browse files Browse the repository at this point in the history
  • Loading branch information
Jayonas authored May 23, 2024
1 parent 6e7293b commit dfed9be
Show file tree
Hide file tree
Showing 6 changed files with 125 additions and 6 deletions.
3 changes: 2 additions & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -20,5 +20,6 @@ exclude = ["testing-tools"]
default = ["std", "positions"]
std = []
# Enables preserving the positions of Nodes and Attributes in the original document.
# Increases memory usage by `Range<usize>` for each Node and Attribute.
# Increases memory usage by `Range<usize>` for each Node.
# Increases memory usage by `Range<usize>` + `u16` + `u8` for each Attribute.
positions = []
41 changes: 41 additions & 0 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -490,6 +490,10 @@ struct AttributeData<'input> {
value: StringStorage<'input>,
#[cfg(feature = "positions")]
range: Range<usize>,
#[cfg(feature = "positions")]
qname_len: u16,
#[cfg(feature = "positions")]
eq_len: u8, // includes any surrounding spaces
}

/// An attribute.
Expand Down Expand Up @@ -587,6 +591,43 @@ impl<'a, 'input> Attribute<'a, 'input> {
pub fn range(&self) -> Range<usize> {
    // `Range<usize>` is not `Copy`, so hand out a fresh range built from the stored bounds.
    let stored = &self.data.range;
    stored.start..stored.end
}

/// Returns the byte range of the attribute's qualified name in the original document.
///
/// ```text
/// <e n:attr='value'/>
///    ^^^^^^
/// ```
///
/// The qname length is stored as a `u16` to reduce memory usage.
/// If the qname is longer than `u16::MAX`, the end of the returned range will be incorrect.
#[cfg(feature = "positions")]
#[inline]
pub fn range_qname(&self) -> Range<usize> {
    // The qname always begins where the attribute itself begins.
    let qname_start = self.data.range.start;
    let qname_len = usize::from(self.data.qname_len);
    qname_start..qname_start + qname_len
}

/// Returns the byte range of the attribute's value in the original document,
/// not including the surrounding quotes.
///
/// For an empty value, the `start` and `end` of the returned `Range` are equal
/// and both point at the closing quote.
///
/// ```text
/// <e n:attr='value'/>
///            ^^^^^
/// ```
///
/// To reduce memory usage, the qname length is stored as a `u16` and the length of the
/// equal sign together with its surrounding spaces is stored as a `u8`.
/// If the attribute exceeds `u16::MAX` or `u8::MAX` respectively,
/// the start of the returned range will be incorrect.
#[cfg(feature = "positions")]
#[inline]
pub fn range_value(&self) -> Range<usize> {
    let attr = &self.data.range;
    // Bytes taken by the qname plus the equal sign with any spaces around it.
    let prefix_len = usize::from(self.data.qname_len) + usize::from(self.data.eq_len);
    // Every valid quote is exactly one byte: step over the opening quote
    // and stop right before the closing one.
    let value_start = attr.start + prefix_len + 1;
    let value_end = attr.end - 1;
    value_start..value_end
}
}

impl PartialEq for Attribute<'_, '_> {
Expand Down
16 changes: 14 additions & 2 deletions src/parse.rs
Original file line number Diff line number Diff line change
Expand Up @@ -353,6 +353,10 @@ struct TempAttributeData<'input> {
local: &'input str,
value: StringStorage<'input>,
range: Range<usize>,
#[allow(unused)] // only used for feature "positions"
qname_len: u16,
#[allow(unused)] // only used for feature "positions"
eq_len: u8,
}

impl<'input> Document<'input> {
Expand Down Expand Up @@ -644,8 +648,8 @@ impl<'input> tokenizer::XmlEvents<'input> for Context<'input> {

self.after_text = false;
}
tokenizer::Token::Attribute(range, prefix, local, value) => {
process_attribute(range, prefix, local, value, self)?;
tokenizer::Token::Attribute(range, qname_len, eq_len, prefix, local, value) => {
process_attribute(range, qname_len, eq_len, prefix, local, value, self)?;
}
tokenizer::Token::ElementEnd(end, range) => {
process_element(end, range, self)?;
Expand All @@ -666,6 +670,8 @@ impl<'input> tokenizer::XmlEvents<'input> for Context<'input> {
#[allow(clippy::too_many_arguments)]
fn process_attribute<'input>(
range: Range<usize>,
qname_len: u16,
eq_len: u8,
prefix: &'input str,
local: &'input str,
value: StrSpan<'input>,
Expand Down Expand Up @@ -732,6 +738,8 @@ fn process_attribute<'input>(
local,
value,
range,
qname_len,
eq_len,
});
}

Expand Down Expand Up @@ -909,6 +917,10 @@ fn resolve_attributes(namespaces: ShortRange, ctx: &mut Context) -> Result<Short
value: attr.value,
#[cfg(feature = "positions")]
range: attr.range,
#[cfg(feature = "positions")]
qname_len: attr.qname_len,
#[cfg(feature = "positions")]
eq_len: attr.eq_len,
});
}

Expand Down
7 changes: 5 additions & 2 deletions src/tokenizer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -160,7 +160,7 @@ pub enum Token<'input> {
ElementStart(&'input str, &'input str, usize),

// ns:attr="value"
Attribute(Range<usize>, &'input str, &'input str, StrSpan<'input>),
Attribute(Range<usize>, u16, u8, &'input str, &'input str, StrSpan<'input>),

ElementEnd(ElementEnd<'input>, Range<usize>),

Expand Down Expand Up @@ -553,7 +553,10 @@ fn parse_element<'input>(s: &mut Stream<'input>, events: &mut dyn XmlEvents<'inp
// We cannot mark `parse_attribute` as `#[inline(always)]`
// because it will blow up the binary size.
let (prefix, local) = s.consume_qname()?;
let qname_end = s.pos();
let qname_len = u16::try_from(qname_end - start).unwrap_or(u16::MAX);
s.consume_eq()?;
let eq_len = u8::try_from(s.pos() - qname_end).unwrap_or(u8::MAX);
let quote = s.consume_quote()?;
let quote_c = quote as char;
// The attribute value must not contain the < character.
Expand All @@ -562,7 +565,7 @@ fn parse_element<'input>(s: &mut Stream<'input>, events: &mut dyn XmlEvents<'inp
let value = s.slice_back_span(value_start);
s.consume_byte(quote)?;
let end = s.pos();
events.token(Token::Attribute(start..end, prefix, local, value))?;
events.token(Token::Attribute(start..end, qname_len, eq_len, prefix, local, value))?;
}
}
}
Expand Down
2 changes: 1 addition & 1 deletion src/tokenizer_tests.rs
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,7 @@ impl<'a> xml::XmlEvents<'a> for EventsCollector<'a> {
xml::Token::ElementStart(prefix, local, start) => {
Token::ElementStart(prefix, local, start)
}
xml::Token::Attribute(_, prefix, local, value) => {
xml::Token::Attribute(_, _, _, prefix, local, value) => {
Token::Attribute(prefix, local, value.as_str())
}
xml::Token::ElementEnd(end, range) => Token::ElementEnd(
Expand Down
62 changes: 62 additions & 0 deletions tests/api.rs
Original file line number Diff line number Diff line change
Expand Up @@ -158,6 +158,10 @@ fn text_pos_01() {
if let Some(attr) = node.attribute_node("a") {
assert_eq!(doc.text_pos_at(attr.range().start), TextPos::new(1, 4));
assert_eq!(doc.text_pos_at(attr.range().end), TextPos::new(1, 9));
assert_eq!(doc.text_pos_at(attr.range_qname().start), TextPos::new(1, 4));
assert_eq!(doc.text_pos_at(attr.range_qname().end), TextPos::new(1, 5));
assert_eq!(doc.text_pos_at(attr.range_value().start), TextPos::new(1, 7));
assert_eq!(doc.text_pos_at(attr.range_value().end), TextPos::new(1, 8));
}

// first child is a text/whitespace, not a comment
Expand All @@ -184,6 +188,10 @@ fn text_pos_02() {
if let Some(attr) = node.attribute_node(("http://www.w3.org", "a")) {
assert_eq!(doc.text_pos_at(attr.range().start), TextPos::new(1, 36));
assert_eq!(doc.text_pos_at(attr.range().end), TextPos::new(1, 44));
assert_eq!(doc.text_pos_at(attr.range_qname().start), TextPos::new(1, 36));
assert_eq!(doc.text_pos_at(attr.range_qname().end), TextPos::new(1, 40));
assert_eq!(doc.text_pos_at(attr.range_value().start), TextPos::new(1, 42));
assert_eq!(doc.text_pos_at(attr.range_value().end), TextPos::new(1, 43));
}
}

Expand All @@ -202,6 +210,60 @@ fn text_pos_03() {
assert_eq!(doc.text_pos_at(node.range().end), TextPos::new(2, 5));
}

#[cfg(feature = "positions")]
#[test]
fn text_pos_04() {
    // Empty attribute value: `range_value` must collapse onto the closing quote.
    let data = "<n1:e xmlns:n1='http://www.w3.org' n1:a=''/>";

    let doc = Document::parse(data).unwrap();
    let node = doc.root_element();

    // `n1:a` is namespaced, so it has to be looked up by (namespace URI, local name).
    // A bare "a" carries no namespace and would not match, and an `if let` around the
    // lookup would then silently skip every assertion below.
    let attr = node
        .attribute_node(("http://www.w3.org", "a"))
        .expect("attribute `n1:a` must be found");

    assert_eq!(doc.text_pos_at(attr.range().start), TextPos::new(1, 36));
    assert_eq!(doc.text_pos_at(attr.range().end), TextPos::new(1, 43));
    assert_eq!(doc.text_pos_at(attr.range_qname().start), TextPos::new(1, 36));
    assert_eq!(doc.text_pos_at(attr.range_qname().end), TextPos::new(1, 40));
    // Column 42 is the closing quote; start == end for an empty value.
    assert_eq!(doc.text_pos_at(attr.range_value().start), TextPos::new(1, 42));
    assert_eq!(doc.text_pos_at(attr.range_value().end), TextPos::new(1, 42));
}

#[cfg(feature = "positions")]
#[test]
fn text_pos_05() {
    // Spaces around the equal sign: two on each side, so the `=` region is 5 bytes long.
    //
    // Column layout of the attribute (1-based):
    //   36..=39  n1:a
    //   40..=44  "  =  "
    //   45       opening quote
    //   46       b
    //   47       closing quote
    let data = "<n1:e xmlns:n1='http://www.w3.org' n1:a  =  'b'/>";

    let doc = Document::parse(data).unwrap();
    let node = doc.root_element();

    // `n1:a` is namespaced, so it has to be looked up by (namespace URI, local name).
    // A bare "a" carries no namespace and would not match, and an `if let` around the
    // lookup would then silently skip every assertion below.
    let attr = node
        .attribute_node(("http://www.w3.org", "a"))
        .expect("attribute `n1:a` must be found");

    assert_eq!(doc.text_pos_at(attr.range().start), TextPos::new(1, 36));
    assert_eq!(doc.text_pos_at(attr.range().end), TextPos::new(1, 48));
    assert_eq!(doc.text_pos_at(attr.range_qname().start), TextPos::new(1, 36));
    assert_eq!(doc.text_pos_at(attr.range_qname().end), TextPos::new(1, 40));
    // The value range excludes both quotes, so it ends one column before the attribute does.
    assert_eq!(doc.text_pos_at(attr.range_value().start), TextPos::new(1, 46));
    assert_eq!(doc.text_pos_at(attr.range_value().end), TextPos::new(1, 47));
}

#[cfg(feature = "positions")]
#[test]
fn text_pos_06() {
    // The `=` is padded with 130 spaces on each side, so the equal-sign region is
    // 261 bytes long and does not fit into the `u8` used to store its length.
    let padding = " ".repeat(130);
    let data = format!("<e a{}={}'b'/>", padding, padding);

    let doc = Document::parse(&data).unwrap();
    let node = doc.root_element();

    // Fail loudly instead of silently skipping the assertions when the lookup misses.
    let attr = node
        .attribute_node("a")
        .expect("attribute `a` must be found");

    // Columns (1-based): `a` at 4, equal-sign region at 5..=265, `'b'` at 266..=268.
    assert_eq!(doc.text_pos_at(attr.range().start), TextPos::new(1, 4));
    assert_eq!(doc.text_pos_at(attr.range().end), TextPos::new(1, 269));
    assert_eq!(doc.text_pos_at(attr.range_qname().start), TextPos::new(1, 4));
    assert_eq!(doc.text_pos_at(attr.range_qname().end), TextPos::new(1, 5));

    // The start of the value range is unreliable once the equal-sign region exceeds
    // `u8::MAX`, but the call must not panic, the end must still sit right before the
    // closing quote, and the range must stay well-formed.
    let value = attr.range_value();
    assert_eq!(value.end, attr.range().end - 1);
    assert!(value.start <= value.end);
}

#[test]
fn next_sibling_element_01() {
let data = "<root><a/><b/><c/></root>";
Expand Down

0 comments on commit dfed9be

Please sign in to comment.