From 20386c7ec2f9e473fc2af7caf4e8dd623a00f10d Mon Sep 17 00:00:00 2001 From: Nordine Bittich Date: Thu, 28 Nov 2024 21:48:27 +0100 Subject: [PATCH] WIP iri --- src/iri.rs | 125 +++++++++++++++++++++++++++++++++++----------- src/iri_spect.txt | 30 ++++++++--- 2 files changed, 119 insertions(+), 36 deletions(-) diff --git a/src/iri.rs b/src/iri.rs index 507f588..e889815 100644 --- a/src/iri.rs +++ b/src/iri.rs @@ -1,3 +1,37 @@ +pub struct IRI { + pub scheme: Option, + pub i_hier_part: Option, +} + +pub struct IHierPart { + pub authority: Option, +} + +pub struct Authority { + pub user_info: Option, + pub host: Host, + pub port: Option, +} + +pub enum Host { + IPV4(Vec), + IPV6(Vec), + RegName(Option), +} + +pub enum IPath { + AbEmpty(Vec), // starts with / or is empty + AbAbsolute { + snz: String, // segment non zero (isegment-nz) + segments: Vec, // isegment + }, + Rootless { + snz_nc: String, // isegment-nz-nc + segments: Vec, + }, + Empty, // ipath-empty +} + #[allow(unused)] mod ip { use nom::{ @@ -79,27 +113,6 @@ mod ip { } } -struct IRI { - scheme: Option, - i_hier_part: Option, -} - -struct IHierPart { - authority: Option, -} - -struct Authority { - user_info: Option, - host: Host, - port: Option, -} - -enum Host { - IPV4(Vec), - IPV6(Vec), - RegName(Option), -} - #[allow(unused)] mod parser { use nom::{ @@ -113,7 +126,7 @@ mod parser { use super::{ ip::{self, parse_ip_v4, parse_ip_v6}, - Authority, Host, + Authority, Host, IPath, }; fn parse_authority(s: &str) -> ParserResult { @@ -138,6 +151,51 @@ mod parser { map(opt(parse_i_reg_name), Host::RegName), ))(s) } + + fn parse_i_fragment(s: &str) -> ParserResult { + fold_many0( + alt((parse_ip_char, tag("/"), tag("?"))), + String::new, + |mut acc, item| { + acc.push_str(item); + acc + }, + )(s) + } + fn parse_ipath_empty(s: &str) -> ParserResult { + map( + verify(peek(opt(parse_ip_char)), |ip_char| ip_char.is_none()), + |_| IPath::Empty, + )(s) + } + + fn parse_ipath_rootless(s: &str) -> ParserResult { + map( + pair( + parse_i_segmentnz_nc, + many0(preceded(tag("/"), parse_i_segment0)), + ), + |(snz_nc, segments)| IPath::Rootless { snz_nc, segments }, + )(s) + } + + fn parse_ipath_abempty(s: &str) -> ParserResult { + map(many0(preceded(tag("/"), parse_i_segment0)), IPath::AbEmpty)(s) + } + fn parse_ipath_absolute(s: &str) -> ParserResult { + let (first_two, _) = peek(take(2usize))(s)?; + let parser = pair( + parse_i_segmentnz, + many0(preceded(tag("/"), parse_i_segment0)), + ); + verify( + map(parser, |(snz, segments)| IPath::AbAbsolute { + snz, + segments, + }), + move |_| first_two.starts_with("/") && first_two != "//", + )(s) + } fn parse_i_segmentnz(s: &str) -> ParserResult { fold_many0(parse_ip_char, String::new, |mut acc, item| { acc.push_str(item); @@ -209,6 +267,18 @@ mod parser { }, )(s) } + fn parse_i_private(s: &str) -> ParserResult<&str> { + verify(take(1usize), |hex: &str| { + hex.starts_with(|c: char| { + matches!( + c, + '\u{E000}'..='\u{F8FF}' + | '\u{F0000}'..='\u{FFFFD}' + | '\u{100000}'..='\u{10FFFD}' + ) + }) + })(s) + } fn parse_i_unreserved(s: &str) -> ParserResult<&str> { fn is_ucs_char(c: &char) -> bool { matches!(c, @@ -231,12 +301,11 @@ mod parser { '\u{E1000}'..='\u{EFFFD}' ) } - alt(( - verify(take(2usize), |hex: &str| { - hex_to_char(hex).filter(|x| is_ucs_char(x)).is_some() - }), - take_while1(|c: char| c.is_alphanum() || c == '-' || c == '.' || c == '_' || c == '~'), - ))(s) + verify(take(1usize), |unres: &str| { + unres.starts_with(|c: char| { + c.is_alphanum() || c == '-' || c == '.' || c == '_' || c == '~' || is_ucs_char(&c) + }) + })(s) } fn parse_sub_delims(s: &str) -> ParserResult<&str> { verify(take(1usize), |c: &str| { diff --git a/src/iri_spect.txt b/src/iri_spect.txt index 3fe44cf..fe6a475 100644 --- a/src/iri_spect.txt +++ b/src/iri_spect.txt @@ -1,9 +1,20 @@ -FC 3987 Internationalized Resource Identifiers January 2005 + IRI = scheme ":" ihier-part [ "?" iquery ] + [ "#" ifragment ] - / ipath-noscheme + ihier-part = "//" iauthority ipath-abempty + / ipath-absolute + / ipath-rootless / ipath-empty + IRI-reference = IRI / irelative-ref + + absolute-IRI = scheme ":" ihier-part [ "?" iquery ] + + irelative-ref = irelative-part [ "?" iquery ] [ "#" ifragment ] + + irelative-part = "//" iauthority ipath-abempty + / ipath-absolute @@ -13,11 +24,7 @@ FC 3987 Internationalized Resource Identifiers January 2005 / ipath-rootless ; begins with a segment / ipath-empty ; zero characters - ipath-abempty = *( "/" isegment ) - ipath-absolute = "/" [ isegment-nz *( "/" isegment ) ] ipath-noscheme = isegment-nz-nc *( "/" isegment ) - ipath-rootless = isegment-nz *( "/" isegment ) - ipath-empty = 0 @@ -25,11 +32,9 @@ FC 3987 Internationalized Resource Identifiers January 2005 iquery = *( ipchar / iprivate / "/" / "?" ) - ifragment = *( ipchar / "/" / "?" ) - iprivate = %xE000-F8FF / %xF0000-FFFFD / %x100000-10FFFD Some productions are ambiguous. The "first-match-wins" (a.k.a. "greedy") algorithm applies. For details, see [RFC3986]. @@ -68,6 +73,7 @@ RFC 3987 Internationalized Resource Identifiers January 2005 DONE: + ipath-abempty = *( "/" isegment ) ipchar = iunreserved / pct-encoded / sub-delims / ":" / "@" iauthority = [ iuserinfo "@" ] ihost [ ":" port ] @@ -116,3 +122,11 @@ ls32 = ( h16 ":" h16 ) / IPv4address isegment-nz-nc = 1*( iunreserved / pct-encoded / sub-delims / "@" ) ; non-zero-length segment without any colon ":" + ipath-absolute = "/" [ isegment-nz *( "/" isegment ) ] + + ipath-rootless = isegment-nz *( "/" isegment ) + ipath-empty = 0 + + ifragment = *( ipchar / "/" / "?" ) + iprivate = %xE000-F8FF / %xF0000-FFFFD / %x100000-10FFFD +