From 7e8451996804ab0f30a3627a7d449b373f08c397 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Rimas=20Misevi=C4=8Dius?=
Date: Sun, 23 Jun 2024 11:28:09 +0300
Subject: [PATCH] Fix host_parser::parse_host function

Do not report error on forbidden < and > characters if they are NFC normalized to valid ones.
---
 include/upa/url_host.h    | 10 +++++++---
 test/data/my-toascii.json | 28 ++++++++++++++++++++++++++++
 test/wpt-url.cpp          |  1 +
 3 files changed, 36 insertions(+), 3 deletions(-)
 create mode 100644 test/data/my-toascii.json

diff --git a/include/upa/url_host.h b/include/upa/url_host.h
index 54b777c0..2677db1c 100644
--- a/include/upa/url_host.h
+++ b/include/upa/url_host.h
@@ -195,9 +195,13 @@ inline validation_errc host_parser::parse_host(const CharT* first, const CharT*
                 return validation_errc::ok;
             }
         } else if (static_cast(*ptr) < 0x80 && *ptr != '%') {
-            // 7. If asciiDomain contains a forbidden domain code point, domain-invalid-code-point
-            // validation error, return failure.
-            return validation_errc::domain_invalid_code_point;
+            // NFC normalizes U+003C (<), U+003D (=), U+003E (>) characters if they precede
+            // U+0338. Therefore, no errors are reported here for forbidden < and > characters
+            // if there is a possibility to normalize them.
+            if (!(*ptr >= 0x3C && *ptr <= 0x3E && ptr + 1 < last && static_cast(ptr[1]) >= 0x80))
+                // 7. If asciiDomain contains a forbidden domain code point, domain-invalid-code-point
+                // validation error, return failure.
+                return validation_errc::domain_invalid_code_point;
         }
 
         // Input for domain_to_ascii
diff --git a/test/data/my-toascii.json b/test/data/my-toascii.json
new file mode 100644
index 00000000..26062819
--- /dev/null
+++ b/test/data/my-toascii.json
@@ -0,0 +1,28 @@
+[
+  {
+    "comment": "NFC normalization (forbidden < and > characters are normalized to valid ones)",
+    "input": "=\u0338",
+    "output": "xn--1ch"
+  },
+  {
+    "input": "<\u0338",
+    "output": "xn--gdh"
+  },
+  {
+    "input": ">\u0338",
+    "output": "xn--hdh"
+  },
+  {
+    "comment": "Same with inserted IDNA ignored code point",
+    "input": "=\u00AD\u0338",
+    "output": "xn--1ch"
+  },
+  {
+    "input": "<\u00AD\u0338",
+    "output": "xn--gdh"
+  },
+  {
+    "input": ">\u00AD\u0338",
+    "output": "xn--hdh"
+  }
+]
diff --git a/test/wpt-url.cpp b/test/wpt-url.cpp
index db34fb0d..ee3abf0a 100644
--- a/test/wpt-url.cpp
+++ b/test/wpt-url.cpp
@@ -47,6 +47,7 @@ int main(int argc, char** argv)
 
     // additional tests
     err |= test_from_file(run_parser_tests, "data/my-urltestdata.json");
+    err |= test_from_file(run_host_parser_tests, "data/my-toascii.json");
     err |= test_from_file(run_setter_tests, "data/my-setters_tests.json");
 
     // Free memory