diff --git a/include/upa/url.h b/include/upa/url.h index 4d9c452..def36cd 100644 --- a/include/upa/url.h +++ b/include/upa/url.h @@ -214,7 +214,7 @@ class url { template = 0> static bool can_parse(T&& str_url, const url* pbase = nullptr) { upa::url url; - return url.parse(std::forward(str_url), pbase) == validation_errc::ok; + return url.for_can_parse(std::forward(str_url), pbase) == validation_errc::ok; } /// @brief Checks if a given URL string can be successfully parsed @@ -243,7 +243,7 @@ class url { static bool can_parse(T&& str_url, TB&& str_base) { upa::url base; return - base.parse(std::forward(str_base), nullptr) == validation_errc::ok && + base.for_can_parse(std::forward(str_base), nullptr) == validation_errc::ok && can_parse(std::forward(str_url), &base); } @@ -640,6 +640,9 @@ class url { template validation_errc do_parse(const CharT* first, const CharT* last, const url* base); + template = 0> + validation_errc for_can_parse(T&& str_url, const url* base); + // get scheme info static const scheme_info kSchemes[]; static const scheme_info* get_scheme_info(string_view src); @@ -699,8 +702,9 @@ class url_serializer : public host_output { url_serializer(const url_serializer&) = delete; url_serializer& operator=(const url_serializer&) = delete; - explicit url_serializer(url& dest_url) - : url_(dest_url) + explicit url_serializer(url& dest_url, bool need_save = true) + : host_output(need_save) + , url_(dest_url) , last_pt_(url::SCHEME) {} @@ -1398,6 +1402,32 @@ inline validation_errc url::do_parse(const CharT* first, const CharT* last, cons return res; } +template > +validation_errc url::for_can_parse(T&& str_url, const url* base) { + const auto inp = make_str_arg(std::forward(str_url)); + const auto* first = inp.begin(); + const auto* last = inp.end(); + const validation_errc res = [&]() { + detail::url_serializer urls(*this, false); + + // reset URL + urls.new_url(); + + // is base URL valid? + if (base && !base->is_valid()) + return validation_errc::invalid_base; + + // remove any leading and trailing C0 control or space: + detail::do_trim(first, last); + //TODO-WARN: validation error if trimmed + + return detail::url_parser::url_parse(urls, first, last, base); + }(); + if (res == validation_errc::ok) + set_flag(VALID_FLAG); + return res; +} + // Setters template > @@ -1826,19 +1856,21 @@ inline validation_errc url_parser::url_parse(url_serializer& urls, const CharT* return validation_errc::host_missing; } //TODO-WARN: validation error - const auto it_colon = std::find(pointer, it_eta, ':'); - // url includes credentials? - const bool not_empty_password = std::distance(it_colon, it_eta) > 1; - if (not_empty_password || std::distance(pointer, it_colon) > 0 /*not empty username*/) { - // username - std::string& str_username = urls.start_part(url::USERNAME); - detail::append_utf8_percent_encoded(pointer, it_colon, userinfo_no_encode_set, str_username); // UTF-8 percent encode, @ -> %40 - urls.save_part(); - // password - if (not_empty_password) { - std::string& str_password = urls.start_part(url::PASSWORD); - detail::append_utf8_percent_encoded(it_colon + 1, it_eta, userinfo_no_encode_set, str_password); // UTF-8 percent encode, @ -> %40 + if (urls.need_save()) { + const auto it_colon = std::find(pointer, it_eta, ':'); + // url includes credentials? + const bool not_empty_password = std::distance(it_colon, it_eta) > 1; + if (not_empty_password || std::distance(pointer, it_colon) > 0 /*not empty username*/) { + // username + std::string& str_username = urls.start_part(url::USERNAME); + detail::append_utf8_percent_encoded(pointer, it_colon, userinfo_no_encode_set, str_username); // UTF-8 percent encode, @ -> %40 urls.save_part(); + // password + if (not_empty_password) { + std::string& str_password = urls.start_part(url::PASSWORD); + detail::append_utf8_percent_encoded(it_colon + 1, it_eta, userinfo_no_encode_set, str_password); // UTF-8 percent encode, @ -> %40 + urls.save_part(); + } } } // after '@' @@ -1927,14 +1959,16 @@ inline validation_errc url_parser::url_parse(url_serializer& urls, const CharT* if (port > 0xFFFF) return validation_errc::port_out_of_range; } - // set port if not default - if (urls.scheme_inf() == nullptr || urls.scheme_inf()->default_port != port) { - util::unsigned_to_str(port, urls.start_part(url::PORT), 10); - urls.save_part(); - urls.set_flag(url::PORT_FLAG); - } else { - // (2-1-3) Set url's port to null - urls.clear_part(url::PORT); + if (urls.need_save()) { + // set port if not default + if (urls.scheme_inf() == nullptr || urls.scheme_inf()->default_port != port) { + util::unsigned_to_str(port, urls.start_part(url::PORT), 10); + urls.save_part(); + urls.set_flag(url::PORT_FLAG); + } else { + // (2-1-3) Set url's port to null + urls.clear_part(url::PORT); + } } } // 2.2. If state override is given, then return @@ -2012,7 +2046,7 @@ inline validation_errc url_parser::url_parse(url_serializer& urls, const CharT* break; default: - if (base && base->is_file_scheme()) { + if (base && base->is_file_scheme() && urls.need_save()) { // It is important to first set host, then path, otherwise serializer // will fail. @@ -2058,7 +2092,7 @@ inline validation_errc url_parser::url_parse(url_serializer& urls, const CharT* } else { // parse and set host: const auto res = parse_host(urls, pointer, end_of_authority); - if (res != validation_errc::ok) + if (res != validation_errc::ok || !urls.need_save()) return res; // TODO-ERR: failure // if host is "localhost", then set host to the empty string if (urls.get_part_view(url::HOST) == string_view{ "localhost", 9 }) { @@ -2073,6 +2107,9 @@ inline validation_errc url_parser::url_parse(url_serializer& urls, const CharT* } } + if (!urls.need_save()) + return validation_errc::ok; + if (state == path_start_state) { if (urls.is_special_scheme()) { if (pointer != last) { @@ -2656,6 +2693,8 @@ inline void url_serializer::append_parts(const url& src, url::PartType t1, url:: return t1; }(); + if (!need_save()) return; + // copy flags; they can be used when copying / serializing url parts below unsigned mask = 0; for (int ind = t1; ind <= t2; ++ind) { diff --git a/include/upa/url_host.h b/include/upa/url_host.h index 96f3e9b..80e3d42 100644 --- a/include/upa/url_host.h +++ b/include/upa/url_host.h @@ -39,6 +39,8 @@ enum class HostType { class host_output { protected: host_output() = default; + host_output(bool need_save) + : need_save_{ need_save } {} public: host_output(const host_output&) = delete; host_output& operator=(const host_output&) = delete; @@ -46,6 +48,9 @@ class host_output { virtual std::string& hostStart() = 0; virtual void hostDone(HostType /*ht*/) = 0; + bool need_save() const noexcept { return need_save_; } +private: + bool need_save_ = true; }; class host_parser { @@ -190,10 +195,12 @@ inline validation_errc host_parser::parse_host(const CharT* first, const CharT* if (hostname_ends_in_a_number(first, last)) return parse_ipv4(first, last, dest); - // Return asciiDomain lower cased - std::string& str_host = dest.hostStart(); - util::append_ascii_lowercase(str_host, first, last); - dest.hostDone(HostType::Domain); + if (dest.need_save()) { + // Return asciiDomain lower cased + std::string& str_host = dest.hostStart(); + util::append_ascii_lowercase(str_host, first, last); + dest.hostDone(HostType::Domain); + } return validation_errc::ok; } } else if (static_cast(*ptr) < 0x80 && *ptr != '%') { @@ -270,10 +277,12 @@ inline validation_errc host_parser::parse_host(const CharT* first, const CharT* if (hostname_ends_in_a_number(buff_ascii.begin(), buff_ascii.end())) return parse_ipv4(buff_ascii.begin(), buff_ascii.end(), dest); - // Return asciiDomain - std::string& str_host = dest.hostStart(); - util::append(str_host, buff_ascii); - dest.hostDone(HostType::Domain); + if (dest.need_save()) { + // Return asciiDomain + std::string& str_host = dest.hostStart(); + util::append(str_host, buff_ascii); + dest.hostDone(HostType::Domain); + } return validation_errc::ok; } @@ -293,31 +302,33 @@ inline validation_errc host_parser::parse_opaque_host(const CharT* first, const // 3. If input contains a U+0025 (%) and the two code points following it are not ASCII hex digits, // invalid-URL-unit validation error. - std::string& str_host = dest.hostStart(); - - //TODO: UTF-8 percent encode it using the C0 control percent-encode set - //detail::append_utf8_percent_encoded(first, last, detail::CHAR_C0_CTRL, str_host); - using UCharT = typename std::make_unsigned::type; - - const CharT* pointer = first; - while (pointer < last) { - // UTF-8 percent encode c using the C0 control percent-encode set (U+0000 ... U+001F and >U+007E) - const auto uch = static_cast(*pointer); - if (uch >= 0x7f) { - // invalid utf-8/16/32 sequences will be replaced with 0xfffd - detail::append_utf8_percent_encoded_char(pointer, last, str_host); - } else { - // Just append the 7-bit character, percent encoding C0 control chars - const auto uc = static_cast(uch); - if (uc <= 0x1f) - detail::append_percent_encoded_byte(uc, str_host); - else - str_host.push_back(uc); - ++pointer; + if (dest.need_save()) { + std::string& str_host = dest.hostStart(); + + //TODO: UTF-8 percent encode it using the C0 control percent-encode set + //detail::append_utf8_percent_encoded(first, last, detail::CHAR_C0_CTRL, str_host); + using UCharT = typename std::make_unsigned::type; + + const CharT* pointer = first; + while (pointer < last) { + // UTF-8 percent encode c using the C0 control percent-encode set (U+0000 ... U+001F and >U+007E) + const auto uch = static_cast(*pointer); + if (uch >= 0x7f) { + // invalid utf-8/16/32 sequences will be replaced with 0xfffd + detail::append_utf8_percent_encoded_char(pointer, last, str_host); + } else { + // Just append the 7-bit character, percent encoding C0 control chars + const auto uc = static_cast(uch); + if (uc <= 0x1f) + detail::append_percent_encoded_byte(uc, str_host); + else + str_host.push_back(uc); + ++pointer; + } } - } - dest.hostDone(str_host.empty() ? HostType::Empty : HostType::Opaque); + dest.hostDone(str_host.empty() ? HostType::Empty : HostType::Opaque); + } return validation_errc::ok; } @@ -326,7 +337,7 @@ inline validation_errc host_parser::parse_ipv4(const CharT* first, const CharT* uint32_t ipv4; // NOLINT(cppcoreguidelines-init-variables) const auto res = ipv4_parse(first, last, ipv4); - if (res == validation_errc::ok) { + if (res == validation_errc::ok && dest.need_save()) { std::string& str_ipv4 = dest.hostStart(); ipv4_serialize(ipv4, str_ipv4); dest.hostDone(HostType::IPv4); @@ -339,7 +350,7 @@ inline validation_errc host_parser::parse_ipv6(const CharT* first, const CharT* uint16_t ipv6addr[8]; const auto res = ipv6_parse(first, last, ipv6addr); - if (res == validation_errc::ok) { + if (res == validation_errc::ok && dest.need_save()) { std::string& str_ipv6 = dest.hostStart(); str_ipv6.push_back('['); ipv6_serialize(ipv6addr, str_ipv6); diff --git a/test/test-url.cpp b/test/test-url.cpp index 93c6e67..35a1249 100644 --- a/test/test-url.cpp +++ b/test/test-url.cpp @@ -369,6 +369,9 @@ TEST_CASE("Parse URL with invalid base") { CHECK(url.is_valid()); CHECK(url.parse("https://h/", base) == upa::validation_errc::invalid_base); CHECK_FALSE(url.is_valid()); + + // url::can_parse + CHECK_FALSE(upa::url::can_parse("about:blank", base)); } SUBCASE("Invalid base") { upa::url base; @@ -386,6 +389,9 @@ TEST_CASE("Parse URL with invalid base") { CHECK(url.is_valid()); CHECK(url.parse("https://h/", base) == upa::validation_errc::invalid_base); CHECK_FALSE(url.is_valid()); + + // url::can_parse + CHECK_FALSE(upa::url::can_parse("about:blank", base)); } } diff --git a/test/wpt-url.cpp b/test/wpt-url.cpp index 501ab6e..2c203ad 100644 --- a/test/wpt-url.cpp +++ b/test/wpt-url.cpp @@ -127,6 +127,20 @@ void test_parser(DataDrivenTest& ddt, ParserObj& obj) // check "failure" tc.assert_equal(obj.failure, !parse_success, "parse failure WITH NO BASE"); } + + // Test url::can_parse + + bool can_parse_success = base.empty() + ? upa::url::can_parse(input) + : upa::url::can_parse(input, base); + + // check "success" + tc.assert_equal(!obj.failure, can_parse_success, "can_parse"); + + if (obj.failure && !base.empty()) { + can_parse_success = upa::url::can_parse(input); + tc.assert_equal(!obj.failure, can_parse_success, "can_parse WITH NO BASE"); + } }); }