Skip to content

Commit

Permalink
Improve url::can_parse performance
Browse files Browse the repository at this point in the history
Only the scheme is saved when parsing for can_parse; other components are
only checked.
  • Loading branch information
rmisev committed Jul 13, 2024
1 parent 5c29303 commit f839ae0
Show file tree
Hide file tree
Showing 4 changed files with 129 additions and 59 deletions.
91 changes: 65 additions & 26 deletions include/upa/url.h
Original file line number Diff line number Diff line change
Expand Up @@ -214,7 +214,7 @@ class url {
template <class T, enable_if_str_arg_t<T> = 0>
static bool can_parse(T&& str_url, const url* pbase = nullptr) {
upa::url url;
return url.parse(std::forward<T>(str_url), pbase) == validation_errc::ok;
return url.for_can_parse(std::forward<T>(str_url), pbase) == validation_errc::ok;
}

/// @brief Checks if a given URL string can be successfully parsed
Expand Down Expand Up @@ -243,7 +243,7 @@ class url {
static bool can_parse(T&& str_url, TB&& str_base) {
upa::url base;
return
base.parse(std::forward<TB>(str_base), nullptr) == validation_errc::ok &&
base.for_can_parse(std::forward<TB>(str_base), nullptr) == validation_errc::ok &&
can_parse(std::forward<T>(str_url), &base);
}

Expand Down Expand Up @@ -640,6 +640,9 @@ class url {
template <typename CharT>
validation_errc do_parse(const CharT* first, const CharT* last, const url* base);

template <class T, enable_if_str_arg_t<T> = 0>
validation_errc for_can_parse(T&& str_url, const url* base);

// get scheme info
static const scheme_info kSchemes[];
static const scheme_info* get_scheme_info(string_view src);
Expand Down Expand Up @@ -699,8 +702,9 @@ class url_serializer : public host_output {
url_serializer(const url_serializer&) = delete;
url_serializer& operator=(const url_serializer&) = delete;

explicit url_serializer(url& dest_url)
: url_(dest_url)
explicit url_serializer(url& dest_url, bool need_save = true)
: host_output(need_save)
, url_(dest_url)
, last_pt_(url::SCHEME)
{}

Expand Down Expand Up @@ -1398,6 +1402,32 @@ inline validation_errc url::do_parse(const CharT* first, const CharT* last, cons
return res;
}

/// @brief Validation-only parse used to implement url::can_parse
///
/// Runs the URL parser over @a str_url through a serializer constructed with
/// need_save = false, and sets VALID_FLAG on this object when parsing succeeds.
///
/// @param[in] str_url string to parse
/// @param[in] base pointer to the base URL, or nullptr when there is none
/// @return validation_errc::ok on success; validation_errc::invalid_base when
///   @a base is non-null and not valid; otherwise the error returned by the parser
template <class T, enable_if_str_arg_t<T>>
validation_errc url::for_can_parse(T&& str_url, const url* base) {
    const auto inp = make_str_arg(std::forward<T>(str_url));
    const auto* first = inp.begin();
    const auto* last = inp.end();

    // Result reported when the supplied base URL cannot be used
    validation_errc res = validation_errc::invalid_base;
    {
        // The serializer lives only inside this scope, so it is destroyed
        // before the VALID_FLAG is touched below
        detail::url_serializer urls(*this, false);

        // reset URL
        urls.new_url();

        // parse only if there is no base URL or the base URL is valid
        if (base == nullptr || base->is_valid()) {
            // remove any leading and trailing C0 control or space:
            detail::do_trim(first, last);
            //TODO-WARN: validation error if trimmed

            res = detail::url_parser::url_parse(urls, first, last, base);
        }
    }

    if (res == validation_errc::ok)
        set_flag(VALID_FLAG);
    return res;
}

// Setters

template <class StrT, enable_if_str_arg_t<StrT>>
Expand Down Expand Up @@ -1826,19 +1856,21 @@ inline validation_errc url_parser::url_parse(url_serializer& urls, const CharT*
return validation_errc::host_missing;
}
//TODO-WARN: validation error
const auto it_colon = std::find(pointer, it_eta, ':');
// url includes credentials?
const bool not_empty_password = std::distance(it_colon, it_eta) > 1;
if (not_empty_password || std::distance(pointer, it_colon) > 0 /*not empty username*/) {
// username
std::string& str_username = urls.start_part(url::USERNAME);
detail::append_utf8_percent_encoded(pointer, it_colon, userinfo_no_encode_set, str_username); // UTF-8 percent encode, @ -> %40
urls.save_part();
// password
if (not_empty_password) {
std::string& str_password = urls.start_part(url::PASSWORD);
detail::append_utf8_percent_encoded(it_colon + 1, it_eta, userinfo_no_encode_set, str_password); // UTF-8 percent encode, @ -> %40
if (urls.need_save()) {
const auto it_colon = std::find(pointer, it_eta, ':');
// url includes credentials?
const bool not_empty_password = std::distance(it_colon, it_eta) > 1;
if (not_empty_password || std::distance(pointer, it_colon) > 0 /*not empty username*/) {
// username
std::string& str_username = urls.start_part(url::USERNAME);
detail::append_utf8_percent_encoded(pointer, it_colon, userinfo_no_encode_set, str_username); // UTF-8 percent encode, @ -> %40
urls.save_part();
// password
if (not_empty_password) {
std::string& str_password = urls.start_part(url::PASSWORD);
detail::append_utf8_percent_encoded(it_colon + 1, it_eta, userinfo_no_encode_set, str_password); // UTF-8 percent encode, @ -> %40
urls.save_part();
}
}
}
// after '@'
Expand Down Expand Up @@ -1927,14 +1959,16 @@ inline validation_errc url_parser::url_parse(url_serializer& urls, const CharT*
if (port > 0xFFFF)
return validation_errc::port_out_of_range;
}
// set port if not default
if (urls.scheme_inf() == nullptr || urls.scheme_inf()->default_port != port) {
util::unsigned_to_str(port, urls.start_part(url::PORT), 10);
urls.save_part();
urls.set_flag(url::PORT_FLAG);
} else {
// (2-1-3) Set url's port to null
urls.clear_part(url::PORT);
if (urls.need_save()) {
// set port if not default
if (urls.scheme_inf() == nullptr || urls.scheme_inf()->default_port != port) {
util::unsigned_to_str(port, urls.start_part(url::PORT), 10);
urls.save_part();
urls.set_flag(url::PORT_FLAG);
} else {
// (2-1-3) Set url's port to null
urls.clear_part(url::PORT);
}
}
}
// 2.2. If state override is given, then return
Expand Down Expand Up @@ -2012,7 +2046,7 @@ inline validation_errc url_parser::url_parse(url_serializer& urls, const CharT*
break;

default:
if (base && base->is_file_scheme()) {
if (base && base->is_file_scheme() && urls.need_save()) {
// It is important to first set host, then path, otherwise serializer
// will fail.

Expand Down Expand Up @@ -2058,7 +2092,7 @@ inline validation_errc url_parser::url_parse(url_serializer& urls, const CharT*
} else {
// parse and set host:
const auto res = parse_host(urls, pointer, end_of_authority);
if (res != validation_errc::ok)
if (res != validation_errc::ok || !urls.need_save())
return res; // TODO-ERR: failure
// if host is "localhost", then set host to the empty string
if (urls.get_part_view(url::HOST) == string_view{ "localhost", 9 }) {
Expand All @@ -2073,6 +2107,9 @@ inline validation_errc url_parser::url_parse(url_serializer& urls, const CharT*
}
}

if (!urls.need_save())
return validation_errc::ok;

if (state == path_start_state) {
if (urls.is_special_scheme()) {
if (pointer != last) {
Expand Down Expand Up @@ -2656,6 +2693,8 @@ inline void url_serializer::append_parts(const url& src, url::PartType t1, url::
return t1;
}();

if (!need_save()) return;

// copy flags; they can be used when copying / serializing url parts below
unsigned mask = 0;
for (int ind = t1; ind <= t2; ++ind) {
Expand Down
77 changes: 44 additions & 33 deletions include/upa/url_host.h
Original file line number Diff line number Diff line change
Expand Up @@ -39,13 +39,18 @@ enum class HostType {
/// @brief Abstract destination for the host produced by the host parser
///
/// The parser obtains the output string with hostStart(), appends the
/// serialized host to it and then calls hostDone() with the detected host type.
/// When need_save() returns false the parser only validates the input and
/// skips writing the host to the output.
class host_output {
protected:
    host_output() = default;
    /// @param[in] need_save false if the host must only be validated, not stored
    // explicit: a bare bool must not be implicitly convertible to host_output
    explicit host_output(bool need_save)
        : need_save_{ need_save } {}
public:
    host_output(const host_output&) = delete;
    host_output& operator=(const host_output&) = delete;
    virtual ~host_output() = default;

    /// @return reference to the string the serialized host is appended to
    virtual std::string& hostStart() = 0;
    /// Called after the host has been written to the string returned by hostStart()
    virtual void hostDone(HostType /*ht*/) = 0;
    /// @return true if the parsed host must be stored (the default)
    bool need_save() const noexcept { return need_save_; }
private:
    bool need_save_ = true;
};

class host_parser {
Expand Down Expand Up @@ -190,10 +195,12 @@ inline validation_errc host_parser::parse_host(const CharT* first, const CharT*
if (hostname_ends_in_a_number(first, last))
return parse_ipv4(first, last, dest);

// Return asciiDomain lower cased
std::string& str_host = dest.hostStart();
util::append_ascii_lowercase(str_host, first, last);
dest.hostDone(HostType::Domain);
if (dest.need_save()) {
// Return asciiDomain lower cased
std::string& str_host = dest.hostStart();
util::append_ascii_lowercase(str_host, first, last);
dest.hostDone(HostType::Domain);
}
return validation_errc::ok;
}
} else if (static_cast<UCharT>(*ptr) < 0x80 && *ptr != '%') {
Expand Down Expand Up @@ -270,10 +277,12 @@ inline validation_errc host_parser::parse_host(const CharT* first, const CharT*
if (hostname_ends_in_a_number(buff_ascii.begin(), buff_ascii.end()))
return parse_ipv4(buff_ascii.begin(), buff_ascii.end(), dest);

// Return asciiDomain
std::string& str_host = dest.hostStart();
util::append(str_host, buff_ascii);
dest.hostDone(HostType::Domain);
if (dest.need_save()) {
// Return asciiDomain
std::string& str_host = dest.hostStart();
util::append(str_host, buff_ascii);
dest.hostDone(HostType::Domain);
}
return validation_errc::ok;
}

Expand All @@ -293,31 +302,33 @@ inline validation_errc host_parser::parse_opaque_host(const CharT* first, const
// 3. If input contains a U+0025 (%) and the two code points following it are not ASCII hex digits,
// invalid-URL-unit validation error.

std::string& str_host = dest.hostStart();

//TODO: UTF-8 percent encode it using the C0 control percent-encode set
//detail::append_utf8_percent_encoded(first, last, detail::CHAR_C0_CTRL, str_host);
using UCharT = typename std::make_unsigned<CharT>::type;

const CharT* pointer = first;
while (pointer < last) {
// UTF-8 percent encode c using the C0 control percent-encode set (U+0000 ... U+001F and >U+007E)
const auto uch = static_cast<UCharT>(*pointer);
if (uch >= 0x7f) {
// invalid utf-8/16/32 sequences will be replaced with 0xfffd
detail::append_utf8_percent_encoded_char(pointer, last, str_host);
} else {
// Just append the 7-bit character, percent encoding C0 control chars
const auto uc = static_cast<unsigned char>(uch);
if (uc <= 0x1f)
detail::append_percent_encoded_byte(uc, str_host);
else
str_host.push_back(uc);
++pointer;
if (dest.need_save()) {
std::string& str_host = dest.hostStart();

//TODO: UTF-8 percent encode it using the C0 control percent-encode set
//detail::append_utf8_percent_encoded(first, last, detail::CHAR_C0_CTRL, str_host);
using UCharT = typename std::make_unsigned<CharT>::type;

const CharT* pointer = first;
while (pointer < last) {
// UTF-8 percent encode c using the C0 control percent-encode set (U+0000 ... U+001F and >U+007E)
const auto uch = static_cast<UCharT>(*pointer);
if (uch >= 0x7f) {
// invalid utf-8/16/32 sequences will be replaced with 0xfffd
detail::append_utf8_percent_encoded_char(pointer, last, str_host);
} else {
// Just append the 7-bit character, percent encoding C0 control chars
const auto uc = static_cast<unsigned char>(uch);
if (uc <= 0x1f)
detail::append_percent_encoded_byte(uc, str_host);
else
str_host.push_back(uc);
++pointer;
}
}
}

dest.hostDone(str_host.empty() ? HostType::Empty : HostType::Opaque);
dest.hostDone(str_host.empty() ? HostType::Empty : HostType::Opaque);
}
return validation_errc::ok;
}

Expand All @@ -326,7 +337,7 @@ inline validation_errc host_parser::parse_ipv4(const CharT* first, const CharT*
uint32_t ipv4; // NOLINT(cppcoreguidelines-init-variables)

const auto res = ipv4_parse(first, last, ipv4);
if (res == validation_errc::ok) {
if (res == validation_errc::ok && dest.need_save()) {
std::string& str_ipv4 = dest.hostStart();
ipv4_serialize(ipv4, str_ipv4);
dest.hostDone(HostType::IPv4);
Expand All @@ -339,7 +350,7 @@ inline validation_errc host_parser::parse_ipv6(const CharT* first, const CharT*
uint16_t ipv6addr[8];

const auto res = ipv6_parse(first, last, ipv6addr);
if (res == validation_errc::ok) {
if (res == validation_errc::ok && dest.need_save()) {
std::string& str_ipv6 = dest.hostStart();
str_ipv6.push_back('[');
ipv6_serialize(ipv6addr, str_ipv6);
Expand Down
6 changes: 6 additions & 0 deletions test/test-url.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -369,6 +369,9 @@ TEST_CASE("Parse URL with invalid base") {
CHECK(url.is_valid());
CHECK(url.parse("https://h/", base) == upa::validation_errc::invalid_base);
CHECK_FALSE(url.is_valid());

// url::can_parse
CHECK_FALSE(upa::url::can_parse("about:blank", base));
}
SUBCASE("Invalid base") {
upa::url base;
Expand All @@ -386,6 +389,9 @@ TEST_CASE("Parse URL with invalid base") {
CHECK(url.is_valid());
CHECK(url.parse("https://h/", base) == upa::validation_errc::invalid_base);
CHECK_FALSE(url.is_valid());

// url::can_parse
CHECK_FALSE(upa::url::can_parse("about:blank", base));
}
}

Expand Down
14 changes: 14 additions & 0 deletions test/wpt-url.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -127,6 +127,20 @@ void test_parser(DataDrivenTest& ddt, ParserObj& obj)
// check "failure"
tc.assert_equal(obj.failure, !parse_success, "parse failure WITH NO BASE");
}

// Test url::can_parse

bool can_parse_success = base.empty()
? upa::url::can_parse(input)
: upa::url::can_parse(input, base);

// check "success"
tc.assert_equal(!obj.failure, can_parse_success, "can_parse");

if (obj.failure && !base.empty()) {
can_parse_success = upa::url::can_parse(input);
tc.assert_equal(!obj.failure, can_parse_success, "can_parse WITH NO BASE");
}
});
}

Expand Down

0 comments on commit f839ae0

Please sign in to comment.