From 2758da8a9126c9d89607bebb4868a1d693641370 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rimas=20Misevi=C4=8Dius?= Date: Tue, 29 Oct 2024 23:18:53 +0200 Subject: [PATCH 1/3] Add more `constexpr` --- include/upa/str_arg.h | 21 +++++++++++---------- include/upa/url.h | 18 +++++++++--------- include/upa/url_percent_encode.h | 14 +++++++------- include/upa/url_result.h | 6 +++--- include/upa/url_utf.h | 10 +++++----- test/wpt-url.cpp | 6 +++--- 6 files changed, 38 insertions(+), 37 deletions(-) diff --git a/include/upa/str_arg.h b/include/upa/str_arg.h index 509918f..21c5067 100644 --- a/include/upa/str_arg.h +++ b/include/upa/str_arg.h @@ -18,6 +18,7 @@ inline void procfn(StrT&& str) { #ifndef UPA_STR_ARG_H #define UPA_STR_ARG_H +#include "config.h" #include "url_utf.h" #include #include @@ -76,26 +77,26 @@ class str_arg { >; // constructors - str_arg(const str_arg&) noexcept = default; + constexpr str_arg(const str_arg&) noexcept = default; - str_arg(const CharT* s) + constexpr str_arg(const CharT* s) : first_(s) , last_(s + traits_type::length(s)) {} template , int> = 0> - str_arg(const CharT* s, SizeT length) + constexpr str_arg(const CharT* s, SizeT length) : first_(s) , last_(s + length) { assert(length >= 0); } - str_arg(const CharT* first, const CharT* last) + constexpr str_arg(const CharT* first, const CharT* last) : first_(first) , last_(last) { assert(first <= last); } // destructor - ~str_arg() noexcept = default; + UPA_CONSTEXPR_20 ~str_arg() noexcept = default; // assignment is not used str_arg& operator=(const str_arg&) = delete; @@ -176,7 +177,7 @@ constexpr bool convertible_to_string_view_v = template struct str_arg_char_common { using type = CharT; - static str_arg to_str_arg(ArgT str) { + static constexpr str_arg to_str_arg(ArgT str) { return { str.data(), str.size() }; } }; @@ -234,7 +235,7 @@ struct str_arg_char : detail::str_arg_char_default {}; template struct str_arg_char>>> { using type = remove_cvref_t; - static str_arg to_str_arg(const type* s) { + static constexpr str_arg to_str_arg(const type* s) { return s; } }; @@ -243,7 +244,7 @@ struct str_arg_char struct str_arg_char> { using type = CharT; - static str_arg to_str_arg(str_arg s) { + static constexpr str_arg to_str_arg(str_arg s) { return s; } }; @@ -267,7 +268,7 @@ using enable_if_str_arg_t = std::enable_if_t< // String arguments helper function template -inline auto make_str_arg(StrT&& str) -> str_arg> { +constexpr auto make_str_arg(StrT&& str) -> str_arg> { return str_arg_char_s::to_str_arg(std::forward(str)); } @@ -304,7 +305,7 @@ inline std::string&& make_string(std::string&& str) { } template = 0> -inline string_view make_string(StrT&& str) { +constexpr string_view make_string(StrT&& str) { const auto inp = make_str_arg(std::forward(str)); return { inp.data(), inp.length() }; } diff --git a/include/upa/url.h b/include/upa/url.h index 55bb3c2..88d08c9 100644 --- a/include/upa/url.h +++ b/include/upa/url.h @@ -907,7 +907,7 @@ class url_parser { // part start extern const uint8_t kPartStart[url::PART_COUNT]; -inline int port_from_str(const char* first, const char* last) noexcept { +constexpr int port_from_str(const char* first, const char* last) noexcept { int port = 0; for (auto it = first; it != last; ++it) { port = port * 10 + (*it - '0'); @@ -930,7 +930,7 @@ constexpr bool is_removable_char(CharT ch) noexcept { } template -inline void do_trim(const CharT*& first, const CharT*& last) noexcept { +constexpr void do_trim(const CharT*& first, const CharT*& last) noexcept { // remove leading C0 controls and space while (first < last && is_trim_char(*first)) ++first; @@ -965,7 +965,7 @@ inline void do_remove_whitespace(const CharT*& first, const CharT*& last, simple // reverse find template -inline InputIt find_last(InputIt first, InputIt last, const T& value) { +constexpr InputIt find_last(InputIt first, InputIt last, const T& value) { for (auto it = last; it > first;) { --it; if (*it == value) return it; @@ -1023,7 +1023,7 @@ constexpr bool is_normalized_windows_drive(CharT c1, CharT c2) noexcept { // https://url.spec.whatwg.org/#start-with-a-windows-drive-letter template -inline bool starts_with_windows_drive(const CharT* pointer, const CharT* last) noexcept { +constexpr bool starts_with_windows_drive(const CharT* pointer, const CharT* last) noexcept { const auto length = last - pointer; return (length == 2 || (length > 2 && detail::is_special_authority_end_char(pointer[2]))) && @@ -1042,7 +1042,7 @@ inline bool starts_with_windows_drive(const CharT* pointer, const CharT* last) n // Check url's pathname has Windows drive, i.e. starts with "/C:/" or is "/C:" // see also: detail::starts_with_windows_drive -inline bool pathname_has_windows_os_drive(string_view pathname) noexcept { +constexpr bool pathname_has_windows_os_drive(string_view pathname) noexcept { return (pathname.length() == 3 || (pathname.length() > 3 && is_windows_slash(pathname[3]))) && is_windows_slash(pathname[0]) && @@ -2319,11 +2319,11 @@ inline void url_parser::parse_path(url_serializer& urls, const CharT* first, con // path state; includes: // 1. [ (/,\) - 1, 2, 3, 4 - [ 1 (if first segment), 2 ] ] // 2. [ 1 ... 4 ] - static const auto escaped_dot = [](const CharT* const pointer) -> bool { + static constexpr auto escaped_dot = [](const CharT* const pointer) constexpr -> bool { // "%2e" or "%2E" return pointer[0] == '%' && pointer[1] == '2' && (pointer[2] | 0x20) == 'e'; }; - static const auto double_dot = [](const CharT* const pointer, const std::size_t len) -> bool { + static constexpr auto double_dot = [](const CharT* const pointer, const std::size_t len) constexpr -> bool { switch (len) { case 2: // ".." return pointer[0] == '.' && pointer[1] == '.'; @@ -2336,7 +2336,7 @@ inline void url_parser::parse_path(url_serializer& urls, const CharT* first, con return false; } }; - static const auto single_dot = [](const CharT* const pointer, const std::size_t len) -> bool { + static constexpr auto single_dot = [](const CharT* const pointer, const std::size_t len) constexpr -> bool { switch (len) { case 1: return pointer[0] == '.'; case 3: return escaped_dot(pointer); // "%2e" @@ -3031,7 +3031,7 @@ inline const CharT* is_unc_path(const CharT* first, const CharT* last) /// @param[in] is_slash function to check char is slash (or backslash) /// @return true if path contains ".." segment template -inline bool has_dot_dot_segment(const CharT* first, const CharT* last, IsSlash is_slash) { +constexpr bool has_dot_dot_segment(const CharT* first, const CharT* last, IsSlash is_slash) { if (last - first >= 2) { const auto* ptr = first; const auto* end = last - 1; diff --git a/include/upa/url_percent_encode.h b/include/upa/url_percent_encode.h index e8a4ca4..bc1baee 100644 --- a/include/upa/url_percent_encode.h +++ b/include/upa/url_percent_encode.h @@ -297,37 +297,37 @@ inline constexpr code_points_multiset code_points; // Check char is in predefined set template -inline bool is_char_in_set(CharT c, const code_point_set& cpset) { +constexpr bool is_char_in_set(CharT c, const code_point_set& cpset) { return cpset[c]; } template -inline bool is_ipv4_char(CharT c) { +constexpr bool is_ipv4_char(CharT c) { return code_points.char_in_set(c, IPV4_CHAR_SET); } template -inline bool is_hex_char(CharT c) { +constexpr bool is_hex_char(CharT c) { return code_points.char_in_set(c, HEX_DIGIT_SET); } template -inline bool is_scheme_char(CharT c) { +constexpr bool is_scheme_char(CharT c) { return code_points.char_in_set(c, SCHEME_SET); } template -inline bool is_forbidden_domain_char(CharT c) { +constexpr bool is_forbidden_domain_char(CharT c) { return code_points.char_in_set(c, DOMAIN_FORBIDDEN_SET); } template -inline bool is_forbidden_host_char(CharT c) { +constexpr bool is_forbidden_host_char(CharT c) { return code_points.char_in_set(c, HOST_FORBIDDEN_SET); } template -inline bool is_ascii_domain_char(CharT c) { +constexpr bool is_ascii_domain_char(CharT c) { return code_points.char_in_set(c, ASCII_DOMAIN_SET); } diff --git a/include/upa/url_result.h b/include/upa/url_result.h index f69fbc6..4502b25 100644 --- a/include/upa/url_result.h +++ b/include/upa/url_result.h @@ -123,11 +123,11 @@ struct result_value { T value{}; R result{}; - result_value(R res) noexcept + constexpr result_value(R res) noexcept : result(res) {} - result_value(R res, T val) noexcept + constexpr result_value(R res, T val) noexcept : value(val), result(res) {} - [[nodiscard]] operator R() const noexcept { + [[nodiscard]] constexpr operator R() const noexcept { return result; } }; diff --git a/include/upa/url_utf.h b/include/upa/url_utf.h index dd01b96..c367271 100644 --- a/include/upa/url_utf.h +++ b/include/upa/url_utf.h @@ -41,8 +41,8 @@ class url_utf { protected: // low level static bool read_code_point(const char*& first, const char* last, uint32_t& code_point) noexcept; - static bool read_code_point(const char16_t*& first, const char16_t* last, uint32_t& code_point) noexcept; - static bool read_code_point(const char32_t*& first, const char32_t* last, uint32_t& code_point) noexcept; + static constexpr bool read_code_point(const char16_t*& first, const char16_t* last, uint32_t& code_point) noexcept; + static constexpr bool read_code_point(const char32_t*& first, const char32_t* last, uint32_t& code_point) noexcept; private: const static char kReplacementCharUtf8[]; const static uint8_t k_U8_LEAD3_T1_BITS[16]; @@ -179,7 +179,7 @@ namespace detail { // Get a supplementary code point value (U+10000..U+10ffff) // from its lead and trail surrogates. // Based on U16_GET_SUPPLEMENTARY in utf16.h from ICU - inline uint32_t u16_get_supplementary(uint32_t lead, uint32_t trail) noexcept { + constexpr uint32_t u16_get_supplementary(uint32_t lead, uint32_t trail) noexcept { constexpr uint32_t u16_surrogate_offset = (0xd800 << 10UL) + 0xdc00 - 0x10000; return (lead << 10UL) + trail - u16_surrogate_offset; } @@ -187,7 +187,7 @@ namespace detail { // Modified version of the U16_NEXT_OR_FFFD macro in utf16.h from ICU -inline bool url_utf::read_code_point(const char16_t*& first, const char16_t* last, uint32_t& c) noexcept { +constexpr bool url_utf::read_code_point(const char16_t*& first, const char16_t* last, uint32_t& c) noexcept { c = *first++; if (detail::u16_is_surrogate(c)) { if (detail::u16_is_surrogate_lead(c) && first != last && detail::u16_is_trail(*first)) { @@ -201,7 +201,7 @@ inline bool url_utf::read_code_point(const char16_t*& first, const char16_t* las return true; } -inline bool url_utf::read_code_point(const char32_t*& first, const char32_t*, uint32_t& c) noexcept { +constexpr bool url_utf::read_code_point(const char32_t*& first, const char32_t*, uint32_t& c) noexcept { // no conversion c = *first++; // don't allow surogates (U+D800..U+DFFF) and too high values diff --git a/test/wpt-url.cpp b/test/wpt-url.cpp index fe055e5..e409b5b 100644 --- a/test/wpt-url.cpp +++ b/test/wpt-url.cpp @@ -221,7 +221,7 @@ void test_host_parser(DataDrivenTest& ddt, const parsed_obj& obj) // Test file format (toascii.json): // https://github.com/web-platform-tests/wpt/tree/master/url#toasciijson // https://github.com/web-platform-tests/wpt/pull/5976 - static const auto make_url = [](const std::string& host)->std::string { + static constexpr auto make_url = [](const std::string& host)->std::string { std::string str_url("http://"); str_url += host; str_url += "/x"; @@ -287,14 +287,14 @@ void test_idna_v2(DataDrivenTest& ddt, const parsed_obj& obj) // Test file format (toascii.json): // https://github.com/web-platform-tests/wpt/tree/master/url#toasciijson // https://github.com/web-platform-tests/wpt/pull/5976 - static const auto make_url = [](const std::string& host) -> std::string { + static constexpr auto make_url = [](const std::string& host) -> std::string { std::string str_url("http://"); str_url += host; str_url += "/x"; return str_url; }; - static const auto encodeHostEndingCodePoints = [](const std::string& input) -> std::string { + static constexpr auto encodeHostEndingCodePoints = [](const std::string& input) -> std::string { if (input.find_first_of(":/?#\\") != input.npos) return encodeURIComponent(input); return input; From 64559e5a17e69c9dbbffe239e694a28980026d39 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rimas=20Misevi=C4=8Dius?= Date: Mon, 4 Nov 2024 20:49:51 +0200 Subject: [PATCH 2/3] Change kReplacementCharUtf8 type to constexpr string_view --- include/upa/url_utf.h | 7 +++++-- src/url_utf.cpp | 7 ++----- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/include/upa/url_utf.h b/include/upa/url_utf.h index c367271..915c0e1 100644 --- a/include/upa/url_utf.h +++ b/include/upa/url_utf.h @@ -9,6 +9,7 @@ #include "url_result.h" #include // uint8_t, uint32_t #include +#include namespace upa { @@ -44,7 +45,9 @@ class url_utf { static constexpr bool read_code_point(const char16_t*& first, const char16_t* last, uint32_t& code_point) noexcept; static constexpr bool read_code_point(const char32_t*& first, const char32_t* last, uint32_t& code_point) noexcept; private: - const static char kReplacementCharUtf8[]; + // Replacement character (U+FFFD) + static inline constexpr std::string_view kReplacementCharUtf8{ "\xEF\xBF\xBD" }; + const static uint8_t k_U8_LEAD3_T1_BITS[16]; const static uint8_t k_U8_LEAD4_T1_BITS[16]; }; @@ -95,7 +98,7 @@ inline void url_utf::read_char_append_utf8(const char*& it, const char* last, st if (read_code_point(it, last, code_point)) output.append(start, it); else - output.append(static_cast(kReplacementCharUtf8)); + output.append(kReplacementCharUtf8); } // ------------------------------------------------------------------------ diff --git a/src/url_utf.cpp b/src/url_utf.cpp index 68c3d48..833084e 100644 --- a/src/url_utf.cpp +++ b/src/url_utf.cpp @@ -38,7 +38,7 @@ void url_utf::check_fix_utf8(std::string& str) { // replace invalid UTF-8 byte sequences with replacement char std::string buff; buff.append(first, ptr); - buff.append(static_cast(kReplacementCharUtf8)); + buff.append(kReplacementCharUtf8); const char* bgn = it; ptr = it; @@ -47,7 +47,7 @@ void url_utf::check_fix_utf8(std::string& str) { ptr = it; } else { buff.append(bgn, ptr); - buff.append(static_cast(kReplacementCharUtf8)); + buff.append(kReplacementCharUtf8); bgn = it; ptr = it; } @@ -93,9 +93,6 @@ int url_utf::compare_by_code_units(const char* first1, const char* last1, const return 0; } -// Replacement character -const char url_utf::kReplacementCharUtf8[] = "\xEF\xBF\xBD"; - // // (c) 2016 and later: Unicode, Inc. and others. // License & terms of use: https://www.unicode.org/copyright.html From 229a8c79633a237071781bff5973688a3e727429 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rimas=20Misevi=C4=8Dius?= Date: Mon, 4 Nov 2024 21:40:18 +0200 Subject: [PATCH 3/3] Add `constexpr` to `url_utf::read_utf_char` function --- LICENSE | 4 ++-- include/upa/url_utf.h | 32 +++++++++++++++++++++++++------- src/url_utf.cpp | 22 +--------------------- 3 files changed, 28 insertions(+), 30 deletions(-) diff --git a/LICENSE b/LICENSE index 69fafad..a527771 100644 --- a/LICENSE +++ b/LICENSE @@ -58,8 +58,8 @@ portions of modified code from the Chromium project licensed as follows: ------------------------------------------------------------------------------- -Files config.h, url_utf.cpp, url_utf.h contains portions of modified code from -the ICU project licensed as follows: +Files config.h, url_utf.h contains portions of modified code from the ICU +project licensed as follows: UNICODE LICENSE V3 diff --git a/include/upa/url_utf.h b/include/upa/url_utf.h index 915c0e1..d5a120b 100644 --- a/include/upa/url_utf.h +++ b/include/upa/url_utf.h @@ -2,6 +2,9 @@ // Distributed under the BSD-style license that can be // found in the LICENSE file. // +// This file contains portions of modified code from the ICU project. +// Copyright (c) 2016-2023 Unicode, Inc. +// #ifndef UPA_URL_UTF_H #define UPA_URL_UTF_H @@ -17,7 +20,7 @@ namespace upa { class url_utf { public: template - static detail::result_value read_utf_char(const CharT*& first, const CharT* last) noexcept; + static constexpr detail::result_value read_utf_char(const CharT*& first, const CharT* last) noexcept; template static void read_char_append_utf8(const CharT*& it, const CharT* last, std::string& output); @@ -41,15 +44,30 @@ class url_utf { static int compare_by_code_units(const char* first1, const char* last1, const char* first2, const char* last2) noexcept; protected: // low level - static bool read_code_point(const char*& first, const char* last, uint32_t& code_point) noexcept; + static constexpr bool read_code_point(const char*& first, const char* last, uint32_t& code_point) noexcept; static constexpr bool read_code_point(const char16_t*& first, const char16_t* last, uint32_t& code_point) noexcept; static constexpr bool read_code_point(const char32_t*& first, const char32_t* last, uint32_t& code_point) noexcept; private: // Replacement character (U+FFFD) static inline constexpr std::string_view kReplacementCharUtf8{ "\xEF\xBF\xBD" }; - const static uint8_t k_U8_LEAD3_T1_BITS[16]; - const static uint8_t k_U8_LEAD4_T1_BITS[16]; + // Following two arrays have values from corresponding macros in ICU 74.1 library's + // include\unicode\utf8.h file. + + // Internal bit vector for 3-byte UTF-8 validity check, for use in U8_IS_VALID_LEAD3_AND_T1. + // Each bit indicates whether one lead byte + first trail byte pair starts a valid sequence. + // Lead byte E0..EF bits 3..0 are used as byte index, + // first trail byte bits 7..5 are used as bit index into that byte. + static inline constexpr uint8_t k_U8_LEAD3_T1_BITS[16] = { + 0x20, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x10, 0x30, 0x30 + }; + // Internal bit vector for 4-byte UTF-8 validity check, for use in U8_IS_VALID_LEAD4_AND_T1. + // Each bit indicates whether one lead byte + first trail byte pair starts a valid sequence. + // First trail byte bits 7..4 are used as byte index, + // lead byte F0..F4 bits 2..0 are used as bit index into that byte. + static inline constexpr uint8_t k_U8_LEAD4_T1_BITS[16] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x1E, 0x0F, 0x0F, 0x0F, 0x00, 0x00, 0x00, 0x00 + }; }; @@ -72,9 +90,9 @@ class url_utf { // and advances `first` to point to the next character. template -inline detail::result_value url_utf::read_utf_char(const CharT*& first, const CharT* last) noexcept { +constexpr detail::result_value url_utf::read_utf_char(const CharT*& first, const CharT* last) noexcept { // read_code_point always initializes code_point - uint32_t code_point; // NOLINT(cppcoreguidelines-init-variables) + uint32_t code_point{}; if (read_code_point(first, last, code_point)) return { true, code_point }; return { false, 0xFFFD }; // REPLACEMENT CHARACTER @@ -113,7 +131,7 @@ inline void url_utf::read_char_append_utf8(const char*& it, const char* last, st // Modified version of the U8_INTERNAL_NEXT_OR_SUB macro in utf8.h from ICU -inline bool url_utf::read_code_point(const char*& first, const char* last, uint32_t& c) noexcept { +constexpr bool url_utf::read_code_point(const char*& first, const char* last, uint32_t& c) noexcept { c = static_cast(*first++); if (c & 0x80) { uint8_t tmp = 0; diff --git a/src/url_utf.cpp b/src/url_utf.cpp index 833084e..ba1b794 100644 --- a/src/url_utf.cpp +++ b/src/url_utf.cpp @@ -1,4 +1,4 @@ -// Copyright 2016-2023 Rimas Misevičius +// Copyright 2016-2024 Rimas Misevičius // Distributed under the BSD-style license that can be // found in the LICENSE file. // @@ -93,24 +93,4 @@ int url_utf::compare_by_code_units(const char* first1, const char* last1, const return 0; } -// -// (c) 2016 and later: Unicode, Inc. and others. -// License & terms of use: https://www.unicode.org/copyright.html -// - -// Following two arrays have values from corresponding macros in ICU 74.1 library's -// include\unicode\utf8.h file. - -// Internal bit vector for 3-byte UTF-8 validity check, for use in U8_IS_VALID_LEAD3_AND_T1. -// Each bit indicates whether one lead byte + first trail byte pair starts a valid sequence. -// Lead byte E0..EF bits 3..0 are used as byte index, -// first trail byte bits 7..5 are used as bit index into that byte. -const uint8_t url_utf::k_U8_LEAD3_T1_BITS[16] = { 0x20, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x10, 0x30, 0x30 }; - -// Internal bit vector for 4-byte UTF-8 validity check, for use in U8_IS_VALID_LEAD4_AND_T1. -// Each bit indicates whether one lead byte + first trail byte pair starts a valid sequence. -// First trail byte bits 7..4 are used as byte index, -// lead byte F0..F4 bits 2..0 are used as bit index into that byte. -const uint8_t url_utf::k_U8_LEAD4_T1_BITS[16] = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x1E, 0x0F, 0x0F, 0x0F, 0x00, 0x00, 0x00, 0x00 }; - } // namespace upa