diff --git a/CMakeLists.txt b/CMakeLists.txt index ddb3b39..3748187 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,10 +1,9 @@ -# FindICU requires 3.7 # add_link_options() requires 3.13 cmake_minimum_required(VERSION 3.13) set(CMAKE_SUPPRESS_REGENERATION true) -# use ICU_ROOT +# https://cmake.org/cmake/help/latest/policy/CMP0074.html if(POLICY CMP0074) cmake_policy(SET CMP0074 NEW) endif() @@ -23,7 +22,6 @@ project(upa_url VERSION ${UPA_URL_VERSION} LANGUAGES CXX) # ${upa_lib_name}-config.cmake # It also must be used as the package name argument to find_package set(upa_lib_name upa) -set(upa_lib_name_in upa-icu) # Exported name for library target files; also used to create an alias # target: upa::${upa_lib_export} set(upa_lib_export url) @@ -55,7 +53,6 @@ option(UPA_BUILD_TOOLS "Build tools." OFF) option(UPA_INSTALL "Generate the install target." ON) # library options option(UPA_AMALGAMATED "Use amalgamated URL library source." OFF) -option(UPA_USE_WINDOWS_ICU "Use ICU library bundled with Windows 10 version 1903 or later." OFF) # tests build options option(UPA_TEST_COVERAGE "Build tests with code coverage reporting" OFF) option(UPA_TEST_COVERAGE_CLANG "Build tests with Clang source-based code coverage" OFF) @@ -114,14 +111,9 @@ endif() include_directories(deps) -# Are Upa URL and ICU libraries needed? +# Is the Upa URL library needed? if (UPA_BUILD_TESTS OR UPA_BUILD_BENCH OR UPA_BUILD_FUZZER OR UPA_BUILD_EXAMPLES OR UPA_BUILD_EXTRACTED OR UPA_INSTALL OR NOT UPA_BUILD_TOOLS) - if (NOT UPA_USE_WINDOWS_ICU) - # This library depends on ICU - find_package(ICU REQUIRED COMPONENTS i18n uc) - endif() - if (UPA_AMALGAMATED) add_library(${upa_lib_target} STATIC single_include/upa/url.cpp) @@ -131,7 +123,6 @@ if (UPA_BUILD_TESTS OR UPA_BUILD_BENCH OR UPA_BUILD_FUZZER OR UPA_BUILD_EXAMPLES add_library(${upa_lib_target} STATIC src/idna.cpp src/url.cpp - src/url_idna.cpp src/url_ip.cpp src/url_percent_encode.cpp src/url_search_params.cpp @@ -143,13 +134,6 @@ if (UPA_BUILD_TESTS OR UPA_BUILD_BENCH OR UPA_BUILD_FUZZER OR UPA_BUILD_EXAMPLES add_library(upa::${upa_lib_export} ALIAS ${upa_lib_target}) set_target_properties(${upa_lib_target} PROPERTIES EXPORT_NAME ${upa_lib_export}) - if (UPA_USE_WINDOWS_ICU) - target_compile_definitions(${upa_lib_target} PRIVATE UPA_USE_WINDOWS_ICU=1) - set(upa_lib_name_in ${upa_lib_name}) - else() - target_include_directories(${upa_lib_target} PRIVATE ${ICU_INCLUDE_DIR}) - target_link_libraries(${upa_lib_target} INTERFACE ICU::i18n ICU::uc) - endif() endif() # Test targets @@ -274,7 +258,7 @@ if (UPA_INSTALL AND NOT UPA_AMALGAMATED) # generate the config file that includes the exports configure_package_config_file( - ${CMAKE_CURRENT_SOURCE_DIR}/cmake/${upa_lib_name_in}-config.cmake.in + ${CMAKE_CURRENT_SOURCE_DIR}/cmake/${upa_lib_name}-config.cmake.in ${CMAKE_CURRENT_BINARY_DIR}/${upa_lib_name}-config.cmake INSTALL_DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/${upa_lib_name} NO_SET_AND_CHECK_MACRO diff --git a/cmake/upa-icu-config.cmake.in b/cmake/upa-icu-config.cmake.in deleted file mode 100644 index ad8d69b..0000000 --- a/cmake/upa-icu-config.cmake.in +++ /dev/null @@ -1,6 +0,0 @@ -@PACKAGE_INIT@ - -include(CMakeFindDependencyMacro) -find_dependency(ICU REQUIRED COMPONENTS i18n uc) - -include("${CMAKE_CURRENT_LIST_DIR}/upa-targets.cmake") diff --git a/include/upa/url_host.h b/include/upa/url_host.h index 0a8b19f..1444593 100644 --- a/include/upa/url_host.h +++ b/include/upa/url_host.h @@ -6,18 +6,10 @@ #ifndef UPA_URL_HOST_H #define UPA_URL_HOST_H -#ifndef UPA_USE_ICU -#define UPA_USE_ICU 0 // NOLINT(*-macro-*) -#endif // UPA_USE_ICU - #include "buffer.h" #include "config.h" +#include "idna.h" #include "str_arg.h" -#if UPA_USE_ICU -# include "url_idna.h" -#else -# include "idna.h" -#endif #include "url_ip.h" #include "url_percent_encode.h" #include "url_result.h" @@ -221,62 +213,6 @@ inline validation_errc host_parser::parse_host(const CharT* first, const CharT* return validation_errc::domain_invalid_code_point; } -#if UPA_USE_ICU - // Input for domain_to_ascii - simple_buffer buff_uc; - - // copy ASCII chars - for (auto it = first; it != ptr; ++it) { - const auto uch = static_cast(*it); - buff_uc.push_back(static_cast(uch)); - } - - // Let buff_uc be the result of running UTF-8 decode (to UTF-16) without BOM - // on the percent decoding of UTF-8 encode on input - for (auto it = ptr; it != last;) { - const auto uch = static_cast(*it++); - if (uch < 0x80) { - if (uch != '%') { - buff_uc.push_back(static_cast(uch)); - continue; - } - // uch == '%' - unsigned char uc8; // NOLINT(cppcoreguidelines-init-variables) - if (detail::decode_hex_to_byte(it, last, uc8)) { - if (uc8 < 0x80) { - buff_uc.push_back(static_cast(uc8)); - continue; - } - // percent encoded utf-8 sequence - // TODO: gal po vieną code_point, tuomet užtektų utf-8 buferio vienam simboliui - simple_buffer buff_utf8; - buff_utf8.push_back(static_cast(uc8)); - while (it != last && *it == '%') { - ++it; // skip '%' - if (!detail::decode_hex_to_byte(it, last, uc8)) - uc8 = '%'; - buff_utf8.push_back(static_cast(uc8)); - } - url_utf::convert_utf8_to_utf16(buff_utf8.data(), buff_utf8.data() + buff_utf8.size(), buff_uc); - //buff_utf8.clear(); - continue; - } - // detected an invalid percent-encoding sequence - buff_uc.push_back('%'); - } else { // uch >= 0x80 - --it; - url_utf::append_utf16(url_utf::read_utf_char(it, last).value, buff_uc); - } - } - - - // domain to ASCII - simple_buffer buff_ascii; - - const auto res = domain_to_ascii(buff_uc.data(), buff_uc.size(), buff_ascii); - if (res != validation_errc::ok) - return res; -#else std::string buff_ascii; const auto pes = std::find(ptr, last, '%'); @@ -337,7 +273,6 @@ inline validation_errc host_parser::parse_host(const CharT* first, const CharT* if (!idna::domain_to_ascii(buff_ascii, buff_uc.begin(), buff_uc.end())) return validation_errc::domain_to_ascii; } -#endif if (detail::contains_forbidden_domain_char(buff_ascii.data(), buff_ascii.data() + buff_ascii.size())) { // 7. If asciiDomain contains a forbidden domain code point, domain-invalid-code-point diff --git a/src/url_idna.cpp b/src/url_idna.cpp deleted file mode 100644 index 9a8030b..0000000 --- a/src/url_idna.cpp +++ /dev/null @@ -1,230 +0,0 @@ -// Copyright 2016-2024 Rimas Misevičius -// Distributed under the BSD-style license that can be -// found in the LICENSE file. -// -// This file contains portions of modified code from: -// https://cs.chromium.org/chromium/src/url/url_idna_icu.cc -// Copyright 2013 The Chromium Authors. All rights reserved. -// - -// Define UPA_USE_WINDOWS_ICU = 1 to use the ICU library bundled with -// Windows 10 version 1903 or later. For more information, see: -// https://learn.microsoft.com/en-us/windows/win32/intl/international-components-for-unicode--icu- -#ifndef UPA_USE_WINDOWS_ICU -# define UPA_USE_WINDOWS_ICU 0 // NOLINT(*-macro-*) -#endif // UPA_USE_WINDOWS_ICU - -#include "upa/config.h" -#include "upa/url_idna.h" -#include "upa/util.h" - -#if UPA_USE_WINDOWS_ICU -# include -# pragma comment( lib, "icu" ) -#else -// ICU: only C API is used (U_SHOW_CPLUSPLUS_API 0) -// https://unicode-org.github.io/icu/userguide/icu4c/build.html#icu-as-a-system-level-library -# define U_SHOW_CPLUSPLUS_API 0 // NOLINT(*-macro-*) -# include // u_getUnicodeVersion -# include // u_cleanup -# include -# include // u_getVersion -#endif - -#include -#include -#include // uint32_t - -namespace upa { - -namespace { - -// Return UTS46 ICU handler opened with uidna_openUTS46() - -// NOLINTBEGIN(cppcoreguidelines-avoid-non-const-global-variables) -UIDNA* uidna_ptr = nullptr; -unsigned icu_version_major = 0; -// NOLINTEND(cppcoreguidelines-avoid-non-const-global-variables) - -const UIDNA* get_uidna() { - // initialize uidna_ptr - static struct Once { - Once() { - UErrorCode err = U_ZERO_ERROR; - // https://url.spec.whatwg.org/#idna - // UseSTD3ASCIIRules = false - // Transitional_Processing = false - // CheckBidi = true - // CheckJoiners = true - uidna_ptr = uidna_openUTS46( - UIDNA_CHECK_BIDI - | UIDNA_CHECK_CONTEXTJ - | UIDNA_NONTRANSITIONAL_TO_ASCII - | UIDNA_NONTRANSITIONAL_TO_UNICODE, &err); - assert(U_SUCCESS(err) && uidna_ptr != nullptr); - // Get ICU major version - UVersionInfo ver; - u_getVersion(ver); // NOLINT(cppcoreguidelines-pro-bounds-array-to-pointer-decay,hicpp-no-array-decay) - icu_version_major = ver[0]; - } - } const once; - - return uidna_ptr; -} - -} // namespace - - -void idna_close(bool close_lib) { - if (uidna_ptr) { - uidna_close(uidna_ptr); - uidna_ptr = nullptr; - } - if (close_lib) { - // ICU cleanup - u_cleanup(); - } -} - -unsigned idna_unicode_version() { - UVersionInfo ver; - u_getUnicodeVersion(ver); // NOLINT(cppcoreguidelines-pro-bounds-array-to-pointer-decay,hicpp-no-array-decay) - return make_unicode_version(ver[0], ver[1], ver[2], ver[3]); -} - -// Conversion to ICU UChar - -namespace { - -static_assert(sizeof(UChar) == sizeof(char16_t), "UChar must be the same size as char16_t"); - -inline const UChar* to_UChar_ptr(const char16_t* p) noexcept { - UPA_ALIASING_BARRIER(p) - return reinterpret_cast(p); -} - -inline UChar* to_UChar_ptr(char16_t* p) noexcept { - UPA_ALIASING_BARRIER(p) - return reinterpret_cast(p); -} - -} // namespace - -// Implements the domain to ASCII algorithm -// https://url.spec.whatwg.org/#concept-domain-to-ascii -// with beStrict = false - -validation_errc domain_to_ascii(const char16_t* src, std::size_t src_len, simple_buffer& output) { - // https://url.spec.whatwg.org/#concept-domain-to-ascii - // https://www.unicode.org/reports/tr46/#ToASCII - static constexpr uint32_t UIDNA_ERR_MASK = ~static_cast( - // VerifyDnsLength = false - UIDNA_ERROR_EMPTY_LABEL - | UIDNA_ERROR_LABEL_TOO_LONG - | UIDNA_ERROR_DOMAIN_NAME_TOO_LONG - // CheckHyphens = false - | UIDNA_ERROR_LEADING_HYPHEN - | UIDNA_ERROR_TRAILING_HYPHEN - | UIDNA_ERROR_HYPHEN_3_4 - ); - - // uidna_nameToASCII uses int32_t length - // https://unicode-org.github.io/icu-docs/apidoc/dev/icu4c/uidna_8h.html#ac45d3ad275df9e5a2c2e84561862d005 - if (src_len > util::unsigned_limit::max()) - return validation_errc::overflow; // too long - - // The static_cast(output.capacity()) must be safe: - assert(output.capacity() <= util::unsigned_limit::max()); - - const UIDNA* uidna = get_uidna(); - assert(uidna != nullptr); - while (true) { - UErrorCode err = U_ZERO_ERROR; - UIDNAInfo info = UIDNA_INFO_INITIALIZER; - const int32_t output_length = uidna_nameToASCII(uidna, - to_UChar_ptr(src), static_cast(src_len), - to_UChar_ptr(output.data()), static_cast(output.capacity()), - &info, &err); - if (U_SUCCESS(err) && (info.errors & UIDNA_ERR_MASK) == 0) { - output.resize(output_length); - // 3. If result is the empty string, domain-to-ASCII validation error, return failure. - // - // Note. Result of uidna_nameToASCII can be the empty string if input: - // 1) consists entirely of IDNA ignored code points; - // 2) is "xn--". - if (output_length == 0) - return validation_errc::domain_to_ascii; - if (icu_version_major < 68) { - // Workaround of ICU bug ICU-21212: https://unicode-org.atlassian.net/browse/ICU-21212 - // For some "xn--" labels which contain non ASCII chars, uidna_nameToASCII returns no error, - // and leaves these labels unchanged in the output. Bug fixed in ICU 68.1 - if (std::any_of(output.begin(), output.end(), [](char16_t c) { return c >= 0x80; })) - return validation_errc::domain_to_ascii; - } - return validation_errc::ok; - } - - if (err != U_BUFFER_OVERFLOW_ERROR || (info.errors & UIDNA_ERR_MASK) != 0) - // 2. If result is a failure value, domain-to-ASCII validation error, return failure. - return validation_errc::domain_to_ascii; - - // Not enough room in our buffer, expand. - output.reserve(output_length); - } -} - -// Implements the domain to Unicode algorithm -// https://url.spec.whatwg.org/#concept-domain-to-unicode -// with beStrict = false - -validation_errc domain_to_unicode(const char* src, std::size_t src_len, simple_buffer& output) { -#if 0 - // https://url.spec.whatwg.org/#concept-domain-to-unicode - // https://www.unicode.org/reports/tr46/#ToUnicode - static constexpr uint32_t UIDNA_ERR_MASK = ~static_cast( - // VerifyDnsLength = false - UIDNA_ERROR_EMPTY_LABEL - | UIDNA_ERROR_LABEL_TOO_LONG - | UIDNA_ERROR_DOMAIN_NAME_TOO_LONG - // CheckHyphens = false - | UIDNA_ERROR_LEADING_HYPHEN - | UIDNA_ERROR_TRAILING_HYPHEN - | UIDNA_ERROR_HYPHEN_3_4 - ); -#endif - - // uidna_nameToUnicodeUTF8 uses int32_t length - // https://unicode-org.github.io/icu-docs/apidoc/dev/icu4c/uidna_8h.html#afd9ae1e0ae5318e20c87bcb0149c3ada - if (src_len > util::unsigned_limit::max()) - return validation_errc::overflow; // too long - - // The static_cast(output.capacity()) must be safe: - assert(output.capacity() <= util::unsigned_limit::max()); - - const UIDNA* uidna = get_uidna(); - assert(uidna != nullptr); - while (true) { - UErrorCode err = U_ZERO_ERROR; - UIDNAInfo info = UIDNA_INFO_INITIALIZER; - const int32_t output_length = uidna_nameToUnicodeUTF8(uidna, - src, static_cast(src_len), - output.data(), static_cast(output.capacity()), - &info, &err); - if (U_SUCCESS(err)) { - output.resize(output_length); - // https://url.spec.whatwg.org/#concept-domain-to-unicode - // TODO: Signify domain-to-Unicode validation errors for any returned errors (i.e. - // if (info.errors & UIDNA_ERR_MASK) != 0), and then, return result. - return validation_errc::ok; - } - - if (err != U_BUFFER_OVERFLOW_ERROR) - return validation_errc::domain_to_unicode; - - // Not enough room in our buffer, expand. - output.reserve(output_length); - } -} - - -} // namespace upa diff --git a/test/doctest-main.h b/test/doctest-main.h index 9729218..99c9456 100644 --- a/test/doctest-main.h +++ b/test/doctest-main.h @@ -1,11 +1,10 @@ -// Copyright 2016-2023 Rimas Misevičius +// Copyright 2016-2024 Rimas Misevičius // Distributed under the BSD-style license that can be // found in the LICENSE file. // #define DOCTEST_CONFIG_IMPLEMENT #include "doctest/doctest.h" -#include "url_cleanup.h" // The main() entry point // https://github.com/doctest/doctest/blob/master/doc/markdown/main.md @@ -17,10 +16,5 @@ int main(int argc, char** argv) { context.applyCommandLine(argc, argv); // run test cases - const int res = context.run(); - - // Free memory - upa::url_cleanup(); - - return res; + return context.run(); } diff --git a/test/test-url_host.cpp b/test/test-url_host.cpp index d3beebe..2724d12 100644 --- a/test/test-url_host.cpp +++ b/test/test-url_host.cpp @@ -1,4 +1,4 @@ -// Copyright 2016-2023 Rimas Misevičius +// Copyright 2016-2024 Rimas Misevičius // Distributed under the BSD-style license that can be // found in the LICENSE file. // @@ -247,6 +247,7 @@ TEST_SUITE("url_host") { } } +#if 0 // Test upa::domain_to_unicode function @@ -269,3 +270,5 @@ TEST_SUITE("domain_to_unicode") { CHECK(upa::domain_to_unicode("xn--a.op", 8, output) == upa::validation_errc::ok); } } + +#endif diff --git a/test/url_cleanup.h b/test/url_cleanup.h deleted file mode 100644 index 7635b38..0000000 --- a/test/url_cleanup.h +++ /dev/null @@ -1,21 +0,0 @@ -// Copyright 2016-2023 Rimas Misevičius -// Distributed under the BSD-style license that can be -// found in the LICENSE file. -// - -#ifndef UPA_URL_CLEANUP_H -#define UPA_URL_CLEANUP_H - -// For IDNA library cleanup -#include "upa/url_idna.h" - -namespace upa { - -inline void url_cleanup() -{ - upa::idna_close(true); -} - -} - -#endif // UPA_URL_CLEANUP_H diff --git a/test/wpt-url.cpp b/test/wpt-url.cpp index 85bfc1e..9ee8e83 100644 --- a/test/wpt-url.cpp +++ b/test/wpt-url.cpp @@ -10,7 +10,6 @@ #include "upa/url.h" #include "ddt/DataDrivenTest.hpp" #include "test-utils.h" -#include "url_cleanup.h" #include "picojson_util.h" @@ -42,7 +41,7 @@ int main(int argc, char** argv) err |= test_from_file(run_host_parser_tests, "wpt/toascii.json"); err |= test_from_file(run_setter_tests, "wpt/setters_tests.json"); err |= test_from_file(run_percent_encoding_tests, "wpt/percent-encoding.json"); - if (upa::idna_unicode_version() >= upa::make_unicode_version(13)) { + if (upa::idna::unicode_version() >= upa::idna::make_unicode_version(13)) { // Only the IDNA library that conforms to Unicode 13.0 or later // (e.g., ICU 66.1 or later) passes all IdnaTestV2.json tests err |= test_from_file(run_idna_v2_tests, "wpt/IdnaTestV2.json", "data/IdnaTestV2-fixes.json"); @@ -53,9 +52,6 @@ int main(int argc, char** argv) err |= test_from_file(run_host_parser_tests, "data/my-toascii.json"); err |= test_from_file(run_setter_tests, "data/my-setters_tests.json"); - // Free memory - upa::url_cleanup(); - return err; } diff --git a/test/wpt-urlencoded-parser.cpp b/test/wpt-urlencoded-parser.cpp index 0599389..b99280e 100644 --- a/test/wpt-urlencoded-parser.cpp +++ b/test/wpt-urlencoded-parser.cpp @@ -1,10 +1,9 @@ -// Copyright 2016-2023 Rimas Misevičius +// Copyright 2016-2024 Rimas Misevičius // Distributed under the BSD-style license that can be // found in the LICENSE file. // #include "upa/url.h" -#include "url_cleanup.h" // https://github.com/kazuho/picojson #include "picojson_util.h" @@ -37,9 +36,6 @@ int main(int argc, char** argv) // NOTE: "Sorting non-existent params removes ? from URL" test based on // urlsearchparams-sort.any.js is in the wpt-url_search_params.cpp - // Free memory - upa::url_cleanup(); - return err; }