From a5af0ab77caa5ed7a6844fc5f4f459e5edfe23d3 Mon Sep 17 00:00:00 2001 From: Tarek Mahmoud Sayed <10833894+tarekgh@users.noreply.github.com> Date: Mon, 9 Dec 2024 15:31:45 -0800 Subject: [PATCH] Normalization APIs using the spans (#110465) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Normalization APIs using the spans * Address the feedback * Update src/libraries/System.Private.CoreLib/src/System/Globalization/Normalization.Icu.cs Co-authored-by: Günther Foidl * Fix comment indent --------- Co-authored-by: Günther Foidl Co-authored-by: Eric StJohn --- .../src/Resources/Strings.resx | 57 ++++--- .../System/Globalization/Normalization.Icu.cs | 105 ++++++++++-- .../System/Globalization/Normalization.Nls.cs | 158 +++++++++++++----- .../src/System/Globalization/Normalization.cs | 73 +++++++- .../System/StringNormalizationExtensions.cs | 57 +++++++ .../System.Runtime/ref/System.Runtime.cs | 3 + .../Normalization/NormalizationAll.cs | 44 +++++ .../Normalization/StringNormalizationTests.cs | 53 +++++- 8 files changed, 461 insertions(+), 89 deletions(-) diff --git a/src/libraries/System.Private.CoreLib/src/Resources/Strings.resx b/src/libraries/System.Private.CoreLib/src/Resources/Strings.resx index e7dadba40a539..3e91b3ecb8340 100644 --- a/src/libraries/System.Private.CoreLib/src/Resources/Strings.resx +++ b/src/libraries/System.Private.CoreLib/src/Resources/Strings.resx @@ -1,17 +1,17 @@  - @@ -1315,6 +1315,9 @@ Invalid or unsupported normalization form. + + `NormalizationForm.FormKC` and `NormalizationForm.FormKD` are not supported in browser environments or WebAssembly. + An undefined NumberStyles value is being used. 
diff --git a/src/libraries/System.Private.CoreLib/src/System/Globalization/Normalization.Icu.cs b/src/libraries/System.Private.CoreLib/src/System/Globalization/Normalization.Icu.cs index 6ef9df95aa79d..076629fa32b84 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Globalization/Normalization.Icu.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Globalization/Normalization.Icu.cs @@ -10,31 +10,33 @@ namespace System.Globalization { internal static partial class Normalization { - private static unsafe bool IcuIsNormalized(string strInput, NormalizationForm normalizationForm) + private static unsafe bool IcuIsNormalized(ReadOnlySpan source, NormalizationForm normalizationForm) { Debug.Assert(!GlobalizationMode.Invariant); Debug.Assert(!GlobalizationMode.UseNls); + Debug.Assert(!source.IsEmpty); + Debug.Assert(normalizationForm is NormalizationForm.FormC or NormalizationForm.FormD or NormalizationForm.FormKC or NormalizationForm.FormKD); - ValidateArguments(strInput, normalizationForm); + ValidateArguments(source, normalizationForm, nameof(source)); int ret; - fixed (char* pInput = strInput) + fixed (char* pInput = source) { #if TARGET_MACCATALYST || TARGET_IOS || TARGET_TVOS if (GlobalizationMode.Hybrid) { - ret = Interop.Globalization.IsNormalizedNative(normalizationForm, pInput, strInput.Length); + ret = Interop.Globalization.IsNormalizedNative(normalizationForm, pInput, source.Length); } else #endif { - ret = Interop.Globalization.IsNormalized(normalizationForm, pInput, strInput.Length); + ret = Interop.Globalization.IsNormalized(normalizationForm, pInput, source.Length); } } if (ret == -1) { - throw new ArgumentException(SR.Argument_InvalidCharSequenceNoIndex, nameof(strInput)); + throw new ArgumentException(SR.Argument_InvalidCharSequenceNoIndex, nameof(source)); } return ret == 1; @@ -44,6 +46,7 @@ private static unsafe string IcuNormalize(string strInput, NormalizationForm nor { Debug.Assert(!GlobalizationMode.Invariant); 
Debug.Assert(!GlobalizationMode.UseNls); + Debug.Assert(normalizationForm == NormalizationForm.FormC || normalizationForm == NormalizationForm.FormD || normalizationForm == NormalizationForm.FormKC || normalizationForm == NormalizationForm.FormKD); ValidateArguments(strInput, normalizationForm); @@ -114,25 +117,95 @@ private static unsafe string IcuNormalize(string strInput, NormalizationForm nor } } - private static void ValidateArguments(string strInput, NormalizationForm normalizationForm) + private static unsafe bool IcuTryNormalize(ReadOnlySpan source, Span destination, out int charsWritten, NormalizationForm normalizationForm = NormalizationForm.FormC) { - Debug.Assert(strInput != null); + Debug.Assert(!GlobalizationMode.Invariant); + Debug.Assert(!GlobalizationMode.UseNls); + Debug.Assert(!source.IsEmpty); + Debug.Assert(normalizationForm == NormalizationForm.FormC || normalizationForm == NormalizationForm.FormD || normalizationForm == NormalizationForm.FormKC || normalizationForm == NormalizationForm.FormKD); - if ((OperatingSystem.IsBrowser() || OperatingSystem.IsWasi())&& (normalizationForm == NormalizationForm.FormKC || normalizationForm == NormalizationForm.FormKD)) + if (destination.IsEmpty) { - // Browser's ICU doesn't contain data needed for FormKC and FormKD - throw new PlatformNotSupportedException(); + charsWritten = 0; + return false; + } + + ValidateArguments(source, normalizationForm, nameof(source)); + + int realLen; + fixed (char* pInput = source) + fixed (char* pDest = destination) + { +#if TARGET_MACCATALYST || TARGET_IOS || TARGET_TVOS + if (GlobalizationMode.Hybrid) + { + realLen = Interop.Globalization.NormalizeStringNative(normalizationForm, pInput, source.Length, pDest, destination.Length); + } + else +#endif + { + realLen = Interop.Globalization.NormalizeString(normalizationForm, pInput, source.Length, pDest, destination.Length); + } + } + + if (realLen < 0) + { + throw new ArgumentException(SR.Argument_InvalidCharSequenceNoIndex, 
nameof(source)); + } + + if (realLen <= destination.Length) + { + charsWritten = realLen; + return true; + } + + charsWritten = 0; + return false; + } + + private static unsafe int IcuGetNormalizedLength(ReadOnlySpan source, NormalizationForm normalizationForm) + { + Debug.Assert(!GlobalizationMode.Invariant); + Debug.Assert(!GlobalizationMode.UseNls); + Debug.Assert(!source.IsEmpty); + Debug.Assert(normalizationForm == NormalizationForm.FormC || normalizationForm == NormalizationForm.FormD || normalizationForm == NormalizationForm.FormKC || normalizationForm == NormalizationForm.FormKD); + + ValidateArguments(source, normalizationForm, nameof(source)); + + int realLen; + fixed (char* pInput = source) + { +#if TARGET_MACCATALYST || TARGET_IOS || TARGET_TVOS + if (GlobalizationMode.Hybrid) + { + realLen = Interop.Globalization.NormalizeStringNative(normalizationForm, pInput, source.Length, null, 0); + } + else +#endif + { + realLen = Interop.Globalization.NormalizeString(normalizationForm, pInput, source.Length, null, 0); + } + } + + if (realLen < 0) + { + throw new ArgumentException(SR.Argument_InvalidCharSequenceNoIndex, nameof(source)); } - if (normalizationForm != NormalizationForm.FormC && normalizationForm != NormalizationForm.FormD && - normalizationForm != NormalizationForm.FormKC && normalizationForm != NormalizationForm.FormKD) + return realLen; + } + + private static void ValidateArguments(ReadOnlySpan strInput, NormalizationForm normalizationForm, string paramName = "strInput") + { + if ((OperatingSystem.IsBrowser() || OperatingSystem.IsWasi()) && (normalizationForm == NormalizationForm.FormKC || normalizationForm == NormalizationForm.FormKD)) { - throw new ArgumentException(SR.Argument_InvalidNormalizationForm, nameof(normalizationForm)); + // Browser's ICU doesn't contain data needed for FormKC and FormKD + throw new PlatformNotSupportedException(SR.Argument_UnsupportedNormalizationFormInBrowser); } if (HasInvalidUnicodeSequence(strInput)) { - throw 
new ArgumentException(SR.Argument_InvalidCharSequenceNoIndex, nameof(strInput)); + throw new ArgumentException(SR.Argument_InvalidCharSequenceNoIndex, paramName); } } @@ -143,7 +216,7 @@ private static void ValidateArguments(string strInput, NormalizationForm normali /// We walk the string ourselves looking for these bad sequences so we can continue to throw /// ArgumentException in these cases. /// - private static bool HasInvalidUnicodeSequence(string s) + private static bool HasInvalidUnicodeSequence(ReadOnlySpan s) { for (int i = 0; i < s.Length; i++) { diff --git a/src/libraries/System.Private.CoreLib/src/System/Globalization/Normalization.Nls.cs b/src/libraries/System.Private.CoreLib/src/System/Globalization/Normalization.Nls.cs index 3abea57252989..2e63cd5daa5b8 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Globalization/Normalization.Nls.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Globalization/Normalization.Nls.cs @@ -3,6 +3,7 @@ using System.Buffers; using System.Diagnostics; +using System.Runtime.CompilerServices; using System.Runtime.InteropServices; using System.Text; @@ -10,45 +11,23 @@ namespace System.Globalization { internal static partial class Normalization { - private static unsafe bool NlsIsNormalized(string strInput, NormalizationForm normalizationForm) + private static unsafe bool NlsIsNormalized(ReadOnlySpan source, NormalizationForm normalizationForm) { Debug.Assert(!GlobalizationMode.Invariant); Debug.Assert(GlobalizationMode.UseNls); - Debug.Assert(strInput != null); + Debug.Assert(!source.IsEmpty); + Debug.Assert(normalizationForm == NormalizationForm.FormC || normalizationForm == NormalizationForm.FormD || normalizationForm == NormalizationForm.FormKC || normalizationForm == NormalizationForm.FormKD); - // The only way to know if IsNormalizedString failed is through checking the Win32 last error - // IsNormalizedString pinvoke has SetLastError attribute property which will set the last error - // to 0 
(ERROR_SUCCESS) before executing the calls. Interop.BOOL result; - fixed (char* pInput = strInput) + fixed (char* pInput = source) { - result = Interop.Normaliz.IsNormalizedString(normalizationForm, pInput, strInput.Length); + result = Interop.Normaliz.IsNormalizedString(normalizationForm, pInput, source.Length); } - int lastError = Marshal.GetLastPInvokeError(); - switch (lastError) - { - case Interop.Errors.ERROR_SUCCESS: - break; - - case Interop.Errors.ERROR_INVALID_PARAMETER: - case Interop.Errors.ERROR_NO_UNICODE_TRANSLATION: - if (normalizationForm != NormalizationForm.FormC && - normalizationForm != NormalizationForm.FormD && - normalizationForm != NormalizationForm.FormKC && - normalizationForm != NormalizationForm.FormKD) - { - throw new ArgumentException(SR.Argument_InvalidNormalizationForm, nameof(normalizationForm)); - } - - throw new ArgumentException(SR.Argument_InvalidCharSequenceNoIndex, nameof(strInput)); - - case Interop.Errors.ERROR_NOT_ENOUGH_MEMORY: - throw new OutOfMemoryException(); - - default: - throw new InvalidOperationException(SR.Format(SR.UnknownError_Num, lastError)); - } + // The only way to know if IsNormalizedString failed is through checking the Win32 last error + // IsNormalizedString pinvoke has SetLastError attribute property which will set the last error + // to 0 (ERROR_SUCCESS) before executing the calls. 
+ CheckLastErrorAndThrowIfFailed(nameof(source)); return result != Interop.BOOL.FALSE; } @@ -58,6 +37,7 @@ private static unsafe string NlsNormalize(string strInput, NormalizationForm nor Debug.Assert(!GlobalizationMode.Invariant); Debug.Assert(GlobalizationMode.UseNls); Debug.Assert(strInput != null); + Debug.Assert(normalizationForm == NormalizationForm.FormC || normalizationForm == NormalizationForm.FormD || normalizationForm == NormalizationForm.FormKC || normalizationForm == NormalizationForm.FormKD); if (strInput.Length == 0) { @@ -111,14 +91,6 @@ private static unsafe string NlsNormalize(string strInput, NormalizationForm nor case Interop.Errors.ERROR_INVALID_PARAMETER: case Interop.Errors.ERROR_NO_UNICODE_TRANSLATION: - if (normalizationForm != NormalizationForm.FormC && - normalizationForm != NormalizationForm.FormD && - normalizationForm != NormalizationForm.FormKC && - normalizationForm != NormalizationForm.FormKD) - { - throw new ArgumentException(SR.Argument_InvalidNormalizationForm, nameof(normalizationForm)); - } - // Illegal code point or order found. Ie: FFFE or D800 D800, etc. 
throw new ArgumentException(SR.Argument_InvalidCharSequenceNoIndex, nameof(strInput)); @@ -139,5 +111,113 @@ private static unsafe string NlsNormalize(string strInput, NormalizationForm nor } } } + + private static unsafe bool NlsTryNormalize(ReadOnlySpan source, Span destination, out int charsWritten, NormalizationForm normalizationForm = NormalizationForm.FormC) + { + Debug.Assert(!GlobalizationMode.Invariant); + Debug.Assert(GlobalizationMode.UseNls); + Debug.Assert(!source.IsEmpty); + Debug.Assert(normalizationForm == NormalizationForm.FormC || normalizationForm == NormalizationForm.FormD || normalizationForm == NormalizationForm.FormKC || normalizationForm == NormalizationForm.FormKD); + + if (destination.IsEmpty) + { + charsWritten = 0; + return false; + } + + // we depend on Win32 last error when calling NormalizeString + // NormalizeString pinvoke has SetLastError attribute property which will set the last error + // to 0 (ERROR_SUCCESS) before executing the calls. + + int realLength; + fixed (char* pInput = source) + fixed (char* pDest = destination) + { + realLength = Interop.Normaliz.NormalizeString(normalizationForm, pInput, source.Length, pDest, destination.Length); + } + + int lastError = Marshal.GetLastPInvokeError(); + switch (lastError) + { + case Interop.Errors.ERROR_SUCCESS: + charsWritten = realLength; + return true; + + // Do appropriate stuff for the individual errors: + case Interop.Errors.ERROR_INSUFFICIENT_BUFFER: + charsWritten = 0; + return false; + + case Interop.Errors.ERROR_INVALID_PARAMETER: + case Interop.Errors.ERROR_NO_UNICODE_TRANSLATION: + // Illegal code point or order found. Ie: FFFE or D800 D800, etc. + throw new ArgumentException(SR.Argument_InvalidCharSequenceNoIndex, nameof(source)); + + case Interop.Errors.ERROR_NOT_ENOUGH_MEMORY: + throw new OutOfMemoryException(); + + default: + // We shouldn't get here... 
+ throw new InvalidOperationException(SR.Format(SR.UnknownError_Num, lastError)); + } + } + + private static unsafe int NlsGetNormalizedLength(ReadOnlySpan source, NormalizationForm normalizationForm) + { + Debug.Assert(!GlobalizationMode.Invariant); + Debug.Assert(GlobalizationMode.UseNls); + Debug.Assert(!source.IsEmpty); + Debug.Assert(normalizationForm == NormalizationForm.FormC || normalizationForm == NormalizationForm.FormD || normalizationForm == NormalizationForm.FormKC || normalizationForm == NormalizationForm.FormKD); + + // We depend on the Win32 last error when calling NormalizeString. + // The NormalizeString P/Invoke has the SetLastError attribute property, which sets the last error + // to 0 (ERROR_SUCCESS) before executing the call. + + int realLength; + fixed (char* pInput = source) + { + realLength = Interop.Normaliz.NormalizeString(normalizationForm, pInput, source.Length, null, 0); + } + + int lastError = Marshal.GetLastPInvokeError(); + switch (lastError) + { + case Interop.Errors.ERROR_SUCCESS: + return realLength; + + case Interop.Errors.ERROR_INVALID_PARAMETER: + case Interop.Errors.ERROR_NO_UNICODE_TRANSLATION: + // Illegal code point or order found, e.g. FFFE or D800 D800. + throw new ArgumentException(SR.Argument_InvalidCharSequenceNoIndex, nameof(source)); + + case Interop.Errors.ERROR_NOT_ENOUGH_MEMORY: + throw new OutOfMemoryException(); + + default: + // We shouldn't get here...
+ throw new InvalidOperationException(SR.Format(SR.UnknownError_Num, lastError)); + } + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static void CheckLastErrorAndThrowIfFailed(string inputName) + { + int lastError = Marshal.GetLastPInvokeError(); + switch (lastError) + { + case Interop.Errors.ERROR_SUCCESS: + break; + + case Interop.Errors.ERROR_INVALID_PARAMETER: + case Interop.Errors.ERROR_NO_UNICODE_TRANSLATION: + throw new ArgumentException(SR.Argument_InvalidCharSequenceNoIndex, inputName); + + case Interop.Errors.ERROR_NOT_ENOUGH_MEMORY: + throw new OutOfMemoryException(); + + default: + throw new InvalidOperationException(SR.Format(SR.UnknownError_Num, lastError)); + } + } } } diff --git a/src/libraries/System.Private.CoreLib/src/System/Globalization/Normalization.cs b/src/libraries/System.Private.CoreLib/src/System/Globalization/Normalization.cs index d120302a8aa8e..647e097601cd7 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Globalization/Normalization.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Globalization/Normalization.cs @@ -2,28 +2,33 @@ // The .NET Foundation licenses this file to you under the MIT license. using System.Diagnostics; +using System.Runtime.CompilerServices; using System.Text; namespace System.Globalization { internal static partial class Normalization { - internal static bool IsNormalized(string strInput, NormalizationForm normalizationForm) + internal static bool IsNormalized(ReadOnlySpan source, NormalizationForm normalizationForm = NormalizationForm.FormC) { - if (GlobalizationMode.Invariant) + CheckNormalizationForm(normalizationForm); + + // In Invariant mode we assume all characters are normalized. + if (GlobalizationMode.Invariant || source.IsEmpty || Ascii.IsValid(source)) { - // In Invariant mode we assume all characters are normalized. // This is because we don't support any linguistic operation on the strings return true; } return GlobalizationMode.UseNls ? 
- NlsIsNormalized(strInput, normalizationForm) : - IcuIsNormalized(strInput, normalizationForm); + NlsIsNormalized(source, normalizationForm) : + IcuIsNormalized(source, normalizationForm); } internal static string Normalize(string strInput, NormalizationForm normalizationForm) { + CheckNormalizationForm(normalizationForm); + if (GlobalizationMode.Invariant) { // In Invariant mode we assume all characters are normalized. @@ -35,5 +40,63 @@ internal static string Normalize(string strInput, NormalizationForm normalizatio NlsNormalize(strInput, normalizationForm) : IcuNormalize(strInput, normalizationForm); } + + internal static bool TryNormalize(ReadOnlySpan source, Span destination, out int charsWritten, NormalizationForm normalizationForm = NormalizationForm.FormC) + { + CheckNormalizationForm(normalizationForm); + + if (source.IsEmpty) + { + charsWritten = 0; + return true; + } + + if (GlobalizationMode.Invariant || Ascii.IsValid(source)) + { + // In Invariant mode we assume all characters are normalized. + // This is because we don't support any linguistic operation on the strings + + if (source.TryCopyTo(destination)) + { + charsWritten = source.Length; + return true; + } + + charsWritten = 0; + return false; + } + + return GlobalizationMode.UseNls ? + NlsTryNormalize(source, destination, out charsWritten, normalizationForm) : + IcuTryNormalize(source, destination, out charsWritten, normalizationForm); + } + + internal static int GetNormalizedLength(this ReadOnlySpan source, NormalizationForm normalizationForm = NormalizationForm.FormC) + { + CheckNormalizationForm(normalizationForm); + + if (GlobalizationMode.Invariant || source.IsEmpty || Ascii.IsValid(source)) + { + // In Invariant mode we assume all characters are normalized. + // This is because we don't support any linguistic operation on the strings + return source.Length; + } + + return GlobalizationMode.UseNls ? 
+ NlsGetNormalizedLength(source, normalizationForm) : + IcuGetNormalizedLength(source, normalizationForm); + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static void CheckNormalizationForm(NormalizationForm normalizationForm) + { + if (normalizationForm != NormalizationForm.FormC && + normalizationForm != NormalizationForm.FormD && + normalizationForm != NormalizationForm.FormKC && + normalizationForm != NormalizationForm.FormKD) + { + throw new ArgumentException(SR.Argument_InvalidNormalizationForm, nameof(normalizationForm)); + } + } } } diff --git a/src/libraries/System.Private.CoreLib/src/System/StringNormalizationExtensions.cs b/src/libraries/System.Private.CoreLib/src/System/StringNormalizationExtensions.cs index 712752626fe54..7a94a853581aa 100644 --- a/src/libraries/System.Private.CoreLib/src/System/StringNormalizationExtensions.cs +++ b/src/libraries/System.Private.CoreLib/src/System/StringNormalizationExtensions.cs @@ -5,13 +5,27 @@ namespace System { + /// + /// Extensions for string normalization. + /// public static partial class StringNormalizationExtensions { + /// + /// Determines whether the specified string is in a normalized . + /// + /// The string to check. + /// if the specified string is in a normalized form; otherwise, . public static bool IsNormalized(this string strInput) { return IsNormalized(strInput, NormalizationForm.FormC); } + /// + /// Determines whether the specified string is in a normalized form. + /// + /// The string to check. + /// The normalization form to use. + /// if the specified string is in a normalized form; otherwise, . public static bool IsNormalized(this string strInput, NormalizationForm normalizationForm) { ArgumentNullException.ThrowIfNull(strInput); @@ -19,17 +33,60 @@ public static bool IsNormalized(this string strInput, NormalizationForm normaliz return strInput.IsNormalized(normalizationForm); } + /// + /// Determines whether the specified span of characters is in a normalized form. 
+ /// + /// The span of characters to check. + /// The normalization form to use. + /// if the specified span of characters is in a normalized form; otherwise, . + /// The specified character span contains an invalid code point or the normalization form is invalid. + public static bool IsNormalized(this ReadOnlySpan source, NormalizationForm normalizationForm = NormalizationForm.FormC) => + System.Globalization.Normalization.IsNormalized(source, normalizationForm); + + /// + /// Normalizes the specified string to the . + /// + /// The string to normalize. + /// The normalized string in . public static string Normalize(this string strInput) { // Default to Form C return Normalize(strInput, NormalizationForm.FormC); } + /// + /// Normalizes the specified string to the specified normalization form. + /// + /// The string to normalize. + /// The normalization form to use. + /// The normalized string in the specified normalization form. public static string Normalize(this string strInput, NormalizationForm normalizationForm) { ArgumentNullException.ThrowIfNull(strInput); return strInput.Normalize(normalizationForm); } + + /// + /// Normalizes the specified span of characters to the specified normalization form. + /// + /// The span of characters to normalize. + /// The buffer to write the normalized characters to. + /// When this method returns, contains the number of characters written to . + /// The normalization form to use. + /// if the specified span of characters was successfully normalized; otherwise, . + /// The specified character span contains an invalid code point or the normalization form is invalid. 
+ public static bool TryNormalize(this ReadOnlySpan source, Span destination, out int charsWritten, NormalizationForm normalizationForm = NormalizationForm.FormC) => + System.Globalization.Normalization.TryNormalize(source, destination, out charsWritten, normalizationForm); + + /// + /// Gets the estimated length of the specified character span after normalization to the specified normalization form. + /// + /// The character span whose normalized length is estimated. + /// The normalization form to use. + /// The estimated length of the normalized form of the specified character span. + /// The specified character span contains an invalid code point or the normalization form is invalid. + public static int GetNormalizedLength(this ReadOnlySpan source, NormalizationForm normalizationForm = NormalizationForm.FormC) => + System.Globalization.Normalization.GetNormalizedLength(source, normalizationForm); } } diff --git a/src/libraries/System.Runtime/ref/System.Runtime.cs b/src/libraries/System.Runtime/ref/System.Runtime.cs index 0d79bc4c54e23..da44791d9607c 100644 --- a/src/libraries/System.Runtime/ref/System.Runtime.cs +++ b/src/libraries/System.Runtime/ref/System.Runtime.cs @@ -5819,8 +5819,11 @@ public static partial class StringNormalizationExtensions { public static bool IsNormalized(this string strInput) { throw null; } public static bool IsNormalized(this string strInput, System.Text.NormalizationForm normalizationForm) { throw null; } + public static bool IsNormalized(this ReadOnlySpan source, System.Text.NormalizationForm normalizationForm = System.Text.NormalizationForm.FormC) { throw null; } public static string Normalize(this string strInput) { throw null; } + public static bool TryNormalize(this ReadOnlySpan source, Span destination, out int charsWritten, System.Text.NormalizationForm normalizationForm = System.Text.NormalizationForm.FormC) { throw null; } public static string Normalize(this string strInput, System.Text.NormalizationForm normalizationForm) { throw null; } + public
static int GetNormalizedLength(this ReadOnlySpan source, System.Text.NormalizationForm normalizationForm = System.Text.NormalizationForm.FormC) { throw null; } } [System.FlagsAttribute] public enum StringSplitOptions diff --git a/src/libraries/System.Runtime/tests/System.Globalization.Extensions.Tests/Normalization/NormalizationAll.cs b/src/libraries/System.Runtime/tests/System.Globalization.Extensions.Tests/Normalization/NormalizationAll.cs index 703988c1cffb3..bedd07baf4637 100644 --- a/src/libraries/System.Runtime/tests/System.Globalization.Extensions.Tests/Normalization/NormalizationAll.cs +++ b/src/libraries/System.Runtime/tests/System.Globalization.Extensions.Tests/Normalization/NormalizationAll.cs @@ -93,6 +93,18 @@ private static void VerifyConformanceInvariant(NormalizationForm normForm, strin string normalized4 = c4.Normalize(normForm); string normalized5 = c5.Normalize(normForm); + Span normalizedSpan1 = new char[normalized1.Length]; + Span normalizedSpan2 = new char[normalized2.Length]; + Span normalizedSpan3 = new char[normalized3.Length]; + Span normalizedSpan4 = new char[normalized4.Length]; + Span normalizedSpan5 = new char[normalized5.Length]; + + Assert.True(c1.AsSpan().TryNormalize(normalizedSpan1, out int charsWritten1, normForm), $"'{c1}' is not normalized with form {normForm}"); + Assert.True(c2.AsSpan().TryNormalize(normalizedSpan2, out int charsWritten2, normForm), $"'{c2}' is not normalized with form {normForm}"); + Assert.True(c3.AsSpan().TryNormalize(normalizedSpan3, out int charsWritten3, normForm), $"'{c3}' is not normalized with form {normForm}"); + Assert.True(c4.AsSpan().TryNormalize(normalizedSpan4, out int charsWritten4, normForm), $"'{c4}' is not normalized with form {normForm}"); + Assert.True(c5.AsSpan().TryNormalize(normalizedSpan5, out int charsWritten5, normForm), $"'{c5}' is not normalized with form {normForm}"); + switch (normForm) { case NormalizationForm.FormC: @@ -101,15 +113,24 @@ private static void 
VerifyConformanceInvariant(NormalizationForm normForm, strin AssertEqualsForm(c2, normalized2); AssertEqualsForm(c2, normalized3); + AssertEqualsForm(c2, normalizedSpan1.Slice(0, charsWritten1).ToString()); + AssertEqualsForm(c2, normalizedSpan2.Slice(0, charsWritten2).ToString()); + AssertEqualsForm(c2, normalizedSpan3.Slice(0, charsWritten3).ToString()); + // c4 == NFC(c4) == NFC(c5) AssertEqualsForm(c4, normalized4); AssertEqualsForm(c4, normalized5); + AssertEqualsForm(c4, normalizedSpan4.Slice(0, charsWritten4).ToString()); + AssertEqualsForm(c4, normalizedSpan5.Slice(0, charsWritten5).ToString()); + // c2 is normalized to Form C Assert.True(c2.IsNormalized(normForm), $"'{c2}' is marked as not normalized with form {normForm}"); + Assert.True(c2.AsSpan().IsNormalized(normForm), $"'{c2}' span is marked as not normalized with form {normForm}"); // c4 is normalized to Form C Assert.True(c4.IsNormalized(normForm), $"'{c4}' is marked as not normalized with form {normForm}"); + Assert.True(c4.AsSpan().IsNormalized(normForm), $"'{c4}' span is marked as not normalized with form {normForm}"); break; case NormalizationForm.FormD: @@ -118,15 +139,24 @@ private static void VerifyConformanceInvariant(NormalizationForm normForm, strin AssertEqualsForm(c3, normalized2); AssertEqualsForm(c3, normalized3); + AssertEqualsForm(c3, normalizedSpan1.Slice(0, charsWritten1).ToString()); + AssertEqualsForm(c3, normalizedSpan2.Slice(0, charsWritten2).ToString()); + AssertEqualsForm(c3, normalizedSpan3.Slice(0, charsWritten3).ToString()); + // c5 == NFD(c4) == NFD(c5) AssertEqualsForm(c5, normalized4); AssertEqualsForm(c5, normalized5); + AssertEqualsForm(c5, normalizedSpan4.Slice(0, charsWritten4).ToString()); + AssertEqualsForm(c5, normalizedSpan5.Slice(0, charsWritten5).ToString()); + // c3 is normalized to Form D Assert.True(c3.IsNormalized(normForm), $"'{c3}' is marked as not normalized with form {normForm}"); + Assert.True(c3.AsSpan().IsNormalized(normForm), $"'{c3}' span is 
marked as not normalized with form {normForm}"); // c5 is normalized to Form D Assert.True(c5.IsNormalized(normForm), $"'{c5}' is marked as not normalized with form {normForm}"); + Assert.True(c5.AsSpan().IsNormalized(normForm), $"'{c5}' span is marked as not normalized with form {normForm}"); break; case NormalizationForm.FormKC: @@ -138,8 +168,15 @@ private static void VerifyConformanceInvariant(NormalizationForm normForm, strin AssertEqualsForm(c4, normalized4); AssertEqualsForm(c4, normalized5); + AssertEqualsForm(c4, normalizedSpan1.Slice(0, charsWritten1).ToString()); + AssertEqualsForm(c4, normalizedSpan2.Slice(0, charsWritten2).ToString()); + AssertEqualsForm(c4, normalizedSpan3.Slice(0, charsWritten3).ToString()); + AssertEqualsForm(c4, normalizedSpan4.Slice(0, charsWritten4).ToString()); + AssertEqualsForm(c4, normalizedSpan5.Slice(0, charsWritten5).ToString()); + // c4 is normalized to Form KC Assert.True(c4.IsNormalized(normForm), $"'{c4}' is marked as not normalized with form {normForm}"); + Assert.True(c4.AsSpan().IsNormalized(normForm), $"'{c4}' span is marked as not normalized with form {normForm}"); break; case NormalizationForm.FormKD: @@ -151,8 +188,15 @@ private static void VerifyConformanceInvariant(NormalizationForm normForm, strin AssertEqualsForm(c5, normalized4); AssertEqualsForm(c5, normalized5); + AssertEqualsForm(c5, normalizedSpan1.Slice(0, charsWritten1).ToString()); + AssertEqualsForm(c5, normalizedSpan2.Slice(0, charsWritten2).ToString()); + AssertEqualsForm(c5, normalizedSpan3.Slice(0, charsWritten3).ToString()); + AssertEqualsForm(c5, normalizedSpan4.Slice(0, charsWritten4).ToString()); + AssertEqualsForm(c5, normalizedSpan5.Slice(0, charsWritten5).ToString()); + // c5 is normalized to Form KD Assert.True(c5.IsNormalized(normForm), $"'{c5}' is marked as not normalized with form {normForm}"); + Assert.True(c5.AsSpan().IsNormalized(normForm), $"'{c5}' span is marked as not normalized with form {normForm}"); break; } } diff --git 
a/src/libraries/System.Runtime/tests/System.Globalization.Extensions.Tests/Normalization/StringNormalizationTests.cs b/src/libraries/System.Runtime/tests/System.Globalization.Extensions.Tests/Normalization/StringNormalizationTests.cs index 1b70a79b6ae6d..bba0ddb088d46 100644 --- a/src/libraries/System.Runtime/tests/System.Globalization.Extensions.Tests/Normalization/StringNormalizationTests.cs +++ b/src/libraries/System.Runtime/tests/System.Globalization.Extensions.Tests/Normalization/StringNormalizationTests.cs @@ -2,6 +2,7 @@ // The .NET Foundation licenses this file to you under the MIT license. using System; +using System.Linq; using System.Text; using Xunit; using System.Collections.Generic; @@ -21,16 +22,23 @@ public void IsNormalized(string value, NormalizationForm normalizationForm, bool if (normalizationForm == NormalizationForm.FormC) { Assert.Equal(expected, value.IsNormalized()); + Assert.Equal(expected, value.AsSpan().IsNormalized()); } Assert.Equal(expected, value.IsNormalized(normalizationForm)); + Assert.Equal(expected, value.AsSpan().IsNormalized(normalizationForm)); } [Fact] public void IsNormalized_Invalid() { Assert.Throws(() => "\uFB01".IsNormalized((NormalizationForm)10)); - AssertExtensions.Throws("strInput", () => "\uFFFE".IsNormalized()); // Invalid codepoint - AssertExtensions.Throws("strInput", () => "\uD800\uD800".IsNormalized()); // Invalid surrogate pair + Assert.Throws(() => "\uFB01".AsSpan().IsNormalized((NormalizationForm)10)); + + AssertExtensions.Throws("source", () => "\uFFFE".IsNormalized()); // Invalid codepoint + AssertExtensions.Throws("source", () => "\uFFFE".AsSpan().IsNormalized()); // Invalid codepoint + + AssertExtensions.Throws("source", () => "\uD800\uD800".IsNormalized()); // Invalid surrogate pair + AssertExtensions.Throws("source", () => "\uD800\uD800".AsSpan().IsNormalized()); // Invalid surrogate pair } [Fact] @@ -63,20 +71,61 @@ public static IEnumerable NormalizeTestData() [MemberData(nameof(NormalizeTestData))] 
public void Normalize(string value, NormalizationForm normalizationForm, string expected) { + Span destination = new char[expected.Length + 1]; // NLS sometimes needs an extra character in the buffer, mostly when it has to insert the null terminator + int charsWritten; + if (normalizationForm == NormalizationForm.FormC) { Assert.Equal(expected, value.Normalize()); + + Assert.True(value.AsSpan().TryNormalize(destination, out charsWritten)); + Assert.Equal(expected, destination.Slice(0, charsWritten).ToString()); + + if (PlatformDetection.IsNlsGlobalization) + { + // NLS returns an estimated normalized length that is large enough to hold the result, not the exact length + Assert.True(expected.Length <= value.GetNormalizedLength(), $"Expected: {expected.Length}, Actual: {value.GetNormalizedLength()}"); + } + else + { + // ICU returns the exact normalized length + Assert.Equal(expected.Length, value.AsSpan().GetNormalizedLength()); + } } + Assert.Equal(expected, value.Normalize(normalizationForm)); + + if (expected.Length > 0) + { + Assert.False(value.AsSpan().TryNormalize(destination.Slice(0, expected.Length - 1), out charsWritten, normalizationForm), $"Trying to normalize '{value}' to a buffer of length {expected.Length - 1} succeeded!"); + } + + Assert.True(value.AsSpan().TryNormalize(destination, out charsWritten, normalizationForm), $"Failed to normalize '{value}' to a buffer of length {destination.Length}"); + Assert.Equal(expected, destination.Slice(0, charsWritten).ToString()); + if (PlatformDetection.IsNlsGlobalization) + { + // NLS returns an estimated normalized length that is large enough to hold the result, not the exact length + Assert.True(expected.Length <= value.AsSpan().GetNormalizedLength(normalizationForm), $"Expected: {expected.Length}, Actual: {value.AsSpan().GetNormalizedLength(normalizationForm)}"); + } + else + { + // ICU returns the exact normalized length + Assert.Equal(expected.Length, value.AsSpan().GetNormalizedLength(normalizationForm)); +
} } [Fact] public void Normalize_Invalid() { + char[] destination = new char[100]; Assert.Throws(() => "\uFB01".Normalize((NormalizationForm)7)); + Assert.Throws(() => "\uFB01".AsSpan().TryNormalize(destination.AsSpan(), out int charsWritten, (NormalizationForm)7)); AssertExtensions.Throws("strInput", () => "\uFFFE".Normalize()); // Invalid codepoint + AssertExtensions.Throws("source", () => "\uFFFE".AsSpan().TryNormalize(destination.AsSpan(), out int charsWritten)); // Invalid codepoint + AssertExtensions.Throws("strInput", () => "\uD800\uD800".Normalize()); // Invalid surrogate pair + AssertExtensions.Throws("source", () => "\uD800\uD800".AsSpan().TryNormalize(destination, out int charsWritten)); // Invalid surrogate pair } [Fact]