Skip to content

Commit

Permalink
[browser][HybridGlobalization] Improve speed performance of IndexOf a…
Browse files Browse the repository at this point in the history
…nd LastIndexOf text APIs with HybridGlobalization mode (dotnet#95583)

* Re-implement the grapheme segmenter from Intl.

* Load segmentation rules as static json asset
  • Loading branch information
matouskozak authored Jan 16, 2024
1 parent e42a873 commit a08d3fc
Show file tree
Hide file tree
Showing 19 changed files with 343 additions and 71 deletions.
14 changes: 14 additions & 0 deletions THIRD-PARTY-NOTICES.TXT
Original file line number Diff line number Diff line change
Expand Up @@ -1331,3 +1331,17 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

Aspects of base64 encoding / decoding are based on algorithm described in "Base64 encoding and decoding at almost the speed of a memory
copy", Wojciech Muła and Daniel Lemire. https://arxiv.org/pdf/1910.05109.pdf

License for FormatJS Intl.Segmenter grapheme segmentation algorithm
--------------------------------------------------------------------------
Available at https://github.com/formatjs/formatjs/blob/58d6a7b398d776ca3d2726d72ae1573b65cc3bef/packages/intl-segmenter/LICENSE.md

MIT License

Copyright (c) 2022 FormatJS

Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
3 changes: 2 additions & 1 deletion eng/liveBuilds.targets
Original file line number Diff line number Diff line change
Expand Up @@ -208,7 +208,8 @@
$(LibrariesNativeArtifactsPath)package.json;
$(LibrariesNativeArtifactsPath)dotnet.native.wasm;
$(LibrariesNativeArtifactsPath)dotnet.native.js.symbols;
$(LibrariesNativeArtifactsPath)*.dat;"
$(LibrariesNativeArtifactsPath)*.dat;
$(LibrariesNativeArtifactsPath)segmentation-rules.json;"
IsNative="true" />
<!-- for threaded wasm -->
<LibrariesRuntimeFiles Condition="'$(TargetOS)' == 'browser' and Exists('$(LibrariesNativeArtifactsPath)dotnet.native.worker.js')"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -240,6 +240,7 @@
<PlatformManifestFileEntry Include="icudt_optimal.dat" IsNative="true" />
<PlatformManifestFileEntry Include="icudt_optimal_no_CJK.dat" IsNative="true" />
<PlatformManifestFileEntry Include="icudt_hybrid.dat" IsNative="true" />
<PlatformManifestFileEntry Include="segmentation-rules.json" IsNative="true" />
<PlatformManifestFileEntry Include="package.json" IsNative="true" />
<PlatformManifestFileEntry Include="dotnet.es6.pre.js" IsNative="true" />
<PlatformManifestFileEntry Include="dotnet.es6.lib.js" IsNative="true" />
Expand Down
3 changes: 2 additions & 1 deletion src/mono/browser/browser.proj
Original file line number Diff line number Diff line change
Expand Up @@ -357,7 +357,8 @@
<ItemGroup>
<ICULibNativeFiles Include="$(ICULibDir)/libicuuc.a;
$(ICULibDir)/libicui18n.a;
$(ICULibDir)/libicudata.a" />
$(ICULibDir)/libicudata.a;
$(BrowserProjectRoot)runtime/hybrid-globalization/segmentation-rules.json" />
<ICULibFiles Include="$(ICULibDir)/*.dat" />
</ItemGroup>
<PropertyGroup>
Expand Down
3 changes: 2 additions & 1 deletion src/mono/browser/build/BrowserWasmApp.targets
Original file line number Diff line number Diff line change
Expand Up @@ -104,8 +104,9 @@

<ItemGroup Condition="'$(InvariantGlobalization)' != 'true'">
<_HybridGlobalizationDataFiles Include="$(MicrosoftNetCoreAppRuntimePackRidNativeDir)icudt_hybrid.dat"/>
<_HybridGlobalizationDataFiles Include="$(MicrosoftNetCoreAppRuntimePackRidNativeDir)segmentation-rules.json"/>
<_IcuAvailableDataFiles Include="$(MicrosoftNetCoreAppRuntimePackRidNativeDir)icudt_*" Exclude="@(_HybridGlobalizationDataFiles);$(_WasmIcuDataFileName)"/>
<WasmIcuDataFileNames Condition="'$(HybridGlobalization)' == 'true'" Include="$(MicrosoftNetCoreAppRuntimePackRidNativeDir)icudt_hybrid.dat"/>
<WasmIcuDataFileNames Condition="'$(HybridGlobalization)' == 'true'" Include="@(_HybridGlobalizationDataFiles)"/>
<WasmIcuDataFileNames Condition="'$(HybridGlobalization)' != 'true' and '$(WasmIncludeFullIcuData)' == 'true'" Include="$(MicrosoftNetCoreAppRuntimePackRidNativeDir)icudt.dat"/>
<WasmIcuDataFileNames Condition="'$(HybridGlobalization)' != 'true' and '$(WasmIncludeFullIcuData)' != 'true' and '$(_WasmIcuDataFileName)' == ''" Include="@(_IcuAvailableDataFiles)"/>
<WasmIcuDataFileNames Condition="'$(HybridGlobalization)' != 'true' and '$(WasmIncludeFullIcuData)' != 'true' and '$(_WasmIcuDataFileName)' != ''" Include="$(_WasmIcuDataFileName)"/>
Expand Down
12 changes: 12 additions & 0 deletions src/mono/browser/runtime/assets.ts
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ import { endMeasure, MeasuredBlock, startMeasure } from "./profiler";
import { AssetEntryInternal } from "./types/internal";
import { AssetEntry } from "./types";
import { VoidPtr } from "./types/emscripten";
import { setSegmentationRulesFromJson } from "./hybrid-globalization/grapheme-segmenter";

// this needs to be run only after the onRuntimeInitialized event, when the memory is ready
export function instantiate_asset(asset: AssetEntry, url: string, bytes: Uint8Array): void {
Expand All @@ -25,6 +26,7 @@ export function instantiate_asset(asset: AssetEntry, url: string, bytes: Uint8Ar
case "dotnetwasm":
case "js-module-threads":
case "symbols":
case "segmentation-rules":
// do nothing
break;
case "resource":
Expand Down Expand Up @@ -104,6 +106,16 @@ export async function instantiate_symbols_asset(pendingAsset: AssetEntryInternal
}
}

/**
 * Reads the segmentation rules from an asset whose download is already pending,
 * parses the response body as JSON and hands the result to
 * `setSegmentationRulesFromJson`.
 *
 * This is best-effort: anything thrown while awaiting the response, parsing the
 * JSON or applying the rules is logged and swallowed, so the returned promise
 * does not reject.
 *
 * @param pendingAsset Asset entry whose `pendingDownloadInternal.response`
 * resolves to the fetched rules file; its `name` is used in the log message.
 */
export async function instantiate_segmentation_rules_asset(pendingAsset: AssetEntryInternal): Promise<void> {
    try {
        // A missing pendingDownloadInternal throws here and is handled by the catch below.
        const response = await pendingAsset.pendingDownloadInternal!.response;
        const json = await response.json();
        setSegmentationRulesFromJson(json);
    } catch (error: unknown) {
        // JSON.stringify() of an Error yields "{}" because `message` and `stack` are
        // not enumerable, which would hide the actual failure reason in the log.
        const reason = error instanceof Error ? error.message : JSON.stringify(error);
        mono_log_info(`Error loading static json asset ${pendingAsset.name}: ${reason}`);
    }
}

export async function wait_for_all_assets() {
// wait for all assets in memory
await runtimeHelpers.allAssetsInMemory.promise;
Expand Down
3 changes: 2 additions & 1 deletion src/mono/browser/runtime/exports.ts
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ import { mono_bind_static_method } from "./net6-legacy/method-calls";
import { export_binding_api, export_internal_api, export_mono_api } from "./net6-legacy/exports-legacy";
import { initializeLegacyExports } from "./net6-legacy/globals";
import { mono_log_warn, mono_wasm_stringify_as_error_with_stack } from "./logging";
import { instantiate_asset, instantiate_symbols_asset } from "./assets";
import { instantiate_asset, instantiate_symbols_asset, instantiate_segmentation_rules_asset } from "./assets";
import { jiterpreter_dump_stats } from "./jiterpreter";
import { forceDisposeProxies } from "./gc-handles";

Expand All @@ -46,6 +46,7 @@ function initializeExports(globalObjects: GlobalObjects): RuntimeAPI {
instantiate_asset,
jiterpreter_dump_stats,
forceDisposeProxies,
instantiate_segmentation_rules_asset,
});

const API = export_api();
Expand Down
15 changes: 1 addition & 14 deletions src/mono/browser/runtime/hybrid-globalization/change-case.ts
Original file line number Diff line number Diff line change
Expand Up @@ -7,11 +7,7 @@ import { MonoObject, MonoObjectRef, MonoString, MonoStringRef } from "../types/i
import { Int32Ptr } from "../types/emscripten";
import { wrap_error_root, wrap_no_error_root } from "../invoke-js";
import { localHeapViewU16, setU16_local } from "../memory";

const SURROGATE_HIGHER_START = "\uD800";
const SURROGATE_HIGHER_END = "\uDBFF";
const SURROGATE_LOWER_START = "\uDC00";
const SURROGATE_LOWER_END = "\uDFFF";
import { isSurrogate } from "./helpers";

export function mono_wasm_change_case_invariant(src: number, srcLength: number, dst: number, dstLength: number, toUpper: number, is_exception: Int32Ptr, ex_address: MonoObjectRef): void {
const exceptionRoot = mono_wasm_new_external_root<MonoObject>(ex_address);
Expand Down Expand Up @@ -160,15 +156,6 @@ export function mono_wasm_change_case(culture: MonoStringRef, src: number, srcLe
}
}

/**
 * Tells whether a surrogate pair begins at `startIdx` in `str`: the code unit at
 * `startIdx` must lie in [SURROGATE_HIGHER_START, SURROGATE_HIGHER_END] and the
 * code unit right after it must exist and lie in
 * [SURROGATE_LOWER_START, SURROGATE_LOWER_END].
 */
function isSurrogate(str: string, startIdx: number): boolean {
    // Keep the range checks in their positive form: an out-of-range index yields
    // `undefined`, for which every comparison is false.
    const lead = str[startIdx];
    const leadInRange = SURROGATE_HIGHER_START <= lead && lead <= SURROGATE_HIGHER_END;
    if (!leadInRange) {
        return false;
    }

    const trailIdx = startIdx + 1;
    if (!(trailIdx < str.length)) {
        return false;
    }

    const trail = str[trailIdx];
    return SURROGATE_LOWER_START <= trail && trail <= SURROGATE_LOWER_END;
}

function appendSurrogateToMemory(heapI16: Uint16Array, dst: number, surrogate: string, idx: number)
{
setU16_local(heapI16, dst + idx*2, surrogate.charCodeAt(0));
Expand Down
97 changes: 44 additions & 53 deletions src/mono/browser/runtime/hybrid-globalization/collations.ts
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,11 @@ import { monoStringToString, utf16ToString } from "../strings";
import { MonoObject, MonoObjectRef, MonoString, MonoStringRef } from "../types/internal";
import { Int32Ptr } from "../types/emscripten";
import { wrap_error_root, wrap_no_error_root } from "../invoke-js";
import { GraphemeSegmenter } from "./grapheme-segmenter";

const COMPARISON_ERROR = -2;
const INDEXING_ERROR = -1;
let graphemeSegmenterCached: GraphemeSegmenter | null;

export function mono_wasm_compare_string(culture: MonoStringRef, str1: number, str1Length: number, str2: number, str2Length: number, options: number, is_exception: Int32Ptr, ex_address: MonoObjectRef): number {
const cultureRoot = mono_wasm_new_external_root<MonoString>(culture),
Expand All @@ -20,7 +22,7 @@ export function mono_wasm_compare_string(culture: MonoStringRef, str1: number, s
const casePicker = (options & 0x1f);
const locale = cultureName ? cultureName : undefined;
wrap_no_error_root(is_exception, exceptionRoot);
return compare_strings(string1, string2, locale, casePicker);
return compareStrings(string1, string2, locale, casePicker);
}
catch (ex: any) {
wrap_error_root(is_exception, ex, exceptionRoot);
Expand All @@ -37,19 +39,19 @@ export function mono_wasm_starts_with(culture: MonoStringRef, str1: number, str1
exceptionRoot = mono_wasm_new_external_root<MonoObject>(ex_address);
try {
const cultureName = monoStringToString(cultureRoot);
const prefix = decode_to_clean_string(str2, str2Length);
const prefix = decodeToCleanString(str2, str2Length);
// no need to look for an empty string
if (prefix.length == 0)
return 1; // true

const source = decode_to_clean_string(str1, str1Length);
const source = decodeToCleanString(str1, str1Length);
if (source.length < prefix.length)
return 0; //false
const sourceOfPrefixLength = source.slice(0, prefix.length);

const casePicker = (options & 0x1f);
const locale = cultureName ? cultureName : undefined;
const result = compare_strings(sourceOfPrefixLength, prefix, locale, casePicker);
const result = compareStrings(sourceOfPrefixLength, prefix, locale, casePicker);
wrap_no_error_root(is_exception, exceptionRoot);
return result === 0 ? 1 : 0; // equals ? true : false
}
Expand All @@ -68,19 +70,19 @@ export function mono_wasm_ends_with(culture: MonoStringRef, str1: number, str1Le
exceptionRoot = mono_wasm_new_external_root<MonoObject>(ex_address);
try {
const cultureName = monoStringToString(cultureRoot);
const suffix = decode_to_clean_string(str2, str2Length);
const suffix = decodeToCleanString(str2, str2Length);
if (suffix.length == 0)
return 1; // true

const source = decode_to_clean_string(str1, str1Length);
const source = decodeToCleanString(str1, str1Length);
const diff = source.length - suffix.length;
if (diff < 0)
return 0; //false
const sourceOfSuffixLength = source.slice(diff, source.length);

const casePicker = (options & 0x1f);
const locale = cultureName ? cultureName : undefined;
const result = compare_strings(sourceOfSuffixLength, suffix, locale, casePicker);
const result = compareStrings(sourceOfSuffixLength, suffix, locale, casePicker);
wrap_no_error_root(is_exception, exceptionRoot);
return result === 0 ? 1 : 0; // equals ? true : false
}
Expand All @@ -100,68 +102,57 @@ export function mono_wasm_index_of(culture: MonoStringRef, needlePtr: number, ne
try {
const needle = utf16ToString(<any>needlePtr, <any>(needlePtr + 2 * needleLength));
// no need to look for an empty string
if (clean_string(needle).length == 0) {
if (cleanString(needle).length == 0) {
wrap_no_error_root(is_exception, exceptionRoot);
return fromBeginning ? 0 : srcLength;
}

const source = utf16ToString(<any>srcPtr, <any>(srcPtr + 2 * srcLength));
// no need to look in an empty string
if (clean_string(source).length == 0) {
if (cleanString(source).length == 0) {
wrap_no_error_root(is_exception, exceptionRoot);
return fromBeginning ? 0 : srcLength;
}
const cultureName = monoStringToString(cultureRoot);
const locale = cultureName ? cultureName : undefined;
const casePicker = (options & 0x1f);

const segmenter = new Intl.Segmenter(locale, { granularity: "grapheme" });
const needleSegments = Array.from(segmenter.segment(needle)).map(s => s.segment);
let i = 0;
let stop = false;
let result = -1;
let segmentWidth = 0;
let index = 0;
let nextIndex = 0;
while (!stop) {
// we need to restart the iterator in this outer loop because we have shifted it in the inner loop
const iteratorSrc = segmenter.segment(source.slice(i, source.length))[Symbol.iterator]();
let srcNext = iteratorSrc.next();

if (srcNext.done)
break;
const graphemeSegmenter = graphemeSegmenterCached || (graphemeSegmenterCached = new GraphemeSegmenter());
const needleSegments = [];
let needleIdx = 0;

// Grapheme segmentation of needle string
while (needleIdx < needle.length) {
const needleGrapheme = graphemeSegmenter.nextGrapheme(needle, needleIdx);
needleSegments.push(needleGrapheme);
needleIdx += needleGrapheme.length;
}

let srcIdx = 0;
while (srcIdx < source.length) {
const srcGrapheme = graphemeSegmenter.nextGrapheme(source, srcIdx);
srcIdx += srcGrapheme.length;

let matchFound = check_match_found(srcNext.value.segment, needleSegments[0], locale, casePicker);
index = nextIndex;
srcNext = iteratorSrc.next();
if (srcNext.done) {
result = matchFound ? index : result;
break;
if (!checkMatchFound(srcGrapheme, needleSegments[0], locale, casePicker)) {
continue;
}
segmentWidth = srcNext.value.index;
nextIndex = index + segmentWidth;
if (matchFound) {
for (let j = 1; j < needleSegments.length; j++) {
if (srcNext.done) {
stop = true;
break;
}
matchFound = check_match_found(srcNext.value.segment, needleSegments[j], locale, casePicker);
if (!matchFound)
break;

srcNext = iteratorSrc.next();
}
if (stop)
let j;
let srcNextIdx = srcIdx;
for (j = 1; j < needleSegments.length; j++) {
const srcGrapheme = graphemeSegmenter.nextGrapheme(source, srcNextIdx);

if (!checkMatchFound(srcGrapheme, needleSegments[j], locale, casePicker)) {
break;
}
srcNextIdx += srcGrapheme.length;
}

if (matchFound) {
result = index;
if (j === needleSegments.length) {
result = srcIdx - srcGrapheme.length;
if (fromBeginning)
break;
}
i = nextIndex;
}
wrap_no_error_root(is_exception, exceptionRoot);
return result;
Expand All @@ -175,12 +166,12 @@ export function mono_wasm_index_of(culture: MonoStringRef, needlePtr: number, ne
exceptionRoot.release();
}

function check_match_found(str1: string, str2: string, locale: string | undefined, casePicker: number): boolean {
return compare_strings(str1, str2, locale, casePicker) === 0;
function checkMatchFound(str1: string, str2: string, locale: string | undefined, casePicker: number): boolean {
return compareStrings(str1, str2, locale, casePicker) === 0;
}
}

function compare_strings(string1: string, string2: string, locale: string | undefined, casePicker: number): number {
function compareStrings(string1: string, string2: string, locale: string | undefined, casePicker: number): number {
switch (casePicker) {
case 0:
// 0: None - default algorithm for the platform OR
Expand Down Expand Up @@ -272,12 +263,12 @@ function compare_strings(string1: string, string2: string, locale: string | unde
}
}

function decode_to_clean_string(strPtr: number, strLen: number) {
function decodeToCleanString(strPtr: number, strLen: number) {
const str = utf16ToString(<any>strPtr, <any>(strPtr + 2 * strLen));
return clean_string(str);
return cleanString(str);
}

function clean_string(str: string) {
function cleanString(str: string) {
const nStr = str.normalize();
return nStr.replace(/[\u200B-\u200D\uFEFF\0]/g, "");
}
Loading

0 comments on commit a08d3fc

Please sign in to comment.