diff --git a/compiler/test/stdlib/bytes.test.gr b/compiler/test/stdlib/bytes.test.gr index 6461c2126..d4f496599 100644 --- a/compiler/test/stdlib/bytes.test.gr +++ b/compiler/test/stdlib/bytes.test.gr @@ -17,6 +17,25 @@ assert Bytes.length(Bytes.empty) == 0 let bytes = Bytes.make(64) assert Bytes.length(bytes) == 64 +// Bytes.getChar +let bytes = Bytes.fromString("ab©✨🍞") +assert Bytes.getChar(0, bytes) == 'a' +assert Bytes.getChar(1, bytes) == 'b' +assert Bytes.getChar(2, bytes) == '©' +assert Bytes.getChar(4, bytes) == '✨' +assert Bytes.getChar(7, bytes) == '🍞' + +// Bytes.setChar +let bytes = Bytes.make(16) +Bytes.setChar(0, 'a', bytes) +assert Bytes.getChar(0, bytes) == 'a' +Bytes.setChar(1, '©', bytes) +assert Bytes.getChar(1, bytes) == '©' +Bytes.setChar(3, '✨', bytes) +assert Bytes.getChar(3, bytes) == '✨' +Bytes.setChar(7, '🍞', bytes) +assert Bytes.getChar(7, bytes) == '🍞' + // Bytes.setInt8, Bytes.setUint8, Bytes.getInt8, Bytes.getUint8 let bytes = Bytes.make(1) Bytes.setInt8(0, 0xffs, bytes) diff --git a/stdlib/buffer.gr b/stdlib/buffer.gr index 9a0bb36aa..3abf99aaf 100644 --- a/stdlib/buffer.gr +++ b/stdlib/buffer.gr @@ -16,13 +16,15 @@ from "runtime/unsafe/wasmi32" include WasmI32 from "runtime/unsafe/conv" include Conv from "runtime/exception" include Exception from "runtime/dataStructures" include DataStructures -use DataStructures.{ untagChar } +use DataStructures.{ untagChar, tagSimpleNumber } from "int32" include Int32 from "bytes" include Bytes from "string" include String from "char" include Char from "runtime/numbers" include Numbers use Numbers.{ coerceNumberToWasmI32 } +from "runtime/utf8" include Utf8 +use Utf8.{ usvEncodeLength } abstract record Buffer { mut len: Number, @@ -389,48 +391,12 @@ provide let addString = (string, buffer) => { */ @unsafe provide let addChar = (char, buffer) => { - use WasmI32.{ (-), (*), (&), (|), (>>>), ltU as (<), gtU as (>), leU as (<=) } let usv = untagChar(char) - - let bytelen = if (usv < 0x80n) { - 
autogrow(1, buffer) - use WasmI32.{ (+) } - let off = coerceNumberToWasmI32(buffer.len) - let dst = WasmI32.fromGrain(buffer.data) + _VALUE_OFFSET - WasmI32.store8(dst, usv, off) - 1 - } else { - let mut count = 0n - let mut bytelen = 0 - let mut offset = 0n - if (usv <= 0x07FFn) { - count = 1n - bytelen = 2 - offset = 0xC0n - } else if (usv <= 0xFFFFn) { - count = 2n - bytelen = 3 - offset = 0xE0n - } else { - count = 3n - bytelen = 4 - offset = 0xF0n - } - use WasmI32.{ (+) } - autogrow(bytelen, buffer) - let off = coerceNumberToWasmI32(buffer.len) - let dst = WasmI32.fromGrain(buffer.data) + _VALUE_OFFSET - WasmI32.store8(dst, (usv >>> (6n * count)) + offset, off) - let mut n = 0n - while (count > 0n) { - n += 1n - let temp = usv >>> (6n * (count - 1n)) - WasmI32.store8(dst + n, 0x80n | temp & 0x3Fn, off) - count -= 1n - } - bytelen - } - buffer.len += bytelen + let byteCount = tagSimpleNumber(usvEncodeLength(usv)) + autogrow(byteCount, buffer) + let index = buffer.len + buffer.len += byteCount + Bytes.setChar(index, char, buffer.data) } /** diff --git a/stdlib/bytes.gr b/stdlib/bytes.gr index f63e7f246..e5ee3afb7 100644 --- a/stdlib/bytes.gr +++ b/stdlib/bytes.gr @@ -18,10 +18,12 @@ from "runtime/unsafe/wasmf64" include WasmF64 from "runtime/unsafe/conv" include Conv from "runtime/dataStructures" include DataStructures use DataStructures.{ + tagChar, tagInt8, tagUint8, tagInt16, tagUint16, + untagChar, untagInt8, untagUint8, untagInt16, @@ -33,6 +35,14 @@ from "runtime/exception" include Exception from "int32" include Int32 from "runtime/numbers" include Numbers use Numbers.{ coerceNumberToWasmI32 } +from "runtime/utf8" include Utf8 +use Utf8.{ + utf8ByteCount, + getCodePoint, + usvEncodeLength, + writeUtf8CodePoint, + exception MalformedUnicode, +} @unsafe let _SIZE_OFFSET = 4n @@ -396,6 +406,70 @@ provide let clear = (bytes: Bytes) => { ignore(bytes) } +/** + * Gets the UTF-8 encoded character at the given byte index. 
+ * + * @param index: The byte index to access + * @param bytes: The byte sequence to access + * @returns The character that starts at the given index + * + * @throws IndexOutOfBounds: When `index` is negative + * @throws MalformedUnicode: When the requested character is not a valid UTF-8 sequence + * + * @example + * let bytes = Bytes.fromString("Hello") + * assert Bytes.getChar(0, bytes) == 'H' + * + * @since v0.7.0 + */ +@unsafe +provide let getChar = (index: Number, bytes: Bytes) => { + // The lead byte is bounds-checked first; the full sequence length is checked below + use WasmI32.{ (+), gtU as (>) } + let ptr = WasmI32.fromGrain(bytes) + let size = getSize(ptr) + let offset = coerceNumberToWasmI32(index) + checkIndexIsInBounds(offset, 1n, size) + let byte = WasmI32.load8U(ptr + offset, _VALUE_OFFSET) + let charSize = utf8ByteCount(byte) + if (offset + charSize > size) { + throw MalformedUnicode + } + let codePoint = getCodePoint(ptr + offset + _VALUE_OFFSET) + ignore(bytes) + tagChar(codePoint) +} + +/** + * UTF-8 encodes a character starting at the given byte index. + * + * @param index: The byte index to update + * @param value: The value to set + * @param bytes: The byte sequence to mutate + * + * @throws IndexOutOfBounds: When `index` is negative + * @throws IndexOutOfBounds: When `index + charSize` is greater than the bytes size, `charSize` is the number of bytes in the character ranging from 1 to 4 + * + * @example + * let bytes = Bytes.make(1) + * Bytes.setChar(0, 'a', bytes) + * assert Bytes.getChar(0, bytes) == 'a' + * + * @since v0.7.0 + */ +@unsafe +provide let setChar = (index: Number, value: Char, bytes: Bytes) => { + use WasmI32.{ (+) } + let ptr = WasmI32.fromGrain(bytes) + let size = getSize(ptr) + let offset = coerceNumberToWasmI32(index) + let usv = untagChar(value) + let charSize = usvEncodeLength(usv) + checkIndexIsInBounds(offset, charSize, size) + writeUtf8CodePoint(ptr + offset + _VALUE_OFFSET, usv) + ignore(bytes) +} + /** * Gets a signed 8-bit integer starting at the given byte index.
* @@ -452,8 +526,8 @@ provide let setInt8 = (index: Number, value: Int8, bytes: Bytes) => { let offset = coerceNumberToWasmI32(index) checkIndexIsInBounds(offset, _INT8_BYTE_SIZE, size) let v = untagInt8(value) - ignore(bytes) WasmI32.store8(ptr + offset, v, _VALUE_OFFSET) + ignore(bytes) } /** diff --git a/stdlib/bytes.md b/stdlib/bytes.md index 4313c071f..9f619cd5f 100644 --- a/stdlib/bytes.md +++ b/stdlib/bytes.md @@ -431,6 +431,85 @@ Bytes.clear(bytes) assert bytes == b"\x00\x00\x00\x00\x00" ``` +### Bytes.**getChar** + +
<details disabled> +<summary tabindex="-1">Added in <code>next</code></summary> +No other changes yet. +</details>
+ +```grain +getChar : (index: Number, bytes: Bytes) => Char +``` + +Gets the UTF-8 encoded character at the given byte index. + +Parameters: + +|param|type|description| +|-----|----|-----------| +|`index`|`Number`|The byte index to access| +|`bytes`|`Bytes`|The byte sequence to access| + +Returns: + +|type|description| +|----|-----------| +|`Char`|The character that starts at the given index| + +Throws: + +`IndexOutOfBounds` + +* When `index` is negative + +`MalformedUnicode` + +* When the requested character is not a valid UTF-8 sequence + +Examples: + +```grain +let bytes = Bytes.fromString("Hello") +assert Bytes.getChar(0, bytes) == 'H' +``` + +### Bytes.**setChar** + +
<details disabled> +<summary tabindex="-1">Added in <code>next</code></summary> +No other changes yet. +</details>
+ +```grain +setChar : (index: Number, value: Char, bytes: Bytes) => Void +``` + +UTF-8 encodes a character starting at the given byte index. + +Parameters: + +|param|type|description| +|-----|----|-----------| +|`index`|`Number`|The byte index to update| +|`value`|`Char`|The value to set| +|`bytes`|`Bytes`|The byte sequence to mutate| + +Throws: + +`IndexOutOfBounds` + +* When `index` is negative +* When `index + charSize` is greater than the bytes size, `charSize` is the number of bytes in the character ranging from 1 to 4 + +Examples: + +```grain +let bytes = Bytes.make(1) +Bytes.setChar(0, 'a', bytes) +assert Bytes.getChar(0, bytes) == 'a' +``` + ### Bytes.**getInt8**
diff --git a/stdlib/char.gr b/stdlib/char.gr index 0ca14c6ab..10ef6361b 100644 --- a/stdlib/char.gr +++ b/stdlib/char.gr @@ -16,8 +16,8 @@ module Char from "runtime/unsafe/wasmi32" include WasmI32 from "runtime/dataStructures" include DataStructures use DataStructures.{ tagSimpleNumber, tagChar, untagChar, allocateString } - -exception MalformedUtf8 +from "runtime/utf8" include Utf8 +use Utf8.{ usvEncodeLength, writeUtf8CodePoint } /** * The minimum valid Unicode scalar value. @@ -164,52 +164,12 @@ provide let pred = char => { */ @unsafe provide let toString = (char: Char) => { - use WasmI32.{ - (+), - (-), - (*), - (&), - (|), - (>>>), - ltU as (<), - gtU as (>), - leU as (<=), - } - + use WasmI32.{ (+) } let usv = untagChar(char) - - let result = if (usv < 0x80n) { - let string = allocateString(1n) - WasmI32.store8(string, usv, 8n) - WasmI32.toGrain(string): String - } else { - let mut count = 0n - let mut offset = 0n - if (usv <= 0x07FFn) { - count = 1n - offset = 0xC0n - } else if (usv <= 0xFFFFn) { - count = 2n - offset = 0xE0n - } else { - count = 3n - offset = 0xF0n - } - let string = allocateString(count + 1n) - WasmI32.store8(string, (usv >>> (6n * count)) + offset, 8n) - - let mut n = 0n - while (count > 0n) { - n += 1n - let temp = usv >>> (6n * (count - 1n)) - WasmI32.store8(string + n, 0x80n | temp & 0x3Fn, 8n) - count -= 1n - } - - WasmI32.toGrain(string): String - } - - result + let byteCount = usvEncodeLength(usv) + let string = allocateString(byteCount) + writeUtf8CodePoint(string + 8n, usv) + WasmI32.toGrain(string): String } /** diff --git a/stdlib/json.gr b/stdlib/json.gr index 9a8cbeea6..e5fc87fbe 100644 --- a/stdlib/json.gr +++ b/stdlib/json.gr @@ -17,6 +17,8 @@ from "runtime/bigint" include Bigint as BI from "runtime/dataStructures" include DataStructures from "runtime/numbers" include Numbers from "runtime/numberUtils" include NumberUtils +from "runtime/utf8" include Utf8 +use Utf8.{ getCodePoint } from "runtime/string" include String as 
RuntimeString from "runtime/unsafe/tags" include Tags from "runtime/unsafe/wasmi32" include WasmI32 @@ -28,7 +30,7 @@ from "char" include Char from "string" include String from "list" include List from "uint8" include Uint8 -use RuntimeString.{ toString as runtimeToString, getCodePoint } +use RuntimeString.{ toString as runtimeToString } use Numbers.{ coerceNumberToWasmI32 } use DataStructures.{ tagSimpleNumber, untagSimpleNumber } @@ -1321,6 +1323,7 @@ let rec readCodePoint = (bytePosition: Number, string: String) => { if (bytePositionW32 < byteSize) { let codePoint = getCodePoint(ptr) + tagSimpleNumber(codePoint) } else { _END_OF_INPUT @@ -2072,7 +2075,7 @@ provide let parse: (str: String) => Result = (str: String) /** * Utilities for accessing and updating JSON data. - * + * * @example * let obj = JsonObject([("x", JsonNumber(123))]) * assert get(property("x") ||> number, obj) == Some(123) @@ -2103,11 +2106,11 @@ provide module Lenses { /** * Reads the value focused on by the given lens from the input data. - * + * * @param lens: The lens to apply to the subject data * @param subject: The data which will have the lens applied to it * @returns `Some(data)` containing the data read by the lens if the lens matches the given data, or `None` if the data cannot be matched to the lens - * + * * @example assert get(number, JsonNumber(123)) == Some(123) * @example assert get(string, JsonString("abc")) == Some("abc") * @example assert get(number, JsonString("abc")) == None @@ -2117,12 +2120,12 @@ provide module Lenses { /** * Sets the value focused on by the given lens from the input data to the * desired new value. 
- * + * * @param lens: The lens to apply to the subject data * @param newValue: The new value to set at the focus of the lens * @param subject: The data which will have the lens applied to it * @returns `Some(data)` containing the new data after the lens substitution if the lens matches the given data, or `None` if the data cannot be matched to the lens - * + * * @example assert set(number, 123, JsonBoolean(true)) == Some(JsonNumber(123)) * @example assert set(property("a"), JsonNumber(123), JsonObject([("a", JsonNull)])) == Some(JsonObject([("a", JsonNumber(123))])) * @example assert set(property("a"), JsonNumber(123), JsonBoolean(true)) == None @@ -2132,12 +2135,12 @@ provide module Lenses { /** * Updates the value focused on by the given lens from the input data by * applying a function to it and setting the focus to the result of the function - * + * * @param lens: The lens to apply to the subject data * @param fn: The function to apply to the matched data at the lens if matched * @param subject: The data which will have the lens applied to it * @returns `Some(data)` containing the new data after the lens mapping has been applied if the lens matches the given data, or `None` if the data cannot be matched to the lens - * + * * @example assert map(number, x => x * 2, JsonNumber(5)) == Some(JsonNumber(10)) * @example * assert map(property("x"), x => JsonArray([x, x]), JsonObject([("x", JsonNumber(1))])) == @@ -2153,9 +2156,9 @@ provide module Lenses { /** * A lens whose focus is a JSON value. - * + * * @example assert get(json, JsonString("abc")) == Some(JsonString("abc")) - * + * * @since v0.7.0 */ provide let json = { @@ -2165,9 +2168,9 @@ provide module Lenses { /** * A lens whose focus is a JSON boolean value. - * + * * @example assert get(boolean, JsonBoolean(true)) == Some(true) - * + * * @since v0.7.0 */ provide let boolean = { @@ -2182,9 +2185,9 @@ provide module Lenses { /** * A lens whose focus is a JSON string value. 
- * + * * @example assert get(string, JsonString("abc")) == Some("abc") - * + * * @since v0.7.0 */ provide let string = { @@ -2199,9 +2202,9 @@ provide module Lenses { /** * A lens whose focus is a JSON number value. - * + * * @example assert get(number, JsonNumber(123)) == Some(123) - * + * * @since v0.7.0 */ provide let number = { @@ -2216,9 +2219,9 @@ provide module Lenses { /** * A lens whose focus is a JSON array. - * + * * @example assert get(array, JsonArray([JsonNumber(123)])) == Some([JsonNumber(123)]) - * + * * @since v0.7.0 */ provide let array = { @@ -2233,9 +2236,9 @@ provide module Lenses { /** * A lens whose focus is the property pair list of a JSON object. - * + * * @example assert get(objectProperties, JsonObject([("a", JsonNumber(123))])) == Some([("a", JsonNumber(123))]) - * + * * @since v0.7.0 */ provide let objectProperties = { @@ -2264,15 +2267,15 @@ provide module Lenses { /** * Creates a lens whose focus is a given property of a JSON object. - * + * * @param propertyName: The property name of the JSON object to focus on * @returns A lens whose focus is the given property of a JSON object - * + * * @example assert get(property("x"), JsonObject([("x", JsonNumber(123))])) == Some(JsonNumber(123)) * @example * assert set(property("x"), JsonString("new"), JsonObject([("x", JsonNumber(123))])) == * Some(JsonObject([("x", JsonString("new"))])) - * + * * @since v0.7.0 */ provide let property = propertyName => @@ -2298,12 +2301,12 @@ provide module Lenses { * the result will be enclosed in `Some`; if the lens does not match but the * value focused is null, then the lens will still successfully match and * `None` will be returned. 
- * + * * @example assert get(nullable(number), JsonNumber(123)) == Some(Some(123)) * @example assert get(nullable(number), JsonNull) == Some(None) * @example assert get(nullable(number), JsonString("abc")) == None * @example assert set(nullable(number), Some(123), JsonString("abc")) == Some(JsonNumber(123)) - * + * * @since v0.7.0 */ provide let nullable = lens => @@ -2331,16 +2334,16 @@ provide module Lenses { /** * Reverse lens composition. - * + * * @param lens1: The lens which will be applied first * @param lens2: The lens which will be applied second * @returns A lens which combines the two given lenses, passing through the first and then the second - * + * * @example assert get(property("x") ||> number, JsonObject([("x", JsonNumber(123))])) == Some(123) * @example * assert set(property("x") ||> string, "new", JsonObject([("x", JsonNumber(123))])) == * Some(JsonObject([("x", JsonString("new"))])) - * + * * @since v0.7.0 */ let (||>) = (lens1, lens2) => @@ -2361,14 +2364,14 @@ provide module Lenses { /** * Creates a lens whose focus is a given property path within a JSON object tree. 
- * + * * @param propertyNames: The property path of the JSON object to create a focus on * @returns A lens whose focus is the given property path of a JSON object - * + * * @example * let nestedObj = JsonObject([("a", JsonObject([("b", JsonNumber(123))]))]) * assert get(propertyPath(["a", "b"]), nestedObj) == Some(JsonNumber(123)) - * + * * @since v0.7.0 */ provide let propertyPath = propertyNames => { diff --git a/stdlib/runtime/string.gr b/stdlib/runtime/string.gr index a1836dacf..bbaec3675 100644 --- a/stdlib/runtime/string.gr +++ b/stdlib/runtime/string.gr @@ -27,7 +27,8 @@ from "runtime/bigint" include Bigint as BI from "runtime/unsafe/memory" include Memory from "runtime/unsafe/tags" include Tags from "runtime/numberUtils" include NumberUtils - +from "runtime/utf8" include Utf8 +use Utf8.{ usvEncodeLength, writeUtf8CodePoint } from "runtime/dataStructures" include DataStructures use DataStructures.{ allocateString, allocateArray, untagSimpleNumber } @@ -39,9 +40,6 @@ primitive (&&) = "@and" primitive (||) = "@or" primitive builtinId = "@builtin.id" primitive ignore = "@ignore" -primitive throw = "@throw" - -exception MalformedUnicode @unsafe primitive typeMetadata = "@heap.type_metadata" @@ -404,40 +402,6 @@ let escapeChar = (s: String) => { escape(WasmI32.fromGrain(s), false) } -@unsafe -let usvToString = usv => { - if (usv < 0x80n) { - let string = allocateString(1n) - WasmI32.store8(string, usv, 8n) - WasmI32.toGrain(string): String - } else { - let mut count = 0n - let mut offset = 0n - if (usv <= 0x07FFn) { - count = 1n - offset = 0xC0n - } else if (usv <= 0xFFFFn) { - count = 2n - offset = 0xE0n - } else { - count = 3n - offset = 0xF0n - } - let string = allocateString(count + 1n) - WasmI32.store8(string, (usv >>> (6n * count)) + offset, 8n) - - let mut n = 0n - while (count > 0n) { - n += 1n - let temp = usv >>> (6n * (count - 1n)) - WasmI32.store8(string + n, 0x80n | temp & 0x3Fn, 8n) - count -= 1n - } - - WasmI32.toGrain(string): String - } -} - 
@unsafe let reportCycle = (ptr, cycles) => { let mut cycleNum = vecFindIndex(cycles, ptr) @@ -707,7 +671,10 @@ and toStringHelp = (grainValue, extraIndents, toplevel, cycles) => { let shortVal = grainValue >> 8n let shortValTag = (grainValue & 0xF8n) >> 3n if (shortValTag == Tags._GRAIN_CHAR_SHORTVAL_TAG) { - let string = usvToString(shortVal) + let byteCount = usvEncodeLength(shortVal) + let string = allocateString(byteCount) + writeUtf8CodePoint(string + 8n, shortVal) + let string = WasmI32.toGrain(string): String if (toplevel) { string } else { @@ -892,54 +859,3 @@ provide let print = (value, suffix="\n") => { ignore(suffix) void } - -@unsafe -provide let getCodePoint = (ptr: WasmI32) => { - // Algorithm from https://encoding.spec.whatwg.org/#utf-8-decoder - use WasmI32.{ (+), (&), (|), (<<), leU as (<=), geU as (>=), (==) } - - let mut codePoint = 0n - let mut bytesSeen = 0n - let mut bytesNeeded = 0n - let mut lowerBoundary = 0x80n - let mut upperBoundary = 0xBFn - - let mut offset = 0n - - while (true) { - let byte = WasmI32.load8U(ptr + offset, 0n) - offset += 1n - if (bytesNeeded == 0n) { - if (byte >= 0x00n && byte <= 0x7Fn) { - return byte - } else if (byte >= 0xC2n && byte <= 0xDFn) { - bytesNeeded = 1n - codePoint = byte & 0x1Fn - } else if (byte >= 0xE0n && byte <= 0xEFn) { - if (byte == 0xE0n) lowerBoundary = 0xA0n - if (byte == 0xEDn) upperBoundary = 0x9Fn - bytesNeeded = 2n - codePoint = byte & 0xFn - } else if (byte >= 0xF0n && byte <= 0xF4n) { - if (byte == 0xF0n) lowerBoundary = 0x90n - if (byte == 0xF4n) upperBoundary = 0x8Fn - bytesNeeded = 3n - codePoint = byte & 0x7n - } else { - throw MalformedUnicode - } - continue - } - if (!(lowerBoundary <= byte && byte <= upperBoundary)) { - throw MalformedUnicode - } - lowerBoundary = 0x80n - upperBoundary = 0xBFn - codePoint = codePoint << 6n | byte & 0x3Fn - bytesSeen += 1n - if (bytesSeen == bytesNeeded) { - return codePoint - } - } - return 0n -} diff --git a/stdlib/runtime/string.md 
b/stdlib/runtime/string.md index 9769c7139..4f403457f 100644 --- a/stdlib/runtime/string.md +++ b/stdlib/runtime/string.md @@ -93,9 +93,3 @@ Parameters: |`value`|`a`|The operand| |`?suffix`|`String`|The string to print after the argument| -### String.**getCodePoint** - -```grain -getCodePoint : (ptr: WasmI32) => WasmI32 -``` - diff --git a/stdlib/runtime/utf8.gr b/stdlib/runtime/utf8.gr new file mode 100644 index 000000000..4ecf907cf --- /dev/null +++ b/stdlib/runtime/utf8.gr @@ -0,0 +1,176 @@ +@noPervasives +/** + * The `Utf8` module provides functions for working with UTF-8 encoded strings. + */ +module Utf8 + +primitive (!) = "@not" +primitive (&&) = "@and" +primitive throw = "@throw" + +from "runtime/unsafe/wasmi32" include WasmI32 + +/** + * An exception thrown when a string is not valid UTF-8. + */ +provide exception MalformedUnicode + +/** + * Returns the total number of bytes for a UTF-8 code point given the first byte. + * + * @param byte: The first byte of the UTF-8 code point + * + * @returns The number of bytes in the UTF-8 code point + */ +@unsafe +provide let utf8ByteCount = byte => { + use WasmI32.{ (&), (==) } + if ((byte & 0x80n) == 0x00n) { + 1n + } else if ((byte & 0xF0n) == 0xF0n) { + 4n + } else if ((byte & 0xE0n) == 0xE0n) { + 3n + } else { + 2n + } +} + +/** + * Returns the number of bytes required to encode the given USV as UTF-8. + * + * @param usv: The Unicode scalar value + * + * @returns The number of bytes required to encode the given USV as UTF-8 + */ +@unsafe +provide let usvEncodeLength = usv => { + use WasmI32.{ leU as (<=) } + if (usv <= 0x007Fn) { + 1n + } else if (usv <= 0x07FFn) { + 2n + } else if (usv <= 0xFFFFn) { + 3n + } else { + 4n + } +} + +/** + * Returns the Unicode code point of the encoded value at the given pointer.
+ * + * @param ptr: The pointer to the encoded value in memory + * + * @returns The Unicode code point of the encoded value at the given pointer + * + * @throws MalformedUnicode: if the encoded value is not a valid UTF-8 sequence + */ +@unsafe +provide let getCodePoint = (ptr: WasmI32) => { + // Algorithm from https://encoding.spec.whatwg.org/#utf-8-decoder + use WasmI32.{ (+), (&), (|), (<<), leU as (<=), geU as (>=), (==) } + + let mut codePoint = 0n + let mut bytesSeen = 0n + let mut bytesNeeded = 0n + let mut lowerBoundary = 0x80n + let mut upperBoundary = 0xBFn + + let mut offset = 0n + + while (true) { + let byte = WasmI32.load8U(ptr + offset, 0n) + offset += 1n + if (bytesNeeded == 0n) { + if (byte >= 0x00n && byte <= 0x7Fn) { + return byte + } else if (byte >= 0xC2n && byte <= 0xDFn) { + bytesNeeded = 1n + codePoint = byte & 0x1Fn + } else if (byte >= 0xE0n && byte <= 0xEFn) { + if (byte == 0xE0n) lowerBoundary = 0xA0n + if (byte == 0xEDn) upperBoundary = 0x9Fn + bytesNeeded = 2n + codePoint = byte & 0xFn + } else if (byte >= 0xF0n && byte <= 0xF4n) { + if (byte == 0xF0n) lowerBoundary = 0x90n + if (byte == 0xF4n) upperBoundary = 0x8Fn + bytesNeeded = 3n + codePoint = byte & 0x7n + } else { + throw MalformedUnicode + } + continue + } + if (!(lowerBoundary <= byte && byte <= upperBoundary)) { + throw MalformedUnicode + } + lowerBoundary = 0x80n + upperBoundary = 0xBFn + codePoint = codePoint << 6n | byte & 0x3Fn + bytesSeen += 1n + if (bytesSeen == bytesNeeded) { + return codePoint + } + } + return 0n +} + +/** + * Writes the given Unicode code point to the given pointer as encoded UTF-8. 
+ * + * @param ptr: The pointer to write the UTF-8 character to + * @param codePoint: The Unicode code point to write + * + * @returns The number of bytes written + */ +@unsafe +provide let writeUtf8CodePoint = (ptr, codePoint) => { + use WasmI32.{ (+), (-), (&), (|), (>>>), ltU as (<), leU as (<=), (==) } + if (codePoint <= 0x007Fn) { + // Code points in the ASCII range are written as just one byte with the + // leading bit equal to zero (0xxxxxxx). Just store the value as one byte + // directly. Note that the value is already guaranteed to start with most + // significant bit equal to zero because of the check in the if statement + // above, so there's no need to bit-mask it. + WasmI32.store8(ptr, codePoint, 0n) + 1n + } else if (codePoint <= 0x07FFn) { + // Code points in the range 0x0080..0x07FF are written as two bytes. + // The first byte has a three bit prefix of 110, followed by 5 bits of the + // codepoint. The second byte has a two bit prefix of 10, followed by 6 bits + // of the codepoint. + let high = codePoint >>> 6n & 0b000_11111n | 0b110_00000n + let low = codePoint & 0b00_111111n | 0b10_000000n + WasmI32.store8(ptr, high, 0n) + WasmI32.store8(ptr, low, 1n) + 2n + } else if (codePoint <= 0xFFFFn) { + // Code points in the range 0x0800..0xFFFF are written as three bytes. + // The first byte has a four bit prefix of 1110, followed by 4 bits of the + // codepoint. Remaining bytes each have a two bit prefix of 10, followed by + // 6 bits of the codepoint. + let high = codePoint >>> 12n & 0b0000_1111n | 0b1110_0000n + let mid = codePoint >>> 6n & 0b00_111111n | 0b10_000000n + let low = codePoint & 0b00_111111n | 0b10_000000n + WasmI32.store8(ptr, high, 0n) + WasmI32.store8(ptr, mid, 1n) + WasmI32.store8(ptr, low, 2n) + 3n + } else { + // Code points in the range 0x10000..0x10FFFF are written as four bytes. + // The first byte has a five bit prefix of 11110, followed by 3 bits of the + // codepoint. 
Remaining bytes each have a two bit prefix of 10, followed by + // 6 bits of the codepoint. + let high = codePoint >>> 18n & 0b00000_111n | 0b11110_000n + let mid1 = codePoint >>> 12n & 0b00_111111n | 0b10_000000n + let mid2 = codePoint >>> 6n & 0b00_111111n | 0b10_000000n + let low = codePoint & 0b00_111111n | 0b10_000000n + WasmI32.store8(ptr, high, 0n) + WasmI32.store8(ptr, mid1, 1n) + WasmI32.store8(ptr, mid2, 2n) + WasmI32.store8(ptr, low, 3n) + 4n + } +} diff --git a/stdlib/runtime/utf8.md b/stdlib/runtime/utf8.md new file mode 100644 index 000000000..6337940bb --- /dev/null +++ b/stdlib/runtime/utf8.md @@ -0,0 +1,97 @@ +--- +title: Utf8 +--- + +The `Utf8` module provides functions for working with UTF-8 encoded strings. + +## Values + +Functions and constants included in the Utf8 module. + +### Utf8.**utf8ByteCount** + +```grain +utf8ByteCount : (byte: WasmI32) => WasmI32 +``` + +Returns the total number of bytes for a UTF-8 code point given the first byte. + +Parameters: + +|param|type|description| +|-----|----|-----------| +|`byte`|`WasmI32`|The first byte of the UTF-8 code point| + +Returns: + +|type|description| +|----|-----------| +|`WasmI32`|The number of bytes in the UTF-8 code point| + +### Utf8.**usvEncodeLength** + +```grain +usvEncodeLength : (usv: WasmI32) => WasmI32 +``` + +Returns the number of bytes required to encode the given USV as UTF-8. + +Parameters: + +|param|type|description| +|-----|----|-----------| +|`usv`|`WasmI32`|The Unicode scalar value| + +Returns: + +|type|description| +|----|-----------| +|`WasmI32`|The number of bytes required to encode the given USV as UTF-8| + +### Utf8.**getCodePoint** + +```grain +getCodePoint : (ptr: WasmI32) => WasmI32 +``` + +Returns the Unicode code point of the encoded value at the given pointer. 
+ +Parameters: + +|param|type|description| +|-----|----|-----------| +|`ptr`|`WasmI32`|The pointer to the encoded value in memory| + +Returns: + +|type|description| +|----|-----------| +|`WasmI32`|The Unicode code point of the encoded value at the given pointer| + +Throws: + +`MalformedUnicode` + +* if the encoded value is not a valid UTF-8 sequence + +### Utf8.**writeUtf8CodePoint** + +```grain +writeUtf8CodePoint : (ptr: WasmI32, codePoint: WasmI32) => WasmI32 +``` + +Writes the given Unicode code point to the given pointer as encoded UTF-8. + +Parameters: + +|param|type|description| +|-----|----|-----------| +|`ptr`|`WasmI32`|The pointer to write the UTF-8 character to| +|`codePoint`|`WasmI32`|The Unicode code point to write| + +Returns: + +|type|description| +|----|-----------| +|`WasmI32`|The number of bytes written| + diff --git a/stdlib/string.gr b/stdlib/string.gr index c4dd188ae..6aa3b5a7e 100644 --- a/stdlib/string.gr +++ b/stdlib/string.gr @@ -24,6 +24,8 @@ use DataStructures.{ allocateString, allocateBytes, } +from "runtime/utf8" include Utf8 +use Utf8.{ utf8ByteCount, usvEncodeLength, getCodePoint, writeUtf8CodePoint } /** * Byte encodings @@ -153,15 +155,7 @@ provide let indexOf = (search: String, string: String) => { } idx += 1n let byte = WasmI32.load8U(ptr, 0n) - if ((byte & 0x80n) == 0x00n) { - ptr += 1n - } else if ((byte & 0xF0n) == 0xF0n) { - ptr += 4n - } else if ((byte & 0xE0n) == 0xE0n) { - ptr += 3n - } else { - ptr += 2n - } + ptr += utf8ByteCount(byte) } ignore(search) @@ -220,57 +214,6 @@ provide let lastIndexOf = (search: String, string: String) => { return None } -@unsafe -let getCodePoint = (ptr: WasmI32) => { - // Algorithm from https://encoding.spec.whatwg.org/#utf-8-decoder - use WasmI32.{ (+), (&), (|), (<<), leU as (<=), geU as (>=), (==) } - - let mut codePoint = 0n - let mut bytesSeen = 0n - let mut bytesNeeded = 0n - let mut lowerBoundary = 0x80n - let mut upperBoundary = 0xBFn - - let mut offset = 0n - - while (true) { - let 
byte = WasmI32.load8U(ptr + offset, 0n) - offset += 1n - if (bytesNeeded == 0n) { - if (byte >= 0x00n && byte <= 0x7Fn) { - return byte - } else if (byte >= 0xC2n && byte <= 0xDFn) { - bytesNeeded = 1n - codePoint = byte & 0x1Fn - } else if (byte >= 0xE0n && byte <= 0xEFn) { - if (byte == 0xE0n) lowerBoundary = 0xA0n - if (byte == 0xEDn) upperBoundary = 0x9Fn - bytesNeeded = 2n - codePoint = byte & 0xFn - } else if (byte >= 0xF0n && byte <= 0xF4n) { - if (byte == 0xF0n) lowerBoundary = 0x90n - if (byte == 0xF4n) upperBoundary = 0x8Fn - bytesNeeded = 3n - codePoint = byte & 0x7n - } else { - throw MalformedUnicode - } - continue - } - if (!(lowerBoundary <= byte && byte <= upperBoundary)) { - throw MalformedUnicode - } - lowerBoundary = 0x80n - upperBoundary = 0xBFn - codePoint = codePoint << 6n | byte & 0x3Fn - bytesSeen += 1n - if (bytesSeen == bytesNeeded) { - return codePoint - } - } - return 0n -} - @unsafe let charAtHelp = (position, string: String) => { if (length(string) <= position || position < 0) { @@ -289,17 +232,8 @@ let charAtHelp = (position, string: String) => { return getCodePoint(ptr) } let byte = WasmI32.load8U(ptr, 0n) - let n = if ((byte & 0x80n) == 0x00n) { - 1n - } else if ((byte & 0xF0n) == 0xF0n) { - 4n - } else if ((byte & 0xE0n) == 0xE0n) { - 3n - } else { - 2n - } + ptr += utf8ByteCount(byte) counter += 1n - ptr += n } ignore(string) @@ -362,15 +296,7 @@ let explodeHelp = (s: String, chars) => { while (ptr < end) { let byte = WasmI32.load8U(ptr, 0n) - let n = if ((byte & 0x80n) == 0x00n) { - 1n - } else if ((byte & 0xF0n) == 0xF0n) { - 4n - } else if ((byte & 0xE0n) == 0xE0n) { - 3n - } else { - 2n - } + let n = utf8ByteCount(byte) let c = if (chars) { WasmI32.fromGrain(tagChar(getCodePoint(ptr))) @@ -438,18 +364,7 @@ provide let implode = (arr: Array) => { for (let mut i = 0n; i < arrLength; i += 1n) { let usv = untagChar(arr[tagSimpleNumber(i)]) - - let n = if (usv <= 0x7Fn) { - 1n - } else if (usv <= 0x07FFn) { - 2n - } else if (usv <= 
0xFFFFn) { - 3n - } else { - 4n - } - - stringByteLength += n + stringByteLength += usvEncodeLength(usv) } let str = allocateString(stringByteLength) @@ -457,33 +372,7 @@ provide let implode = (arr: Array) => { for (let mut i = 0n; i < arrLength; i += 1n) { let usv = untagChar(arr[tagSimpleNumber(i)]) - - if (usv < 0x7Fn) { - WasmI32.store8(str + offset, usv, 0n) - offset += 1n - } else { - let mut count = 0n - let mut marker = 0n - if (usv <= 0x07FFn) { - count = 1n - marker = 0xC0n - } else if (usv <= 0xFFFFn) { - count = 2n - marker = 0xE0n - } else { - count = 3n - marker = 0xF0n - } - WasmI32.store8(str + offset, (usv >>> (6n * count)) + marker, 0n) - offset += 1n - - while (count > 0n) { - let temp = usv >>> (6n * (count - 1n)) - WasmI32.store8(str + offset, 0x80n | temp & 0x3Fn, 0n) - count -= 1n - offset += 1n - } - } + offset += writeUtf8CodePoint(str + offset, usv) } WasmI32.toGrain(str): String @@ -553,15 +442,7 @@ provide let split = (separator: String, string: String) => { numStrings += 1n } let byte = WasmI32.load8U(ptr, 0n) - if ((byte & 0x80n) == 0x00n) { - ptr += 1n - } else if ((byte & 0xF0n) == 0xF0n) { - ptr += 4n - } else if ((byte & 0xE0n) == 0xE0n) { - ptr += 3n - } else { - ptr += 2n - } + ptr += utf8ByteCount(byte) } ptr = stringPtr + 8n @@ -581,15 +462,7 @@ provide let split = (separator: String, string: String) => { continue } let byte = WasmI32.load8U(ptr, 0n) - if ((byte & 0x80n) == 0x00n) { - ptr += 1n - } else if ((byte & 0xF0n) == 0xF0n) { - ptr += 4n - } else if ((byte & 0xE0n) == 0xE0n) { - ptr += 3n - } else { - ptr += 2n - } + ptr += utf8ByteCount(byte) } // Grab last string @@ -1122,15 +995,7 @@ let utf16Length = (s: String) => { while (ptr < end) { let byte = WasmI32.load8U(ptr, 0n) - let n = if ((byte & 0x80n) == 0x00n) { - 1n - } else if ((byte & 0xF0n) == 0xF0n) { - 4n - } else if ((byte & 0xE0n) == 0xE0n) { - 3n - } else { - 2n - } + let n = utf8ByteCount(byte) if (n == 4n) { size += 2n } else { @@ -1246,15 +1111,7 @@ let 
encodeAtHelp = ( while (ptr < end) { let byte = WasmI32.load8U(ptr, 0n) // number of bytes spanning this UTF-8-encoded scalar value - let n = if ((byte & 0x80n) == 0x00n) { - 1n - } else if ((byte & 0xF0n) == 0xF0n) { - 4n - } else if ((byte & 0xE0n) == 0xE0n) { - 3n - } else { - 2n - } + let n = utf8ByteCount(byte) match (encoding) { UTF8 => { // With the optimization above for bulk memory copy, this match @@ -1458,56 +1315,6 @@ provide let encode = (string: String, encoding: Encoding, includeBom=false) => { // Byte->String decoding and helper functions: -@unsafe -let writeUtf8CodePoint = (ptr, codePoint) => { - use WasmI32.{ (+), (-), (&), (|), (>>>), ltU as (<), leU as (<=), (==) } - if (codePoint <= 0x007Fn) { - // Code points in the ASCII range are written as just one byte with the - // leading bit equal to zero (0xxxxxxx). Just store the value as one byte - // directly. Note that the value is already guaranteed to start with most - // significant bit equal to zero because of the check in the if statement - // above, so there's no need to bit-mask it. - WasmI32.store8(ptr, codePoint, 0n) - 1n - } else if (codePoint <= 0x07FFn) { - // Code points in the range 0x0080..0x07FF are written as two bytes. - // The first byte has a three bit prefix of 110, followed by 5 bits of the - // codepoint. The second byte has a two bit prefix of 10, followed by 6 bits - // of the codepoint. - let high = codePoint >>> 6n & 0b000_11111n | 0b110_00000n - let low = codePoint & 0b00_111111n | 0b10_000000n - WasmI32.store8(ptr, high, 0n) - WasmI32.store8(ptr + 1n, low, 0n) - 2n - } else if (codePoint <= 0xFFFFn) { - // Code points in the range 0x0800..0xFFFF are written as three bytes. - // The first byte has a four bit prefix of 1110, followed by 4 bits of the - // codepoint. Remaining bytes each have a two bit prefix of 10, followed by - // 6 bits of the codepoint. 
- let high = codePoint >>> 12n & 0b0000_1111n | 0b1110_0000n - let mid = codePoint >>> 6n & 0b00_111111n | 0b10_000000n - let low = codePoint & 0b00_111111n | 0b10_000000n - WasmI32.store8(ptr, high, 0n) - WasmI32.store8(ptr + 1n, mid, 0n) - WasmI32.store8(ptr + 2n, low, 0n) - 3n - } else { - // Code points in the range 0x10000..0x10FFFF are written as four bytes. - // The first byte has a five bit prefix of 11110, followed by 3 bits of the - // codepoint. Remaining bytes each have a two bit prefix of 10, followed by - // 6 bits of the codepoint. - let high = codePoint >>> 18n & 0b00000_111n | 0b11110_000n - let mid1 = codePoint >>> 12n & 0b00_111111n | 0b10_000000n - let mid2 = codePoint >>> 6n & 0b00_111111n | 0b10_000000n - let low = codePoint & 0b00_111111n | 0b10_000000n - WasmI32.store8(ptr, high, 0n) - WasmI32.store8(ptr + 1n, mid1, 0n) - WasmI32.store8(ptr + 2n, mid2, 0n) - WasmI32.store8(ptr + 3n, low, 0n) - 4n - } -} - @unsafe let bytesHaveBom = (bytes: Bytes, encoding: Encoding, start: WasmI32) => { use WasmI32.{ (+), geU as (>=), (==) } @@ -1945,15 +1752,7 @@ provide let forEachCodePoint = (fn: Number => Void, str: String) => { while (ptr < end) { let byte = WasmI32.load8U(ptr, 0n) - let codePointByteCount = if ((byte & 0x80n) == 0x00n) { - 1n - } else if ((byte & 0xF0n) == 0xF0n) { - 4n - } else if ((byte & 0xE0n) == 0xE0n) { - 3n - } else { - 2n - } + let codePointByteCount = utf8ByteCount(byte) // Note that even if up to 4 bytes are needed to represent Unicode // codepoints, this doesn't mean 32 bits. 
The highest allowed code point is @@ -1999,15 +1798,7 @@ provide let forEachCodePointi = (fn: (Number, Number) => Void, str: String) => { let mut idx = 0n while (ptr < end) { let byte = WasmI32.load8U(ptr, 0n) - let codePointByteCount = if ((byte & 0x80n) == 0x00n) { - 1n - } else if ((byte & 0xF0n) == 0xF0n) { - 4n - } else if ((byte & 0xE0n) == 0xE0n) { - 3n - } else { - 2n - } + let codePointByteCount = utf8ByteCount(byte) // Note that even if up to 4 bytes are needed to represent Unicode // codepoints, this doesn't mean 32 bits. The highest allowed code point is @@ -2051,15 +1842,7 @@ provide let forEachChar = (fn: Char => Void, str: String) => { while (ptr < end) { let byte = WasmI32.load8U(ptr, 0n) - let codePointByteCount = if ((byte & 0x80n) == 0x00n) { - 1n - } else if ((byte & 0xF0n) == 0xF0n) { - 4n - } else if ((byte & 0xE0n) == 0xE0n) { - 3n - } else { - 2n - } + let codePointByteCount = utf8ByteCount(byte) // Note that even if up to 4 bytes are needed to represent Unicode // codepoints, this doesn't mean 32 bits. The highest allowed code point is @@ -2102,15 +1885,7 @@ provide let forEachChari = (fn: (Char, Number) => Void, str: String) => { let mut idx = 0n while (ptr < end) { let byte = WasmI32.load8U(ptr, 0n) - let codePointByteCount = if ((byte & 0x80n) == 0x00n) { - 1n - } else if ((byte & 0xF0n) == 0xF0n) { - 4n - } else if ((byte & 0xE0n) == 0xE0n) { - 3n - } else { - 2n - } + let codePointByteCount = utf8ByteCount(byte) // Note that even if up to 4 bytes are needed to represent Unicode // codepoints, this doesn't mean 32 bits. The highest allowed code point is