diff --git a/compiler/test/stdlib/bytes.test.gr b/compiler/test/stdlib/bytes.test.gr
index 6461c2126..d4f496599 100644
--- a/compiler/test/stdlib/bytes.test.gr
+++ b/compiler/test/stdlib/bytes.test.gr
@@ -17,6 +17,25 @@ assert Bytes.length(Bytes.empty) == 0
let bytes = Bytes.make(64)
assert Bytes.length(bytes) == 64
+// Bytes.getChar
+// Indices are byte offsets, not character positions. In the UTF-8 encoding of
+// the fixture 'a' is byte 0, 'b' is byte 1, '©' spans bytes 2-3, '✨' spans
+// bytes 4-6 and '🍞' spans bytes 7-10.
+let bytes = Bytes.fromString("ab©✨🍞")
+assert Bytes.getChar(0, bytes) == 'a'
+assert Bytes.getChar(1, bytes) == 'b'
+assert Bytes.getChar(2, bytes) == '©'
+assert Bytes.getChar(4, bytes) == '✨'
+assert Bytes.getChar(7, bytes) == '🍞'
+
+// Bytes.setChar
+// Writes a 1-, 2-, 3- and 4-byte character at byte offsets 0, 1, 3 and 7 and
+// reads each one back from the offset it was written to.
+let bytes = Bytes.make(16)
+Bytes.setChar(0, 'a', bytes)
+assert Bytes.getChar(0, bytes) == 'a'
+Bytes.setChar(1, '©', bytes)
+assert Bytes.getChar(1, bytes) == '©'
+Bytes.setChar(3, '✨', bytes)
+assert Bytes.getChar(3, bytes) == '✨'
+Bytes.setChar(7, '🍞', bytes)
+assert Bytes.getChar(7, bytes) == '🍞'
+
// Bytes.setInt8, Bytes.setUint8, Bytes.getInt8, Bytes.getUint8
let bytes = Bytes.make(1)
Bytes.setInt8(0, 0xffs, bytes)
diff --git a/stdlib/buffer.gr b/stdlib/buffer.gr
index 9a0bb36aa..3abf99aaf 100644
--- a/stdlib/buffer.gr
+++ b/stdlib/buffer.gr
@@ -16,13 +16,15 @@ from "runtime/unsafe/wasmi32" include WasmI32
from "runtime/unsafe/conv" include Conv
from "runtime/exception" include Exception
from "runtime/dataStructures" include DataStructures
-use DataStructures.{ untagChar }
+use DataStructures.{ untagChar, tagSimpleNumber }
from "int32" include Int32
from "bytes" include Bytes
from "string" include String
from "char" include Char
from "runtime/numbers" include Numbers
use Numbers.{ coerceNumberToWasmI32 }
+from "runtime/utf8" include Utf8
+use Utf8.{ usvEncodeLength }
abstract record Buffer {
mut len: Number,
@@ -389,48 +391,12 @@ provide let addString = (string, buffer) => {
*/
@unsafe
provide let addChar = (char, buffer) => {
- use WasmI32.{ (-), (*), (&), (|), (>>>), ltU as (<), gtU as (>), leU as (<=) }
let usv = untagChar(char)
-
- let bytelen = if (usv < 0x80n) {
- autogrow(1, buffer)
- use WasmI32.{ (+) }
- let off = coerceNumberToWasmI32(buffer.len)
- let dst = WasmI32.fromGrain(buffer.data) + _VALUE_OFFSET
- WasmI32.store8(dst, usv, off)
- 1
- } else {
- let mut count = 0n
- let mut bytelen = 0
- let mut offset = 0n
- if (usv <= 0x07FFn) {
- count = 1n
- bytelen = 2
- offset = 0xC0n
- } else if (usv <= 0xFFFFn) {
- count = 2n
- bytelen = 3
- offset = 0xE0n
- } else {
- count = 3n
- bytelen = 4
- offset = 0xF0n
- }
- use WasmI32.{ (+) }
- autogrow(bytelen, buffer)
- let off = coerceNumberToWasmI32(buffer.len)
- let dst = WasmI32.fromGrain(buffer.data) + _VALUE_OFFSET
- WasmI32.store8(dst, (usv >>> (6n * count)) + offset, off)
- let mut n = 0n
- while (count > 0n) {
- n += 1n
- let temp = usv >>> (6n * (count - 1n))
- WasmI32.store8(dst + n, 0x80n | temp & 0x3Fn, off)
- count -= 1n
- }
- bytelen
- }
- buffer.len += bytelen
+ // Encoded size of the character (1 to 4 bytes), tagged as a Grain Number so
+ // it can be used with `autogrow` and `buffer.len`.
+ let byteCount = tagSimpleNumber(usvEncodeLength(usv))
+ // NOTE(review): presumably makes room for `byteCount` more bytes in
+ // `buffer.data` - confirm against `autogrow`.
+ autogrow(byteCount, buffer)
+ // Remember the current end of the buffer as the write position before the
+ // length is advanced past the new character.
+ let index = buffer.len
+ buffer.len += byteCount
+ // The UTF-8 encoding itself is delegated to `Bytes.setChar`.
+ Bytes.setChar(index, char, buffer.data)
}
/**
diff --git a/stdlib/bytes.gr b/stdlib/bytes.gr
index f63e7f246..e5ee3afb7 100644
--- a/stdlib/bytes.gr
+++ b/stdlib/bytes.gr
@@ -18,10 +18,12 @@ from "runtime/unsafe/wasmf64" include WasmF64
from "runtime/unsafe/conv" include Conv
from "runtime/dataStructures" include DataStructures
use DataStructures.{
+ tagChar,
tagInt8,
tagUint8,
tagInt16,
tagUint16,
+ untagChar,
untagInt8,
untagUint8,
untagInt16,
@@ -33,6 +35,14 @@ from "runtime/exception" include Exception
from "int32" include Int32
from "runtime/numbers" include Numbers
use Numbers.{ coerceNumberToWasmI32 }
+from "runtime/utf8" include Utf8
+use Utf8.{
+ utf8ByteCount,
+ getCodePoint,
+ usvEncodeLength,
+ writeUtf8CodePoint,
+ exception MalformedUnicode,
+}
@unsafe
let _SIZE_OFFSET = 4n
@@ -396,6 +406,70 @@ provide let clear = (bytes: Bytes) => {
ignore(bytes)
}
+/**
+ * Gets the UTF-8 encoded character at the given byte index.
+ *
+ * @param index: The byte index to access
+ * @param bytes: The byte sequence to access
+ * @returns The character that starts at the given index
+ *
+ * @throws IndexOutOfBounds: When `index` is negative
+ * @throws IndexOutOfBounds: When `index + 1` is greater than the bytes size
+ * @throws MalformedUnicode: When the requested character is not a valid UTF-8 sequence
+ *
+ * @example
+ * let bytes = Bytes.fromString("Hello")
+ * assert Bytes.getChar(0, bytes) == 'H'
+ *
+ * @since v0.7.0
+ */
+@unsafe
+provide let getChar = (index: Number, bytes: Bytes) => {
+  use WasmI32.{ (+), (>) }
+  let ptr = WasmI32.fromGrain(bytes)
+  let size = getSize(ptr)
+  let offset = coerceNumberToWasmI32(index)
+  // Only the first byte has to be in range at this point; the full length of
+  // the character is not known until that byte has been inspected.
+  checkIndexIsInBounds(offset, 1n, size)
+  let byte = WasmI32.load8U(ptr + offset, _VALUE_OFFSET)
+  let charSize = utf8ByteCount(byte)
+  // A sequence that would extend past the end of the data is rejected before
+  // decoding, since `getCodePoint` reads from a raw pointer without a length.
+  if (offset + charSize > size) {
+    throw MalformedUnicode
+  }
+  let codePoint = getCodePoint(ptr + offset + _VALUE_OFFSET)
+  // Reference `bytes` after the last raw-pointer read, as the other accessors
+  // in this module do.
+  ignore(bytes)
+  tagChar(codePoint)
+}
+
+/**
+ * UTF-8 encodes a character starting at the given byte index.
+ *
+ * @param index: The byte index to update
+ * @param value: The value to set
+ * @param bytes: The byte sequence to mutate
+ *
+ * @throws IndexOutOfBounds: When `index` is negative
+ * @throws IndexOutOfBounds: When `index + charSize` is greater than the bytes size, `charSize` is the number of bytes in the character ranging from 1 to 4
+ *
+ * @example
+ * let bytes = Bytes.make(1)
+ * Bytes.setChar(0, 'a', bytes)
+ * assert Bytes.getChar(0, bytes) == 'a'
+ *
+ * @since v0.7.0
+ */
+@unsafe
+provide let setChar = (index: Number, value: Char, bytes: Bytes) => {
+  use WasmI32.{ (+) }
+  let bytesPtr = WasmI32.fromGrain(bytes)
+  let start = coerceNumberToWasmI32(index)
+  let scalar = untagChar(value)
+  // The whole encoded character has to fit, so the bounds check uses its
+  // encoded length rather than a single byte.
+  checkIndexIsInBounds(start, usvEncodeLength(scalar), getSize(bytesPtr))
+  writeUtf8CodePoint(bytesPtr + start + _VALUE_OFFSET, scalar)
+  // Reference `bytes` after the raw-pointer write has completed.
+  ignore(bytes)
+}
+
/**
* Gets a signed 8-bit integer starting at the given byte index.
*
@@ -452,8 +526,8 @@ provide let setInt8 = (index: Number, value: Int8, bytes: Bytes) => {
let offset = coerceNumberToWasmI32(index)
checkIndexIsInBounds(offset, _INT8_BYTE_SIZE, size)
let v = untagInt8(value)
- ignore(bytes)
WasmI32.store8(ptr + offset, v, _VALUE_OFFSET)
+ ignore(bytes)
}
/**
diff --git a/stdlib/bytes.md b/stdlib/bytes.md
index 4313c071f..9f619cd5f 100644
--- a/stdlib/bytes.md
+++ b/stdlib/bytes.md
@@ -431,6 +431,85 @@ Bytes.clear(bytes)
assert bytes == b"\x00\x00\x00\x00\x00"
```
+### Bytes.**getChar**
+
+<details disabled>
+<summary tabindex="-1">Added in <code>next</code></summary>
+No other changes yet.
+</details>
+
+```grain
+getChar : (index: Number, bytes: Bytes) => Char
+```
+
+Gets the UTF-8 encoded character at the given byte index.
+
+Parameters:
+
+|param|type|description|
+|-----|----|-----------|
+|`index`|`Number`|The byte index to access|
+|`bytes`|`Bytes`|The byte sequence to access|
+
+Returns:
+
+|type|description|
+|----|-----------|
+|`Char`|The character that starts at the given index|
+
+Throws:
+
+`IndexOutOfBounds`
+
+* When `index` is negative
+* When `index + 1` is greater than the bytes size
+
+`MalformedUnicode`
+
+* When the requested character is not a valid UTF-8 sequence
+
+Examples:
+
+```grain
+let bytes = Bytes.fromString("Hello")
+assert Bytes.getChar(0, bytes) == 'H'
+```
+
+### Bytes.**setChar**
+
+<details disabled>
+<summary tabindex="-1">Added in <code>next</code></summary>
+No other changes yet.
+</details>
+
+```grain
+setChar : (index: Number, value: Char, bytes: Bytes) => Void
+```
+
+UTF-8 encodes a character starting at the given byte index.
+
+Parameters:
+
+|param|type|description|
+|-----|----|-----------|
+|`index`|`Number`|The byte index to update|
+|`value`|`Char`|The value to set|
+|`bytes`|`Bytes`|The byte sequence to mutate|
+
+Throws:
+
+`IndexOutOfBounds`
+
+* When `index` is negative
+* When `index + charSize` is greater than the bytes size, `charSize` is the number of bytes in the character ranging from 1 to 4
+
+Examples:
+
+```grain
+let bytes = Bytes.make(1)
+Bytes.setChar(0, 'a', bytes)
+assert Bytes.getChar(0, bytes) == 'a'
+```
+
### Bytes.**getInt8**
diff --git a/stdlib/char.gr b/stdlib/char.gr
index 0ca14c6ab..10ef6361b 100644
--- a/stdlib/char.gr
+++ b/stdlib/char.gr
@@ -16,8 +16,8 @@ module Char
from "runtime/unsafe/wasmi32" include WasmI32
from "runtime/dataStructures" include DataStructures
use DataStructures.{ tagSimpleNumber, tagChar, untagChar, allocateString }
-
-exception MalformedUtf8
+from "runtime/utf8" include Utf8
+use Utf8.{ usvEncodeLength, writeUtf8CodePoint }
/**
* The minimum valid Unicode scalar value.
@@ -164,52 +164,12 @@ provide let pred = char => {
*/
@unsafe
provide let toString = (char: Char) => {
- use WasmI32.{
- (+),
- (-),
- (*),
- (&),
- (|),
- (>>>),
- ltU as (<),
- gtU as (>),
- leU as (<=),
- }
-
+ use WasmI32.{ (+) }
let usv = untagChar(char)
-
- let result = if (usv < 0x80n) {
- let string = allocateString(1n)
- WasmI32.store8(string, usv, 8n)
- WasmI32.toGrain(string): String
- } else {
- let mut count = 0n
- let mut offset = 0n
- if (usv <= 0x07FFn) {
- count = 1n
- offset = 0xC0n
- } else if (usv <= 0xFFFFn) {
- count = 2n
- offset = 0xE0n
- } else {
- count = 3n
- offset = 0xF0n
- }
- let string = allocateString(count + 1n)
- WasmI32.store8(string, (usv >>> (6n * count)) + offset, 8n)
-
- let mut n = 0n
- while (count > 0n) {
- n += 1n
- let temp = usv >>> (6n * (count - 1n))
- WasmI32.store8(string + n, 0x80n | temp & 0x3Fn, 8n)
- count -= 1n
- }
-
- WasmI32.toGrain(string): String
- }
-
- result
+ // Allocate a string exactly as long as the character's UTF-8 encoding
+ // (1 to 4 bytes).
+ let byteCount = usvEncodeLength(usv)
+ let string = allocateString(byteCount)
+ // The encoded bytes go at `string + 8n`, the same 8-byte offset the replaced
+ // code passed to `WasmI32.store8`.
+ writeUtf8CodePoint(string + 8n, usv)
+ WasmI32.toGrain(string): String
}
/**
diff --git a/stdlib/json.gr b/stdlib/json.gr
index 9a8cbeea6..e5fc87fbe 100644
--- a/stdlib/json.gr
+++ b/stdlib/json.gr
@@ -17,6 +17,8 @@ from "runtime/bigint" include Bigint as BI
from "runtime/dataStructures" include DataStructures
from "runtime/numbers" include Numbers
from "runtime/numberUtils" include NumberUtils
+from "runtime/utf8" include Utf8
+use Utf8.{ getCodePoint }
from "runtime/string" include String as RuntimeString
from "runtime/unsafe/tags" include Tags
from "runtime/unsafe/wasmi32" include WasmI32
@@ -28,7 +30,7 @@ from "char" include Char
from "string" include String
from "list" include List
from "uint8" include Uint8
-use RuntimeString.{ toString as runtimeToString, getCodePoint }
+use RuntimeString.{ toString as runtimeToString }
use Numbers.{ coerceNumberToWasmI32 }
use DataStructures.{ tagSimpleNumber, untagSimpleNumber }
@@ -1321,6 +1323,7 @@ let rec readCodePoint = (bytePosition: Number, string: String) => {
if (bytePositionW32 < byteSize) {
let codePoint = getCodePoint(ptr)
+
tagSimpleNumber(codePoint)
} else {
_END_OF_INPUT
@@ -2072,7 +2075,7 @@ provide let parse: (str: String) => Result = (str: String)
/**
* Utilities for accessing and updating JSON data.
- *
+ *
* @example
* let obj = JsonObject([("x", JsonNumber(123))])
* assert get(property("x") ||> number, obj) == Some(123)
@@ -2103,11 +2106,11 @@ provide module Lenses {
/**
* Reads the value focused on by the given lens from the input data.
- *
+ *
* @param lens: The lens to apply to the subject data
* @param subject: The data which will have the lens applied to it
* @returns `Some(data)` containing the data read by the lens if the lens matches the given data, or `None` if the data cannot be matched to the lens
- *
+ *
* @example assert get(number, JsonNumber(123)) == Some(123)
* @example assert get(string, JsonString("abc")) == Some("abc")
* @example assert get(number, JsonString("abc")) == None
@@ -2117,12 +2120,12 @@ provide module Lenses {
/**
* Sets the value focused on by the given lens from the input data to the
* desired new value.
- *
+ *
* @param lens: The lens to apply to the subject data
* @param newValue: The new value to set at the focus of the lens
* @param subject: The data which will have the lens applied to it
* @returns `Some(data)` containing the new data after the lens substitution if the lens matches the given data, or `None` if the data cannot be matched to the lens
- *
+ *
* @example assert set(number, 123, JsonBoolean(true)) == Some(JsonNumber(123))
* @example assert set(property("a"), JsonNumber(123), JsonObject([("a", JsonNull)])) == Some(JsonObject([("a", JsonNumber(123))]))
* @example assert set(property("a"), JsonNumber(123), JsonBoolean(true)) == None
@@ -2132,12 +2135,12 @@ provide module Lenses {
/**
* Updates the value focused on by the given lens from the input data by
* applying a function to it and setting the focus to the result of the function
- *
+ *
* @param lens: The lens to apply to the subject data
* @param fn: The function to apply to the matched data at the lens if matched
* @param subject: The data which will have the lens applied to it
* @returns `Some(data)` containing the new data after the lens mapping has been applied if the lens matches the given data, or `None` if the data cannot be matched to the lens
- *
+ *
* @example assert map(number, x => x * 2, JsonNumber(5)) == Some(JsonNumber(10))
* @example
* assert map(property("x"), x => JsonArray([x, x]), JsonObject([("x", JsonNumber(1))])) ==
@@ -2153,9 +2156,9 @@ provide module Lenses {
/**
* A lens whose focus is a JSON value.
- *
+ *
* @example assert get(json, JsonString("abc")) == Some(JsonString("abc"))
- *
+ *
* @since v0.7.0
*/
provide let json = {
@@ -2165,9 +2168,9 @@ provide module Lenses {
/**
* A lens whose focus is a JSON boolean value.
- *
+ *
* @example assert get(boolean, JsonBoolean(true)) == Some(true)
- *
+ *
* @since v0.7.0
*/
provide let boolean = {
@@ -2182,9 +2185,9 @@ provide module Lenses {
/**
* A lens whose focus is a JSON string value.
- *
+ *
* @example assert get(string, JsonString("abc")) == Some("abc")
- *
+ *
* @since v0.7.0
*/
provide let string = {
@@ -2199,9 +2202,9 @@ provide module Lenses {
/**
* A lens whose focus is a JSON number value.
- *
+ *
* @example assert get(number, JsonNumber(123)) == Some(123)
- *
+ *
* @since v0.7.0
*/
provide let number = {
@@ -2216,9 +2219,9 @@ provide module Lenses {
/**
* A lens whose focus is a JSON array.
- *
+ *
* @example assert get(array, JsonArray([JsonNumber(123)])) == Some([JsonNumber(123)])
- *
+ *
* @since v0.7.0
*/
provide let array = {
@@ -2233,9 +2236,9 @@ provide module Lenses {
/**
* A lens whose focus is the property pair list of a JSON object.
- *
+ *
* @example assert get(objectProperties, JsonObject([("a", JsonNumber(123))])) == Some([("a", JsonNumber(123))])
- *
+ *
* @since v0.7.0
*/
provide let objectProperties = {
@@ -2264,15 +2267,15 @@ provide module Lenses {
/**
* Creates a lens whose focus is a given property of a JSON object.
- *
+ *
* @param propertyName: The property name of the JSON object to focus on
* @returns A lens whose focus is the given property of a JSON object
- *
+ *
* @example assert get(property("x"), JsonObject([("x", JsonNumber(123))])) == Some(JsonNumber(123))
* @example
* assert set(property("x"), JsonString("new"), JsonObject([("x", JsonNumber(123))])) ==
* Some(JsonObject([("x", JsonString("new"))]))
- *
+ *
* @since v0.7.0
*/
provide let property = propertyName =>
@@ -2298,12 +2301,12 @@ provide module Lenses {
* the result will be enclosed in `Some`; if the lens does not match but the
* value focused is null, then the lens will still successfully match and
* `None` will be returned.
- *
+ *
* @example assert get(nullable(number), JsonNumber(123)) == Some(Some(123))
* @example assert get(nullable(number), JsonNull) == Some(None)
* @example assert get(nullable(number), JsonString("abc")) == None
* @example assert set(nullable(number), Some(123), JsonString("abc")) == Some(JsonNumber(123))
- *
+ *
* @since v0.7.0
*/
provide let nullable = lens =>
@@ -2331,16 +2334,16 @@ provide module Lenses {
/**
* Reverse lens composition.
- *
+ *
* @param lens1: The lens which will be applied first
* @param lens2: The lens which will be applied second
* @returns A lens which combines the two given lenses, passing through the first and then the second
- *
+ *
* @example assert get(property("x") ||> number, JsonObject([("x", JsonNumber(123))])) == Some(123)
* @example
* assert set(property("x") ||> string, "new", JsonObject([("x", JsonNumber(123))])) ==
* Some(JsonObject([("x", JsonString("new"))]))
- *
+ *
* @since v0.7.0
*/
let (||>) = (lens1, lens2) =>
@@ -2361,14 +2364,14 @@ provide module Lenses {
/**
* Creates a lens whose focus is a given property path within a JSON object tree.
- *
+ *
* @param propertyNames: The property path of the JSON object to create a focus on
* @returns A lens whose focus is the given property path of a JSON object
- *
+ *
* @example
* let nestedObj = JsonObject([("a", JsonObject([("b", JsonNumber(123))]))])
* assert get(propertyPath(["a", "b"]), nestedObj) == Some(JsonNumber(123))
- *
+ *
* @since v0.7.0
*/
provide let propertyPath = propertyNames => {
diff --git a/stdlib/runtime/string.gr b/stdlib/runtime/string.gr
index a1836dacf..bbaec3675 100644
--- a/stdlib/runtime/string.gr
+++ b/stdlib/runtime/string.gr
@@ -27,7 +27,8 @@ from "runtime/bigint" include Bigint as BI
from "runtime/unsafe/memory" include Memory
from "runtime/unsafe/tags" include Tags
from "runtime/numberUtils" include NumberUtils
-
+from "runtime/utf8" include Utf8
+use Utf8.{ usvEncodeLength, writeUtf8CodePoint }
from "runtime/dataStructures" include DataStructures
use DataStructures.{ allocateString, allocateArray, untagSimpleNumber }
@@ -39,9 +40,6 @@ primitive (&&) = "@and"
primitive (||) = "@or"
primitive builtinId = "@builtin.id"
primitive ignore = "@ignore"
-primitive throw = "@throw"
-
-exception MalformedUnicode
@unsafe
primitive typeMetadata = "@heap.type_metadata"
@@ -404,40 +402,6 @@ let escapeChar = (s: String) => {
escape(WasmI32.fromGrain(s), false)
}
-@unsafe
-let usvToString = usv => {
- if (usv < 0x80n) {
- let string = allocateString(1n)
- WasmI32.store8(string, usv, 8n)
- WasmI32.toGrain(string): String
- } else {
- let mut count = 0n
- let mut offset = 0n
- if (usv <= 0x07FFn) {
- count = 1n
- offset = 0xC0n
- } else if (usv <= 0xFFFFn) {
- count = 2n
- offset = 0xE0n
- } else {
- count = 3n
- offset = 0xF0n
- }
- let string = allocateString(count + 1n)
- WasmI32.store8(string, (usv >>> (6n * count)) + offset, 8n)
-
- let mut n = 0n
- while (count > 0n) {
- n += 1n
- let temp = usv >>> (6n * (count - 1n))
- WasmI32.store8(string + n, 0x80n | temp & 0x3Fn, 8n)
- count -= 1n
- }
-
- WasmI32.toGrain(string): String
- }
-}
-
@unsafe
let reportCycle = (ptr, cycles) => {
let mut cycleNum = vecFindIndex(cycles, ptr)
@@ -707,7 +671,10 @@ and toStringHelp = (grainValue, extraIndents, toplevel, cycles) => {
let shortVal = grainValue >> 8n
let shortValTag = (grainValue & 0xF8n) >> 3n
if (shortValTag == Tags._GRAIN_CHAR_SHORTVAL_TAG) {
- let string = usvToString(shortVal)
+ let byteCount = usvEncodeLength(shortVal)
+ let string = allocateString(byteCount)
+ writeUtf8CodePoint(string + 8n, shortVal)
+ let string = WasmI32.toGrain(string): String
if (toplevel) {
string
} else {
@@ -892,54 +859,3 @@ provide let print = (value, suffix="\n") => {
ignore(suffix)
void
}
-
-@unsafe
-provide let getCodePoint = (ptr: WasmI32) => {
- // Algorithm from https://encoding.spec.whatwg.org/#utf-8-decoder
- use WasmI32.{ (+), (&), (|), (<<), leU as (<=), geU as (>=), (==) }
-
- let mut codePoint = 0n
- let mut bytesSeen = 0n
- let mut bytesNeeded = 0n
- let mut lowerBoundary = 0x80n
- let mut upperBoundary = 0xBFn
-
- let mut offset = 0n
-
- while (true) {
- let byte = WasmI32.load8U(ptr + offset, 0n)
- offset += 1n
- if (bytesNeeded == 0n) {
- if (byte >= 0x00n && byte <= 0x7Fn) {
- return byte
- } else if (byte >= 0xC2n && byte <= 0xDFn) {
- bytesNeeded = 1n
- codePoint = byte & 0x1Fn
- } else if (byte >= 0xE0n && byte <= 0xEFn) {
- if (byte == 0xE0n) lowerBoundary = 0xA0n
- if (byte == 0xEDn) upperBoundary = 0x9Fn
- bytesNeeded = 2n
- codePoint = byte & 0xFn
- } else if (byte >= 0xF0n && byte <= 0xF4n) {
- if (byte == 0xF0n) lowerBoundary = 0x90n
- if (byte == 0xF4n) upperBoundary = 0x8Fn
- bytesNeeded = 3n
- codePoint = byte & 0x7n
- } else {
- throw MalformedUnicode
- }
- continue
- }
- if (!(lowerBoundary <= byte && byte <= upperBoundary)) {
- throw MalformedUnicode
- }
- lowerBoundary = 0x80n
- upperBoundary = 0xBFn
- codePoint = codePoint << 6n | byte & 0x3Fn
- bytesSeen += 1n
- if (bytesSeen == bytesNeeded) {
- return codePoint
- }
- }
- return 0n
-}
diff --git a/stdlib/runtime/string.md b/stdlib/runtime/string.md
index 9769c7139..4f403457f 100644
--- a/stdlib/runtime/string.md
+++ b/stdlib/runtime/string.md
@@ -93,9 +93,3 @@ Parameters:
|`value`|`a`|The operand|
|`?suffix`|`String`|The string to print after the argument|
-### String.**getCodePoint**
-
-```grain
-getCodePoint : (ptr: WasmI32) => WasmI32
-```
-
diff --git a/stdlib/runtime/utf8.gr b/stdlib/runtime/utf8.gr
new file mode 100644
index 000000000..4ecf907cf
--- /dev/null
+++ b/stdlib/runtime/utf8.gr
@@ -0,0 +1,176 @@
+@noPervasives
+/**
+ * The `Utf8` module provides functions for working with UTF-8 encoded strings.
+ */
+module Utf8
+
+primitive (!) = "@not"
+primitive (&&) = "@and"
+primitive throw = "@throw"
+
+from "runtime/unsafe/wasmi32" include WasmI32
+
+/**
+ * An exception thrown when a string is not valid UTF-8.
+ */
+provide exception MalformedUnicode
+
+/**
+ * Returns the total number of bytes for a UTF-8 code point given the first byte.
+ *
+ * @param byte: The first byte of the UTF-8 code point
+ *
+ * @returns The number of bytes in the UTF-8 code point
+ */
+@unsafe
+provide let utf8ByteCount = byte => {
+  use WasmI32.{ (&), (==) }
+  // Classify the byte by its high bits. No validation happens here: any byte
+  // that matches none of the patterns below (continuation bytes included) is
+  // reported as the start of a two-byte sequence.
+  let isSingleByte = (byte & 0x80n) == 0x00n
+  let isFourByteLead = (byte & 0xF0n) == 0xF0n
+  let isThreeByteLead = (byte & 0xE0n) == 0xE0n
+  if (isSingleByte) {
+    1n
+  } else if (isFourByteLead) {
+    4n
+  } else if (isThreeByteLead) {
+    3n
+  } else {
+    2n
+  }
+}
+
+/**
+ * Returns the number of bytes required to encode the given USV as UTF-8.
+ *
+ * @param usv: The Unicode scalar value
+ *
+ * @returns The number of bytes required to encode the given USV as UTF-8
+ */
+@unsafe
+provide let usvEncodeLength = usv => {
+  // Compare unsigned, exactly as `writeUtf8CodePoint` does, so the length
+  // reported here always equals the number of bytes that function writes,
+  // even for an out-of-range value whose top bit is set.
+  use WasmI32.{ leU as (<=) }
+  if (usv <= 0x007Fn) {
+    1n
+  } else if (usv <= 0x07FFn) {
+    2n
+  } else if (usv <= 0xFFFFn) {
+    3n
+  } else {
+    4n
+  }
+}
+
+/**
+ * Returns the Unicode code point of the encoded value at the given pointer.
+ *
+ * @param ptr: The pointer to the encoded value in memory
+ *
+ * @returns The Unicode code point of the encoded value at the given pointer
+ *
+ * @throws MalformedUnicode: if the encoded value is not a valid UTF-8 sequence
+ */
+@unsafe
+provide let getCodePoint = (ptr: WasmI32) => {
+ // Algorithm from https://encoding.spec.whatwg.org/#utf-8-decoder
+ use WasmI32.{ (+), (&), (|), (<<), leU as (<=), geU as (>=), (==) }
+
+ // Decoder state: the code point assembled so far, how many continuation
+ // bytes have been consumed and how many are expected, and the inclusive
+ // range the next continuation byte must fall in.
+ let mut codePoint = 0n
+ let mut bytesSeen = 0n
+ let mut bytesNeeded = 0n
+ let mut lowerBoundary = 0x80n
+ let mut upperBoundary = 0xBFn
+
+ let mut offset = 0n
+
+ // No length is passed in: bytes are read from `ptr` onward until a sequence
+ // completes or is rejected, so the caller must make sure they are readable.
+ while (true) {
+ let byte = WasmI32.load8U(ptr + offset, 0n)
+ offset += 1n
+ if (bytesNeeded == 0n) {
+ // First byte: it is either a complete ASCII character or a lead byte
+ // that fixes how many continuation bytes follow.
+ if (byte >= 0x00n && byte <= 0x7Fn) {
+ return byte
+ } else if (byte >= 0xC2n && byte <= 0xDFn) {
+ bytesNeeded = 1n
+ codePoint = byte & 0x1Fn
+ } else if (byte >= 0xE0n && byte <= 0xEFn) {
+ // Narrow the range of the first continuation byte: after 0xE0 it
+ // excludes overlong three-byte forms, after 0xED it excludes the
+ // surrogate range U+D800..U+DFFF.
+ if (byte == 0xE0n) lowerBoundary = 0xA0n
+ if (byte == 0xEDn) upperBoundary = 0x9Fn
+ bytesNeeded = 2n
+ codePoint = byte & 0xFn
+ } else if (byte >= 0xF0n && byte <= 0xF4n) {
+ // Narrow the range of the first continuation byte: after 0xF0 it
+ // excludes overlong four-byte forms, after 0xF4 it excludes values
+ // above U+10FFFF.
+ if (byte == 0xF0n) lowerBoundary = 0x90n
+ if (byte == 0xF4n) upperBoundary = 0x8Fn
+ bytesNeeded = 3n
+ codePoint = byte & 0x7n
+ } else {
+ // Continuation bytes and the lead bytes 0xC0, 0xC1 and 0xF5..0xFF
+ // cannot start a sequence.
+ throw MalformedUnicode
+ }
+ continue
+ }
+ if (!(lowerBoundary <= byte && byte <= upperBoundary)) {
+ throw MalformedUnicode
+ }
+ // Only the first continuation byte uses a narrowed range; the rest use the
+ // default 0x80..0xBF.
+ lowerBoundary = 0x80n
+ upperBoundary = 0xBFn
+ // Append the six payload bits of this continuation byte.
+ codePoint = codePoint << 6n | byte & 0x3Fn
+ bytesSeen += 1n
+ if (bytesSeen == bytesNeeded) {
+ return codePoint
+ }
+ }
+ // Not reached: the loop above only exits through `return` or `throw`.
+ return 0n
+}
+
+/**
+ * Writes the given Unicode code point to the given pointer as encoded UTF-8.
+ *
+ * @param ptr: The pointer to write the UTF-8 character to
+ * @param codePoint: The Unicode code point to write
+ *
+ * @returns The number of bytes written
+ */
+@unsafe
+provide let writeUtf8CodePoint = (ptr, codePoint) => {
+  // Only the operators the body actually uses are brought into scope. The
+  // range checks are unsigned.
+  use WasmI32.{ (&), (|), (>>>), leU as (<=) }
+  if (codePoint <= 0x007Fn) {
+    // Code points in the ASCII range are written as just one byte with the
+    // leading bit equal to zero (0xxxxxxx). Just store the value as one byte
+    // directly. Note that the value is already guaranteed to start with most
+    // significant bit equal to zero because of the check in the if statement
+    // above, so there's no need to bit-mask it.
+    WasmI32.store8(ptr, codePoint, 0n)
+    1n
+  } else if (codePoint <= 0x07FFn) {
+    // Code points in the range 0x0080..0x07FF are written as two bytes.
+    // The first byte has a three bit prefix of 110, followed by 5 bits of the
+    // codepoint. The second byte has a two bit prefix of 10, followed by 6 bits
+    // of the codepoint.
+    let high = codePoint >>> 6n & 0b000_11111n | 0b110_00000n
+    let low = codePoint & 0b00_111111n | 0b10_000000n
+    WasmI32.store8(ptr, high, 0n)
+    WasmI32.store8(ptr, low, 1n)
+    2n
+  } else if (codePoint <= 0xFFFFn) {
+    // Code points in the range 0x0800..0xFFFF are written as three bytes.
+    // The first byte has a four bit prefix of 1110, followed by 4 bits of the
+    // codepoint. Remaining bytes each have a two bit prefix of 10, followed by
+    // 6 bits of the codepoint.
+    let high = codePoint >>> 12n & 0b0000_1111n | 0b1110_0000n
+    let mid = codePoint >>> 6n & 0b00_111111n | 0b10_000000n
+    let low = codePoint & 0b00_111111n | 0b10_000000n
+    WasmI32.store8(ptr, high, 0n)
+    WasmI32.store8(ptr, mid, 1n)
+    WasmI32.store8(ptr, low, 2n)
+    3n
+  } else {
+    // Code points in the range 0x10000..0x10FFFF are written as four bytes.
+    // The first byte has a five bit prefix of 11110, followed by 3 bits of the
+    // codepoint. Remaining bytes each have a two bit prefix of 10, followed by
+    // 6 bits of the codepoint.
+    let high = codePoint >>> 18n & 0b00000_111n | 0b11110_000n
+    let mid1 = codePoint >>> 12n & 0b00_111111n | 0b10_000000n
+    let mid2 = codePoint >>> 6n & 0b00_111111n | 0b10_000000n
+    let low = codePoint & 0b00_111111n | 0b10_000000n
+    WasmI32.store8(ptr, high, 0n)
+    WasmI32.store8(ptr, mid1, 1n)
+    WasmI32.store8(ptr, mid2, 2n)
+    WasmI32.store8(ptr, low, 3n)
+    4n
+  }
+}
diff --git a/stdlib/runtime/utf8.md b/stdlib/runtime/utf8.md
new file mode 100644
index 000000000..6337940bb
--- /dev/null
+++ b/stdlib/runtime/utf8.md
@@ -0,0 +1,97 @@
+---
+title: Utf8
+---
+
+The `Utf8` module provides functions for working with UTF-8 encoded strings.
+
+## Values
+
+Functions and constants included in the Utf8 module.
+
+### Utf8.**utf8ByteCount**
+
+```grain
+utf8ByteCount : (byte: WasmI32) => WasmI32
+```
+
+Returns the total number of bytes for a UTF-8 code point given the first byte.
+
+Parameters:
+
+|param|type|description|
+|-----|----|-----------|
+|`byte`|`WasmI32`|The first byte of the UTF-8 code point|
+
+Returns:
+
+|type|description|
+|----|-----------|
+|`WasmI32`|The number of bytes in the UTF-8 code point|
+
+### Utf8.**usvEncodeLength**
+
+```grain
+usvEncodeLength : (usv: WasmI32) => WasmI32
+```
+
+Returns the number of bytes required to encode the given USV as UTF-8.
+
+Parameters:
+
+|param|type|description|
+|-----|----|-----------|
+|`usv`|`WasmI32`|The Unicode scalar value|
+
+Returns:
+
+|type|description|
+|----|-----------|
+|`WasmI32`|The number of bytes required to encode the given USV as UTF-8|
+
+### Utf8.**getCodePoint**
+
+```grain
+getCodePoint : (ptr: WasmI32) => WasmI32
+```
+
+Returns the Unicode code point of the encoded value at the given pointer.
+
+Parameters:
+
+|param|type|description|
+|-----|----|-----------|
+|`ptr`|`WasmI32`|The pointer to the encoded value in memory|
+
+Returns:
+
+|type|description|
+|----|-----------|
+|`WasmI32`|The Unicode code point of the encoded value at the given pointer|
+
+Throws:
+
+`MalformedUnicode`
+
+* if the encoded value is not a valid UTF-8 sequence
+
+### Utf8.**writeUtf8CodePoint**
+
+```grain
+writeUtf8CodePoint : (ptr: WasmI32, codePoint: WasmI32) => WasmI32
+```
+
+Writes the given Unicode code point to the given pointer as encoded UTF-8.
+
+Parameters:
+
+|param|type|description|
+|-----|----|-----------|
+|`ptr`|`WasmI32`|The pointer to write the UTF-8 character to|
+|`codePoint`|`WasmI32`|The Unicode code point to write|
+
+Returns:
+
+|type|description|
+|----|-----------|
+|`WasmI32`|The number of bytes written|
+
diff --git a/stdlib/string.gr b/stdlib/string.gr
index c4dd188ae..6aa3b5a7e 100644
--- a/stdlib/string.gr
+++ b/stdlib/string.gr
@@ -24,6 +24,8 @@ use DataStructures.{
allocateString,
allocateBytes,
}
+from "runtime/utf8" include Utf8
+use Utf8.{ utf8ByteCount, usvEncodeLength, getCodePoint, writeUtf8CodePoint }
/**
* Byte encodings
@@ -153,15 +155,7 @@ provide let indexOf = (search: String, string: String) => {
}
idx += 1n
let byte = WasmI32.load8U(ptr, 0n)
- if ((byte & 0x80n) == 0x00n) {
- ptr += 1n
- } else if ((byte & 0xF0n) == 0xF0n) {
- ptr += 4n
- } else if ((byte & 0xE0n) == 0xE0n) {
- ptr += 3n
- } else {
- ptr += 2n
- }
+ ptr += utf8ByteCount(byte)
}
ignore(search)
@@ -220,57 +214,6 @@ provide let lastIndexOf = (search: String, string: String) => {
return None
}
-@unsafe
-let getCodePoint = (ptr: WasmI32) => {
- // Algorithm from https://encoding.spec.whatwg.org/#utf-8-decoder
- use WasmI32.{ (+), (&), (|), (<<), leU as (<=), geU as (>=), (==) }
-
- let mut codePoint = 0n
- let mut bytesSeen = 0n
- let mut bytesNeeded = 0n
- let mut lowerBoundary = 0x80n
- let mut upperBoundary = 0xBFn
-
- let mut offset = 0n
-
- while (true) {
- let byte = WasmI32.load8U(ptr + offset, 0n)
- offset += 1n
- if (bytesNeeded == 0n) {
- if (byte >= 0x00n && byte <= 0x7Fn) {
- return byte
- } else if (byte >= 0xC2n && byte <= 0xDFn) {
- bytesNeeded = 1n
- codePoint = byte & 0x1Fn
- } else if (byte >= 0xE0n && byte <= 0xEFn) {
- if (byte == 0xE0n) lowerBoundary = 0xA0n
- if (byte == 0xEDn) upperBoundary = 0x9Fn
- bytesNeeded = 2n
- codePoint = byte & 0xFn
- } else if (byte >= 0xF0n && byte <= 0xF4n) {
- if (byte == 0xF0n) lowerBoundary = 0x90n
- if (byte == 0xF4n) upperBoundary = 0x8Fn
- bytesNeeded = 3n
- codePoint = byte & 0x7n
- } else {
- throw MalformedUnicode
- }
- continue
- }
- if (!(lowerBoundary <= byte && byte <= upperBoundary)) {
- throw MalformedUnicode
- }
- lowerBoundary = 0x80n
- upperBoundary = 0xBFn
- codePoint = codePoint << 6n | byte & 0x3Fn
- bytesSeen += 1n
- if (bytesSeen == bytesNeeded) {
- return codePoint
- }
- }
- return 0n
-}
-
@unsafe
let charAtHelp = (position, string: String) => {
if (length(string) <= position || position < 0) {
@@ -289,17 +232,8 @@ let charAtHelp = (position, string: String) => {
return getCodePoint(ptr)
}
let byte = WasmI32.load8U(ptr, 0n)
- let n = if ((byte & 0x80n) == 0x00n) {
- 1n
- } else if ((byte & 0xF0n) == 0xF0n) {
- 4n
- } else if ((byte & 0xE0n) == 0xE0n) {
- 3n
- } else {
- 2n
- }
+ ptr += utf8ByteCount(byte)
counter += 1n
- ptr += n
}
ignore(string)
@@ -362,15 +296,7 @@ let explodeHelp = (s: String, chars) => {
while (ptr < end) {
let byte = WasmI32.load8U(ptr, 0n)
- let n = if ((byte & 0x80n) == 0x00n) {
- 1n
- } else if ((byte & 0xF0n) == 0xF0n) {
- 4n
- } else if ((byte & 0xE0n) == 0xE0n) {
- 3n
- } else {
- 2n
- }
+ let n = utf8ByteCount(byte)
let c = if (chars) {
WasmI32.fromGrain(tagChar(getCodePoint(ptr)))
@@ -438,18 +364,7 @@ provide let implode = (arr: Array) => {
for (let mut i = 0n; i < arrLength; i += 1n) {
let usv = untagChar(arr[tagSimpleNumber(i)])
-
- let n = if (usv <= 0x7Fn) {
- 1n
- } else if (usv <= 0x07FFn) {
- 2n
- } else if (usv <= 0xFFFFn) {
- 3n
- } else {
- 4n
- }
-
- stringByteLength += n
+ stringByteLength += usvEncodeLength(usv)
}
let str = allocateString(stringByteLength)
@@ -457,33 +372,7 @@ provide let implode = (arr: Array) => {
for (let mut i = 0n; i < arrLength; i += 1n) {
let usv = untagChar(arr[tagSimpleNumber(i)])
-
- if (usv < 0x7Fn) {
- WasmI32.store8(str + offset, usv, 0n)
- offset += 1n
- } else {
- let mut count = 0n
- let mut marker = 0n
- if (usv <= 0x07FFn) {
- count = 1n
- marker = 0xC0n
- } else if (usv <= 0xFFFFn) {
- count = 2n
- marker = 0xE0n
- } else {
- count = 3n
- marker = 0xF0n
- }
- WasmI32.store8(str + offset, (usv >>> (6n * count)) + marker, 0n)
- offset += 1n
-
- while (count > 0n) {
- let temp = usv >>> (6n * (count - 1n))
- WasmI32.store8(str + offset, 0x80n | temp & 0x3Fn, 0n)
- count -= 1n
- offset += 1n
- }
- }
+ offset += writeUtf8CodePoint(str + offset, usv)
}
WasmI32.toGrain(str): String
@@ -553,15 +442,7 @@ provide let split = (separator: String, string: String) => {
numStrings += 1n
}
let byte = WasmI32.load8U(ptr, 0n)
- if ((byte & 0x80n) == 0x00n) {
- ptr += 1n
- } else if ((byte & 0xF0n) == 0xF0n) {
- ptr += 4n
- } else if ((byte & 0xE0n) == 0xE0n) {
- ptr += 3n
- } else {
- ptr += 2n
- }
+ ptr += utf8ByteCount(byte)
}
ptr = stringPtr + 8n
@@ -581,15 +462,7 @@ provide let split = (separator: String, string: String) => {
continue
}
let byte = WasmI32.load8U(ptr, 0n)
- if ((byte & 0x80n) == 0x00n) {
- ptr += 1n
- } else if ((byte & 0xF0n) == 0xF0n) {
- ptr += 4n
- } else if ((byte & 0xE0n) == 0xE0n) {
- ptr += 3n
- } else {
- ptr += 2n
- }
+ ptr += utf8ByteCount(byte)
}
// Grab last string
@@ -1122,15 +995,7 @@ let utf16Length = (s: String) => {
while (ptr < end) {
let byte = WasmI32.load8U(ptr, 0n)
- let n = if ((byte & 0x80n) == 0x00n) {
- 1n
- } else if ((byte & 0xF0n) == 0xF0n) {
- 4n
- } else if ((byte & 0xE0n) == 0xE0n) {
- 3n
- } else {
- 2n
- }
+ let n = utf8ByteCount(byte)
if (n == 4n) {
size += 2n
} else {
@@ -1246,15 +1111,7 @@ let encodeAtHelp = (
while (ptr < end) {
let byte = WasmI32.load8U(ptr, 0n)
// number of bytes spanning this UTF-8-encoded scalar value
- let n = if ((byte & 0x80n) == 0x00n) {
- 1n
- } else if ((byte & 0xF0n) == 0xF0n) {
- 4n
- } else if ((byte & 0xE0n) == 0xE0n) {
- 3n
- } else {
- 2n
- }
+ let n = utf8ByteCount(byte)
match (encoding) {
UTF8 => {
// With the optimization above for bulk memory copy, this match
@@ -1458,56 +1315,6 @@ provide let encode = (string: String, encoding: Encoding, includeBom=false) => {
// Byte->String decoding and helper functions:
-@unsafe
-let writeUtf8CodePoint = (ptr, codePoint) => {
- use WasmI32.{ (+), (-), (&), (|), (>>>), ltU as (<), leU as (<=), (==) }
- if (codePoint <= 0x007Fn) {
- // Code points in the ASCII range are written as just one byte with the
- // leading bit equal to zero (0xxxxxxx). Just store the value as one byte
- // directly. Note that the value is already guaranteed to start with most
- // significant bit equal to zero because of the check in the if statement
- // above, so there's no need to bit-mask it.
- WasmI32.store8(ptr, codePoint, 0n)
- 1n
- } else if (codePoint <= 0x07FFn) {
- // Code points in the range 0x0080..0x07FF are written as two bytes.
- // The first byte has a three bit prefix of 110, followed by 5 bits of the
- // codepoint. The second byte has a two bit prefix of 10, followed by 6 bits
- // of the codepoint.
- let high = codePoint >>> 6n & 0b000_11111n | 0b110_00000n
- let low = codePoint & 0b00_111111n | 0b10_000000n
- WasmI32.store8(ptr, high, 0n)
- WasmI32.store8(ptr + 1n, low, 0n)
- 2n
- } else if (codePoint <= 0xFFFFn) {
- // Code points in the range 0x0800..0xFFFF are written as three bytes.
- // The first byte has a four bit prefix of 1110, followed by 4 bits of the
- // codepoint. Remaining bytes each have a two bit prefix of 10, followed by
- // 6 bits of the codepoint.
- let high = codePoint >>> 12n & 0b0000_1111n | 0b1110_0000n
- let mid = codePoint >>> 6n & 0b00_111111n | 0b10_000000n
- let low = codePoint & 0b00_111111n | 0b10_000000n
- WasmI32.store8(ptr, high, 0n)
- WasmI32.store8(ptr + 1n, mid, 0n)
- WasmI32.store8(ptr + 2n, low, 0n)
- 3n
- } else {
- // Code points in the range 0x10000..0x10FFFF are written as four bytes.
- // The first byte has a five bit prefix of 11110, followed by 3 bits of the
- // codepoint. Remaining bytes each have a two bit prefix of 10, followed by
- // 6 bits of the codepoint.
- let high = codePoint >>> 18n & 0b00000_111n | 0b11110_000n
- let mid1 = codePoint >>> 12n & 0b00_111111n | 0b10_000000n
- let mid2 = codePoint >>> 6n & 0b00_111111n | 0b10_000000n
- let low = codePoint & 0b00_111111n | 0b10_000000n
- WasmI32.store8(ptr, high, 0n)
- WasmI32.store8(ptr + 1n, mid1, 0n)
- WasmI32.store8(ptr + 2n, mid2, 0n)
- WasmI32.store8(ptr + 3n, low, 0n)
- 4n
- }
-}
-
@unsafe
let bytesHaveBom = (bytes: Bytes, encoding: Encoding, start: WasmI32) => {
use WasmI32.{ (+), geU as (>=), (==) }
@@ -1945,15 +1752,7 @@ provide let forEachCodePoint = (fn: Number => Void, str: String) => {
while (ptr < end) {
let byte = WasmI32.load8U(ptr, 0n)
- let codePointByteCount = if ((byte & 0x80n) == 0x00n) {
- 1n
- } else if ((byte & 0xF0n) == 0xF0n) {
- 4n
- } else if ((byte & 0xE0n) == 0xE0n) {
- 3n
- } else {
- 2n
- }
+ let codePointByteCount = utf8ByteCount(byte)
// Note that even if up to 4 bytes are needed to represent Unicode
// codepoints, this doesn't mean 32 bits. The highest allowed code point is
@@ -1999,15 +1798,7 @@ provide let forEachCodePointi = (fn: (Number, Number) => Void, str: String) => {
let mut idx = 0n
while (ptr < end) {
let byte = WasmI32.load8U(ptr, 0n)
- let codePointByteCount = if ((byte & 0x80n) == 0x00n) {
- 1n
- } else if ((byte & 0xF0n) == 0xF0n) {
- 4n
- } else if ((byte & 0xE0n) == 0xE0n) {
- 3n
- } else {
- 2n
- }
+ let codePointByteCount = utf8ByteCount(byte)
// Note that even if up to 4 bytes are needed to represent Unicode
// codepoints, this doesn't mean 32 bits. The highest allowed code point is
@@ -2051,15 +1842,7 @@ provide let forEachChar = (fn: Char => Void, str: String) => {
while (ptr < end) {
let byte = WasmI32.load8U(ptr, 0n)
- let codePointByteCount = if ((byte & 0x80n) == 0x00n) {
- 1n
- } else if ((byte & 0xF0n) == 0xF0n) {
- 4n
- } else if ((byte & 0xE0n) == 0xE0n) {
- 3n
- } else {
- 2n
- }
+ let codePointByteCount = utf8ByteCount(byte)
// Note that even if up to 4 bytes are needed to represent Unicode
// codepoints, this doesn't mean 32 bits. The highest allowed code point is
@@ -2102,15 +1885,7 @@ provide let forEachChari = (fn: (Char, Number) => Void, str: String) => {
let mut idx = 0n
while (ptr < end) {
let byte = WasmI32.load8U(ptr, 0n)
- let codePointByteCount = if ((byte & 0x80n) == 0x00n) {
- 1n
- } else if ((byte & 0xF0n) == 0xF0n) {
- 4n
- } else if ((byte & 0xE0n) == 0xE0n) {
- 3n
- } else {
- 2n
- }
+ let codePointByteCount = utf8ByteCount(byte)
// Note that even if up to 4 bytes are needed to represent Unicode
// codepoints, this doesn't mean 32 bits. The highest allowed code point is