Skip to content

Commit

Permalink
Optimized decoding of null-terminated UTF-8 strings
Browse files Browse the repository at this point in the history
  • Loading branch information
sakno committed Sep 20, 2023
1 parent 512715a commit b833baf
Show file tree
Hide file tree
Showing 5 changed files with 84 additions and 66 deletions.
11 changes: 5 additions & 6 deletions src/DotNext.IO/IO/Pipelines/PipeExtensions.Readers.cs
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,8 @@
using System.Runtime.InteropServices;
using System.Runtime.Versioning;
using System.Security.Cryptography;
using System.Text;
using static System.Buffers.Binary.BinaryPrimitives;
using static System.Text.EncodingExtensions;
using Missing = System.Reflection.Missing;

namespace DotNext.IO.Pipelines;
Expand Down Expand Up @@ -757,10 +757,9 @@ public static async IAsyncEnumerable<ReadOnlyMemory<byte>> ReadAllAsync(this Pip
}

/// <summary>
/// Decodes null-terminated string.
/// Decodes null-terminated UTF-8 encoded string.
/// </summary>
/// <param name="reader">The pipe reader.</param>
/// <param name="context">The text decoding context.</param>
/// <param name="output">The output buffer for decoded characters.</param>
/// <param name="token">The token that can be used to cancel the operation.</param>
/// <returns>The task representing asynchronous execution of this method.</returns>
Expand All @@ -769,12 +768,12 @@ public static async IAsyncEnumerable<ReadOnlyMemory<byte>> ReadAllAsync(this Pip
/// or <paramref name="output"/> is <see langword="null"/>.
/// </exception>
/// <exception cref="OperationCanceledException">The operation has been canceled.</exception>
public static async ValueTask ReadStringAsync(this PipeReader reader, DecodingContext context, IBufferWriter<char> output, CancellationToken token = default)
public static async ValueTask ReadUtf8Async(this PipeReader reader, IBufferWriter<char> output, CancellationToken token = default)
{
ArgumentNullException.ThrowIfNull(reader);
ArgumentNullException.ThrowIfNull(output);

var decoder = context.GetDecoder();
var decoder = Encoding.UTF8.GetDecoder();
SequencePosition consumed;
bool completed;

Expand All @@ -783,7 +782,7 @@ public static async ValueTask ReadStringAsync(this PipeReader reader, DecodingCo
var readResult = await reader.ReadAsync(token).ConfigureAwait(false);
var buffer = readResult.Buffer;

if (buffer.PositionOf(DecodingContext.StringTerminationByte).TryGetValue(out consumed))
if (buffer.PositionOf(DecodingContext.Utf8NullChar).TryGetValue(out consumed))
{
buffer = buffer.Slice(0, consumed);
completed = true;
Expand Down
104 changes: 54 additions & 50 deletions src/DotNext.IO/IO/StreamExtensions.cs
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
using System.Runtime.InteropServices;
using System.Runtime.Versioning;
using System.Text;
using Utf8 = System.Text.Unicode.Utf8;

namespace DotNext.IO;

Expand Down Expand Up @@ -1317,10 +1318,9 @@ public static async IAsyncEnumerable<ReadOnlyMemory<byte>> ReadAllAsync(this Str
}

/// <summary>
/// Decodes null-terminated string asynchronously.
/// Decodes null-terminated UTF-8 encoded string asynchronously.
/// </summary>
/// <param name="stream">The stream containing encoded string.</param>
/// <param name="context">The decoding context.</param>
/// <param name="buffer">The buffer used to read from stream.</param>
/// <param name="output">The output buffer for decoded characters.</param>
/// <param name="token">The token that can be used to cancel the operation.</param>
Expand All @@ -1331,47 +1331,30 @@ public static async IAsyncEnumerable<ReadOnlyMemory<byte>> ReadAllAsync(this Str
/// </exception>
/// <exception cref="ArgumentException"><paramref name="buffer"/> is too small to decode at least one character.</exception>
/// <exception cref="OperationCanceledException">The operation has been canceled.</exception>
public static async ValueTask<int> ReadStringAsync(this Stream stream, DecodingContext context, Memory<byte> buffer, IBufferWriter<char> output, CancellationToken token = default)
[AsyncMethodBuilder(typeof(PoolingAsyncValueTaskMethodBuilder<>))]
public static async ValueTask<int> ReadUtf8Async(this Stream stream, Memory<byte> buffer, IBufferWriter<char> output, CancellationToken token = default)
{
ArgumentNullException.ThrowIfNull(stream);
ArgumentNullException.ThrowIfNull(output);

if (context.Encoding.GetMaxCharCount(buffer.Length) is 0)
if (Encoding.UTF8.GetMaxCharCount(buffer.Length) is 0)
throw new ArgumentException(ExceptionMessages.BufferTooSmall, nameof(buffer));

var decoder = context.GetDecoder();
var result = 0;
bool completed;
int consumedBufferBytes, bytesRead, bufferOffset = 0;

do
{
var bytesRead = await stream.ReadAsync(buffer, token).ConfigureAwait(false);
var input = buffer.Slice(0, bytesRead);

var nullCharIndex = input.Span.IndexOf(DecodingContext.StringTerminationByte);
if (nullCharIndex >= 0)
{
result = nullCharIndex + 1;
input = input.Slice(0, nullCharIndex);
completed = true;
}
else
{
completed = input.IsEmpty;
}

decoder.Convert(input.Span, output, completed, out _, out _);
bytesRead = await stream.ReadAsync(buffer.Slice(bufferOffset), token).ConfigureAwait(false);
}
while (!completed);
while (!ConvertToUtf8(buffer.Span.Slice(0, bufferOffset + bytesRead), output, out consumedBufferBytes, out bufferOffset));

return result;
return consumedBufferBytes;
}

/// <summary>
/// Decodes null-terminated string synchronously.
/// Decodes null-terminated UTF-8 encoded string synchronously.
/// </summary>
/// <param name="stream">The stream containing encoded string.</param>
/// <param name="context">The decoding context.</param>
/// <param name="buffer">The buffer used to read from stream.</param>
/// <param name="output">The output buffer for decoded characters.</param>
/// <returns>The number of used bytes in <paramref name="buffer"/>.</returns>
Expand All @@ -1380,39 +1363,60 @@ public static async ValueTask<int> ReadStringAsync(this Stream stream, DecodingC
/// or <paramref name="output"/> is <see langword="null"/>.
/// </exception>
/// <exception cref="ArgumentException"><paramref name="buffer"/> is too small to decode at least one character.</exception>
public static int ReadString(this Stream stream, in DecodingContext context, Span<byte> buffer, IBufferWriter<char> output)
public static int ReadUtf8(this Stream stream, Span<byte> buffer, IBufferWriter<char> output)
{
ArgumentNullException.ThrowIfNull(stream);
ArgumentNullException.ThrowIfNull(output);

if (context.Encoding.GetMaxCharCount(buffer.Length) is 0)
if (Encoding.UTF8.GetMaxCharCount(buffer.Length) is 0)
throw new ArgumentException(ExceptionMessages.BufferTooSmall, nameof(buffer));

var decoder = context.GetDecoder();
var result = 0;
bool completed;
int consumedBufferBytes, bytesRead, bufferOffset = 0;

do
{
var bytesRead = stream.Read(buffer);
var input = buffer.Slice(0, bytesRead);

var nullCharIndex = input.IndexOf(DecodingContext.StringTerminationByte);
if (nullCharIndex >= 0)
{
result = nullCharIndex + 1;
input = input.Slice(0, nullCharIndex);
completed = true;
}
else
{
completed = input.IsEmpty;
}

decoder.Convert(input, output, completed, out _, out _);
bytesRead = stream.Read(buffer.Slice(bufferOffset));
}
while (!completed);
while (!ConvertToUtf8(buffer.Slice(0, bufferOffset + bytesRead), output, out consumedBufferBytes, out bufferOffset));

return result;
return consumedBufferBytes;
}

private static bool ConvertToUtf8(Span<byte> buffer, IBufferWriter<char> output, out int consumedCount, out int bufferOffset)
{
bool flush;
var nullCharIndex = buffer.IndexOf(DecodingContext.Utf8NullChar);

if (nullCharIndex >= 0)
{
consumedCount = nullCharIndex + 1;
buffer = buffer.Slice(0, nullCharIndex);
flush = true;
}
else
{
consumedCount = buffer.Length;
flush = buffer.IsEmpty;
}

var chars = output.GetSpan(Encoding.UTF8.GetMaxCharCount(buffer.Length));

switch (Utf8.ToUtf16(buffer, chars, out var bytesRead, out var charsWritten, replaceInvalidSequences: false, flush))
{
case OperationStatus.NeedMoreData:
// we need more data, copy undecoded bytes to the beginning of the buffer
var bufferTail = buffer.Slice(bytesRead);
bufferOffset = bufferTail.Length;
bufferTail.CopyTo(buffer);
break;
case OperationStatus.Done:
bufferOffset = 0;
break;
default:
throw new DecoderFallbackException();
}

output.Advance(charsWritten);
return flush;
}
}
2 changes: 1 addition & 1 deletion src/DotNext.IO/Text/DecodingContext.cs
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ namespace DotNext.Text;
[StructLayout(LayoutKind.Auto)]
public readonly struct DecodingContext : ICloneable, IResettable
{
internal const byte StringTerminationByte = 0;
internal const byte Utf8NullChar = 0;

private readonly Encoding encoding;
private readonly Decoder? decoder;
Expand Down
2 changes: 1 addition & 1 deletion src/DotNext.Tests/IO/Pipelines/PipeExtensionsTests.cs
Original file line number Diff line number Diff line change
Expand Up @@ -367,7 +367,7 @@ public static async Task DecodeNullTerminatedStringAsync()
pipe.Writer.Complete();

var writer = new ArrayBufferWriter<char>();
await pipe.Reader.ReadStringAsync(Encoding.UTF8, writer);
await pipe.Reader.ReadUtf8Async(writer);
Equal("Привет, мир!", writer.WrittenSpan.ToString());
}
}
31 changes: 23 additions & 8 deletions src/DotNext.Tests/IO/StreamExtensionsTests.cs
Original file line number Diff line number Diff line change
Expand Up @@ -682,15 +682,15 @@ public static async Task ReadEntireStream()
public static async Task DecodeNullTerminatedStringAsync(int bufferSize)
{
using var ms = new MemoryStream();
ms.Write("Привет, мир!"u8);
ms.Write("Привет, \u263A!"u8);
ms.WriteByte(0);
ms.WriteByte(0);
ms.Position = 0L;

var buffer = new byte[bufferSize];
Memory<byte> buffer = new byte[bufferSize];
var writer = new ArrayBufferWriter<char>();
await ms.ReadStringAsync(Encoding.UTF8, buffer, writer);
Equal("Привет, мир!", writer.WrittenSpan.ToString());
await ms.ReadUtf8Async(buffer, writer);
Equal("Привет, \u263A!", writer.WrittenSpan.ToString());
}

[Theory]
Expand All @@ -700,14 +700,29 @@ public static async Task DecodeNullTerminatedStringAsync(int bufferSize)
public static void DecodeNullTerminatedString(int bufferSize)
{
using var ms = new MemoryStream();
ms.Write("Привет, мир!"u8);
ms.Write("Привет, \u263A!"u8);
ms.WriteByte(0);
ms.WriteByte(0);
ms.Position = 0L;

var buffer = new byte[bufferSize];
Span<byte> buffer = stackalloc byte[bufferSize];
var writer = new ArrayBufferWriter<char>();
ms.ReadUtf8(buffer, writer);
Equal("Привет, \u263A!", writer.WrittenSpan.ToString());
}

[Fact]
public static void DecodeNullTerminatedEmptyString()
{
using var ms = new MemoryStream();
ms.Write("\0"u8);
ms.WriteByte(0);
ms.WriteByte(0);
ms.Position = 0L;

Span<byte> buffer = stackalloc byte[8];
var writer = new ArrayBufferWriter<char>();
ms.ReadString(Encoding.UTF8, buffer, writer);
Equal("Привет, мир!", writer.WrittenSpan.ToString());
ms.ReadUtf8(buffer, writer);
Equal(string.Empty, writer.WrittenSpan.ToString());
}
}

0 comments on commit b833baf

Please sign in to comment.