Skip to content

Commit

Permalink
Merge pull request #71 from Voxel8/me/unicode
Browse files Browse the repository at this point in the history
Handle parsing unicode surrogate pairs.
  • Loading branch information
xoofx authored Dec 6, 2019
2 parents 89b5323 + f7188f1 commit 779a4ab
Show file tree
Hide file tree
Showing 4 changed files with 57 additions and 5 deletions.
24 changes: 24 additions & 0 deletions SharpYaml.Tests/ScannerTests.cs
Original file line number Diff line number Diff line change
Expand Up @@ -328,6 +328,30 @@ public void VerifyTokensOnExample14()
StreamEnd);
}


/// <summary>
/// Scans <c>test15.yaml</c>, a flow mapping whose three double-quoted values
/// are written with <c>\u</c> escapes, and checks the resulting token stream.
/// </summary>
[Test]
public void VerifyTokensOnExample15()
{
    var scanner = ScannerFor("test15.yaml");

    AssertSequenceOfTokensFrom(
        scanner,
        StreamStart,
        FlowMappingStart,

        // field1: a single escaped surrogate pair.
        Key,
        PlainScalar("field1"),
        Value,
        DoubleQuotedScalar("R \ud83d\ude0e"),
        FlowEntry,

        // field2: two consecutive escapes that are not surrogates.
        Key,
        PlainScalar("field2"),
        Value,
        DoubleQuotedScalar("R \u0100\u0101"),
        FlowEntry,

        // field3: a surrogate pair placed between two non-surrogate escapes.
        Key,
        PlainScalar("field3"),
        Value,
        DoubleQuotedScalar("R \u0100\ud83d\ude0e\u0101"),

        FlowMappingEnd,
        StreamEnd);
}

private Scanner ScannerFor(string name)
{
return new Scanner(YamlFile(name));
Expand Down
1 change: 1 addition & 0 deletions SharpYaml.Tests/SharpYaml.Tests.csproj
Original file line number Diff line number Diff line change
Expand Up @@ -117,6 +117,7 @@
<Service Include="{82A7F48D-3B50-4B1E-B82E-3ADA8210C358}" />
</ItemGroup>
<ItemGroup>
<EmbeddedResource Include="files\test15.yaml" />
<None Include="packages.config" />
<EmbeddedResource Include="files\YamlReferenceCard.yaml" />
</ItemGroup>
Expand Down
5 changes: 5 additions & 0 deletions SharpYaml.Tests/files/test15.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
{
field1: "R \ud83d\ude0e",
field2: "R \u0100\u0101",
field3: "R \u0100\ud83d\ude0e\u0101"
}
32 changes: 27 additions & 5 deletions SharpYaml/Scanner.cs
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,7 @@ public class Scanner
private int flowLevel;
private int tokensParsed;

private const int MaxBufferLength = 8;
private const int MaxBufferLength = 12; // Number of characters in two consecutive \uXXXX escape sequences (2 x 6), i.e. an escaped surrogate pair.
private readonly CharacterAnalyzer<LookAheadBuffer> analyzer;
private bool tokenAvailable;

Expand Down Expand Up @@ -1684,10 +1684,32 @@ private Token ScanFlowScalar(bool isSingleQuoted)

if ((character >= 0xD800 && character <= 0xDFFF) || character > 0x10FFFF)
{
throw new SyntaxErrorException(start, mark, "While parsing a quoted scalar, find invalid Unicode character escape code.");
}

scanScalarValue.Append(CharHelper.ConvertFromUtf32(character));
var foundNextCharacter = true;
int nextCharacter = 0;

// We might be dealing with a surrogate pair - try to read the next unicode character.
if (codeLength == 4 && analyzer.Check('\\', codeLength) && analyzer.Check('u', codeLength + 1)) {
for (int k = 0; k < codeLength; ++k) {
if (!analyzer.IsHex(k + codeLength + 2)) {
foundNextCharacter = false;
break;
}
nextCharacter = (nextCharacter << 4) + analyzer.AsHex(k + codeLength + 2);
}

if (foundNextCharacter) {
for (int k = 0; k < codeLength + 2; ++k)
Skip();
}
} else
foundNextCharacter = false;

if (foundNextCharacter)
scanScalarValue.Append(CharHelper.ConvertFromUtf32(CharHelper.ConvertToUtf32((char)character, (char)nextCharacter)));
else
throw new SyntaxErrorException(start, mark, "While parsing a quoted scalar, find invalid Unicode character escape code.");
} else
scanScalarValue.Append(CharHelper.ConvertFromUtf32(character));

// Advance the pointer.

Expand Down

0 comments on commit 779a4ab

Please sign in to comment.