Skip to content

Commit

Permalink
Fix for wrong encoding of Unicode values above 0xffff
Browse files Browse the repository at this point in the history
  • Loading branch information
Thiruvalluvan M G committed Apr 2, 2024
1 parent e575b66 commit 0e9e793
Show file tree
Hide file tree
Showing 3 changed files with 86 additions and 5 deletions.
74 changes: 70 additions & 4 deletions lang/c++/impl/json/JsonIO.cc
Original file line number Diff line number Diff line change
Expand Up @@ -314,6 +314,31 @@ JsonParser::Token JsonParser::tryString() {
}
}

// Parses the four hexadecimal digits of a JSON unicode escape.
//
// On entry `b` refers to the character immediately before the first hex
// digit; the digits are read from the four positions that follow it.
// Every digit is folded into `n` as n = n * 16 + digit, so `n` is an in/out
// accumulator: pass it in as 0 to obtain the plain 16-bit value.
//
// Returns an iterator referring to the last hex digit that was consumed.
// Throws Exception when `e` is reached before four digits have been read, or
// when one of the characters is not a hexadecimal digit.
static string::const_iterator unicodeParse(string::const_iterator b, string::const_iterator e, uint32_t &n) {
    const string::const_iterator start = b;
    for (int i = 0; i < 4; i++) {
        // Test before advancing as well, so an already exhausted range is
        // never incremented past `e`.
        if (b == e || ++b == e) {
            throw Exception(boost::format(
                                "Invalid unicode escape: %1%")
                            % string(start, b));
        }
        n *= 16;
        const char c = *b;
        // Plain range checks instead of isdigit(): passing a negative char
        // (any byte above 0x7f where char is signed) to isdigit() is
        // undefined behaviour, and the range check is locale-independent.
        if (c >= '0' && c <= '9') {
            n += c - '0';
        } else if (c >= 'a' && c <= 'f') {
            n += c - 'a' + 10;
        } else if (c >= 'A' && c <= 'F') {
            n += c - 'A' + 10;
        } else {
            throw Exception(boost::format( "Invalid hex character: %1%") % c);
        }
    }
    return b;
}

// Decode the given string and return contents as UTF8-encoded bytes.
// The input does not have the enclosing double-quotes.
string JsonParser::decodeString(const string &s, bool binary) {
string result;
const auto readNextByte = [](string::const_iterator &it, const string::const_iterator &end) -> char {
Expand All @@ -326,7 +351,12 @@ string JsonParser::decodeString(const string &s, bool binary) {
while (it != end) {
char ch = *it++;
if (ch == '\\') {
<<<<<<< HEAD
ch = readNextByte(it, end);
=======
string::const_iterator startSeq = it;
ch = *++it;
>>>>>>> 4a3d94ff9 (Fix for wrong encoding of Unicode values above 0xffff)
switch (ch) {
case '"':
case '\\':
Expand All @@ -351,6 +381,7 @@ string JsonParser::decodeString(const string &s, bool binary) {
case 'u':
case 'U': {
uint32_t n = 0;
<<<<<<< HEAD
char e[4];
for (char &i : e) {
n *= 16;
Expand All @@ -364,16 +395,51 @@ string JsonParser::decodeString(const string &s, bool binary) {
n += c - 'A' + 10;
}
}
=======
it = unicodeParse(it, s.end(), n);
>>>>>>> 4a3d94ff9 (Fix for wrong encoding of Unicode values above 0xffff)
if (binary) {
if (n > 0xff) {
throw Exception(boost::format(
"Invalid byte for binary: %1%%2%")
% ch % string(e, 4));
% ch % string(startSeq, ++it));
} else {
result.push_back(n);
continue;
}
}
if (n >= 0xd800) {
++it;
if (n > 0xdbff || it == s.end()) {
throw Exception(boost::format(
"Invalid unicode sequence: %1%")
% string(startSeq, it));
}
if (*it != '\\') {
throw Exception(boost::format(
"Invalid unicode sequence: %1%")
% string(startSeq, ++it));
}
++it;
if (it == s.end()) {
throw Exception(boost::format(
"Invalid unicode sequence: %1%")
% string(startSeq, it));
}
if (*it != 'u' && *it != 'U') {
throw Exception(boost::format(
"Invalid unicode sequence: %1%")
% string(startSeq, ++it));
}
uint32_t m = 0;
it = unicodeParse(it, s.end(), m);
if (m < 0xdc00 || m > 0xdfff) {
throw Exception(boost::format(
"Invalid unicode sequence: %1%")
% string(startSeq, ++it));
}
n = 0x10000 + (((n - 0xd800) << 10) | (m - 0xdc00));
}
if (n < 0x80) {
result.push_back(n);
} else if (n < 0x800) {
Expand All @@ -383,15 +449,15 @@ string JsonParser::decodeString(const string &s, bool binary) {
result.push_back((n >> 12) | 0xe0);
result.push_back(((n >> 6) & 0x3f) | 0x80);
result.push_back((n & 0x3f) | 0x80);
} else if (n < 110000) {
} else if (n < 0x110000) {
result.push_back((n >> 18) | 0xf0);
result.push_back(((n >> 12) & 0x3f) | 0x80);
result.push_back(((n >> 6) & 0x3f) | 0x80);
result.push_back((n & 0x3f) | 0x80);
} else {
throw Exception(boost::format(
"Invalid unicode value: %1%i%2%")
% ch % string(e, 4));
"Invalid unicode value: %1%%2%")
% n % string(startSeq, ++it));
}
}
continue;
Expand Down
13 changes: 12 additions & 1 deletion lang/c++/impl/json/JsonIO.hh
Original file line number Diff line number Diff line change
Expand Up @@ -263,12 +263,23 @@ class AVRO_DECL JsonGenerator {
out_.write(toHex((static_cast<unsigned char>(c)) % 16));
}

void escapeUnicode(uint32_t c) {
// Emits a single "\u" escape for `c`: a backslash, the letter 'u', then the
// second-lowest byte and the lowest byte of `c`, each handed to writeHex.
// Bits above the low 16 are ignored.
void escapeUnicode16(uint32_t c) {
    const uint32_t highByte = (c >> 8) & 0xff;
    const uint32_t lowByte = c & 0xff;

    out_.write('\\');
    out_.write('u');
    writeHex(highByte);
    writeHex(lowByte);
}
// Writes the code point `c` as JSON "\u" escapes.
// Values below 0x10000 produce one escape. Values from 0x10000 up to (but
// not including) 0x110000 produce two escapes forming a surrogate pair.
// Anything at or above 0x110000 throws Exception.
void escapeUnicode(uint32_t c) {
    if (c >= 0x110000) {
        throw Exception(boost::format("Invalid code-point: %1%") % c);
    }
    if (c < 0x10000) {
        escapeUnicode16(c);
        return;
    }

    // Supplementary plane: split the 20-bit offset into two 10-bit halves.
    const uint32_t offset = c - 0x10000;
    const uint32_t highSurrogate = 0xd800 | ((offset >> 10) & 0x3ff);
    const uint32_t lowSurrogate = 0xdc00 | (offset & 0x3ff);
    escapeUnicode16(highSurrogate);
    escapeUnicode16(lowSurrogate);
}
void doEncodeString(const char *b, size_t len, bool binary) {
const char *e = b + len;
out_.write('"');
Expand Down
4 changes: 4 additions & 0 deletions lang/c++/test/JsonTests.cc
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,11 @@ TestData<const char *> stringData[] = {
{R"("\/")", EntityType::String, "/", R"("\/")"},
{R"("\u20ac")", EntityType::String, "\xe2\x82\xac", R"("\u20ac")"},
{R"("\u03c0")", EntityType::String, "\xcf\x80", R"("\u03c0")"},
<<<<<<< HEAD
{R"("hello\n")", EntityType::String, "hello\n", R"("hello\n")"},
=======
{R"("\Ud8ab\udccd")", EntityType::String, "\xf0\xba\xb3\x8d", R"("\ud8ab\udccd")"},
>>>>>>> 4a3d94ff9 (Fix for wrong encoding of Unicode values above 0xffff)
};

void testBool(const TestData<bool> &d) {
Expand Down

0 comments on commit 0e9e793

Please sign in to comment.