diff --git a/lang/c++/impl/json/JsonIO.cc b/lang/c++/impl/json/JsonIO.cc index 7f002ed992d..6e36e158566 100644 --- a/lang/c++/impl/json/JsonIO.cc +++ b/lang/c++/impl/json/JsonIO.cc @@ -314,6 +314,31 @@ JsonParser::Token JsonParser::tryString() { } } +static string::const_iterator unicodeParse(string::const_iterator b, string::const_iterator e, uint32_t &n) { + string::const_iterator start = b; + for (int i = 0; i < 4; i++) { + ++b; + if (b == e) { + throw Exception(boost::format( + "Invalid unicode escape: %1%") % string(start, b)); + } + n *= 16; + char c = *b; + if (isdigit(c)) { + n += c - '0'; + } else if (c >= 'a' && c <= 'f') { + n += c - 'a' + 10; + } else if (c >= 'A' && c <= 'F') { + n += c - 'A' + 10; + } else { + throw Exception(boost::format( "Invalid hex character: %1%") % c); + } + } + return b; +} + +// Decode the given string and return contents as UTF8-encoded bytes. +// The input does not have the enclosing double-quotes. string JsonParser::decodeString(const string &s, bool binary) { string result; const auto readNextByte = [](string::const_iterator &it, const string::const_iterator &end) -> char { @@ -326,7 +351,12 @@ string JsonParser::decodeString(const string &s, bool binary) { while (it != end) { char ch = *it++; if (ch == '\\') { +<<<<<<< HEAD ch = readNextByte(it, end); +======= + string::const_iterator startSeq = it; + ch = *++it; +>>>>>>> 4a3d94ff9 (Fix for wrong encoding of Unicode values above 0xffff) switch (ch) { case '"': case '\\': @@ -351,6 +381,7 @@ string JsonParser::decodeString(const string &s, bool binary) { case 'u': case 'U': { uint32_t n = 0; +<<<<<<< HEAD char e[4]; for (char &i : e) { n *= 16; @@ -364,16 +395,51 @@ string JsonParser::decodeString(const string &s, bool binary) { n += c - 'A' + 10; } } +======= + it = unicodeParse(it, s.end(), n); +>>>>>>> 4a3d94ff9 (Fix for wrong encoding of Unicode values above 0xffff) if (binary) { if (n > 0xff) { throw Exception(boost::format( "Invalid byte for binary: %1%%2%") - % ch % string(e, 4)); + % ch % string(startSeq, ++it)); } else { result.push_back(n); continue; } } + if (n >= 0xd800) { + ++it; + if (n > 0xdbff || it == s.end()) { + throw Exception(boost::format( + "Invalid unicode sequence: %1%") + % string(startSeq, it)); + } + if (*it != '\\') { + throw Exception(boost::format( + "Invalid unicode sequence: %1%") + % string(startSeq, ++it)); + } + ++it; + if (it == s.end()) { + throw Exception(boost::format( + "Invalid unicode sequence: %1%") + % string(startSeq, it)); + } + if (*it != 'u' && *it != 'U') { + throw Exception(boost::format( + "Invalid unicode sequence: %1%") + % string(startSeq, ++it)); + } + uint32_t m = 0; + it = unicodeParse(it, s.end(), m); + if (m < 0xdc00 || m > 0xdfff) { + throw Exception(boost::format( + "Invalid unicode sequence: %1%") + % string(startSeq, ++it)); + } + n = 0x10000 + (((n - 0xd800) << 10) | (m - 0xdc00)); + } if (n < 0x80) { result.push_back(n); } else if (n < 0x800) { @@ -383,15 +449,15 @@ string JsonParser::decodeString(const string &s, bool binary) { result.push_back((n >> 12) | 0xe0); result.push_back(((n >> 6) & 0x3f) | 0x80); result.push_back((n & 0x3f) | 0x80); - } else if (n < 110000) { + } else if (n < 0x110000) { result.push_back((n >> 18) | 0xf0); result.push_back(((n >> 12) & 0x3f) | 0x80); result.push_back(((n >> 6) & 0x3f) | 0x80); result.push_back((n & 0x3f) | 0x80); } else { throw Exception(boost::format( - "Invalid unicode value: %1%i%2%") - % ch % string(e, 4)); + "Invalid unicode value: %1%%2%") + % n % string(startSeq, ++it)); } } continue; diff --git a/lang/c++/impl/json/JsonIO.hh b/lang/c++/impl/json/JsonIO.hh index 94889e5d010..447c0b0df32 100644 --- a/lang/c++/impl/json/JsonIO.hh +++ b/lang/c++/impl/json/JsonIO.hh @@ -263,12 +263,23 @@ class AVRO_DECL JsonGenerator { out_.write(toHex((static_cast(c)) % 16)); } - void escapeUnicode(uint32_t c) { + void escapeUnicode16(uint32_t c) { out_.write('\\'); out_.write('u'); writeHex((c >> 8) & 0xff); writeHex(c & 0xff); } + void escapeUnicode(uint32_t c) { + if (c < 0x10000) { + escapeUnicode16(c); + } else if (c < 0x110000) { + c -= 0x10000; + escapeUnicode16(((c >> 10) & 0x3ff) | 0xd800); + escapeUnicode16((c & 0x3ff) | 0xdc00); + } else { + throw Exception(boost::format("Invalid code-point: %1%") % c); + } + } void doEncodeString(const char *b, size_t len, bool binary) { const char *e = b + len; out_.write('"'); diff --git a/lang/c++/test/JsonTests.cc b/lang/c++/test/JsonTests.cc index 10a10054033..378c67bdd8f 100644 --- a/lang/c++/test/JsonTests.cc +++ b/lang/c++/test/JsonTests.cc @@ -68,7 +68,11 @@ TestData stringData[] = { {R"("\/")", EntityType::String, "/", R"("\/")"}, {R"("\u20ac")", EntityType::String, "\xe2\x82\xac", R"("\u20ac")"}, {R"("\u03c0")", EntityType::String, "\xcf\x80", R"("\u03c0")"}, +<<<<<<< HEAD {R"("hello\n")", EntityType::String, "hello\n", R"("hello\n")"}, +======= + {R"("\Ud8ab\udccd")", EntityType::String, "\xf0\xba\xb3\x8d", R"("\ud8ab\udccd")"}, +>>>>>>> 4a3d94ff9 (Fix for wrong encoding of Unicode values above 0xffff) }; void testBool(const TestData &d) {