From 6caca47a776ba3997218e60403a45b882adf0eaf Mon Sep 17 00:00:00 2001 From: Mike Date: Sat, 31 Aug 2024 13:15:07 +0100 Subject: [PATCH] Include unicode formatting support for JSON escapes (#2880) This PR improves on #2875 for JSON string formatting, updating the `escapeControls` function to handle unicode escaping correctly. The check for valid UTF8 has been removed. That's erroneous and got in there because of my lack of understanding about how JSON is encoded. It's fundamentally UNICODE, so characters outside of standard ASCII must all be encoded using `\uNNNN` escapes. That ensures output is always valid JSON, even if string input is garbage. The `escapeControls` function has a `options` parameter which makes it a lot more useful for various escaping requirements. This is also more efficient as the input string requires only two passes for both escaping and quoting: the first pass establishes the new string length, the buffer is re-allocated then the modified string produced. --- Sming/Core/Data/Format/Formatter.cpp | 54 ++++++++++++++--------- Sming/Core/Data/Format/Formatter.h | 13 +++++- Sming/Core/Data/Format/Json.cpp | 62 ++++++--------------------- Sming/Core/Data/Format/Json.h | 1 + tests/HostTests/modules/Formatter.cpp | 44 ++++++++++++++----- 5 files changed, 92 insertions(+), 82 deletions(-) diff --git a/Sming/Core/Data/Format/Formatter.cpp b/Sming/Core/Data/Format/Formatter.cpp index f519306f6f..546900a25a 100644 --- a/Sming/Core/Data/Format/Formatter.cpp +++ b/Sming/Core/Data/Format/Formatter.cpp @@ -12,26 +12,25 @@ #include "Formatter.h" -namespace +namespace Format { /** * @brief Get character used for standard escapes * @param c Code to be escaped + * @param options * @retval char Corresponding character, NUL if there isn't a standard escape */ -char escapeChar(char c) +char escapeChar(char c, Options options) { switch(c) { case '\0': - return '0'; - case '\'': - return '\''; + return options[Option::unicode] ? '\0' : '0'; case '\"': - return '"'; - case '\?': - return '?'; + return options[Option::doublequote] ? c : '\0'; + case '\'': + return options[Option::singlequote] ? c : '\0'; case '\\': - return '\\'; + return options[Option::backslash] ? c : '\0'; case '\a': return 'a'; case '\b': @@ -51,19 +50,22 @@ char escapeChar(char c) } } -} // namespace - -namespace Format -{ -unsigned escapeControls(String& value) +unsigned escapeControls(String& value, Options options) { // Count number of extra characters we'll need to insert unsigned extra{0}; for(auto& c : value) { - if(escapeChar(c)) { + if(escapeChar(c, options)) { extra += 1; // "\" + } else if(options[Option::unicode]) { + if(uint8_t(c) < 0x20 || (c & 0x80)) { + extra += 5; // "\uNNNN" + } } else if(uint8_t(c) < 0x20) { extra += 3; // "\xnn" + } else if((c & 0x80) && options[Option::utf8]) { + // Characters such as £ (0xa3) are escaped to 0xc2 0xa3 in UTF-8 + extra += 1; // 0xc2 } } if(extra == 0) { @@ -79,18 +81,28 @@ unsigned escapeControls(String& value) in += extra; while(len--) { uint8_t c = *in++; - auto esc = escapeChar(c); + auto esc = escapeChar(c, options); if(esc) { *out++ = '\\'; - *out++ = esc; - } else if(c < 0x20) { + c = esc; + } else if(options[Option::unicode]) { + if(uint8_t(c) < 0x20 || (c & 0x80)) { + *out++ = '\\'; + *out++ = 'u'; + *out++ = '0'; + *out++ = '0'; + *out++ = hexchar(uint8_t(c) >> 4); + c = hexchar(uint8_t(c) & 0x0f); + } + } else if(uint8_t(c) < 0x20) { *out++ = '\\'; *out++ = 'x'; *out++ = hexchar(uint8_t(c) >> 4); - *out++ = hexchar(uint8_t(c) & 0x0f); - } else { - *out++ = c; + c = hexchar(uint8_t(c) & 0x0f); + } else if((c & 0x80) && options[Option::utf8]) { + *out++ = 0xc2; } + *out++ = c; } return extra; } diff --git a/Sming/Core/Data/Format/Formatter.h b/Sming/Core/Data/Format/Formatter.h index 9b45322165..45e59aeb84 100644 --- a/Sming/Core/Data/Format/Formatter.h +++ b/Sming/Core/Data/Format/Formatter.h @@ -14,15 +14,26 @@ #include #include +#include namespace Format { +enum class Option { + unicode, //< Use unicode escapes \uNNNN, otherwise hex \xNN + utf8, ///< Convert extended ASCII to UTF8 + doublequote, + singlequote, + backslash, +}; +using Options = BitSet; + /** * @brief Escape standard control codes such as `\n` (below ASCII 0x20) * @param value String to be modified + * @param options * @retval unsigned Number of control characters found and replaced */ -unsigned escapeControls(String& value); +unsigned escapeControls(String& value, Options options); /** * @brief Virtual class to perform format-specific String adjustments diff --git a/Sming/Core/Data/Format/Json.cpp b/Sming/Core/Data/Format/Json.cpp index 536bb017df..522e96757a 100644 --- a/Sming/Core/Data/Format/Json.cpp +++ b/Sming/Core/Data/Format/Json.cpp @@ -17,46 +17,6 @@ namespace Format { Json json; -namespace -{ -bool IsValidUtf8(const char* str, unsigned length) -{ - if(str == nullptr) { - return true; - } - - unsigned i = 0; - while(i < length) { - char c = str[i++]; - if((c & 0x80) == 0) { - continue; - } - - if(i >= length) { - return false; // incomplete multibyte char - } - - if(c & 0x20) { - c = str[i++]; - if((c & 0xC0) != 0x80) { - return false; // malformed trail byte or out of range char - } - if(i >= length) { - return false; // incomplete multibyte char - } - } - - c = str[i++]; - if((c & 0xC0) != 0x80) { - return false; // malformed trail byte - } - } - - return true; -} - -} // namespace - /* * Check for invalid characters and replace them - can break browser * operation otherwise. @@ -66,17 +26,19 @@ bool IsValidUtf8(const char* str, unsigned length) */ void Json::escape(String& value) const { - escapeControls(value); - if(!IsValidUtf8(value.c_str(), value.length())) { - debug_w("Invalid UTF8: %s", value.c_str()); - for(unsigned i = 0; i < value.length(); ++i) { - char& c = value[i]; - if(c < 0x20 || uint8_t(c) > 127) - c = '_'; - } - } + escapeControls(value, Option::unicode | Option::doublequote | Option::backslash); +} - value.replace("\"", "\\\""); +void Json::quote(String& value) const +{ + escape(value); + auto len = value.length(); + if(value.setLength(len + 2)) { + auto s = value.begin(); + memmove(s + 1, s, len); + s[0] = '"'; + s[len + 1] = '"'; + } } } // namespace Format diff --git a/Sming/Core/Data/Format/Json.h b/Sming/Core/Data/Format/Json.h index 2c7f35f983..4abd2a7ee4 100644 --- a/Sming/Core/Data/Format/Json.h +++ b/Sming/Core/Data/Format/Json.h @@ -20,6 +20,7 @@ class Json : public Standard { public: void escape(String& value) const override; + void quote(String& value) const override; MimeType mimeType() const override { diff --git a/tests/HostTests/modules/Formatter.cpp b/tests/HostTests/modules/Formatter.cpp index 38fbd0bbca..0e0593f4e7 100644 --- a/tests/HostTests/modules/Formatter.cpp +++ b/tests/HostTests/modules/Formatter.cpp @@ -10,17 +10,41 @@ class FormatterTest : public TestGroup void execute() override { + // Note: \xa3 is unicode for £ DEFINE_FSTR_LOCAL(text1, "A JSON\ntest string\twith escapes\x12\0\n" - "Worth maybe \xc2\xa3" - "0.53.") - DEFINE_FSTR_LOCAL(text1b, "A JSON\\ntest string\\twith escapes\\x12\\0\\n" - "Worth maybe \xc2\xa3" - "0.53.") - - Serial << text1 << endl; - String s(text1); - Format::json.escape(s); - REQUIRE_EQ(s, text1b); + "Worth \"maybe\" \xa3 0.53. Yen \xa5 5bn.") + + TEST_CASE("JSON") + { + DEFINE_FSTR_LOCAL(text1b, "A JSON\\ntest string\\twith escapes\\u0012\\u0000\\n" + "Worth \\\"maybe\\\" \\u00a3 0.53. Yen \\u00a5 5bn.") + + Serial << text1 << endl; + String s(text1); + Format::json.escape(s); + REQUIRE_EQ(s, text1b); + + s = text1; + Format::json.quote(s); + String quoted = String('"') + text1b + '"'; + REQUIRE_EQ(s, quoted); + } + + TEST_CASE("C++") + { + DEFINE_FSTR_LOCAL( + text1a, "A JSON\\ntest string\\twith escapes\\x12\\0\\nWorth \\\"maybe\\\" \xa3 0.53. Yen \xa5 5bn.") + String s(text1); + Format::escapeControls(s, Format::Option::doublequote | Format::Option::backslash); + REQUIRE_EQ(s, text1a); + + DEFINE_FSTR_LOCAL( + text1b, + "A JSON\\ntest string\\twith escapes\\x12\\0\\nWorth \\\"maybe\\\" \xc2\xa3 0.53. Yen \xc2\xa5 5bn.") + s = text1; + Format::escapeControls(s, Format::Option::utf8 | Format::Option::doublequote | Format::Option::backslash); + REQUIRE_EQ(s, text1b); + } } };