Skip to content

Commit

Permalink
Include unicode formatting support for JSON escapes (#2880)
Browse files Browse the repository at this point in the history
This PR improves on #2875 for JSON string formatting, updating the `escapeControls` function to handle unicode escaping correctly.

The check for valid UTF8 has been removed. That's erroneous and got in there because of my lack of understanding about how JSON is encoded. It's fundamentally UNICODE, so characters outside of standard ASCII must all be encoded using `\uNNNN` escapes. That ensures output is always valid JSON, even if string input is garbage.

The `escapeControls` function has a `options` parameter which makes it a lot more useful for various escaping requirements. This is also more efficient as the input string requires only two passes for both escaping and quoting: the first pass establishes the new string length, the buffer is re-allocated then the modified string produced.
  • Loading branch information
mikee47 authored Aug 31, 2024
1 parent 2d33243 commit 6caca47
Show file tree
Hide file tree
Showing 5 changed files with 92 additions and 82 deletions.
54 changes: 33 additions & 21 deletions Sming/Core/Data/Format/Formatter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -12,26 +12,25 @@

#include "Formatter.h"

namespace
namespace Format
{
/**
* @brief Get character used for standard escapes
* @param c Code to be escaped
* @param options
* @retval char Corresponding character, NUL if there isn't a standard escape
*/
char escapeChar(char c)
char escapeChar(char c, Options options)
{
switch(c) {
case '\0':
return '0';
case '\'':
return '\'';
return options[Option::unicode] ? '\0' : '0';
case '\"':
return '"';
case '\?':
return '?';
return options[Option::doublequote] ? c : '\0';
case '\'':
return options[Option::singlequote] ? c : '\0';
case '\\':
return '\\';
return options[Option::backslash] ? c : '\0';
case '\a':
return 'a';
case '\b':
Expand All @@ -51,19 +50,22 @@ char escapeChar(char c)
}
}

} // namespace

namespace Format
{
unsigned escapeControls(String& value)
unsigned escapeControls(String& value, Options options)
{
// Count number of extra characters we'll need to insert
unsigned extra{0};
for(auto& c : value) {
if(escapeChar(c)) {
if(escapeChar(c, options)) {
extra += 1; // "\"
} else if(options[Option::unicode]) {
if(uint8_t(c) < 0x20 || (c & 0x80)) {
extra += 5; // "\uNNNN"
}
} else if(uint8_t(c) < 0x20) {
extra += 3; // "\xnn"
} else if((c & 0x80) && options[Option::utf8]) {
// Characters such as £ (0xa3) are escaped to 0xc2 0xa3 in UTF-8
extra += 1; // 0xc2
}
}
if(extra == 0) {
Expand All @@ -79,18 +81,28 @@ unsigned escapeControls(String& value)
in += extra;
while(len--) {
uint8_t c = *in++;
auto esc = escapeChar(c);
auto esc = escapeChar(c, options);
if(esc) {
*out++ = '\\';
*out++ = esc;
} else if(c < 0x20) {
c = esc;
} else if(options[Option::unicode]) {
if(uint8_t(c) < 0x20 || (c & 0x80)) {
*out++ = '\\';
*out++ = 'u';
*out++ = '0';
*out++ = '0';
*out++ = hexchar(uint8_t(c) >> 4);
c = hexchar(uint8_t(c) & 0x0f);
}
} else if(uint8_t(c) < 0x20) {
*out++ = '\\';
*out++ = 'x';
*out++ = hexchar(uint8_t(c) >> 4);
*out++ = hexchar(uint8_t(c) & 0x0f);
} else {
*out++ = c;
c = hexchar(uint8_t(c) & 0x0f);
} else if((c & 0x80) && options[Option::utf8]) {
*out++ = 0xc2;
}
*out++ = c;
}
return extra;
}
Expand Down
13 changes: 12 additions & 1 deletion Sming/Core/Data/Format/Formatter.h
Original file line number Diff line number Diff line change
Expand Up @@ -14,15 +14,26 @@

#include <WString.h>
#include <Data/WebConstants.h>
#include <Data/BitSet.h>

namespace Format
{
enum class Option {
unicode, //< Use unicode escapes \uNNNN, otherwise hex \xNN
utf8, ///< Convert extended ASCII to UTF8
doublequote,
singlequote,
backslash,
};
using Options = BitSet<uint8_t, Option, 5>;

/**
* @brief Escape standard control codes such as `\n` (below ASCII 0x20)
* @param value String to be modified
* @param options
* @retval unsigned Number of control characters found and replaced
*/
unsigned escapeControls(String& value);
unsigned escapeControls(String& value, Options options);

/**
* @brief Virtual class to perform format-specific String adjustments
Expand Down
62 changes: 12 additions & 50 deletions Sming/Core/Data/Format/Json.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -17,46 +17,6 @@ namespace Format
{
Json json;

namespace
{
bool IsValidUtf8(const char* str, unsigned length)
{
if(str == nullptr) {
return true;
}

unsigned i = 0;
while(i < length) {
char c = str[i++];
if((c & 0x80) == 0) {
continue;
}

if(i >= length) {
return false; // incomplete multibyte char
}

if(c & 0x20) {
c = str[i++];
if((c & 0xC0) != 0x80) {
return false; // malformed trail byte or out of range char
}
if(i >= length) {
return false; // incomplete multibyte char
}
}

c = str[i++];
if((c & 0xC0) != 0x80) {
return false; // malformed trail byte
}
}

return true;
}

} // namespace

/*
* Check for invalid characters and replace them - can break browser
* operation otherwise.
Expand All @@ -66,17 +26,19 @@ bool IsValidUtf8(const char* str, unsigned length)
*/
void Json::escape(String& value) const
{
escapeControls(value);
if(!IsValidUtf8(value.c_str(), value.length())) {
debug_w("Invalid UTF8: %s", value.c_str());
for(unsigned i = 0; i < value.length(); ++i) {
char& c = value[i];
if(c < 0x20 || uint8_t(c) > 127)
c = '_';
}
}
escapeControls(value, Option::unicode | Option::doublequote | Option::backslash);
}

value.replace("\"", "\\\"");
void Json::quote(String& value) const
{
escape(value);
auto len = value.length();
if(value.setLength(len + 2)) {
auto s = value.begin();
memmove(s + 1, s, len);
s[0] = '"';
s[len + 1] = '"';
}
}

} // namespace Format
1 change: 1 addition & 0 deletions Sming/Core/Data/Format/Json.h
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ class Json : public Standard
{
public:
void escape(String& value) const override;
void quote(String& value) const override;

MimeType mimeType() const override
{
Expand Down
44 changes: 34 additions & 10 deletions tests/HostTests/modules/Formatter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -10,17 +10,41 @@ class FormatterTest : public TestGroup

void execute() override
{
// Note: \xa3 is unicode for £
DEFINE_FSTR_LOCAL(text1, "A JSON\ntest string\twith escapes\x12\0\n"
"Worth maybe \xc2\xa3"
"0.53.")
DEFINE_FSTR_LOCAL(text1b, "A JSON\\ntest string\\twith escapes\\x12\\0\\n"
"Worth maybe \xc2\xa3"
"0.53.")

Serial << text1 << endl;
String s(text1);
Format::json.escape(s);
REQUIRE_EQ(s, text1b);
"Worth \"maybe\" \xa3 0.53. Yen \xa5 5bn.")

TEST_CASE("JSON")
{
DEFINE_FSTR_LOCAL(text1b, "A JSON\\ntest string\\twith escapes\\u0012\\u0000\\n"
"Worth \\\"maybe\\\" \\u00a3 0.53. Yen \\u00a5 5bn.")

Serial << text1 << endl;
String s(text1);
Format::json.escape(s);
REQUIRE_EQ(s, text1b);

s = text1;
Format::json.quote(s);
String quoted = String('"') + text1b + '"';
REQUIRE_EQ(s, quoted);
}

TEST_CASE("C++")
{
DEFINE_FSTR_LOCAL(
text1a, "A JSON\\ntest string\\twith escapes\\x12\\0\\nWorth \\\"maybe\\\" \xa3 0.53. Yen \xa5 5bn.")
String s(text1);
Format::escapeControls(s, Format::Option::doublequote | Format::Option::backslash);
REQUIRE_EQ(s, text1a);

DEFINE_FSTR_LOCAL(
text1b,
"A JSON\\ntest string\\twith escapes\\x12\\0\\nWorth \\\"maybe\\\" \xc2\xa3 0.53. Yen \xc2\xa5 5bn.")
s = text1;
Format::escapeControls(s, Format::Option::utf8 | Format::Option::doublequote | Format::Option::backslash);
REQUIRE_EQ(s, text1b);
}
}
};

Expand Down

0 comments on commit 6caca47

Please sign in to comment.