From b770786f59b5f2ea790639920efbb853abea44a3 Mon Sep 17 00:00:00 2001 From: Eduardo Silva Date: Tue, 3 Dec 2024 20:22:02 -0600 Subject: [PATCH 01/11] utf8: enhance handling of multibyte sequences This patch slightly refactors how UTF8 decoding works by replacing the old lookup table for special characters/codepoints with a new routine and optional lookup table based on the compiler type (GNU/Clang). It also supports proper encoding of multibyte sequences. Signed-off-by: Eduardo Silva --- include/fluent-bit/flb_utf8.h | 82 ++------------ src/flb_utf8.c | 201 ++++++++++++++++++++++++++++++++++ 2 files changed, 208 insertions(+), 75 deletions(-) create mode 100644 src/flb_utf8.c diff --git a/include/fluent-bit/flb_utf8.h b/include/fluent-bit/flb_utf8.h index 00cb08d066f..b883ff0f78c 100644 --- a/include/fluent-bit/flb_utf8.h +++ b/include/fluent-bit/flb_utf8.h @@ -20,84 +20,16 @@ #ifndef FLB_UTF8_H #define FLB_UTF8_H +#define FLB_UTF8_ACCEPT 0 +#define FLB_UTF8_REJECT 1 +#define FLB_UTF8_CONTINUE 2 + #include #include -/* is the start of a UTF-8 string ? 
*/ -#define flb_utf8_check(c) (((c) & 0xC0) != 0x80) - -static const char trailingBytesForUTF8[256] = { - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, - 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, - 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5 -}; - /* returns length of next utf-8 sequence */ -static inline int flb_utf8_len(const char *s) -{ - return trailingBytesForUTF8[(unsigned int)(unsigned char)s[0]] + 1; -} - -/* - * UTF-8 Decoding routines are originally written by Bjoern Hoehrmann - * and taken from the following web site: - * - * http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ - * - * They have been siglhy renamed to follow Fluent Bit naming requirements. 
- */ - -#define FLB_UTF8_ACCEPT 0 -#define FLB_UTF8_REJECT 1 - -static const uint8_t utf8d[] = { - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 00..1f - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 20..3f - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 40..5f - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 60..7f - 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, // 80..9f - 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, // a0..bf - 8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, // c0..df - 0xa,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x4,0x3,0x3, // e0..ef - 0xb,0x6,0x6,0x6,0x5,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8, // f0..ff - 0x0,0x1,0x2,0x3,0x5,0x8,0x7,0x1,0x1,0x1,0x4,0x6,0x1,0x1,0x1,0x1, // s0..s0 - 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,0,1,0,1,1,1,1,1,1, // s1..s2 - 1,2,1,1,1,1,1,2,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1, // s3..s4 - 1,2,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,3,1,3,1,1,1,1,1,1, // s5..s6 - 1,3,1,1,1,1,1,3,1,3,1,1,1,1,1,1,1,3,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // s7..s8 -}; - -static inline uint32_t flb_utf8_decode(uint32_t *state, uint32_t *codep, - uint32_t byte) -{ - uint32_t type = utf8d[byte]; - - *codep = (*state != FLB_UTF8_ACCEPT) ? 
- (byte & 0x3fu) | (*codep << 6) : - (0xff >> type) & (byte); - - *state = utf8d[256 + *state*16 + type]; - return *state; -} - - -static inline void flb_utf8_print(const uint8_t *s) { - uint32_t codepoint; - uint32_t state = 0; - - for (; *s; ++s) - if (!flb_utf8_decode(&state, &codepoint, *s)) { - printf("\\u%04x\n", codepoint); - } - - if (state != FLB_UTF8_ACCEPT) { - printf("The string is not well-formed\n"); - } -} +int flb_utf8_len(const char *s); +uint32_t flb_utf8_decode(uint32_t *state, uint32_t *codep, uint8_t byte); +void flb_utf8_print(char *input); #endif diff --git a/src/flb_utf8.c b/src/flb_utf8.c new file mode 100644 index 00000000000..ba8b4696415 --- /dev/null +++ b/src/flb_utf8.c @@ -0,0 +1,201 @@ +/* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ + +/* Fluent Bit + * ========== + * Copyright (C) 2015-2024 The Fluent Bit Authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include + +#include +#include +#include + +static const char trailing_bytes_for_utf8[256] = { + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5 +}; + +/* returns length of next utf-8 sequence */ +int flb_utf8_len(const char *s) +{ + return trailing_bytes_for_utf8[(unsigned int)(unsigned char)s[0]] + 1; +} + +#if defined(__GNUC__) || defined(__clang__) +/* + * if we are compiling with GNU or CLang compiler , we have the ranges + * functionality available, so we can tweak our decoder by using a lookup + * table. + * + * Lookup table for byte classification and state transitions: + * + * Format: {initial_state, bitmask, expected_continuation_bytes} + * ASCII: state 0, no continuation bytes + * Start of multi-byte sequence: state X, continuation byte count + * Invalid: reject state + */ +static const uint8_t utf8_lookup[256][3] = { + [0x00 ... 0x7F] = {0, 0x7F, 0}, /* ASCII */ + [0xC0 ... 0xDF] = {1, 0x1F, 1}, /* Start of 2-byte sequence */ + [0xE0 ... 0xEF] = {2, 0x0F, 2}, /* Start of 3-byte sequence */ + [0xF0 ... 0xF7] = {3, 0x07, 3}, /* Start of 4-byte sequence */ + [0x80 ... 0xBF] = {FLB_UTF8_REJECT, 0, 0}, /* Continuation bytes */ + [0xF8 ... 
0xFF] = {FLB_UTF8_REJECT, 0, 0}, /* Invalid bytes */ +}; + +uint32_t flb_utf8_decode(uint32_t *state, uint32_t *codep, uint8_t byte) +{ + const uint8_t *entry = utf8_lookup[byte]; + + if (*state == FLB_UTF8_ACCEPT) { + /* starting a new character */ + *state = entry[0]; + if (*state == FLB_UTF8_REJECT) { + /* invalid start byte */ + return FLB_UTF8_REJECT; + } + *codep = byte & entry[1]; + } + else { + /* continuation byte */ + if ((byte & 0xC0) == 0x80) { + *codep = (*codep << 6) | (byte & 0x3F); + /* decrement continuation bytes */ + (*state)--; + } + else { + /* invalid continuation byte */ + *state = FLB_UTF8_REJECT; + return FLB_UTF8_REJECT; + } + } + + /* check if the sequence is complete */ + if (*state == 0) { + if (*codep >= 0xD800 && *codep <= 0xDFFF) { + /* surrogate pair (invalid UTF-8) */ + *state = FLB_UTF8_REJECT; + return FLB_UTF8_REJECT; + } + else if (*codep > 0x10FFFF) { + /* out of range codepoint */ + *state = FLB_UTF8_REJECT; + return FLB_UTF8_REJECT; + } + /* valid and complete sequence */ + return FLB_UTF8_ACCEPT; + } + + /* we are still processing the current sequence */ + return FLB_UTF8_CONTINUE; +} + +#else + +/* fallback decoder: no lookup table */ +uint32_t flb_utf8_decode(uint32_t *state, uint32_t *codep, uint8_t byte) +{ + /* Start of a new character */ + if (*state == 0) { + if (byte <= 0x7F) { + /* ASCII */ + *codep = byte; + return FLB_UTF8_ACCEPT; + } + else if ((byte & 0xE0) == 0xC0) { + /* start of a 2-byte sequence */ + *codep = byte & 0x1F; + *state = 1; + } + else if ((byte & 0xF0) == 0xE0) { + /* start of a 3-byte sequence */ + *codep = byte & 0x0F; + *state = 2; + } + else if ((byte & 0xF8) == 0xF0) { + /* start of a 4-byte sequence */ + *codep = byte & 0x07; + *state = 3; + } + else { + /* invalid first byte */ + *state = FLB_UTF8_REJECT; + return FLB_UTF8_REJECT; + } + } + else { + /* continuation byte */ + if ((byte & 0xC0) == 0x80) { + *codep = (*codep << 6) | (byte & 0x3F); + + /* reduce the expected continuation 
bytes */ + (*state)--; + } + else { + /* invalid continuation byte */ + *state = FLB_UTF8_REJECT; + return FLB_UTF8_REJECT; + } + } + + if (*state == 0) { + /* sequence complete */ + if (*codep >= 0xD800 && *codep <= 0xDFFF) { + /* invalid surrogate pair */ + *state = FLB_UTF8_REJECT; + return FLB_UTF8_REJECT; + } + else if (*codep > 0x10FFFF) { + /* codepoint is out of range */ + *state = FLB_UTF8_REJECT; + return FLB_UTF8_REJECT; + } + return FLB_UTF8_ACCEPT; + } + + /* we are still processing the current sequence */ + return FLB_UTF8_CONTINUE; +} + +#endif + +void flb_utf8_print(char *input) +{ + int i; + int ret; + int len; + uint32_t state = 0; + uint32_t codepoint = 0; + + len = strlen(input); + for (i = 0; i < len; i++) { + ret = flb_utf8_decode(&state, &codepoint, (uint8_t) input[i]); + if (ret == FLB_UTF8_ACCEPT) { + printf("Valid Codepoint: U+%04X\n", codepoint); + } + else if (ret == FLB_UTF8_REJECT) { + printf("Invalid UTF-8 sequence detected.\n"); + break; + } + } +} From f52366bc20defb8dbf3fe86721d5e5062f9d13ac Mon Sep 17 00:00:00 2001 From: Eduardo Silva Date: Tue, 3 Dec 2024 20:23:48 -0600 Subject: [PATCH 02/11] build: add flb_utf8.c Signed-off-by: Eduardo Silva --- src/CMakeLists.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 8404b65c1ca..bf1c50f147d 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -12,6 +12,7 @@ set(src flb_env.c flb_file.c flb_uri.c + flb_utf8.c flb_hash_table.c flb_help.c flb_pack.c From ca704322e4ddcfc4a5ea8090b83f3ff04ad65598 Mon Sep 17 00:00:00 2001 From: Eduardo Silva Date: Tue, 3 Dec 2024 20:24:35 -0600 Subject: [PATCH 03/11] utils: use new utf8 decoder API Signed-off-by: Eduardo Silva --- src/flb_utils.c | 83 +++++++++++++++++++++++++++++-------------------- 1 file changed, 50 insertions(+), 33 deletions(-) diff --git a/src/flb_utils.c b/src/flb_utils.c index 211632260e3..4834f7eecba 100644 --- a/src/flb_utils.c +++ b/src/flb_utils.c @@ -790,8 +790,10 @@ 
static const struct escape_seq json_escape_table[128] = { int flb_utils_write_str(char *buf, int *off, size_t size, const char *str, size_t str_len) { int i, b, ret, len, hex_bytes, utf_sequence_length, utf_sequence_number; + int processed_bytes = 0; int is_valid, copypos = 0, vlen; - uint32_t codepoint, state = 0; + uint32_t codepoint = 0; + uint32_t state = 0; char tmp[16]; size_t available; uint32_t c; @@ -904,31 +906,40 @@ int flb_utils_write_str(char *buf, int *off, size_t size, const char *str, size_ /* decode UTF-8 sequence */ state = FLB_UTF8_ACCEPT; codepoint = 0; + processed_bytes = 0; for (b = 0; b < hex_bytes; b++) { s = (unsigned char *) &str[i + b]; ret = flb_utf8_decode(&state, &codepoint, *s); - if (ret == 0) { + processed_bytes++; + + if (ret == FLB_UTF8_ACCEPT) { + /* check if all required bytes for the sequence are processed */ + if (processed_bytes == hex_bytes) { + break; + } + } + else if (ret == FLB_UTF8_REJECT) { + flb_warn("[pack] Invalid UTF-8 bytes found, skipping."); break; } } - if (state != FLB_UTF8_ACCEPT) { - flb_warn("[pack] Invalid UTF-8 bytes found, skipping."); - } - else { - + if (state == FLB_UTF8_ACCEPT) { len = snprintf(tmp, sizeof(tmp), "\\u%.4x", codepoint); if (available < len) { - return FLB_FALSE; // Not enough space + return FLB_FALSE; } memcpy(p, tmp, len); p += len; offset += len; available -= len; } + else { + flb_warn("[pack] Invalid UTF-8 bytes found, skipping."); + } - i += hex_bytes; + i += processed_bytes; } /* Handle sequences beyond 0xFFFF */ else if (c > 0xFFFF) { @@ -940,26 +951,25 @@ int flb_utils_write_str(char *buf, int *off, size_t size, const char *str, size_ break; } + state = FLB_UTF8_ACCEPT; + codepoint = 0; is_valid = FLB_TRUE; + + /* Decode the sequence */ for (utf_sequence_number = 0; utf_sequence_number < utf_sequence_length; utf_sequence_number++) { - /* Leading characters must start with bits 11 */ - if (utf_sequence_number == 0 && ((str[i] & 0xC0) != 0xC0)) { - /* Invalid unicode character. 
replace */ - flb_debug("[pack] unexpected UTF-8 leading byte, " - "substituting character with replacement character"); - tmp[utf_sequence_number] = str[i]; - i++; /* Consume invalid leading byte */ - utf_sequence_length = utf_sequence_number + 1; - is_valid = FLB_FALSE; - break; - } - /* Trailing characters must start with bits 10 */ - else if (utf_sequence_number > 0 && ((str[i] & 0xC0) != 0x80)) { - /* Invalid unicode character. replace */ - flb_debug("[pack] unexpected UTF-8 continuation byte, " - "substituting character with replacement character"); - /* This byte, i, is the start of the next unicode character */ - utf_sequence_length = utf_sequence_number; + ret = flb_utf8_decode(&state, &codepoint, (uint8_t) str[i]); + + if (ret == FLB_UTF8_REJECT) { + if (utf_sequence_number == 0 && ((str[i] & 0xC0) != 0xC0)) { + tmp[utf_sequence_number] = str[i]; + i++; /* Consume invalid leading byte */ + utf_sequence_length = utf_sequence_number + 1; + } + else if (utf_sequence_number > 0 && ((str[i] & 0xC0) != 0x80)) { + utf_sequence_length = utf_sequence_number; + } + flb_debug("[pack] invalid UTF-8 sequence detected, replacing with substitution character"); + //codepoint = 0xFFFD; // Replacement character is_valid = FLB_FALSE; break; } @@ -968,17 +978,23 @@ int flb_utils_write_str(char *buf, int *off, size_t size, const char *str, size_ ++i; } - --i; + //--i; if (is_valid) { if (available < utf_sequence_length) { - return FLB_FALSE; // Not enough space + /* not enough space */ + return FLB_FALSE; } - encoded_to_buf(p, tmp, utf_sequence_length); - p += utf_sequence_length; + len = snprintf(tmp, sizeof(tmp), "\\u%.4x", codepoint); + if (available < len) { + /* not enough space */ + return FLB_FALSE; + } + memcpy(p, tmp, len); + p += len; offset += utf_sequence_length; - available -= utf_sequence_length; + available -= len;//utf_sequence_length; } else { if (available < utf_sequence_length * 3) { @@ -1035,8 +1051,9 @@ int flb_utils_write_str(char *buf, int *off, size_t 
size, const char *str, size_ available -= 3; } } + } - else { + else { if (available < 1) { /* no space for a single byte */ return FLB_FALSE; From d06b3fdf466fbb0ce50651394d47ab216b6fd927 Mon Sep 17 00:00:00 2001 From: Eduardo Silva Date: Tue, 3 Dec 2024 20:27:57 -0600 Subject: [PATCH 04/11] tests: internal: utils: adjusts expected utf8 encoded bytes Signed-off-by: Eduardo Silva --- tests/internal/utils.c | 53 ++++++++++++++++++++++++++++++------------ 1 file changed, 38 insertions(+), 15 deletions(-) diff --git a/tests/internal/utils.c b/tests/internal/utils.c index f55a3672c37..fb9f8c1f2a1 100644 --- a/tests/internal/utils.c +++ b/tests/internal/utils.c @@ -127,7 +127,8 @@ static void write_str_test_cases(struct write_str_case *cases) { } /* test case loop for flb_utils_write_str */ -static void write_str_test_cases_w_buf_size(struct write_str_case *cases, int buf_size) { +static void write_str_test_cases_w_buf_size(struct write_str_case *cases, int buf_size) +{ char *buf = flb_calloc(buf_size + 1, sizeof(char)); int size = buf_size + 1; int off; @@ -170,11 +171,13 @@ static void write_str_test_cases_w_buf_size(struct write_str_case *cases, int bu void test_write_str() { - char buf[10]; - char japanese_a[4] = {0xe3, 0x81, 0x82}; - int size = sizeof(buf); int off; int ret; + char buf[10] = {0}; + int size = sizeof(buf); + + /* escaped Unicode representation of あ */ + char jp_expected_output[] = "\\u3042"; off = 0; ret = flb_utils_write_str(buf, &off, size, "a", 1); @@ -189,15 +192,16 @@ void test_write_str() off = 0; ret = flb_utils_write_str(buf, &off, size, "\xe3\x81\x82", 3); TEST_CHECK(ret == FLB_TRUE); - TEST_CHECK(memcmp(buf, japanese_a, off) == 0); + TEST_CHECK(memcmp(buf, jp_expected_output, off) == 0); - // Truncated bytes + /* Truncated bytes: 'buf' should not be touched and off == 0 */ off = 0; ret = flb_utils_write_str(buf, &off, size, "\xe3\x81\x82\xe3", 1); TEST_CHECK(ret == FLB_TRUE); - TEST_CHECK(memcmp(buf, japanese_a, off) == 0); + 
TEST_CHECK(off == 0); + TEST_CHECK(memcmp(buf, jp_expected_output, off) == 0); - // Error: buffer too small + /* Error: buffer too small */ off = 0; ret = flb_utils_write_str(buf, &off, size, "aaaaaaaaaaa", 11); TEST_CHECK(ret == FLB_FALSE); @@ -238,7 +242,7 @@ void test_write_str_invalid_leading_byte() */ { "\x00\x01\xe3\x81\x82""abc", 8, /* note that 0x01 is an invalid byte */ - "\\u0000\\u0001""\xe3\x81\x82""abc", /* escape hex */ + "\\u0000\\u0001\\u3042""abc", /* escape hex */ FLB_TRUE }, /* @@ -252,7 +256,7 @@ void test_write_str_invalid_leading_byte() "\xee\x83\xb3" /* f3 fragment */ /* replace invalid unicode */ "\xee\x82\x81" /* 81 fragment */ "\xee\x82\x81" /* 81 fragment */ - "\xe3\x81\x82""abc", /* valid unicode */ + "\\u3042""abc", /* valid unicode */ FLB_TRUE }, /* @@ -263,7 +267,25 @@ void test_write_str_invalid_leading_byte() "\xf3\x81\x01\xe3\x81\x82""abc", 9, /* note that 0x01 is an invalid byte */ "\xee\x83\xb3" /* f3 fragment */ /* replace invalid unicode */ "\xee\x82\x81" /* 81 fragment */ - "\\u0001""\xe3\x81\x82""abc", + "\\u0001""\\u3042""abc", + FLB_TRUE + }, + { 0 } + }; + + write_str_test_cases(cases); +} + +void test_write_str_special_bytes() +{ + + struct write_str_case cases[] = { + /* + * Escaped leading hex (two hex, one valid unicode) + */ + { + "你好世界", 12, + "\\u4f60\\u597d\\u4e16\\u754c", FLB_TRUE }, { 0 } @@ -342,10 +364,10 @@ void test_write_str_buffer_overrun() FLB_FALSE }, { - "\"" + "a" "\xe3\x81\x82", 4, /* valid unicode */ - "\\\"""\xe3\x81\x82", /* just enough space for valid unicode */ - FLB_TRUE + "a", /* just enough space for valid ascii */ + FLB_FALSE /* no space for \u3042 */ }, { "\x81" @@ -662,7 +684,7 @@ struct size_to_bytes_check size_to_bytes_checks[] = { {"9223372036.78G", -1}, }; -void test_size_to_bytes() +void test_size_to_bytes() { int i; int size; @@ -682,6 +704,7 @@ TEST_LIST = { /* JSON maps iteration */ { "url_split", test_url_split }, { "write_str", test_write_str }, + { "write_str_special_bytes", 
test_write_str_special_bytes }, { "test_write_str_invalid_trailing_bytes", test_write_str_invalid_trailing_bytes }, { "test_write_str_invalid_leading_byte", test_write_str_invalid_leading_byte }, { "test_write_str_edge_cases", test_write_str_edge_cases }, From 8d8bda8fa9a0ff5ac590dd47daa85e8a2ccb88a5 Mon Sep 17 00:00:00 2001 From: Eduardo Silva Date: Thu, 5 Dec 2024 17:39:33 -0600 Subject: [PATCH 05/11] tests: internal: data: pack: generators: fix json encoding ascii utf8 Signed-off-by: Eduardo Silva --- tests/internal/data/pack/mixed.py | 26 +++++++------ tests/internal/data/pack/mixed_002.json | 2 +- tests/internal/data/pack/mixed_002.mp | Bin 37 -> 36 bytes tests/internal/data/pack/mixed_003.json | 2 +- tests/internal/data/pack/utf8_bell.json | 2 +- tests/internal/data/pack/utf8_copyright.json | 2 +- tests/internal/data/pack/utf8_gen.py | 37 +++++++++++-------- tests/internal/data/pack/utf8_hokke.json | 2 +- tests/internal/data/pack/utf8_relaxed.json | 2 +- 9 files changed, 42 insertions(+), 33 deletions(-) diff --git a/tests/internal/data/pack/mixed.py b/tests/internal/data/pack/mixed.py index 7ad0a6149b6..c8859231d98 100644 --- a/tests/internal/data/pack/mixed.py +++ b/tests/internal/data/pack/mixed.py @@ -6,22 +6,24 @@ import json import msgpack + def gen_json(f): - raw = open(f, 'r') - data = raw.read() - raw.close() + # Open the input file in text mode with UTF-8 encoding + with open(f, 'r', encoding='utf-8') as raw: + data = raw.read() - out_mp = f[:-4] + ".mp" - out_json = f[:-4] + ".json" + # Define output filenames + base_name = os.path.splitext(f)[0] + out_mp = base_name + ".mp" + out_json = base_name + ".json" - # Write messagepack - fmp = open(out_mp, 'w') - fmp.write(msgpack.packb(data)) - fmp.close() + # Write MessagePack-encoded data in binary mode + with open(out_mp, 'wb') as fmp: + fmp.write(msgpack.packb(data)) - fjson = open(out_json, 'w') - fjson.write(json.dumps(data)) - fjson.close() + # Write JSON-encoded data in text mode + with 
open(out_json, 'w', encoding='utf-8') as fjson: + fjson.write(json.dumps(data)) for fn in os.listdir('.'): if not os.path.isfile(fn): diff --git a/tests/internal/data/pack/mixed_002.json b/tests/internal/data/pack/mixed_002.json index e32314b0aa9..d1afe50ace3 100644 --- a/tests/internal/data/pack/mixed_002.json +++ b/tests/internal/data/pack/mixed_002.json @@ -1 +1 @@ -"mixed_002 =>\n\n áéíóú\n\n\n'\n\\t\n" \ No newline at end of file +"mixed_002 =>\n\n \u00e1\u00e9\u00ed\u00f3\u00fa\n\n\n'\n\\t\n" \ No newline at end of file diff --git a/tests/internal/data/pack/mixed_002.mp b/tests/internal/data/pack/mixed_002.mp index 1bf975535a1c3e87759e44027bf688f83f835be2..4938203d59a1b150b84400a5fa60eb0dcbff5798 100644 GIT binary patch delta 7 OcmY#YVZ1q!Nf`hKB>~z1 delta 8 PcmY#UWxB;Mkx>}{2vGss diff --git a/tests/internal/data/pack/mixed_003.json b/tests/internal/data/pack/mixed_003.json index 167c89b8a06..126945cb3c7 100644 --- a/tests/internal/data/pack/mixed_003.json +++ b/tests/internal/data/pack/mixed_003.json @@ -1 +1 @@ -"á\n" \ No newline at end of file +"\u00e1\n" \ No newline at end of file diff --git a/tests/internal/data/pack/utf8_bell.json b/tests/internal/data/pack/utf8_bell.json index ced4da0cfe5..d0730c4a7ce 100644 --- a/tests/internal/data/pack/utf8_bell.json +++ b/tests/internal/data/pack/utf8_bell.json @@ -1 +1 @@ -"🔔" \ No newline at end of file +"\ud83d\udd14" \ No newline at end of file diff --git a/tests/internal/data/pack/utf8_copyright.json b/tests/internal/data/pack/utf8_copyright.json index 4d52a66f3d0..92d937cf7bd 100644 --- a/tests/internal/data/pack/utf8_copyright.json +++ b/tests/internal/data/pack/utf8_copyright.json @@ -1 +1 @@ -"©" \ No newline at end of file +"\u00a9" \ No newline at end of file diff --git a/tests/internal/data/pack/utf8_gen.py b/tests/internal/data/pack/utf8_gen.py index 606e8cc2d31..9f1eef2b9c2 100644 --- a/tests/internal/data/pack/utf8_gen.py +++ b/tests/internal/data/pack/utf8_gen.py @@ -6,27 +6,34 @@ import msgpack def 
gen_json(f): + print(f) - print f - - with io.open(f, 'rb') as raw: + with open(f, 'rb') as raw: data = raw.read() - out_mp = f[:-4] + ".mp" - out_json = f[:-4] + ".json" + out_mp = f"{os.path.splitext(f)[0]}.mp" + out_json = f"{os.path.splitext(f)[0]}.json" + + # Decode input bytes to a string + try: + decoded_data = data.decode('utf-8') + except UnicodeDecodeError as e: + print(f"Error: Unable to decode file {f} as UTF-8: {e}") + return # Write messagepack - fmp = open(out_mp, 'w') - fmp.write(msgpack.packb(data)) - fmp.close() + with open(out_mp, 'wb') as fmp: + fmp.write(msgpack.packb(decoded_data)) - fjson = open(out_json, 'w') - fjson.write(json.dumps(data).encode('utf8')) - fjson.close() + # Write JSON with properly encoded Unicode escape sequences + with open(out_json, 'w', encoding='utf-8') as fjson: + # Use json.dumps with ensure_ascii=True for \uXXXX escape sequences + escaped_data = json.dumps(decoded_data, ensure_ascii=True) + fjson.write(escaped_data) for fn in os.listdir('.'): - if not os.path.isfile(fn): - continue + if not os.path.isfile(fn): + continue - if fn.startswith('utf8_') and fn.endswith('.txt'): - gen_json(fn) + if fn.startswith('utf8_') and fn.endswith('.txt'): + gen_json(fn) diff --git a/tests/internal/data/pack/utf8_hokke.json b/tests/internal/data/pack/utf8_hokke.json index d93624bf21f..37f460c4b35 100644 --- a/tests/internal/data/pack/utf8_hokke.json +++ b/tests/internal/data/pack/utf8_hokke.json @@ -1 +1 @@ -"𩸽" \ No newline at end of file +"\ud867\ude3d" \ No newline at end of file diff --git a/tests/internal/data/pack/utf8_relaxed.json b/tests/internal/data/pack/utf8_relaxed.json index 4526bf40faf..2402faf9df4 100644 --- a/tests/internal/data/pack/utf8_relaxed.json +++ b/tests/internal/data/pack/utf8_relaxed.json @@ -1 +1 @@ -"☺" \ No newline at end of file +"\u263a" \ No newline at end of file From cfe3b8b00c9f59f985c20c55e1cd6463d9aa05d9 Mon Sep 17 00:00:00 2001 From: Eduardo Silva Date: Thu, 5 Dec 2024 17:40:41 -0600 Subject: 
[PATCH 06/11] utf8: remove lookup table Signed-off-by: Eduardo Silva --- src/flb_utf8.c | 74 -------------------------------------------------- 1 file changed, 74 deletions(-) diff --git a/src/flb_utf8.c b/src/flb_utf8.c index ba8b4696415..8e6ee71c68d 100644 --- a/src/flb_utf8.c +++ b/src/flb_utf8.c @@ -41,78 +41,6 @@ int flb_utf8_len(const char *s) return trailing_bytes_for_utf8[(unsigned int)(unsigned char)s[0]] + 1; } -#if defined(__GNUC__) || defined(__clang__) -/* - * if we are compiling with GNU or CLang compiler , we have the ranges - * functionality available, so we can tweak our decoder by using a lookup - * table. - * - * Lookup table for byte classification and state transitions: - * - * Format: {initial_state, bitmask, expected_continuation_bytes} - * ASCII: state 0, no continuation bytes - * Start of multi-byte sequence: state X, continuation byte count - * Invalid: reject state - */ -static const uint8_t utf8_lookup[256][3] = { - [0x00 ... 0x7F] = {0, 0x7F, 0}, /* ASCII */ - [0xC0 ... 0xDF] = {1, 0x1F, 1}, /* Start of 2-byte sequence */ - [0xE0 ... 0xEF] = {2, 0x0F, 2}, /* Start of 3-byte sequence */ - [0xF0 ... 0xF7] = {3, 0x07, 3}, /* Start of 4-byte sequence */ - [0x80 ... 0xBF] = {FLB_UTF8_REJECT, 0, 0}, /* Continuation bytes */ - [0xF8 ... 
0xFF] = {FLB_UTF8_REJECT, 0, 0}, /* Invalid bytes */ -}; - -uint32_t flb_utf8_decode(uint32_t *state, uint32_t *codep, uint8_t byte) -{ - const uint8_t *entry = utf8_lookup[byte]; - - if (*state == FLB_UTF8_ACCEPT) { - /* starting a new character */ - *state = entry[0]; - if (*state == FLB_UTF8_REJECT) { - /* invalid start byte */ - return FLB_UTF8_REJECT; - } - *codep = byte & entry[1]; - } - else { - /* continuation byte */ - if ((byte & 0xC0) == 0x80) { - *codep = (*codep << 6) | (byte & 0x3F); - /* decrement continuation bytes */ - (*state)--; - } - else { - /* invalid continuation byte */ - *state = FLB_UTF8_REJECT; - return FLB_UTF8_REJECT; - } - } - - /* check if the sequence is complete */ - if (*state == 0) { - if (*codep >= 0xD800 && *codep <= 0xDFFF) { - /* surrogate pair (invalid UTF-8) */ - *state = FLB_UTF8_REJECT; - return FLB_UTF8_REJECT; - } - else if (*codep > 0x10FFFF) { - /* out of range codepoint */ - *state = FLB_UTF8_REJECT; - return FLB_UTF8_REJECT; - } - /* valid and complete sequence */ - return FLB_UTF8_ACCEPT; - } - - /* we are still processing the current sequence */ - return FLB_UTF8_CONTINUE; -} - -#else - -/* fallback decoder: no lookup table */ uint32_t flb_utf8_decode(uint32_t *state, uint32_t *codep, uint8_t byte) { /* Start of a new character */ @@ -177,8 +105,6 @@ uint32_t flb_utf8_decode(uint32_t *state, uint32_t *codep, uint8_t byte) return FLB_UTF8_CONTINUE; } -#endif - void flb_utf8_print(char *input) { int i; From cbd1a284222179248ebfddb0b77442343671565d Mon Sep 17 00:00:00 2001 From: Eduardo Silva Date: Thu, 5 Dec 2024 17:42:20 -0600 Subject: [PATCH 07/11] utils: add utf8 encoding for codepoints beyond BMP Signed-off-by: Eduardo Silva --- src/flb_utils.c | 47 +++++++++++++++++++++++++++++++---------------- 1 file changed, 31 insertions(+), 16 deletions(-) diff --git a/src/flb_utils.c b/src/flb_utils.c index 4834f7eecba..998c2ac66ca 100644 --- a/src/flb_utils.c +++ b/src/flb_utils.c @@ -786,20 +786,23 @@ static const struct 
escape_seq json_escape_table[128] = { * to escape special characters and convert utf-8 byte characters to string * representation. */ - int flb_utils_write_str(char *buf, int *off, size_t size, const char *str, size_t str_len) { int i, b, ret, len, hex_bytes, utf_sequence_length, utf_sequence_number; int processed_bytes = 0; int is_valid, copypos = 0, vlen; + uint32_t c; uint32_t codepoint = 0; uint32_t state = 0; - char tmp[16]; size_t available; - uint32_t c; - char *p; uint8_t *s; off_t offset = 0; + char tmp[16]; + char *p; + + /* to encode codepoints > 0xFFFF */ + uint16_t high; + uint16_t low; available = size - *off; @@ -957,19 +960,21 @@ int flb_utils_write_str(char *buf, int *off, size_t size, const char *str, size_ /* Decode the sequence */ for (utf_sequence_number = 0; utf_sequence_number < utf_sequence_length; utf_sequence_number++) { - ret = flb_utf8_decode(&state, &codepoint, (uint8_t) str[i]); + ret = flb_utf8_decode(&state, &codepoint, (uint8_t) str[i]); if (ret == FLB_UTF8_REJECT) { - if (utf_sequence_number == 0 && ((str[i] & 0xC0) != 0xC0)) { + /* Handle invalid leading byte */ + if (utf_sequence_number == 0) { + flb_debug("[pack] unexpected UTF-8 leading byte, substituting character"); tmp[utf_sequence_number] = str[i]; - i++; /* Consume invalid leading byte */ - utf_sequence_length = utf_sequence_number + 1; + utf_sequence_length = utf_sequence_number + 1; /* Process only this invalid byte */ + i++; /* Consume invalid byte */ } - else if (utf_sequence_number > 0 && ((str[i] & 0xC0) != 0x80)) { - utf_sequence_length = utf_sequence_number; + /* Handle invalid continuation byte */ + else { + flb_debug("[pack] unexpected UTF-8 continuation byte, substituting character"); + utf_sequence_length = utf_sequence_number; /* Adjust length */ } - flb_debug("[pack] invalid UTF-8 sequence detected, replacing with substitution character"); - //codepoint = 0xFFFD; // Replacement character is_valid = FLB_FALSE; break; } @@ -978,7 +983,7 @@ int 
flb_utils_write_str(char *buf, int *off, size_t size, const char *str, size_ ++i; } - //--i; + --i; if (is_valid) { if (available < utf_sequence_length) { @@ -986,15 +991,25 @@ int flb_utils_write_str(char *buf, int *off, size_t size, const char *str, size_ return FLB_FALSE; } - len = snprintf(tmp, sizeof(tmp), "\\u%.4x", codepoint); + /* Handle codepoints beyond BMP (requires surrogate pairs in UTF-16) */ + if (codepoint > 0xFFFF) { + high = 0xD800 + ((codepoint - 0x10000) >> 10); + low = 0xDC00 + ((codepoint - 0x10000) & 0x3FF); + + len = snprintf(tmp, sizeof(tmp), "\\u%.4x\\u%.4x", high, low); + } + else { + len = snprintf(tmp, sizeof(tmp), "\\u%.4x", codepoint); + } + if (available < len) { /* not enough space */ return FLB_FALSE; } memcpy(p, tmp, len); p += len; - offset += utf_sequence_length; - available -= len;//utf_sequence_length; + offset += len; + available -= len; } else { if (available < utf_sequence_length * 3) { From 4b64b3b453ce531f67740ac0b095d5c844fb5d4c Mon Sep 17 00:00:00 2001 From: Eduardo Silva Date: Thu, 5 Dec 2024 17:49:26 -0600 Subject: [PATCH 08/11] sds: use new utf8 decoder for sds_cat_utf8 Signed-off-by: Eduardo Silva --- src/flb_sds.c | 30 +++++++++++++++++++++++++++++- 1 file changed, 29 insertions(+), 1 deletion(-) diff --git a/src/flb_sds.c b/src/flb_sds.c index 759be89d9ab..1d8263b3432 100644 --- a/src/flb_sds.c +++ b/src/flb_sds.c @@ -28,6 +28,7 @@ #include #include #include +#include #include #include @@ -279,13 +280,15 @@ flb_sds_t flb_sds_copy(flb_sds_t s, const char *str, int len) return s; } -flb_sds_t flb_sds_cat_utf8 (flb_sds_t *sds, const char *str, int str_len) +flb_sds_t flb_sds_cat_utf8(flb_sds_t *sds, const char *str, int str_len) { static const char int2hex[] = "0123456789abcdef"; int i; int b; int ret; int hex_bytes; + int offset; + size_t size; uint32_t cp; uint32_t state = 0; unsigned char c; @@ -297,6 +300,7 @@ flb_sds_t flb_sds_cat_utf8 (flb_sds_t *sds, const char *str, int str_len) s = *sds; head = 
FLB_SDS_HEADER(s); + /* make sure we have at least str_len extra bytes available */ if (flb_sds_avail(s) <= str_len) { tmp = flb_sds_increase(s, str_len); if (tmp == NULL) { @@ -306,6 +310,30 @@ flb_sds_t flb_sds_cat_utf8 (flb_sds_t *sds, const char *str, int str_len) head = FLB_SDS_HEADER(s); } + while (1) { + offset = head->len; + ret = flb_utils_write_str(s, &offset, flb_sds_alloc(s), str, str_len); + if (ret == FLB_FALSE) { + /* realloc */ + size = flb_sds_alloc(s) * 2; + tmp = flb_sds_increase(s, size); + if (tmp == NULL) { + return NULL; + } + *sds = s = tmp; + head = FLB_SDS_HEADER(s); + } + else { + break; + } + } + + flb_sds_len_set(s, offset); + s[head->len] = '\0'; + return s; + + + for (i = 0; i < str_len; i++) { if (flb_sds_avail(s) < 8) { tmp = flb_sds_increase(s, 8); From fc236e72c63c413c4e68247915e3e05be6ddacf3 Mon Sep 17 00:00:00 2001 From: Eduardo Silva Date: Thu, 5 Dec 2024 17:50:20 -0600 Subject: [PATCH 09/11] tests: internal: sds: adjust unit test for utf8 Signed-off-by: Eduardo Silva --- tests/internal/sds.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/tests/internal/sds.c b/tests/internal/sds.c index 30dc166ddb1..d9d5db1027f 100644 --- a/tests/internal/sds.c +++ b/tests/internal/sds.c @@ -44,7 +44,7 @@ static void test_sds_printf_7143_off_by_1() flb_sds_t test; flb_sds_t test2; int len; - + /* 66 char final string, not impacted by bug */ test = flb_sds_create_size(64); TEST_CHECK(test != NULL); @@ -69,13 +69,20 @@ static void test_sds_printf_7143_off_by_1() static void test_sds_cat_utf8() { + int ret; flb_sds_t s; char *utf8_str = "\xe8\x9f\xb9\xf0\x9f\xa6\x80"; + char *expected = "\\u87f9\\ud83e\\udd80"; s = flb_sds_create(""); flb_sds_cat_utf8(&s, utf8_str, strlen(utf8_str)); - TEST_CHECK(strcmp(s, "\\u87f9\\u1f980") == 0); + ret = strcmp(s, expected); + TEST_CHECK(ret == 0); + if (ret != 0) { + printf("Expected: %s\n", expected); + printf("Received: %s\n", s); + } flb_sds_destroy(s); } From 
f27201e10129a5264a012b840bcea9ccbe7a4457 Mon Sep 17 00:00:00 2001 From: Eduardo Silva Date: Thu, 5 Dec 2024 17:50:42 -0600 Subject: [PATCH 10/11] tests: internal: utils: add missing test Signed-off-by: Eduardo Silva --- tests/internal/utils.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/tests/internal/utils.c b/tests/internal/utils.c index fb9f8c1f2a1..6195c0e848e 100644 --- a/tests/internal/utils.c +++ b/tests/internal/utils.c @@ -278,7 +278,6 @@ void test_write_str_invalid_leading_byte() void test_write_str_special_bytes() { - struct write_str_case cases[] = { /* * Escaped leading hex (two hex, one valid unicode) @@ -288,6 +287,11 @@ void test_write_str_special_bytes() "\\u4f60\\u597d\\u4e16\\u754c", FLB_TRUE }, + { + "\xC3\xA1\x0A", 3, /* UTF-8 encoding of á and newline */ + "\\u00e1\\n", /* Expected escaped output */ + FLB_TRUE + }, { 0 } }; From 0d2267738fc275fcf990c9322439ff8de5f605d0 Mon Sep 17 00:00:00 2001 From: Eduardo Silva Date: Thu, 12 Dec 2024 10:36:03 -0600 Subject: [PATCH 11/11] sds: utf8: remove dead code Signed-off-by: Eduardo Silva --- src/flb_sds.c | 100 -------------------------------------------------- 1 file changed, 100 deletions(-) diff --git a/src/flb_sds.c b/src/flb_sds.c index 1d8263b3432..4ac36ad22e8 100644 --- a/src/flb_sds.c +++ b/src/flb_sds.c @@ -331,106 +331,6 @@ flb_sds_t flb_sds_cat_utf8(flb_sds_t *sds, const char *str, int str_len) flb_sds_len_set(s, offset); s[head->len] = '\0'; return s; - - - - for (i = 0; i < str_len; i++) { - if (flb_sds_avail(s) < 8) { - tmp = flb_sds_increase(s, 8); - if (tmp == NULL) { - return NULL; - } - *sds = s = tmp; - head = FLB_SDS_HEADER(s); - } - - c = (unsigned char)str[i]; - if (c == '\\' || c == '"') { - s[head->len++] = '\\'; - s[head->len++] = c; - } - else if (c >= '\b' && c <= '\r') { - s[head->len++] = '\\'; - switch (c) { - case '\n': - s[head->len++] = 'n'; - break; - case '\t': - s[head->len++] = 't'; - break; - case '\b': - s[head->len++] = 'b'; - break; - 
case '\f': - s[head->len++] = 'f'; - break; - case '\r': - s[head->len++] = 'r'; - break; - case '\v': - s[head->len++] = 'u'; - s[head->len++] = '0'; - s[head->len++] = '0'; - s[head->len++] = '0'; - s[head->len++] = 'b'; - break; - } - } - else if (c < 32 || c == 0x7f) { - s[head->len++] = '\\'; - s[head->len++] = 'u'; - s[head->len++] = '0'; - s[head->len++] = '0'; - s[head->len++] = int2hex[ (unsigned char) ((c & 0xf0) >> 4)]; - s[head->len++] = int2hex[ (unsigned char) (c & 0x0f)]; - } - else if (c >= 0x80) { - hex_bytes = flb_utf8_len(str + i); - state = FLB_UTF8_ACCEPT; - cp = 0; - for (b = 0; b < hex_bytes; b++) { - p = (const unsigned char *) str + i + b; - if (p >= (unsigned char *) (str + str_len)) { - break; - } - ret = flb_utf8_decode(&state, &cp, *p); - if (ret == 0) { - break; - } - } - - if (state != FLB_UTF8_ACCEPT) { - /* Invalid UTF-8 hex, just skip utf-8 bytes */ - flb_warn("[pack] invalid UTF-8 bytes, skipping"); - break; - } - - s[head->len++] = '\\'; - s[head->len++] = 'u'; - if (cp > 0xFFFF) { - c = (unsigned char) ((cp & 0xf00000) >> 20); - if (c > 0) { - s[head->len++] = int2hex[c]; - } - c = (unsigned char) ((cp & 0x0f0000) >> 16); - if (c > 0) { - s[head->len++] = int2hex[c]; - } - } - s[head->len++] = int2hex[ (unsigned char) ((cp & 0xf000) >> 12)]; - s[head->len++] = int2hex[ (unsigned char) ((cp & 0x0f00) >> 8)]; - s[head->len++] = int2hex[ (unsigned char) ((cp & 0xf0) >> 4)]; - s[head->len++] = int2hex[ (unsigned char) (cp & 0x0f)]; - i += (hex_bytes - 1); - } - else { - s[head->len++] = c; - } - } - - s[head->len] = '\0'; - - return s; } flb_sds_t flb_sds_printf(flb_sds_t *sds, const char *fmt, ...)