From 0cb069d4564a9272f33fdd463263aa6e3db440fa Mon Sep 17 00:00:00 2001 From: Eduardo Silva Date: Wed, 4 Dec 2024 22:48:28 -0600 Subject: [PATCH] WIP Signed-off-by: Eduardo Silva --- src/flb_sds.c | 31 +++++++++- src/flb_utf8.c | 125 ++++++++++++++++++++++++++++++++--------- src/flb_utils.c | 35 ++++++++---- tests/internal/pack.c | 2 +- tests/internal/utils.c | 31 +++++++++- 5 files changed, 183 insertions(+), 41 deletions(-) diff --git a/src/flb_sds.c b/src/flb_sds.c index 759be89d9ab..67bc736348e 100644 --- a/src/flb_sds.c +++ b/src/flb_sds.c @@ -278,14 +278,17 @@ flb_sds_t flb_sds_copy(flb_sds_t s, const char *str, int len) return s; } +int flb_utils_write_str(char *buf, int *off, size_t size, const char *str, size_t str_len); -flb_sds_t flb_sds_cat_utf8 (flb_sds_t *sds, const char *str, int str_len) +flb_sds_t flb_sds_cat_utf8(flb_sds_t *sds, const char *str, int str_len) { static const char int2hex[] = "0123456789abcdef"; int i; int b; int ret; int hex_bytes; + int offset; + size_t size; uint32_t cp; uint32_t state = 0; unsigned char c; @@ -297,6 +300,7 @@ flb_sds_t flb_sds_cat_utf8 (flb_sds_t *sds, const char *str, int str_len) s = *sds; head = FLB_SDS_HEADER(s); + /* make sure we have at least str_len extra bytes available */ if (flb_sds_avail(s) <= str_len) { tmp = flb_sds_increase(s, str_len); if (tmp == NULL) { @@ -306,6 +310,31 @@ flb_sds_t flb_sds_cat_utf8 (flb_sds_t *sds, const char *str, int str_len) head = FLB_SDS_HEADER(s); } + while (1) { + offset = head->len; + ret = flb_utils_write_str(s, &offset, flb_sds_alloc(s), str, str_len); + if (ret == FLB_FALSE) { + /* realloc */ + size = flb_sds_alloc(s) * 2; + tmp = flb_sds_increase(s, size); + if (tmp == NULL) { + return NULL; + } + *sds = s = tmp; + head = FLB_SDS_HEADER(s); + } + else { + break; + } + } + + printf("ne woffset: %i\n", offset); + flb_sds_len_set(s, offset); + s[head->len] = '\0'; + return s; + + + for (i = 0; i < str_len; i++) { if (flb_sds_avail(s) < 8) { tmp = flb_sds_increase(s, 8); diff --git a/src/flb_utf8.c b/src/flb_utf8.c index ba8b4696415..ea4639fcba4 100644 --- a/src/flb_utf8.c +++ b/src/flb_utf8.c @@ -55,61 +55,134 @@ int flb_utf8_len(const char *s) * Invalid: reject state */ static const uint8_t utf8_lookup[256][3] = { - [0x00 ... 0x7F] = {0, 0x7F, 0}, /* ASCII */ - [0xC0 ... 0xDF] = {1, 0x1F, 1}, /* Start of 2-byte sequence */ - [0xE0 ... 0xEF] = {2, 0x0F, 2}, /* Start of 3-byte sequence */ - [0xF0 ... 0xF7] = {3, 0x07, 3}, /* Start of 4-byte sequence */ - [0x80 ... 0xBF] = {FLB_UTF8_REJECT, 0, 0}, /* Continuation bytes */ - [0xF8 ... 0xFF] = {FLB_UTF8_REJECT, 0, 0}, /* Invalid bytes */ + [0x00 ... 0x7F] = {FLB_UTF8_ACCEPT, 0x7F, 0}, /* ASCII */ + [0xC2 ... 0xDF] = {1, 0x1F, 1}, /* Start of 2-byte sequence */ + [0xE0 ... 0xEF] = {2, 0x0F, 2}, /* Start of 3-byte sequence */ + [0xF0 ... 0xF7] = {3, 0x07, 3}, /* Start of 4-byte sequence */ + [0x80 ... 0xBF] = {FLB_UTF8_CONTINUE, 0x3F, 0}, /* Continuation bytes */ + [0xC0 ... 0xC1] = {FLB_UTF8_REJECT, 0, 0}, /* Invalid 2-byte sequences */ + [0xF8 ... 0xFF] = {FLB_UTF8_REJECT, 0, 0}, /* Invalid bytes */ }; -uint32_t flb_utf8_decode(uint32_t *state, uint32_t *codep, uint8_t byte) -{ +// static const uint8_t utf8_lookup[256][3] = { +// [0x00 ... 0x7F] = {0, 0x7F, 0}, /* ASCII */ +// [0xC0 ... 0xDF] = {1, 0x1F, 1}, /* Start of 2-byte sequence */ +// [0xE0 ... 0xEF] = {2, 0x0F, 2}, /* Start of 3-byte sequence */ +// [0xF0 ... 0xF7] = {3, 0x07, 3}, /* Start of 4-byte sequence */ +// //[0x80 ... 0xBF] = {FLB_UTF8_REJECT, 0, 0}, /* Continuation bytes */ +// [0x80 ... 0xBF] = {FLB_UTF8_CONTINUE, 0x3F, 0}, /* Continuation bytes */ +// //[0x80 ... 0xBF] = {0, 0x3F, 0}, /* Valid continuation bytes */ +// [0xF8 ... 0xFF] = {FLB_UTF8_REJECT, 0, 0}, /* Invalid bytes */ +// }; + + + + +uint32_t flb_utf8_decode(uint32_t *state, uint32_t *codep, uint8_t byte) { + static const uint8_t utf8_lookup[256][3] = { + [0x00 ... 0x7F] = {0, 0x7F, 0}, /* ASCII */ + [0xC0 ... 0xDF] = {1, 0x1F, 1}, /* Start of 2-byte sequence */ + [0xE0 ... 0xEF] = {2, 0x0F, 2}, /* Start of 3-byte sequence */ + [0xF0 ... 0xF7] = {3, 0x07, 3}, /* Start of 4-byte sequence */ + [0x80 ... 0xBF] = {FLB_UTF8_CONTINUE, 0x3F, 0}, /* Continuation bytes */ + [0xF8 ... 0xFF] = {FLB_UTF8_REJECT, 0, 0}, /* Invalid bytes */ + }; + const uint8_t *entry = utf8_lookup[byte]; if (*state == FLB_UTF8_ACCEPT) { - /* starting a new character */ + /* Starting a new sequence */ *state = entry[0]; if (*state == FLB_UTF8_REJECT) { - /* invalid start byte */ + /* Invalid start byte */ return FLB_UTF8_REJECT; } - *codep = byte & entry[1]; - } - else { - /* continuation byte */ + *codep = byte & entry[1]; // Apply the mask to the first byte + if (*state != FLB_UTF8_CONTINUE) { + (*state)--; // Decrement the state + } + } else { + /* Continuation byte expected */ if ((byte & 0xC0) == 0x80) { + /* Valid continuation byte */ + + // Explicitly shift and OR the continuation byte *codep = (*codep << 6) | (byte & 0x3F); - /* decrement continuation bytes */ - (*state)--; - } - else { - /* invalid continuation byte */ + + (*state)--; /* Decrement the continuation state */ + } else { + /* Invalid continuation byte */ *state = FLB_UTF8_REJECT; return FLB_UTF8_REJECT; } } - /* check if the sequence is complete */ + /* Sequence is complete */ if (*state == 0) { if (*codep >= 0xD800 && *codep <= 0xDFFF) { - /* surrogate pair (invalid UTF-8) */ + /* Invalid surrogate pair */ *state = FLB_UTF8_REJECT; return FLB_UTF8_REJECT; - } - else if (*codep > 0x10FFFF) { - /* out of range codepoint */ + } else if (*codep > 0x10FFFF) { + /* Out of range codepoint */ *state = FLB_UTF8_REJECT; return FLB_UTF8_REJECT; } - /* valid and complete sequence */ return FLB_UTF8_ACCEPT; } - /* we are still processing the current sequence */ + /* Sequence is still incomplete */ return FLB_UTF8_CONTINUE; } + +// uint32_t flb_utf8_decode(uint32_t *state, uint32_t *codep, uint8_t byte) +// { +// const uint8_t *entry = utf8_lookup[byte]; + +// if (*state == FLB_UTF8_ACCEPT) { +// /* starting a new character */ +// *state = entry[0]; +// if (*state == FLB_UTF8_REJECT) { +// /* invalid start byte */ +// return FLB_UTF8_REJECT; +// } +// *codep = byte & entry[1]; +// } +// else { +// /* continuation byte */ +// if ((byte & 0xC0) == 0x80) { +// *codep = (*codep << 6) | (byte & 0x3F); +// /* decrement continuation bytes */ +// (*state)--; +// } +// else { +// /* invalid continuation byte */ +// *state = FLB_UTF8_REJECT; +// return FLB_UTF8_REJECT; +// } +// } + +// /* check if the sequence is complete */ +// if (*state == 0) { +// if (*codep >= 0xD800 && *codep <= 0xDFFF) { +// /* surrogate pair (invalid UTF-8) */ +// *state = FLB_UTF8_REJECT; +// return FLB_UTF8_REJECT; +// } +// else if (*codep > 0x10FFFF) { +// /* out of range codepoint */ +// *state = FLB_UTF8_REJECT; +// return FLB_UTF8_REJECT; +// } +// /* valid and complete sequence */ +// return FLB_UTF8_ACCEPT; +// } + +// /* we are still processing the current sequence */ +// return FLB_UTF8_CONTINUE; +// } + #else /* fallback decoder: no lookup table */ diff --git a/src/flb_utils.c b/src/flb_utils.c index 4834f7eecba..ed4b27e130f 100644 --- a/src/flb_utils.c +++ b/src/flb_utils.c @@ -786,7 +786,6 @@ static const struct escape_seq json_escape_table[128] = { * to escape special characters and convert utf-8 byte characters to string * representation. */ - int flb_utils_write_str(char *buf, int *off, size_t size, const char *str, size_t str_len) { int i, b, ret, len, hex_bytes, utf_sequence_length, utf_sequence_number; @@ -856,6 +855,7 @@ int flb_utils_write_str(char *buf, int *off, size_t size, const char *str, size_ /* Use lookup table for escaping known sequences */ if (c < 128 && json_escape_table[c].seq) { + printf("Debug: c = 0x%02X, seq = %s\n", c, json_escape_table[c].seq); /* * All characters in the table have a lenght of 2 or 6 bytes, * just check if the second byte starts with 'u' so we know @@ -868,6 +868,7 @@ int flb_utils_write_str(char *buf, int *off, size_t size, const char *str, size_ len = 2; } + printf("c=%x ; c < 128 | len = %d\n", c, len); /* check if we have anough space */ if (available < len) { return FLB_FALSE; @@ -881,6 +882,7 @@ int flb_utils_write_str(char *buf, int *off, size_t size, const char *str, size_ } /* Handle UTF-8 sequences from 0x80 to 0xFFFF */ else if (c >= 0x80 && c <= 0xFFFF) { + printf( "c=%x ; c >= 0x80 && c <= 0xFFFF\n", c); hex_bytes = flb_utf8_len(&str[i]); /* Handle invalid or truncated sequence */ @@ -943,6 +945,7 @@ int flb_utils_write_str(char *buf, int *off, size_t size, const char *str, size_ } /* Handle sequences beyond 0xFFFF */ else if (c > 0xFFFF) { + printf( "c=%x ; c > 0xFFFF\n", c); utf_sequence_length = flb_utf8_len(str + i); /* skip truncated UTF-8 ? */ @@ -955,30 +958,40 @@ int flb_utils_write_str(char *buf, int *off, size_t size, const char *str, size_ codepoint = 0; is_valid = FLB_TRUE; + printf("i entered as %d; utf_sequence_length is %d\n", i, utf_sequence_length); + /* Decode the sequence */ for (utf_sequence_number = 0; utf_sequence_number < utf_sequence_length; utf_sequence_number++) { - ret = flb_utf8_decode(&state, &codepoint, (uint8_t) str[i]); + ret = flb_utf8_decode(&state, &codepoint, (uint8_t) str[i]); + printf("Byte: 0x%X, State: %d, Codepoint: 0x%X\n", str[i], state, codepoint); + if (ret == FLB_UTF8_REJECT) { - if (utf_sequence_number == 0 && ((str[i] & 0xC0) != 0xC0)) { + printf("rejected utf_sequence_number=%d\n", utf_sequence_number); + /* Handle invalid leading byte */ + if (utf_sequence_number == 0) { + flb_debug("[pack] unexpected UTF-8 leading byte, substituting character"); tmp[utf_sequence_number] = str[i]; - i++; /* Consume invalid leading byte */ - utf_sequence_length = utf_sequence_number + 1; + utf_sequence_length = utf_sequence_number + 1; /* Process only this invalid byte */ + printf("exception 1 utf_sequence_length is %d (str[%i] = 0x%x) i++\n", utf_sequence_length, i, str[i]); + i++; /* Consume invalid byte */ } - else if (utf_sequence_number > 0 && ((str[i] & 0xC0) != 0x80)) { - utf_sequence_length = utf_sequence_number; + /* Handle invalid continuation byte */ + else { + flb_debug("[pack] unexpected UTF-8 continuation byte, substituting character"); + utf_sequence_length = utf_sequence_number; /* Adjust length */ + printf("exception 2 utf_sequence_length is %d (str[%i] = 0x%x)\n", utf_sequence_length, i, str[i]); } - flb_debug("[pack] invalid UTF-8 sequence detected, replacing with substitution character"); - //codepoint = 0xFFFD; // Replacement character is_valid = FLB_FALSE; break; } tmp[utf_sequence_number] = str[i]; + printf("increment i to %d\n", i+1); ++i; } - - //--i; + printf("valid=%i codepoint = 0x%x\n", is_valid, codepoint); + --i; if (is_valid) { if (available < utf_sequence_length) { diff --git a/tests/internal/pack.c b/tests/internal/pack.c index 1bfabfeb6ac..466d8ce5474 100644 --- a/tests/internal/pack.c +++ b/tests/internal/pack.c @@ -421,7 +421,6 @@ static int utf8_tests_create() snprintf(ext_json + (len - 3), sizeof(ext_json) - len - 3, "%s", ".json"); - /* Validate new paths */ ret = stat(ext_mp, &st); if (ret == -1) { @@ -506,6 +505,7 @@ void test_utf8_to_json() printf("[test] %s\n", test->json); printf(" EXPECTED => '%s'\n", file_json); printf(" ENCODED => '%s'\n", out_buf); + exit(1); } TEST_CHECK(out_size == json_size); diff --git a/tests/internal/utils.c b/tests/internal/utils.c index fb9f8c1f2a1..a9c795a70eb 100644 --- a/tests/internal/utils.c +++ b/tests/internal/utils.c @@ -276,16 +276,43 @@ void test_write_str_invalid_leading_byte() write_str_test_cases(cases); } +#include void test_write_str_special_bytes() { + uint8_t input[] = {0xC3, 0xA1}; + uint32_t state = FLB_UTF8_ACCEPT; // Initialize state + uint32_t codepoint = 0; // Initialize codepoint + uint32_t result; // Store the result here + + for (int i = 0; i < sizeof(input); i++) { + printf("Decoding byte: 0x%02X\n", input[i]); + printf("State before: %u, Codepoint before: %u\n", state, codepoint); + result = flb_utf8_decode(&state, &codepoint, input[i]); + printf("State after: %u, Codepoint after: %u\n", state, codepoint); + } + + // Check the final result and state + if (result == FLB_UTF8_ACCEPT && state == FLB_UTF8_ACCEPT) { + printf("Final decoded codepoint: U+%04X\n", codepoint); + } else { + printf("UTF-8 decoding failed!\n"); + } + + return; + struct write_str_case cases[] = { /* * Escaped leading hex (two hex, one valid unicode) */ + // { + // "你好世界", 12, + // "\\u4f60\\u597d\\u4e16\\u754c", + // FLB_TRUE + // }, { - "你好世界", 12, - "\\u4f60\\u597d\\u4e16\\u754c", + "\xC3\xA1\x0A", 3, /* UTF-8 encoding of á and newline */ + "\\u00E1\\n", /* Expected escaped output */ FLB_TRUE }, { 0 }