Skip to content

Commit

Permalink
WIP
Browse files Browse the repository at this point in the history
Signed-off-by: Eduardo Silva <[email protected]>
  • Loading branch information
edsiper committed Dec 5, 2024
1 parent d06b3fd commit 0cb069d
Show file tree
Hide file tree
Showing 5 changed files with 183 additions and 41 deletions.
31 changes: 30 additions & 1 deletion src/flb_sds.c
Original file line number Diff line number Diff line change
Expand Up @@ -278,14 +278,17 @@ flb_sds_t flb_sds_copy(flb_sds_t s, const char *str, int len)

return s;
}
int flb_utils_write_str(char *buf, int *off, size_t size, const char *str, size_t str_len);

flb_sds_t flb_sds_cat_utf8 (flb_sds_t *sds, const char *str, int str_len)
flb_sds_t flb_sds_cat_utf8(flb_sds_t *sds, const char *str, int str_len)
{
static const char int2hex[] = "0123456789abcdef";
int i;
int b;
int ret;
int hex_bytes;
int offset;
size_t size;
uint32_t cp;
uint32_t state = 0;
unsigned char c;
Expand All @@ -297,6 +300,7 @@ flb_sds_t flb_sds_cat_utf8 (flb_sds_t *sds, const char *str, int str_len)
s = *sds;
head = FLB_SDS_HEADER(s);

/* make sure we have at least str_len extra bytes available */
if (flb_sds_avail(s) <= str_len) {
tmp = flb_sds_increase(s, str_len);
if (tmp == NULL) {
Expand All @@ -306,6 +310,31 @@ flb_sds_t flb_sds_cat_utf8 (flb_sds_t *sds, const char *str, int str_len)
head = FLB_SDS_HEADER(s);
}

while (1) {
offset = head->len;
ret = flb_utils_write_str(s, &offset, flb_sds_alloc(s), str, str_len);
if (ret == FLB_FALSE) {
/* realloc */
size = flb_sds_alloc(s) * 2;
tmp = flb_sds_increase(s, size);
if (tmp == NULL) {
return NULL;
}
*sds = s = tmp;
head = FLB_SDS_HEADER(s);
}
else {
break;
}
}

printf("ne woffset: %i\n", offset);
flb_sds_len_set(s, offset);
s[head->len] = '\0';
return s;



for (i = 0; i < str_len; i++) {
if (flb_sds_avail(s) < 8) {
tmp = flb_sds_increase(s, 8);
Expand Down
125 changes: 99 additions & 26 deletions src/flb_utf8.c
Original file line number Diff line number Diff line change
Expand Up @@ -55,61 +55,134 @@ int flb_utf8_len(const char *s)
* Invalid: reject state
*/
static const uint8_t utf8_lookup[256][3] = {
[0x00 ... 0x7F] = {0, 0x7F, 0}, /* ASCII */
[0xC0 ... 0xDF] = {1, 0x1F, 1}, /* Start of 2-byte sequence */
[0xE0 ... 0xEF] = {2, 0x0F, 2}, /* Start of 3-byte sequence */
[0xF0 ... 0xF7] = {3, 0x07, 3}, /* Start of 4-byte sequence */
[0x80 ... 0xBF] = {FLB_UTF8_REJECT, 0, 0}, /* Continuation bytes */
[0xF8 ... 0xFF] = {FLB_UTF8_REJECT, 0, 0}, /* Invalid bytes */
[0x00 ... 0x7F] = {FLB_UTF8_ACCEPT, 0x7F, 0}, /* ASCII */
[0xC2 ... 0xDF] = {1, 0x1F, 1}, /* Start of 2-byte sequence */
[0xE0 ... 0xEF] = {2, 0x0F, 2}, /* Start of 3-byte sequence */
[0xF0 ... 0xF7] = {3, 0x07, 3}, /* Start of 4-byte sequence */
[0x80 ... 0xBF] = {FLB_UTF8_CONTINUE, 0x3F, 0}, /* Continuation bytes */
[0xC0 ... 0xC1] = {FLB_UTF8_REJECT, 0, 0}, /* Invalid 2-byte sequences */
[0xF8 ... 0xFF] = {FLB_UTF8_REJECT, 0, 0}, /* Invalid bytes */
};

uint32_t flb_utf8_decode(uint32_t *state, uint32_t *codep, uint8_t byte)
{
// static const uint8_t utf8_lookup[256][3] = {
// [0x00 ... 0x7F] = {0, 0x7F, 0}, /* ASCII */
// [0xC0 ... 0xDF] = {1, 0x1F, 1}, /* Start of 2-byte sequence */
// [0xE0 ... 0xEF] = {2, 0x0F, 2}, /* Start of 3-byte sequence */
// [0xF0 ... 0xF7] = {3, 0x07, 3}, /* Start of 4-byte sequence */
// //[0x80 ... 0xBF] = {FLB_UTF8_REJECT, 0, 0}, /* Continuation bytes */
// [0x80 ... 0xBF] = {FLB_UTF8_CONTINUE, 0x3F, 0}, /* Continuation bytes */
// //[0x80 ... 0xBF] = {0, 0x3F, 0}, /* Valid continuation bytes */
// [0xF8 ... 0xFF] = {FLB_UTF8_REJECT, 0, 0}, /* Invalid bytes */
// };




uint32_t flb_utf8_decode(uint32_t *state, uint32_t *codep, uint8_t byte) {
static const uint8_t utf8_lookup[256][3] = {
[0x00 ... 0x7F] = {0, 0x7F, 0}, /* ASCII */
[0xC0 ... 0xDF] = {1, 0x1F, 1}, /* Start of 2-byte sequence */
[0xE0 ... 0xEF] = {2, 0x0F, 2}, /* Start of 3-byte sequence */
[0xF0 ... 0xF7] = {3, 0x07, 3}, /* Start of 4-byte sequence */
[0x80 ... 0xBF] = {FLB_UTF8_CONTINUE, 0x3F, 0}, /* Continuation bytes */
[0xF8 ... 0xFF] = {FLB_UTF8_REJECT, 0, 0}, /* Invalid bytes */
};

const uint8_t *entry = utf8_lookup[byte];

if (*state == FLB_UTF8_ACCEPT) {
/* starting a new character */
/* Starting a new sequence */
*state = entry[0];
if (*state == FLB_UTF8_REJECT) {
/* invalid start byte */
/* Invalid start byte */
return FLB_UTF8_REJECT;
}
*codep = byte & entry[1];
}
else {
/* continuation byte */
*codep = byte & entry[1]; // Apply the mask to the first byte
if (*state != FLB_UTF8_CONTINUE) {
(*state)--; // Decrement the state
}
} else {
/* Continuation byte expected */
if ((byte & 0xC0) == 0x80) {
/* Valid continuation byte */

// Explicitly shift and OR the continuation byte
*codep = (*codep << 6) | (byte & 0x3F);
/* decrement continuation bytes */
(*state)--;
}
else {
/* invalid continuation byte */

(*state)--; /* Decrement the continuation state */
} else {
/* Invalid continuation byte */
*state = FLB_UTF8_REJECT;
return FLB_UTF8_REJECT;
}
}

/* check if the sequence is complete */
/* Sequence is complete */
if (*state == 0) {
if (*codep >= 0xD800 && *codep <= 0xDFFF) {
/* surrogate pair (invalid UTF-8) */
/* Invalid surrogate pair */
*state = FLB_UTF8_REJECT;
return FLB_UTF8_REJECT;
}
else if (*codep > 0x10FFFF) {
/* out of range codepoint */
} else if (*codep > 0x10FFFF) {
/* Out of range codepoint */
*state = FLB_UTF8_REJECT;
return FLB_UTF8_REJECT;
}
/* valid and complete sequence */
return FLB_UTF8_ACCEPT;
}

/* we are still processing the current sequence */
/* Sequence is still incomplete */
return FLB_UTF8_CONTINUE;
}


// uint32_t flb_utf8_decode(uint32_t *state, uint32_t *codep, uint8_t byte)
// {
// const uint8_t *entry = utf8_lookup[byte];

// if (*state == FLB_UTF8_ACCEPT) {
// /* starting a new character */
// *state = entry[0];
// if (*state == FLB_UTF8_REJECT) {
// /* invalid start byte */
// return FLB_UTF8_REJECT;
// }
// *codep = byte & entry[1];
// }
// else {
// /* continuation byte */
// if ((byte & 0xC0) == 0x80) {
// *codep = (*codep << 6) | (byte & 0x3F);
// /* decrement continuation bytes */
// (*state)--;
// }
// else {
// /* invalid continuation byte */
// *state = FLB_UTF8_REJECT;
// return FLB_UTF8_REJECT;
// }
// }

// /* check if the sequence is complete */
// if (*state == 0) {
// if (*codep >= 0xD800 && *codep <= 0xDFFF) {
// /* surrogate pair (invalid UTF-8) */
// *state = FLB_UTF8_REJECT;
// return FLB_UTF8_REJECT;
// }
// else if (*codep > 0x10FFFF) {
// /* out of range codepoint */
// *state = FLB_UTF8_REJECT;
// return FLB_UTF8_REJECT;
// }
// /* valid and complete sequence */
// return FLB_UTF8_ACCEPT;
// }

// /* we are still processing the current sequence */
// return FLB_UTF8_CONTINUE;
// }

#else

/* fallback decoder: no lookup table */
Expand Down
35 changes: 24 additions & 11 deletions src/flb_utils.c
Original file line number Diff line number Diff line change
Expand Up @@ -786,7 +786,6 @@ static const struct escape_seq json_escape_table[128] = {
* to escape special characters and convert utf-8 byte characters to string
* representation.
*/

int flb_utils_write_str(char *buf, int *off, size_t size, const char *str, size_t str_len)
{
int i, b, ret, len, hex_bytes, utf_sequence_length, utf_sequence_number;
Expand Down Expand Up @@ -856,6 +855,7 @@ int flb_utils_write_str(char *buf, int *off, size_t size, const char *str, size_

/* Use lookup table for escaping known sequences */
if (c < 128 && json_escape_table[c].seq) {
printf("Debug: c = 0x%02X, seq = %s\n", c, json_escape_table[c].seq);
/*
* All characters in the table have a length of 2 or 6 bytes,
* just check if the second byte starts with 'u' so we know
Expand All @@ -868,6 +868,7 @@ int flb_utils_write_str(char *buf, int *off, size_t size, const char *str, size_
len = 2;
}

printf("c=%x ; c < 128 | len = %d\n", c, len);
/* check if we have enough space */
if (available < len) {
return FLB_FALSE;
Expand All @@ -881,6 +882,7 @@ int flb_utils_write_str(char *buf, int *off, size_t size, const char *str, size_
}
/* Handle UTF-8 sequences from 0x80 to 0xFFFF */
else if (c >= 0x80 && c <= 0xFFFF) {
printf( "c=%x ; c >= 0x80 && c <= 0xFFFF\n", c);
hex_bytes = flb_utf8_len(&str[i]);

/* Handle invalid or truncated sequence */
Expand Down Expand Up @@ -943,6 +945,7 @@ int flb_utils_write_str(char *buf, int *off, size_t size, const char *str, size_
}
/* Handle sequences beyond 0xFFFF */
else if (c > 0xFFFF) {
printf( "c=%x ; c > 0xFFFF\n", c);
utf_sequence_length = flb_utf8_len(str + i);

/* skip truncated UTF-8 ? */
Expand All @@ -955,30 +958,40 @@ int flb_utils_write_str(char *buf, int *off, size_t size, const char *str, size_
codepoint = 0;
is_valid = FLB_TRUE;

printf("i entered as %d; utf_sequence_length is %d\n", i, utf_sequence_length);

/* Decode the sequence */
for (utf_sequence_number = 0; utf_sequence_number < utf_sequence_length; utf_sequence_number++) {
ret = flb_utf8_decode(&state, &codepoint, (uint8_t) str[i]);
ret = flb_utf8_decode(&state, &codepoint, (uint8_t) str[i]);
printf("Byte: 0x%X, State: %d, Codepoint: 0x%X\n", str[i], state, codepoint);


if (ret == FLB_UTF8_REJECT) {
if (utf_sequence_number == 0 && ((str[i] & 0xC0) != 0xC0)) {
printf("rejected utf_sequence_number=%d\n", utf_sequence_number);
/* Handle invalid leading byte */
if (utf_sequence_number == 0) {
flb_debug("[pack] unexpected UTF-8 leading byte, substituting character");
tmp[utf_sequence_number] = str[i];
i++; /* Consume invalid leading byte */
utf_sequence_length = utf_sequence_number + 1;
utf_sequence_length = utf_sequence_number + 1; /* Process only this invalid byte */
printf("exception 1 utf_sequence_length is %d (str[%i] = 0x%x) i++\n", utf_sequence_length, i, str[i]);
i++; /* Consume invalid byte */
}
else if (utf_sequence_number > 0 && ((str[i] & 0xC0) != 0x80)) {
utf_sequence_length = utf_sequence_number;
/* Handle invalid continuation byte */
else {
flb_debug("[pack] unexpected UTF-8 continuation byte, substituting character");
utf_sequence_length = utf_sequence_number; /* Adjust length */
printf("exception 2 utf_sequence_length is %d (str[%i] = 0x%x)\n", utf_sequence_length, i, str[i]);
}
flb_debug("[pack] invalid UTF-8 sequence detected, replacing with substitution character");
//codepoint = 0xFFFD; // Replacement character
is_valid = FLB_FALSE;
break;
}

tmp[utf_sequence_number] = str[i];
printf("increment i to %d\n", i+1);
++i;
}

//--i;
printf("valid=%i codepoint = 0x%x\n", is_valid, codepoint);
--i;

if (is_valid) {
if (available < utf_sequence_length) {
Expand Down
2 changes: 1 addition & 1 deletion tests/internal/pack.c
Original file line number Diff line number Diff line change
Expand Up @@ -421,7 +421,6 @@ static int utf8_tests_create()
snprintf(ext_json + (len - 3), sizeof(ext_json) - len - 3,
"%s", ".json");


/* Validate new paths */
ret = stat(ext_mp, &st);
if (ret == -1) {
Expand Down Expand Up @@ -506,6 +505,7 @@ void test_utf8_to_json()
printf("[test] %s\n", test->json);
printf(" EXPECTED => '%s'\n", file_json);
printf(" ENCODED => '%s'\n", out_buf);
exit(1);
}

TEST_CHECK(out_size == json_size);
Expand Down
31 changes: 29 additions & 2 deletions tests/internal/utils.c
Original file line number Diff line number Diff line change
Expand Up @@ -276,16 +276,43 @@ void test_write_str_invalid_leading_byte()
write_str_test_cases(cases);
}

#include <fluent-bit/flb_utf8.h>
void test_write_str_special_bytes()
{

uint8_t input[] = {0xC3, 0xA1};
uint32_t state = FLB_UTF8_ACCEPT; // Initialize state
uint32_t codepoint = 0; // Initialize codepoint
uint32_t result; // Store the result here

for (int i = 0; i < sizeof(input); i++) {
printf("Decoding byte: 0x%02X\n", input[i]);
printf("State before: %u, Codepoint before: %u\n", state, codepoint);
result = flb_utf8_decode(&state, &codepoint, input[i]);
printf("State after: %u, Codepoint after: %u\n", state, codepoint);
}

// Check the final result and state
if (result == FLB_UTF8_ACCEPT && state == FLB_UTF8_ACCEPT) {
printf("Final decoded codepoint: U+%04X\n", codepoint);
} else {
printf("UTF-8 decoding failed!\n");
}

return;

struct write_str_case cases[] = {
/*
* Escaped leading hex (two hex, one valid unicode)
*/
// {
// "你好世界", 12,
// "\\u4f60\\u597d\\u4e16\\u754c",
// FLB_TRUE
// },
{
"你好世界", 12,
"\\u4f60\\u597d\\u4e16\\u754c",
"\xC3\xA1\x0A", 3, /* UTF-8 encoding of á and newline */
"\\u00E1\\n", /* Expected escaped output */
FLB_TRUE
},
{ 0 }
Expand Down

0 comments on commit 0cb069d

Please sign in to comment.