Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

utf8: enhancements for handling of multibyte sequences #9687

Merged
merged 11 commits into from
Dec 13, 2024
82 changes: 7 additions & 75 deletions include/fluent-bit/flb_utf8.h
Original file line number Diff line number Diff line change
Expand Up @@ -20,84 +20,16 @@
#ifndef FLB_UTF8_H
#define FLB_UTF8_H

#define FLB_UTF8_ACCEPT 0
#define FLB_UTF8_REJECT 1
#define FLB_UTF8_CONTINUE 2

#include <fluent-bit/flb_info.h>
#include <inttypes.h>

/* is this byte the start of a UTF-8 sequence (i.e. not a continuation byte)? */
#define flb_utf8_check(c) (((c) & 0xC0) != 0x80)

/*
 * Number of bytes expected after a given first byte, indexed by the value
 * of that first byte: 0x00..0xBF -> 0, 0xC0..0xDF -> 1, 0xE0..0xEF -> 2,
 * 0xF0..0xF7 -> 3, 0xF8..0xFB -> 4 and 0xFC..0xFF -> 5.
 */
static const char trailingBytesForUTF8[256] = {
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5
};

/*
 * Return the length in bytes of the sequence beginning at 's', judged
 * solely from its first byte (1 for ASCII and for continuation bytes, up
 * to 6 for 0xFC..0xFF). The bytes that follow are neither read nor
 * validated.
 */
static inline int flb_utf8_len(const char *s)
{
    const unsigned char lead = (unsigned char) *s;
    const int trailing = trailingBytesForUTF8[lead];

    return trailing + 1;
}

/*
* UTF-8 Decoding routines are originally written by Bjoern Hoehrmann
* <bjoern@hoehrmann.de> and taken from the following web site:
*
* http://bjoern.hoehrmann.de/utf-8/decoder/dfa/
*
* They have been slightly renamed to follow Fluent Bit naming requirements.
*/

#define FLB_UTF8_ACCEPT 0
#define FLB_UTF8_REJECT 1

/*
 * DFA tables: the first 256 entries map an input byte to its character
 * class; the entries from offset 256 on map (state * 16 + class) to the
 * next state.
 */
static const uint8_t utf8d[] = {
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 00..1f */
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 20..3f */
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 40..5f */
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 60..7f */
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, /* 80..9f */
    7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, /* a0..bf */
    8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, /* c0..df */
    0xa,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x4,0x3,0x3, /* e0..ef */
    0xb,0x6,0x6,0x6,0x5,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8, /* f0..ff */
    0x0,0x1,0x2,0x3,0x5,0x8,0x7,0x1,0x1,0x1,0x4,0x6,0x1,0x1,0x1,0x1, /* s0..s0 */
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,0,1,0,1,1,1,1,1,1, /* s1..s2 */
    1,2,1,1,1,1,1,2,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1, /* s3..s4 */
    1,2,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,3,1,3,1,1,1,1,1,1, /* s5..s6 */
    1,3,1,1,1,1,1,3,1,3,1,1,1,1,1,1,1,3,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* s7..s8 */
};

/*
 * Feed one byte to the table-driven decoder.
 *
 *  state: in/out DFA state; start with FLB_UTF8_ACCEPT.
 *  codep: in/out code point accumulator.
 *  byte:  next input byte, used directly as an index into utf8d[].
 *
 * Returns the new state, which is also stored in *state:
 * FLB_UTF8_ACCEPT once a code point is complete in *codep,
 * FLB_UTF8_REJECT on malformed input, any other value while a multibyte
 * sequence is still in progress.
 */
static inline uint32_t flb_utf8_decode(uint32_t *state, uint32_t *codep,
                                       uint32_t byte)
{
    const uint32_t char_class = utf8d[byte];

    if (*state == FLB_UTF8_ACCEPT) {
        /* first byte: keep only the payload bits for this class */
        *codep = (0xff >> char_class) & (byte);
    }
    else {
        /* continuation byte: append its low six bits */
        *codep = (byte & 0x3fu) | (*codep << 6);
    }

    *state = utf8d[256 + *state*16 + char_class];

    return *state;
}


/*
 * Debug helper: run the NUL-terminated byte string 's' through
 * flb_utf8_decode() and print every completed code point as "\uXXXX", one
 * per line. If the decoder is not back in the accept state once the
 * terminator is reached, a "not well-formed" notice is printed.
 */
static inline void flb_utf8_print(const uint8_t *s)
{
    uint32_t cp;
    uint32_t st = 0;
    const uint8_t *cursor = s;

    while (*cursor) {
        if (flb_utf8_decode(&st, &cp, *cursor) == 0) {
            printf("\\u%04x\n", cp);
        }
        ++cursor;
    }

    if (st != FLB_UTF8_ACCEPT) {
        printf("The string is not well-formed\n");
    }
}
/*
 * Return the length in bytes (1..6) of the sequence starting at 's', as
 * indicated by its first byte only; the remaining bytes are not validated.
 */
int flb_utf8_len(const char *s);
/*
 * Incremental decoder: consume one 'byte', updating '*state' and '*codep'.
 * Returns FLB_UTF8_ACCEPT when '*codep' holds a complete code point,
 * FLB_UTF8_CONTINUE when more bytes are needed, or FLB_UTF8_REJECT when the
 * input is invalid. Start with '*state' set to FLB_UTF8_ACCEPT.
 */
uint32_t flb_utf8_decode(uint32_t *state, uint32_t *codep, uint8_t byte);
/*
 * Debug helper: decode the NUL-terminated string 'input' and print the
 * result for each code point to stdout.
 */
void flb_utf8_print(char *input);

#endif
1 change: 1 addition & 0 deletions src/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ set(src
flb_env.c
flb_file.c
flb_uri.c
flb_utf8.c
flb_hash_table.c
flb_help.c
flb_pack.c
Expand Down
30 changes: 29 additions & 1 deletion src/flb_sds.c
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
#include <fluent-bit/flb_log.h>
#include <fluent-bit/flb_sds.h>
#include <fluent-bit/flb_utf8.h>
#include <fluent-bit/flb_utils.h>

#include <stdarg.h>
#include <ctype.h>
Expand Down Expand Up @@ -279,13 +280,15 @@ flb_sds_t flb_sds_copy(flb_sds_t s, const char *str, int len)
return s;
}

flb_sds_t flb_sds_cat_utf8 (flb_sds_t *sds, const char *str, int str_len)
flb_sds_t flb_sds_cat_utf8(flb_sds_t *sds, const char *str, int str_len)
{
static const char int2hex[] = "0123456789abcdef";
int i;
int b;
int ret;
int hex_bytes;
int offset;
size_t size;
uint32_t cp;
uint32_t state = 0;
unsigned char c;
Expand All @@ -297,6 +300,7 @@ flb_sds_t flb_sds_cat_utf8 (flb_sds_t *sds, const char *str, int str_len)
s = *sds;
head = FLB_SDS_HEADER(s);

/* make sure we have at least str_len extra bytes available */
if (flb_sds_avail(s) <= str_len) {
tmp = flb_sds_increase(s, str_len);
if (tmp == NULL) {
Expand All @@ -306,6 +310,30 @@ flb_sds_t flb_sds_cat_utf8 (flb_sds_t *sds, const char *str, int str_len)
head = FLB_SDS_HEADER(s);
}

while (1) {
offset = head->len;
ret = flb_utils_write_str(s, &offset, flb_sds_alloc(s), str, str_len);
if (ret == FLB_FALSE) {
/* realloc */
size = flb_sds_alloc(s) * 2;
tmp = flb_sds_increase(s, size);
if (tmp == NULL) {
return NULL;
}
*sds = s = tmp;
head = FLB_SDS_HEADER(s);
}
else {
break;
}
}

flb_sds_len_set(s, offset);
s[head->len] = '\0';
return s;



for (i = 0; i < str_len; i++) {
cosmo0920 marked this conversation as resolved.
Show resolved Hide resolved
if (flb_sds_avail(s) < 8) {
tmp = flb_sds_increase(s, 8);
Expand Down
127 changes: 127 additions & 0 deletions src/flb_utf8.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,127 @@
/* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */

/* Fluent Bit
* ==========
* Copyright (C) 2015-2024 The Fluent Bit Authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include <fluent-bit/flb_info.h>
#include <fluent-bit/flb_utf8.h>

#include <stdio.h>
#include <string.h>
#include <inttypes.h>

/*
 * Lookup table indexed by the first byte of a sequence; each entry is the
 * number of bytes expected to follow that byte (0 for 0x00..0xBF, then
 * 1, 2, 3, 4 and 5 for the 0xC0, 0xE0, 0xF0, 0xF8 and 0xFC ranges).
 */
static const char trailing_bytes_for_utf8[256] = {
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5
};

/*
 * Length in bytes of the sequence that begins at 's'. Only s[0] is
 * examined: the result is one plus the table entry for that byte, so it
 * ranges from 1 to 6 and says nothing about whether the following bytes
 * are valid.
 */
int flb_utf8_len(const char *s)
{
    const unsigned char first = *(const unsigned char *) s;

    return 1 + trailing_bytes_for_utf8[first];
}

/*
 * Incremental UTF-8 decoder: consume one byte and update the decoder.
 *
 *  state: in/out. 0 (FLB_UTF8_ACCEPT) means the decoder sits at a character
 *         boundary; while a multibyte sequence is in progress it holds the
 *         number of continuation bytes still expected. It is set to
 *         FLB_UTF8_REJECT when the input is invalid.
 *  codep: in/out accumulator; holds the decoded code point when the
 *         function returns FLB_UTF8_ACCEPT.
 *  byte:  next input byte.
 *
 * Returns FLB_UTF8_ACCEPT when a code point has been completed,
 * FLB_UTF8_CONTINUE when more bytes are required, or FLB_UTF8_REJECT when
 * the input is not well-formed: invalid first byte, missing continuation
 * byte, overlong encoding, UTF-16 surrogate (U+D800..U+DFFF) or a value
 * above U+10FFFF.
 *
 * NOTE: after a rejection the caller must reset *state to FLB_UTF8_ACCEPT
 * before decoding further input; a non-zero *state is always interpreted
 * as a count of pending continuation bytes.
 */
uint32_t flb_utf8_decode(uint32_t *state, uint32_t *codep, uint8_t byte)
{
    /* Start of a new character */
    if (*state == 0) {
        if (byte <= 0x7F) {
            /* ASCII */
            *codep = byte;
            return FLB_UTF8_ACCEPT;
        }
        else if (byte >= 0xC2 && byte <= 0xDF) {
            /*
             * start of a 2-byte sequence. 0xC0 and 0xC1 are left out on
             * purpose: they could only encode U+0000..U+007F (overlong),
             * so they end up in the invalid first byte branch below.
             */
            *codep = byte & 0x1F;
            *state = 1;
        }
        else if ((byte & 0xF0) == 0xE0) {
            /* start of a 3-byte sequence */
            *codep = byte & 0x0F;
            *state = 2;
        }
        else if ((byte & 0xF8) == 0xF0) {
            /* start of a 4-byte sequence */
            *codep = byte & 0x07;
            *state = 3;
        }
        else {
            /* invalid first byte */
            *state = FLB_UTF8_REJECT;
            return FLB_UTF8_REJECT;
        }
    }
    else {
        if ((byte & 0xC0) != 0x80) {
            /* invalid continuation byte */
            *state = FLB_UTF8_REJECT;
            return FLB_UTF8_REJECT;
        }

        /*
         * Overlong check on the first continuation byte. The accumulator
         * is still zero here only right after a 0xE0 first byte (two bytes
         * pending) or a 0xF0 first byte (three bytes pending); those need a
         * second byte of at least 0xA0 and 0x90 respectively, otherwise the
         * value would fit in a shorter encoding.
         */
        if (*codep == 0 &&
            ((*state == 2 && byte < 0xA0) ||
             (*state == 3 && byte < 0x90))) {
            *state = FLB_UTF8_REJECT;
            return FLB_UTF8_REJECT;
        }

        *codep = (*codep << 6) | (byte & 0x3F);

        /* reduce the expected continuation bytes */
        (*state)--;
    }

    if (*state == 0) {
        /* sequence complete */
        if (*codep >= 0xD800 && *codep <= 0xDFFF) {
            /* surrogate code points are not valid in UTF-8 */
            *state = FLB_UTF8_REJECT;
            return FLB_UTF8_REJECT;
        }
        else if (*codep > 0x10FFFF) {
            /* codepoint is out of range */
            *state = FLB_UTF8_REJECT;
            return FLB_UTF8_REJECT;
        }
        return FLB_UTF8_ACCEPT;
    }

    /* we are still processing the current sequence */
    return FLB_UTF8_CONTINUE;
}

/*
 * Debug helper: decode the NUL-terminated string 'input' with
 * flb_utf8_decode() and print each completed code point to stdout as
 * "Valid Codepoint: U+XXXX". Decoding stops at the first rejected byte.
 * "Invalid UTF-8 sequence detected." is printed once when a byte is
 * rejected or when the string ends in the middle of a multibyte sequence.
 * A NULL 'input' is ignored.
 */
void flb_utf8_print(char *input)
{
    size_t i;
    size_t len;
    uint32_t ret = FLB_UTF8_ACCEPT;
    uint32_t state = FLB_UTF8_ACCEPT;
    uint32_t codepoint = 0;

    if (input == NULL) {
        return;
    }

    len = strlen(input);
    for (i = 0; i < len; i++) {
        ret = flb_utf8_decode(&state, &codepoint, (uint8_t) input[i]);
        if (ret == FLB_UTF8_ACCEPT) {
            printf("Valid Codepoint: U+%04" PRIX32 "\n", codepoint);
        }
        else if (ret == FLB_UTF8_REJECT) {
            break;
        }
    }

    /*
     * ret is FLB_UTF8_REJECT after a bad byte and FLB_UTF8_CONTINUE when
     * the input ran out before the current sequence was complete.
     */
    if (ret != FLB_UTF8_ACCEPT) {
        printf("Invalid UTF-8 sequence detected.\n");
    }
}
Loading
Loading