Skip to content

Commit

Permalink
Speed up identifier detection
Browse files Browse the repository at this point in the history
Both 'is_ident1' and 'is_ident2' are now macros instead of function
calls. They test the ASCII range inline first and fall back to a
table lookup only for non-ASCII characters.

Also discard an unnecessary variable and add a safeguard:
- Variable is_first can be replaced with the boolean expression "p == start"
- Add a safeguard to the ASCII character check to ensure that the starting
  identifier character is not numeric

Additionally, replace the first ASCII character check with the macro
is_ident2_ascii to keep readability. The later is_ident2 function call
is replaced with is_ident2_non_ascii because expanding the macro
would result in multiple decode_utf8 function calls; it is also
redundant to check again whether the character is ASCII at that point.

Co-authored-by:  Jim Huang <[email protected]>
  • Loading branch information
jserv authored and ChAoSUnItY committed Nov 28, 2024
1 parent 3dcb97f commit 644be33
Show file tree
Hide file tree
Showing 3 changed files with 15 additions and 13 deletions.
7 changes: 5 additions & 2 deletions slimcc.h
Original file line number Diff line number Diff line change
Expand Up @@ -602,8 +602,11 @@ extern bool dont_reuse_stack;

int encode_utf8(char *buf, uint32_t c);
uint32_t decode_utf8(char **new_pos, char *p);
bool is_ident1(uint32_t c);
bool is_ident2(uint32_t c);
// Identifier-character classification macros.
//
// is_ident1(c):       true if c may start an identifier. ASCII values
//                     (< 0x80) are tested inline; anything else is passed
//                     to is_ident1_non_ascii().
// is_ident1_ascii(c): true if c is an ASCII letter, '_' or '$'.
// is_ident2_ascii(c): true if c satisfies is_ident1_ascii() or is an
//                     ASCII digit.
//
// Every use of the parameter is parenthesized so that an argument that
// contains a low-precedence operator (e.g. `cond ? x : y`) is classified
// as a single value.
//
// NOTE: these macros evaluate their argument more than once, so do not
// pass an expression with side effects (e.g. `*p++` or a decode_utf8()
// call); store the value in a variable first.
#define is_ident1(c) (((c) < 0x80) ? is_ident1_ascii(c) : is_ident1_non_ascii(c))
#define is_ident1_ascii(c)                                          \
  (((c) >= 'a' && (c) <= 'z') || ((c) >= 'A' && (c) <= 'Z') ||      \
   (c) == '_' || (c) == '$')
#define is_ident2_ascii(c) (is_ident1_ascii(c) || ((c) >= '0' && (c) <= '9'))
bool is_ident1_non_ascii(uint32_t c);
bool is_ident2_non_ascii(uint32_t c);
int display_width(char *p, int len);

//
Expand Down
12 changes: 6 additions & 6 deletions tokenize.c
Original file line number Diff line number Diff line change
Expand Up @@ -129,15 +129,15 @@ static Token *new_token(TokenKind kind, char *start, char *end) {
static int read_ident(char *p) {
char *start = p;

for (bool is_first = true;; is_first = false) {
if (Isalnum(*p) || *p == '_' || *p == '$') {
for (;;) {
if (p == start ? is_ident1_ascii(*p) : is_ident2_ascii(*p)) {
p++;
continue;
}
if ((unsigned char)*p >= 128) {
if ((uint32_t)*p >= 128) {
char *pos;
uint32_t c = decode_utf8(&pos, p);
if (is_first ? is_ident1(c) : is_ident2(c)) {
if (p == start ? is_ident1_non_ascii(c) : is_ident2_non_ascii(c)) {
p = pos;
continue;
}
Expand Down Expand Up @@ -424,13 +424,13 @@ static Token *new_pp_number(char *start, char *p) {
p += 2;
continue;
}
if (Isalnum(*p) || *p == '_' || *p == '$') {
if (is_ident2_ascii(*p)) {
p++;
continue;
}
if ((unsigned char)*p >= 128) {
char *pos;
if (is_ident2(decode_utf8(&pos, p))) {
if (is_ident2_non_ascii(decode_utf8(&pos, p))) {
p = pos;
continue;
}
Expand Down
9 changes: 4 additions & 5 deletions unicode.c
Original file line number Diff line number Diff line change
Expand Up @@ -90,9 +90,8 @@ static bool in_range(uint32_t c, UTF32Range *range, int len) {
// the first character of an identifier.
//
// Non-ASCII characters correspond to XID_Start set of Unicode 15.1.
bool is_ident1(uint32_t c) {
bool is_ident1_non_ascii(uint32_t c) {
static UTF32Range range[] = {
{'$', '$'}, {'A', 'Z'}, {'_', '_'}, {'a', 'z'},
{0x00AA, 0x00AA}, {0x00B5, 0x00B5}, {0x00BA, 0x00BA}, {0x00C0, 0x00D6},
{0x00D8, 0x00F6}, {0x00F8, 0x02C1}, {0x02C6, 0x02D1}, {0x02E0, 0x02E4},
{0x02EC, 0x02EC}, {0x02EE, 0x02EE}, {0x0370, 0x0374}, {0x0376, 0x0377},
Expand Down Expand Up @@ -269,9 +268,9 @@ bool is_ident1(uint32_t c) {
// character of an identifier.
//
// Non-ASCII characters correspond to XID_Continue set of Unicode 15.1.
bool is_ident2(uint32_t c) {
bool is_ident2_non_ascii(uint32_t c) {
static UTF32Range range[] = {
{'0', '9'}, {0x00B7, 0x00B7}, {0x0300, 0x036F}, {0x0387, 0x0387},
{0x00B7, 0x00B7}, {0x0300, 0x036F}, {0x0387, 0x0387},
{0x0483, 0x0487}, {0x0591, 0x05BD}, {0x05BF, 0x05BF}, {0x05C1, 0x05C2},
{0x05C4, 0x05C5}, {0x05C7, 0x05C7}, {0x0610, 0x061A}, {0x064B, 0x0669},
{0x0670, 0x0670}, {0x06D6, 0x06DC}, {0x06DF, 0x06E4}, {0x06E7, 0x06E8},
Expand Down Expand Up @@ -368,7 +367,7 @@ bool is_ident2(uint32_t c) {
{0xE0100, 0xE01EF}
};

return is_ident1(c) || in_range(c, range, sizeof(range) / sizeof(UTF32Range));
return is_ident1(c) || in_range(c, range, sizeof(range) / sizeof(UTF32Range));
}

// Returns the number of columns needed to display a given
Expand Down

0 comments on commit 644be33

Please sign in to comment.