From 5d2f3418a1f67f8fb8392d00a8e13ddc2af9e1dc Mon Sep 17 00:00:00 2001 From: Jim Huang Date: Sat, 16 Nov 2024 16:13:08 +0800 Subject: [PATCH 01/18] Speed up recognize punctuation The original code in function 'read_punct' relies on heavy string specific function calls, resulting in slower execution. Instead, this function can be faster using straightforward control flow. --- tokenize.c | 68 ++++++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 56 insertions(+), 12 deletions(-) diff --git a/tokenize.c b/tokenize.c index 6ecbd92..03e7095 100644 --- a/tokenize.c +++ b/tokenize.c @@ -151,18 +151,62 @@ static int from_hex(char c) { } // Read a punctuator token from p and returns its length. -static int read_punct(char *p) { - static char *kw[] = { - "<<=", ">>=", "...", "==", "!=", "<=", ">=", "->", "+=", - "-=", "*=", "/=", "++", "--", "%=", "&=", "|=", "^=", "&&", - "||", "<<", ">>", "##", - }; - - for (int i = 0; i < sizeof(kw) / sizeof(*kw); i++) - if (startswith(p, kw[i])) - return strlen(kw[i]); - - return ispunct(*p) ? 1 : 0; +static int read_punct(const char *p) { + char c1; + switch (*p) { + case '<': // Pattern: < <= << <<= + c1 = *(p + 1); + if (c1 == '=') return 2; + if (c1 == '<') + return *(p + 2) == '=' ? 3 : 2; + return 1; + case '>': // Pattern: > >= >> >>= + c1 = *(p + 1); + if (c1 == '=') return 2; + if (c1 == '>') + return *(p + 2) == '=' ? 3 : 2; + return 1; + case '+': // Pattern: + ++ += + c1 = *(p + 1); + return (c1 == '+' || c1 == '=') ? 2 : 1; + case '-': // Pattern: - -- -= -> + c1 = *(p + 1); + return (c1 == '-' || c1 == '=' || c1 == '>') ? 2 : 1; + case '&': // Pattern: & &= && + c1 = *(p + 1); + return (c1 == '=' || c1 == '&') ? 2 : 1; + case '|': // Pattern: | |= || + c1 = *(p + 1); + return (c1 == '=' || c1 == '|') ? 2 : 1; + case '.': // Pattern: . ... + return (*(p + 1) == '.' && *(p + 2) == '.') ? 3 : 1; + case '=': // Pattern: = == + case '!': // Pattern: ! 
!= + case '*': // Pattern: * *= + case '/': // Pattern: / /= + case '%': // Pattern: % %= + case '^': // Pattern: ^ ^= + return *(p + 1) == '=' ? 2 : 1; + case '#': // Pattern: # ## + return *(p + 1) == '#' ? 2 : 1; + case '$': + case '(': + case ')': + case ',': + case ':': + case ';': + case '?': + case '@': + case '[': + case ']': + case '_': + case '`': + case '{': + case '}': + case '~': + return 1; + } + return 0; } TokenKind ident_keyword(Token *tok) { From c7ae74f19f173017e8c95201b827904c8f580baf Mon Sep 17 00:00:00 2001 From: fuhsnn <66062782+fuhsnn@users.noreply.github.com> Date: Sun, 17 Nov 2024 15:13:28 +0800 Subject: [PATCH 02/18] Rework path handling of `#include <>` --- preprocess.c | 33 ++++++++++++++++++--------------- 1 file changed, 18 insertions(+), 15 deletions(-) diff --git a/preprocess.c b/preprocess.c index 85be39a..57bc5d9 100644 --- a/preprocess.c +++ b/preprocess.c @@ -888,28 +888,25 @@ static char *read_filename(Token **rest, Token *tok, bool *is_dquote) { // Pattern 3: #include FOO // In this case FOO must be macro-expanded to either // a single string token or a sequence of "<" ... ">". - if (tok->kind == TK_IDENT) + bool is_expanded = false; + if (tok->kind == TK_IDENT) { tok = preprocess2(tok); + is_expanded = true; + } - // Pattern 1: #include "foo.h" + char *filename = NULL; if (tok->kind == TK_STR) { + // Pattern 1: #include "foo.h" // A double-quoted filename for #include is a special kind of // token, and we don't want to interpret any escape sequences in it. // For example, "\f" in "C:\foo" is not a formfeed character but // just two non-control characters, backslash and f. // So we don't want to use token->str. + filename = strndup(tok->loc + 1, tok->len - 2); *is_dquote = true; - if (rest) - *rest = tok->next; - else - skip_line(tok->next); - return strndup(tok->loc + 1, tok->len - 2); - } - - // Pattern 2: #include - if (equal(tok, "<")) { - // Reconstruct a filename from a sequence of tokens between - // "<" and ">". 
+ } else if (equal(tok, "<")) { + // Pattern 2: #include + // Reconstruct a filename from between "<" and ">". Token *start = tok; // Find closing ">". @@ -917,14 +914,20 @@ static char *read_filename(Token **rest, Token *tok, bool *is_dquote) { if (tok->kind == TK_EOF) error_tok(tok, "expected '>'"); + if (!is_expanded && start->file == tok->file && start->loc < tok->loc) + filename = strndup(start->loc + 1, tok->loc - start->loc - 1); + else + filename = join_tokens(start->next, tok); *is_dquote = false; + } + + if (filename && *filename != '\0') { if (rest) *rest = tok->next; else skip_line(tok->next); - return join_tokens(start->next, tok); + return filename; } - error_tok(tok, "expected a filename"); } From aa154968dee4ba393ac9f5fa0a6fa44c4c2c1ddf Mon Sep 17 00:00:00 2001 From: fuhsnn <66062782+fuhsnn@users.noreply.github.com> Date: Tue, 19 Nov 2024 22:00:15 +0800 Subject: [PATCH 03/18] Disable universal char names by default See #92 --- GNUmakefile | 10 ++++++---- main.c | 6 ++++++ slimcc.h | 1 + tokenize.c | 4 +++- 4 files changed, 16 insertions(+), 5 deletions(-) diff --git a/GNUmakefile b/GNUmakefile index 580763c..07a6bb8 100644 --- a/GNUmakefile +++ b/GNUmakefile @@ -1,5 +1,7 @@ CFLAGS=-std=c99 -g -fno-common -Wall -pedantic -Wno-switch +TEST_FLAGS=-Iinclude -Itest -fenable-universal-char + SRCS=$(wildcard *.c) OBJS=$(SRCS:.c=.o) @@ -17,11 +19,11 @@ slimcc: $(OBJS) $(OBJS): slimcc.h test/%.exe: slimcc test/%.c - ./slimcc -Iinclude -Itest -c -o test/$*.o test/$*.c + ./slimcc $(TEST_FLAGS) -c -o test/$*.o test/$*.c $(CC) -std=c11 -pthread -Wno-psabi -no-pie -o $@ test/$*.o -xc test/common test/c23/%.exe: slimcc test/c23/%.c - ./slimcc -std=c23 -Iinclude -Itest -c -o test/c23/$*.o test/c23/$*.c + ./slimcc -std=c23 $(TEST_FLAGS) -c -o test/c23/$*.o test/c23/$*.c $(CC) -std=c11 -pthread -Wno-psabi -no-pie -o $@ test/c23/$*.o -xc test/common test: $(TESTS) $(TESTS_C23) @@ -41,12 +43,12 @@ stage2/%.o: slimcc %.c stage2/test/%.exe: stage2/slimcc test/%.c mkdir 
-p stage2/test - ./stage2/slimcc -Iinclude -Itest -c -o stage2/test/$*.o test/$*.c + ./stage2/slimcc $(TEST_FLAGS) -c -o stage2/test/$*.o test/$*.c $(CC) -std=c11 -pthread -Wno-psabi -no-pie -o $@ stage2/test/$*.o -xc test/common stage2/test/c23/%.exe: stage2/slimcc test/c23/%.c mkdir -p stage2/test/c23 - ./stage2/slimcc -std=c23 -Iinclude -Itest -c -o stage2/test/c23/$*.o test/c23/$*.c + ./stage2/slimcc -std=c23 $(TEST_FLAGS) -c -o stage2/test/c23/$*.o test/c23/$*.c $(CC) -std=c11 -pthread -Wno-psabi -no-pie -o $@ stage2/test/c23/$*.o -xc test/common test-stage2: $(TESTS:test/%=stage2/test/%) $(TESTS_C23:test/c23/%=stage2/test/c23/%) diff --git a/main.c b/main.c index 0ee4976..06ac1a9 100644 --- a/main.c +++ b/main.c @@ -17,6 +17,7 @@ StdVer opt_std; static StringArray opt_include; bool opt_E; +bool opt_enable_universal_char; static bool opt_P; static bool opt_M; static bool opt_MD; @@ -416,6 +417,11 @@ static void parse_args(int argc, char **argv) { continue; } + if (!strcmp(argv[i], "-fenable-universal-char")) { + opt_enable_universal_char = true; + continue; + } + if (!strncmp(argv[i], "-fstack-reuse=", 14)) { if (strncmp(argv[i] + 14, "all\0", 4)) dont_reuse_stack = true; diff --git a/slimcc.h b/slimcc.h index dfeec50..019b813 100644 --- a/slimcc.h +++ b/slimcc.h @@ -607,6 +607,7 @@ bool file_exists(char *path); extern StringArray include_paths; extern bool opt_E; +extern bool opt_enable_universal_char; extern bool opt_fpic; extern bool opt_fcommon; extern bool opt_optimize; diff --git a/tokenize.c b/tokenize.c index 6ecbd92..2225827 100644 --- a/tokenize.c +++ b/tokenize.c @@ -874,7 +874,9 @@ Token *tokenize_file(char *path, Token **end) { canonicalize_newline(p); remove_backslash_newline(p); - convert_universal_chars(p); + + if (opt_enable_universal_char) + convert_universal_chars(p); return tokenize(add_input_file(path, p, false), end); } From bfe14208ed002228a6299317ce2e37a8afb757d5 Mon Sep 17 00:00:00 2001 From: fuhsnn 
<66062782+fuhsnn@users.noreply.github.com> Date: Thu, 21 Nov 2024 00:10:48 +0800 Subject: [PATCH 04/18] Replace most usage with macro --- main.c | 2 +- slimcc.h | 6 ++++++ tokenize.c | 14 +++++++------- 3 files changed, 14 insertions(+), 8 deletions(-) diff --git a/main.c b/main.c index 06ac1a9..a00439c 100644 --- a/main.c +++ b/main.c @@ -885,7 +885,7 @@ static FileType get_file_type(char *filename) { char *p = strstr(filename, ".so."); if (p) { p += 3; - while (isdigit(*p) || (*p == '.' && isdigit(p[1]))) + while (Isdigit(*p) || (*p == '.' && Isdigit(p[1]))) p++; if (!*p) return FILE_DSO; diff --git a/slimcc.h b/slimcc.h index 019b813..54cf1e3 100644 --- a/slimcc.h +++ b/slimcc.h @@ -22,6 +22,12 @@ #define MAX(x, y) ((x) < (y) ? (y) : (x)) #define MIN(x, y) ((x) < (y) ? (x) : (y)) +#define Ucast(c) (unsigned int)(unsigned char)(c) +#define Inrange(c, x, y) ((Ucast(c) - Ucast(x)) <= (Ucast(y) - Ucast(x))) +#define Isdigit(c) Inrange(c, '0', '9') +#define Isalnum(c) (Inrange((c) | 0x20, 'a', 'z') || Isdigit(c)) +#define Isxdigit(c) (Isdigit(c) || Inrange((c) | 0x20, 'a', 'f')) + #if defined(__GNUC__) && (__GNUC__ >= 3) #define FMTCHK(x,y) __attribute__((format(printf,(x),(y)))) #define NORETURN __attribute__((noreturn)) diff --git a/tokenize.c b/tokenize.c index 2225827..599ddaf 100644 --- a/tokenize.c +++ b/tokenize.c @@ -225,11 +225,11 @@ static int read_escaped_char(char **new_pos, char *p) { if (*p == 'x') { // Read a hexadecimal number. 
p++; - if (!isxdigit(*p)) + if (!Isxdigit(*p)) error_at(p, "invalid hex escape sequence"); int c = 0; - for (; isxdigit(*p); p++) + for (; Isxdigit(*p); p++) c = ((unsigned)c << 4) + from_hex(*p); *new_pos = p; return c; @@ -381,7 +381,7 @@ static Token *new_pp_number(char *start, char *p) { if (*p == '.') { p++; continue; - } else if (*p == '\'' && isalnum(p[1])) { + } else if (*p == '\'' && Isalnum(p[1])) { p += 2; continue; } else if (p[0] && p[1] && strchr("eEpP", p[0]) && strchr("+-", p[1])) { @@ -402,7 +402,7 @@ static bool convert_pp_int(Token *tok, char *loc, int len) { // Read a binary, octal, decimal or hexadecimal number. int base = 10; - if (!strncasecmp(p, "0x", 2) && isxdigit(p[2])) { + if (!strncasecmp(p, "0x", 2) && Isxdigit(p[2])) { p += 2; base = 16; } else if (!strncasecmp(p, "0b", 2) && (p[2] == '0' || p[2] == '1')) { @@ -577,7 +577,7 @@ Token *tokenize(File *file, Token **end) { } // Skip whitespace characters. - if (isspace(*p)) { + if (*p == ' ' || *p == '\t' || *p =='\v' || *p == '\f') { p++; has_space = true; continue; @@ -604,7 +604,7 @@ Token *tokenize(File *file, Token **end) { // Numeric literal char *p2 = (*p == '.') ? 
p + 1 : p; - if (isdigit(*p2)) { + if (Isdigit(*p2)) { cur = cur->next = new_pp_number(p, p2 + 1); p += cur->len; continue; @@ -801,7 +801,7 @@ static void remove_backslash_newline(char *p) { static uint32_t read_universal_char(char *p, int len) { uint32_t c = 0; for (int i = 0; i < len; i++) { - if (!isxdigit(p[i])) + if (!Isxdigit(p[i])) return 0; c = (c << 4) | from_hex(p[i]); } From 4932df2401afc0afd79fe54cad3ab650b92e227c Mon Sep 17 00:00:00 2001 From: fuhsnn <66062782+fuhsnn@users.noreply.github.com> Date: Thu, 21 Nov 2024 00:16:41 +0800 Subject: [PATCH 05/18] Use Inrange() macro more --- codegen.c | 8 ++++---- tokenize.c | 10 +++++----- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/codegen.c b/codegen.c index a971f71..3c85316 100644 --- a/codegen.c +++ b/codegen.c @@ -3202,7 +3202,7 @@ static void asm_constraint(AsmParam *ap, int x87_clobber) { case 't': fixed_reg(®, REG_X87_ST0, tok); continue; case 'u': fixed_reg(®, REG_X87_ST1, tok); continue; } - if (*p >= '0' && *p <= '9') { + if (Isdigit(*p)) { match_idx = strtoul(p, &p, 10); continue; } @@ -3640,7 +3640,7 @@ static AsmParam *find_op(char *p, char **rest, Token *tok, bool is_label) { if (named_op(p, op->name->len, op->name->loc, rest)) return op; } - } else if (*p >= '0' && *p <= '9') { + } else if (Isdigit(*p)) { unsigned long idx = strtoul(p, rest, 10); if (idx < asm_ops.cnt) return asm_ops.data[idx]; @@ -3662,7 +3662,7 @@ static void asm_body(Node *node) { p++; continue; } - if (*p == 'l' && (p[1] == '[' || (p[1] >= '0' && p[1] <= '9'))) { + if (*p == 'l' && (p[1] == '[' || Isdigit(p[1]))) { AsmParam *ap = find_op(p + 1, &p, node->asm_str, true); if (!ap->arg->unique_label) error_tok(ap->arg->tok, "not a label"); @@ -3685,7 +3685,7 @@ static void asm_body(Node *node) { mod = *p; p++; } - if (*p == '[' || (*p >= '0' && *p <= '9')) { + if (*p == '[' || Isdigit(*p)) { AsmParam *ap = find_op(p, &p, node->asm_str, false); char *punct = (mod == 'c') ? 
"" : "$"; diff --git a/tokenize.c b/tokenize.c index 599ddaf..66e28d2 100644 --- a/tokenize.c +++ b/tokenize.c @@ -143,9 +143,9 @@ static int read_ident(char *start) { } static int from_hex(char c) { - if ('0' <= c && c <= '9') + if (Inrange(c, '0', '9')) return c - '0'; - if ('a' <= c && c <= 'f') + if (Inrange(c, 'a', 'f')) return c - 'a' + 10; return c - 'A' + 10; } @@ -210,12 +210,12 @@ TokenKind ident_keyword(Token *tok) { } static int read_escaped_char(char **new_pos, char *p) { - if ('0' <= *p && *p <= '7') { + if (Inrange(*p, '0', '7')) { // Read an octal number. int c = *p++ - '0'; - if ('0' <= *p && *p <= '7') { + if (Inrange(*p, '0', '7')) { c = (c << 3) + (*p++ - '0'); - if ('0' <= *p && *p <= '7') + if (Inrange(*p, '0', '7')) c = (c << 3) + (*p++ - '0'); } *new_pos = p; From c97957f55f4eea66a297bde8b4ad791779040a59 Mon Sep 17 00:00:00 2001 From: fuhsnn <66062782+fuhsnn@users.noreply.github.com> Date: Thu, 21 Nov 2024 00:19:17 +0800 Subject: [PATCH 06/18] Micro-optimize tokenizer space skipping --- tokenize.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tokenize.c b/tokenize.c index 66e28d2..88f51ca 100644 --- a/tokenize.c +++ b/tokenize.c @@ -578,7 +578,7 @@ Token *tokenize(File *file, Token **end) { // Skip whitespace characters. 
if (*p == ' ' || *p == '\t' || *p =='\v' || *p == '\f') { - p++; + for (char c = *p; *(++p) == c;); has_space = true; continue; } From 004c4c9a594b4eb74eee6590cdeaaa6f229e69ad Mon Sep 17 00:00:00 2001 From: fuhsnn <66062782+fuhsnn@users.noreply.github.com> Date: Fri, 22 Nov 2024 02:10:22 +0800 Subject: [PATCH 07/18] Tweak previous commit --- slimcc.h | 1 - tokenize.c | 82 +++++++++++++++++++++++------------------------------- 2 files changed, 35 insertions(+), 48 deletions(-) diff --git a/slimcc.h b/slimcc.h index 54cf1e3..946c787 100644 --- a/slimcc.h +++ b/slimcc.h @@ -1,6 +1,5 @@ #define _POSIX_C_SOURCE 200809L #include -#include #include #include #include diff --git a/tokenize.c b/tokenize.c index 4ab997f..3552fcb 100644 --- a/tokenize.c +++ b/tokenize.c @@ -151,45 +151,33 @@ static int from_hex(char c) { } // Read a punctuator token from p and returns its length. -static int read_punct(const char *p) { - char c1; +static int read_punct(char *p) { + bool is_repeat = p[1] == *p; + bool is_assign = p[1] == '='; + switch (*p) { - case '<': // Pattern: < <= << <<= - c1 = *(p + 1); - if (c1 == '=') return 2; - if (c1 == '<') - return *(p + 2) == '=' ? 3 : 2; - return 1; - case '>': // Pattern: > >= >> >>= - c1 = *(p + 1); - if (c1 == '=') return 2; - if (c1 == '>') - return *(p + 2) == '=' ? 3 : 2; - return 1; - case '+': // Pattern: + ++ += - c1 = *(p + 1); - return (c1 == '+' || c1 == '=') ? 2 : 1; - case '-': // Pattern: - -- -= -> - c1 = *(p + 1); - return (c1 == '-' || c1 == '=' || c1 == '>') ? 2 : 1; - case '&': // Pattern: & &= && - c1 = *(p + 1); - return (c1 == '=' || c1 == '&') ? 2 : 1; - case '|': // Pattern: | |= || - c1 = *(p + 1); - return (c1 == '=' || c1 == '|') ? 2 : 1; - case '.': // Pattern: . ... - return (*(p + 1) == '.' && *(p + 2) == '.') ? 3 : 1; - case '=': // Pattern: = == - case '!': // Pattern: ! 
!= - case '*': // Pattern: * *= - case '/': // Pattern: / /= - case '%': // Pattern: % %= - case '^': // Pattern: ^ ^= - return *(p + 1) == '=' ? 2 : 1; - case '#': // Pattern: # ## - return *(p + 1) == '#' ? 2 : 1; - case '$': + case '-': + if (p[1] == '>') + return 2; + case '&': + case '+': + case '=': + case '|': + return (is_repeat | is_assign) + 1; + case '<': + case '>': + if (is_repeat) + return (p[2] == '=') + 2; + case '!': + case '%': + case '*': + case '/': + case '^': + return is_assign + 1; + case '#': + return is_repeat + 1; + case '.': + return (is_repeat && p[2] == *p) ? 3 : 1; case '(': case ')': case ',': @@ -198,8 +186,8 @@ static int read_punct(const char *p) { case '?': case '@': case '[': + case '\\': case ']': - case '_': case '`': case '{': case '}': @@ -654,6 +642,14 @@ Token *tokenize(File *file, Token **end) { continue; } + // Punctuators + int punct_len = read_punct(p); + if (punct_len) { + cur = cur->next = new_token(TK_PUNCT, p, p + punct_len); + p += cur->len; + continue; + } + // String literal if (*p == '"') { cur = cur->next = read_string_literal(p, p); @@ -727,14 +723,6 @@ Token *tokenize(File *file, Token **end) { continue; } - // Punctuators - int punct_len = read_punct(p); - if (punct_len) { - cur = cur->next = new_token(TK_PUNCT, p, p + punct_len); - p += cur->len; - continue; - } - error_at(p, "invalid token"); } From e5cca771a9e8940c68fe83e44f14c423baf9ea6a Mon Sep 17 00:00:00 2001 From: fuhsnn <66062782+fuhsnn@users.noreply.github.com> Date: Fri, 22 Nov 2024 04:49:46 +0800 Subject: [PATCH 08/18] Remove unnessasary strndup()'s --- preprocess.c | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/preprocess.c b/preprocess.c index 57bc5d9..137394f 100644 --- a/preprocess.c +++ b/preprocess.c @@ -20,13 +20,13 @@ typedef struct MacroParam MacroParam; struct MacroParam { MacroParam *next; - char *name; + Token *name; }; typedef struct MacroArg MacroArg; struct MacroArg { MacroArg *next; - char 
*name; + Token *name; bool is_va_args; bool omit_comma; Token *tok; @@ -42,7 +42,7 @@ struct Macro { Token *stop_tok; Macro *locked_next; MacroParam *params; - char *va_args_name; + Token *va_args_name; Token *body; macro_handler_fn *handler; }; @@ -384,7 +384,7 @@ static Macro *add_macro(char *name, bool is_objlike, Token *body) { return m; } -static MacroParam *read_macro_params(Token **rest, Token *tok, char **va_args_name) { +static MacroParam *read_macro_params(Token **rest, Token *tok, Token **va_args_name) { MacroParam head = {0}; MacroParam *cur = &head; @@ -393,7 +393,8 @@ static MacroParam *read_macro_params(Token **rest, Token *tok, char **va_args_na tok = skip(tok, ","); if (equal(tok, "...")) { - *va_args_name = "__VA_ARGS__"; + static Token va_args = {.loc = "__VA_ARGS__", .len = 11}; + *va_args_name = &va_args; *rest = skip(tok->next, ")"); return head.next; } @@ -402,13 +403,13 @@ static MacroParam *read_macro_params(Token **rest, Token *tok, char **va_args_na error_tok(tok, "expected an identifier"); if (equal(tok->next, "...")) { - *va_args_name = strndup(tok->loc, tok->len); + *va_args_name = tok; *rest = skip(tok->next->next, ")"); return head.next; } MacroParam *m = calloc(1, sizeof(MacroParam)); - m->name = strndup(tok->loc, tok->len); + m->name = tok; cur = cur->next = m; tok = tok->next; } @@ -425,7 +426,7 @@ static void read_macro_definition(Token **rest, Token *tok) { if (!tok->has_space && equal(tok, "(")) { // Function-like macro - char *va_args_name = NULL; + Token *va_args_name = NULL; MacroParam *params = read_macro_params(&tok, tok->next, &va_args_name); Macro *m = add_macro(name, false, split_line(rest, tok)); @@ -541,7 +542,7 @@ static MacroArg *find_va_arg(MacroArg *args) { static MacroArg *find_arg(Token **rest, Token *tok, MacroArg *args, Macro *m) { for (MacroArg *ap = args; ap; ap = ap->next) { - if (equal(tok, ap->name)) { + if (tok->len == ap->name->len && !memcmp(tok->loc, ap->name->loc, tok->len)) { if (rest) *rest = 
tok->next; return ap; From efcbb859b4848093b645211eee9c06574091a941 Mon Sep 17 00:00:00 2001 From: fuhsnn <66062782+fuhsnn@users.noreply.github.com> Date: Fri, 22 Nov 2024 06:19:38 +0800 Subject: [PATCH 09/18] Micro-optimize tokenizing ASCII identifiers --- tokenize.c | 39 +++++++++++++++++++++++++-------------- 1 file changed, 25 insertions(+), 14 deletions(-) diff --git a/tokenize.c b/tokenize.c index 3552fcb..c6a779d 100644 --- a/tokenize.c +++ b/tokenize.c @@ -127,18 +127,23 @@ static bool startswith(char *p, char *q) { // Read an identifier and returns the length of it. // If p does not point to a valid identifier, 0 is returned. -static int read_ident(char *start) { - char *p = start; - uint32_t c = decode_utf8(&p, p); - if (!is_ident1(c)) - return 0; +static int read_ident(char *p) { + char *start = p; - for (;;) { - char *q; - c = decode_utf8(&q, p); - if (!is_ident2(c)) - return p - start; - p = q; + for (bool is_first = true;; is_first = false) { + if (Isalnum(*p) || *p == '_' || *p == '$') { + p++; + continue; + } + if ((unsigned char)*p >= 128) { + char *pos; + uint32_t c = decode_utf8(&pos, p); + if (is_first ? 
is_ident1(c) : is_ident2(c)) { + p = pos; + continue; + } + } + return p - start; } } @@ -420,11 +425,17 @@ static Token *new_pp_number(char *start, char *p) { p += 2; continue; } - char *pos; - if (is_ident2(decode_utf8(&pos, p))) { - p = pos; + if (Isalnum(*p) || *p == '_' || *p == '$') { + p++; continue; } + if ((unsigned char)*p >= 128) { + char *pos; + if (is_ident2(decode_utf8(&pos, p))) { + p = pos; + continue; + } + } return new_token(TK_PP_NUM, start, p); } } From f65db8e9ac49d284e3c515e35784a94d5d90197b Mon Sep 17 00:00:00 2001 From: fuhsnn <66062782+fuhsnn@users.noreply.github.com> Date: Fri, 22 Nov 2024 16:51:42 +0800 Subject: [PATCH 10/18] Reduce strlen/strncmp calls --- tokenize.c | 37 ++++++++++++++++++------------------- 1 file changed, 18 insertions(+), 19 deletions(-) diff --git a/tokenize.c b/tokenize.c index c6a779d..f611f02 100644 --- a/tokenize.c +++ b/tokenize.c @@ -12,6 +12,9 @@ static bool at_bol; // True if the current position follows a space character static bool has_space; +#define startswith2(p, x, y) ((*(p) == x) && ((p)[1] == y)) +#define startswith3(p, x, y, z) ((*(p) == x) && ((p)[1] == y) && ((p)[2] == z)) + // Reports an error and exit. void error(char *fmt, ...) { va_list ap; @@ -627,7 +630,7 @@ Token *tokenize(File *file, Token **end) { } // Skip line comments. - if (startswith(p, "//")) { + if (startswith2(p, '/', '/')) { p += 2; while (*p != '\n') p++; @@ -636,7 +639,7 @@ Token *tokenize(File *file, Token **end) { } // Skip block comments. 
- if (startswith(p, "/*")) { + if (startswith2(p, '/', '*')) { char *q = strstr(p + 2, "*/"); if (!q) error_at(p, "unclosed block comment"); @@ -669,28 +672,28 @@ Token *tokenize(File *file, Token **end) { } // UTF-8 string literal - if (startswith(p, "u8\"")) { + if (startswith3(p, 'u', '8', '\"')) { cur = cur->next = read_string_literal(p, p + 2); p += cur->len; continue; } // UTF-16 string literal - if (startswith(p, "u\"")) { + if (startswith2(p, 'u', '\"')) { cur = cur->next = read_utf16_string_literal(p, p + 1); p += cur->len; continue; } // Wide string literal - if (startswith(p, "L\"")) { + if (startswith2(p, 'L', '\"')) { cur = cur->next = read_utf32_string_literal(p, p + 1, ty_int); p += cur->len; continue; } // UTF-32 string literal - if (startswith(p, "U\"")) { + if (startswith2(p, 'U', '\"')) { cur = cur->next = read_utf32_string_literal(p, p + 1, ty_uint); p += cur->len; continue; @@ -705,7 +708,7 @@ Token *tokenize(File *file, Token **end) { } // UTF-16 character literal - if (startswith(p, "u'")) { + if (startswith2(p, 'u', '\'')) { cur = cur->next = read_char_literal(p, p + 1, ty_ushort); cur->val &= 0xffff; p += cur->len; @@ -713,14 +716,14 @@ Token *tokenize(File *file, Token **end) { } // Wide character literal - if (startswith(p, "L'")) { + if (startswith2(p, 'L', '\'')) { cur = cur->next = read_char_literal(p, p + 1, ty_int); p += cur->len; continue; } // UTF-32 character literal - if (startswith(p, "U'")) { + if (startswith2(p, 'U', '\'')) { cur = cur->next = read_char_literal(p, p + 1, ty_uint); p += cur->len; continue; @@ -856,28 +859,24 @@ static void convert_universal_chars(char *p) { char *q = p; while (*p) { - if (startswith(p, "\\u")) { + if (startswith2(p, '\\', 'u')) { uint32_t c = read_universal_char(p + 2, 4); if (c) { p += 6; q += encode_utf8(q, c); - } else { - *q++ = *p++; + continue; } - } else if (startswith(p, "\\U")) { + } else if (startswith2(p, '\\', 'U')) { uint32_t c = read_universal_char(p + 2, 8); if (c) { p += 10; q 
+= encode_utf8(q, c); - } else { - *q++ = *p++; + continue; } - } else if (p[0] == '\\') { - *q++ = *p++; - *q++ = *p++; - } else { + } else if (*p == '\\') { *q++ = *p++; } + *q++ = *p++; } *q = '\0'; From 97fbcea599422a08b76e4dc217536eb9a76ed5dd Mon Sep 17 00:00:00 2001 From: fuhsnn <66062782+fuhsnn@users.noreply.github.com> Date: Fri, 22 Nov 2024 17:24:57 +0800 Subject: [PATCH 11/18] Rework integer literal suffix algorithm --- slimcc.h | 2 +- tokenize.c | 53 +++++++++++++++++++++-------------------------------- 2 files changed, 22 insertions(+), 33 deletions(-) diff --git a/slimcc.h b/slimcc.h index 946c787..14ff06a 100644 --- a/slimcc.h +++ b/slimcc.h @@ -11,7 +11,6 @@ #include #include #include -#include #include #include #include @@ -26,6 +25,7 @@ #define Isdigit(c) Inrange(c, '0', '9') #define Isalnum(c) (Inrange((c) | 0x20, 'a', 'z') || Isdigit(c)) #define Isxdigit(c) (Isdigit(c) || Inrange((c) | 0x20, 'a', 'f')) +#define Casecmp(c, a) (((c) | 0x20) == a) #if defined(__GNUC__) && (__GNUC__ >= 3) #define FMTCHK(x,y) __attribute__((format(printf,(x),(y)))) diff --git a/tokenize.c b/tokenize.c index f611f02..4e87f28 100644 --- a/tokenize.c +++ b/tokenize.c @@ -124,10 +124,6 @@ static Token *new_token(TokenKind kind, char *start, char *end) { return tok; } -static bool startswith(char *p, char *q) { - return strncmp(p, q, strlen(q)) == 0; -} - // Read an identifier and returns the length of it. // If p does not point to a valid identifier, 0 is returned. static int read_ident(char *p) { @@ -448,46 +444,39 @@ static bool convert_pp_int(Token *tok, char *loc, int len) { // Read a binary, octal, decimal or hexadecimal number. 
int base = 10; - if (!strncasecmp(p, "0x", 2) && Isxdigit(p[2])) { - p += 2; - base = 16; - } else if (!strncasecmp(p, "0b", 2) && (p[2] == '0' || p[2] == '1')) { - p += 2; - base = 2; - } else if (*p == '0') { - base = 8; + if (*p == '0') { + if (Casecmp(p[1], 'x') && Isxdigit(p[2])) { + p += 2; + base = 16; + } else if (Casecmp(p[1], 'b') && (p[2] == '0' || p[2] == '1')) { + p += 2; + base = 2; + } else { + base = 8; + } } int64_t val = strtoul(p, &p, base); // Read U, L or LL suffixes. - bool ll = false; - bool l = false; bool u = false; - - if (startswith(p, "LLU") || startswith(p, "LLu") || - startswith(p, "llU") || startswith(p, "llu") || - startswith(p, "ULL") || startswith(p, "Ull") || - startswith(p, "uLL") || startswith(p, "ull")) { - p += 3; - ll = u = true; - } else if (!strncasecmp(p, "lu", 2) || !strncasecmp(p, "ul", 2)) { - p += 2; - l = u = true; - } else if (startswith(p, "LL") || startswith(p, "ll")) { - p += 2; - ll = true; - } else if (*p == 'L' || *p == 'l') { - p++; - l = true; - } else if (*p == 'U' || *p == 'u') { - p++; + int l_cnt = 0; + if (Casecmp(*p, 'u')) { + if (Casecmp(p[1], 'l')) + l_cnt = 1 + (p[1] == p[2]); u = true; + } else if (Casecmp(*p, 'l')) { + l_cnt = 1 + (*p == p[1]); + u = Casecmp(p[l_cnt], 'u'); } + p += l_cnt + u; if (p != loc + len) return false; + bool ll = l_cnt == 2; + bool l = l_cnt == 1; + // Infer a type. Type *ty; if (base == 10) { From 3b70322601df92940cfb024a880da9f841d4f49d Mon Sep 17 00:00:00 2001 From: fuhsnn <66062782+fuhsnn@users.noreply.github.com> Date: Fri, 22 Nov 2024 21:06:13 +0800 Subject: [PATCH 12/18] Rework newline canonicalization --- tokenize.c | 65 ++++++++++++++++++++++++++---------------------------- 1 file changed, 31 insertions(+), 34 deletions(-) diff --git a/tokenize.c b/tokenize.c index 4e87f28..be4fc8d 100644 --- a/tokenize.c +++ b/tokenize.c @@ -789,48 +789,45 @@ File *new_file(char *name, int file_no, char *contents) { // Replaces \r or \r\n with \n. 
static void canonicalize_newline(char *p) { - int i = 0, j = 0; - - while (p[i]) { - if (p[i] == '\r' && p[i + 1] == '\n') { - i += 2; - p[j++] = '\n'; - } else if (p[i] == '\r') { - i++; - p[j++] = '\n'; - } else { - p[j++] = p[i++]; + char *first = strchr(p, '\r'); + if (first) { + char *q = p = first; + + while (*p) { + if (*p == '\r') { + *q++ = '\n'; + p += (p[1] == '\n') + 1; + continue; + } + *q++ = *p++; } - } - p[j] = '\0'; + *q = '\0'; + } } // Removes backslashes followed by a newline. static void remove_backslash_newline(char *p) { - int i = 0, j = 0; - - // We want to keep the number of newline characters so that - // the logical line number matches the physical one. - // This counter maintain the number of newlines we have removed. - int n = 0; - - while (p[i]) { - if (p[i] == '\\' && p[i + 1] == '\n') { - i += 2; - n++; - } else if (p[i] == '\n') { - p[j++] = p[i++]; - for (; n > 0; n--) - p[j++] = '\n'; - } else { - p[j++] = p[i++]; + char *first = strchr(p, '\\'); + if (first) { + char *q = p = first; + int n = 0; + + while (*p) { + if (*p == '\\' && p[1] == '\n') { + p += 2; + n++; + continue; + } + if (*p == '\n') { + for (; n > 0; n--) + *q++ = '\n'; + } + *q++ = *p++; } - } - for (; n > 0; n--) - p[j++] = '\n'; - p[j] = '\0'; + *q = '\0'; + } } static uint32_t read_universal_char(char *p, int len) { From 4a79cd939743c9b7e4ec0041727a75d6275addd8 Mon Sep 17 00:00:00 2001 From: fuhsnn <66062782+fuhsnn@users.noreply.github.com> Date: Fri, 22 Nov 2024 21:21:59 +0800 Subject: [PATCH 13/18] Fixup f65db8e --- tokenize.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/tokenize.c b/tokenize.c index be4fc8d..ada9d2c 100644 --- a/tokenize.c +++ b/tokenize.c @@ -859,8 +859,6 @@ static void convert_universal_chars(char *p) { q += encode_utf8(q, c); continue; } - } else if (*p == '\\') { - *q++ = *p++; } *q++ = *p++; } From 4d344382eb70db78c35ebdd4603d79fda0c4cd58 Mon Sep 17 00:00:00 2001 From: Jim Huang Date: Fri, 22 Nov 2024 22:07:00 +0800 Subject: 
[PATCH 14/18] Avoid expensive division in HashMap Previously, HashMap relied on division operations, which are expensive. This update replaces divisions with bitwise AND operations using an additional mask, improving performance. --- hashmap.c | 8 +++++--- slimcc.h | 1 + 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/hashmap.c b/hashmap.c index ae57b1d..365d59b 100644 --- a/hashmap.c +++ b/hashmap.c @@ -2,7 +2,7 @@ #include "slimcc.h" -// Initial hash bucket size +// Initial hash bucket size; must be power of 2 #define INIT_SIZE 16 // Rehash if the usage exceeds 70%. @@ -41,6 +41,7 @@ static void rehash(HashMap *map) { HashMap map2 = {0}; map2.buckets = calloc(cap, sizeof(HashEntry)); map2.capacity = cap; + map2.mask = cap - 1; for (int i = 0; i < map->capacity; i++) { HashEntry *ent = &map->buckets[i]; @@ -64,7 +65,7 @@ static HashEntry *get_entry(HashMap *map, char *key, int keylen) { uint64_t hash = fnv_hash(key, keylen); for (int i = 0; i < map->capacity; i++) { - HashEntry *ent = &map->buckets[(hash + i) % map->capacity]; + HashEntry *ent = &map->buckets[(hash + i) & map->mask]; if (match(ent, key, keylen)) return ent; if (ent->key == NULL) @@ -77,6 +78,7 @@ static HashEntry *get_or_insert_entry(HashMap *map, char *key, int keylen) { if (!map->buckets) { map->buckets = calloc(INIT_SIZE, sizeof(HashEntry)); map->capacity = INIT_SIZE; + map->mask = INIT_SIZE - 1; } else if ((map->used * 100) / map->capacity >= HIGH_WATERMARK) { rehash(map); } @@ -84,7 +86,7 @@ static HashEntry *get_or_insert_entry(HashMap *map, char *key, int keylen) { uint64_t hash = fnv_hash(key, keylen); for (int i = 0; i < map->capacity; i++) { - HashEntry *ent = &map->buckets[(hash + i) % map->capacity]; + HashEntry *ent = &map->buckets[(hash + i) & map->mask]; if (match(ent, key, keylen)) return ent; diff --git a/slimcc.h b/slimcc.h index 14ff06a..e39a548 100644 --- a/slimcc.h +++ b/slimcc.h @@ -65,6 +65,7 @@ typedef struct { HashEntry *buckets; int capacity; int used; + int 
mask; } HashMap; void *hashmap_get(HashMap *map, char *key); From 240f8546b88e643b87ddacebf6dd8d3af2651340 Mon Sep 17 00:00:00 2001 From: fuhsnn <66062782+fuhsnn@users.noreply.github.com> Date: Fri, 22 Nov 2024 22:27:51 +0800 Subject: [PATCH 15/18] Make Token struct packed --- slimcc.h | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/slimcc.h b/slimcc.h index 14ff06a..1c38bc6 100644 --- a/slimcc.h +++ b/slimcc.h @@ -30,6 +30,7 @@ #if defined(__GNUC__) && (__GNUC__ >= 3) #define FMTCHK(x,y) __attribute__((format(printf,(x),(y)))) #define NORETURN __attribute__((noreturn)) +#define PACKED __attribute__((packed)) #elif defined(__has_attribute) #if __has_attribute(format) #define FMTCHK(x,y) __attribute__((format(printf,(x),(y)))) @@ -37,6 +38,9 @@ #if __has_attribute(noreturn) #define NORETURN __attribute__((noreturn)) #endif +#if __has_attribute(packed) +#define PACKED __attribute__((packed)) +#endif #endif #ifndef FMTCHK @@ -45,6 +49,9 @@ #ifndef NORETURN #define NORETURN #endif +#ifndef PACKED +#define PACKED +#endif typedef struct Type Type; typedef struct Node Node; @@ -141,7 +148,7 @@ struct Token { Token *origin; // If this is expanded from a macro, the original token char *guard_file; // The path of a potentially include-guarded file Token *attr_next; -}; +} PACKED; void error(char *fmt, ...) FMTCHK(1,2) NORETURN; void error_at(char *loc, char *fmt, ...) FMTCHK(2,3) NORETURN; From d80d5903b38598d69b6a68f4f9019d2b5bb0316d Mon Sep 17 00:00:00 2001 From: Jim Huang Date: Sat, 23 Nov 2024 14:26:45 +0800 Subject: [PATCH 16/18] Expand macro only when preprocessor token is identifier In 'preprocess2' function, 'expand_macro' was called multiple times, some of which were unnecessary. This change ensures that 'expand_macro' is called only when the preprocessor token is an identifier. 
--- preprocess.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/preprocess.c b/preprocess.c index 137394f..7da5885 100644 --- a/preprocess.c +++ b/preprocess.c @@ -1077,7 +1077,7 @@ static Token *preprocess2(Token *tok) { for (; tok->kind != TK_EOF; pop_macro_lock(tok)) { // If it is a macro, expand it. - if (expand_macro(&tok, tok)) + if (tok->kind == TK_IDENT && expand_macro(&tok, tok)) continue; if (is_hash(tok) && !locked_macros) { From 754532413cac831ce33fb0b29780f469bdd63692 Mon Sep 17 00:00:00 2001 From: "ChAoS_UnItY (Kyle Lin)" Date: Sat, 23 Nov 2024 22:25:17 +0800 Subject: [PATCH 17/18] Add basic testing action for push and pr Co-authored-by: Jim Huang --- .github/workflows/main.yml | 37 +++++++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) create mode 100644 .github/workflows/main.yml diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml new file mode 100644 index 0000000..6c28664 --- /dev/null +++ b/.github/workflows/main.yml @@ -0,0 +1,37 @@ +name: Github Actions + +on: [push, pull_request] + +jobs: + host-x86: + runs-on: ubuntu-latest + strategy: + matrix: + compiler: [gcc, clang] + steps: + - name: Checkout code + uses: actions/checkout@v4 + - name: Install prerequisite + run: | + sudo apt-get update -q -y + sudo apt-get install -q -y file + sudo apt-get install -q -y build-essential + - name: Build stage 1 artifact + env: + CC: ${{ matrix.compiler }} + run: | + make clean + make slimcc + shell: bash + - name: Test stage 1 + run: | + make test || exit 1 + shell: bash + - name: Build stage 2 artifact + run: | + make stage2/slimcc + shell: bash + - name: Test stage 2 + run: | + make test-stage2 || exit 1 + shell: bash From 6a06d610c7e985ac9c88523adf07d9a056ab43fa Mon Sep 17 00:00:00 2001 From: Kyle Lin Date: Mon, 25 Nov 2024 23:01:23 +0800 Subject: [PATCH 18/18] Adopt cache-apt-pkgs, add curl 8.9.1 build & unit-test to workflow --- .github/workflows/main.yml | 49 ++++++++++++++++++++++++++++++++++---- 1 
file changed, 44 insertions(+), 5 deletions(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 6c28664..7b76550 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -3,7 +3,7 @@ name: Github Actions on: [push, pull_request] jobs: - host-x86: + host-x86-build: runs-on: ubuntu-latest strategy: matrix: @@ -12,10 +12,10 @@ jobs: - name: Checkout code uses: actions/checkout@v4 - name: Install prerequisite - run: | - sudo apt-get update -q -y - sudo apt-get install -q -y file - sudo apt-get install -q -y build-essential + uses: awalsh128/cache-apt-pkgs-action@latest + with: + packages: file build-essential clang + version: 1.0 - name: Build stage 1 artifact env: CC: ${{ matrix.compiler }} @@ -35,3 +35,42 @@ jobs: run: | make test-stage2 || exit 1 shell: bash + - run: mv slimcc slimcc-${{ matrix.compiler }} + - name: Upload artifact + uses: actions/upload-artifact@v4 + with: + name: slimcc-${{ matrix.compiler }} + path: slimcc-${{ matrix.compiler }} + + test-building-real-world-projects: + needs: host-x86-build + runs-on: ubuntu-latest + strategy: + matrix: + compiler: [gcc, clang] + steps: + - name: Checkout code + uses: actions/checkout@v4 + - name: Download artifact + uses: actions/download-artifact@v4 + with: + name: slimcc-${{ matrix.compiler }} + - run: | + chmod +x slimcc-${{ matrix.compiler }} + - uses: lukka/get-cmake@latest + with: + useLocalCache: true + - name: Install prerequisite + uses: awalsh128/cache-apt-pkgs-action@latest + with: + packages: file build-essential clang libssh2-1 + version: 1.0 + - name: Test Building curl 8.9.1 + run: | + git clone --depth 1 https://github.com/curl/curl --branch curl-8_9_1 + mkdir curl/cmakebuild + cd curl/cmakebuild + cmake ../ -DCMAKE_C_COMPILER=${GITHUB_WORKSPACE}/slimcc-${{ matrix.compiler }} -DCMAKE_C_FLAGS=-fPIC + make -j + make test-quiet -j + shell: bash