From 5d2f3418a1f67f8fb8392d00a8e13ddc2af9e1dc Mon Sep 17 00:00:00 2001 From: Jim Huang Date: Sat, 16 Nov 2024 16:13:08 +0800 Subject: [PATCH 01/18] Speed up recognize punctuation The original code in function 'read_punct' relies on heavy string specific function calls, resulting in slower execution. Instead, this function can be faster using straightforward control flow. --- tokenize.c | 68 ++++++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 56 insertions(+), 12 deletions(-) diff --git a/tokenize.c b/tokenize.c index 6ecbd92..03e7095 100644 --- a/tokenize.c +++ b/tokenize.c @@ -151,18 +151,62 @@ static int from_hex(char c) { } // Read a punctuator token from p and returns its length. -static int read_punct(char *p) { - static char *kw[] = { - "<<=", ">>=", "...", "==", "!=", "<=", ">=", "->", "+=", - "-=", "*=", "/=", "++", "--", "%=", "&=", "|=", "^=", "&&", - "||", "<<", ">>", "##", - }; - - for (int i = 0; i < sizeof(kw) / sizeof(*kw); i++) - if (startswith(p, kw[i])) - return strlen(kw[i]); - - return ispunct(*p) ? 1 : 0; +static int read_punct(const char *p) { + char c1; + switch (*p) { + case '<': // Pattern: < <= << <<= + c1 = *(p + 1); + if (c1 == '=') return 2; + if (c1 == '<') + return *(p + 2) == '=' ? 3 : 2; + return 1; + case '>': // Pattern: > >= >> >>= + c1 = *(p + 1); + if (c1 == '=') return 2; + if (c1 == '>') + return *(p + 2) == '=' ? 3 : 2; + return 1; + case '+': // Pattern: + ++ += + c1 = *(p + 1); + return (c1 == '+' || c1 == '=') ? 2 : 1; + case '-': // Pattern: - -- -= -> + c1 = *(p + 1); + return (c1 == '-' || c1 == '=' || c1 == '>') ? 2 : 1; + case '&': // Pattern: & &= && + c1 = *(p + 1); + return (c1 == '=' || c1 == '&') ? 2 : 1; + case '|': // Pattern: | |= || + c1 = *(p + 1); + return (c1 == '=' || c1 == '|') ? 2 : 1; + case '.': // Pattern: . ... + return (*(p + 1) == '.' && *(p + 2) == '.') ? 3 : 1; + case '=': // Pattern: = == + case '!': // Pattern: ! 
!= + case '*': // Pattern: * *= + case '/': // Pattern: / /= + case '%': // Pattern: % %= + case '^': // Pattern: ^ ^= + return *(p + 1) == '=' ? 2 : 1; + case '#': // Pattern: # ## + return *(p + 1) == '#' ? 2 : 1; + case '$': + case '(': + case ')': + case ',': + case ':': + case ';': + case '?': + case '@': + case '[': + case ']': + case '_': + case '`': + case '{': + case '}': + case '~': + return 1; + } + return 0; } TokenKind ident_keyword(Token *tok) { From c7ae74f19f173017e8c95201b827904c8f580baf Mon Sep 17 00:00:00 2001 From: fuhsnn <66062782+fuhsnn@users.noreply.github.com> Date: Sun, 17 Nov 2024 15:13:28 +0800 Subject: [PATCH 02/18] Rework path handling of `#include <>` --- preprocess.c | 33 ++++++++++++++++++--------------- 1 file changed, 18 insertions(+), 15 deletions(-) diff --git a/preprocess.c b/preprocess.c index 85be39a..57bc5d9 100644 --- a/preprocess.c +++ b/preprocess.c @@ -888,28 +888,25 @@ static char *read_filename(Token **rest, Token *tok, bool *is_dquote) { // Pattern 3: #include FOO // In this case FOO must be macro-expanded to either // a single string token or a sequence of "<" ... ">". - if (tok->kind == TK_IDENT) + bool is_expanded = false; + if (tok->kind == TK_IDENT) { tok = preprocess2(tok); + is_expanded = true; + } - // Pattern 1: #include "foo.h" + char *filename = NULL; if (tok->kind == TK_STR) { + // Pattern 1: #include "foo.h" // A double-quoted filename for #include is a special kind of // token, and we don't want to interpret any escape sequences in it. // For example, "\f" in "C:\foo" is not a formfeed character but // just two non-control characters, backslash and f. // So we don't want to use token->str. + filename = strndup(tok->loc + 1, tok->len - 2); *is_dquote = true; - if (rest) - *rest = tok->next; - else - skip_line(tok->next); - return strndup(tok->loc + 1, tok->len - 2); - } - - // Pattern 2: #include - if (equal(tok, "<")) { - // Reconstruct a filename from a sequence of tokens between - // "<" and ">". 
+ } else if (equal(tok, "<")) { + // Pattern 2: #include + // Reconstruct a filename from between "<" and ">". Token *start = tok; // Find closing ">". @@ -917,14 +914,20 @@ static char *read_filename(Token **rest, Token *tok, bool *is_dquote) { if (tok->kind == TK_EOF) error_tok(tok, "expected '>'"); + if (!is_expanded && start->file == tok->file && start->loc < tok->loc) + filename = strndup(start->loc + 1, tok->loc - start->loc - 1); + else + filename = join_tokens(start->next, tok); *is_dquote = false; + } + + if (filename && *filename != '\0') { if (rest) *rest = tok->next; else skip_line(tok->next); - return join_tokens(start->next, tok); + return filename; } - error_tok(tok, "expected a filename"); } From aa154968dee4ba393ac9f5fa0a6fa44c4c2c1ddf Mon Sep 17 00:00:00 2001 From: fuhsnn <66062782+fuhsnn@users.noreply.github.com> Date: Tue, 19 Nov 2024 22:00:15 +0800 Subject: [PATCH 03/18] Disable universal char names by default See #92 --- GNUmakefile | 10 ++++++---- main.c | 6 ++++++ slimcc.h | 1 + tokenize.c | 4 +++- 4 files changed, 16 insertions(+), 5 deletions(-) diff --git a/GNUmakefile b/GNUmakefile index 580763c..07a6bb8 100644 --- a/GNUmakefile +++ b/GNUmakefile @@ -1,5 +1,7 @@ CFLAGS=-std=c99 -g -fno-common -Wall -pedantic -Wno-switch +TEST_FLAGS=-Iinclude -Itest -fenable-universal-char + SRCS=$(wildcard *.c) OBJS=$(SRCS:.c=.o) @@ -17,11 +19,11 @@ slimcc: $(OBJS) $(OBJS): slimcc.h test/%.exe: slimcc test/%.c - ./slimcc -Iinclude -Itest -c -o test/$*.o test/$*.c + ./slimcc $(TEST_FLAGS) -c -o test/$*.o test/$*.c $(CC) -std=c11 -pthread -Wno-psabi -no-pie -o $@ test/$*.o -xc test/common test/c23/%.exe: slimcc test/c23/%.c - ./slimcc -std=c23 -Iinclude -Itest -c -o test/c23/$*.o test/c23/$*.c + ./slimcc -std=c23 $(TEST_FLAGS) -c -o test/c23/$*.o test/c23/$*.c $(CC) -std=c11 -pthread -Wno-psabi -no-pie -o $@ test/c23/$*.o -xc test/common test: $(TESTS) $(TESTS_C23) @@ -41,12 +43,12 @@ stage2/%.o: slimcc %.c stage2/test/%.exe: stage2/slimcc test/%.c mkdir 
-p stage2/test - ./stage2/slimcc -Iinclude -Itest -c -o stage2/test/$*.o test/$*.c + ./stage2/slimcc $(TEST_FLAGS) -c -o stage2/test/$*.o test/$*.c $(CC) -std=c11 -pthread -Wno-psabi -no-pie -o $@ stage2/test/$*.o -xc test/common stage2/test/c23/%.exe: stage2/slimcc test/c23/%.c mkdir -p stage2/test/c23 - ./stage2/slimcc -std=c23 -Iinclude -Itest -c -o stage2/test/c23/$*.o test/c23/$*.c + ./stage2/slimcc -std=c23 $(TEST_FLAGS) -c -o stage2/test/c23/$*.o test/c23/$*.c $(CC) -std=c11 -pthread -Wno-psabi -no-pie -o $@ stage2/test/c23/$*.o -xc test/common test-stage2: $(TESTS:test/%=stage2/test/%) $(TESTS_C23:test/c23/%=stage2/test/c23/%) diff --git a/main.c b/main.c index 0ee4976..06ac1a9 100644 --- a/main.c +++ b/main.c @@ -17,6 +17,7 @@ StdVer opt_std; static StringArray opt_include; bool opt_E; +bool opt_enable_universal_char; static bool opt_P; static bool opt_M; static bool opt_MD; @@ -416,6 +417,11 @@ static void parse_args(int argc, char **argv) { continue; } + if (!strcmp(argv[i], "-fenable-universal-char")) { + opt_enable_universal_char = true; + continue; + } + if (!strncmp(argv[i], "-fstack-reuse=", 14)) { if (strncmp(argv[i] + 14, "all\0", 4)) dont_reuse_stack = true; diff --git a/slimcc.h b/slimcc.h index dfeec50..019b813 100644 --- a/slimcc.h +++ b/slimcc.h @@ -607,6 +607,7 @@ bool file_exists(char *path); extern StringArray include_paths; extern bool opt_E; +extern bool opt_enable_universal_char; extern bool opt_fpic; extern bool opt_fcommon; extern bool opt_optimize; diff --git a/tokenize.c b/tokenize.c index 6ecbd92..2225827 100644 --- a/tokenize.c +++ b/tokenize.c @@ -874,7 +874,9 @@ Token *tokenize_file(char *path, Token **end) { canonicalize_newline(p); remove_backslash_newline(p); - convert_universal_chars(p); + + if (opt_enable_universal_char) + convert_universal_chars(p); return tokenize(add_input_file(path, p, false), end); } From bfe14208ed002228a6299317ce2e37a8afb757d5 Mon Sep 17 00:00:00 2001 From: fuhsnn 
<66062782+fuhsnn@users.noreply.github.com> Date: Thu, 21 Nov 2024 00:10:48 +0800 Subject: [PATCH 04/18] Replace most usage with macro --- main.c | 2 +- slimcc.h | 6 ++++++ tokenize.c | 14 +++++++------- 3 files changed, 14 insertions(+), 8 deletions(-) diff --git a/main.c b/main.c index 06ac1a9..a00439c 100644 --- a/main.c +++ b/main.c @@ -885,7 +885,7 @@ static FileType get_file_type(char *filename) { char *p = strstr(filename, ".so."); if (p) { p += 3; - while (isdigit(*p) || (*p == '.' && isdigit(p[1]))) + while (Isdigit(*p) || (*p == '.' && Isdigit(p[1]))) p++; if (!*p) return FILE_DSO; diff --git a/slimcc.h b/slimcc.h index 019b813..54cf1e3 100644 --- a/slimcc.h +++ b/slimcc.h @@ -22,6 +22,12 @@ #define MAX(x, y) ((x) < (y) ? (y) : (x)) #define MIN(x, y) ((x) < (y) ? (x) : (y)) +#define Ucast(c) (unsigned int)(unsigned char)(c) +#define Inrange(c, x, y) ((Ucast(c) - Ucast(x)) <= (Ucast(y) - Ucast(x))) +#define Isdigit(c) Inrange(c, '0', '9') +#define Isalnum(c) (Inrange((c) | 0x20, 'a', 'z') || Isdigit(c)) +#define Isxdigit(c) (Isdigit(c) || Inrange((c) | 0x20, 'a', 'f')) + #if defined(__GNUC__) && (__GNUC__ >= 3) #define FMTCHK(x,y) __attribute__((format(printf,(x),(y)))) #define NORETURN __attribute__((noreturn)) diff --git a/tokenize.c b/tokenize.c index 2225827..599ddaf 100644 --- a/tokenize.c +++ b/tokenize.c @@ -225,11 +225,11 @@ static int read_escaped_char(char **new_pos, char *p) { if (*p == 'x') { // Read a hexadecimal number. 
p++; - if (!isxdigit(*p)) + if (!Isxdigit(*p)) error_at(p, "invalid hex escape sequence"); int c = 0; - for (; isxdigit(*p); p++) + for (; Isxdigit(*p); p++) c = ((unsigned)c << 4) + from_hex(*p); *new_pos = p; return c; @@ -381,7 +381,7 @@ static Token *new_pp_number(char *start, char *p) { if (*p == '.') { p++; continue; - } else if (*p == '\'' && isalnum(p[1])) { + } else if (*p == '\'' && Isalnum(p[1])) { p += 2; continue; } else if (p[0] && p[1] && strchr("eEpP", p[0]) && strchr("+-", p[1])) { @@ -402,7 +402,7 @@ static bool convert_pp_int(Token *tok, char *loc, int len) { // Read a binary, octal, decimal or hexadecimal number. int base = 10; - if (!strncasecmp(p, "0x", 2) && isxdigit(p[2])) { + if (!strncasecmp(p, "0x", 2) && Isxdigit(p[2])) { p += 2; base = 16; } else if (!strncasecmp(p, "0b", 2) && (p[2] == '0' || p[2] == '1')) { @@ -577,7 +577,7 @@ Token *tokenize(File *file, Token **end) { } // Skip whitespace characters. - if (isspace(*p)) { + if (*p == ' ' || *p == '\t' || *p =='\v' || *p == '\f') { p++; has_space = true; continue; @@ -604,7 +604,7 @@ Token *tokenize(File *file, Token **end) { // Numeric literal char *p2 = (*p == '.') ? 
p + 1 : p; - if (isdigit(*p2)) { + if (Isdigit(*p2)) { cur = cur->next = new_pp_number(p, p2 + 1); p += cur->len; continue; @@ -801,7 +801,7 @@ static void remove_backslash_newline(char *p) { static uint32_t read_universal_char(char *p, int len) { uint32_t c = 0; for (int i = 0; i < len; i++) { - if (!isxdigit(p[i])) + if (!Isxdigit(p[i])) return 0; c = (c << 4) | from_hex(p[i]); } From 4932df2401afc0afd79fe54cad3ab650b92e227c Mon Sep 17 00:00:00 2001 From: fuhsnn <66062782+fuhsnn@users.noreply.github.com> Date: Thu, 21 Nov 2024 00:16:41 +0800 Subject: [PATCH 05/18] Use Inrange() macro more --- codegen.c | 8 ++++---- tokenize.c | 10 +++++----- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/codegen.c b/codegen.c index a971f71..3c85316 100644 --- a/codegen.c +++ b/codegen.c @@ -3202,7 +3202,7 @@ static void asm_constraint(AsmParam *ap, int x87_clobber) { case 't': fixed_reg(®, REG_X87_ST0, tok); continue; case 'u': fixed_reg(®, REG_X87_ST1, tok); continue; } - if (*p >= '0' && *p <= '9') { + if (Isdigit(*p)) { match_idx = strtoul(p, &p, 10); continue; } @@ -3640,7 +3640,7 @@ static AsmParam *find_op(char *p, char **rest, Token *tok, bool is_label) { if (named_op(p, op->name->len, op->name->loc, rest)) return op; } - } else if (*p >= '0' && *p <= '9') { + } else if (Isdigit(*p)) { unsigned long idx = strtoul(p, rest, 10); if (idx < asm_ops.cnt) return asm_ops.data[idx]; @@ -3662,7 +3662,7 @@ static void asm_body(Node *node) { p++; continue; } - if (*p == 'l' && (p[1] == '[' || (p[1] >= '0' && p[1] <= '9'))) { + if (*p == 'l' && (p[1] == '[' || Isdigit(p[1]))) { AsmParam *ap = find_op(p + 1, &p, node->asm_str, true); if (!ap->arg->unique_label) error_tok(ap->arg->tok, "not a label"); @@ -3685,7 +3685,7 @@ static void asm_body(Node *node) { mod = *p; p++; } - if (*p == '[' || (*p >= '0' && *p <= '9')) { + if (*p == '[' || Isdigit(*p)) { AsmParam *ap = find_op(p, &p, node->asm_str, false); char *punct = (mod == 'c') ? 
"" : "$"; diff --git a/tokenize.c b/tokenize.c index 599ddaf..66e28d2 100644 --- a/tokenize.c +++ b/tokenize.c @@ -143,9 +143,9 @@ static int read_ident(char *start) { } static int from_hex(char c) { - if ('0' <= c && c <= '9') + if (Inrange(c, '0', '9')) return c - '0'; - if ('a' <= c && c <= 'f') + if (Inrange(c, 'a', 'f')) return c - 'a' + 10; return c - 'A' + 10; } @@ -210,12 +210,12 @@ TokenKind ident_keyword(Token *tok) { } static int read_escaped_char(char **new_pos, char *p) { - if ('0' <= *p && *p <= '7') { + if (Inrange(*p, '0', '7')) { // Read an octal number. int c = *p++ - '0'; - if ('0' <= *p && *p <= '7') { + if (Inrange(*p, '0', '7')) { c = (c << 3) + (*p++ - '0'); - if ('0' <= *p && *p <= '7') + if (Inrange(*p, '0', '7')) c = (c << 3) + (*p++ - '0'); } *new_pos = p; From c97957f55f4eea66a297bde8b4ad791779040a59 Mon Sep 17 00:00:00 2001 From: fuhsnn <66062782+fuhsnn@users.noreply.github.com> Date: Thu, 21 Nov 2024 00:19:17 +0800 Subject: [PATCH 06/18] Micro-optimize tokenizer space skipping --- tokenize.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tokenize.c b/tokenize.c index 66e28d2..88f51ca 100644 --- a/tokenize.c +++ b/tokenize.c @@ -578,7 +578,7 @@ Token *tokenize(File *file, Token **end) { // Skip whitespace characters. 
if (*p == ' ' || *p == '\t' || *p =='\v' || *p == '\f') { - p++; + for (char c = *p; *(++p) == c;); has_space = true; continue; } From 004c4c9a594b4eb74eee6590cdeaaa6f229e69ad Mon Sep 17 00:00:00 2001 From: fuhsnn <66062782+fuhsnn@users.noreply.github.com> Date: Fri, 22 Nov 2024 02:10:22 +0800 Subject: [PATCH 07/18] Tweak previous commit --- slimcc.h | 1 - tokenize.c | 82 +++++++++++++++++++++++------------------------------- 2 files changed, 35 insertions(+), 48 deletions(-) diff --git a/slimcc.h b/slimcc.h index 54cf1e3..946c787 100644 --- a/slimcc.h +++ b/slimcc.h @@ -1,6 +1,5 @@ #define _POSIX_C_SOURCE 200809L #include -#include #include #include #include diff --git a/tokenize.c b/tokenize.c index 4ab997f..3552fcb 100644 --- a/tokenize.c +++ b/tokenize.c @@ -151,45 +151,33 @@ static int from_hex(char c) { } // Read a punctuator token from p and returns its length. -static int read_punct(const char *p) { - char c1; +static int read_punct(char *p) { + bool is_repeat = p[1] == *p; + bool is_assign = p[1] == '='; + switch (*p) { - case '<': // Pattern: < <= << <<= - c1 = *(p + 1); - if (c1 == '=') return 2; - if (c1 == '<') - return *(p + 2) == '=' ? 3 : 2; - return 1; - case '>': // Pattern: > >= >> >>= - c1 = *(p + 1); - if (c1 == '=') return 2; - if (c1 == '>') - return *(p + 2) == '=' ? 3 : 2; - return 1; - case '+': // Pattern: + ++ += - c1 = *(p + 1); - return (c1 == '+' || c1 == '=') ? 2 : 1; - case '-': // Pattern: - -- -= -> - c1 = *(p + 1); - return (c1 == '-' || c1 == '=' || c1 == '>') ? 2 : 1; - case '&': // Pattern: & &= && - c1 = *(p + 1); - return (c1 == '=' || c1 == '&') ? 2 : 1; - case '|': // Pattern: | |= || - c1 = *(p + 1); - return (c1 == '=' || c1 == '|') ? 2 : 1; - case '.': // Pattern: . ... - return (*(p + 1) == '.' && *(p + 2) == '.') ? 3 : 1; - case '=': // Pattern: = == - case '!': // Pattern: ! 
!= - case '*': // Pattern: * *= - case '/': // Pattern: / /= - case '%': // Pattern: % %= - case '^': // Pattern: ^ ^= - return *(p + 1) == '=' ? 2 : 1; - case '#': // Pattern: # ## - return *(p + 1) == '#' ? 2 : 1; - case '$': + case '-': + if (p[1] == '>') + return 2; + case '&': + case '+': + case '=': + case '|': + return (is_repeat | is_assign) + 1; + case '<': + case '>': + if (is_repeat) + return (p[2] == '=') + 2; + case '!': + case '%': + case '*': + case '/': + case '^': + return is_assign + 1; + case '#': + return is_repeat + 1; + case '.': + return (is_repeat && p[2] == *p) ? 3 : 1; case '(': case ')': case ',': @@ -198,8 +186,8 @@ static int read_punct(const char *p) { case '?': case '@': case '[': + case '\\': case ']': - case '_': case '`': case '{': case '}': @@ -654,6 +642,14 @@ Token *tokenize(File *file, Token **end) { continue; } + // Punctuators + int punct_len = read_punct(p); + if (punct_len) { + cur = cur->next = new_token(TK_PUNCT, p, p + punct_len); + p += cur->len; + continue; + } + // String literal if (*p == '"') { cur = cur->next = read_string_literal(p, p); @@ -727,14 +723,6 @@ Token *tokenize(File *file, Token **end) { continue; } - // Punctuators - int punct_len = read_punct(p); - if (punct_len) { - cur = cur->next = new_token(TK_PUNCT, p, p + punct_len); - p += cur->len; - continue; - } - error_at(p, "invalid token"); } From e5cca771a9e8940c68fe83e44f14c423baf9ea6a Mon Sep 17 00:00:00 2001 From: fuhsnn <66062782+fuhsnn@users.noreply.github.com> Date: Fri, 22 Nov 2024 04:49:46 +0800 Subject: [PATCH 08/18] Remove unnessasary strndup()'s --- preprocess.c | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/preprocess.c b/preprocess.c index 57bc5d9..137394f 100644 --- a/preprocess.c +++ b/preprocess.c @@ -20,13 +20,13 @@ typedef struct MacroParam MacroParam; struct MacroParam { MacroParam *next; - char *name; + Token *name; }; typedef struct MacroArg MacroArg; struct MacroArg { MacroArg *next; - char 
*name; + Token *name; bool is_va_args; bool omit_comma; Token *tok; @@ -42,7 +42,7 @@ struct Macro { Token *stop_tok; Macro *locked_next; MacroParam *params; - char *va_args_name; + Token *va_args_name; Token *body; macro_handler_fn *handler; }; @@ -384,7 +384,7 @@ static Macro *add_macro(char *name, bool is_objlike, Token *body) { return m; } -static MacroParam *read_macro_params(Token **rest, Token *tok, char **va_args_name) { +static MacroParam *read_macro_params(Token **rest, Token *tok, Token **va_args_name) { MacroParam head = {0}; MacroParam *cur = &head; @@ -393,7 +393,8 @@ static MacroParam *read_macro_params(Token **rest, Token *tok, char **va_args_na tok = skip(tok, ","); if (equal(tok, "...")) { - *va_args_name = "__VA_ARGS__"; + static Token va_args = {.loc = "__VA_ARGS__", .len = 11}; + *va_args_name = &va_args; *rest = skip(tok->next, ")"); return head.next; } @@ -402,13 +403,13 @@ static MacroParam *read_macro_params(Token **rest, Token *tok, char **va_args_na error_tok(tok, "expected an identifier"); if (equal(tok->next, "...")) { - *va_args_name = strndup(tok->loc, tok->len); + *va_args_name = tok; *rest = skip(tok->next->next, ")"); return head.next; } MacroParam *m = calloc(1, sizeof(MacroParam)); - m->name = strndup(tok->loc, tok->len); + m->name = tok; cur = cur->next = m; tok = tok->next; } @@ -425,7 +426,7 @@ static void read_macro_definition(Token **rest, Token *tok) { if (!tok->has_space && equal(tok, "(")) { // Function-like macro - char *va_args_name = NULL; + Token *va_args_name = NULL; MacroParam *params = read_macro_params(&tok, tok->next, &va_args_name); Macro *m = add_macro(name, false, split_line(rest, tok)); @@ -541,7 +542,7 @@ static MacroArg *find_va_arg(MacroArg *args) { static MacroArg *find_arg(Token **rest, Token *tok, MacroArg *args, Macro *m) { for (MacroArg *ap = args; ap; ap = ap->next) { - if (equal(tok, ap->name)) { + if (tok->len == ap->name->len && !memcmp(tok->loc, ap->name->loc, tok->len)) { if (rest) *rest = 
tok->next; return ap; From efcbb859b4848093b645211eee9c06574091a941 Mon Sep 17 00:00:00 2001 From: fuhsnn <66062782+fuhsnn@users.noreply.github.com> Date: Fri, 22 Nov 2024 06:19:38 +0800 Subject: [PATCH 09/18] Micro-optimize tokenizing ASCII identifiers --- tokenize.c | 39 +++++++++++++++++++++++++-------------- 1 file changed, 25 insertions(+), 14 deletions(-) diff --git a/tokenize.c b/tokenize.c index 3552fcb..c6a779d 100644 --- a/tokenize.c +++ b/tokenize.c @@ -127,18 +127,23 @@ static bool startswith(char *p, char *q) { // Read an identifier and returns the length of it. // If p does not point to a valid identifier, 0 is returned. -static int read_ident(char *start) { - char *p = start; - uint32_t c = decode_utf8(&p, p); - if (!is_ident1(c)) - return 0; +static int read_ident(char *p) { + char *start = p; - for (;;) { - char *q; - c = decode_utf8(&q, p); - if (!is_ident2(c)) - return p - start; - p = q; + for (bool is_first = true;; is_first = false) { + if (Isalnum(*p) || *p == '_' || *p == '$') { + p++; + continue; + } + if ((unsigned char)*p >= 128) { + char *pos; + uint32_t c = decode_utf8(&pos, p); + if (is_first ? 
is_ident1(c) : is_ident2(c)) { + p = pos; + continue; + } + } + return p - start; } } @@ -420,11 +425,17 @@ static Token *new_pp_number(char *start, char *p) { p += 2; continue; } - char *pos; - if (is_ident2(decode_utf8(&pos, p))) { - p = pos; + if (Isalnum(*p) || *p == '_' || *p == '$') { + p++; continue; } + if ((unsigned char)*p >= 128) { + char *pos; + if (is_ident2(decode_utf8(&pos, p))) { + p = pos; + continue; + } + } return new_token(TK_PP_NUM, start, p); } } From f65db8e9ac49d284e3c515e35784a94d5d90197b Mon Sep 17 00:00:00 2001 From: fuhsnn <66062782+fuhsnn@users.noreply.github.com> Date: Fri, 22 Nov 2024 16:51:42 +0800 Subject: [PATCH 10/18] Reduce strlen/strncmp calls --- tokenize.c | 37 ++++++++++++++++++------------------- 1 file changed, 18 insertions(+), 19 deletions(-) diff --git a/tokenize.c b/tokenize.c index c6a779d..f611f02 100644 --- a/tokenize.c +++ b/tokenize.c @@ -12,6 +12,9 @@ static bool at_bol; // True if the current position follows a space character static bool has_space; +#define startswith2(p, x, y) ((*(p) == x) && ((p)[1] == y)) +#define startswith3(p, x, y, z) ((*(p) == x) && ((p)[1] == y) && ((p)[2] == z)) + // Reports an error and exit. void error(char *fmt, ...) { va_list ap; @@ -627,7 +630,7 @@ Token *tokenize(File *file, Token **end) { } // Skip line comments. - if (startswith(p, "//")) { + if (startswith2(p, '/', '/')) { p += 2; while (*p != '\n') p++; @@ -636,7 +639,7 @@ Token *tokenize(File *file, Token **end) { } // Skip block comments. 
- if (startswith(p, "/*")) { + if (startswith2(p, '/', '*')) { char *q = strstr(p + 2, "*/"); if (!q) error_at(p, "unclosed block comment"); @@ -669,28 +672,28 @@ Token *tokenize(File *file, Token **end) { } // UTF-8 string literal - if (startswith(p, "u8\"")) { + if (startswith3(p, 'u', '8', '\"')) { cur = cur->next = read_string_literal(p, p + 2); p += cur->len; continue; } // UTF-16 string literal - if (startswith(p, "u\"")) { + if (startswith2(p, 'u', '\"')) { cur = cur->next = read_utf16_string_literal(p, p + 1); p += cur->len; continue; } // Wide string literal - if (startswith(p, "L\"")) { + if (startswith2(p, 'L', '\"')) { cur = cur->next = read_utf32_string_literal(p, p + 1, ty_int); p += cur->len; continue; } // UTF-32 string literal - if (startswith(p, "U\"")) { + if (startswith2(p, 'U', '\"')) { cur = cur->next = read_utf32_string_literal(p, p + 1, ty_uint); p += cur->len; continue; @@ -705,7 +708,7 @@ Token *tokenize(File *file, Token **end) { } // UTF-16 character literal - if (startswith(p, "u'")) { + if (startswith2(p, 'u', '\'')) { cur = cur->next = read_char_literal(p, p + 1, ty_ushort); cur->val &= 0xffff; p += cur->len; @@ -713,14 +716,14 @@ Token *tokenize(File *file, Token **end) { } // Wide character literal - if (startswith(p, "L'")) { + if (startswith2(p, 'L', '\'')) { cur = cur->next = read_char_literal(p, p + 1, ty_int); p += cur->len; continue; } // UTF-32 character literal - if (startswith(p, "U'")) { + if (startswith2(p, 'U', '\'')) { cur = cur->next = read_char_literal(p, p + 1, ty_uint); p += cur->len; continue; @@ -856,28 +859,24 @@ static void convert_universal_chars(char *p) { char *q = p; while (*p) { - if (startswith(p, "\\u")) { + if (startswith2(p, '\\', 'u')) { uint32_t c = read_universal_char(p + 2, 4); if (c) { p += 6; q += encode_utf8(q, c); - } else { - *q++ = *p++; + continue; } - } else if (startswith(p, "\\U")) { + } else if (startswith2(p, '\\', 'U')) { uint32_t c = read_universal_char(p + 2, 8); if (c) { p += 10; q 
+= encode_utf8(q, c); - } else { - *q++ = *p++; + continue; } - } else if (p[0] == '\\') { - *q++ = *p++; - *q++ = *p++; - } else { + } else if (*p == '\\') { *q++ = *p++; } + *q++ = *p++; } *q = '\0'; From 97fbcea599422a08b76e4dc217536eb9a76ed5dd Mon Sep 17 00:00:00 2001 From: fuhsnn <66062782+fuhsnn@users.noreply.github.com> Date: Fri, 22 Nov 2024 17:24:57 +0800 Subject: [PATCH 11/18] Rework integer literal suffix algorithm --- slimcc.h | 2 +- tokenize.c | 53 +++++++++++++++++++++-------------------------------- 2 files changed, 22 insertions(+), 33 deletions(-) diff --git a/slimcc.h b/slimcc.h index 946c787..14ff06a 100644 --- a/slimcc.h +++ b/slimcc.h @@ -11,7 +11,6 @@ #include #include #include -#include #include #include #include @@ -26,6 +25,7 @@ #define Isdigit(c) Inrange(c, '0', '9') #define Isalnum(c) (Inrange((c) | 0x20, 'a', 'z') || Isdigit(c)) #define Isxdigit(c) (Isdigit(c) || Inrange((c) | 0x20, 'a', 'f')) +#define Casecmp(c, a) (((c) | 0x20) == a) #if defined(__GNUC__) && (__GNUC__ >= 3) #define FMTCHK(x,y) __attribute__((format(printf,(x),(y)))) diff --git a/tokenize.c b/tokenize.c index f611f02..4e87f28 100644 --- a/tokenize.c +++ b/tokenize.c @@ -124,10 +124,6 @@ static Token *new_token(TokenKind kind, char *start, char *end) { return tok; } -static bool startswith(char *p, char *q) { - return strncmp(p, q, strlen(q)) == 0; -} - // Read an identifier and returns the length of it. // If p does not point to a valid identifier, 0 is returned. static int read_ident(char *p) { @@ -448,46 +444,39 @@ static bool convert_pp_int(Token *tok, char *loc, int len) { // Read a binary, octal, decimal or hexadecimal number. 
int base = 10; - if (!strncasecmp(p, "0x", 2) && Isxdigit(p[2])) { - p += 2; - base = 16; - } else if (!strncasecmp(p, "0b", 2) && (p[2] == '0' || p[2] == '1')) { - p += 2; - base = 2; - } else if (*p == '0') { - base = 8; + if (*p == '0') { + if (Casecmp(p[1], 'x') && Isxdigit(p[2])) { + p += 2; + base = 16; + } else if (Casecmp(p[1], 'b') && (p[2] == '0' || p[2] == '1')) { + p += 2; + base = 2; + } else { + base = 8; + } } int64_t val = strtoul(p, &p, base); // Read U, L or LL suffixes. - bool ll = false; - bool l = false; bool u = false; - - if (startswith(p, "LLU") || startswith(p, "LLu") || - startswith(p, "llU") || startswith(p, "llu") || - startswith(p, "ULL") || startswith(p, "Ull") || - startswith(p, "uLL") || startswith(p, "ull")) { - p += 3; - ll = u = true; - } else if (!strncasecmp(p, "lu", 2) || !strncasecmp(p, "ul", 2)) { - p += 2; - l = u = true; - } else if (startswith(p, "LL") || startswith(p, "ll")) { - p += 2; - ll = true; - } else if (*p == 'L' || *p == 'l') { - p++; - l = true; - } else if (*p == 'U' || *p == 'u') { - p++; + int l_cnt = 0; + if (Casecmp(*p, 'u')) { + if (Casecmp(p[1], 'l')) + l_cnt = 1 + (p[1] == p[2]); u = true; + } else if (Casecmp(*p, 'l')) { + l_cnt = 1 + (*p == p[1]); + u = Casecmp(p[l_cnt], 'u'); } + p += l_cnt + u; if (p != loc + len) return false; + bool ll = l_cnt == 2; + bool l = l_cnt == 1; + // Infer a type. Type *ty; if (base == 10) { From 3b70322601df92940cfb024a880da9f841d4f49d Mon Sep 17 00:00:00 2001 From: fuhsnn <66062782+fuhsnn@users.noreply.github.com> Date: Fri, 22 Nov 2024 21:06:13 +0800 Subject: [PATCH 12/18] Rework newline canonicalization --- tokenize.c | 65 ++++++++++++++++++++++++++---------------------------- 1 file changed, 31 insertions(+), 34 deletions(-) diff --git a/tokenize.c b/tokenize.c index 4e87f28..be4fc8d 100644 --- a/tokenize.c +++ b/tokenize.c @@ -789,48 +789,45 @@ File *new_file(char *name, int file_no, char *contents) { // Replaces \r or \r\n with \n. 
static void canonicalize_newline(char *p) { - int i = 0, j = 0; - - while (p[i]) { - if (p[i] == '\r' && p[i + 1] == '\n') { - i += 2; - p[j++] = '\n'; - } else if (p[i] == '\r') { - i++; - p[j++] = '\n'; - } else { - p[j++] = p[i++]; + char *first = strchr(p, '\r'); + if (first) { + char *q = p = first; + + while (*p) { + if (*p == '\r') { + *q++ = '\n'; + p += (p[1] == '\n') + 1; + continue; + } + *q++ = *p++; } - } - p[j] = '\0'; + *q = '\0'; + } } // Removes backslashes followed by a newline. static void remove_backslash_newline(char *p) { - int i = 0, j = 0; - - // We want to keep the number of newline characters so that - // the logical line number matches the physical one. - // This counter maintain the number of newlines we have removed. - int n = 0; - - while (p[i]) { - if (p[i] == '\\' && p[i + 1] == '\n') { - i += 2; - n++; - } else if (p[i] == '\n') { - p[j++] = p[i++]; - for (; n > 0; n--) - p[j++] = '\n'; - } else { - p[j++] = p[i++]; + char *first = strchr(p, '\\'); + if (first) { + char *q = p = first; + int n = 0; + + while (*p) { + if (*p == '\\' && p[1] == '\n') { + p += 2; + n++; + continue; + } + if (*p == '\n') { + for (; n > 0; n--) + *q++ = '\n'; + } + *q++ = *p++; } - } - for (; n > 0; n--) - p[j++] = '\n'; - p[j] = '\0'; + *q = '\0'; + } } static uint32_t read_universal_char(char *p, int len) { From 4a79cd939743c9b7e4ec0041727a75d6275addd8 Mon Sep 17 00:00:00 2001 From: fuhsnn <66062782+fuhsnn@users.noreply.github.com> Date: Fri, 22 Nov 2024 21:21:59 +0800 Subject: [PATCH 13/18] Fixup f65db8e --- tokenize.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/tokenize.c b/tokenize.c index be4fc8d..ada9d2c 100644 --- a/tokenize.c +++ b/tokenize.c @@ -859,8 +859,6 @@ static void convert_universal_chars(char *p) { q += encode_utf8(q, c); continue; } - } else if (*p == '\\') { - *q++ = *p++; } *q++ = *p++; } From 4d344382eb70db78c35ebdd4603d79fda0c4cd58 Mon Sep 17 00:00:00 2001 From: Jim Huang Date: Fri, 22 Nov 2024 22:07:00 +0800 Subject: 
[PATCH 14/18] Avoid expensive division in HashMap Previously, HashMap relied on division operations, which are expensive. This update replaces divisions with bitwise AND operations using an additional mask, improving performance. --- hashmap.c | 8 +++++--- slimcc.h | 1 + 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/hashmap.c b/hashmap.c index ae57b1d..365d59b 100644 --- a/hashmap.c +++ b/hashmap.c @@ -2,7 +2,7 @@ #include "slimcc.h" -// Initial hash bucket size +// Initial hash bucket size; must be power of 2 #define INIT_SIZE 16 // Rehash if the usage exceeds 70%. @@ -41,6 +41,7 @@ static void rehash(HashMap *map) { HashMap map2 = {0}; map2.buckets = calloc(cap, sizeof(HashEntry)); map2.capacity = cap; + map2.mask = cap - 1; for (int i = 0; i < map->capacity; i++) { HashEntry *ent = &map->buckets[i]; @@ -64,7 +65,7 @@ static HashEntry *get_entry(HashMap *map, char *key, int keylen) { uint64_t hash = fnv_hash(key, keylen); for (int i = 0; i < map->capacity; i++) { - HashEntry *ent = &map->buckets[(hash + i) % map->capacity]; + HashEntry *ent = &map->buckets[(hash + i) & map->mask]; if (match(ent, key, keylen)) return ent; if (ent->key == NULL) @@ -77,6 +78,7 @@ static HashEntry *get_or_insert_entry(HashMap *map, char *key, int keylen) { if (!map->buckets) { map->buckets = calloc(INIT_SIZE, sizeof(HashEntry)); map->capacity = INIT_SIZE; + map->mask = INIT_SIZE - 1; } else if ((map->used * 100) / map->capacity >= HIGH_WATERMARK) { rehash(map); } @@ -84,7 +86,7 @@ static HashEntry *get_or_insert_entry(HashMap *map, char *key, int keylen) { uint64_t hash = fnv_hash(key, keylen); for (int i = 0; i < map->capacity; i++) { - HashEntry *ent = &map->buckets[(hash + i) % map->capacity]; + HashEntry *ent = &map->buckets[(hash + i) & map->mask]; if (match(ent, key, keylen)) return ent; diff --git a/slimcc.h b/slimcc.h index 14ff06a..e39a548 100644 --- a/slimcc.h +++ b/slimcc.h @@ -65,6 +65,7 @@ typedef struct { HashEntry *buckets; int capacity; int used; + int 
mask; } HashMap; void *hashmap_get(HashMap *map, char *key); From 240f8546b88e643b87ddacebf6dd8d3af2651340 Mon Sep 17 00:00:00 2001 From: fuhsnn <66062782+fuhsnn@users.noreply.github.com> Date: Fri, 22 Nov 2024 22:27:51 +0800 Subject: [PATCH 15/18] Make Token struct packed --- slimcc.h | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/slimcc.h b/slimcc.h index 14ff06a..1c38bc6 100644 --- a/slimcc.h +++ b/slimcc.h @@ -30,6 +30,7 @@ #if defined(__GNUC__) && (__GNUC__ >= 3) #define FMTCHK(x,y) __attribute__((format(printf,(x),(y)))) #define NORETURN __attribute__((noreturn)) +#define PACKED __attribute__((packed)) #elif defined(__has_attribute) #if __has_attribute(format) #define FMTCHK(x,y) __attribute__((format(printf,(x),(y)))) @@ -37,6 +38,9 @@ #if __has_attribute(noreturn) #define NORETURN __attribute__((noreturn)) #endif +#if __has_attribute(packed) +#define PACKED __attribute__((packed)) +#endif #endif #ifndef FMTCHK @@ -45,6 +49,9 @@ #ifndef NORETURN #define NORETURN #endif +#ifndef PACKED +#define PACKED +#endif typedef struct Type Type; typedef struct Node Node; @@ -141,7 +148,7 @@ struct Token { Token *origin; // If this is expanded from a macro, the original token char *guard_file; // The path of a potentially include-guarded file Token *attr_next; -}; +} PACKED; void error(char *fmt, ...) FMTCHK(1,2) NORETURN; void error_at(char *loc, char *fmt, ...) FMTCHK(2,3) NORETURN; From d80d5903b38598d69b6a68f4f9019d2b5bb0316d Mon Sep 17 00:00:00 2001 From: Jim Huang Date: Sat, 23 Nov 2024 14:26:45 +0800 Subject: [PATCH 16/18] Expand macro only when preprocessor token is identifier In 'preprocess2' function, 'expand_macro' was called multiple times, some of which were unnecessary. This change ensures that 'expand_macro' is called only when the preprocessor token is an identifier. 
--- preprocess.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/preprocess.c b/preprocess.c index 137394f..7da5885 100644 --- a/preprocess.c +++ b/preprocess.c @@ -1077,7 +1077,7 @@ static Token *preprocess2(Token *tok) { for (; tok->kind != TK_EOF; pop_macro_lock(tok)) { // If it is a macro, expand it. - if (expand_macro(&tok, tok)) + if (tok->kind == TK_IDENT && expand_macro(&tok, tok)) continue; if (is_hash(tok) && !locked_macros) { From 754532413cac831ce33fb0b29780f469bdd63692 Mon Sep 17 00:00:00 2001 From: "ChAoS_UnItY (Kyle Lin)" Date: Sat, 23 Nov 2024 22:25:17 +0800 Subject: [PATCH 17/18] Add basic testing action for push and pr Co-authored-by: Jim Huang --- .github/workflows/main.yml | 37 +++++++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) create mode 100644 .github/workflows/main.yml diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml new file mode 100644 index 0000000..6c28664 --- /dev/null +++ b/.github/workflows/main.yml @@ -0,0 +1,37 @@ +name: Github Actions + +on: [push, pull_request] + +jobs: + host-x86: + runs-on: ubuntu-latest + strategy: + matrix: + compiler: [gcc, clang] + steps: + - name: Checkout code + uses: actions/checkout@v4 + - name: Install prerequisite + run: | + sudo apt-get update -q -y + sudo apt-get install -q -y file + sudo apt-get install -q -y build-essential + - name: Build stage 1 artifact + env: + CC: ${{ matrix.compiler }} + run: | + make clean + make slimcc + shell: bash + - name: Test stage 1 + run: | + make test || exit 1 + shell: bash + - name: Build stage 2 artifact + run: | + make stage2/slimcc + shell: bash + - name: Test stage 2 + run: | + make test-stage2 || exit 1 + shell: bash From 6a06d610c7e985ac9c88523adf07d9a056ab43fa Mon Sep 17 00:00:00 2001 From: Kyle Lin Date: Mon, 25 Nov 2024 23:01:23 +0800 Subject: [PATCH 18/18] Adopt cache-apt-pkgs, add curl 8.9.1 build & unit-test to workflow --- .github/workflows/main.yml | 49 ++++++++++++++++++++++++++++++++++---- 1 
file changed, 44 insertions(+), 5 deletions(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 6c28664..7b76550 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -3,7 +3,7 @@ name: Github Actions on: [push, pull_request] jobs: - host-x86: + host-x86-build: runs-on: ubuntu-latest strategy: matrix: @@ -12,10 +12,10 @@ jobs: - name: Checkout code uses: actions/checkout@v4 - name: Install prerequisite - run: | - sudo apt-get update -q -y - sudo apt-get install -q -y file - sudo apt-get install -q -y build-essential + uses: awalsh128/cache-apt-pkgs-action@latest + with: + packages: file build-essential clang + version: 1.0 - name: Build stage 1 artifact env: CC: ${{ matrix.compiler }} @@ -35,3 +35,42 @@ jobs: run: | make test-stage2 || exit 1 shell: bash + - run: mv slimcc slimcc-${{ matrix.compiler }} + - name: Upload artifact + uses: actions/upload-artifact@v4 + with: + name: slimcc-${{ matrix.compiler }} + path: slimcc-${{ matrix.compiler }} + + test-building-real-world-projects: + needs: host-x86-build + runs-on: ubuntu-latest + strategy: + matrix: + compiler: [gcc, clang] + steps: + - name: Checkout code + uses: actions/checkout@v4 + - name: Download artifact + uses: actions/download-artifact@v4 + with: + name: slimcc-${{ matrix.compiler }} + - run: | + chmod +x slimcc-${{ matrix.compiler }} + - uses: lukka/get-cmake@latest + with: + useLocalCache: true + - name: Install prerequisite + uses: awalsh128/cache-apt-pkgs-action@latest + with: + packages: file build-essential clang libssh2-1 + version: 1.0 + - name: Test Building curl 8.9.1 + run: | + git clone --depth 1 https://github.com/curl/curl --branch curl-8_9_1 + mkdir curl/cmakebuild + cd curl/cmakebuild + cmake ../ -DCMAKE_C_COMPILER=${GITHUB_WORKSPACE}/slimcc-${{ matrix.compiler }} -DCMAKE_C_FLAGS=-fPIC + make -j + make test-quiet -j + shell: bash