From c40c66d6b11885264a06a3e80740efa608912cf4 Mon Sep 17 00:00:00 2001
From: hrszpuk <107559570+hrszpuk@users.noreply.github.com>
Date: Sun, 5 Jan 2025 16:09:07 +0000
Subject: [PATCH 01/23] Removed token.cpp from CMake

---
 CMakeLists.txt | 1 -
 1 file changed, 1 deletion(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 84c0cb5..236b8a6 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -10,7 +10,6 @@ include_directories(include)
 set(SOURCES
     src/lexer.cpp
-    src/token.cpp
     src/parser.cpp
    src/codegen.cpp
    src/semantics.cpp

From 519696bd90f0918984918370ea134fa03735f7d4 Mon Sep 17 00:00:00 2001
From: hrszpuk <107559570+hrszpuk@users.noreply.github.com>
Date: Sun, 5 Jan 2025 16:09:23 +0000
Subject: [PATCH 02/23] Removed token.cpp from CMake test file

---
 tests/CMakeLists.txt | 1 -
 1 file changed, 1 deletion(-)

diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index ac473c9..4a1be1a 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -5,7 +5,6 @@ target_include_directories(run_tests PRIVATE ../include)
 target_sources(run_tests
     PRIVATE
         ../src/lexer.cpp
-        ../src/token.cpp
        ../src/parser.cpp
        ../src/semantics.cpp
 )

From 94d797ba90d09ca8f70ddb7202140b4cc13ee2b7 Mon Sep 17 00:00:00 2001
From: hrszpuk <107559570+hrszpuk@users.noreply.github.com>
Date: Sun, 5 Jan 2025 16:11:19 +0000
Subject: [PATCH 03/23] Added token_type_to_string for nicer error messages
 and debugging
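
token_type_to_string gives every TokenType a printable name, which the
test patches further down lean on when reporting mismatches. A minimal
usage sketch of the intended call pattern (illustrative only, not part
of this diff):

    Token t(TokenType::Let, "let");
    std::cerr << "unexpected " << token_type_to_string(t.type)
              << " at " << t.line << ":" << t.column << "\n";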
---
 include/dragon/token.h | 166 +++++++++++++++++++++++++++--------------
 1 file changed, 110 insertions(+), 56 deletions(-)

diff --git a/include/dragon/token.h b/include/dragon/token.h
index bd77ce9..36c4b92 100644
--- a/include/dragon/token.h
+++ b/include/dragon/token.h
@@ -1,62 +1,6 @@
 #pragma once
 
 #include <string>
-#include <format>
-
-enum class TokenType;
-
-class Token {
-public:
-    TokenType type;
-    std::string value;
-    size_t line;
-    size_t column;
-
-    Token(TokenType type, std::string value, size_t line, size_t column) {
-        this->type = type;
-        this->value = value;
-        this->line = line;
-        this->column = column;
-    }
-
-    Token(TokenType type, std::string value) {
-        this->type = type;
-        this->value = value;
-        this->line = 0;
-        this->column = 0;
-    }
-
-    Token(TokenType type) {
-        this->type = type;
-        this->value = "";
-        this->line = 0;
-        this->column = 0;
-    }
-
-    inline bool operator==(const Token& other) const {
-        return this->type == other.type && this->value == other.value;
-    }
-
-    inline bool operator!=(const Token& other) const {
-        return this->type != other.type || this->value != other.value;
-    }
-
-    inline std::string to_string() {
-        if (this->line == 0 && this->column == 0 && this->value == "") {
-            return std::format("Token({})", this->type);
-        }
-
-        if (this->line == 0 && this->column == 0) {
-            return std::format("Token({}, {})", this->type, this->value);
-        }
-
-        if (this->value == "") {
-            return std::format("Token({}, {}, {}, {})", this->type, this->line, this->column);
-        }
-
-        return std::format("Token({}, {}, {}, {})", this->type, this->value, this->line, this->column);
-    }
-};
 
 enum class TokenType {
     // Keywords
@@ -108,3 +52,113 @@ enum class TokenType {
     Comment,
     Unknown,
 };
+
+inline std::string token_type_to_string(TokenType type) {
+    switch (type) {
+        case TokenType::Let: return "Let";
+        case TokenType::Mut: return "Mut";
+        case TokenType::If: return "If";
+        case TokenType::Else: return "Else";
+        case TokenType::While: return "While";
+        case TokenType::For: return "For";
+        case TokenType::In: return "In";
+        case TokenType::True: return "True";
+        case TokenType::False: return "False";
+        case TokenType::IntegerLiteral: return "IntegerLiteral";
+        case TokenType::StringLiteral: return "StringLiteral";
+        case TokenType::Identifier: return "Identifier";
+        case TokenType::Plus: return "Plus";
+        case TokenType::Minus: return "Minus";
+        case TokenType::Star: return "Star";
+        case TokenType::Slash: return "Slash";
+        case TokenType::And: return "And";
+        case TokenType::Or: return "Or";
+        case TokenType::Not: return "Not";
+        case TokenType::Equals: return "Equals";
+        case TokenType::NotEquals: return "NotEquals";
+        case TokenType::LessThan: return "LessThan";
+        case TokenType::GreaterThan: return "GreaterThan";
+        case TokenType::LessThanOrEqualTo: return "LessThanOrEqualTo";
+        case TokenType::GreaterThanOrEqualTo: return "GreaterThanOrEqualTo";
+        case TokenType::Assign: return "Assign";
+        case TokenType::LeftParen: return "LeftParen";
+        case TokenType::RightParen: return "RightParen";
+        case TokenType::LeftBrace: return "LeftBrace";
+        case TokenType::RightBrace: return "RightBrace";
+        case TokenType::LeftBracket: return "LeftBracket";
+        case TokenType::RightBracket: return "RightBracket";
+        case TokenType::Comma: return "Comma";
+        case TokenType::Dot: return "Dot";
+        case TokenType::Range: return "Range";
+        case TokenType::Ampersand: return "Ampersand";
+        case TokenType::Pipe: return "Pipe";
+        case TokenType::Caret: return "Caret";
+        case TokenType::Tilde: return "Tilde";
+        case TokenType::Comment: return "Comment";
+        case TokenType::Unknown: return "Unknown";
+        default: return "Unknown";
+    }
+}
+
+class Token {
+public:
+    TokenType type;
+    std::string value;
+    size_t line;
+    size_t column;
+
+    Token(TokenType type, std::string value, size_t line, size_t column) {
+        this->type = type;
+        this->value = value;
+        this->line = line;
+        this->column = column;
+    }
+
+    Token(TokenType type, std::string value) {
+        this->type = type;
+        this->value = value;
+        this->line = 0;
+        this->column = 0;
+    }
+
+    Token(TokenType type) {
+        this->type = type;
+        this->value = "";
+        this->line = 0;
+        this->column = 0;
+    }
+
+    inline bool operator==(const Token& other) const {
+        return this->type == other.type && this->value == other.value;
+    }
+
+    inline bool operator!=(const Token& other) const {
+        return this->type != other.type || this->value != other.value;
+    }
+
+    inline std::string to_string() {
+        std::stringstream ss;
+        ss << "Token(" << token_type_to_string(this->type);
+        if (!this->value.empty()) {
+            ss << ", " << this->value;
+        }
+        if (this->line != 0 || this->column != 0) {
+            ss << ", " << this->line << ", " << this->column;
+        }
+        ss << ")";
+        return ss.str();
+    }
+};
+
+inline std::string token_vector_to_string(std::vector<Token> tokens) {
+    std::stringstream ss;
+    ss << "[";
+    for (size_t i = 0; i < tokens.size(); i++) {
+        ss << tokens[i].to_string();
+        if (i < tokens.size() - 1) {
+            ss << ", ";
+        }
+    }
+    ss << "]";
+    return ss.str();
+}
\ No newline at end of file

From 3f5e8a78a9263e405812a78d58a8cd10c67ba534 Mon Sep 17 00:00:00 2001
From: hrszpuk <107559570+hrszpuk@users.noreply.github.com>
Date: Sun, 5 Jan 2025 16:25:36 +0000
Subject: [PATCH 04/23] Improved debug error messages for numerous test cases
 (lexer)
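
A mismatch now reports both token streams instead of a bare boolean
failure. Roughly what a failing size assertion prints (output shape is
illustrative, not captured from a real run):

    Failed on input: let x int (received a size of 2, expected a size of 3)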
---
 tests/lexer_test.cpp | 128 +++++++++++++++++++++++++++++++++++++------
 1 file changed, 112 insertions(+), 16 deletions(-)

diff --git a/tests/lexer_test.cpp b/tests/lexer_test.cpp
index 4162e98..db3bcc0 100644
--- a/tests/lexer_test.cpp
+++ b/tests/lexer_test.cpp
@@ -394,7 +394,13 @@ TEST(LexerTests, Mixed) {

    std::vector<Token> tokens = lexer.lex(input);

-   ASSERT_TRUE(validTokens == tokens);
+   ASSERT_EQ(validTokens.size(), tokens.size())
+       << "Failed on input: " << input
+       << " (received a size of " << tokens.size() << ", expected a size of " << validTokens.size() << ")";
+
+   ASSERT_EQ(validTokens, tokens)
+       << "Failed on input: " << input
+       << " (received: " << token_vector_to_string(tokens) << ", expected: " << token_vector_to_string(validTokens) << ")";
 }

// let <identifier> <type>
TEST(LexerTests, VariableDeclarationWithoutExpr) {
    const std::string input = "let variable int";
    const std::vector<Token> validTokens = {
        Token(TokenType::Let, "let"),
        Token(TokenType::Identifier, "variable"),
        Token(TokenType::Identifier, "int")
    };

    Lexer lexer;
    std::vector<Token> tokens = lexer.lex(input);

-   ASSERT_TRUE(validTokens == tokens);
+   ASSERT_EQ(validTokens.size(), tokens.size())
+       << "Failed on input: " << input
+       << " (received a size of " << tokens.size() << ", expected a size of " << validTokens.size() << ")";
+
+   ASSERT_EQ(validTokens, tokens)
+       << "Failed on input: " << input
+       << " (received: " << token_vector_to_string(tokens) << ", expected: " << token_vector_to_string(validTokens) << ")";
 }

// let <identifier> <type> = <expression>
TEST (LexerTests, VariableDeclarationWithExpr) {
    const std::string input = "let variable int = 1 + 2";
    const std::vector<Token> validTokens = {
        Token(TokenType::Let, "let"),
        Token(TokenType::Identifier, "variable"),
        Token(TokenType::Identifier, "int"),
        Token(TokenType::Assign, "="),
        Token(TokenType::IntegerLiteral, "1"),
        Token(TokenType::Plus, "+"),
        Token(TokenType::IntegerLiteral, "2")
    };

    Lexer lexer;
    std::vector<Token> tokens = lexer.lex(input);

-   ASSERT_TRUE(validTokens == tokens);
+   ASSERT_EQ(validTokens.size(), tokens.size())
+       << "Failed on input: " << input
+       << " (received a size of " << tokens.size() << ", expected a size of " << validTokens.size() << ")";
+
+   ASSERT_EQ(validTokens, tokens)
+       << "Failed on input: " << input
+       << " (received: " << token_vector_to_string(tokens) << ", expected: " << token_vector_to_string(validTokens) << ")";
 }

// let <identifier> = <expression>
TEST(LexerTests, VariableDeclarationWithoutType) {
    const std::string input = "let variable = 1 + 2";
    const std::vector<Token> validTokens = {
        Token(TokenType::Let, "let"),
        Token(TokenType::Identifier, "variable"),
        Token(TokenType::Assign, "="),
        Token(TokenType::IntegerLiteral, "1"),
        Token(TokenType::Plus, "+"),
        Token(TokenType::IntegerLiteral, "2")
    };

    Lexer lexer;
    std::vector<Token> tokens = lexer.lex(input);

-   ASSERT_TRUE(validTokens == tokens);
+   ASSERT_EQ(validTokens.size(), tokens.size())
+       << "Failed on input: " << input
+       << " (received a size of " << tokens.size() << ", expected a size of " << validTokens.size() << ")";
+
+   ASSERT_EQ(validTokens, tokens)
+       << "Failed on input: " << input
+       << " (received: " << token_vector_to_string(tokens) << ", expected: " << token_vector_to_string(validTokens) << ")";
 }

// let mut <identifier> <type> = <expression>
TEST(LexerTests, MutableVariableDeclarationWithExpr) {
    const std::string input = "let mut variable int = 1 + 2";
    const std::vector<Token> validTokens = {
        Token(TokenType::Let, "let"),
        Token(TokenType::Mut, "mut"),
        Token(TokenType::Identifier, "variable"),
        Token(TokenType::Identifier, "int"),
        Token(TokenType::Assign, "="),
        Token(TokenType::IntegerLiteral, "1"),
        Token(TokenType::Plus, "+"),
        Token(TokenType::IntegerLiteral, "2")
    };

    Lexer lexer;
    std::vector<Token> tokens = lexer.lex(input);

-   ASSERT_TRUE(validTokens == tokens);
+   ASSERT_EQ(validTokens.size(), tokens.size())
+       << "Failed on input: " << input
+       << " (received a size of " << tokens.size() << ", expected a size of " << validTokens.size() << ")";
+
+   ASSERT_EQ(validTokens, tokens)
+       << "Failed on input: " << input
+       << " (received: " << token_vector_to_string(tokens) << ", expected: " << token_vector_to_string(validTokens) << ")";
 }

// let mut <identifier> <type>
TEST(LexerTests, MutableVariableDeclarationWithoutExpr) {
    const std::string input = "let mut variable int";
    const std::vector<Token> validTokens = {
        Token(TokenType::Let, "let"),
        Token(TokenType::Mut, "mut"),
        Token(TokenType::Identifier, "variable"),
        Token(TokenType::Identifier, "int")
    };

    Lexer lexer;
    std::vector<Token> tokens = lexer.lex(input);

-   ASSERT_TRUE(validTokens == tokens);
+   ASSERT_EQ(validTokens.size(), tokens.size())
+       << "Failed on input: " << input
+       << " (received a size of " << tokens.size() << ", expected a size of " << validTokens.size() << ")";
+
+   ASSERT_EQ(validTokens, tokens)
+       << "Failed on input: " << input
+       << " (received: " << token_vector_to_string(tokens) << ", expected: " << token_vector_to_string(validTokens) << ")";
 }

// let mut <identifier> = <expression>
TEST(LexerTests, MutableVariableDeclarationWithoutType) {
    const std::string input = "let mut variable = 1 + 2";
    const std::vector<Token> validTokens = {
        Token(TokenType::Let, "let"),
        Token(TokenType::Mut, "mut"),
        Token(TokenType::Identifier, "variable"),
        Token(TokenType::Assign, "="),
        Token(TokenType::IntegerLiteral, "1"),
        Token(TokenType::Plus, "+"),
        Token(TokenType::IntegerLiteral, "2")
    };

    Lexer lexer;
    std::vector<Token> tokens = lexer.lex(input);

-   ASSERT_TRUE(validTokens == tokens);
+   ASSERT_EQ(validTokens.size(), tokens.size())
+       << "Failed on input: " << input
+       << " (received a size of " << tokens.size() << ", expected a size of " << validTokens.size() << ")";
+
+   ASSERT_EQ(validTokens, tokens)
+       << "Failed on input: " << input
+       << " (received: " << token_vector_to_string(tokens) << ", expected: " << token_vector_to_string(validTokens) << ")";
 }

// <identifier> = <expression>
TEST(LexerTests, Assignment) {
    const std::string input = "variable = 1 + 2";
    const std::vector<Token> validTokens = {
        Token(TokenType::Identifier, "variable"),
        Token(TokenType::Assign, "="),
        Token(TokenType::IntegerLiteral, "1"),
        Token(TokenType::Plus, "+"),
        Token(TokenType::IntegerLiteral, "2")
    };

    Lexer lexer;
    std::vector<Token> tokens = lexer.lex(input);

-   ASSERT_TRUE(validTokens == tokens);
+   ASSERT_EQ(validTokens.size(), tokens.size())
+       << "Failed on input: " << input
+       << " (received a size of " << tokens.size() << ", expected a size of " << validTokens.size() << ")";
+
+   ASSERT_EQ(validTokens, tokens)
+       << "Failed on input: " << input
+       << " (received: " << token_vector_to_string(tokens) << ", expected: " << token_vector_to_string(validTokens) << ")";
 }

// <expression>, <expression>
TEST(LexerTests, Arguments) {
    const std::string input = "1 + 2, a, 1 + 3";
    const std::vector<Token> validTokens = {
        Token(TokenType::IntegerLiteral, "1"),
        Token(TokenType::Plus, "+"),
        Token(TokenType::IntegerLiteral, "2"),
        Token(TokenType::Comma, ","),
        Token(TokenType::Identifier, "a"),
        Token(TokenType::Comma, ","),
        Token(TokenType::IntegerLiteral, "1"),
        Token(TokenType::Plus, "+"),
        Token(TokenType::IntegerLiteral, "3")
    };

    Lexer lexer;
    std::vector<Token> tokens = lexer.lex(input);

-   ASSERT_TRUE(validTokens == tokens);
+   ASSERT_EQ(validTokens.size(), tokens.size())
+       << "Failed on input: " << input
+       << " (received a size of " << tokens.size() << ", expected a size of " << validTokens.size() << ")";
+
+   ASSERT_EQ(validTokens, tokens)
+       << "Failed on input: " << input
+       << " (received: " << token_vector_to_string(tokens) << ", expected: " << token_vector_to_string(validTokens) << ")";
 }

// <identifier>(<arguments>)
TEST(LexerTests, FunctionCall) {
    const std::string input = "print(1 + 2, a, 1 + 3)";
    const std::vector<Token> validTokens = {
        Token(TokenType::Identifier, "print"),
        Token(TokenType::LeftParen, "("),
        Token(TokenType::IntegerLiteral, "1"),
        Token(TokenType::Plus, "+"),
        Token(TokenType::IntegerLiteral, "2"),
        Token(TokenType::Comma, ","),
        Token(TokenType::Identifier, "a"),
        Token(TokenType::Comma, ","),
        Token(TokenType::IntegerLiteral, "1"),
        Token(TokenType::Plus, "+"),
        Token(TokenType::IntegerLiteral, "3"),
        Token(TokenType::RightParen, ")")
    };

    Lexer lexer;
    std::vector<Token> tokens = lexer.lex(input);

-   ASSERT_TRUE(validTokens == tokens);
+   ASSERT_EQ(validTokens.size(), tokens.size())
+       << "Failed on input: " << input
+       << " (received a size of " << tokens.size() << ", expected a size of " << validTokens.size() << ")";
+
+   ASSERT_EQ(validTokens, tokens)
+       << "Failed on input: " << input
+       << " (received: " << token_vector_to_string(tokens) << ", expected: " << token_vector_to_string(validTokens) << ")";
 }

// while <expression> { <statements> }
TEST(LexerTests, WhileLoop) {
    const std::string input = "while true { x = x + 1 }";
    const std::vector<Token> validTokens = {
        Token(TokenType::While, "while"),
        Token(TokenType::True, "true"),
        Token(TokenType::LeftBrace, "{"),
        Token(TokenType::Identifier, "x"),
        Token(TokenType::Assign, "="),
        Token(TokenType::Identifier, "x"),
        Token(TokenType::Plus, "+"),
        Token(TokenType::IntegerLiteral, "1"),
        Token(TokenType::RightBrace, "}")
    };

    Lexer lexer;
    std::vector<Token> tokens = lexer.lex(input);

-   ASSERT_TRUE(validTokens == tokens);
+   ASSERT_EQ(validTokens.size(), tokens.size())
+       << "Failed on input: " << input
+       << " (received a size of " << tokens.size() << ", expected a size of " << validTokens.size() << ")";
+
+   ASSERT_EQ(validTokens, tokens)
+       << "Failed on input: " << input
+       << " (received: " << token_vector_to_string(tokens) << ", expected: " << token_vector_to_string(validTokens) << ")";
 }

// for <identifier> in <expression> { <statements> }
TEST(LexerTests, ForLoop) {
    const std::string input = "for i in array { x = i + 1 }";
    const std::vector<Token> validTokens = {
        Token(TokenType::For, "for"),
        Token(TokenType::Identifier, "i"),
        Token(TokenType::In, "in"),
        Token(TokenType::Identifier, "array"),
        Token(TokenType::LeftBrace, "{"),
        Token(TokenType::Identifier, "x"),
        Token(TokenType::Assign, "="),
        Token(TokenType::Identifier, "i"),
        Token(TokenType::Plus, "+"),
        Token(TokenType::IntegerLiteral, "1"),
        Token(TokenType::RightBrace, "}")
    };

    Lexer lexer;
    std::vector<Token> tokens = lexer.lex(input);

-   ASSERT_TRUE(validTokens == tokens);
+   ASSERT_EQ(validTokens.size(), tokens.size())
+       << "Failed on input: " << input
+       << " (received a size of " << tokens.size() << ", expected a size of " << validTokens.size() << ")";
+
+   ASSERT_EQ(validTokens, tokens)
+       << "Failed on input: " << input
+       << " (received: " << token_vector_to_string(tokens) << ", expected: " << token_vector_to_string(validTokens) << ")";
 }

// for <identifier> in <expression> { <statements> }
TEST(LexerTests, ForLoopWithExpr) {
    const std::string input = "for i in 0..10 { x = i + 1 }";
    const std::vector<Token> validTokens = {
        Token(TokenType::For, "for"),
        Token(TokenType::Identifier, "i"),
        Token(TokenType::In, "in"),
        Token(TokenType::IntegerLiteral, "0"),
        Token(TokenType::Range, ".."),
        Token(TokenType::IntegerLiteral, "10"),
        Token(TokenType::LeftBrace, "{"),
        Token(TokenType::Identifier, "x"),
        Token(TokenType::Assign, "="),
        Token(TokenType::Identifier, "i"),
        Token(TokenType::Plus, "+"),
        Token(TokenType::IntegerLiteral, "1"),
        Token(TokenType::RightBrace, "}")
    };

    Lexer lexer;
    std::vector<Token> tokens = lexer.lex(input);

-   ASSERT_TRUE(validTokens == tokens);
+   ASSERT_EQ(validTokens.size(), tokens.size())
+       << "Failed on input: " << input
+       << " (received a size of " << tokens.size() << ", expected a size of " << validTokens.size() << ")";
+
+   ASSERT_EQ(validTokens, tokens)
+       << "Failed on input: " << input
+       << " (received: " << token_vector_to_string(tokens) << ", expected: " << token_vector_to_string(validTokens) << ")";
 }

// if <expression> { <statements> }
TEST(LexerTests, IfStatement) {
    const std::string input = "if x == 1 { x = x + 1 }";
    const std::vector<Token> validTokens = {
        Token(TokenType::If, "if"),
        Token(TokenType::Identifier, "x"),
        Token(TokenType::Equals, "=="),
        Token(TokenType::IntegerLiteral, "1"),
        Token(TokenType::LeftBrace, "{"),
        Token(TokenType::Identifier, "x"),
        Token(TokenType::Assign, "="),
        Token(TokenType::Identifier, "x"),
        Token(TokenType::Plus, "+"),
        Token(TokenType::IntegerLiteral, "1"),
        Token(TokenType::RightBrace, "}")
    };

    Lexer lexer;
    std::vector<Token> tokens = lexer.lex(input);

-   ASSERT_TRUE(validTokens == tokens);
+   ASSERT_EQ(validTokens.size(), tokens.size())
+       << "Failed on input: " << input
+       << " (received a size of " << tokens.size() << ", expected a size of " << validTokens.size() << ")";
+
+   ASSERT_EQ(validTokens, tokens)
+       << "Failed on input: " << input
+       << " (received: " << token_vector_to_string(tokens) << ", expected: " << token_vector_to_string(validTokens) << ")";
 }

// else if <expression> { <statements> }
TEST(LexerTests, ElseIfStatement) {
    const std::string input = "if x == 1 { x = x + 1 } else if x == 2 { x = x - 1 }";
    const std::vector<Token> validTokens = {
        Token(TokenType::If, "if"),
        Token(TokenType::Identifier, "x"),
        Token(TokenType::Equals, "=="),
        Token(TokenType::IntegerLiteral, "1"),
        Token(TokenType::LeftBrace, "{"),
        Token(TokenType::Identifier, "x"),
        Token(TokenType::Assign, "="),
        Token(TokenType::Identifier, "x"),
        Token(TokenType::Plus, "+"),
        Token(TokenType::IntegerLiteral, "1"),
        Token(TokenType::RightBrace, "}"),
        Token(TokenType::Else, "else"),
        Token(TokenType::If, "if"),
        Token(TokenType::Identifier, "x"),
        Token(TokenType::Equals, "=="),
        Token(TokenType::IntegerLiteral, "2"),
        Token(TokenType::LeftBrace, "{"),
        Token(TokenType::Identifier, "x"),
        Token(TokenType::Assign, "="),
        Token(TokenType::Identifier, "x"),
        Token(TokenType::Minus, "-"),
        Token(TokenType::IntegerLiteral, "1"),
        Token(TokenType::RightBrace, "}")
    };

    Lexer lexer;
    std::vector<Token> tokens = lexer.lex(input);

-   ASSERT_TRUE(validTokens == tokens);
+   ASSERT_EQ(validTokens.size(), tokens.size())
+       << "Failed on input: " << input
+       << " (received a size of " << tokens.size() << ", expected a size of " << validTokens.size() << ")";
+
+   ASSERT_EQ(validTokens, tokens)
+       << "Failed on input: " << input
+       << " (received: " << token_vector_to_string(tokens) << ", expected: " << token_vector_to_string(validTokens) << ")";
 }

// else { <statements> }
TEST(LexerTests, ElseStatement) {
    const std::string input = "if x == 1 { x = x + 1 } else { x = x - 1 }";
    const std::vector<Token> validTokens = {
        Token(TokenType::If, "if"),
        Token(TokenType::Identifier, "x"),
        Token(TokenType::Equals, "=="),
        Token(TokenType::IntegerLiteral, "1"),
        Token(TokenType::LeftBrace, "{"),
        Token(TokenType::Identifier, "x"),
        Token(TokenType::Assign, "="),
        Token(TokenType::Identifier, "x"),
        Token(TokenType::Plus, "+"),
        Token(TokenType::IntegerLiteral, "1"),
        Token(TokenType::RightBrace, "}"),
        Token(TokenType::Else, "else"),
        Token(TokenType::LeftBrace, "{"),
        Token(TokenType::Identifier, "x"),
        Token(TokenType::Assign, "="),
        Token(TokenType::Identifier, "x"),
        Token(TokenType::Minus, "-"),
        Token(TokenType::IntegerLiteral, "1"),
        Token(TokenType::RightBrace, "}")
    };

    Lexer lexer;
    std::vector<Token> tokens = lexer.lex(input);

-   ASSERT_TRUE(validTokens == tokens);
+   ASSERT_EQ(validTokens.size(), tokens.size())
+       << "Failed on input: " << input
+       << " (received a size of " << tokens.size() << ", expected a size of " << validTokens.size() << ")";
+
+   ASSERT_EQ(validTokens, tokens)
+       << "Failed on input: " << input
+       << " (received: " << token_vector_to_string(tokens) << ", expected: " << token_vector_to_string(validTokens) << ")";
 }
\ No newline at end of file

From 207229eeaddc5711ad8e1bd0776e83a7f7bc9945 Mon Sep 17 00:00:00 2001
From: hrszpuk <107559570+hrszpuk@users.noreply.github.com>
Date: Sun, 5 Jan 2025 16:26:27 +0000
Subject: [PATCH 05/23] Renamed all expression test cases to be more
 descriptive

---
 tests/lexer_test.cpp | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/tests/lexer_test.cpp b/tests/lexer_test.cpp
index db3bcc0..7fb2ae8 100644
--- a/tests/lexer_test.cpp
+++ b/tests/lexer_test.cpp
@@ -200,7 +200,7 @@ TEST(LexerTests, MultiLineComments) {

// <expression>
// Arithmetic
-TEST(LexerTests, Arithmetic) {
+TEST(LexerTests, ArithmeticExpression) {
    const std::array<std::string, 4> input = {
        "1 + 2",
        "1 - 2",
@@ -239,7 +239,7 @@ TEST(LexerTests, Arithmetic) {

// <expression>
// Boolean
-TEST(LexerTests, Boolean) {
+TEST(LexerTests, BooleanExpression) {
    const std::array<std::string, 4> input = {
        "true && false",
        "true || false",
@@ -278,7 +278,7 @@ TEST(LexerTests, Boolean) {

// <expression>
// Relational
-TEST(LexerTests, Relational) {
+TEST(LexerTests, RelationalExpression) {
    const std::array<std::string, 4> input = {
        "1 < 2",
        "1 > 2",
@@ -317,7 +317,7 @@ TEST(LexerTests, Relational) {

// <expression>
// bitwise
-TEST(LexerTests, Bitwise) {
+TEST(LexerTests, BitwiseExpression) {
    const std::array<std::string, 4> input = {
        "1 & 2",
        "1 | 2",
@@ -355,7 +355,7 @@ TEST(LexerTests, Bitwise) {

// <expression>
// Mixed
-TEST(LexerTests, Mixed) {
+TEST(LexerTests, MixedExpression) {
    const std::string input = "1 + 2 * 3 / 4 - 5 == !true && 7 < 8 || 9 > 10 && 11 <= 12 | 13 & 14 ^ 15";
    const std::vector<Token> validTokens = {
        Token(TokenType::IntegerLiteral, "1"),

From 465a19002973aabe32e12b74daf65e7eab405c47 Mon Sep 17 00:00:00 2001
From: hrszpuk <107559570+hrszpuk@users.noreply.github.com>
Date: Sun, 5 Jan 2025 16:27:26 +0000
Subject: [PATCH 06/23] Improved error debugging messages for <Identifier> and
 <Keywords>

---
 tests/lexer_test.cpp | 22 ++++++++++++++++++----
 1 file changed, 18 insertions(+), 4 deletions(-)

diff --git a/tests/lexer_test.cpp b/tests/lexer_test.cpp
index 7fb2ae8..3e867a6 100644
--- a/tests/lexer_test.cpp
+++ b/tests/lexer_test.cpp
@@ -31,8 +31,15 @@ TEST(LexerTests, Identifier) {
    Lexer lexer;

    for (size_t i = 0; i < input.size(); i++) {
-       Token t = lexer.lex_identifier(input[i]);
-       ASSERT_TRUE(validTokens[i] == t);
+       std::vector<Token> tokens = lexer.lex(input[i]);
+
+       ASSERT_EQ(tokens.size(), 1)
+           << "Failed on input: " << input[i]
+           << " (received a size of " << tokens.size() << ", expected a size of 1)";
+
+       ASSERT_EQ(validTokens[i], tokens[0])
+           << "Failed on input: " << input[i]
+           << " (received: " << token_vector_to_string(tokens) << ", expected: " << validTokens[i].to_string() << ")";
    }
 }

@@ -63,8 +70,15 @@ TEST(LexerTests, Keywords) {
    Lexer lexer;

    for (size_t i = 0; i < input.size(); i++) {
-       Token t = lexer.lex_identifier(input[i]);
-       ASSERT_TRUE(validTokens[i] == t);
+       std::vector<Token> tokens = lexer.lex(input[i]);
+
+       ASSERT_EQ(tokens.size(), 1)
+           << "Failed on input: " << input[i]
+           << " (received a size of " << tokens.size() << ", expected a size of 1)";
+
+       ASSERT_EQ(validTokens[i], tokens[0])
+           << "Failed on input: " << input[i]
+           << " (received: " << token_vector_to_string(tokens) << ", expected: " << validTokens[i].to_string() << ")";
    }
 }

From b2b5207dc232e84bef61ff5b8e0c64775fb4dcb3 Mon Sep 17 00:00:00 2001
From: hrszpuk <107559570+hrszpuk@users.noreply.github.com>
Date: Sun, 5 Jan 2025 16:28:02 +0000
Subject: [PATCH 07/23] <String> and <Integer> literals have updated debug
 messages

---
 tests/lexer_test.cpp | 22 ++++++++++++++++++----
 1 file changed, 18 insertions(+), 4 deletions(-)

diff --git a/tests/lexer_test.cpp b/tests/lexer_test.cpp
index 3e867a6..274024b 100644
--- a/tests/lexer_test.cpp
+++ b/tests/lexer_test.cpp
@@ -99,8 +99,15 @@ TEST(LexerTests, StringLiterals) {
    Lexer lexer;

    for (size_t i = 0; i < input.size(); i++) {
-       Token t = lexer.lex_string(input[i]);
-       ASSERT_TRUE(validTokens[i] == t);
+       std::vector<Token> tokens = lexer.lex(input[i]);
+
+       ASSERT_EQ(tokens.size(), 1)
+           << "Failed on input: " << input[i]
+           << " (received a size of " << tokens.size() << ", expected a size of 1)";
+
+       ASSERT_EQ(validTokens[i], tokens[0])
+           << "Failed on input: " << input[i]
+           << " (received: " << token_vector_to_string(tokens) << ", expected: " << validTokens[i].to_string() << ")";
    }
 }

@@ -122,8 +129,15 @@ TEST(LexerTests, Integer) {
    Lexer lexer;

    for (size_t i = 0; i < input.size(); i++) {
-       Token t = lexer.lex_number(input[i]);
-       ASSERT_TRUE(validTokens[i] == t);
+       std::vector<Token> tokens = lexer.lex(input[i]);
+
+       ASSERT_EQ(tokens.size(), 1)
+           << "Failed on input: " << input[i]
+           << " (received a size of " << tokens.size() << ", expected a size of 1)";
+
+       ASSERT_EQ(validTokens[i], tokens[0])
+           << "Failed on input: " << input[i]
+           << " (received: " << token_vector_to_string(tokens) << ", expected: " << validTokens[i].to_string() << ")";
    }
 }

From 8f67d1068735a54b164c54abdf3fdec1c15c437a Mon Sep 17 00:00:00 2001
From: hrszpuk <107559570+hrszpuk@users.noreply.github.com>
Date: Sun, 5 Jan 2025 16:28:43 +0000
Subject: [PATCH 08/23] Both single- and multi-line comments now provide
 better error messages (testing)

---
 tests/lexer_test.cpp | 22 ++++++++++++++++++----
 1 file changed, 18 insertions(+), 4 deletions(-)

diff --git a/tests/lexer_test.cpp b/tests/lexer_test.cpp
index 274024b..6ea43ad 100644
--- a/tests/lexer_test.cpp
+++ b/tests/lexer_test.cpp
@@ -204,8 +204,15 @@ TEST(LexerTests, SingleLineComments) {
    Lexer lexer;

    for (size_t i = 0; i < input.size(); i++) {
-       Token t = lexer.lex_comment(input[i]);
-       ASSERT_TRUE(validTokens[i] == t);
+       std::vector<Token> tokens = lexer.lex(input[i]);
+
+       ASSERT_EQ(tokens.size(), 1)
+           << "Failed on input: " << input[i]
+           << " (received a size of " << tokens.size() << ", expected a size of 1)";
+
+       ASSERT_EQ(validTokens[i], tokens[0])
+           << "Failed on input: " << input[i]
+           << " (received: " << token_vector_to_string(tokens) << ", expected: " << validTokens[i].to_string() << ")";
    }
 }

@@ -221,8 +228,15 @@ TEST(LexerTests, MultiLineComments) {
    Lexer lexer;

    for (size_t i = 0; i < input.size(); i++) {
-       Token t = lexer.lex_comment(input[i]);
-       ASSERT_TRUE(validTokens[i] == t);
+       std::vector<Token> tokens = lexer.lex(input[i]);
+
+       ASSERT_EQ(tokens.size(), 1)
+           << "Failed on input: " << input[i]
+           << " (received a size of " << tokens.size() << ", expected a size of 1)";
+
+       ASSERT_EQ(validTokens[i], tokens[0])
+           << "Failed on input: " << input[i]
+           << " (received: " << token_vector_to_string(tokens) << ", expected: " << validTokens[i].to_string() << ")";
    }
 }

From 6d52c1e84f4ced08b32c8b2747b3398835b5f79a Mon Sep 17 00:00:00 2001
From: hrszpuk <107559570+hrszpuk@users.noreply.github.com>
Date: Sun, 5 Jan 2025 16:29:15 +0000
Subject: [PATCH 09/23] Bug fixes and minor changes to some test cases

---
 tests/lexer_test.cpp | 370 +++++++++++++++++++++----------------------
 1 file changed, 178 insertions(+), 192 deletions(-)

diff --git a/tests/lexer_test.cpp b/tests/lexer_test.cpp
index 6ea43ad..39be1e6 100644
--- a/tests/lexer_test.cpp
+++ b/tests/lexer_test.cpp
@@ -1,32 +1,26 @@
 #include <gtest/gtest.h>
 #include <array>
-
+#include <vector>
+#include <string>
+#include <sstream>
 #include "dragon/lexer.h"
 #include "dragon/token.h"

// <identifier>
TEST(LexerTests, Identifier) {
-   const std::array<std::string, 9> input = {
-       "x",
+   const std::vector<std::string> input = {
+       "x",
        "x_y123",
        "reallyLongVariableNameWithNoNumbersOrUnderscores",
        "U_ND_ER_SCO_RES",
        "____starting___with__underscore",
-       "2thisShouldError_", // Identifiers cannot start with a number
-       "this should also error", // Variables cannot contain spaces
-       "Error?", // Identifiers cannot contain question marks
-       "#*&$£!!!", // Identifiers cannot contain any of these symbols
    };
-   const std::vector<Token> validTokens = {
-       Token(TokenType::Identifier, "x"),
-       Token(TokenType::Identifier, "x_y123"),
-       Token(TokenType::Identifier, "reallyLongVariableNameWithNoNumbersOrUnderscores"),
-       Token(TokenType::Identifier, "U_ND_ER_SCO_RES"),
-       Token(TokenType::Identifier, "____starting___with__underscore"),
-       Token(TokenType::Unknown, "2thisShouldError_"),
-       Token(TokenType::Unknown, "this should also error"),
-       Token(TokenType::Unknown, "Error?"),
-       Token(TokenType::Unknown, "#*&$£!!!")
+   std::vector<Token> validTokens = {
+       Token(TokenType::Identifier, "x"),
+       Token(TokenType::Identifier, "x_y123"),
+       Token(TokenType::Identifier, "reallyLongVariableNameWithNoNumbersOrUnderscores"),
+       Token(TokenType::Identifier, "U_ND_ER_SCO_RES"),
+       Token(TokenType::Identifier, "____starting___with__underscore"),
    };

    Lexer lexer;

    for (size_t i = 0; i < input.size(); i++) {
        std::vector<Token> tokens = lexer.lex(input[i]);

        ASSERT_EQ(tokens.size(), 1)
            << "Failed on input: " << input[i]
            << " (received a size of " << tokens.size() << ", expected a size of 1)";

        ASSERT_EQ(validTokens[i], tokens[0])
            << "Failed on input: " << input[i]
            << " (received: " << token_vector_to_string(tokens) << ", expected: " << validTokens[i].to_string() << ")";
    }
}

// <keyword>
TEST(LexerTests, Keywords) {
-   const std::array<std::string, 9> input = {
+   const std::vector<std::string> input = {
        "let",
        "mut",
        "if",
        "else",
        "for",
        "in",
        "true",
        "false",
        "while",
    };
-   const std::vector<Token> validTokens = {
+   std::vector<Token> validTokens = {
        Token(TokenType::Let, "let"),
        Token(TokenType::Mut, "mut"),
        Token(TokenType::If, "if"),
        Token(TokenType::Else, "else"),
        Token(TokenType::For, "for"),
        Token(TokenType::In, "in"),
        Token(TokenType::True, "true"),
        Token(TokenType::False, "false"),
        Token(TokenType::While, "while"),
    };

    Lexer lexer;

    for (size_t i = 0; i < input.size(); i++) {
        std::vector<Token> tokens = lexer.lex(input[i]);

        ASSERT_EQ(tokens.size(), 1)
            << "Failed on input: " << input[i]
            << " (received a size of " << tokens.size() << ", expected a size of 1)";

        ASSERT_EQ(validTokens[i], tokens[0])
            << "Failed on input: " << input[i]
            << " (received: " << token_vector_to_string(tokens) << ", expected: " << validTokens[i].to_string() << ")";
-   }
+   }
}

// <string literal>
TEST(LexerTests, StringLiterals) {
-   const std::array<std::string, 3> input = {
+   const std::vector<std::string> input = {
        "\"Enter username: \"",
        "\"This is a string with a escape characters \\\" \\n \\t \"",
        "\"Abcdefghijklmnopqrstuvwxyz @#][{};;@'><,.//?)(*&^%$£1234567890+_-=`¬\\|\""
    };
-   const std::array<Token, 3> validTokens = {
+   std::vector<Token> validTokens = {
        Token(TokenType::StringLiteral, "Enter username: "),
        Token(TokenType::StringLiteral, "This is a string with a escape characters \" \n \t "),
-       Token(TokenType::StringLiteral, "Abcdefghijklmnopqrstuvwxyz @#][{};;@'><,.//?)(*&^%$£1234567890+_-=`¬\\|")
-   };
+       Token(TokenType::StringLiteral, "Abcdefghijklmnopqrstuvwxyz @#][{};;@'><,.//?)(*&^%$£1234567890+_-=`¬\\|")};

    Lexer lexer;

    for (size_t i = 0; i < input.size(); i++) {
        std::vector<Token> tokens = lexer.lex(input[i]);

        ASSERT_EQ(tokens.size(), 1)
            << "Failed on input: " << input[i]
            << " (received a size of " << tokens.size() << ", expected a size of 1)";

        ASSERT_EQ(validTokens[i], tokens[0])
            << "Failed on input: " << input[i]
            << " (received: " << token_vector_to_string(tokens) << ", expected: " << validTokens[i].to_string() << ")";
    }
}

// <integer>
TEST(LexerTests, Integer) {
-   const std::array<std::string, 4> input = {
+   const std::vector<std::string> input = {
        "1",
        "123",
        "0",
        "1_000_000", // Underscores are allowed, but are ignored
    };
-   const std::array<Token, 4> validTokens = {
+   std::vector<Token> validTokens = {
        Token(TokenType::IntegerLiteral, "1"),
        Token(TokenType::IntegerLiteral, "123"),
        Token(TokenType::IntegerLiteral, "0"),
-       Token(TokenType::IntegerLiteral, "1_000_000")
-   };
+       Token(TokenType::IntegerLiteral, "1000000")};

    Lexer lexer;

    for (size_t i = 0; i < input.size(); i++) {
        std::vector<Token> tokens = lexer.lex(input[i]);

        ASSERT_EQ(tokens.size(), 1)
            << "Failed on input: " << input[i]
            << " (received a size of " << tokens.size() << ", expected a size of 1)";

        ASSERT_EQ(validTokens[i], tokens[0])
            << "Failed on input: " << input[i]
            << " (received: " << token_vector_to_string(tokens) << ", expected: " << validTokens[i].to_string() << ")";
    }
}

// Test for all symbols
TEST(LexerTests, Symbols) {
-   const std::array<std::string, 17> input = {
+   const std::vector<std::string> input = {
        "+",
        "-",
        "*",
        "/",
        "!",
        "=",
        "<",
        ">",
        "(",
        ")",
        "{",
        "}",
        "[",
        "]",
        ",",
        "&",
        "|",
        "^",
-       "~"
-   };
-   const std::array<Token, 17> validTokens = {
+       "~"};
+   std::vector<Token> validTokens = {
        Token(TokenType::Plus, "+"),
        Token(TokenType::Minus, "-"),
        Token(TokenType::Star, "*"),
        Token(TokenType::Slash, "/"),
        Token(TokenType::Not, "!"),
        Token(TokenType::Assign, "="),
        Token(TokenType::LessThan, "<"),
        Token(TokenType::GreaterThan, ">"),
        Token(TokenType::LeftParen, "("),
        Token(TokenType::RightParen, ")"),
        Token(TokenType::LeftBrace, "{"),
        Token(TokenType::RightBrace, "}"),
        Token(TokenType::LeftBracket, "["),
        Token(TokenType::RightBracket, "]"),
        Token(TokenType::Comma, ","),
        Token(TokenType::Ampersand, "&"),
        Token(TokenType::Pipe, "|"),
        Token(TokenType::Caret, "^"),
-       Token(TokenType::Tilde, "~")
-   };
+       Token(TokenType::Tilde, "~")};

    Lexer lexer;

    for (size_t i = 0; i < input.size(); i++) {
-       Token t = lexer.lex_symbol(input[i]);
-       ASSERT_TRUE(validTokens[i] == t);
+       std::vector<Token> tokens = lexer.lex(input[i]);
+
+       ASSERT_EQ(tokens.size(), 1)
+           << "Failed on input: " << input[i]
+           << " (received a size of " << tokens.size() << ", expected a size of 1)";
+
+       ASSERT_EQ(validTokens[i], tokens[0])
+           << "Failed on input: " << input[i]
+           << " (received: " << token_vector_to_string(tokens) << ", expected: " << validTokens[i].to_string() << ")";
    }
}

+// <comment>
// Test for single-line comments
TEST(LexerTests, SingleLineComments) {
-   const std::array<std::string, 1> input = {
-       "// This is a comment"
-   };
-   const std::array<Token, 1> validTokens = {
-       Token(TokenType::Comment, "// This is a comment")
-   };
+   const std::vector<std::string> input = {
+       "// This is a comment"};
+   std::vector<Token> validTokens = {
+       Token(TokenType::Comment, "// This is a comment")};

    Lexer lexer;

    for (size_t i = 0; i < input.size(); i++) {
        std::vector<Token> tokens = lexer.lex(input[i]);

        ASSERT_EQ(tokens.size(), 1)
            << "Failed on input: " << input[i]
            << " (received a size of " << tokens.size() << ", expected a size of 1)";

        ASSERT_EQ(validTokens[i], tokens[0])
            << "Failed on input: " << input[i]
            << " (received: " << token_vector_to_string(tokens) << ", expected: " << validTokens[i].to_string() << ")";
    }
}

+// <comment>
// Test for multi-line comments
TEST(LexerTests, MultiLineComments) {
-   const std::array<std::string, 1> input = {
-       "/* This is a multi-line comment */"
-   };
-   const std::array<Token, 1> validTokens = {
-       Token(TokenType::Comment, "/* This is a multi-line comment */")
-   };
+   const std::vector<std::string> input = {
+       "/* This is a multi-line comment */"};
+   std::vector<Token> validTokens = {
+       Token(TokenType::Comment, "/* This is a multi-line comment */")};

    Lexer lexer;

    for (size_t i = 0; i < input.size(); i++) {
        std::vector<Token> tokens = lexer.lex(input[i]);

        ASSERT_EQ(tokens.size(), 1)
            << "Failed on input: " << input[i]
            << " (received a size of " << tokens.size() << ", expected a size of 1)";

        ASSERT_EQ(validTokens[i], tokens[0])
            << "Failed on input: " << input[i]
            << " (received: " << token_vector_to_string(tokens) << ", expected: " << validTokens[i].to_string() << ")";
    }
}

// <expression>
// Arithmetic
TEST(LexerTests, ArithmeticExpression) {
-   const std::array<std::string, 4> input = {
+   const std::vector<std::string> input = {
        "1 + 2",
        "1 - 2",
        "1 * 2",
-       "1 / 2"
-   };
+       "1 / 2"};
    const std::vector<std::vector<Token>> validTokens = {
        {Token(TokenType::IntegerLiteral, "1"),
         Token(TokenType::Plus, "+"),
         Token(TokenType::IntegerLiteral, "2")},
        {Token(TokenType::IntegerLiteral, "1"),
         Token(TokenType::Minus, "-"),
         Token(TokenType::IntegerLiteral, "2")},
        {Token(TokenType::IntegerLiteral, "1"),
         Token(TokenType::Star, "*"),
         Token(TokenType::IntegerLiteral, "2")},
        {Token(TokenType::IntegerLiteral, "1"),
         Token(TokenType::Slash, "/"),
         Token(TokenType::IntegerLiteral, "2")}};

    Lexer lexer;

    for (size_t i = 0; i < input.size(); i++)
    {
        std::vector<Token> tokens = lexer.lex(input[i]);

        ASSERT_EQ(validTokens[i].size(), tokens.size())
            << "Failed on input: " << (input[i])
            << " (received a size of " << tokens.size() << ", expected a size of " << validTokens[i].size() << ")";

        for (size_t j = 0; j < tokens.size(); j++)
        {
            ASSERT_EQ(validTokens[i][j], tokens[j])
                << "Failed on input: " << (input[i])
                << " (received: " << token_vector_to_string(tokens) << ", expected: " << token_vector_to_string(validTokens[i]) << ")";
        }
    }
}

// <expression>
// Boolean
TEST(LexerTests, BooleanExpression) {
-   const std::array<std::string, 4> input = {
+   const std::vector<std::string> input = {
        "true && false",
        "true || false",
        "!true",
-       "true == false"
-       "true != false"
-   };
+       "true == false",
+       "true != false"};
    const std::vector<std::vector<Token>> validTokens = {
        {Token(TokenType::True, "true"),
         Token(TokenType::And, "&&"),
         Token(TokenType::False, "false")},
        {Token(TokenType::True, "true"),
         Token(TokenType::Or, "||"),
         Token(TokenType::False, "false")},
        {Token(TokenType::Not, "!"),
         Token(TokenType::True, "true")},
        {Token(TokenType::True, "true"),
         Token(TokenType::Equals, "=="),
         Token(TokenType::False, "false")},
+       {Token(TokenType::True, "true"),
+        Token(TokenType::NotEquals, "!="),
+        Token(TokenType::False, "false")}};

    Lexer lexer;

    for (size_t i = 0; i < input.size(); i++)
    {
        std::vector<Token> tokens = lexer.lex(input[i]);

        ASSERT_EQ(validTokens[i].size(), tokens.size())
            << "Failed on input: " << (input[i])
            << " (received a size of " << tokens.size() << ", expected a size of " << validTokens[i].size() << ")";

        for (size_t j = 0; j < tokens.size(); j++)
        {
            ASSERT_EQ(validTokens[i][j], tokens[j])
                << "Failed on input: " << input[i]
                << " (received: " << token_vector_to_string(tokens) << ", expected: " << token_vector_to_string(validTokens[i]) << ")";
        }
    }
}

// <expression>
// Relational
TEST(LexerTests, RelationalExpression) {
-   const std::array<std::string, 4> input = {
+   const std::vector<std::string> input = {
        "1 < 2",
        "1 > 2",
        "1 <= 2",
-       "1 >= 2"
-   };
+       "1 >= 2"};
    const std::vector<std::vector<Token>> validTokens = {
        {Token(TokenType::IntegerLiteral, "1"),
         Token(TokenType::LessThan, "<"),
         Token(TokenType::IntegerLiteral, "2")},
        {Token(TokenType::IntegerLiteral, "1"),
         Token(TokenType::GreaterThan, ">"),
         Token(TokenType::IntegerLiteral, "2")},
        {Token(TokenType::IntegerLiteral, "1"),
         Token(TokenType::LessThanOrEqualTo, "<="),
         Token(TokenType::IntegerLiteral, "2")},
        {Token(TokenType::IntegerLiteral, "1"),
-        Token(TokenType::LessThanOrEqualTo, ">="),
+        Token(TokenType::GreaterThanOrEqualTo, ">="),
         Token(TokenType::IntegerLiteral, "2")}};

    Lexer lexer;

    for (size_t i = 0; i < input.size(); i++)
    {
        std::vector<Token> tokens = lexer.lex(input[i]);

        ASSERT_EQ(validTokens[i].size(), tokens.size())
            << "Failed on input: " << (input[i])
            << " (received a size of " << tokens.size() << ", expected a size of " << validTokens[i].size() << ")";

        for (size_t j = 0; j < tokens.size(); j++)
        {
            ASSERT_EQ(validTokens[i][j], tokens[j])
                << "Failed on input: " << input[i]
                << " (received: " << token_vector_to_string(tokens) << ", expected: " << token_vector_to_string(validTokens[i]) << ")";
        }
    }
}

// <expression>
// bitwise
TEST(LexerTests, BitwiseExpression) {
-   const std::array<std::string, 4> input = {
+   const std::vector<std::string> input = {
        "1 & 2",
        "1 | 2",
        "1 ^ 2",
-       "~1"
-   };
+       "~1"};
    const std::vector<std::vector<Token>> validTokens = {
        {Token(TokenType::IntegerLiteral, "1"),
         Token(TokenType::Ampersand, "&"),
         Token(TokenType::IntegerLiteral, "2")},
        {Token(TokenType::IntegerLiteral, "1"),
         Token(TokenType::Pipe, "|"),
         Token(TokenType::IntegerLiteral, "2")},
        {Token(TokenType::IntegerLiteral, "1"),
         Token(TokenType::Caret, "^"),
         Token(TokenType::IntegerLiteral, "2")},
        {Token(TokenType::Tilde, "~"),
         Token(TokenType::IntegerLiteral, "1")}};

    Lexer lexer;

    for (size_t i = 0; i < input.size(); i++)
    {
        std::vector<Token> tokens = lexer.lex(input[i]);

        ASSERT_EQ(validTokens[i].size(), tokens.size())
            << "Failed on input: " << (input[i])
            << " (received a size of " << tokens.size() << ", expected a size of " << validTokens[i].size() << ")";

        for (size_t j = 0; j < tokens.size(); j++)
        {
            ASSERT_EQ(validTokens[i][j], tokens[j])
                << "Failed on input: " << input[i]
                << " (received: " << token_vector_to_string(tokens) << ", expected: " << token_vector_to_string(validTokens[i]) << ")";
        }
    }
}

// <expression>
// Mixed
TEST(LexerTests, MixedExpression) {
-   const std::string input = "1 + 2 * 3 / 4 - 5 == !true && 7 < 8 || 9 > 10 && 11 <= 12 | 13 & 14 ^ 15";
+   const std::string input = "1 + 2 * 3 / 4 - 5 == !true && 7 < 8 || 9 > 10 && 11 <= 12 | 13 & 14 ^ 15";
    const std::vector<Token> validTokens = {
        Token(TokenType::IntegerLiteral, "1"),
        Token(TokenType::Plus, "+"),
        Token(TokenType::IntegerLiteral, "2"),
        Token(TokenType::Star, "*"),
        Token(TokenType::IntegerLiteral, "3"),
        Token(TokenType::Slash, "/"),
        Token(TokenType::IntegerLiteral, "4"),
        Token(TokenType::Minus, "-"),
        Token(TokenType::IntegerLiteral, "5"),
        Token(TokenType::Equals, "=="),
        Token(TokenType::Not, "!"),
        Token(TokenType::True, "true"),
        Token(TokenType::And, "&&"),
        Token(TokenType::IntegerLiteral, "7"),
        Token(TokenType::LessThan, "<"),
        Token(TokenType::IntegerLiteral, "8"),
        Token(TokenType::Or, "||"),
        Token(TokenType::IntegerLiteral, "9"),
        Token(TokenType::GreaterThan, ">"),
        Token(TokenType::IntegerLiteral, "10"),
        Token(TokenType::And, "&&"),
        Token(TokenType::IntegerLiteral, "11"),
        Token(TokenType::LessThanOrEqualTo, "<="),
        Token(TokenType::IntegerLiteral, "12"),
        Token(TokenType::Pipe, "|"),
        Token(TokenType::IntegerLiteral, "13"),
        Token(TokenType::Ampersand, "&"),
        Token(TokenType::IntegerLiteral, "14"),
        Token(TokenType::Caret, "^"),
-       Token(TokenType::IntegerLiteral, "15")
-   };
+       Token(TokenType::IntegerLiteral, "15")};

    Lexer lexer;

    std::vector<Token> tokens = lexer.lex(input);

    ASSERT_EQ(validTokens.size(), tokens.size())
        << "Failed on input: " << input
        << " (received a size of " << tokens.size() << ", expected a size of " << validTokens.size() << ")";

    ASSERT_EQ(validTokens, tokens)
        << "Failed on input: " << input
        << " (received: " << token_vector_to_string(tokens) << ", expected: " << token_vector_to_string(validTokens) << ")";
}

// let <identifier> <type>
TEST(LexerTests, VariableDeclarationWithoutExpr) {
    const std::string input = "let variable int";
    const std::vector<Token> validTokens = {
        Token(TokenType::Let, "let"),
        Token(TokenType::Identifier, "variable"),
-       Token(TokenType::Identifier, "int")
-   };
+       Token(TokenType::Identifier, "int")};

    Lexer lexer;
    std::vector<Token> tokens = lexer.lex(input);

    ASSERT_EQ(validTokens.size(), tokens.size())
        << "Failed on input: " << input
        << " (received a size of " << tokens.size() << ", expected a size of " << validTokens.size() << ")";

    ASSERT_EQ(validTokens, tokens)
        << "Failed on input: " << input
        << " (received: " << token_vector_to_string(tokens) << ", expected: " << token_vector_to_string(validTokens) << ")";
}

// let <identifier> <type> = <expression>
-TEST (LexerTests, VariableDeclarationWithExpr) {
+TEST(LexerTests, VariableDeclarationWithExpr) {
    const std::string input = "let variable int = 1 + 2";
    const std::vector<Token> validTokens = {
        Token(TokenType::Let, "let"),
        Token(TokenType::Identifier, "variable"),
        Token(TokenType::Identifier, "int"),
        Token(TokenType::Assign, "="),
        Token(TokenType::IntegerLiteral, "1"),
        Token(TokenType::Plus, "+"),
-       Token(TokenType::IntegerLiteral, "2")
-   };
+       Token(TokenType::IntegerLiteral, "2")};

    Lexer lexer;
    std::vector<Token> tokens = lexer.lex(input);

    ASSERT_EQ(validTokens.size(), tokens.size())
        << "Failed on input: " << input
        << " (received a size of " << tokens.size() << ", expected a size of " << validTokens.size() << ")";

    ASSERT_EQ(validTokens, tokens)
        << "Failed on input: " << input
        << " (received: " << token_vector_to_string(tokens) << ", expected: " << token_vector_to_string(validTokens) << ")";
}

// let <identifier> = <expression>
TEST(LexerTests, VariableDeclarationWithoutType) {
    const std::string input = "let variable = 1 + 2";
    const std::vector<Token> validTokens = {
        Token(TokenType::Let, "let"),
        Token(TokenType::Identifier, "variable"),
        Token(TokenType::Assign, "="),
        Token(TokenType::IntegerLiteral, "1"),
        Token(TokenType::Plus, "+"),
-       Token(TokenType::IntegerLiteral, "2")
-   };
+       Token(TokenType::IntegerLiteral, "2")};

    Lexer lexer;
    std::vector<Token> tokens = lexer.lex(input);

    ASSERT_EQ(validTokens.size(), tokens.size())
        << "Failed on input: " << input
        << " (received a size of " << tokens.size() << ", expected a size of " << validTokens.size() << ")";

    ASSERT_EQ(validTokens, tokens)
        << "Failed on input: " << input
        << " (received: " << token_vector_to_string(tokens) << ", expected: " << token_vector_to_string(validTokens) << ")";
}

// let mut <identifier> <type> = <expression>
TEST(LexerTests, MutableVariableDeclarationWithExpr) {
    const std::string input = "let mut variable int = 1 + 2";
    const std::vector<Token> validTokens = {
        Token(TokenType::Let, "let"),
        Token(TokenType::Mut, "mut"),
        Token(TokenType::Identifier, "variable"),
        Token(TokenType::Identifier, "int"),
        Token(TokenType::Assign, "="),
        Token(TokenType::IntegerLiteral, "1"),
        Token(TokenType::Plus, "+"),
-       Token(TokenType::IntegerLiteral, "2")
-   };
+       Token(TokenType::IntegerLiteral, "2")};

    Lexer lexer;
    std::vector<Token> tokens = lexer.lex(input);

    ASSERT_EQ(validTokens.size(), tokens.size())
        << "Failed on input: " << input
        << " (received a size of " << tokens.size() << ", expected a size of " << validTokens.size() << ")";

    ASSERT_EQ(validTokens, tokens)
        << "Failed on input: " << input
        << " (received: " << token_vector_to_string(tokens) << ", expected: " << token_vector_to_string(validTokens) << ")";
}

// let mut <identifier> <type>
TEST(LexerTests, MutableVariableDeclarationWithoutExpr) {
    const std::string input = "let mut variable int";
    const std::vector<Token> validTokens = {
        Token(TokenType::Let, "let"),
        Token(TokenType::Mut, "mut"),
        Token(TokenType::Identifier, "variable"),
-       Token(TokenType::Identifier, "int")
-   };
+       Token(TokenType::Identifier, "int")};

    Lexer lexer;
    std::vector<Token> tokens = lexer.lex(input);

    ASSERT_EQ(validTokens.size(), tokens.size())
        << "Failed on input: " << input
        << " (received a size of " << tokens.size() << ", expected a size of " << validTokens.size() << ")";

    ASSERT_EQ(validTokens, tokens)
        << "Failed on input: " << input
        << " (received: " << token_vector_to_string(tokens) << ", expected: " << token_vector_to_string(validTokens) << ")";
}

// let mut <identifier> = <expression>
TEST(LexerTests, MutableVariableDeclarationWithoutType) {
    const std::string input = "let mut variable = 1 + 2";
    const std::vector<Token> validTokens = {
        Token(TokenType::Let, "let"),
        Token(TokenType::Mut, "mut"),
        Token(TokenType::Identifier, "variable"),
        Token(TokenType::Assign, "="),
        Token(TokenType::IntegerLiteral, "1"),
        Token(TokenType::Plus, "+"),
-       Token(TokenType::IntegerLiteral, "2")
-   };
+       Token(TokenType::IntegerLiteral, "2")};

    Lexer lexer;
    std::vector<Token> tokens = lexer.lex(input);

    ASSERT_EQ(validTokens.size(), tokens.size())
        << "Failed on input: " << input
        << " (received a size of " << tokens.size() << ", expected a size of " << validTokens.size() << ")";

    ASSERT_EQ(validTokens, tokens)
        << "Failed on input: " << input
        << " (received: " << token_vector_to_string(tokens) << ", expected: " << token_vector_to_string(validTokens) << ")";
}

// <identifier> = <expression>
TEST(LexerTests, Assignment) {
    const std::string input = "variable = 1 + 2";
    const std::vector<Token> validTokens = {
        Token(TokenType::Identifier, "variable"),
        Token(TokenType::Assign, "="),
        Token(TokenType::IntegerLiteral, "1"),
        Token(TokenType::Plus, "+"),
-       Token(TokenType::IntegerLiteral, "2")
-   };
+       Token(TokenType::IntegerLiteral, "2")};

    Lexer lexer;
    std::vector<Token> tokens = lexer.lex(input);

    ASSERT_EQ(validTokens.size(), tokens.size())
        << "Failed on input: " << input
        << " (received a size of " << tokens.size() << ", expected a size of " << validTokens.size() << ")";

    ASSERT_EQ(validTokens, tokens)
        << "Failed on input: " << input
        << " (received: " << token_vector_to_string(tokens) << ", expected: " << token_vector_to_string(validTokens) << ")";
}

// <expression>, <expression>
TEST(LexerTests, Arguments) {
    const std::string input = "1 + 2, a, 1 + 3";
    const std::vector<Token> validTokens = {
        Token(TokenType::IntegerLiteral, "1"),
        Token(TokenType::Plus, "+"),
        Token(TokenType::IntegerLiteral, "2"),
        Token(TokenType::Comma, ","),
        Token(TokenType::Identifier, "a"),
        Token(TokenType::Comma, ","),
        Token(TokenType::IntegerLiteral, "1"),
        Token(TokenType::Plus, "+"),
-       Token(TokenType::IntegerLiteral, "3")
-   };
+       Token(TokenType::IntegerLiteral, "3")};

    Lexer lexer;
    std::vector<Token> tokens = lexer.lex(input);

    ASSERT_EQ(validTokens.size(), tokens.size())
        << "Failed on input: " << input
        << " (received a size of " << tokens.size() << ", expected a size of " << validTokens.size() << ")";

    ASSERT_EQ(validTokens, tokens)
        << "Failed on input: " << input
        << " (received: " << token_vector_to_string(tokens) << ", expected: " << token_vector_to_string(validTokens) << ")";
}

// <identifier>(<arguments>)
TEST(LexerTests, FunctionCall) {
    const std::string input = "print(1 + 2, a, 1 + 3)";
    const std::vector<Token> validTokens = {
        Token(TokenType::Identifier, "print"),
        Token(TokenType::LeftParen, "("),
        Token(TokenType::IntegerLiteral, "1"),
        Token(TokenType::Plus, "+"),
        Token(TokenType::IntegerLiteral, "2"),
        Token(TokenType::Comma, ","),
        Token(TokenType::Identifier, "a"),
        Token(TokenType::Comma, ","),
        Token(TokenType::IntegerLiteral, "1"),
        Token(TokenType::Plus, "+"),
        Token(TokenType::IntegerLiteral, "3"),
-       Token(TokenType::RightParen, ")")
-   };
+       Token(TokenType::RightParen, ")")};

    Lexer lexer;
    std::vector<Token> tokens = lexer.lex(input);

    ASSERT_EQ(validTokens.size(), tokens.size())
        << "Failed on input: " << input
        << " (received a size of " << tokens.size() << ", expected a size of " << validTokens.size() << ")";

    ASSERT_EQ(validTokens, tokens)
        << "Failed on input: " << input
        << " (received: " << token_vector_to_string(tokens) << ", expected: " << token_vector_to_string(validTokens) << ")";
}

// while <expression> { <statements> }
TEST(LexerTests, WhileLoop) {
    const std::string input = "while true { x = x + 1 }";
    const std::vector<Token> validTokens = {
        Token(TokenType::While, "while"),
        Token(TokenType::True, "true"),
        Token(TokenType::LeftBrace, "{"),
        Token(TokenType::Identifier, "x"),
        Token(TokenType::Assign, "="),
        Token(TokenType::Identifier, "x"),
        Token(TokenType::Plus, "+"),
        Token(TokenType::IntegerLiteral, "1"),
-       Token(TokenType::RightBrace, "}")
-   };
+       Token(TokenType::RightBrace, "}")};

    Lexer lexer;
    std::vector<Token> tokens = lexer.lex(input);

    ASSERT_EQ(validTokens.size(), tokens.size())
        << "Failed on input: " << input
        << " (received a size of " << tokens.size() << ", expected a size of " << validTokens.size() << ")";

    ASSERT_EQ(validTokens, tokens)
        << "Failed on input: " << input
        << " (received: " << token_vector_to_string(tokens) << ", expected: " << token_vector_to_string(validTokens) << ")";
}

// for <identifier> in <expression> { <statements> }
TEST(LexerTests, ForLoop) {
    const std::string input = "for i in array { x = i + 1 }";
    const std::vector<Token> validTokens = {
        Token(TokenType::For, "for"),
        Token(TokenType::Identifier, "i"),
        Token(TokenType::In, "in"),
        Token(TokenType::Identifier, "array"),
        Token(TokenType::LeftBrace, "{"),
        Token(TokenType::Identifier, "x"),
        Token(TokenType::Assign, "="),
        Token(TokenType::Identifier, "i"),
        Token(TokenType::Plus, "+"),
        Token(TokenType::IntegerLiteral, "1"),
-       Token(TokenType::RightBrace, "}")
-   };
+       Token(TokenType::RightBrace, "}")};

    Lexer lexer;
    std::vector<Token> tokens = lexer.lex(input);

    ASSERT_EQ(validTokens.size(), tokens.size())
        << "Failed on input: " << input
        << " (received a size of " << tokens.size() << ", expected a size of " << validTokens.size() << ")";

    ASSERT_EQ(validTokens, tokens)
        << "Failed on input: " << input
        << " (received: " << token_vector_to_string(tokens) << ", expected: " << token_vector_to_string(validTokens) << ")";
}

// for <identifier> in <expression> { <statements> }
TEST(LexerTests, ForLoopWithExpr) {
    const std::string input = "for i in 0..10 { x = i + 1 }";
    const std::vector<Token> validTokens = {
        Token(TokenType::For, "for"),
        Token(TokenType::Identifier, "i"),
        Token(TokenType::In, "in"),
        Token(TokenType::IntegerLiteral, "0"),
        Token(TokenType::Range, ".."),
        Token(TokenType::IntegerLiteral, "10"),
        Token(TokenType::LeftBrace, "{"),
        Token(TokenType::Identifier, "x"),
        Token(TokenType::Assign, "="),
        Token(TokenType::Identifier, "i"),
        Token(TokenType::Plus, "+"),
        Token(TokenType::IntegerLiteral, "1"),
-       Token(TokenType::RightBrace, "}")
-   };
+       Token(TokenType::RightBrace, "}")};

    Lexer lexer;
    std::vector<Token> tokens = lexer.lex(input);

    ASSERT_EQ(validTokens.size(), tokens.size())
        << "Failed on input: " << input
        << " (received a size of " << tokens.size() << ", expected a size of " << validTokens.size() << ")";

    ASSERT_EQ(validTokens, tokens)
        << "Failed on input: " << input
        << " (received: " << token_vector_to_string(tokens) << ", expected: " << token_vector_to_string(validTokens) << ")";
}

// if <expression> { <statements> }
TEST(LexerTests, IfStatement) {
    const std::string input = "if x == 1 { x = x + 1 }";
    const std::vector<Token> validTokens = {
        Token(TokenType::If, "if"),
        Token(TokenType::Identifier, "x"),
        Token(TokenType::Equals, "=="),
        Token(TokenType::IntegerLiteral, "1"),
        Token(TokenType::LeftBrace, "{"),
        Token(TokenType::Identifier, "x"),
        Token(TokenType::Assign, "="),
        Token(TokenType::Identifier, "x"),
        Token(TokenType::Plus, "+"),
        Token(TokenType::IntegerLiteral, "1"),
-       Token(TokenType::RightBrace, "}")
-   };
+       Token(TokenType::RightBrace, "}")};

    Lexer lexer;
    std::vector<Token> tokens = lexer.lex(input);

    ASSERT_EQ(validTokens.size(), tokens.size())
        << "Failed on input: " << input
        << " (received a size of " << tokens.size() << ", expected a size of " << validTokens.size() << ")";

    ASSERT_EQ(validTokens, tokens)
        << "Failed on input: " << input
        << " (received: " << token_vector_to_string(tokens) << ", expected: " << token_vector_to_string(validTokens) << ")";
}

// else if <expression> { <statements> }
TEST(LexerTests, ElseIfStatement) {
    const std::string input = "if x == 1 { x = x + 1 } else if x == 2 { x = x - 1 }";
    const std::vector<Token> validTokens = {
        Token(TokenType::If, "if"),
        Token(TokenType::Identifier, "x"),
        Token(TokenType::Equals, "=="),
        Token(TokenType::IntegerLiteral, "1"),
        Token(TokenType::LeftBrace, "{"),
        Token(TokenType::Identifier, "x"),
        Token(TokenType::Assign, "="),
        Token(TokenType::Identifier, "x"),
        Token(TokenType::Plus, "+"),
        Token(TokenType::IntegerLiteral, "1"),
        Token(TokenType::RightBrace, "}"),
        Token(TokenType::Else, "else"),
        Token(TokenType::If, "if"),
        Token(TokenType::Identifier, "x"),
        Token(TokenType::Equals, "=="),
        Token(TokenType::IntegerLiteral, "2"),
        Token(TokenType::LeftBrace, "{"),
        Token(TokenType::Identifier, "x"),
        Token(TokenType::Assign, "="),
        Token(TokenType::Identifier, "x"),
        Token(TokenType::Minus, "-"),
        Token(TokenType::IntegerLiteral, "1"),
-       Token(TokenType::RightBrace, "}")
-   };
+       Token(TokenType::RightBrace, "}")};

    Lexer lexer;
    std::vector<Token> tokens = lexer.lex(input);

    ASSERT_EQ(validTokens.size(), tokens.size())
        << "Failed on input: " << input
        << " (received a size of " << tokens.size() << ", expected a size of " << validTokens.size() << ")";

    ASSERT_EQ(validTokens, tokens)
        << "Failed on input: " << input
        << " (received: " << token_vector_to_string(tokens) << ", expected: " << token_vector_to_string(validTokens) << ")";
}

// else { <statements> }
TEST(LexerTests, ElseStatement) {
    const std::string input = "if x == 1 { x = x + 1 } else { x = x - 1 }";
    const std::vector<Token> validTokens = {
        Token(TokenType::If, "if"),
        Token(TokenType::Identifier, "x"),
        Token(TokenType::Equals, "=="),
        Token(TokenType::IntegerLiteral, "1"),
        Token(TokenType::LeftBrace, "{"),
        Token(TokenType::Identifier, "x"),
        Token(TokenType::Assign, "="),
        Token(TokenType::Identifier, "x"),
        Token(TokenType::Plus, "+"),
        Token(TokenType::IntegerLiteral, "1"),
        Token(TokenType::RightBrace, "}"),
        Token(TokenType::Else, "else"),
        Token(TokenType::LeftBrace, "{"),
        Token(TokenType::Identifier, "x"),
        Token(TokenType::Assign, "="),
        Token(TokenType::Identifier, "x"),
        Token(TokenType::Minus, "-"),
        Token(TokenType::IntegerLiteral, "1"),
-       Token(TokenType::RightBrace, "}")
-   };
+       Token(TokenType::RightBrace, "}")};

    Lexer lexer;
    std::vector<Token> tokens = lexer.lex(input);

    ASSERT_EQ(validTokens.size(), tokens.size())
        << "Failed on input: " << input
        << " (received a size of " << tokens.size() << ", expected a size of " << validTokens.size() << ")";

    ASSERT_EQ(validTokens, tokens)
        << "Failed on input: " << input
        << " (received: " << token_vector_to_string(tokens) << ", expected: " << token_vector_to_string(validTokens) << ")";
}
\ No newline at end of file

From f646200595479af6cf19b04155a57883397925b7 Mon Sep 17 00:00:00 2001
From: hrszpuk <107559570+hrszpuk@users.noreply.github.com>
Date: Sun, 5 Jan 2025 16:29:46 +0000
Subject: [PATCH 10/23] Setting up lexer class declaration

---
 include/dragon/lexer.h | 27 ++++++---------------------
 1 file changed, 6 insertions(+), 21 deletions(-)

diff --git a/include/dragon/lexer.h b/include/dragon/lexer.h
index 6eca3de..c0e2ae0 100644
--- a/include/dragon/lexer.h
+++ b/include/dragon/lexer.h
@@ -1,24 +1,9 @@
-#ifndef LEXER_H
-#define LEXER_H
+#pragma once

+#include <string>
+#include <vector>
+#include <optional>
+#include <cstddef>
 #include "token.h"

-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <ctype.h>
-
-typedef struct {
-    char* source;
-    size_t position;
-} Lexer;
-
-Lexer* create_lexer(const char* source);
-Token lex_number(Lexer* lexer);
-Token lex_identifier(Lexer* lexer);
-Token lex_symbol(Lexer* lexer);
-Token lex_string(Lexer* lexer);
-Token is_keyword(Token token);
-TokenList* tokenise(const char* source);
-void free_lexer(Lexer* lexer);
-
-#endif // LEXER_H
\ No newline at end of file
+class Lexer {

From 91a492fe6c7ba7f15b9f9775c9458d1734f56b2f Mon Sep 17 00:00:00 2001
From: hrszpuk <107559570+hrszpuk@users.noreply.github.com>
Date: Sun, 5 Jan 2025 16:30:08 +0000
Subject: [PATCH 11/23] Added all methods and data for lexer class
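
The public surface stays small: construct (or default-construct) a
Lexer, call lex(), and inspect the returned tokens; everything else is
a private helper. A minimal sketch of the intended call pattern (names
taken from this header, usage illustrative):

    Lexer lexer;
    std::vector<Token> tokens = lexer.lex("let x int = 1");
    for (Token& t : tokens) std::cout << t.to_string() << "\n";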
---
 include/dragon/lexer.h | 29 +++++++++++++++++++++++++++++
 1 file changed, 29 insertions(+)

diff --git a/include/dragon/lexer.h b/include/dragon/lexer.h
index c0e2ae0..159acd8 100644
--- a/include/dragon/lexer.h
+++ b/include/dragon/lexer.h
@@ -7,3 +7,32 @@
 #include "token.h"

 class Lexer {
+public:
+   Lexer() = default;
+   Lexer(std::string input);
+
+   std::vector<Token> lex(std::string input);
+   std::vector<Token> lex();
+
+   void reset();
+
+   Token lex_identifier();
+   Token lex_number();
+   Token lex_string();
+   Token lex_symbol();
+   Token lex_single_line_comment();
+   Token lex_multi_line_comment();
+
+   TokenType get_keyword(std::string input);
+
+private:
+   std::vector<Token> tokens;
+   std::string input;
+   size_t index = 0;
+   size_t line = 1;
+   size_t column = 1;
+
+   std::optional<char> peek() const;
+   std::optional<char> peek_next() const;
+   std::optional<char> advance();
+};
\ No newline at end of file

From 51f8fda88da9f566ec935dd659ff5804a8a72065 Mon Sep 17 00:00:00 2001
From: hrszpuk <107559570+hrszpuk@users.noreply.github.com>
Date: Sun, 5 Jan 2025 16:30:40 +0000
Subject: [PATCH 12/23] Implemented lexer constructor

---
 src/lexer.cpp | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/src/lexer.cpp b/src/lexer.cpp
index d2a3fcf..f45e988 100644
--- a/src/lexer.cpp
+++ b/src/lexer.cpp
@@ -1,10 +1,11 @@
 #include "dragon/lexer.h"

+#include <iostream>
+
-Lexer* create_lexer(const char* source) {
-   Lexer* lexer = (Lexer *)malloc(sizeof(Lexer));
-   lexer->source = strdup(source);
-   lexer->position = 0;
-   return lexer;
+Lexer::Lexer(std::string input) {
+   this->input = input;
+}

 }

 void free_lexer(Lexer* lexer) {

From 2bb94230e69abdc87af3ae7d5298ab205ad6f895 Mon Sep 17 00:00:00 2001
From: hrszpuk <107559570+hrszpuk@users.noreply.github.com>
Date: Sun, 5 Jan 2025 16:30:59 +0000
Subject: [PATCH 13/23] Implemented lexer lex(string)

---
 src/lexer.cpp | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/lexer.cpp b/src/lexer.cpp
index f45e988..ef1eab7 100644
--- a/src/lexer.cpp
+++ b/src/lexer.cpp
@@ -6,11 +6,11 @@ Lexer::Lexer(std::string input) {
    this->input = input;
 }

+std::vector<Token> Lexer::lex(std::string input) {
+   if (!this->input.empty()) this->reset();
+   this->input = input;
+   return this->lex();
 }
-
-void free_lexer(Lexer* lexer) {
-   free(lexer->source);
-   free(lexer);
 }

 TokenList* tokenise(const char* source) {

From 27a2c3a928e4c21b7a28f38a9f38f9dd87edf70a Mon Sep 17 00:00:00 2001
From: hrszpuk <107559570+hrszpuk@users.noreply.github.com>
Date: Sun, 5 Jan 2025 16:31:12 +0000
Subject: [PATCH 14/23] Implemented lexer reset

---
 src/lexer.cpp | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/src/lexer.cpp b/src/lexer.cpp
index ef1eab7..5099e1d 100644
--- a/src/lexer.cpp
+++ b/src/lexer.cpp
@@ -11,13 +11,13 @@ std::vector<Token> Lexer::lex(std::string input) {
    this->input = input;
    return this->lex();
 }
-
-TokenList* tokenise(const char* source) {
-   TokenList* tokens = create_token_list();

-   if (source == NULL || strlen(source) == 0) {
-       return tokens;
+void Lexer::reset() {
+   this->tokens.clear();
+   this->input = "";
+   this->index = 0;
+   this->line = 1;
+   this->column = 1;
    }

    Lexer* lexer = create_lexer(source);

From 531c9e71a6b3d2ac824f923af1dd940d65d70a30 Mon Sep 17 00:00:00 2001
From: hrszpuk <107559570+hrszpuk@users.noreply.github.com>
Date: Sun, 5 Jan 2025 16:31:35 +0000
Subject: [PATCH 15/23] Implemented lexer peek, peek_next, and advance
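
peek() looks at the current character, peek_next() at the one after
it, and advance() consumes a character while tracking line and column.
Returning std::optional<char> lets callers treat end-of-input as a
normal value rather than a sentinel character. Sketch of the intended
internal idiom (illustrative only):

    while (auto c = this->peek()) {   // empty optional ends the loop
        if (*c == '\n') { /* advance() bumps line, resets column */ }
        this->advance();
    }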
---
 src/lexer.cpp | 36 ++++++++++++++++++++++------------
 1 file changed, 24 insertions(+), 12 deletions(-)

diff --git a/src/lexer.cpp b/src/lexer.cpp
index 5099e1d..6065498 100644
--- a/src/lexer.cpp
+++ b/src/lexer.cpp
@@ -158,21 +158,33 @@ Token lex_string(Lexer* lexer) {
        lexer->position++;
    }

-   size_t length = lexer->position - start;
-   char* value = strndup(lexer->source + start, length);
-   lexer->position++;
+std::optional<char> Lexer::peek() const {
+   if (this->index < this->input.size()) {
+       return this->input[this->index];
+   }
+   return std::nullopt;
+}

-   Token token = {TOKEN_STRING, value};
-   return token;
+std::optional<char> Lexer::peek_next() const {
+   if (this->index + 1 < this->input.size()) {
+       return this->input[this->index + 1];
+   }
+   return std::nullopt;
 }

-Token is_keyword(Token token) {
-   for (size_t i = 0; i < sizeof(keywords) / sizeof(keywords[0]); i++) {
-       if (strcmp(token.value, keywords[i]) == 0) {
-           token.type = (TokenType)i;
-           return token;
+std::optional<char> Lexer::advance() {
+   if (this->index < this->input.size()) {
+       char c = this->input[this->index];
+       this->index++;
+       this->column++;
+
+       if (c == '\n') {
+           this->line++;
+           this->column = 1;
        }
+
+       return c;
    }
-   return token;
-}
+   return std::nullopt;
+}
\ No newline at end of file

From 81a06e8b16f5a9d92f0bb250d9fe72121ae19af6 Mon Sep 17 00:00:00 2001
From: hrszpuk <107559570+hrszpuk@users.noreply.github.com>
Date: Sun, 5 Jan 2025 16:31:49 +0000
Subject: [PATCH 16/23] Added get_keyword

---
 src/lexer.cpp | 42 ++++++++++++++++++++++++++++++------
 1 file changed, 36 insertions(+), 6 deletions(-)

diff --git a/src/lexer.cpp b/src/lexer.cpp
index 6065498..e0c31ca 100644
--- a/src/lexer.cpp
+++ b/src/lexer.cpp
@@ -149,15 +149,45 @@ Token lex_symbol(Lexer* lexer) {
    return token;
 }

-Token lex_string(Lexer* lexer) {
-   char quote = lexer->source[lexer->position];
-   lexer->position++;
+TokenType Lexer::get_keyword(std::string value) {
+   if (value == "let") {
+       return TokenType::Let;
+   }

-   size_t start = lexer->position;
-   while (lexer->source[lexer->position] != quote) {
-       lexer->position++;
+   if (value == "mut") {
+       return TokenType::Mut;
+   }
+
+   if (value == "if") {
+       return TokenType::If;
+   }
+
+   if (value == "else") {
+       return TokenType::Else;
+   }
+
+   if (value == "while") {
+       return TokenType::While;
    }

+   if (value == "for") {
+       return TokenType::For;
+   }
+
+   if (value == "in") {
+       return TokenType::In;
+   }
+
+   if (value == "true") {
+       return TokenType::True;
+   }
+
+   if (value == "false") {
+       return TokenType::False;
+   }
+
+   return TokenType::Identifier;
+}

 std::optional<char> Lexer::peek() const {
    if (this->index < this->input.size()) {

From 0c7bc6765f31f08ca7688a092874fb911ee340a3 Mon Sep 17 00:00:00 2001
From: hrszpuk <107559570+hrszpuk@users.noreply.github.com>
Date: Sun, 5 Jan 2025 16:32:12 +0000
Subject: [PATCH 17/23] Implemented main lex function :D
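
lex() is a single dispatch loop: skip whitespace, then pick a sub-lexer
from the first character (identifier/keyword, number, string, comment,
or symbol) and let it consume as much input as it needs. For example,
the input `let x = 1 // done` should come out roughly as (illustrative,
debug output omitted):

    Let("let"), Identifier("x"), Assign("="), IntegerLiteral("1"),
    Comment("// done")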
"lexing string" << std::endl; + this->tokens.push_back(this->lex_string()); + continue; } - append_token(tokens, token); + if (c == '/' && this->peek_next() == '/') { + this->tokens.push_back(this->lex_single_line_comment()); + continue; + } + + if (c == '/' && this->peek_next() == '*') { + this->tokens.push_back(this->lex_multi_line_comment()); + continue; } - free_lexer(lexer); + this->tokens.push_back(this->lex_symbol()); + } - return tokens; + return this->tokens; } Token lex_number(Lexer* lexer) { From 8ee6fec7cefe8c3ece286ec60017fd44b5a41c8c Mon Sep 17 00:00:00 2001 From: hrszpuk <107559570+hrszpuk@users.noreply.github.com> Date: Sun, 5 Jan 2025 16:32:27 +0000 Subject: [PATCH 18/23] Implemented lex_identifier --- src/lexer.cpp | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) diff --git a/src/lexer.cpp b/src/lexer.cpp index df42ace..cbc469d 100644 --- a/src/lexer.cpp +++ b/src/lexer.cpp @@ -64,10 +64,24 @@ std::vector Lexer::lex() { return this->tokens; } -Token lex_number(Lexer* lexer) { - size_t start = lexer->position; - while (isdigit(lexer->source[lexer->position])) { - lexer->position++; +Token Lexer::lex_identifier() { + std::string value = ""; + size_t line = this->line; + size_t column = this->column; + + while (true) { + auto opt_c = this->peek(); + if (!opt_c.has_value() || !(std::isalnum(opt_c.value()) || opt_c.value() == '_')) { + break; + } + value += this->advance().value(); + std::cout << "building value: " << value << std::endl; + } + + TokenType type = this->get_keyword(value); + std::cout << "type: " << token_type_to_string(type) << std::endl; + std::cout << "value: " << "\"" << value << "\"" << std::endl; + return Token(type, value, line, column); } size_t length = lexer->position - start; From 702612ec7b786bae79f56ec053168d2d1bfe5ee8 Mon Sep 17 00:00:00 2001 From: hrszpuk <107559570+hrszpuk@users.noreply.github.com> Date: Sun, 5 Jan 2025 16:32:39 +0000 Subject: [PATCH 19/23] Implemented lex_number --- src/lexer.cpp | 21 +++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) diff --git a/src/lexer.cpp b/src/lexer.cpp index cbc469d..9b451a9 100644 --- a/src/lexer.cpp +++ b/src/lexer.cpp @@ -84,11 +84,24 @@ Token Lexer::lex_identifier() { return Token(type, value, line, column); } - size_t length = lexer->position - start; - char* value = strndup(lexer->source + start, length); +Token Lexer::lex_number() { + std::string value = ""; + size_t line = this->line; + size_t column = this->column; - Token token = {TOKEN_INTEGER, value}; - return token; + while (true) { + auto opt_c = this->peek(); + if (opt_c.has_value() && opt_c.value() == '_') { + this->advance(); + continue; + } + if (!opt_c.has_value() || !std::isdigit(opt_c.value())) { + break; + } + value += this->advance().value(); + } + + return Token(TokenType::IntegerLiteral, value, line, column); } Token lex_identifier(Lexer* lexer) { From 996fc55622330d9621304769f166bf0bad1e6734 Mon Sep 17 00:00:00 2001 From: hrszpuk <107559570+hrszpuk@users.noreply.github.com> Date: Sun, 5 Jan 2025 16:33:04 +0000 Subject: [PATCH 20/23] Implemented lex_string --- src/lexer.cpp | 40 ++++++++++++++++++++++++++++++---------- 1 file changed, 30 insertions(+), 10 deletions(-) diff --git a/src/lexer.cpp b/src/lexer.cpp index 9b451a9..0efdd4d 100644 --- a/src/lexer.cpp +++ b/src/lexer.cpp @@ -104,20 +104,40 @@ Token Lexer::lex_number() { return Token(TokenType::IntegerLiteral, value, line, column); } -Token lex_identifier(Lexer* lexer) { - size_t start = lexer->position; - while 
(isalnum(lexer->source[lexer->position])) { - lexer->position++; - } +Token Lexer::lex_string() { + std::string value = ""; + size_t line = this->line; + size_t column = this->column; - size_t length = lexer->position - start; - char* value = strndup(lexer->source + start, length); + this->advance(); // Skip the opening quote - Token token = {TOKEN_IDENTIFIER, value}; + while (true) { + auto opt_c = this->peek(); + if (!opt_c.has_value() || opt_c.value() == '"') { + break; + } + + if (opt_c.value() == '\\') { + this->advance(); // Skip the backslash + auto escaped_char = this->advance(); + if (escaped_char.has_value()) { + switch (escaped_char.value()) { + case 'n': value += '\n'; break; + case 't': value += '\t'; break; + case 'r': value += '\r'; break; + case '\\': value += '\\'; break; + case '"': value += '"'; break; + default: value += '\\'; value += escaped_char.value(); break; + } + } + } else { + value += this->advance().value(); + } + } - token = is_keyword(token); + this->advance(); // Skip the closing quote - return token; + return Token(TokenType::StringLiteral, value, line, column); } Token lex_symbol(Lexer* lexer) { From 9d58782c52c94ef15e46f95e5efcb3ec75959049 Mon Sep 17 00:00:00 2001 From: hrszpuk <107559570+hrszpuk@users.noreply.github.com> Date: Sun, 5 Jan 2025 16:33:25 +0000 Subject: [PATCH 21/23] Implemented lex_symbol --- src/lexer.cpp | 138 +++++++++++++++++++++++++++++++++++--------------- 1 file changed, 98 insertions(+), 40 deletions(-) diff --git a/src/lexer.cpp b/src/lexer.cpp index 0efdd4d..4f73d2e 100644 --- a/src/lexer.cpp +++ b/src/lexer.cpp @@ -140,53 +140,111 @@ Token Lexer::lex_string() { return Token(TokenType::StringLiteral, value, line, column); } -Token lex_symbol(Lexer* lexer) { - char c = lexer->source[lexer->position]; - TokenType type = TOKEN_INVALID; - char* value = NULL; +Token Lexer::lex_symbol() { + std::string value = ""; + size_t line = this->line; + size_t column = this->column; + + auto opt_c = this->peek(); + if (!opt_c.has_value()) { + return Token(TokenType::Unknown, value, line, column); + } + char c = opt_c.value(); switch (c) { - case '=': - type = TOKEN_EQUALS; - value = strndup(lexer->source + lexer->position, 1); - break; case '+': - type = TOKEN_PLUS; - value = strndup(lexer->source + lexer->position, 1); - break; - case '{': - type = TOKEN_BRACE_OPEN; - value = strndup(lexer->source + lexer->position, 1); - break; - case '}': - type = TOKEN_BRACE_CLOSE; - value = strndup(lexer->source + lexer->position, 1); - break; + value += this->advance().value(); + return Token(TokenType::Plus, value, line, column); + case '-': + value += this->advance().value(); + return Token(TokenType::Minus, value, line, column); + case '*': + value += this->advance().value(); + return Token(TokenType::Star, value, line, column); + case '/': + value += this->advance().value(); + return Token(TokenType::Slash, value, line, column); + case '!': + value += this->advance().value(); + if (auto next = this->peek(); next.has_value() && next.value() == '=') { + value += this->advance().value(); + return Token(TokenType::NotEquals, value, line, column); + } + return Token(TokenType::Not, value, line, column); + case '=': + value += this->advance().value(); + if (auto next = this->peek(); next.has_value() && next.value() == '=') { + value += this->advance().value(); + return Token(TokenType::Equals, value, line, column); + } + return Token(TokenType::Assign, value, line, column); + case '<': + value += this->advance().value(); + if (auto next = this->peek(); 
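
Two-character operators are resolved with one character of lookahead:
after consuming the first character, a peek() decides between e.g.
Assign ("=") and Equals ("=="), or LessThan ("<") and
LessThanOrEqualTo ("<="), so the longest match always wins. For
example, "<=" lexes as a single LessThanOrEqualTo token rather than
LessThan followed by Assign.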
---
 src/lexer.cpp | 138 +++++++++++++++++++++++++++++++++---------------
 1 file changed, 98 insertions(+), 40 deletions(-)

diff --git a/src/lexer.cpp b/src/lexer.cpp
index 0efdd4d..4f73d2e 100644
--- a/src/lexer.cpp
+++ b/src/lexer.cpp
@@ -140,53 +140,111 @@ Token Lexer::lex_string() {
    return Token(TokenType::StringLiteral, value, line, column);
 }

-Token lex_symbol(Lexer* lexer) {
-   char c = lexer->source[lexer->position];
-   TokenType type = TOKEN_INVALID;
-   char* value = NULL;
+Token Lexer::lex_symbol() {
+   std::string value = "";
+   size_t line = this->line;
+   size_t column = this->column;
+
+   auto opt_c = this->peek();
+   if (!opt_c.has_value()) {
+       return Token(TokenType::Unknown, value, line, column);
+   }
+   char c = opt_c.value();

    switch (c) {
-       case '=':
-           type = TOKEN_EQUALS;
-           value = strndup(lexer->source + lexer->position, 1);
-           break;
        case '+':
-           type = TOKEN_PLUS;
-           value = strndup(lexer->source + lexer->position, 1);
-           break;
-       case '{':
-           type = TOKEN_BRACE_OPEN;
-           value = strndup(lexer->source + lexer->position, 1);
-           break;
-       case '}':
-           type = TOKEN_BRACE_CLOSE;
-           value = strndup(lexer->source + lexer->position, 1);
-           break;
+           value += this->advance().value();
+           return Token(TokenType::Plus, value, line, column);
+       case '-':
+           value += this->advance().value();
+           return Token(TokenType::Minus, value, line, column);
+       case '*':
+           value += this->advance().value();
+           return Token(TokenType::Star, value, line, column);
+       case '/':
+           value += this->advance().value();
+           return Token(TokenType::Slash, value, line, column);
+       case '!':
+           value += this->advance().value();
+           if (auto next = this->peek(); next.has_value() && next.value() == '=') {
+               value += this->advance().value();
+               return Token(TokenType::NotEquals, value, line, column);
+           }
+           return Token(TokenType::Not, value, line, column);
+       case '=':
+           value += this->advance().value();
+           if (auto next = this->peek(); next.has_value() && next.value() == '=') {
+               value += this->advance().value();
+               return Token(TokenType::Equals, value, line, column);
+           }
+           return Token(TokenType::Assign, value, line, column);
+       case '<':
+           value += this->advance().value();
+           if (auto next = this->peek(); next.has_value() && next.value() == '=') {
+               value += this->advance().value();
+               return Token(TokenType::LessThanOrEqualTo, value, line, column);
+           }
+           return Token(TokenType::LessThan, value, line, column);
+       case '>':
+           value += this->advance().value();
+           if (auto next = this->peek(); next.has_value() && next.value() == '=') {
+               value += this->advance().value();
+               return Token(TokenType::GreaterThanOrEqualTo, value, line, column);
+           }
+           return Token(TokenType::GreaterThan, value, line, column);
+       case '&':
+           value += this->advance().value();
+           if (auto next = this->peek(); next.has_value() && next.value() == '&') {
+               value += this->advance().value();
+               return Token(TokenType::And, value, line, column);
+           }
+           return Token(TokenType::Ampersand, value, line, column);
+       case '|':
+           value += this->advance().value();
+           if (auto next = this->peek(); next.has_value() && next.value() == '|') {
+               value += this->advance().value();
+               return Token(TokenType::Or, value, line, column);
+           }
+           return Token(TokenType::Pipe, value, line, column);
+       case '^':
+           value += this->advance().value();
+           return Token(TokenType::Caret, value, line, column);
+       case '~':
+           value += this->advance().value();
+           return Token(TokenType::Tilde, value, line, column);
        case '(':
-           type = TOKEN_PAREN_OPEN;
-           value = strndup(lexer->source + lexer->position, 1);
-           break;
+           value += this->advance().value();
+           return Token(TokenType::LeftParen, value, line, column);
        case ')':
-           type = TOKEN_PAREN_CLOSE;
-           value = strndup(lexer->source + lexer->position, 1);
-           break;
+           value += this->advance().value();
+           return Token(TokenType::RightParen, value, line, column);
+       case '{':
+           value += this->advance().value();
+           return Token(TokenType::LeftBrace, value, line, column);
+       case '}':
+           value += this->advance().value();
+           return Token(TokenType::RightBrace, value, line, column);
+       case '[':
+           value += this->advance().value();
+           return Token(TokenType::LeftBracket, value, line, column);
+       case ']':
+           value += this->advance().value();
+           return Token(TokenType::RightBracket, value, line, column);
        case ',':
-           type = TOKEN_COMMA;
-           value = strndup(lexer->source + lexer->position, 1);
-           break;
-       case '>':
-           type = TOKEN_GRT;
-           value = strndup(lexer->source + lexer->position, 1);
-           break;
-       case '-':
-           if (lexer->source[lexer->position + 1] == '>') {
-               type = TOKEN_RIGHT_ARROW;
-               value = strndup(lexer->source + lexer->position, 2);
-               lexer->position++;
-           } else {
-               type = TOKEN_INVALID;
-               value = strndup(lexer->source + lexer->position, 1);
+           value += this->advance().value();
+           return Token(TokenType::Comma, value, line, column);
+       case '.':
+           value += this->advance().value();
+           if (auto next = this->peek(); next.has_value() && next.value() == '.') {
+               value += this->advance().value();
+               return Token(TokenType::Range, value, line, column);
            }
+           return Token(TokenType::Dot, value, line, column);
+       default:
+           value += this->advance().value();
+           return Token(TokenType::Unknown, value, line, column);
+   }
+}

From 1442bfac67fd4311e7cc24c4040c98fa9e54001e Mon Sep 17 00:00:00 2001
From: hrszpuk <107559570+hrszpuk@users.noreply.github.com>
Date: Sun, 5 Jan 2025 16:33:43 +0000
Subject: [PATCH 22/23] Implemented lex_single_line_comment for // comment

---
 src/lexer.cpp | 25 +++++++++++++++----------
 1 file changed, 15 insertions(+), 10 deletions(-)

diff --git a/src/lexer.cpp b/src/lexer.cpp
index 4f73d2e..2042b92 100644
--- a/src/lexer.cpp
+++ b/src/lexer.cpp
@@ -245,17 +245,22 @@ Token Lexer::lex_symbol() {
            return Token(TokenType::Unknown, value, line, column);
    }
 }
+
+Token Lexer::lex_single_line_comment() {
+   std::string value = "";
+   size_t line = this->line;
+   size_t column = this->column;
+
+   while (true) {
+       auto opt_c = this->peek();
+       if (!opt_c.has_value() || opt_c.value() == '\n') {
            break;
-       case '.':
-           printf("Next char: %c\n", lexer->source[lexer->position + 1]);
-           if (lexer->source[lexer->position + 1] == '.') {
-               type = TOKEN_RANGE;
-               value = strndup(lexer->source + lexer->position, 2);
-               lexer->position++;
-           } else {
-               type = TOKEN_INVALID;
-               value = strndup(lexer->source + lexer->position, 1);
-           }
+       }
+       value += this->advance().value();
+   }
+
+   return Token(TokenType::Comment, value, line, column);
+}

From 068737b9232cdfd745f4d34b7f50b3a6b15d81bb Mon Sep 17 00:00:00 2001
From: hrszpuk <107559570+hrszpuk@users.noreply.github.com>
Date: Sun, 5 Jan 2025 16:34:09 +0000
Subject: [PATCH 23/23] Implemented lex_multi_line_comment

---
 src/lexer.cpp | 32 +++++++++++++++++++++-----------
 1 file changed, 21 insertions(+), 11 deletions(-)

diff --git a/src/lexer.cpp b/src/lexer.cpp
index 2042b92..860da36 100644
--- a/src/lexer.cpp
+++ b/src/lexer.cpp
@@ -18,7 +18,7 @@ void Lexer::reset() {
    this->index = 0;
    this->line = 1;
    this->column = 1;
-   }
+}

 std::vector<Token> Lexer::lex() {
    while (this->index < this->input.size()) {
@@ -56,7 +56,7 @@ std::vector<Token> Lexer::lex() {
        if (c == '/' && this->peek_next() == '*') {
            this->tokens.push_back(this->lex_multi_line_comment());
            continue;
-       }
+       }

        this->tokens.push_back(this->lex_symbol());
    }
@@ -82,7 +82,7 @@ Token Lexer::lex_identifier() {
    std::cout << "type: " << token_type_to_string(type) << std::endl;
    std::cout << "value: " << "\"" << value << "\"" << std::endl;
    return Token(type, value, line, column);
-   }
+}

 Token Lexer::lex_number() {
    std::string value = "";
@@ -261,17 +261,27 @@ Token Lexer::lex_single_line_comment() {

    return Token(TokenType::Comment, value, line, column);
 }
+
+Token Lexer::lex_multi_line_comment() {
+   std::string value = "";
+   size_t line = this->line;
+   size_t column = this->column;
+
+   while (true) {
+       auto opt_c = this->peek();
+       auto opt_next_c = this->peek_next();
+       if (!opt_c.has_value() || !opt_next_c.has_value()) {
            break;
-       default:
-           type = TOKEN_INVALID;
-           value = strndup(lexer->source + lexer->position, 1);
+       }
+       if (opt_c.value() == '*' && opt_next_c.value() == '/') {
+           value += this->advance().value();
+           value += this->advance().value();
            break;
+       }
+       value += this->advance().value();
    }

-   lexer->position++;
-
-   Token token = {type, value};
-   return token;
+   return Token(TokenType::Comment, value, line, column);
 }

 TokenType Lexer::get_keyword(std::string value) {
@@ -312,7 +322,7 @@ TokenType Lexer::get_keyword(std::string value) {
    }

    return TokenType::Identifier;
-   }
+}

 std::optional<char> Lexer::peek() const {
    if (this->index < this->input.size()) {