Merge pull request #3 from adriantodt/bugfix/unterminatedUnicodeLiteral

Bugfix: unterminated unicode literal
NotJustAnna · Nov 13, 2021 · ec018ea · ec018ea
2 parents a7f5975 + 49acc5a
commit ec018ea
Show file tree

Hide file tree

Showing 5 changed files with 151 additions and 103 deletions.
diff --git a/build.gradle.kts b/build.gradle.kts
@@ -46,7 +46,7 @@ kotlin {
         all { languageSettings.optIn("kotlin.RequiresOptIn") }
         val commonMain by getting {
             dependencies {
-                api("com.github.adriantodt:tartar:3.1.0")
+                api("com.github.adriantodt:tartar:3.2.0")
                 api("com.squareup.okio:okio:3.0.0")
             }
         }

diff --git a/src/commonMain/kotlin/com/github/adriantodt/lin/lexer/LinLexer.kt b/src/commonMain/kotlin/com/github/adriantodt/lin/lexer/LinLexer.kt
@@ -5,7 +5,7 @@ import com.github.adriantodt.tartar.api.dsl.CharPredicate
 import com.github.adriantodt.tartar.api.lexer.Lexer
 import com.github.adriantodt.tartar.api.parser.Token
 import com.github.adriantodt.tartar.extensions.LexicalNumber
-import com.github.adriantodt.tartar.extensions.makeToken
+import com.github.adriantodt.tartar.extensions.processToken
 import com.github.adriantodt.tartar.extensions.readNumber
 import com.github.adriantodt.tartar.extensions.readString
 
@@ -15,97 +15,92 @@ internal fun linStdLexer() = Lexer.create<LinToken> {
     ' ' { while (hasNext()) if (!match(' ')) break }
     '\r' {
         match('\n')
-        process(makeToken(NL))
+        processToken(NL)
     }
-    '\n' { process(makeToken(NL)) }
-    '{' { process(makeToken(L_BRACE)) }
-    '}' { process(makeToken(R_BRACE)) }
-    '(' { process(makeToken(L_PAREN)) }
-    ')' { process(makeToken(R_PAREN)) }
-    '[' { process(makeToken(L_BRACKET)) }
-    ']' { process(makeToken(R_BRACKET)) }
-    ".." { process(makeToken(RANGE, 2)) }
-    '.' { process(makeToken(DOT)) }
-    ',' { process(makeToken(COMMA)) }
-    "::" { process(makeToken(DOUBLE_COLON, 2)) }
-    ':' { process(makeToken(COLON)) }
-    ';' { process(makeToken(SEMICOLON)) }
-    '\u037E' { process(makeToken(SEMICOLON)) } // Greek question mark
-    "+=" { process(makeToken(PLUS_ASSIGN, 2)) }
-    "++" { process(makeToken(INCREMENT, 2)) }
-    '+' { process(makeToken(PLUS)) }
-    "->" { process(makeToken(ARROW, 2)) }
-    "-=" { process(makeToken(MINUS_ASSIGN, 2)) }
-    "--" { process(makeToken(DECREMENT, 2)) }
-    '-' { process(makeToken(MINUS)) }
-    "%=" { process(makeToken(REM_ASSIGN, 2)) }
-    '%' { process(makeToken(REM)) }
-    "*=" { process(makeToken(ASTERISK_ASSIGN, 2)) }
-    '*' { process(makeToken(ASTERISK)) }
+    '\n' { processToken(NL) }
+    '{' { processToken(L_BRACE) }
+    '}' { processToken(R_BRACE) }
+    '(' { processToken(L_PAREN) }
+    ')' { processToken(R_PAREN) }
+    '[' { processToken(L_BRACKET) }
+    ']' { processToken(R_BRACKET) }
+    ".." { processToken(RANGE, 2) }
+    '.' { processToken(DOT) }
+    ',' { processToken(COMMA) }
+    "::" { processToken(DOUBLE_COLON, 2) }
+    ':' { processToken(COLON) }
+    ';' { processToken(SEMICOLON) }
+    '\u037E' { processToken(SEMICOLON) } // Greek question mark
+    "+=" { processToken(PLUS_ASSIGN, 2) }
+    "++" { processToken(INCREMENT, 2) }
+    '+' { processToken(PLUS) }
+    "->" { processToken(ARROW, 2) }
+    "-=" { processToken(MINUS_ASSIGN, 2) }
+    "--" { processToken(DECREMENT, 2) }
+    '-' { processToken(MINUS) }
+    "%=" { processToken(REM_ASSIGN, 2) }
+    '%' { processToken(REM) }
+    "*=" { processToken(ASTERISK_ASSIGN, 2) }
+    '*' { processToken(ASTERISK) }
     "//" { while (hasNext()) if (next() == '\n') break }
     "/*" { while (hasNext()) if (next() == '*' && match('/')) break }
-    "/=" { process(makeToken(SLASH_ASSIGN, 2)) }
-    '/' { process(makeToken(SLASH)) }
-    '\\' { process(makeToken(BACKSLASH)) }
-    "!=" { process(makeToken(NEQ, 2)) }
-    "!!" { process(makeToken(DOUBLE_BANG, 2)) }
-    '!' { process(makeToken(BANG)) }
-    "?:" { process(makeToken(ELVIS)) }
-    "?." { process(makeToken(QUESTION_DOT, 2)) }
-    '?' { process(makeToken(QUESTION)) }
-    "==" { process(makeToken(EQ, 2)) }
-    '=' { process(makeToken(ASSIGN)) }
-    "||" { process(makeToken(OR, 2)) }
-    "|" { process(makeToken(PIPE)) }
-    "&&" { process(makeToken(AND, 2)) }
-    "&" { process(makeToken(AMP)) }
-    "<=" { process(makeToken(LTE, 2)) }
-    '<' { process(makeToken(LT)) }
-    ">=" { process(makeToken(GTE, 2)) }
-    '>' { process(makeToken(GT)) }
-    '\'' { readLinTemplateString(it.toString(), false) }
-    "\"\"\"" { readLinTemplateString(it.toString(), true) }
-    "\"\"" { process(makeToken(STRING, 2)) }
-    '"' { readLinTemplateString(it.toString(), false) }
-    "`" { process(makeToken(IDENTIFIER, readString(it))) }
+    "/=" { processToken(SLASH_ASSIGN, 2) }
+    '/' { processToken(SLASH) }
+    '\\' { processToken(BACKSLASH) }
+    "!=" { processToken(NEQ, 2) }
+    "!!" { processToken(DOUBLE_BANG, 2) }
+    '!' { processToken(BANG) }
+    "?:" { processToken(ELVIS) }
+    "?." { processToken(QUESTION_DOT, 2) }
+    '?' { processToken(QUESTION) }
+    "==" { processToken(EQ, 2) }
+    '=' { processToken(ASSIGN) }
+    "||" { processToken(OR, 2) }
+    "|" { processToken(PIPE) }
+    "&&" { processToken(AND, 2) }
+    "&" { processToken(AMP) }
+    "<=" { processToken(LTE, 2) }
+    '<' { processToken(LT) }
+    ">=" { processToken(GTE, 2) }
+    '>' { processToken(GT) }
+    '\'' { readLinTemplateString("'", false) }
+    "\"\"\"" { readLinTemplateString("\"\"\"", true) }
+    "\"\"" { processToken(STRING, 2) }
+    '"' { readLinTemplateString("\"", false) }
+    "`" { processToken(IDENTIFIER, readString(it), offset = 2) }
     matching(CharPredicate.isDigit).configure {
-        process(
-            when (val n = readNumber(it)) {
-                is LexicalNumber.Invalid -> makeToken(INVALID, n.string)
-                is LexicalNumber.Decimal -> makeToken(DECIMAL, n.value.toString())
-                is LexicalNumber.Integer -> makeToken(INTEGER, n.value.toString())
-            }
-        )
+        when (val n = readNumber(it)) {
+            is LexicalNumber.Invalid -> processToken(INVALID, n.string)
+            is LexicalNumber.Decimal -> processToken(DECIMAL, n.value.toString(), n.string.length)
+            is LexicalNumber.Integer -> processToken(INTEGER, n.value.toString(), n.string.length)
+        }
     }
     matching { it.isLetter() || it == '_' || it == '@' }.configure {
-        process(
-            when (val s = readLinIdentifier(it)) {
-                "break" -> makeToken(BREAK, 5)
-                "continue" -> makeToken(CONTINUE, 8)
-                "do" -> makeToken(DO, 2)
-                "else" -> makeToken(ELSE, 4)
-                "false" -> makeToken(FALSE, 4)
-                "for" -> makeToken(FOR, 3)
-                "fun" -> makeToken(FUN, 3)
-                "if" -> makeToken(IF, 2)
-                "in" -> makeToken(IN, 2)
-                "is" -> makeToken(IS, 2)
-                "null" -> makeToken(NULL, 4)
-                "return" -> makeToken(RETURN, 6)
-                "this" -> makeToken(THIS, 4)
-                "throw" -> makeToken(THROW, 5)
-                "true" -> makeToken(TRUE, 4)
-                "try" -> makeToken(TRY, 3)
-                "typeof" -> makeToken(TYPEOF, 6)
-                //"unit" -> makeToken(UNIT, 4)
-                "val" -> makeToken(VAL, 3)
-                "var" -> makeToken(VAR, 3)
-                "when" -> makeToken(WHEN, 4)
-                "while" -> makeToken(WHILE, 5)
+        when (val s = readLinIdentifier(it)) {
+            "break" -> processToken(BREAK, 5)
+            "continue" -> processToken(CONTINUE, 8)
+            "do" -> processToken(DO, 2)
+            "else" -> processToken(ELSE, 4)
+            "false" -> processToken(FALSE, 4)
+            "for" -> processToken(FOR, 3)
+            "fun" -> processToken(FUN, 3)
+            "if" -> processToken(IF, 2)
+            "in" -> processToken(IN, 2)
+            "is" -> processToken(IS, 2)
+            "null" -> processToken(NULL, 4)
+            "return" -> processToken(RETURN, 6)
+            "this" -> processToken(THIS, 4)
+            "throw" -> processToken(THROW, 5)
+            "true" -> processToken(TRUE, 4)
+            "try" -> processToken(TRY, 3)
+            "typeof" -> processToken(TYPEOF, 6)
+            "val" -> processToken(VAL, 3)
+            "var" -> processToken(VAR, 3)
+            "when" -> processToken(WHEN, 4)
+            "while" -> processToken(WHILE, 5)
 
-                else -> makeToken(IDENTIFIER, s)
-            }
-        )
+            else -> processToken(IDENTIFIER, s)
+        }
     }
-    configure { process(makeToken(INVALID, next().toString())) }
+    configure { processToken(INVALID, next().toString()) }
 }
diff --git a/src/commonMain/kotlin/com/github/adriantodt/lin/lexer/LinLexerUtils.kt b/src/commonMain/kotlin/com/github/adriantodt/lin/lexer/LinLexerUtils.kt
@@ -1,8 +1,9 @@
 package com.github.adriantodt.lin.lexer
 
 import com.github.adriantodt.tartar.api.lexer.LexerContext
+import com.github.adriantodt.tartar.api.lexer.Section
 import com.github.adriantodt.tartar.api.parser.SyntaxException
-import com.github.adriantodt.tartar.extensions.makeToken
+import com.github.adriantodt.tartar.extensions.processToken
 import com.github.adriantodt.tartar.extensions.section
 
 fun LexerContext<*>.readLinIdentifier(firstChar: Char? = null): String {
@@ -60,6 +61,9 @@ fun LexerContext<*>.readLinString(delimiter: Char): String {
 
 
 fun LexerContext<LinToken>.readLinTemplateString(delim: String, raw: Boolean) {
+    val absoluteStart = index - delim.length
+    var sectionOffset = delim.length
+    var start = index
     val buf = StringBuilder()
     var eol = false
 
@@ -71,13 +75,16 @@ fun LexerContext<LinToken>.readLinTemplateString(delim: String, raw: Boolean) {
             if (peek() == '{') {
                 next()
 
-                process(makeToken(TokenType.STRING, buf.toString()))
-                process(makeToken(TokenType.PLUS))
+                processToken(
+                    TokenType.STRING, buf.toString(), index - start, index - start - 2, sectionOffset
+                )
+                processToken(TokenType.PLUS)
+                sectionOffset = 0
                 buf.clear()
 
                 var braces = 0
 
-                process(makeToken(TokenType.L_PAREN))
+                processToken(TokenType.L_PAREN)
 
                 while (hasNext()) {
                     val cc = peek()
@@ -94,27 +101,32 @@ fun LexerContext<LinToken>.readLinTemplateString(delim: String, raw: Boolean) {
                     parseOnce().forEach(this::process)
                 }
 
-                process(makeToken(TokenType.R_PAREN))
-                process(makeToken(TokenType.PLUS))
+                start = index
+                processToken(TokenType.R_PAREN)
+                processToken(TokenType.PLUS)
             } else if (peek().isLetter()) {
-                process(makeToken(TokenType.STRING, buf.toString()))
-                process(makeToken(TokenType.PLUS))
+                processToken(
+                    TokenType.STRING, buf.toString(), index - start, index - start - 1, sectionOffset
+                )
+                processToken(TokenType.PLUS)
+                sectionOffset = 0
                 buf.clear()
 
                 buf.append(next())
 
                 while (hasNext() && peek().isLetterOrDigit()) {
                     buf.append(next())
                 }
+                start = index
 
-                process(makeToken(TokenType.IDENTIFIER, buf.toString()))
+                processToken(TokenType.IDENTIFIER, buf.toString())
                 buf.clear()
 
-                process(makeToken(TokenType.PLUS))
+                processToken(TokenType.PLUS, 0)
             } else {
                 buf.append(next())
             }
-        } else if (c == '\\' && raw) {
+        } else if (c == '\\' && !raw) {
             next()
             if (!hasNext()) break
             when (next()) {
@@ -127,11 +139,16 @@ fun LexerContext<LinToken>.readLinTemplateString(delim: String, raw: Boolean) {
                 '\\' -> buf.append('\\')
                 'u' -> {
                     val u = peekString(4)
-                    if (u.length != 4) throw IllegalStateException("File terminated before escaping")
-                    buf.append(u.toIntOrNull(16)?.toChar() ?: throw IllegalStateException("Illegal unicode escaping"))
+                    if (u.length != 4) {
+                        throw SyntaxException("File terminated before escaping", section(2, u.length + 2))
+                    }
+                    buf.append(
+                        u.toIntOrNull(16)?.toChar()
+                            ?: throw SyntaxException("Illegal unicode escaping", section(2, 6))
+                    )
                     nextString(4)
                 }
-                else -> throw IllegalStateException("Unknown escaping")
+                else -> throw SyntaxException("Unknown escaping", section(2))
             }
         } else if (this.peekString(delim.length) == delim) {
             this.nextString(delim.length)
@@ -144,8 +161,8 @@ fun LexerContext<LinToken>.readLinTemplateString(delim: String, raw: Boolean) {
     }
 
     if (!eol) {
-        throw IllegalStateException("Unterminated string")
+        throw SyntaxException("Unterminated string", Section(source, absoluteStart, index - absoluteStart))
     }
 
-    process(makeToken(TokenType.STRING, buf.toString(), 2))
+    processToken(TokenType.STRING, buf.toString(), index - start, index - start, sectionOffset)
 }
diff --git a/src/commonTest/kotlin/com/github/adriantodt/lin/test/regression/.gitkeep b/src/commonTest/kotlin/com/github/adriantodt/lin/test/regression/.gitkeep
diff --git a/...commonTest/kotlin/com/github/adriantodt/lin/test/regression/lexer/LexerRegressionTests.kt b/...commonTest/kotlin/com/github/adriantodt/lin/test/regression/lexer/LexerRegressionTests.kt
@@ -0,0 +1,36 @@
+package com.github.adriantodt.lin.test.regression.lexer
+
+import com.github.adriantodt.lin.Lin
+import com.github.adriantodt.tartar.api.lexer.Source
+import com.github.adriantodt.tartar.api.parser.SyntaxException
+import kotlin.test.*
+
+class LexerRegressionTests { // Regression tests: malformed string literals must fail lexing with a SyntaxException that carries a section.
+    @Test
+    fun unterminatedUnicodeLiteral() { // input: an opening quote, then a unicode escape with only three hex digits
+        val code = """
+            "\u123
+        """.trimIndent()
+        val throwable = assertFails { // assertFails hands back whatever was thrown
+            Lin.parser.lexer.parseToList(Source(code, "unterminatedUnicodeLiteral.lin"))
+        }
+        assertIs<SyntaxException>(throwable) // also smart-casts throwable so .section resolves below
+        val section = throwable.section
+        assertNotNull(section)
+        assertEquals("\\u123", section.substring) // expected section is the truncated escape only, without the opening quote
+    }
+
+    @Test
+    fun unterminatedString() { // input: an opening quote and three letters, never closed
+        val code = """
+            "ABC
+        """.trimIndent()
+        val throwable = assertFails { // assertFails hands back whatever was thrown
+            Lin.parser.lexer.parseToList(Source(code, "unterminatedString.lin"))
+        }
+        assertIs<SyntaxException>(throwable) // also smart-casts throwable so .section resolves below
+        val section = throwable.section
+        assertNotNull(section)
+        assertEquals("\"ABC", section.substring) // expected section runs from the opening quote to the end of the input
+    }
+}