Skip to content

Commit

Permalink
Merge pull request #3 from adriantodt/bugfix/unterminatedUnicodeLiteral
Browse files Browse the repository at this point in the history
Bugfix: unterminated unicode literal
  • Loading branch information
AdrianTodt authored Nov 13, 2021
2 parents a7f5975 + 49acc5a commit ec018ea
Show file tree
Hide file tree
Showing 5 changed files with 151 additions and 103 deletions.
2 changes: 1 addition & 1 deletion build.gradle.kts
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ kotlin {
all { languageSettings.optIn("kotlin.RequiresOptIn") }
val commonMain by getting {
dependencies {
api("com.github.adriantodt:tartar:3.1.0")
api("com.github.adriantodt:tartar:3.2.0")
api("com.squareup.okio:okio:3.0.0")
}
}
Expand Down
167 changes: 81 additions & 86 deletions src/commonMain/kotlin/com/github/adriantodt/lin/lexer/LinLexer.kt
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ import com.github.adriantodt.tartar.api.dsl.CharPredicate
import com.github.adriantodt.tartar.api.lexer.Lexer
import com.github.adriantodt.tartar.api.parser.Token
import com.github.adriantodt.tartar.extensions.LexicalNumber
import com.github.adriantodt.tartar.extensions.makeToken
import com.github.adriantodt.tartar.extensions.processToken
import com.github.adriantodt.tartar.extensions.readNumber
import com.github.adriantodt.tartar.extensions.readString

Expand All @@ -15,97 +15,92 @@ internal fun linStdLexer() = Lexer.create<LinToken> {
' ' { while (hasNext()) if (!match(' ')) break }
'\r' {
match('\n')
process(makeToken(NL))
processToken(NL)
}
'\n' { process(makeToken(NL)) }
'{' { process(makeToken(L_BRACE)) }
'}' { process(makeToken(R_BRACE)) }
'(' { process(makeToken(L_PAREN)) }
')' { process(makeToken(R_PAREN)) }
'[' { process(makeToken(L_BRACKET)) }
']' { process(makeToken(R_BRACKET)) }
".." { process(makeToken(RANGE, 2)) }
'.' { process(makeToken(DOT)) }
',' { process(makeToken(COMMA)) }
"::" { process(makeToken(DOUBLE_COLON, 2)) }
':' { process(makeToken(COLON)) }
';' { process(makeToken(SEMICOLON)) }
'\u037E' { process(makeToken(SEMICOLON)) } // Greek question mark
"+=" { process(makeToken(PLUS_ASSIGN, 2)) }
"++" { process(makeToken(INCREMENT, 2)) }
'+' { process(makeToken(PLUS)) }
"->" { process(makeToken(ARROW, 2)) }
"-=" { process(makeToken(MINUS_ASSIGN, 2)) }
"--" { process(makeToken(DECREMENT, 2)) }
'-' { process(makeToken(MINUS)) }
"%=" { process(makeToken(REM_ASSIGN, 2)) }
'%' { process(makeToken(REM)) }
"*=" { process(makeToken(ASTERISK_ASSIGN, 2)) }
'*' { process(makeToken(ASTERISK)) }
'\n' { processToken(NL) }
'{' { processToken(L_BRACE) }
'}' { processToken(R_BRACE) }
'(' { processToken(L_PAREN) }
')' { processToken(R_PAREN) }
'[' { processToken(L_BRACKET) }
']' { processToken(R_BRACKET) }
".." { processToken(RANGE, 2) }
'.' { processToken(DOT) }
',' { processToken(COMMA) }
"::" { processToken(DOUBLE_COLON, 2) }
':' { processToken(COLON) }
';' { processToken(SEMICOLON) }
'\u037E' { processToken(SEMICOLON) } // Greek question mark
"+=" { processToken(PLUS_ASSIGN, 2) }
"++" { processToken(INCREMENT, 2) }
'+' { processToken(PLUS) }
"->" { processToken(ARROW, 2) }
"-=" { processToken(MINUS_ASSIGN, 2) }
"--" { processToken(DECREMENT, 2) }
'-' { processToken(MINUS) }
"%=" { processToken(REM_ASSIGN, 2) }
'%' { processToken(REM) }
"*=" { processToken(ASTERISK_ASSIGN, 2) }
'*' { processToken(ASTERISK) }
"//" { while (hasNext()) if (next() == '\n') break }
"/*" { while (hasNext()) if (next() == '*' && match('/')) break }
"/=" { process(makeToken(SLASH_ASSIGN, 2)) }
'/' { process(makeToken(SLASH)) }
'\\' { process(makeToken(BACKSLASH)) }
"!=" { process(makeToken(NEQ, 2)) }
"!!" { process(makeToken(DOUBLE_BANG, 2)) }
'!' { process(makeToken(BANG)) }
"?:" { process(makeToken(ELVIS)) }
"?." { process(makeToken(QUESTION_DOT, 2)) }
'?' { process(makeToken(QUESTION)) }
"==" { process(makeToken(EQ, 2)) }
'=' { process(makeToken(ASSIGN)) }
"||" { process(makeToken(OR, 2)) }
"|" { process(makeToken(PIPE)) }
"&&" { process(makeToken(AND, 2)) }
"&" { process(makeToken(AMP)) }
"<=" { process(makeToken(LTE, 2)) }
'<' { process(makeToken(LT)) }
">=" { process(makeToken(GTE, 2)) }
'>' { process(makeToken(GT)) }
'\'' { readLinTemplateString(it.toString(), false) }
"\"\"\"" { readLinTemplateString(it.toString(), true) }
"\"\"" { process(makeToken(STRING, 2)) }
'"' { readLinTemplateString(it.toString(), false) }
"`" { process(makeToken(IDENTIFIER, readString(it))) }
"/=" { processToken(SLASH_ASSIGN, 2) }
'/' { processToken(SLASH) }
'\\' { processToken(BACKSLASH) }
"!=" { processToken(NEQ, 2) }
"!!" { processToken(DOUBLE_BANG, 2) }
'!' { processToken(BANG) }
"?:" { processToken(ELVIS, 2) } // two-character operator: pass its length, as the other 2-char rules do
"?." { processToken(QUESTION_DOT, 2) }
'?' { processToken(QUESTION) }
"==" { processToken(EQ, 2) }
'=' { processToken(ASSIGN) }
"||" { processToken(OR, 2) }
"|" { processToken(PIPE) }
"&&" { processToken(AND, 2) }
"&" { processToken(AMP) }
"<=" { processToken(LTE, 2) }
'<' { processToken(LT) }
">=" { processToken(GTE, 2) }
'>' { processToken(GT) }
'\'' { readLinTemplateString("'", false) }
"\"\"\"" { readLinTemplateString("\"\"\"", true) }
"\"\"" { processToken(STRING, 2) }
'"' { readLinTemplateString("\"", false) }
"`" { processToken(IDENTIFIER, readString(it), offset = 2) }
matching(CharPredicate.isDigit).configure {
process(
when (val n = readNumber(it)) {
is LexicalNumber.Invalid -> makeToken(INVALID, n.string)
is LexicalNumber.Decimal -> makeToken(DECIMAL, n.value.toString())
is LexicalNumber.Integer -> makeToken(INTEGER, n.value.toString())
}
)
when (val n = readNumber(it)) {
is LexicalNumber.Invalid -> processToken(INVALID, n.string)
is LexicalNumber.Decimal -> processToken(DECIMAL, n.value.toString(), n.string.length)
is LexicalNumber.Integer -> processToken(INTEGER, n.value.toString(), n.string.length)
}
}
matching { it.isLetter() || it == '_' || it == '@' }.configure {
process(
when (val s = readLinIdentifier(it)) {
"break" -> makeToken(BREAK, 5)
"continue" -> makeToken(CONTINUE, 8)
"do" -> makeToken(DO, 2)
"else" -> makeToken(ELSE, 4)
"false" -> makeToken(FALSE, 4)
"for" -> makeToken(FOR, 3)
"fun" -> makeToken(FUN, 3)
"if" -> makeToken(IF, 2)
"in" -> makeToken(IN, 2)
"is" -> makeToken(IS, 2)
"null" -> makeToken(NULL, 4)
"return" -> makeToken(RETURN, 6)
"this" -> makeToken(THIS, 4)
"throw" -> makeToken(THROW, 5)
"true" -> makeToken(TRUE, 4)
"try" -> makeToken(TRY, 3)
"typeof" -> makeToken(TYPEOF, 6)
//"unit" -> makeToken(UNIT, 4)
"val" -> makeToken(VAL, 3)
"var" -> makeToken(VAR, 3)
"when" -> makeToken(WHEN, 4)
"while" -> makeToken(WHILE, 5)
when (val s = readLinIdentifier(it)) {
"break" -> processToken(BREAK, 5)
"continue" -> processToken(CONTINUE, 8)
"do" -> processToken(DO, 2)
"else" -> processToken(ELSE, 4)
"false" -> processToken(FALSE, 5) // "false" is five characters long, so the token spans 5
"for" -> processToken(FOR, 3)
"fun" -> processToken(FUN, 3)
"if" -> processToken(IF, 2)
"in" -> processToken(IN, 2)
"is" -> processToken(IS, 2)
"null" -> processToken(NULL, 4)
"return" -> processToken(RETURN, 6)
"this" -> processToken(THIS, 4)
"throw" -> processToken(THROW, 5)
"true" -> processToken(TRUE, 4)
"try" -> processToken(TRY, 3)
"typeof" -> processToken(TYPEOF, 6)
"val" -> processToken(VAL, 3)
"var" -> processToken(VAR, 3)
"when" -> processToken(WHEN, 4)
"while" -> processToken(WHILE, 5)

else -> makeToken(IDENTIFIER, s)
}
)
else -> processToken(IDENTIFIER, s)
}
}
configure { process(makeToken(INVALID, next().toString())) }
configure { processToken(INVALID, next().toString()) }
}
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
package com.github.adriantodt.lin.lexer

import com.github.adriantodt.tartar.api.lexer.LexerContext
import com.github.adriantodt.tartar.api.lexer.Section
import com.github.adriantodt.tartar.api.parser.SyntaxException
import com.github.adriantodt.tartar.extensions.makeToken
import com.github.adriantodt.tartar.extensions.processToken
import com.github.adriantodt.tartar.extensions.section

fun LexerContext<*>.readLinIdentifier(firstChar: Char? = null): String {
Expand Down Expand Up @@ -60,6 +61,9 @@ fun LexerContext<*>.readLinString(delimiter: Char): String {


fun LexerContext<LinToken>.readLinTemplateString(delim: String, raw: Boolean) {
val absoluteStart = index - delim.length
var sectionOffset = delim.length
var start = index
val buf = StringBuilder()
var eol = false

Expand All @@ -71,13 +75,16 @@ fun LexerContext<LinToken>.readLinTemplateString(delim: String, raw: Boolean) {
if (peek() == '{') {
next()

process(makeToken(TokenType.STRING, buf.toString()))
process(makeToken(TokenType.PLUS))
processToken(
TokenType.STRING, buf.toString(), index - start, index - start - 2, sectionOffset
)
processToken(TokenType.PLUS)
sectionOffset = 0
buf.clear()

var braces = 0

process(makeToken(TokenType.L_PAREN))
processToken(TokenType.L_PAREN)

while (hasNext()) {
val cc = peek()
Expand All @@ -94,27 +101,32 @@ fun LexerContext<LinToken>.readLinTemplateString(delim: String, raw: Boolean) {
parseOnce().forEach(this::process)
}

process(makeToken(TokenType.R_PAREN))
process(makeToken(TokenType.PLUS))
start = index
processToken(TokenType.R_PAREN)
processToken(TokenType.PLUS)
} else if (peek().isLetter()) {
process(makeToken(TokenType.STRING, buf.toString()))
process(makeToken(TokenType.PLUS))
processToken(
TokenType.STRING, buf.toString(), index - start, index - start - 1, sectionOffset
)
processToken(TokenType.PLUS)
sectionOffset = 0
buf.clear()

buf.append(next())

while (hasNext() && peek().isLetterOrDigit()) {
buf.append(next())
}
start = index

process(makeToken(TokenType.IDENTIFIER, buf.toString()))
processToken(TokenType.IDENTIFIER, buf.toString())
buf.clear()

process(makeToken(TokenType.PLUS))
processToken(TokenType.PLUS, 0)
} else {
buf.append(next())
}
} else if (c == '\\' && raw) {
} else if (c == '\\' && !raw) {
next()
if (!hasNext()) break
when (next()) {
Expand All @@ -127,11 +139,16 @@ fun LexerContext<LinToken>.readLinTemplateString(delim: String, raw: Boolean) {
'\\' -> buf.append('\\')
'u' -> {
val u = peekString(4)
if (u.length != 4) throw IllegalStateException("File terminated before escaping")
buf.append(u.toIntOrNull(16)?.toChar() ?: throw IllegalStateException("Illegal unicode escaping"))
if (u.length != 4) {
throw SyntaxException("File terminated before escaping", section(2, u.length + 2))
}
buf.append(
u.toIntOrNull(16)?.toChar()
?: throw SyntaxException("Illegal unicode escaping", section(2, 6))
)
nextString(4)
}
else -> throw IllegalStateException("Unknown escaping")
else -> throw SyntaxException("Unknown escaping", section(2))
}
} else if (this.peekString(delim.length) == delim) {
this.nextString(delim.length)
Expand All @@ -144,8 +161,8 @@ fun LexerContext<LinToken>.readLinTemplateString(delim: String, raw: Boolean) {
}

if (!eol) {
throw IllegalStateException("Unterminated string")
throw SyntaxException("Unterminated string", Section(source, absoluteStart, index - absoluteStart))
}

process(makeToken(TokenType.STRING, buf.toString(), 2))
processToken(TokenType.STRING, buf.toString(), index - start, index - start, sectionOffset)
}
Empty file.
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
package com.github.adriantodt.lin.test.regression.lexer

import com.github.adriantodt.lin.Lin
import com.github.adriantodt.tartar.api.lexer.Source
import com.github.adriantodt.tartar.api.parser.SyntaxException
import kotlin.test.*

/**
 * Regression tests for lexer failures on malformed string literals.
 *
 * Each case feeds a broken snippet to the Lin lexer and checks that lexing fails with a
 * [SyntaxException] whose section covers the expected slice of the source text.
 */
class LexerRegressionTests {
    /**
     * Lexes [snippet] as a source named [fileName], requires the lexer to fail with a
     * [SyntaxException] carrying a non-null section, and returns that section's text.
     */
    private fun failingSectionText(snippet: String, fileName: String): String {
        val failure = assertFailsWith<SyntaxException> {
            Lin.parser.lexer.parseToList(Source(snippet, fileName))
        }
        return assertNotNull(failure.section).substring
    }

    /** A `\u` escape cut short by the end of input is reported over the partial escape. */
    @Test
    fun unterminatedUnicodeLiteral() {
        val snippet = """
            "\u123
        """.trimIndent()

        assertEquals("\\u123", failingSectionText(snippet, "unterminatedUnicodeLiteral.lin"))
    }

    /** A string with no closing delimiter is reported from its opening quote onwards. */
    @Test
    fun unterminatedString() {
        val snippet = """
            "ABC
        """.trimIndent()

        assertEquals("\"ABC", failingSectionText(snippet, "unterminatedString.lin"))
    }
}

0 comments on commit ec018ea

Please sign in to comment.