Skip to content

Commit

Permalink
V2 Parser: Add scanner (#340)
Browse files Browse the repository at this point in the history
  • Loading branch information
kubukoz authored Oct 4, 2023
1 parent ea3df7c commit 7d9e6b1
Show file tree
Hide file tree
Showing 9 changed files with 816 additions and 6 deletions.
2 changes: 1 addition & 1 deletion modules/ast/src/test/scala/playground/Assertions.scala
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ object Assertions extends Expectations.Helpers {
val stringWithResets = d.show()(conf).linesWithSeparators.map(Console.RESET + _).mkString

failure(
s"Diff failed:\n${Console.RESET}(${conf.right("expected")}, ${conf.left("actual")})\n\n" + stringWithResets
s"Diff failed:\n${Console.RESET}(${conf.left("expected")}, ${conf.right("actual")})\n\n" + stringWithResets
)
}

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,247 @@
package playground.smithyql.parser.v2.scanner

import cats.kernel.Eq
import cats.parse.Numbers
import cats.syntax.all.*

import scala.annotation.nowarn

// A single lexical token: its kind paired with the exact source text it was
// scanned from. Concatenating the `text` of all tokens produced by the
// scanner reproduces the original input (see Scanner.scan).
case class Token(
kind: TokenKind,
text: String,
) {
// Width in characters of this token's source text.
def width: Int = text.length
}

object Token {
// Structural equality instance so tokens can be compared with cats syntax
// (e.g. `===` in tests); case-class equals is sufficient here.
implicit val eq: Eq[Token] = Eq.fromUniversalEquals
}

// The category of a token. Kinds are singleton objects (see the companion);
// applying a kind to a string is a shorthand for constructing a Token.
sealed trait TokenKind extends Product with Serializable {

// Builds a Token of this kind carrying the given source text.
def apply(
text: String
): Token = Token(this, text)

}

object TokenKind {
// Keywords and literals.
case object KW_USE extends TokenKind
case object KW_SERVICE extends TokenKind
case object KW_BOOLEAN extends TokenKind
case object LIT_NUMBER extends TokenKind
case object LIT_STRING extends TokenKind
case object KW_NULL extends TokenKind

// Punctuation. LB/RB are square brackets, LBR/RBR are curly braces.
case object DOT extends TokenKind
case object COMMA extends TokenKind
case object HASH extends TokenKind
case object LB extends TokenKind
case object RB extends TokenKind
case object LBR extends TokenKind
case object RBR extends TokenKind
case object COLON extends TokenKind
case object EQ extends TokenKind
// Trivia: runs of whitespace/newlines and comments are kept as tokens so the
// token stream renders back to the original text.
case object SPACE extends TokenKind
case object NEWLINE extends TokenKind
case object IDENT extends TokenKind
case object COMMENT extends TokenKind
// A run of characters that no other rule matched; the scanner never fails,
// it emits these instead.
case object Error extends TokenKind

implicit val eq: Eq[TokenKind] = Eq.fromUniversalEquals
}

object Scanner {

/** Entrypoint to scanning text into tokens.
*
* Always produces an output that can be rendered back to the original text:
* every character of the input ends up in exactly one token, including
* whitespace, comments, and unrecognized characters (emitted as Error tokens).
* Scanning therefore never fails.
*/
def scan(
s: String
): List[Token] = {
// Mutable cursor: the not-yet-consumed suffix of the input. All readers
// below match against this and advance it when they succeed.
var remaining = s
// Tokens accumulate in reverse (cheap prepend); reversed once at the end.
var tokens = List.empty[Token]
def add(
tok: Token
) = tokens ::= tok

// Matcher for a fixed string at the start of `remaining`: emits `tok` and
// advances past it.
def readSimple(
token: String,
tok: TokenKind,
): PartialFunction[Unit, Unit] = {
case _ if remaining.startsWith(token) =>
add(tok(token.toString))
remaining = remaining.drop(token.length())
}

// Combines several fixed-string matchers into one alternative; earlier
// pairings take precedence.
def simpleTokens(
pairings: (
String,
TokenKind,
)*
): PartialFunction[Unit, Unit] = pairings.map(readSimple.tupled).reduce(_.orElse(_))

// Reserved words that would otherwise scan as plain identifiers.
val keywords = Map(
"use" -> TokenKind.KW_USE,
"service" -> TokenKind.KW_SERVICE,
"null" -> TokenKind.KW_NULL,
"true" -> TokenKind.KW_BOOLEAN,
"false" -> TokenKind.KW_BOOLEAN,
)

// Scans an identifier: a leading letter followed by letters, digits or
// underscores. Keywords are recognized here (maximal munch, then lookup),
// so e.g. "used" scans as one IDENT, not KW_USE + "d".
val readIdent: PartialFunction[Unit, Unit] = {
case _ if remaining.head.isLetter =>
val (letters, rest) = remaining.span(ch => ch.isLetterOrDigit || ch == '_')

keywords.get(letters) match {
case Some(kind) =>
// we matched a keyword, return it.
add(kind(letters))

case None =>
// normal ident
add(TokenKind.IDENT(letters))
}

remaining = rest
}

// Single-character punctuation tokens.
val readPunctuation: PartialFunction[Unit, Unit] = simpleTokens(
"." -> TokenKind.DOT,
"," -> TokenKind.COMMA,
"#" -> TokenKind.HASH,
"[" -> TokenKind.LB,
"]" -> TokenKind.RB,
"{" -> TokenKind.LBR,
"}" -> TokenKind.RBR,
":" -> TokenKind.COLON,
"=" -> TokenKind.EQ,
)

// Scans a double-quoted string literal. No escape sequences are handled:
// the literal ends at the first '"' after the opening quote. An
// unterminated literal (EOF before the closing quote) is still emitted as
// LIT_STRING so the output renders back to the input.
val readStringLiteral: PartialFunction[Unit, Unit] = {
case _ if remaining.startsWith("\"") =>
val (str, rest) = remaining.tail.span(_ != '\"')
if (rest.isEmpty) { // hit EOF
add(TokenKind.LIT_STRING("\"" + str))
remaining = rest
} else {
add(TokenKind.LIT_STRING("\"" + str + "\""))
remaining = rest.tail
}
}

// Scans a JSON-style number literal by delegating to cats-parse.
val readNumberLiteral: PartialFunction[Unit, Unit] = {
// I love this language
// Extractor wrapping the cats-parse number parser: on success, yields the
// unconsumed rest of the input and the matched number text (in that order,
// as returned by Parser#parse).
object jsonNumber {
def unapply(
@nowarn("cat=unused")
unused: Unit
): Option[
(
String,
String,
)
] =
// For now, we're using the cats-parse implementation simply because it's consistent with the current implementation
// and we can rewrite this later on when we drop support for the other parser
// and no longer need cats-parse.
Numbers.jsonNumber.parse(remaining).toOption
}

{ case jsonNumber(rest, num) =>
add(TokenKind.LIT_NUMBER(num.toString))
remaining = rest
}
}

// readOne and friends are all partial functions: this is the current implementation of lookahead.
// it's not great, but it kinda works.
// Order matters: idents (and keywords) first, then punctuation, strings,
// numbers. Whitespace, comments and errors are handled separately below.
val readOne: PartialFunction[Unit, Unit] = readIdent
.orElse(readPunctuation)
.orElse(readStringLiteral)
.orElse(readNumberLiteral)

// split "whitespace" string into chains of contiguous newlines OR whitespace characters.
// e.g. "  \n\n " becomes SPACE("  "), NEWLINE("\n\n"), SPACE(" ").
def whitespaceChains(
whitespace: String
): List[Token] = {
val isNewline = (ch: Char) => ch == '\n'

if (whitespace.isEmpty)
Nil
else if (isNewline(whitespace.head)) {
val (nl, rest) = whitespace.span(isNewline)
TokenKind.NEWLINE(nl) :: whitespaceChains(rest)
} else {
val (wsp, rest) = whitespace.span(!isNewline(_))
TokenKind.SPACE(wsp) :: whitespaceChains(rest)
}
}

// Consumes a leading whitespace run (if any) into SPACE/NEWLINE tokens.
// Returns whether any progress was made.
def eatWhitespace(
) = {
val (wsp, rest) = remaining.span(ch => ch.isWhitespace)
if (wsp.isEmpty())
false
else {
whitespaceChains(wsp).foreach(add)
remaining = rest

true
}
}

// Consumes consecutive "//" line comments (each up to, but excluding, its
// newline — the newline is picked up by eatWhitespace on the next pass).
// NOTE(review): consecutive comment lines separated only by '\n' won't both
// be consumed by the inner while loop (the '\n' blocks the startsWith) —
// the outer scan loop handles them on subsequent iterations. Returns
// whether any progress was made.
def eatComments(
) =
if (!remaining.startsWith("//"))
false
else {
while (remaining.startsWith("//")) {
val (comment, rest) = remaining.span(_ != '\n')
add(TokenKind.COMMENT(comment))
remaining = rest
}

true
}

// Consumes characters until any other rule would match, emitting them as a
// single Error token. The span predicate deliberately mutates `remaining`
// as it goes: `failures` is the prefix of the original `remaining` that was
// rejected, and `remaining` has already been advanced past it.
def eatErrors(
) = {
// todo: bug: even if the next character starts a multi-char token, this will consider it an error.
// instead, we should rework "readOne" to consume arbitrary constant-length tokens, and also include the possibility that `rest` has comments or whitespace.
val (failures, _) = remaining.span { _ =>
if (readOne.isDefinedAt(()))
// this will match. stop!
false
else {
// didn't match. We need to move the cursor manually here
remaining = remaining.tail
true
}
}

if (failures.nonEmpty) {
add(TokenKind.Error(failures))
true
} else
false
}

// Main loop: try each consumer in order until one makes progress. The `||`
// short-circuit ensures at most one consumer runs per iteration; eatErrors
// always succeeds when reached with nonempty input, so the loop advances.
while (remaining.nonEmpty) {
val last = remaining

readOne.lift(()).isDefined ||
eatWhitespace() ||
eatComments() ||
eatErrors(): Unit

// last-effort sanity check
if (remaining == last)
sys.error(s"no progress in the last run! remaining string: $remaining")
}

tokens.reverse
}

}
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@ import io.circe.Decoder
import io.circe.syntax._
import playground.Assertions._
import playground.smithyql._
import playground.smithyql.parser.v2.scanner.Scanner
import playground.smithyql.parser.v2.scanner.TokenKind
import weaver._

import java.nio.file
Expand Down Expand Up @@ -52,11 +54,30 @@ trait ParserSuite extends SimpleIOSuite {
}
}
}

validTokensTest(testCase, trimWhitespace)
}

// Registers a test asserting that the v2 scanner tokenizes this test case's
// input without producing any Error tokens, and that non-empty input yields
// at least one token.
private def validTokensTest(
testCase: TestCase,
trimWhitespace: Boolean,
) =
test(testCase.name + " (v2 scanner)") {
testCase.readInput(trimWhitespace).map { input =>
val scanned = Scanner.scan(input)

val errors = scanned.filter(_.kind == TokenKind.Error)
// non-empty inputs should parse to non-empty outputs
assert(input.isEmpty || scanned.nonEmpty) &&
assert(errors.isEmpty)
}
}

// invalidTokens: a flag that tells the suite whether the file should contain invalid tokens.
def loadNegativeParserTests[Alg[_[_]]: SourceParser](
prefix: String,
trimWhitespace: Boolean = false,
invalidTokens: Boolean,
): Unit = loadTestCases("", List("negative", prefix)).foreach { testCase =>
test(testCase.name) {
testCase.readInput(trimWhitespace).map { input =>
Expand All @@ -66,6 +87,10 @@ trait ParserSuite extends SimpleIOSuite {
}
}
}

if (!invalidTokens)
validTokensTest(testCase, trimWhitespace)

}

private def readText(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,5 +4,5 @@ import playground.smithyql.Prelude
import playground.smithyql.parser.ParserSuite

object PreludeParserNegativeTests extends ParserSuite {
// NOTE(review): the two calls below are the before/after lines of a diff
// hunk captured by this page — only the second (with invalidTokens = false,
// i.e. the inputs fail parsing but still tokenize cleanly) is current code.
loadNegativeParserTests[Prelude]("prelude", trimWhitespace = true)
loadNegativeParserTests[Prelude]("prelude", trimWhitespace = true, invalidTokens = false)
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
package playground.smithyql.parser.v2

import com.softwaremill.diffx.Diff
import playground.smithyql.parser.v2.scanner.Token
import playground.smithyql.parser.v2.scanner.TokenKind

// diffx Diff instances for scanner types, used by test assertions to render
// structural diffs of token streams on failure.
object Diffs {

implicit val tokenKindDiff: Diff[TokenKind] = Diff.derived
implicit val tokenDiff: Diff[Token] = Diff.derived

}
Loading

0 comments on commit 7d9e6b1

Please sign in to comment.