Skip to content

Commit

Permalink
V2 Parser: Add scanner (#340)
Browse files Browse the repository at this point in the history
  • Loading branch information
kubukoz authored Oct 4, 2023
1 parent ea3df7c commit 7d9e6b1
Show file tree
Hide file tree
Showing 9 changed files with 816 additions and 6 deletions.
2 changes: 1 addition & 1 deletion modules/ast/src/test/scala/playground/Assertions.scala
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ object Assertions extends Expectations.Helpers {
val stringWithResets = d.show()(conf).linesWithSeparators.map(Console.RESET + _).mkString

failure(
s"Diff failed:\n${Console.RESET}(${conf.right("expected")}, ${conf.left("actual")})\n\n" + stringWithResets
s"Diff failed:\n${Console.RESET}(${conf.left("expected")}, ${conf.right("actual")})\n\n" + stringWithResets
)
}

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,247 @@
package playground.smithyql.parser.v2.scanner

import cats.kernel.Eq
import cats.parse.Numbers
import cats.syntax.all.*

import scala.annotation.nowarn

// A single lexical token: its kind paired with the exact source text it was
// scanned from. Concatenating the `text` of all tokens produced by the
// scanner reproduces the original input (see Scanner.scan).
case class Token(
kind: TokenKind,
text: String,
) {
// Width in characters of this token's source text.
def width: Int = text.length
}

object Token {
// Structural equality instance so tokens can be compared with cats syntax
// (e.g. `===` in tests); case-class equals is sufficient here.
implicit val eq: Eq[Token] = Eq.fromUniversalEquals
}

// The category of a token. Kinds are singleton objects (see the companion);
// applying a kind to a string is a shorthand for constructing a Token.
sealed trait TokenKind extends Product with Serializable {

// Builds a Token of this kind carrying the given source text.
def apply(
text: String
): Token = Token(this, text)

}

object TokenKind {
// Keywords and literals.
case object KW_USE extends TokenKind
case object KW_SERVICE extends TokenKind
case object KW_BOOLEAN extends TokenKind
case object LIT_NUMBER extends TokenKind
case object LIT_STRING extends TokenKind
case object KW_NULL extends TokenKind

// Punctuation. LB/RB are square brackets, LBR/RBR are curly braces.
case object DOT extends TokenKind
case object COMMA extends TokenKind
case object HASH extends TokenKind
case object LB extends TokenKind
case object RB extends TokenKind
case object LBR extends TokenKind
case object RBR extends TokenKind
case object COLON extends TokenKind
case object EQ extends TokenKind
// Trivia: runs of whitespace/newlines and comments are kept as tokens so the
// token stream renders back to the original text.
case object SPACE extends TokenKind
case object NEWLINE extends TokenKind
case object IDENT extends TokenKind
case object COMMENT extends TokenKind
// A run of characters that no other rule matched; the scanner never fails,
// it emits these instead.
case object Error extends TokenKind

implicit val eq: Eq[TokenKind] = Eq.fromUniversalEquals
}

object Scanner {

/** Entrypoint to scanning text into tokens.
*
* Always produces an output that can be rendered back to the original text:
* every character of the input ends up in exactly one token, including
* whitespace, comments, and unrecognized characters (emitted as Error tokens).
* Scanning therefore never fails.
*/
def scan(
s: String
): List[Token] = {
// Mutable cursor: the not-yet-consumed suffix of the input. All readers
// below match against this and advance it when they succeed.
var remaining = s
// Tokens accumulate in reverse (cheap prepend); reversed once at the end.
var tokens = List.empty[Token]
def add(
tok: Token
) = tokens ::= tok

// Matcher for a fixed string at the start of `remaining`: emits `tok` and
// advances past it.
def readSimple(
token: String,
tok: TokenKind,
): PartialFunction[Unit, Unit] = {
case _ if remaining.startsWith(token) =>
add(tok(token.toString))
remaining = remaining.drop(token.length())
}

// Combines several fixed-string matchers into one alternative; earlier
// pairings take precedence.
def simpleTokens(
pairings: (
String,
TokenKind,
)*
): PartialFunction[Unit, Unit] = pairings.map(readSimple.tupled).reduce(_.orElse(_))

// Reserved words that would otherwise scan as plain identifiers.
val keywords = Map(
"use" -> TokenKind.KW_USE,
"service" -> TokenKind.KW_SERVICE,
"null" -> TokenKind.KW_NULL,
"true" -> TokenKind.KW_BOOLEAN,
"false" -> TokenKind.KW_BOOLEAN,
)

// Scans an identifier: a leading letter followed by letters, digits or
// underscores. Keywords are recognized here (maximal munch, then lookup),
// so e.g. "used" scans as one IDENT, not KW_USE + "d".
val readIdent: PartialFunction[Unit, Unit] = {
case _ if remaining.head.isLetter =>
val (letters, rest) = remaining.span(ch => ch.isLetterOrDigit || ch == '_')

keywords.get(letters) match {
case Some(kind) =>
// we matched a keyword, return it.
add(kind(letters))

case None =>
// normal ident
add(TokenKind.IDENT(letters))
}

remaining = rest
}

// Single-character punctuation tokens.
val readPunctuation: PartialFunction[Unit, Unit] = simpleTokens(
"." -> TokenKind.DOT,
"," -> TokenKind.COMMA,
"#" -> TokenKind.HASH,
"[" -> TokenKind.LB,
"]" -> TokenKind.RB,
"{" -> TokenKind.LBR,
"}" -> TokenKind.RBR,
":" -> TokenKind.COLON,
"=" -> TokenKind.EQ,
)

// Scans a double-quoted string literal. No escape sequences are handled:
// the literal ends at the first '"' after the opening quote. An
// unterminated literal (EOF before the closing quote) is still emitted as
// LIT_STRING so the output renders back to the input.
val readStringLiteral: PartialFunction[Unit, Unit] = {
case _ if remaining.startsWith("\"") =>
val (str, rest) = remaining.tail.span(_ != '\"')
if (rest.isEmpty) { // hit EOF
add(TokenKind.LIT_STRING("\"" + str))
remaining = rest
} else {
add(TokenKind.LIT_STRING("\"" + str + "\""))
remaining = rest.tail
}
}

// Scans a JSON-style number literal by delegating to cats-parse.
val readNumberLiteral: PartialFunction[Unit, Unit] = {
// I love this language
// Extractor wrapping the cats-parse number parser: on success, yields the
// unconsumed rest of the input and the matched number text (in that order,
// as returned by Parser#parse).
object jsonNumber {
def unapply(
@nowarn("cat=unused")
unused: Unit
): Option[
(
String,
String,
)
] =
// For now, we're using the cats-parse implementation simply because it's consistent with the current implementation
// and we can rewrite this later on when we drop support for the other parser
// and no longer need cats-parse.
Numbers.jsonNumber.parse(remaining).toOption
}

{ case jsonNumber(rest, num) =>
add(TokenKind.LIT_NUMBER(num.toString))
remaining = rest
}
}

// readOne and friends are all partial functions: this is the current implementation of lookahead.
// it's not great, but it kinda works.
// Order matters: idents (and keywords) first, then punctuation, strings,
// numbers. Whitespace, comments and errors are handled separately below.
val readOne: PartialFunction[Unit, Unit] = readIdent
.orElse(readPunctuation)
.orElse(readStringLiteral)
.orElse(readNumberLiteral)

// split "whitespace" string into chains of contiguous newlines OR whitespace characters.
// e.g. "  \n\n " becomes SPACE("  "), NEWLINE("\n\n"), SPACE(" ").
def whitespaceChains(
whitespace: String
): List[Token] = {
val isNewline = (ch: Char) => ch == '\n'

if (whitespace.isEmpty)
Nil
else if (isNewline(whitespace.head)) {
val (nl, rest) = whitespace.span(isNewline)
TokenKind.NEWLINE(nl) :: whitespaceChains(rest)
} else {
val (wsp, rest) = whitespace.span(!isNewline(_))
TokenKind.SPACE(wsp) :: whitespaceChains(rest)
}
}

// Consumes a leading whitespace run (if any) into SPACE/NEWLINE tokens.
// Returns whether any progress was made.
def eatWhitespace(
) = {
val (wsp, rest) = remaining.span(ch => ch.isWhitespace)
if (wsp.isEmpty())
false
else {
whitespaceChains(wsp).foreach(add)
remaining = rest

true
}
}

// Consumes consecutive "//" line comments (each up to, but excluding, its
// newline — the newline is picked up by eatWhitespace on the next pass).
// NOTE(review): consecutive comment lines separated only by '\n' won't both
// be consumed by the inner while loop (the '\n' blocks the startsWith) —
// the outer scan loop handles them on subsequent iterations. Returns
// whether any progress was made.
def eatComments(
) =
if (!remaining.startsWith("//"))
false
else {
while (remaining.startsWith("//")) {
val (comment, rest) = remaining.span(_ != '\n')
add(TokenKind.COMMENT(comment))
remaining = rest
}

true
}

// Consumes characters until any other rule would match, emitting them as a
// single Error token. The span predicate deliberately mutates `remaining`
// as it goes: `failures` is the prefix of the original `remaining` that was
// rejected, and `remaining` has already been advanced past it.
def eatErrors(
) = {
// todo: bug: even if the next character starts a multi-char token, this will consider it an error.
// instead, we should rework "readOne" to consume arbitrary constant-length tokens, and also include the possibility that `rest` has comments or whitespace.
val (failures, _) = remaining.span { _ =>
if (readOne.isDefinedAt(()))
// this will match. stop!
false
else {
// didn't match. We need to move the cursor manually here
remaining = remaining.tail
true
}
}

if (failures.nonEmpty) {
add(TokenKind.Error(failures))
true
} else
false
}

// Main loop: try each consumer in order until one makes progress. The `||`
// short-circuit ensures at most one consumer runs per iteration; eatErrors
// always succeeds when reached with nonempty input, so the loop advances.
while (remaining.nonEmpty) {
val last = remaining

readOne.lift(()).isDefined ||
eatWhitespace() ||
eatComments() ||
eatErrors(): Unit

// last-effort sanity check
if (remaining == last)
sys.error(s"no progress in the last run! remaining string: $remaining")
}

tokens.reverse
}

}
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@ import io.circe.Decoder
import io.circe.syntax._
import playground.Assertions._
import playground.smithyql._
import playground.smithyql.parser.v2.scanner.Scanner
import playground.smithyql.parser.v2.scanner.TokenKind
import weaver._

import java.nio.file
Expand Down Expand Up @@ -52,11 +54,30 @@ trait ParserSuite extends SimpleIOSuite {
}
}
}

validTokensTest(testCase, trimWhitespace)
}

// Registers a test asserting that the v2 scanner tokenizes this test case's
// input without producing any Error tokens, and that non-empty input yields
// at least one token.
private def validTokensTest(
testCase: TestCase,
trimWhitespace: Boolean,
) =
test(testCase.name + " (v2 scanner)") {
testCase.readInput(trimWhitespace).map { input =>
val scanned = Scanner.scan(input)

val errors = scanned.filter(_.kind == TokenKind.Error)
// non-empty inputs should parse to non-empty outputs
assert(input.isEmpty || scanned.nonEmpty) &&
assert(errors.isEmpty)
}
}

// invalidTokens: a flag that tells the suite whether the file should contain invalid tokens.
def loadNegativeParserTests[Alg[_[_]]: SourceParser](
prefix: String,
trimWhitespace: Boolean = false,
invalidTokens: Boolean,
): Unit = loadTestCases("", List("negative", prefix)).foreach { testCase =>
test(testCase.name) {
testCase.readInput(trimWhitespace).map { input =>
Expand All @@ -66,6 +87,10 @@ trait ParserSuite extends SimpleIOSuite {
}
}
}

if (!invalidTokens)
validTokensTest(testCase, trimWhitespace)

}

private def readText(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,5 +4,5 @@ import playground.smithyql.Prelude
import playground.smithyql.parser.ParserSuite

object PreludeParserNegativeTests extends ParserSuite {
// NOTE(review): the two calls below are the before/after lines of a diff
// hunk captured by this page — only the second (with invalidTokens = false,
// i.e. the inputs fail parsing but still tokenize cleanly) is current code.
loadNegativeParserTests[Prelude]("prelude", trimWhitespace = true)
loadNegativeParserTests[Prelude]("prelude", trimWhitespace = true, invalidTokens = false)
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
package playground.smithyql.parser.v2

import com.softwaremill.diffx.Diff
import playground.smithyql.parser.v2.scanner.Token
import playground.smithyql.parser.v2.scanner.TokenKind

// diffx Diff instances for scanner types, used by test assertions to render
// structural diffs of token streams on failure.
object Diffs {

implicit val tokenKindDiff: Diff[TokenKind] = Diff.derived
implicit val tokenDiff: Diff[Token] = Diff.derived

}
Loading

0 comments on commit 7d9e6b1

Please sign in to comment.