From 7b8561adf530e83fdca3037df3c8a3c7f333c1e0 Mon Sep 17 00:00:00 2001 From: frank Date: Tue, 24 Dec 2024 23:57:50 +0800 Subject: [PATCH] =?UTF-8?q?=E2=AC=86=EF=B8=8F=20UPDATE=20=E8=AF=AD?= =?UTF-8?q?=E6=B3=95=E5=88=86=E6=9E=90?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- 24.11/lex.mjs | 17 +- 24.11/parser.mjs | 641 +++++----- 24.11/parser_test1.mjs | 144 +++ 24.11/parser_test2.mjs | 160 +++ 24.11/parser_test3.mjs | 251 ++++ 24.11/parser_test4.mjs | 209 ++++ 24.11/parser_test5.mjs | 215 ++++ 24.11/parser_test6.mjs | 339 ++++++ 24.11/parser_test7.mjs | 411 +++++++ 24.11/parser_test8.mjs | 407 +++++++ 24.11/parser_test9.mjs | 419 +++++++ ...55\346\263\225\345\210\206\346\236\220.md" | 1045 ++++++++++++++++- 12 files changed, 3915 insertions(+), 343 deletions(-) create mode 100644 24.11/parser_test1.mjs create mode 100644 24.11/parser_test2.mjs create mode 100644 24.11/parser_test3.mjs create mode 100644 24.11/parser_test4.mjs create mode 100644 24.11/parser_test5.mjs create mode 100644 24.11/parser_test6.mjs create mode 100644 24.11/parser_test7.mjs create mode 100644 24.11/parser_test8.mjs create mode 100644 24.11/parser_test9.mjs diff --git a/24.11/lex.mjs b/24.11/lex.mjs index b8b6b892..8076189b 100644 --- a/24.11/lex.mjs +++ b/24.11/lex.mjs @@ -1,6 +1,6 @@ export const ASSIGN = 'ASSIGN', LPAREN = 'LPAREN', RPAREN = 'RPAREN', LBRACE = 'LBRACE', RBRACE = 'RBRACE', LBRACKET = 'LBRACKET', RBRACKET = 'RBRACKET', SEMICOLON = 'SEMICOLON', COMMA = 'COMMA', PLUS = 'PLUS', MINUS = 'MINUS', MULTIPLY = 'MULTIPLY', DIVIDE = 'DIVIDE', MODULUS = 'MODULUS', - POINT = 'POINT', + POINT = 'POINT', INCREMENT = 'INCREMENT', DECREMENT = 'DECREMENT', AND = 'AND', OR = 'OR', NOT = 'NOT', GT = 'GT', LT = 'LT', GTE = 'GTE', LTE = 'LTE', NEQ = 'NEQ', BAND = 'BAND', BOR = 'BOR', BXOR = 'BXOR', BNOT = 'BNOT', BSHL = 'BSHL', BSHR = 'BSHR'; @@ -49,8 +49,17 @@ export function lex(input) { case '}': tokens.push(new Token(RBRACE, '}')); 
position++; break; case '+': - tokens.push(new Token(PLUS, '+')); position++; break; + if (input[position + 1] == '+') { + tokens.push(new Token(INCREMENT, '++')); position += 2; break; + } else { + tokens.push(new Token(PLUS, '+')); position ++; break; + } case '-': + if (input[position + 1] == '-') { + tokens.push(new Token(DECREMENT, '--')); position += 2; break; + } else { + tokens.push(new Token(MINUS, '-')); position ++; break; + } tokens.push(new Token(MINUS, '-')); position++; break; case '*': tokens.push(new Token(MULTIPLY, '*')); position++; break; @@ -107,9 +116,9 @@ export function lex(input) { case '\t': case '\r': position++; break; - // 回车这里解析一下,因为想要支持js的弱判断 + // 回车忽略 case '\n': - tokens.push(new Token(NEW_LINE, '\n')); position++; break; + position++; break; case '\'': var start = position; while (true) { diff --git a/24.11/parser.mjs b/24.11/parser.mjs index fddf034a..e0e1a76b 100644 --- a/24.11/parser.mjs +++ b/24.11/parser.mjs @@ -1,215 +1,7 @@ -// var {lex, Token} = require('./lex.mjs'); - -// const ASSIGN = 'ASSIGN', LPAREN = 'LPAREN', RPAREN = 'RPAREN', LBRACE = 'LBRACE', RBRACE = 'RBRACE', LBRACKET = 'LBRACKET', RBRACKET = 'RBRACKET', -// SEMICOLON = 'SEMICOLON', COMMA = 'COMMA', PLUS = 'PLUS', MINUS = 'MINUS', MULTIPLY = 'MULTIPLY', DIVIDE = 'DIVIDE', MODULUS = 'MODULUS', -// POINT = 'POINT', -// AND = 'AND', OR = 'OR', NOT = 'NOT', GT = 'GT', LT = 'LT', GTE = 'GTE', LTE = 'LTE', NEQ = 'NEQ', -// BAND = 'BAND', BOR = 'BOR', BXOR = 'BXOR', BNOT = 'BNOT', BSHL = 'BSHL', BSHR = 'BSHR'; - -// const VAR = 'VAR', IDENTIFIER = 'IDENTIFIER', NUMBER = 'NUMBER', STRING = 'STRING', FUNCTION = 'FUNCTION', IF = 'IF', ELSE = 'ELSE', RETURN = 'RETURN', CONTINUE = 'CONTINUE', BREAK = 'BREAK',FOR = "for", WHILE = "while", NEW_LINE='NEW_LINE', EOF = 'EOF'; -// // ast节点 -// class Node { -// constructor(type, full) { -// this.type = type; -// this.full = full; -// } -// } - -// // identifier节点 -// class IdentifierNode extends Node { -// constructor(name) { -// 
super('Identifier', true); -// this.name = name; -// } -// } - -// // number节点 -// class NumberNode extends Node { -// constructor(value) { -// super('Number', true); -// this.value = value; -// } -// } - -// // string节点 -// class StringNode extends Node { -// constructor(value) { -// super('String', true); -// this.value = value; -// } -// } - -// // boolean节点 -// class BooleanNode extends Node { -// constructor(value) { -// super('Boolean', true); -// this.value = value; -// } -// } - -// // null节点 -// class NullNode extends Node { -// constructor() { -// super('Null', true); -// } -// } - -// // 前缀运算符节点 -// class PrefixOpratorNode extends Node { -// constructor(op, right) { -// super('PreOperator', false); -// this.op = op; -// this.right = right; -// } -// } - -// // 中缀运算符节点 -// class InfixOpratorNode extends Node { -// constructor( op, left, right) { -// super('InfixOperator', false); -// this.left = left; -// this.op = op; -// this.right = right; -// } -// } - -// class GroupNode extends Node { -// constructor(expression) { -// super('Group', false); -// this.expression = expression; -// } -// } -// const precedence = { -// '*': 2, -// '/': 2, -// '+': 1, -// '-': 1, -// } - -// class Parser { -// constructor(tokens) { -// this.tokens = tokens; -// this.pos = 0; -// } - - -// parse() { - -// } - -// parseExpression() { - -// // 遍历token,封装成AstNode节点放到数组中 -// var nodeArr = []; -// while (this.pos < this.tokens.length) { -// var item = this.tokens[this.pos++]; -// // 结束/换行符/分号 -// if (item.type === 'EOF' || item.type === 'SEMICOLON' || item.type === 'NEW_LINE') { -// break; -// } -// var node = null; -// if (item.type === 'IDENTIFIER') { -// node = new IdentifierNode(item.value); -// } else if (item.type === 'NUMBER') { -// node = new NumberNode(item.value); -// } else if (item.type === 'STRING') { -// node = new StringNode(item.value); -// } else if (item.type === 'BOOLEAN') { -// node = new BooleanNode(item.value); -// } else if (item.type === 'NULL') { -// 
node = new NullNode(); -// } else if (item.type === MULTIPLY || item.type === DIVIDE || item.type === PLUS || item.type === MINUS) { -// node = new InfixOpratorNode(item.value, null, null); -// } else { -// throw new Error('unexpected token:' + item.value); -// } -// nodeArr.push(node); -// } - -// // 遍历数组,找到优先级最高的运算符节点,将其左右放入到节点中 -// while (true) { -// var maxPrecedence = -1, maxIndex = -1; -// for (var i=0; i maxPrecedence && (maxPrecedence = precedence[node.op]) && (maxIndex = i); -// } -// } -// } -// if (maxIndex > 0) { -// var node = nodeArr[maxIndex]; -// var pre = nodeArr[maxIndex - 1]; -// var next = nodeArr[maxIndex + 1]; -// node.left = pre; -// node.right = next; -// node.full = true; -// nodeArr.splice(maxIndex - 1, 3, nodeArr[maxIndex]); -// } else { -// break; -// } -// } -// return nodeArr[0]; -// } - -// } - - - -// var tokens = lex("1 +2*3/4 -5") - -// var parser = new Parser(tokens); - -// console.log(JSON.stringify(parser.parseExpression(),0,2)); -// var res = { -// "type": "InfixOperator", -// "full": true, -// "left": { -// "type": "InfixOperator", -// "full": true, -// "left": { -// "type": "Number", -// "full": true, -// "value": "1" -// }, -// "op": "+", -// "right": { -// "type": "InfixOperator", -// "full": true, -// "left": { -// "type": "InfixOperator", -// "full": true, -// "left": { -// "type": "Number", -// "full": true, -// "value": "2" -// }, -// "op": "*", -// "right": { -// "type": "Number", -// "full": true, -// "value": "3" -// } -// }, -// "op": "/", -// "right": { -// "type": "Number", -// "full": true, -// "value": "4" -// } -// } -// }, -// "op": "-", -// "right": { -// "type": "Number", -// "full": true, -// "value": "5" -// } -// } - import * as LEX from "./lex.mjs"; import { lex, Token } from './lex.mjs'; +/*******************Sentence声明开始,有三种*******************/ class Sentence { constructor(type) { this.endPos = -1; @@ -218,8 +10,6 @@ class Sentence { } } } -class Expresstion {} - class VarSentence extends Sentence { 
constructor(name, value, endPos) { super("VAR"); @@ -227,6 +17,10 @@ class VarSentence extends Sentence { this.value = value; // 这里的value是个表达式 this.endPos = endPos; } + + toString() { + return `var ${this.name} = ${this.value.toString()}`; + } } class ReturnSentence extends Sentence { @@ -235,6 +29,23 @@ class ReturnSentence extends Sentence { this.value = value; // 这里的value也是表达式 this.endPos = endPos; } + + toString() { + return `return ${this.value.toString()}`; + } +} + +class BlockSentence extends Sentence { + constructor(sentences, endPos) { + super("BLOCK"); + this.sentences = sentences; + this.endPos = endPos; + } + toString() { + return `{ + ${this.sentences.map(it=>it.toString()).join('\n')} +}` + } } class ExpressionStatement extends Sentence { @@ -243,22 +54,153 @@ class ExpressionStatement extends Sentence { this.expression = expression; // 这里的expression也是表达式 this.endPos = endPos; } + + toString() { + return this.expression.toString(); + } +} +/*******************Sentence声明结束*******************/ + +/*******************表达式Ast节点声明开始*******************/ +class AstNode { +} + +class NumberAstNode extends AstNode { + constructor(token) { + super(); + this.token = token; + } + + toString() { + return this.token.value; + } +} + +class IdentifierAstNode extends AstNode { + constructor(token) { + super(); + this.token = token; + } + + toString() { + return this.token.value; + } +} + +class NullAstNode extends AstNode { + + toString() { + return "null"; + } +} + +class StringAstNode extends AstNode { + constructor(token) { + super(); + this.token = token; + } + + toString() { + return this.token.value; + } } -class IdentifierExpression extends Expresstion { - constructor(str, token) { - super("IDENTIFIER"); - this.str = str; +class BooleanAstNode extends AstNode { + constructor(token) { + super(); this.token = token; } + + toString() { + return this.token.value; + } +} + +class InfixOperatorAstNode extends AstNode { + constructor(token) { + super(); + this.op = 
token; + this.left = null; + this.right = null; + this.precedence = precedenceMap[token.value]; + } + + toString() { + return `(${this.left.toString()} ${this.op.value} ${this.right.toString()})`; + } } -// 语法解析,把tokens转换为sentences + +class PrefixOperatorAstNode extends AstNode { + constructor(token, right) { + super(false); + this.op = token; + this.right = right; + } + toString() { + return `(${this.op.value} ${this.right.toString()})`; + } +} + +class PostfixOperatorAstNode extends AstNode { + constructor(token, left) { + super(false); + this.op = token; + this.left = left; + } + toString() { + return `(${this.left.toString()} ${this.op.value})`; + } +} + +class FunctionDeclarationAstNode extends AstNode { + constructor(name, params, body) { + super(); + this.name = name == null ? null :new IdentifierAstNode(name); + this.params = params; + this.body = body; + } + toString() { + return `function${this.name ? ' ' + this.name.toString() : ''}(${this.params.join(',')})${this.body.map(it=>it.toString()).join('\n')}`; + } +} + +class FunctionCallAstNode extends AstNode { + constructor(name, args) { + super(); + this.name = new IdentifierAstNode(name); + this.args = args; // args是ast数组 + } + toString() { + return `${this.name.toString()}(${this.args.map(it=>it.toString()).join(',')})` + } +} + +class GroupAstNode extends AstNode { + constructor(exp) { + super(); + this.exp = exp; + } + toString() { + // 因为小括号已经在运算符的toString中使用了,这里为了更好的凸显使用中文中括号 + return `【${this.exp.toString()}】` + } +} + +/*******************表达式Ast节点声明结束*******************/ +// 辅助函数 +function assert(condition, msg) { + if (!condition) { + throw new Error("assert failed "+ msg); + } +} + +// 语法解析函数,把tokens转换为sentences function parse(tokens) { var sentences = []; for (var i = 0; i < tokens.length; i++) { var token = tokens[i]; var sentence = null; - if (token.type === LEX.NEW_LINE || token.type === LEX.SEMICOLON) { + if (token.type === LEX.SEMICOLON) { continue; } else if (token.type === LEX.EOF) { 
break; @@ -266,6 +208,8 @@ function parse(tokens) { sentence = parseVarSentence(tokens, i); } else if (token.type === LEX.RETURN) { sentence = parseReturnSentence(tokens, i); + } else if (token.type === LEX.LBRACE) { + sentence = parseBlockSentence(tokens, i); } else { sentence = parseExpressionStatement(tokens, i); } @@ -280,9 +224,12 @@ function parseVarSentence(tokens, start) { assert (tokens[start].type === LEX.VAR); assert (tokens[start + 1].type === LEX.IDENTIFIER); assert (tokens[start + 2].type === LEX.ASSIGN); - var name = new IdentifierExpression(tokens[start + 1].value, tokens[start + 1]); + var name = new IdentifierAstNode(tokens[start + 1]); + var braceCount = 0; for (var i = start + 3; i < tokens.length; i++) { - if (tokens[i].type === LEX.SEMICOLON || tokens[i].type === LEX.NEW_LINE || tokens[i].type === LEX.EOF) { + if (tokens[i].type == LEX.LBRACE) braceCount++; + if (tokens[i].type == LEX.RBRACE) braceCount--; + if ((braceCount <= 0 &&tokens[i].type === LEX.SEMICOLON) || tokens[i].type === LEX.EOF) { var value = parseExpression(tokens, start + 3, i); return new VarSentence(name, value, i); } @@ -292,96 +239,216 @@ function parseVarSentence(tokens, start) { // 与var语句类似 function parseReturnSentence(tokens, start) { assert (tokens[start].type === LEX.RETURN); + var braceCount = 0; for (var i = start + 1; i < tokens.length; i++) { - if (tokens[i].type === LEX.SEMICOLON || tokens[i].type === LEX.NEW_LINE || tokens[i].type === LEX.EOF) { - var value = parseExpression(tokens, start + 1, i); - return new ReturnSentence(value, i); + if (tokens[i].type == LEX.LBRACE) braceCount++; + if (tokens[i].type == LEX.RBRACE) braceCount--; + if ((braceCount <= 0 &&tokens[i].type === LEX.SEMICOLON) || tokens[i].type === LEX.EOF) { + return new ReturnSentence(parseExpression(tokens, start + 1, i), i); } } + return new ReturnSentence(parseExpression(tokens, start + 1, i), tokens.length - 1); } -// 转换为表达式语句 -function parseExpressionStatement(tokens, start) { +// 
转换为块语句,块语句中包含一个语句数组 +function parseBlockSentence(tokens, start) { + var braceCount = 0; for (var i = start; i < tokens.length; i++) { - if (tokens[i].type === LEX.SEMICOLON || tokens[i].type === LEX.NEW_LINE || tokens[i].type === LEX.EOF) { - var expression = parseExpression(tokens, start, i); - return new ExpressionStatement(expression, i); + if (tokens[i].type == LEX.LBRACE) braceCount++; + if (tokens[i].type == LEX.RBRACE) braceCount--; + if (braceCount == 0) { + return new BlockSentence(parse(tokens.slice(start + 1, i)), i); } } + throw new Error("brace not close for block sentence") } -function assert(condition) { - if (!condition) { - throw new Error("assert failed"); +// 转换为表达式语句 +function parseExpressionStatement(tokens, start) { + var braceCount = 0; + for (var i = start; i < tokens.length; i++) { + if (tokens[i].type == LEX.LBRACE) braceCount++; + if (tokens[i].type == LEX.RBRACE) braceCount--; + if ((braceCount <= 0 && tokens[i].type === LEX.SEMICOLON) || tokens[i].type === LEX.EOF) { + return new ExpressionStatement(parseExpression(tokens, start, i), i); + } } + return new ExpressionStatement(parseExpression(tokens, start, tokens.length - 1), tokens.length - 1); } + - +// 二元运算符优先级表 const precedenceMap = { - '+': 1, - '-': 1, - '*': 2, - '/': 2 + '=': 10, + '||': 11, '&&': 12, '^': 13, + '==': 14, '!=': 14, + '<': 15, '<=': 15, '>': 15, '>=': 15, + '<<': 16, '>>': 16, '>>>': 16, + '+': 17, '-': 17, + '*': 18, '/': 18, '%': 18, } -class AstNode { - constructor(full) { - if (full === undefined) this.full = false; - this.full = full; + +// 最重要的表达式解析函数 +function parseExpression(tokens, start, end) { + var stack = []; + var i = start, mid = null; + while (i < end) { + // 元素类型可能不是数字,这里封装成一个getNode方法,处理字面量和前缀运算符等等 + mid = mid == null ? getNode() : mid; + // 其他代码基本没变,就是getNode过程中i向后移动了,所以下面i+1的地方改成了i,i+2的地方改成i++了 + var opNode = i == end ? null : new InfixOperatorAstNode(tokens[i]); + var stackTopPrecedence = stack.length == 0? 
0: stack[stack.length - 1].precedence; + if (opNode == null || opNode.precedence <= stackTopPrecedence) { + if (stack.length == 0) return mid; + var top = stack.pop(); + top.right = mid; + mid = top; + if (opNode == null && stack.length == 0) { + return mid; + } + } else { + opNode.left = mid; + stack.push(opNode); + i++; + mid = null; + } } -} -class NumberAstNode extends AstNode { - constructor(value) { - super(true); - this.value = value; + function getNode(token) { + var token = tokens[i]; + var node = null; + switch (token.type) { + case LEX.NUMBER: + node = new NumberAstNode(token); + i++; break; + case LEX.STRING: + node = new StringAstNode(token); + i++; break; + case LEX.BOOLEAN: + node = new BooleanAstNode(token); + i++; break; + case LEX.NULL: + node = new NullAstNode(); + i++; break; + case LEX.IDENTIFIER: + // 函数调用 + if (i + 1 < end && tokens[i + 1].type == LEX.LPAREN) { + node = parseFunctionCall(); + } else { + i++; + node = new IdentifierAstNode(token); + } + break; + case LEX.LPAREN: + node = parseGroup(); + break; + case LEX.FUNCTION: + node = parseFunctionDeclaration(); + break; + // 遇到前缀运算符 + case LEX.PLUS: + case LEX.MINUS: + case LEX.INCREMENT: + case LEX.DECREMENT: + case LEX.NOT: + case LEX.BIT_NOT: + i++; + node = new PrefixOperatorAstNode(token, getNode()); + break; + default: + throw new Error('unexpected token in getNode: ' + token.type); + } + // 后缀 + if (tokens[i].type == LEX.INCREMENT || tokens[i].type == LEX.DECREMENT) { + node = new PostfixOperatorAstNode(tokens[i], node); + i++; + } + return node; } -} -class InfixOperatorAstNode extends AstNode { - constructor(token) { - super(false); - this.op = token; - this.left = null; - this.right = null; - this.precedence = precedenceMap[token.value]; + function parseGroup() { + assert(tokens[i].type == LEX.LPAREN); + var parenCount = 1; + for (var j = i + 1; j < end; j++) { + if (tokens[j].type == LEX.LPAREN) parenCount++; + if (tokens[j].type == LEX.RPAREN) parenCount--; + if (parenCount 
== 0) { + var exp = parseExpression(tokens, i + 1, j); + i = j + 1; // j是右括号,所以还要再往后一个 + return new GroupAstNode(exp); + } + } + throw new Error('group not close') } -} -function parseExpression(tokens, start, end) { - // 1 先把tokens 转成 AstNode数组 - var nodes = []; - for (var i = start; i < end; i++) { - var token = tokens[i]; - if (token.type === LEX.NUMBER) { - nodes.push(new NumberAstNode(token.value)); - } else if (token.type === LEX.PLUS || token.type === LEX.MINUS || token.type === LEX.MULTIPLY || token.type === LEX.DIVIDE) { - var node = new InfixOperatorAstNode(token); - nodes.push(node); - } else { - throw new Error("unexpected token type: " + token.type); + function parseFunctionCall() { + assert(tokens[i].type == LEX.IDENTIFIER); // 函数名 + assert(tokens[i + 1].type == LEX.LPAREN); // 左括号 + var nameTk = tokens[i]; + i = i + 2; // 此时i位于第一个参数的start位置 + + // 识别参数要找逗号来隔开每个参数的表达式,分别去递归解析 + var args = []; + var innerPattern = 0, innerBracket = 0; + for (var j = i; j < end; j++) { + if (tokens[j].type == LEX.LPAREN) innerPattern++; + if (tokens[j].type == LEX.RPAREN) innerPattern--; + if (tokens[j].type == LEX.LBRACE) innerBracket++; + if (tokens[j].type == LEX.RBRACE) innerBracket--; + // 最后一个参数 + if (innerPattern == -1) { + args.push(parseExpression(tokens, i, j)); + i = j + 1; + return new FunctionCallAstNode(nameTk, args); + } + // 出现逗号,并且不在内部的()或者[]中,说明是参数的结束 + if (tokens[j].type == LEX.COMMA && innerPattern == 0) { + args.push(parseExpression(tokens, i, j)); + i = j + 1; + } } + throw new Error("unexpected end of expression"); } - // 2 数组元素不为1,则不停地遍历数组,找到最高优先级的,把两边的节点合并进来 - while (nodes.length > 1) { - var maxPrecedence = -1, maxIndex = -1; - for (var i = 0; i < nodes.length; i++) { - var node = nodes[i]; - if (!node.full && node.precedence > maxPrecedence) { - maxPrecedence = node.precedence; - maxIndex = i; + function parseFunctionDeclaration() { + assert(tokens[i].type == LEX.FUNCTION); + // 1 函数名识别,null为匿名函数 + var name = tokens[i + 1].type == 
LEX.IDENTIFIER ? tokens[i++] : null; + assert(tokens[i + 1].type == LEX.LPAREN); + // 2 参数识别,格式就是括号内,identifier,逗号,..循环..右括号结束 + var params = []; + for (var j = i + 2; j < end; j+=2) { + assert(tokens[j].type == LEX.IDENTIFIER); + assert(tokens[j+1].type == LEX.COMMA || tokens[j + 1].type == LEX.RPAREN); + params.push(new IdentifierAstNode(tokens[j])); + // 右括号结束参数部分 + if (tokens[j + 1].type == LEX.RPAREN) { + i = j + 2; + break; } } - var maxPrecedenceNode = nodes[maxIndex]; - maxPrecedenceNode.left = nodes[maxIndex - 1]; - maxPrecedenceNode.right = nodes[maxIndex + 1]; - maxPrecedenceNode.full = true; - // splice函数,把maxInde-1开始往后3个元素,替换为maxPrecedenceNode这一个元素 - nodes.splice(maxIndex - 1, 3, maxPrecedenceNode); - } - return nodes[0]; + // 3 body识别,按照大括号识别即可,注意有可能有大括号嵌套,所以要记录左大括号出现的数量,当右大括号出现,数量减一。数量为0,就是函数body结束 + assert(tokens[i].type == LEX.LBRACE); + var braceCount = 1; + for (var j = i + 1; j < end; j++) { + if (tokens[j].type == LEX.LBRACE) braceCount++; + if (tokens[j].type == LEX.RBRACE) braceCount--; + // 函数结束 + if (braceCount == 0) { + var body = parse(tokens.slice(i, j + 1)); + i = j + 1; + return new FunctionDeclarationAstNode(name, params, body); + } + } + } } - -var code = `var a = 1 + 2 * 3 / 4 - 5;` - +var code = `var a = (-1 + -2) * a++ / null - !false + "hello" + add(add(3,add(1,2)), 4); +var add = function(a, b) { + return a + b; +}; +function minus(a,b) { + return a- b; +}; +{var a = 1;}; +` var tokens = lex(code); - var sentences = parse(tokens) - -console.log(JSON.stringify(sentences, 0, 2)); \ No newline at end of file +for (var i = 0; i < sentences.length; i++) { + console.log(sentences[i].toString()); +} \ No newline at end of file diff --git a/24.11/parser_test1.mjs b/24.11/parser_test1.mjs new file mode 100644 index 00000000..ec5a1d5a --- /dev/null +++ b/24.11/parser_test1.mjs @@ -0,0 +1,144 @@ +import * as LEX from "./lex.mjs"; +import { lex } from './lex.mjs'; + +class Sentence { + constructor(type) { + this.endPos = -1; + if (type) { + 
this.type = type.toUpperCase() + "_SENTENCE"; + } + } +} + +class VarSentence extends Sentence { + constructor(name, value, endPos) { + super("VAR"); + this.name = name; // name本身其实也是个表达式 + this.value = value; // 这里的value是个表达式 + this.endPos = endPos; + } +} + +class ReturnSentence extends Sentence { + constructor(value, endPos) { + super("RETURN"); + this.value = value; // 这里的value也是表达式 + this.endPos = endPos; + } +} + +class BlockSentence extends Sentence { + constructor(sentences, endPos) { + super("BLOCK"); + this.sentences = sentences; + this.endPos = endPos; + } +} + +class ExpressionStatement extends Sentence { + constructor(expression, endPos) { + super("EXPRESSION"); + this.expression = expression; // 这里的expression也是表达式 + this.endPos = endPos; + } +} + +class AstNode {} + +class IdentifierAstNode extends AstNode { + constructor(token) { + super(); + this.token = token; + } +} +// 语法解析,把tokens转换为sentences +function parse(tokens) { + var sentences = []; + for (var i = 0; i < tokens.length; i++) { + var token = tokens[i]; + var sentence = null; + if (token.type === LEX.SEMICOLON) { + continue; + } else if (token.type === LEX.EOF) { + break; + } if (token.type === LEX.VAR) { + sentence = parseVarSentence(tokens, i); + } else if (token.type === LEX.RETURN) { + sentence = parseReturnSentence(tokens, i); + } else if (token.type === LEX.LBRACE) { + sentence = parseBlockSentence(tokens, i); + } else { + sentence = parseExpressionStatement(tokens, i); + } + i = sentence.endPos; + sentences.push(sentence); + } + return sentences; +} + +// 从start开始转换成var语句,校验是不是var xx = xxx;格式,然后需要解析表达式parseExpression函数。 +function parseVarSentence(tokens, start) { + assert (tokens[start].type === LEX.VAR); + assert (tokens[start + 1].type === LEX.IDENTIFIER); + assert (tokens[start + 2].type === LEX.ASSIGN); + var name = new IdentifierAstNode(tokens[start + 1]); + for (var i = start + 3; i < tokens.length; i++) { + if (tokens[i].type === LEX.SEMICOLON || tokens[i].type === LEX.EOF) { + 
var value = parseExpression(tokens, start + 3, i); + return new VarSentence(name, value, i); + } + } +} + +// 与var语句类似 +function parseReturnSentence(tokens, start) { + assert (tokens[start].type === LEX.RETURN); + for (var i = start + 1; i < tokens.length; i++) { + if (tokens[i].type === LEX.SEMICOLON || tokens[i].type === LEX.EOF) { + var value = parseExpression(tokens, start + 1, i); + return new ReturnSentence(value, i); + } + } +} + +// 转换为块语句,块语句中包含一个语句数组 +function parseBlockSentence(tokens, start) { + var braceCount = 0; + for (var i = start; i < tokens.length; i++) { + if (tokens[i].type == LEX.LBRACE) braceCount++; + if (tokens[i].type == LEX.RBRACE) braceCount--; + if (braceCount == 0) { + return new BlockSentence(parse(tokens.slice(start + 1, i)), i); + } + } + throw new Error("brace not close for block sentence") +} +// 转换为表达式语句 +function parseExpressionStatement(tokens, start) { + for (var i = start; i < tokens.length; i++) { + if (tokens[i].type === LEX.SEMICOLON || tokens[i].type === LEX.EOF) { + var expression = parseExpression(tokens, start, i); + return new ExpressionStatement(expression, i); + } + } +} + +// 这里先放置个空的逻辑,后面再补上 +function parseExpression(tokens, start, end) { + return new AstNode(); +} + +function assert(condition) { + if (!condition) { + throw new Error("assert failed"); + } +} + +var code = `var a = 1 + 2 * 3 / 4 - 5; + return a; + func(a, b); +`; + +var tokens = lex(code); +var sentences = parse(tokens) +console.log(JSON.stringify(sentences, 0, 2)); \ No newline at end of file diff --git a/24.11/parser_test2.mjs b/24.11/parser_test2.mjs new file mode 100644 index 00000000..47828b5d --- /dev/null +++ b/24.11/parser_test2.mjs @@ -0,0 +1,160 @@ +import * as LEX from "./lex.mjs"; +import { lex } from './lex.mjs'; + +class Sentence { + constructor(type) { + if (type) { + this.type = type.toUpperCase() + "_SENTENCE"; + } + } +} +class VarSentence extends Sentence { + constructor(name, value) { + super("VAR"); + this.name = name; // 
name本身其实也是个表达式 + this.value = value; // 这里的value是个表达式 + } + + toString() { + return `var ${this.name} = ${this.value.toString()}`; + } +} + +class ReturnSentence extends Sentence { + constructor(value) { + super("RETURN"); + this.value = value; // 这里的value也是表达式 + } + toString() { + return `return ${this.value.toString()}`; + } +} + +class BlockSentence extends Sentence { + constructor(sentences) { + super("BLOCK"); + this.sentences = sentences; + } + toString() { + return `{ + ${this.sentences.map(it=>it.toString()).join('\n')} +}` + } +} + +class ExpressionStatement extends Sentence { + constructor(expression) { + super("EXPRESSION"); + this.expression = expression; // 这里的expression也是表达式 + } + + toString() { + return this.expression.toString(); + } +} + +class AstNode {} + +class IdentifierAstNode extends AstNode { + constructor(token) { + super(); + this.token = token; + } + toString() { + return this.token.value; + } +} +// 语法解析,把tokens转换为sentences +function parse(tokens) { + // 从i开始转换成var语句,校验是不是var xx = xxx;格式,然后需要解析表达式parseExpression函数。 + function parseVarSentence() { + assert (tokens[i].type === LEX.VAR); + assert (tokens[i + 1].type === LEX.IDENTIFIER); + assert (tokens[i + 2].type === LEX.ASSIGN); + var name = new IdentifierAstNode(tokens[i + 1]); + for (var j = i + 3; j < tokens.length; j++) { + if (tokens[j].type === LEX.SEMICOLON || tokens[j].type === LEX.EOF) { + var value = parseExpression(tokens, i + 3, j); + i = j; + return new VarSentence(name, value); + } + } + } + // 与var语句类似 + function parseReturnSentence() { + assert (tokens[i].type === LEX.RETURN); + for (var j = i + 1; j < tokens.length; j++) { + if (tokens[j].type === LEX.SEMICOLON || tokens[j].type === LEX.EOF) { + var value = parseExpression(tokens, i + 1, j); + i = j; + return new ReturnSentence(value); + } + } + } + // 转换为表达式语句 + function parseExpressionStatement() { + for (var j = i; j < tokens.length; j++) { + if (tokens[j].type === LEX.SEMICOLON || tokens[j].type === LEX.EOF) { + var 
expression = parseExpression(tokens, i, j); + i = j; + return new ExpressionStatement(expression); + } + } + } + // 转换为块语句,块语句中包含一个语句数组 + function parseBlockSentence() { + var braceCount = 0; + for (var j = i; j < tokens.length; j++) { + if (tokens[j].type == LEX.LBRACE) braceCount++; + if (tokens[j].type == LEX.RBRACE) braceCount--; + if (braceCount == 0) { + return new BlockSentence(parse(tokens.slice(i + 1, i = j))); + } + } + throw new Error("brace not close for block sentence") + } + var sentences = []; + for (var i = 0; i < tokens.length; i++) { + var token = tokens[i]; + var sentence = null; + if (token.type === LEX.SEMICOLON) { + continue; + } else if (token.type === LEX.EOF) { + break; + } if (token.type === LEX.VAR) { + sentence = parseVarSentence(); + } else if (token.type === LEX.RETURN) { + sentence = parseReturnSentence(); + } else if (token.type === LEX.LBRACE) { + sentence = parseBlockSentence(); + } else { + sentence = parseExpressionStatement(); + } + sentences.push(sentence); + } + return sentences; +} + + +// 这里先放置个空的逻辑,后面再补上 +function parseExpression(tokens, start, end) { + return new AstNode(); +} + +function assert(condition) { + if (!condition) { + throw new Error("assert failed"); + } +} + +var code = `var a = 1 + 2 * 3 / 4 - 5; + return a; + func(a, b); + {var a = 100;}`; + +var tokens = lex(code); +var sentences = parse(tokens) + +for (var i = 0; i < sentences.length; i++) { + console.log(sentences[i].toString()); +} \ No newline at end of file diff --git a/24.11/parser_test3.mjs b/24.11/parser_test3.mjs new file mode 100644 index 00000000..52f22b90 --- /dev/null +++ b/24.11/parser_test3.mjs @@ -0,0 +1,251 @@ +/**=========================================> 这是parser_test2.mjs中相同的部分 <========================================= */ +import * as LEX from "./lex.mjs"; +import { lex } from './lex.mjs'; + +class Sentence { + constructor(type) { + if (type) { + this.type = type.toUpperCase() + "_SENTENCE"; + } + } +} +class VarSentence extends 
Sentence { + constructor(name, value) { + super("VAR"); + this.name = name; // name本身其实也是个表达式 + this.value = value; // 这里的value是个表达式 + } + + toString() { + return `var ${this.name} = ${this.value.toString()}`; + } +} + +class ReturnSentence extends Sentence { + constructor(value) { + super("RETURN"); + this.value = value; // 这里的value也是表达式 + } + toString() { + return `return ${this.value.toString()}`; + } +} + +class BlockSentence extends Sentence { + constructor(sentences) { + super("BLOCK"); + this.sentences = sentences; + } + toString() { + return `{ + ${this.sentences.map(it=>it.toString()).join('\n')} +}` + } +} + +class ExpressionStatement extends Sentence { + constructor(expression) { + super("EXPRESSION"); + this.expression = expression; // 这里的expression也是表达式 + } + + toString() { + return this.expression.toString(); + } +} +///////////////////// 注意 AstNode新增了full属性 +class AstNode { + constructor(full) { + if (full === undefined) this.full = false; + this.full = full; + } +} + + +class IdentifierAstNode extends AstNode { + constructor(token) { + super(); + this.token = token; + } + toString() { + return this.token.value; + } +} +// 语法解析,把tokens转换为sentences +function parse(tokens) { + // 从i开始转换成var语句,校验是不是var xx = xxx;格式,然后需要解析表达式parseExpression函数。 + function parseVarSentence() { + assert (tokens[i].type === LEX.VAR); + assert (tokens[i + 1].type === LEX.IDENTIFIER); + assert (tokens[i + 2].type === LEX.ASSIGN); + var name = new IdentifierAstNode(tokens[i + 1]); + for (var j = i + 3; j < tokens.length; j++) { + if (tokens[j].type === LEX.SEMICOLON || tokens[j].type === LEX.EOF) { + var value = parseExpression(tokens, i + 3, j); + i = j; + return new VarSentence(name, value); + } + } + } + // 与var语句类似 + function parseReturnSentence() { + assert (tokens[i].type === LEX.RETURN); + for (var j = i + 1; j < tokens.length; j++) { + if (tokens[j].type === LEX.SEMICOLON || tokens[j].type === LEX.EOF) { + var value = parseExpression(tokens, i + 1, j); + i = j; + return 
new ReturnSentence(value); + } + } + } + // 转换为表达式语句 + function parseExpressionStatement() { + for (var j = i; j < tokens.length; j++) { + if (tokens[j].type === LEX.SEMICOLON || tokens[j].type === LEX.EOF) { + var expression = parseExpression(tokens, i, j); + i = j; + return new ExpressionStatement(expression); + } + } + } + // 转换为块语句,块语句中包含一个语句数组 + function parseBlockSentence() { + var braceCount = 0; + for (var j = i; j < tokens.length; j++) { + if (tokens[j].type == LEX.LBRACE) braceCount++; + if (tokens[j].type == LEX.RBRACE) braceCount--; + if (braceCount == 0) { + return new BlockSentence(parse(tokens.slice(i + 1, i = j))); + } + } + throw new Error("brace not close for block sentence") + } + var sentences = []; + for (var i = 0; i < tokens.length; i++) { + var token = tokens[i]; + var sentence = null; + if (token.type === LEX.SEMICOLON) { + continue; + } else if (token.type === LEX.EOF) { + break; + } if (token.type === LEX.VAR) { + sentence = parseVarSentence(); + } else if (token.type === LEX.RETURN) { + sentence = parseReturnSentence(); + } else if (token.type === LEX.LBRACE) { + sentence = parseBlockSentence(); + } else { + sentence = parseExpressionStatement(); + } + sentences.push(sentence); + } + return sentences; +} +function assert(condition) { + if (!condition) { + throw new Error("assert failed"); + } +} +/**=========================================> 这是parser_test2.mjs中相同的部分 <========================================= */ +const precedenceMap = { + '+': 1, + '-': 1, + '*': 2, + '/': 2 +} +class NumberAstNode extends AstNode { + constructor(value) { + super(true); + this.value = value; + } + toString() { + return this.value; + } +} +class InfixOperatorAstNode extends AstNode { + constructor(token) { + super(false); + this.op = token; + this.left = null; + this.right = null; + this.precedence = precedenceMap[token.value]; + } + toString() { + return `(${this.left.toString()} ${this.op.value} ${this.right.toString()})`; + } +} +function 
parseExpression2(tokens, start, end) { + // 1 先把tokens 转成 AstNode数组 + var nodes = []; + for (var i = start; i < end; i++) { + var token = tokens[i]; + if (token.type === LEX.NUMBER) { + nodes.push(new NumberAstNode(token.value)); + } else if (token.type === LEX.PLUS || token.type === LEX.MINUS || token.type === LEX.MULTIPLY || token.type === LEX.DIVIDE) { + var node = new InfixOperatorAstNode(token); + nodes.push(node); + } else { + throw new Error("unexpected token type: " + token.type); + } + } + // 2 数组元素不为1,则不停地遍历数组,找到最高优先级的,把两边的节点合并进来 + while (nodes.length > 1) { + var maxPrecedence = -1, maxIndex = -1; + for (var i = 0; i < nodes.length; i++) { + var node = nodes[i]; + if (!node.full && node.precedence > maxPrecedence) { + maxPrecedence = node.precedence; + maxIndex = i; + } + } + var maxPrecedenceNode = nodes[maxIndex]; + maxPrecedenceNode.left = nodes[maxIndex - 1]; + maxPrecedenceNode.right = nodes[maxIndex + 1]; + maxPrecedenceNode.full = true; + // splice函数,把maxInde-1开始往后3个元素,替换为maxPrecedenceNode这一个元素 + nodes.splice(maxIndex - 1, 3, maxPrecedenceNode); + } + return nodes[0]; +} + +function parseExpression(tokens, start, end) { + var nodes = []; + var opNodes = []; + for (var i = start; i < end; i++) { + var token = tokens[i]; + if (token.type === LEX.NUMBER) { + nodes.push(new NumberAstNode(token.value)); + } else if (token.type === LEX.PLUS || token.type === LEX.MINUS || token.type === LEX.MULTIPLY || token.type === LEX.DIVIDE) { + var node = new InfixOperatorAstNode(token); + while (opNodes.length > 0 && node.precedence <= opNodes[opNodes.length - 1].precedence) { + var opNode = opNodes.pop(); + // opNode一定是倒数第二个元素,所以就可以简化成下面这样 + opNode.right = nodes.pop(); + nodes.pop(); + opNode.left = nodes.pop(); + nodes.push(opNode); + } + nodes.push(node); + opNodes.push(node); + } else { + throw new Error("unexpected token type: " + token.type); + } + } + while (opNodes.length > 0) { + var opNode = opNodes.pop(); + // opNode一定是倒数第二个元素,所以就可以简化成下面这样 + opNode.right 
= nodes.pop(); + nodes.pop(); + opNode.left = nodes.pop(); + nodes.push(opNode); + } + return nodes[0]; +} +var code = `var a = 1 + 2 * 3 / 4 - 5;`; + +var tokens = lex(code); +var sentences = parse(tokens) + +for (var i = 0; i < sentences.length; i++) { + console.log(sentences[i].toString()); +} \ No newline at end of file diff --git a/24.11/parser_test4.mjs b/24.11/parser_test4.mjs new file mode 100644 index 00000000..88f5740d --- /dev/null +++ b/24.11/parser_test4.mjs @@ -0,0 +1,209 @@ +/**=========================================> 这是parser_test2.mjs中相同的部分 <========================================= */ +import * as LEX from "./lex.mjs"; +import { lex } from './lex.mjs'; + +class Sentence { + constructor(type) { + if (type) { + this.type = type.toUpperCase() + "_SENTENCE"; + } + } +} +class VarSentence extends Sentence { + constructor(name, value) { + super("VAR"); + this.name = name; // name本身其实也是个表达式 + this.value = value; // 这里的value是个表达式 + } + + toString() { + return `var ${this.name} = ${this.value.toString()}`; + } +} + +class ReturnSentence extends Sentence { + constructor(value) { + super("RETURN"); + this.value = value; // 这里的value也是表达式 + } + toString() { + return `return ${this.value.toString()}`; + } +} + +class BlockSentence extends Sentence { + constructor(sentences) { + super("BLOCK"); + this.sentences = sentences; + } + toString() { + return `{ + ${this.sentences.map(it=>it.toString()).join('\n')} +}` + } +} + +class ExpressionStatement extends Sentence { + constructor(expression) { + super("EXPRESSION"); + this.expression = expression; // 这里的expression也是表达式 + } + + toString() { + return this.expression.toString(); + } +} +class AstNode { +} + + +class IdentifierAstNode extends AstNode { + constructor(token) { + super(); + this.token = token; + } + toString() { + return this.token.value; + } +} +// 语法解析,把tokens转换为sentences +function parse(tokens) { + // 从i开始转换成var语句,校验是不是var xx = xxx;格式,然后需要解析表达式parseExpression函数。 + function parseVarSentence() { + 
assert (tokens[i].type === LEX.VAR); + assert (tokens[i + 1].type === LEX.IDENTIFIER); + assert (tokens[i + 2].type === LEX.ASSIGN); + var name = new IdentifierAstNode(tokens[i + 1]); + for (var j = i + 3; j < tokens.length; j++) { + if (tokens[j].type === LEX.SEMICOLON || tokens[j].type === LEX.EOF) { + var value = parseExpression(tokens, i + 3); + assert(i = j, "parse var sentence failed"); + return new VarSentence(name, value); + } + } + } + // 与var语句类似 + function parseReturnSentence() { + assert (tokens[i].type === LEX.RETURN); + for (var j = i + 1; j < tokens.length; j++) { + if (tokens[j].type === LEX.SEMICOLON || tokens[j].type === LEX.EOF) { + var value = parseExpression(tokens, i + 1); + i = j; + assert(i = j, "parse var sentence failed"); + return new ReturnSentence(value); + } + } + } + // 转换为表达式语句 + function parseExpressionStatement() { + for (var j = i; j < tokens.length; j++) { + if (tokens[j].type === LEX.SEMICOLON || tokens[j].type === LEX.EOF) { + var expression = parseExpression(tokens, i); + i = j; + assert(i = j, "parse var sentence failed"); + return new ExpressionStatement(expression); + } + } + } + // 转换为块语句,块语句中包含一个语句数组 + function parseBlockSentence() { + var braceCount = 0; + for (var j = i; j < tokens.length; j++) { + if (tokens[j].type == LEX.LBRACE) braceCount++; + if (tokens[j].type == LEX.RBRACE) braceCount--; + if (braceCount == 0) { + return new BlockSentence(parse(tokens.slice(i + 1, i = j))); + } + } + throw new Error("brace not close for block sentence") + } + var sentences = []; + for (var i = 0; i < tokens.length; i++) { + var token = tokens[i]; + var sentence = null; + if (token.type === LEX.SEMICOLON) { + continue; + } else if (token.type === LEX.EOF) { + break; + } if (token.type === LEX.VAR) { + sentence = parseVarSentence(); + } else if (token.type === LEX.RETURN) { + sentence = parseReturnSentence(); + } else if (token.type === LEX.LBRACE) { + sentence = parseBlockSentence(); + } else { + sentence = 
parseExpressionStatement(); + } + sentences.push(sentence); + } + return sentences; +} +function assert(condition) { + if (!condition) { + throw new Error("assert failed"); + } +} +/**=========================================> 这是parser_test2.mjs中相同的部分 <========================================= */ +const precedenceMap = { + '+': 1, + '-': 1, + '*': 2, + '/': 2 +} +class NumberAstNode extends AstNode { + constructor(value) { + super(true); + this.value = value; + } + toString() { + return this.value; + } +} +class InfixOperatorAstNode extends AstNode { + constructor(token) { + super(false); + this.op = token; + this.left = null; + this.right = null; + this.precedence = precedenceMap[token.value]; + } + toString() { + return `(${this.left.toString()} ${this.op.value} ${this.right.toString()})`; + } +} + +// 参数中添加了优先级 +function parseExpression(tokens, start, precedence=0) { + // 因为用到递归,并且因为有递归,所以start这个下标的位置需要用引用类型 + // 这样递归更深中的移动,也会在上层改变start的值,所以进入前简单处理下start如果是数字,修改为对象类型 + if (start.index === undefined) { + return parseExpression(tokens, {index:start}, precedence); + } + + var leftNode = new NumberAstNode(tokens[start.index].value); + while (start.index < tokens.length - 1 && isValidInfixOperator(tokens[start.index + 1])) { + var opNode = new InfixOperatorAstNode(tokens[start.index + 1]); + if (opNode.precedence <= precedence) { + return leftNode; + } else { + opNode.left = leftNode; + start.index += 2; + opNode.right = parseExpression(tokens, start, opNode.precedence); + leftNode = opNode; + } + } + return leftNode; +} + +function isValidInfixOperator(token) { + return token.type === LEX.PLUS || token.type === LEX.MINUS || token.type === LEX.MULTIPLY || token.type === LEX.DIVIDE; +} + +var code = `var a = 1 + 2 * 3 / 4 - 5;`; + +var tokens = lex(code); +var sentences = parse(tokens) + +for (var i = 0; i < sentences.length; i++) { + console.log(sentences[i].toString()); +} \ No newline at end of file diff --git a/24.11/parser_test5.mjs b/24.11/parser_test5.mjs new 
file mode 100644 index 00000000..b0feac68 --- /dev/null +++ b/24.11/parser_test5.mjs @@ -0,0 +1,215 @@ +/**=========================================> 这是parser_test2.mjs中相同的部分 <========================================= */ +import * as LEX from "./lex.mjs"; +import { lex } from './lex.mjs'; + +class Sentence { + constructor(type) { + if (type) { + this.type = type.toUpperCase() + "_SENTENCE"; + } + } +} +class VarSentence extends Sentence { + constructor(name, value) { + super("VAR"); + this.name = name; // name本身其实也是个表达式 + this.value = value; // 这里的value是个表达式 + } + + toString() { + return `var ${this.name} = ${this.value.toString()}`; + } +} + +class ReturnSentence extends Sentence { + constructor(value) { + super("RETURN"); + this.value = value; // 这里的value也是表达式 + } + toString() { + return `return ${this.value.toString()}`; + } +} + +class BlockSentence extends Sentence { + constructor(sentences) { + super("BLOCK"); + this.sentences = sentences; + } + toString() { + return `{ + ${this.sentences.map(it=>it.toString()).join('\n')} +}` + } +} + +class ExpressionStatement extends Sentence { + constructor(expression) { + super("EXPRESSION"); + this.expression = expression; // 这里的expression也是表达式 + } + + toString() { + return this.expression.toString(); + } +} +class AstNode { +} + + +class IdentifierAstNode extends AstNode { + constructor(token) { + super(); + this.token = token; + } + toString() { + return this.token.value; + } +} +// 语法解析,把tokens转换为sentences +function parse(tokens) { + // 从i开始转换成var语句,校验是不是var xx = xxx;格式,然后需要解析表达式parseExpression函数。 + function parseVarSentence() { + assert (tokens[i].type === LEX.VAR); + assert (tokens[i + 1].type === LEX.IDENTIFIER); + assert (tokens[i + 2].type === LEX.ASSIGN); + var name = new IdentifierAstNode(tokens[i + 1]); + for (var j = i + 3; j < tokens.length; j++) { + if (tokens[j].type === LEX.SEMICOLON || tokens[j].type === LEX.EOF) { + var value = parseExpression(tokens, i + 3); + i = j; + return new VarSentence(name, 
value); + } + } + } + // 与var语句类似 + function parseReturnSentence() { + assert (tokens[i].type === LEX.RETURN); + for (var j = i + 1; j < tokens.length; j++) { + if (tokens[j].type === LEX.SEMICOLON || tokens[j].type === LEX.EOF) { + var value = parseExpression(tokens, i + 1); + i = j; + return new ReturnSentence(value); + } + } + } + // 转换为表达式语句 + function parseExpressionStatement() { + for (var j = i; j < tokens.length; j++) { + if (tokens[j].type === LEX.SEMICOLON || tokens[j].type === LEX.EOF) { + var expression = parseExpression(tokens, i); + i = j; + return new ExpressionStatement(expression); + } + } + } + // 转换为块语句,块语句中包含一个语句数组 + function parseBlockSentence() { + var braceCount = 0; + for (var j = i; j < tokens.length; j++) { + if (tokens[j].type == LEX.LBRACE) braceCount++; + if (tokens[j].type == LEX.RBRACE) braceCount--; + if (braceCount == 0) { + return new BlockSentence(parse(tokens.slice(i + 1, i = j))); + } + } + throw new Error("brace not close for block sentence") + } + var sentences = []; + for (var i = 0; i < tokens.length; i++) { + var token = tokens[i]; + var sentence = null; + if (token.type === LEX.SEMICOLON) { + continue; + } else if (token.type === LEX.EOF) { + break; + } if (token.type === LEX.VAR) { + sentence = parseVarSentence(); + } else if (token.type === LEX.RETURN) { + sentence = parseReturnSentence(); + } else if (token.type === LEX.LBRACE) { + sentence = parseBlockSentence(); + } else { + sentence = parseExpressionStatement(); + } + sentences.push(sentence); + } + return sentences; +} +function assert(condition) { + if (!condition) { + throw new Error("assert failed"); + } +} +/**=========================================> 这是parser_test2.mjs中相同的部分 <========================================= */ +const precedenceMap = { + '+': 1, + '-': 1, + '*': 2, + '/': 2 +} +class NumberAstNode extends AstNode { + constructor(value) { + super(true); + this.value = value; + } + toString() { + return this.value; + } +} +class InfixOperatorAstNode 
extends AstNode { + constructor(token) { + super(false); + this.op = token; + this.left = null; + this.right = null; + this.precedence = precedenceMap[token.value]; + } + toString() { + return `(${this.left.toString()} ${this.op.value} ${this.right.toString()})`; + } +} + +function parseExpression(tokens, start) { + var stack = []; + var i = start, mid = null; + while (true) { + // 每个循环,准备好栈顶优先级、中间元素、当前操作符 + var stackTopPrecedence = stack.length == 0? 0: stack[stack.length - 1].precedence; + mid = mid == null ? new NumberAstNode(tokens[i++].value) : mid; + var opNode = getEofOrInfixNode(tokens, i); + // 结束循环的条件 + if (opNode.precedence == 0 && stackTopPrecedence == 0)return mid; + // 栈顶操作符赢得mid:弹出栈顶,填充right,并作为新的mid; NULL是EOF是最低优先级 + if (opNode.precedence <= stackTopPrecedence) { + var top = stack.pop(); + top.right = mid; + mid = top; + } + // 当前操作符赢得mid:塞入栈中,继续向后走 + else { + opNode.left = mid; + stack.push(opNode); + i++; + mid = null; // 往后走取新的mid + } + } +} +function getEofOrInfixNode(tokens, index) { + var eof = new InfixOperatorAstNode('EOF'); + eof.precedence = 0; + if (index >= tokens.length) return eof + var token = tokens[index]; + if (precedenceMap[token.value] == null) { + return eof; + } + return new InfixOperatorAstNode(tokens[index]); +} +var code = `var a = 1 + 2 * 3 / 4 - 5;`; + +var tokens = lex(code); +var sentences = parse(tokens) + +for (var i = 0; i < sentences.length; i++) { + console.log(sentences[i].toString()); +} \ No newline at end of file diff --git a/24.11/parser_test6.mjs b/24.11/parser_test6.mjs new file mode 100644 index 00000000..ee341cc5 --- /dev/null +++ b/24.11/parser_test6.mjs @@ -0,0 +1,339 @@ +import * as LEX from "./lex.mjs"; +import { lex } from './lex.mjs'; + +class Sentence { + constructor(type) { + if (type) { + this.type = type.toUpperCase() + "_SENTENCE"; + } + } +} +class VarSentence extends Sentence { + constructor(name, value) { + super("VAR"); + this.name = name; // name本身其实也是个表达式 + this.value = value; // 
这里的value是个表达式 + } + + toString() { + return `var ${this.name} = ${this.value.toString()}`; + } +} + +class ReturnSentence extends Sentence { + constructor(value) { + super("RETURN"); + this.value = value; // 这里的value也是表达式 + } + toString() { + return `return ${this.value.toString()}`; + } +} + +class BlockSentence extends Sentence { + constructor(sentences) { + super("BLOCK"); + this.sentences = sentences; + } + toString() { + return `{ + ${this.sentences.map(it=>it.toString()).join('\n')} +}` + } +} + +class ExpressionStatement extends Sentence { + constructor(expression) { + super("EXPRESSION"); + this.expression = expression; // 这里的expression也是表达式 + } + + toString() { + return this.expression.toString(); + } +} +// 基础类型 +class AstNode { +} +// 数字字面量 +class NumberAstNode extends AstNode { + constructor(token) { + super(); + this.token = token; + } + + toString() { + return this.token.value; + } +} +// 变量名/函数名字面量 +class IdentifierAstNode extends AstNode { + constructor(token) { + super(); + this.token = token; + } + + toString() { + return this.token.value; + } +} +// null字面量 +class NullAstNode extends AstNode { + toString() { + return "null"; + } +} + +// 字符串字面量 +class StringAstNode extends AstNode { + constructor(token) { + super(); + this.token = token; + } + toString() { + return this.token.value; + } +} +// boolean字面量 +class BooleanAstNode extends AstNode { + constructor(token) { + super(); + this.token = token; + } + toString() { + return this.token.value; + } +} +// 中缀操作符节点 +class InfixOperatorAstNode extends AstNode { + constructor(token) { + super(); + this.op = token; + this.left = null; + this.right = null; + this.precedence = precedenceMap[token.value]; + } + toString() { + return `(${this.left.toString()} ${this.op.value} ${this.right.toString()})`; + } +} +// 前缀操作符 +class PrefixOperatorAstNode extends AstNode { + constructor(token, right) { + super(false); + this.op = token; + this.right = right; + } + toString() { + return `(${this.op.value} 
${this.right.toString()})`; + } +} +// 后缀操作符 +class PostfixOperatorAstNode extends AstNode { + constructor(token, left) { + super(false); + this.op = token; + this.left = left; + } + toString() { + return `(${this.left.toString()} ${this.op.value})`; + } +} +// 函数声明 +class FunctionDeclarationAstNode extends AstNode { + constructor(nameToken, params, body) { + super(); + this.name = name == null ? null :new IdentifierAstNode(nameToken); + this.params = params; + this.body = body; + } + toString() { + return `function${this.name ? ' ' + this.name.toString() : ''}(${this.params.join(',')})${this.body.map(it=>it.toString()).join('\n')}`; + } +} +// 函数调用 +class FunctionCallAstNode extends AstNode { + constructor(nameToken, args) { + super(); + this.name = new IdentifierAstNode(nameToken); + this.args = args; // args是ast数组 + } + toString() { + return `${this.name.toString()}(${this.args.map(it=>it.toString()).join(',')})` + } +} +// 分组节点 +class GroupAstNode extends AstNode { + constructor(exp) { + super(); + this.exp = exp; + } + toString() { + // 因为小括号已经在运算符的toString中使用了,这里为了更好的凸显使用中文中括号 + return `【${this.exp.toString()}】` + } +} +// 语法解析,把tokens转换为sentences +function parse(tokens) { + // 从i开始转换成var语句,校验是不是var xx = xxx;格式,然后需要解析表达式parseExpression函数。 + function parseVarSentence() { + assert (tokens[i].type === LEX.VAR); + assert (tokens[i + 1].type === LEX.IDENTIFIER); + assert (tokens[i + 2].type === LEX.ASSIGN); + var name = new IdentifierAstNode(tokens[i + 1]); + for (var j = i + 3; j < tokens.length; j++) { + if (tokens[j].type === LEX.SEMICOLON || tokens[j].type === LEX.EOF) { + var value = parseExpression(tokens, i + 3); + i = j; + return new VarSentence(name, value); + } + } + } + // 与var语句类似 + function parseReturnSentence() { + assert (tokens[i].type === LEX.RETURN); + for (var j = i + 1; j < tokens.length; j++) { + if (tokens[j].type === LEX.SEMICOLON || tokens[j].type === LEX.EOF) { + var value = parseExpression(tokens, i + 1); + i = j; + return new 
ReturnSentence(value); + } + } + } + // 转换为表达式语句 + function parseExpressionStatement() { + for (var j = i; j < tokens.length; j++) { + if (tokens[j].type === LEX.SEMICOLON || tokens[j].type === LEX.EOF) { + var expression = parseExpression(tokens, i); + i = j; + return new ExpressionStatement(expression); + } + } + } + // 转换为块语句,块语句中包含一个语句数组 + function parseBlockSentence() { + var braceCount = 0; + for (var j = i; j < tokens.length; j++) { + if (tokens[j].type == LEX.LBRACE) braceCount++; + if (tokens[j].type == LEX.RBRACE) braceCount--; + if (braceCount == 0) { + return new BlockSentence(parse(tokens.slice(i + 1, i = j))); + } + } + throw new Error("brace not close for block sentence") + } + var sentences = []; + for (var i = 0; i < tokens.length; i++) { + var token = tokens[i]; + var sentence = null; + if (token.type === LEX.SEMICOLON) { + continue; + } else if (token.type === LEX.EOF) { + break; + } if (token.type === LEX.VAR) { + sentence = parseVarSentence(); + } else if (token.type === LEX.RETURN) { + sentence = parseReturnSentence(); + } else if (token.type === LEX.LBRACE) { + sentence = parseBlockSentence(); + } else { + sentence = parseExpressionStatement(); + } + sentences.push(sentence); + } + return sentences; +} +function assert(condition) { + if (!condition) { + throw new Error("assert failed"); + } +} +const precedenceMap = { + '+': 1, + '-': 1, + '*': 2, + '/': 2 +} + +function parseExpression(tokens, start) { + var stack = []; + var i = start, mid = null; + while (true) { + // 每个循环,准备好栈顶优先级、中间元素、当前操作符 + var stackTopPrecedence = stack.length == 0? 0: stack[stack.length - 1].precedence; + mid = mid == null ? 
nextUnaryNode() : mid; + var opNode = getEofOrInfixNode(tokens, i); + // 结束循环的条件 + if (opNode.precedence == 0 && stackTopPrecedence == 0)return mid; + // 栈顶操作符赢得mid:弹出栈顶,填充right,并作为新的mid; NULL是EOF是最低优先级 + if (opNode.precedence <= stackTopPrecedence) { + var top = stack.pop(); + top.right = mid; + mid = top; + } + // 当前操作符赢得mid:塞入栈中,继续向后走 + else { + opNode.left = mid; + stack.push(opNode); + i++; + mid = null; // 往后走取新的mid + } + } + // 这个函数的定义放到parseExpression函数里面 + function nextUnaryNode() { + var node = null; + switch (tokens[i].type) { + case LEX.NUMBER: + node = new NumberAstNode(tokens[i++]); + break; + case LEX.STRING: + node = new StringAstNode(tokens[i++]); + break; + case LEX.BOOLEAN: + node = new BooleanAstNode(tokens[i++]); + break; + case LEX.NULL: + node = new NullAstNode(tokens[i++]); + break; + case LEX.IDENTIFIER: + node = new IdentifierAstNode(tokens[i++]); + break; + // 遇到前缀运算符 + case LEX.PLUS: + case LEX.MINUS: + case LEX.INCREMENT: + case LEX.DECREMENT: + case LEX.NOT: + case LEX.BIT_NOT: + // 前缀后面递归解析一元节点(前缀后面一定是个一元节点) + // 并且前缀操作符都是右结合的,所以可以直接递归。 + node = new PrefixOperatorAstNode(tokens[i++], nextUnaryNode()); + break; + default: + throw new Error('unexpected token in getNode: ' + tokens[i].type); + } + // 后缀操作符,后缀操作符都是左结合的,并且后缀操作符的优先级比前缀都要高 + while (tokens[i].type == LEX.INCREMENT || tokens[i].type == LEX.DECREMENT) { + node = new PostfixOperatorAstNode(tokens[i++], node); + } + return node; + } +} +function getEofOrInfixNode(tokens, index) { + var eof = new InfixOperatorAstNode('EOF'); + eof.precedence = 0; + if (index >= tokens.length) return eof + var token = tokens[index]; + if (precedenceMap[token.value] == null) { + return eof; + } + return new InfixOperatorAstNode(tokens[index]); +} +var code = `var a = var1 + 2 * 3 / 4 - 5;`; + +var tokens = lex(code); +var sentences = parse(tokens) + +for (var i = 0; i < sentences.length; i++) { + console.log(sentences[i].toString()); +} \ No newline at end of file diff --git a/24.11/parser_test7.mjs 
b/24.11/parser_test7.mjs new file mode 100644 index 00000000..7b6ef52b --- /dev/null +++ b/24.11/parser_test7.mjs @@ -0,0 +1,411 @@ +import * as LEX from "./lex.mjs"; +import { lex } from './lex.mjs'; + +class Sentence { + constructor(type) { + if (type) { + this.type = type.toUpperCase() + "_SENTENCE"; + } + } +} +class VarSentence extends Sentence { + constructor(name, value) { + super("VAR"); + this.name = name; // name本身其实也是个表达式 + this.value = value; // 这里的value是个表达式 + } + + toString() { + return `var ${this.name} = ${this.value.toString()}`; + } +} + +class ReturnSentence extends Sentence { + constructor(value) { + super("RETURN"); + this.value = value; // 这里的value也是表达式 + } + toString() { + return `return ${this.value.toString()}`; + } +} + +class BlockSentence extends Sentence { + constructor(sentences) { + super("BLOCK"); + this.sentences = sentences; + } + toString() { + return `{ + ${this.sentences.map(it=>it.toString()).join('\n')} +}` + } +} + +class ExpressionStatement extends Sentence { + constructor(expression) { + super("EXPRESSION"); + this.expression = expression; // 这里的expression也是表达式 + } + + toString() { + return this.expression.toString(); + } +} +// 基础类型 +class AstNode { +} +// 数字字面量 +class NumberAstNode extends AstNode { + constructor(token) { + super(); + this.token = token; + } + + toString() { + return this.token.value; + } +} +// 变量名/函数名字面量 +class IdentifierAstNode extends AstNode { + constructor(token) { + super(); + this.token = token; + } + + toString() { + return this.token.value; + } +} +// null字面量 +class NullAstNode extends AstNode { + toString() { + return "null"; + } +} + +// 字符串字面量 +class StringAstNode extends AstNode { + constructor(token) { + super(); + this.token = token; + } + toString() { + return this.token.value; + } +} +// boolean字面量 +class BooleanAstNode extends AstNode { + constructor(token) { + super(); + this.token = token; + } + toString() { + return this.token.value; + } +} +// 中缀操作符节点 +class InfixOperatorAstNode 
extends AstNode { + constructor(token) { + super(); + this.op = token; + this.left = null; + this.right = null; + this.precedence = precedenceMap[token.value]; + } + toString() { + return `(${this.left.toString()} ${this.op.value} ${this.right.toString()})`; + } +} +// 前缀操作符 +class PrefixOperatorAstNode extends AstNode { + constructor(token, right) { + super(false); + this.op = token; + this.right = right; + } + toString() { + return `(${this.op.value} ${this.right.toString()})`; + } +} +// 后缀操作符 +class PostfixOperatorAstNode extends AstNode { + constructor(token, left) { + super(false); + this.op = token; + this.left = left; + } + toString() { + return `(${this.left.toString()} ${this.op.value})`; + } +} +// 函数声明 +class FunctionDeclarationAstNode extends AstNode { + constructor(nameToken, params, body) { + super(); + this.name = name == null ? null :new IdentifierAstNode(nameToken); + this.params = params; + this.body = body; + } + toString() { + return `function${this.name ? ' ' + this.name.toString() : ''}(${this.params.join(',')})${this.body.map(it=>it.toString()).join('\n')}`; + } +} +// 函数调用 +class FunctionCallAstNode extends AstNode { + constructor(nameToken, args) { + super(); + this.name = new IdentifierAstNode(nameToken); + this.args = args; // args是ast数组 + } + toString() { + return `${this.name.toString()}(${this.args.map(it=>it.toString()).join(',')})` + } +} +// 分组节点 +class GroupAstNode extends AstNode { + constructor(exp) { + super(); + this.exp = exp; + } + toString() { + // 因为小括号已经在运算符的toString中使用了,这里为了更好的凸显使用中文中括号 + return `【${this.exp.toString()}】` + } +} + + + + + + + + + + + + + +const precedenceMap = { + '+': 1, + '-': 1, + '*': 2, + '/': 2 +} +const prefixPrecedenceMap = { + '-': 100, + '!': 100, + '~': 100, + '+': 100, + '++': 100, + '--': 100 +} +const postfixPrecedenceMap = { + '++': 200, + '--': 200 +} + +class Parser { + constructor(tokens) { + this.tokens = tokens; + this.cursor = 0; + } + // 语法解析,把tokens转换为sentences + parse() { + var 
tokens = this.tokens; + var sentences = []; + for (;;) { + var token = tokens[this.cursor]; + var sentence = null; + if (token.type === LEX.SEMICOLON) { + continue; + } else if (token.type === LEX.EOF) { + break; + } if (token.type === LEX.VAR) { + sentence = this.parseVarSentence(); + } else if (token.type === LEX.RETURN) { + sentence = this.parseReturnSentence(); + } else if (token.type === LEX.LBRACE) { + sentence = this.parseBlockSentence(); + } else { + sentence = this.parseExpressionStatement(); + } + sentences.push(sentence); + } + return sentences; + } + + // 从i开始转换成var语句,校验是不是var xx = xxx;格式,然后需要解析表达式parseExpression函数。 + parseVarSentence() { + var tokens = this.tokens; + assert (tokens[this.cursor].type === LEX.VAR); + assert (tokens[this.cursor + 1].type === LEX.IDENTIFIER); + assert (tokens[this.cursor + 2].type === LEX.ASSIGN); + var name = new IdentifierAstNode(tokens[this.cursor + 1]); + for (var j = this.cursor + 3; j < tokens.length; j++) { + if (tokens[j].type === LEX.SEMICOLON || tokens[j].type === LEX.EOF) { + var value = this.parseExpression(this.cursor = this.cursor + 3); + return new VarSentence(name, value); + } + } + } + // 从i开始转换成var语句,校验是不是var xx = xxx;格式,然后需要解析表达式parseExpression函数。 + parseVarSentence() { + var tokens = this.tokens; + assert (tokens[this.cursor].type === LEX.VAR); + assert (tokens[this.cursor + 1].type === LEX.IDENTIFIER); + assert (tokens[this.cursor + 2].type === LEX.ASSIGN); + var name = new IdentifierAstNode(tokens[this.cursor + 1]); + for (var j = this.cursor + 3; j < tokens.length; j++) { + if (tokens[j].type === LEX.SEMICOLON || tokens[j].type === LEX.EOF) { + this.cursor = this.cursor + 3 + var value = this.parseExpression(); + assert(tokens[this.cursor].type === LEX.SEMICOLON || tokens[this.cursor].type == LEX.EOF); + this.cursor ++; + return new VarSentence(name, value); + } + } + } + // 与var语句类似 + parseReturnSentence() { + var tokens = this.tokens; + assert (tokens[this.cursor].type === LEX.RETURN); + for (var j 
= this.cursor + 1; j < tokens.length; j++) { + if (tokens[j].type === LEX.SEMICOLON || tokens[j].type === LEX.EOF) { + this.cursor += 1; + var value = this.parseExpression(); + assert(tokens[this.cursor].type === LEX.SEMICOLON || tokens[this.cursor].type == LEX.EOF); + this.cursor ++; + return new ReturnSentence(value); + } + } + } + // 转换为表达式语句 + parseExpressionStatement() { + var tokens = this.tokens; + for (var j = this.cursor; j < tokens.length; j++) { + if (tokens[j].type === LEX.SEMICOLON || tokens[j].type === LEX.EOF) { + var expression = this.parseExpression(); + assert(tokens[this.cursor].type === LEX.SEMICOLON || tokens[this.cursor].type == LEX.EOF); + this.cursor ++; + return new ExpressionStatement(expression); + } + } + } + // 转换为块语句,块语句中包含一个语句数组 + parseBlockSentence() { + var tokens = this.tokens; + var braceCount = 0; + for (var j = this.cursor; j < tokens.length; j++) { + if (tokens[j].type == LEX.LBRACE) braceCount++; + if (tokens[j].type == LEX.RBRACE) braceCount--; + if (braceCount == 0) { + return new BlockSentence(parse(tokens.slice(this.cursor + 1, this.cursor = j))); + } + } + throw new Error("brace not close for block sentence") + } + + // 表达式解析,解析下一个表达式,遇到无法识别的字符会结束 + parseExpression() { + var tokens = this.tokens; + var stack = []; + var mid = null; + while (true) { + // 每个循环,准备好栈顶优先级、中间元素、当前操作符 + var stackTopPrecedence = stack.length == 0? 0: stack[stack.length - 1].precedence; + mid = mid == null ? 
this.nextUnaryNode() : mid; + var opNode = this.getEofOrInfixNode(tokens, this.cursor); + // 结束循环的条件 + if (opNode.precedence == 0 && stackTopPrecedence == 0)return mid; + // 栈顶操作符赢得mid:弹出栈顶,填充right,并作为新的mid; NULL是EOF是最低优先级 + if (opNode.precedence <= stackTopPrecedence) { + var top = stack.pop(); + top.right = mid; + mid = top; + } + // 当前操作符赢得mid:塞入栈中,继续向后走 + else { + opNode.left = mid; + stack.push(opNode); + this.cursor++; + mid = null; // 往后走取新的mid + } + } + + } + nextUnaryNode() { + var tokens = this.tokens; + var node = null; + switch (tokens[this.cursor].type) { + case LEX.NUMBER: + node = new NumberAstNode(tokens[this.cursor++]); + break; + case LEX.STRING: + node = new StringAstNode(tokens[this.cursor++]); + break; + case LEX.BOOLEAN: + node = new BooleanAstNode(tokens[this.cursor++]); + break; + case LEX.NULL: + node = new NullAstNode(tokens[this.cursor++]); + break; + case LEX.IDENTIFIER: + node = new IdentifierAstNode(tokens[this.cursor++]); + break; + // 遇到前缀运算符 + case LEX.PLUS: + case LEX.MINUS: + case LEX.INCREMENT: + case LEX.DECREMENT: + case LEX.NOT: + case LEX.BIT_NOT: + // 前缀后面递归解析一元节点(前缀后面一定是个一元节点) + // 并且前缀操作符都是右结合的,所以可以直接递归。 + node = new PrefixOperatorAstNode(tokens[this.cursor++], this.nextUnaryNode()); + break; + // 分组 + case LEX.LPAREN: + // 递归解析(后面的即可,因为遇到)的时候,parseExpression无法识别,就会结束解析 + this.cursor++; + // GroupAstNode其实可有可无 + node = new GroupAstNode(this.parseExpression()); + assert(tokens[this.cursor++].type == LEX.RPAREN, "group not closed"); + break; + default: + throw new Error('unexpected token in nextUnary: ' + tokens[this.cursor].type); + } + // 后缀操作符,后缀操作符都是左结合的,并且后缀操作符的优先级比前缀都要高 + while (tokens[this.cursor].type == LEX.INCREMENT || tokens[this.cursor].type == LEX.DECREMENT) { + node = new PostfixOperatorAstNode(tokens[this.cursor++], node); + } + return node; + } + getEofOrInfixNode(tokens, index) { + var eof = new InfixOperatorAstNode('EOF'); + eof.precedence = 0; + if (index >= tokens.length) return eof + var token = 
tokens[index]; + if (precedenceMap[token.value] == null) { + return eof; + } + return new InfixOperatorAstNode(tokens[index]); + } + +} + +function assert(condition) { + if (!condition) { + throw new Error("assert failed"); + } +} + + +var code = `var a = 1 * 2 - 3;`; + +var tokens = lex(code); +var sentences = new Parser(tokens).parse() + +for (var i = 0; i < sentences.length; i++) { + console.log(sentences[i].toString()); +} \ No newline at end of file diff --git a/24.11/parser_test8.mjs b/24.11/parser_test8.mjs new file mode 100644 index 00000000..f12b6bf6 --- /dev/null +++ b/24.11/parser_test8.mjs @@ -0,0 +1,407 @@ +import * as LEX from "./lex.mjs"; +import { lex } from './lex.mjs'; + +class Sentence { + constructor(type) { + if (type) { + this.type = type.toUpperCase() + "_SENTENCE"; + } + } +} +class VarSentence extends Sentence { + constructor(name, value) { + super("VAR"); + this.name = name; // name本身其实也是个表达式 + this.value = value; // 这里的value是个表达式 + } + + toString() { + return `var ${this.name} = ${this.value.toString()}`; + } +} + +class ReturnSentence extends Sentence { + constructor(value) { + super("RETURN"); + this.value = value; // 这里的value也是表达式 + } + toString() { + return `return ${this.value.toString()}`; + } +} + +class BlockSentence extends Sentence { + constructor(sentences) { + super("BLOCK"); + this.sentences = sentences; + } + toString() { + return `{ + ${this.sentences.map(it=>it.toString()).join('\n')} +}` + } +} + +class ExpressionStatement extends Sentence { + constructor(expression) { + super("EXPRESSION"); + this.expression = expression; // 这里的expression也是表达式 + } + + toString() { + return this.expression.toString(); + } +} +// 基础类型 +class AstNode { +} +// 数字字面量 +class NumberAstNode extends AstNode { + constructor(token) { + super(); + this.token = token; + } + + toString() { + return this.token.value; + } +} +// 变量名/函数名字面量 +class IdentifierAstNode extends AstNode { + constructor(token) { + super(); + this.token = token; + } + + 
toString() { + return this.token.value; + } +} +// null字面量 +class NullAstNode extends AstNode { + toString() { + return "null"; + } +} + +// 字符串字面量 +class StringAstNode extends AstNode { + constructor(token) { + super(); + this.token = token; + } + toString() { + return this.token.value; + } +} +// boolean字面量 +class BooleanAstNode extends AstNode { + constructor(token) { + super(); + this.token = token; + } + toString() { + return this.token.value; + } +} +// 中缀操作符节点 +class InfixOperatorAstNode extends AstNode { + constructor(token) { + super(); + this.op = token; + this.left = null; + this.right = null; + this.precedence = precedenceMap[token.value]; + } + toString() { + return `(${this.left.toString()} ${this.op.value} ${this.right.toString()})`; + } +} +// 前缀操作符 +class PrefixOperatorAstNode extends AstNode { + constructor(token, right) { + super(false); + this.op = token; + this.right = right; + } + toString() { + return `(${this.op.value} ${this.right.toString()})`; + } +} +// 后缀操作符 +class PostfixOperatorAstNode extends AstNode { + constructor(token, left) { + super(false); + this.op = token; + this.left = left; + } + toString() { + return `(${this.left.toString()} ${this.op.value})`; + } +} +// 函数声明 +class FunctionDeclarationAstNode extends AstNode { + constructor(nameToken, params, body) { + super(); + this.name = name == null ? null :new IdentifierAstNode(nameToken); + this.params = params; + this.body = body; + } + toString() { + return `function${this.name ? 
' ' + this.name.toString() : ''}(${this.params.join(',')})${this.body.map(it=>it.toString()).join('\n')}`; + } +} +// 函数调用 +class FunctionCallAstNode extends AstNode { + constructor(nameToken, args) { + super(); + this.name = new IdentifierAstNode(nameToken); + this.args = args; // args是ast数组 + } + toString() { + return `${this.name.toString()}(${this.args.map(it=>it.toString()).join(',')})` + } +} +// 分组节点 +class GroupAstNode extends AstNode { + constructor(exp) { + super(); + this.exp = exp; + } + toString() { + // 因为小括号已经在运算符的toString中使用了,这里为了更好的凸显使用中文中括号 + return `【${this.exp.toString()}】` + } +} + + + + + + + + + + + + + +const precedenceMap = { + '+': 1, + '-': 1, + '*': 2, + '/': 2 +} +const prefixPrecedenceMap = { + '-': 100, + '!': 100, + '~': 100, + '+': 100, + '++': 100, + '--': 100 +} +const postfixPrecedenceMap = { + '++': 200, + '--': 200 +} + +class Parser { + constructor(tokens) { + this.tokens = tokens; + this.cursor = 0; + } + // 语法解析,把tokens转换为sentences + parse() { + var tokens = this.tokens; + var sentences = []; + for (;;) { + var token = tokens[this.cursor]; + var sentence = null; + if (token.type === LEX.SEMICOLON) { + continue; + } else if (token.type === LEX.EOF) { + break; + } if (token.type === LEX.VAR) { + sentence = this.parseVarSentence(); + } else if (token.type === LEX.RETURN) { + sentence = this.parseReturnSentence(); + } else if (token.type === LEX.LBRACE) { + sentence = this.parseBlockSentence(); + } else { + sentence = this.parseExpressionStatement(); + } + sentences.push(sentence); + } + return sentences; + } + + // 从i开始转换成var语句,校验是不是var xx = xxx;格式,然后需要解析表达式parseExpression函数。 + parseVarSentence() { + var tokens = this.tokens; + assert (tokens[this.cursor].type === LEX.VAR); + assert (tokens[this.cursor + 1].type === LEX.IDENTIFIER); + assert (tokens[this.cursor + 2].type === LEX.ASSIGN); + var name = new IdentifierAstNode(tokens[this.cursor + 1]); + for (var j = this.cursor + 3; j < tokens.length; j++) { + if (tokens[j].type 
=== LEX.SEMICOLON || tokens[j].type === LEX.EOF) { + var value = this.parseExpression(this.cursor = this.cursor + 3); + return new VarSentence(name, value); + } + } + } + // 从i开始转换成var语句,校验是不是var xx = xxx;格式,然后需要解析表达式parseExpression函数。 + parseVarSentence() { + var tokens = this.tokens; + assert (tokens[this.cursor].type === LEX.VAR); + assert (tokens[this.cursor + 1].type === LEX.IDENTIFIER); + assert (tokens[this.cursor + 2].type === LEX.ASSIGN); + var name = new IdentifierAstNode(tokens[this.cursor + 1]); + for (var j = this.cursor + 3; j < tokens.length; j++) { + if (tokens[j].type === LEX.SEMICOLON || tokens[j].type === LEX.EOF) { + this.cursor = this.cursor + 3 + var value = this.parseExpression(); + assert(tokens[this.cursor].type === LEX.SEMICOLON || tokens[this.cursor].type == LEX.EOF); + this.cursor ++; + return new VarSentence(name, value); + } + } + } + // 与var语句类似 + parseReturnSentence() { + var tokens = this.tokens; + assert (tokens[this.cursor].type === LEX.RETURN); + for (var j = this.cursor + 1; j < tokens.length; j++) { + if (tokens[j].type === LEX.SEMICOLON || tokens[j].type === LEX.EOF) { + this.cursor += 1; + var value = this.parseExpression(); + assert(tokens[this.cursor].type === LEX.SEMICOLON || tokens[this.cursor].type == LEX.EOF); + this.cursor ++; + return new ReturnSentence(value); + } + } + } + // 转换为表达式语句 + parseExpressionStatement() { + var tokens = this.tokens; + for (var j = this.cursor; j < tokens.length; j++) { + if (tokens[j].type === LEX.SEMICOLON || tokens[j].type === LEX.EOF) { + var expression = this.parseExpression(); + assert(tokens[this.cursor].type === LEX.SEMICOLON || tokens[this.cursor].type == LEX.EOF); + this.cursor ++; + return new ExpressionStatement(expression); + } + } + } + // 转换为块语句,块语句中包含一个语句数组 + parseBlockSentence() { + var tokens = this.tokens; + var braceCount = 0; + for (var j = this.cursor; j < tokens.length; j++) { + if (tokens[j].type == LEX.LBRACE) braceCount++; + if (tokens[j].type == LEX.RBRACE) 
braceCount--; + if (braceCount == 0) { + return new BlockSentence(parse(tokens.slice(this.cursor + 1, this.cursor = j))); + } + } + throw new Error("brace not close for block sentence") + } + + // 然后修改parseExpression函数,使其接受一个参数,代表前置符号的优先级 + parseExpression(precedence = 0) { + var tokens = this.tokens; + var stack = []; + var mid = null; + while (true) { + // 此时栈为空的时候默认看到的就是上下文传进来的优先级 + var stackTopPrecedence = stack.length == 0 ? precedence: stack[stack.length - 1].precedence; + mid = mid == null ? this.nextUnaryNode() : mid; + var opNode = this.getEofOrInfixNode(tokens, this.cursor); + // 结束循环的条件改为,当前操作符优先级<=上下文优先级 并且 栈为空 + // 这样首先是能兼容为0的情况,其次前缀操作符优先级是比中缀高的,所以前缀操作符传进来的时候一定是遇到中缀就结束 + if (opNode.precedence <= precedence && stackTopPrecedence == precedence) return mid; + if (opNode.precedence <= stackTopPrecedence) { + var top = stack.pop(); + top.right = mid; + mid = top; + } + else { + opNode.left = mid; + stack.push(opNode); + this.cursor++; + mid = null; + } + } + } + nextUnaryNode() { + var tokens = this.tokens; + var node = null; + switch (tokens[this.cursor].type) { + case LEX.NUMBER: + node = new NumberAstNode(tokens[this.cursor++]); + break; + case LEX.STRING: + node = new StringAstNode(tokens[this.cursor++]); + break; + case LEX.BOOLEAN: + node = new BooleanAstNode(tokens[this.cursor++]); + break; + case LEX.NULL: + node = new NullAstNode(tokens[this.cursor++]); + break; + case LEX.IDENTIFIER: + node = new IdentifierAstNode(tokens[this.cursor++]); + break; + // 遇到前缀运算符 + case LEX.PLUS: + case LEX.MINUS: + case LEX.INCREMENT: + case LEX.DECREMENT: + case LEX.NOT: + case LEX.BIT_NOT: + // 使用parseExpression函数递归,但是要传递当前符号的优先级 + node = new PrefixOperatorAstNode(tokens[this.cursor], this.parseExpression(prefixPrecedenceMap[tokens[this.cursor++].value])); + break; + // 分组 + case LEX.LPAREN: + // 递归解析(后面的即可,因为遇到)的时候,parseExpression无法识别,就会结束解析 + this.cursor++; + // GroupAstNode其实可有可无 + node = new GroupAstNode(this.parseExpression()); + 
assert(tokens[this.cursor++].type == LEX.RPAREN, "group not closed"); + break; + default: + throw new Error('unexpected token in nextUnary: ' + tokens[this.cursor].type); + } + while (tokens[this.cursor].type == LEX.INCREMENT || tokens[this.cursor].type == LEX.DECREMENT) { + node = new PostfixOperatorAstNode(tokens[this.cursor++], node); + } + return node; + } + getEofOrInfixNode(tokens, index) { + var eof = new InfixOperatorAstNode('EOF'); + eof.precedence = 0; + if (index >= tokens.length) return eof + var token = tokens[index]; + if (precedenceMap[token.value] == null) { + return eof; + } + return new InfixOperatorAstNode(tokens[index]); + } + +} + +function assert(condition) { + if (!condition) { + throw new Error("assert failed"); + } +} + + +var code = `var a = -1 * (-++ x ++ - 3) * 4;`; + +var tokens = lex(code); +var sentences = new Parser(tokens).parse() + +for (var i = 0; i < sentences.length; i++) { + console.log(sentences[i].toString()); +} \ No newline at end of file diff --git a/24.11/parser_test9.mjs b/24.11/parser_test9.mjs new file mode 100644 index 00000000..92093a44 --- /dev/null +++ b/24.11/parser_test9.mjs @@ -0,0 +1,419 @@ +import * as LEX from "./lex.mjs"; +import { lex } from './lex.mjs'; + +class Sentence { + constructor(type) { + if (type) { + this.type = type.toUpperCase() + "_SENTENCE"; + } + } +} +class VarSentence extends Sentence { + constructor(name, value) { + super("VAR"); + this.name = name; // name本身其实也是个表达式 + this.value = value; // 这里的value是个表达式 + } + + toString() { + return `var ${this.name} = ${this.value.toString()}`; + } +} + +class ReturnSentence extends Sentence { + constructor(value) { + super("RETURN"); + this.value = value; // 这里的value也是表达式 + } + toString() { + return `return ${this.value.toString()}`; + } +} + +class BlockSentence extends Sentence { + constructor(sentences) { + super("BLOCK"); + this.sentences = sentences; + } + toString() { + return `{ + ${this.sentences.map(it=>it.toString()).join('\n')} +}` + } +} + 
+class ExpressionStatement extends Sentence { + constructor(expression) { + super("EXPRESSION"); + this.expression = expression; // 这里的expression也是表达式 + } + + toString() { + return this.expression.toString(); + } +} +// 基础类型 +class AstNode { +} +// 数字字面量 +class NumberAstNode extends AstNode { + constructor(token) { + super(); + this.token = token; + } + + toString() { + return this.token.value; + } +} +// 变量名/函数名字面量 +class IdentifierAstNode extends AstNode { + constructor(token) { + super(); + this.token = token; + } + + toString() { + return this.token.value; + } +} +// null字面量 +class NullAstNode extends AstNode { + toString() { + return "null"; + } +} + +// 字符串字面量 +class StringAstNode extends AstNode { + constructor(token) { + super(); + this.token = token; + } + toString() { + return this.token.value; + } +} +// boolean字面量 +class BooleanAstNode extends AstNode { + constructor(token) { + super(); + this.token = token; + } + toString() { + return this.token.value; + } +} +// 中缀操作符节点 +class InfixOperatorAstNode extends AstNode { + constructor(token) { + super(); + this.op = token; + this.left = null; + this.right = null; + this.precedence = precedenceMap[token.value]; + } + toString() { + return `(${this.left.toString()} ${this.op.value} ${this.right.toString()})`; + } +} +// 前缀操作符 +class PrefixOperatorAstNode extends AstNode { + constructor(token, right) { + super(false); + this.op = token; + this.right = right; + this.precedence = prefixPrecedenceMap[token.value]; + } + toString() { + return `(${this.op.value} ${this.right.toString()})`; + } +} +// 后缀操作符 +class PostfixOperatorAstNode extends AstNode { + constructor(token, left) { + super(false); + this.op = token; + this.left = left; + this.precedence = postfixPrecedenceMap[token.value]; + } + toString() { + return `(${this.left.toString()} ${this.op.value})`; + } +} +// 函数声明 +class FunctionDeclarationAstNode extends AstNode { + constructor(nameToken, params, body) { + super(); + this.name = name == null ? 
null :new IdentifierAstNode(nameToken); + this.params = params; + this.body = body; + } + toString() { + return `function${this.name ? ' ' + this.name.toString() : ''}(${this.params.join(',')})${this.body.map(it=>it.toString()).join('\n')}`; + } +} +// 函数调用 +class FunctionCallAstNode extends AstNode { + constructor(nameToken, args) { + super(); + this.name = new IdentifierAstNode(nameToken); + this.args = args; // args是ast数组 + } + toString() { + return `${this.name.toString()}(${this.args.map(it=>it.toString()).join(',')})` + } +} +// 分组节点 +class GroupAstNode extends AstNode { + constructor(exp) { + super(); + this.exp = exp; + } + toString() { + // 因为小括号已经在运算符的toString中使用了,这里为了更好的凸显使用中文中括号 + return `【${this.exp.toString()}】` + } +} + + + + + + + + + + + + + +const precedenceMap = { + '+': 1, + '-': 1, + '*': 2, + '/': 2 +} +const prefixPrecedenceMap = { + '-': 100, + '!': 100, + '~': 100, + '+': 100, + '++': 100, + '--': 100 +} +const postfixPrecedenceMap = { + '++': 200, + '--': 200 +} + +class Parser { + constructor(tokens) { + this.tokens = tokens; + this.cursor = 0; + } + // 语法解析,把tokens转换为sentences + parse() { + var tokens = this.tokens; + var sentences = []; + for (;;) { + var token = tokens[this.cursor]; + var sentence = null; + if (token.type === LEX.SEMICOLON) { + continue; + } else if (token.type === LEX.EOF) { + break; + } if (token.type === LEX.VAR) { + sentence = this.parseVarSentence(); + } else if (token.type === LEX.RETURN) { + sentence = this.parseReturnSentence(); + } else if (token.type === LEX.LBRACE) { + sentence = this.parseBlockSentence(); + } else { + sentence = this.parseExpressionStatement(); + } + sentences.push(sentence); + } + return sentences; + } + + // 从i开始转换成var语句,校验是不是var xx = xxx;格式,然后需要解析表达式parseExpression函数。 + parseVarSentence() { + var tokens = this.tokens; + assert (tokens[this.cursor].type === LEX.VAR); + assert (tokens[this.cursor + 1].type === LEX.IDENTIFIER); + assert (tokens[this.cursor + 2].type === LEX.ASSIGN); + var 
name = new IdentifierAstNode(tokens[this.cursor + 1]); + for (var j = this.cursor + 3; j < tokens.length; j++) { + if (tokens[j].type === LEX.SEMICOLON || tokens[j].type === LEX.EOF) { + var value = this.parseExpression(this.cursor = this.cursor + 3); + return new VarSentence(name, value); + } + } + } + // 从i开始转换成var语句,校验是不是var xx = xxx;格式,然后需要解析表达式parseExpression函数。 + parseVarSentence() { + var tokens = this.tokens; + assert (tokens[this.cursor].type === LEX.VAR); + assert (tokens[this.cursor + 1].type === LEX.IDENTIFIER); + assert (tokens[this.cursor + 2].type === LEX.ASSIGN); + var name = new IdentifierAstNode(tokens[this.cursor + 1]); + for (var j = this.cursor + 3; j < tokens.length; j++) { + if (tokens[j].type === LEX.SEMICOLON || tokens[j].type === LEX.EOF) { + this.cursor = this.cursor + 3 + var value = this.parseExpression(); + assert(tokens[this.cursor].type === LEX.SEMICOLON || tokens[this.cursor].type == LEX.EOF); + this.cursor ++; + return new VarSentence(name, value); + } + } + } + // 与var语句类似 + parseReturnSentence() { + var tokens = this.tokens; + assert (tokens[this.cursor].type === LEX.RETURN); + for (var j = this.cursor + 1; j < tokens.length; j++) { + if (tokens[j].type === LEX.SEMICOLON || tokens[j].type === LEX.EOF) { + this.cursor += 1; + var value = this.parseExpression(); + assert(tokens[this.cursor].type === LEX.SEMICOLON || tokens[this.cursor].type == LEX.EOF); + this.cursor ++; + return new ReturnSentence(value); + } + } + } + // 转换为表达式语句 + parseExpressionStatement() { + var tokens = this.tokens; + for (var j = this.cursor; j < tokens.length; j++) { + if (tokens[j].type === LEX.SEMICOLON || tokens[j].type === LEX.EOF) { + var expression = this.parseExpression(); + assert(tokens[this.cursor].type === LEX.SEMICOLON || tokens[this.cursor].type == LEX.EOF); + this.cursor ++; + return new ExpressionStatement(expression); + } + } + } + // 转换为块语句,块语句中包含一个语句数组 + parseBlockSentence() { + var tokens = this.tokens; + var braceCount = 0; + for (var j 
= this.cursor; j < tokens.length; j++) { + if (tokens[j].type == LEX.LBRACE) braceCount++; + if (tokens[j].type == LEX.RBRACE) braceCount--; + if (braceCount == 0) { + return new BlockSentence(parse(tokens.slice(this.cursor + 1, this.cursor = j))); + } + } + throw new Error("brace not close for block sentence") + } + + parseExpression() { + var tokens = this.tokens; + var stack = []; + var mid = null; + while (true) { + var stackTopPrecedence = stack.length == 0 ? 0: stack[stack.length - 1].precedence; + mid = mid == null ? this.nextUnaryNode() : mid; + // 如果是next返回的不完整前缀表达式,相当于left填充过的二元操作符,直接塞到stack + if (mid instanceof PrefixOperatorAstNode && mid.right == null) { + stack.push(mid); + mid = null; + continue; + } + var opNode = this.getEofOrInfixNodeOrPostNode(tokens, this.cursor); + if (opNode.precedence == 0 && stackTopPrecedence == 0) + return mid; + if (opNode instanceof PostfixOperatorAstNode) { + opNode.left = mid; + mid = opNode; + this.cursor++; + } + else if (opNode.precedence <= stackTopPrecedence) { + var top = stack.pop(); + top.right = mid; + mid = top; + } + else { + opNode.left = mid; + stack.push(opNode); + this.cursor++; + mid = null; + } + } + } + + nextUnaryNode() { + var tokens = this.tokens; + var node = null; + switch (tokens[this.cursor].type) { + case LEX.NUMBER: + node = new NumberAstNode(tokens[this.cursor++]); + break; + case LEX.STRING: + node = new StringAstNode(tokens[this.cursor++]); + break; + case LEX.BOOLEAN: + node = new BooleanAstNode(tokens[this.cursor++]); + break; + case LEX.NULL: + node = new NullAstNode(tokens[this.cursor++]); + break; + case LEX.IDENTIFIER: + node = new IdentifierAstNode(tokens[this.cursor++]); + break; + // 遇到前缀运算符 + case LEX.PLUS: + case LEX.MINUS: + case LEX.NOT: + case LEX.BIT_NOT: + case LEX.INCREMENT: + case LEX.DECREMENT: // 使用parseExpression函数递归,但是要传递当前符号的优先级 + node = new PrefixOperatorAstNode(tokens[this.cursor++], null); + break; + // 分组 + case LEX.LPAREN: + // 
递归解析(后面的即可,因为遇到)的时候,parseExpression无法识别,就会结束解析 + this.cursor++; + // GroupAstNode其实可有可无 + node = new GroupAstNode(this.parseExpression()); + assert(tokens[this.cursor++].type == LEX.RPAREN, "group not closed"); + break; + default: + throw new Error('unexpected token in nextUnary: ' + tokens[this.cursor].type); + } + + return node; + } + getEofOrInfixNodeOrPostNode(tokens, index) { + var eof = new InfixOperatorAstNode('EOF'); + eof.precedence = 0; + if (index >= tokens.length) return eof + var token = tokens[index]; + if (precedenceMap[token.value] == null && postfixPrecedenceMap[token.value] == null) { + return eof; + } + if (token.type == LEX.INCREMENT || token.type == LEX.DECREMENT) { + return new PostfixOperatorAstNode(tokens[index], null); + } + return new InfixOperatorAstNode(tokens[index]); + } +} + +function assert(condition) { + if (!condition) { + throw new Error("assert failed"); + } +} + + +var code = `var a = -1 * (-++ x- y--) * 4;`; + +var code = `var a = -1 + -2 * a++ / null - !false + "hello";` + +var tokens = lex(code); +var sentences = new Parser(tokens).parse() + +for (var i = 0; i < sentences.length; i++) { + console.log(sentences[i].toString()); +} \ No newline at end of file diff --git "a/24.12/\350\257\255\346\263\225\345\210\206\346\236\220.md" "b/24.12/\350\257\255\346\263\225\345\210\206\346\236\220.md" index 104666c7..4de0df51 100644 --- "a/24.12/\350\257\255\346\263\225\345\210\206\346\236\220.md" +++ "b/24.12/\350\257\255\346\263\225\345\210\206\346\236\220.md" @@ -6,14 +6,14 @@ tags: - parser - 解释器 --- -# 1 概述 +# 1 解析语句 `词法分析`将`char[]`转换为了`token[]`,接下来就是`语法分析`,语法分析是将`token[]`转换为`sentence[]`,对于`sentence`, ``` char[] --lex--> token[] --parse--> sentence[] ``` `sentence`的转换,例如`var`语句、`return`语句。 -`var`语句的解析,就是遍历`token[]`,如果遇到`var`,则判断下一个是不是`IDENTiFER`,如果是,则判断下一个是不是`=`,如果是,则往下直到找到`;`或`换行`或者`EOF`,我们暂时支持这三种情况,而中间的部分的`token[]`可能是数字,也可能是其他变量,也可能是函数调用,也可能是其他操作,但不管是什么东西,都有一个共性就是有返回值,我们把这个东西叫做`表达式`。`return`语句是类似的,他更简单,只需要`return EXPRESSION;`. 
+`var`语句的解析,就是遍历`token[]`,如果遇到`var`,则判断下一个是不是`IDENTIFIER`,如果是,则判断下一个是不是`=`,如果是,则往下直到找到`;`或者`EOF`(`换行`已经在词法分析阶段被忽略),我们暂时支持这两种情况,而中间的部分的`token[]`可能是数字,也可能是其他变量,也可能是函数调用,也可能是其他操作,但不管是什么东西,都有一个共性就是有返回值,我们把这个东西叫做`表达式`。`return`语句是类似的,他更简单,只需要`return EXPRESSION;`,还有一种常见的语句是块`block`,也就是`{}`中放置多个语句的形式,这也是一种语句,语句中含有其他语句。 ```js var IDENTIFER = token[]; return token[]; @@ -23,25 +23,28 @@ var a = 1; var a = b; var a = b + 1; var a = func(1, 1); -var a = 3 + 3 * 2 -func(1,1) +var a = 3 + 3 * 2 -func(1,1); return a; return 1; return func(1,1); ... +{ + var a = 1; +} ``` -表达式在大多数解释型语言中都可以单独作为一个语句例如单独写一个`1 + 1;`,这也是一种语句类型,所以我们最终有三种语句类型,`var语句` `return语句` 和`表达式语句`,暂时不考虑`if/while/for`等流程控制语句。(这里有人会困惑,函数声明不算是语句吗?函数声明是有返回值的表达式,返回当前函数。) +表达式在大多数解释型语言中都可以单独作为一个语句例如单独写一个`1 + 1;`,这也是一种语句类型,所以我们需要支持`var语句` `return语句` `块语句` `表达式语句`,暂时不考虑`if/while/for`等流程控制语句,后续会补充。(这里有人会困惑,函数声明不算是语句吗?函数声明是有返回值的表达式,返回当前函数。) ```js a + 1; 1 + 2; func(1,2); ``` -我们可以看到三种语句中都有一个绕不开的话题,就是`表达式`,表达式可以作为`var`语句的初始化值,也可以作为`return`语句的返回值,也可以作为`表达式语句`的表达式,但是表达式本身可以作为其他表达式的子表达式,例如`1 + 2`,中`1`和`2`本身其实也是表达式,通过运算符可以把两个表达式连接成一个新的表达式。`var` `return`格式的解析比较简单,语法分析最难的是**表达式的解析**。 +我们可以看到除了块,其他三种语句中都有一个绕不开的话题,就是`表达式`,表达式可以作为`var`语句的初始化值,也可以作为`return`语句的返回值,也可以作为`表达式语句`的表达式,但是表达式本身可以作为其他表达式的子表达式,例如`1 + 2`,中`1`和`2`本身其实也是表达式,通过运算符可以把两个表达式连接成一个新的表达式。`var` `return`格式的解析比较简单,语法分析最难的是**表达式的解析**。 -这里我们借助上一节中提供的`lex.mjs`来开展本节的工作(当然也强烈建议先了解词法分析,再来看语法分析) +这里我们借助上一节中提供的`lex.mjs`来开展本节的工作(当然也强烈建议先了解词法分析,再来看语法分析),定义两种主要类型`Sentence`和`AstNode`,分别用来表示语句和抽象语法树节点,前者有四种语句类型也就是上面提到的`var语句` `return语句` `块语句` `表达式语句`,后者`AstNode`则是表达式的抽象语法树节点,这里我们暂且不需要关心如何解析表达式,只需要知道,每个表达式会被解析为一个抽象语法树节点,返回的就是`root`节点即可。 ```js :parser.mjs import * as LEX from "./lex.mjs"; -import { lex, Token } from './lex.mjs'; +import { lex } from './lex.mjs'; class Sentence { constructor(type) { @@ -51,7 +54,6 @@ class Sentence { } } } -class Expresstion {} class VarSentence extends Sentence { constructor(name, value, endPos) { @@ -78,10 +80,11 @@ class ExpressionStatement extends Sentence { } } 
-class IdentifierExpression extends Expresstion { - constructor(str, token) { - super("IDENTIFIER"); - this.str = str; +class AstNode {} + +class IdentifierAstNode extends AstNode { + constructor(token) { + super(); this.token = token; } } @@ -91,7 +94,7 @@ function parse(tokens) { for (var i = 0; i < tokens.length; i++) { var token = tokens[i]; var sentence = null; - if (token.type === LEX.NEW_LINE || token.type === LEX.SEMICOLON) { + if (token.type === LEX.SEMICOLON) { continue; } else if (token.type === LEX.EOF) { break; @@ -113,9 +116,9 @@ function parseVarSentence(tokens, start) { assert (tokens[start].type === LEX.VAR); assert (tokens[start + 1].type === LEX.IDENTIFIER); assert (tokens[start + 2].type === LEX.ASSIGN); - var name = new IdentifierExpression(tokens[start + 1].value, tokens[start + 1]); + var name = new IdentifierAstNode(tokens[start + 1]); for (var i = start + 3; i < tokens.length; i++) { - if (tokens[i].type === LEX.SEMICOLON || tokens[i].type === LEX.NEW_LINE || tokens[i].type === LEX.EOF) { + if (tokens[i].type === LEX.SEMICOLON || tokens[i].type === LEX.EOF) { var value = parseExpression(tokens, start + 3, i); return new VarSentence(name, value, i); } @@ -126,7 +129,7 @@ function parseVarSentence(tokens, start) { function parseReturnSentence(tokens, start) { assert (tokens[start].type === LEX.RETURN); for (var i = start + 1; i < tokens.length; i++) { - if (tokens[i].type === LEX.SEMICOLON || tokens[i].type === LEX.NEW_LINE || tokens[i].type === LEX.EOF) { + if (tokens[i].type === LEX.SEMICOLON || tokens[i].type === LEX.EOF) { var value = parseExpression(tokens, start + 1, i); return new ReturnSentence(value, i); } @@ -136,7 +139,7 @@ function parseReturnSentence(tokens, start) { // 转换为表达式语句 function parseExpressionStatement(tokens, start) { for (var i = start; i < tokens.length; i++) { - if (tokens[i].type === LEX.SEMICOLON || tokens[i].type === LEX.NEW_LINE || tokens[i].type === LEX.EOF) { + if (tokens[i].type === LEX.SEMICOLON || 
tokens[i].type === LEX.EOF) { var expression = parseExpression(tokens, start, i); return new ExpressionStatement(expression, i); } @@ -145,7 +148,7 @@ function parseExpressionStatement(tokens, start) { // 这里先放置个空的逻辑,后面再补上 function parseExpression(tokens, start, end) { - return new Expresstion(); + return new AstNode(); } function assert(condition) { @@ -170,7 +173,6 @@ console.log(JSON.stringify(sentences, 0, 2)); "endPos": 12, "type": "VAR_SENTENCE", "name": { - "str": "a", "token": { "type": "IDENTIFIER", "value": "a" @@ -179,18 +181,189 @@ console.log(JSON.stringify(sentences, 0, 2)); "value": {} }, { - "endPos": 16, + "endPos": 15, "type": "RETURN_SENTENCE", "value": {} }, { - "endPos": 24, + "endPos": 22, "type": "EXPRESSION_SENTENCE", "expression": {} } ] */ ``` +上面代码和测试,对应的是`24.11/parser_test1.mjs`文件的内容,但是目前有2个问题,1是转换成的对象结构,查看起来有点麻烦,我们可以给每一种`Sentence`类型都新增`toString`方法;2是为了记录当前解析到哪个位置了,我们在`Sentence`类中新增一个属性`endPos`,这是一个解析过程中的变量,实际上不应该被语句持有,我们可以优化一下,将`parseXXX`函数放到`parse`函数中,这样下标`i`就是可以捕捉的变量,对i直接操作就可以了。经过优化后的代码如下,对应`24.11/parser_test2.mjs`文件的内容: +```js :parser_test2.mjs +import * as LEX from "./lex.mjs"; +import { lex } from './lex.mjs'; + +class Sentence { + constructor(type) { + if (type) { + this.type = type.toUpperCase() + "_SENTENCE"; + } + } +} +class VarSentence extends Sentence { + constructor(name, value) { + super("VAR"); + this.name = name; // name本身其实也是个表达式 + this.value = value; // 这里的value是个表达式 + } + + toString() { + return `var ${this.name} = ${this.value.toString()}`; + } +} + +class ReturnSentence extends Sentence { + constructor(value) { + super("RETURN"); + this.value = value; // 这里的value也是表达式 + } + toString() { + return `return ${this.value.toString()}`; + } +} + +class BlockSentence extends Sentence { + constructor(sentences) { + super("BLOCK"); + this.sentences = sentences; + } + toString() { + return `{ + ${this.sentences.map(it=>it.toString()).join('\n')} +}` + } +} + +class ExpressionStatement extends Sentence { + constructor(expression) 
{ + super("EXPRESSION"); + this.expression = expression; // 这里的expression也是表达式 + } + + toString() { + return this.expression.toString(); + } +} + +class AstNode {} + +class IdentifierAstNode extends AstNode { + constructor(token) { + super(); + this.token = token; + } + toString() { + return this.token.value; + } +} +// 语法解析,把tokens转换为sentences +function parse(tokens) { + // 从i开始转换成var语句,校验是不是var xx = xxx;格式,然后需要解析表达式parseExpression函数。 + function parseVarSentence() { + assert (tokens[i].type === LEX.VAR); + assert (tokens[i + 1].type === LEX.IDENTIFIER); + assert (tokens[i + 2].type === LEX.ASSIGN); + var name = new IdentifierAstNode(tokens[i + 1]); + for (var j = i + 3; j < tokens.length; j++) { + if (tokens[j].type === LEX.SEMICOLON || tokens[j].type === LEX.EOF) { + var value = parseExpression(tokens, i + 3, j); + i = j; + return new VarSentence(name, value); + } + } + } + // 与var语句类似 + function parseReturnSentence() { + assert (tokens[i].type === LEX.RETURN); + for (var j = i + 1; j < tokens.length; j++) { + if (tokens[j].type === LEX.SEMICOLON || tokens[j].type === LEX.EOF) { + var value = parseExpression(tokens, i + 1, j); + i = j; + return new ReturnSentence(value); + } + } + } + // 转换为表达式语句 + function parseExpressionStatement() { + for (var j = i; j < tokens.length; j++) { + if (tokens[j].type === LEX.SEMICOLON || tokens[j].type === LEX.EOF) { + var expression = parseExpression(tokens, i, j); + i = j; + return new ExpressionStatement(expression); + } + } + } + // 转换为块语句,块语句中包含一个语句数组 + function parseBlockSentence() { + var braceCount = 0; + for (var j = i; j < tokens.length; j++) { + if (tokens[j].type == LEX.LBRACE) braceCount++; + if (tokens[j].type == LEX.RBRACE) braceCount--; + if (braceCount == 0) { + return new BlockSentence(parse(tokens.slice(i + 1, i = j))); + } + } + throw new Error("brace not close for block sentence") + } + var sentences = []; + for (var i = 0; i < tokens.length; i++) { + var token = tokens[i]; + var sentence = null; + if 
(token.type === LEX.SEMICOLON) { + continue; + } else if (token.type === LEX.EOF) { + break; + } if (token.type === LEX.VAR) { + sentence = parseVarSentence(); + } else if (token.type === LEX.RETURN) { + sentence = parseReturnSentence(); + } else if (token.type === LEX.LBRACE) { + sentence = parseBlockSentence(); + } else { + sentence = parseExpressionStatement(); + } + sentences.push(sentence); + } + return sentences; +} + + +// 这里先放置个空的逻辑,后面再补上 +function parseExpression(tokens, start, end) { + return new AstNode(); +} + +function assert(condition) { + if (!condition) { + throw new Error("assert failed"); + } +} + +var code = `var a = 1 + 2 * 3 / 4 - 5; + return a; + func(a, b); + {var a = 100;}`; + +var tokens = lex(code); +var sentences = parse(tokens) + +for (var i = 0; i < sentences.length; i++) { + console.log(sentences[i].toString()); +} +/** 打印结果 + * var a = [object Object] +* return [object Object] +* [object Object] +* { +* var a = [object Object] +* } +*/ +``` # 2 解析表达式 上面的代码非常简单,就是按照特定的语句类型去解析语句罢了,只不过`parseExpression`函数给了一个空实现,这就是解析表达式的部分。那么表达式都有哪些形式呢? 
- 数字、字符串、布尔值、null、变量,这些单个值就是表达式,比如`1`、`"hello"`、`true`、`null`、`a`等。 @@ -215,6 +388,7 @@ console.log(JSON.stringify(sentences, 0, 2)); ```js 1 + 2 * 3 / 4 - 5 ``` +### 2.1.1 优先级排序依次合并 从操作符的优先级去考虑,其实问题很简单,表达式只有两种`token`一种就是`操作符`,另一种是`数字`。我们遍历一遍节点,找到优先级最高的`操作符`,然后把他两边的节点挂到他的`left`和`right`上,把这个`操作符`标记一下已经完成了。 然后再来一次,找到剩下的`操作符`优先级最高的,重复上面操作,不断循环,最后数组就只剩下一个节点了,也就是我们要的树的root节点了,有点绕么?我们来看下面的操作图。 @@ -254,6 +428,9 @@ class NumberAstNode extends AstNode { super(true); this.value = value; } + toString() { + return this.value; + } } class InfixOperatorAstNode extends AstNode { constructor(token) { @@ -263,6 +440,9 @@ class InfixOperatorAstNode extends AstNode { this.right = null; this.precedence = precedenceMap[token.value]; } + toString() { + return `(${this.left.toString()} ${this.op.value} ${this.right.toString()})`; + } } function parseExpression(tokens, start, end) { // 1 先把tokens 转成 AstNode数组 @@ -298,21 +478,23 @@ function parseExpression(tokens, start, end) { return nodes[0]; } ``` -这个方法非常的简单,对于理解表达式解析非常有用,具有很好的教学意义,但他还是有一些问题。主要是算法的复杂度不太理想,当前的复杂度是`O(n^2)`,还可以进行简单的优化。一种纯工程的优化方式就是,把`AstNode[]`数组改为双向链表,即给`AstNode`加两个属性`prev`和`next`。遍历`token`,将其append到双向链表尾部(最开始放个哨兵节点),并且如果当前节点是运算符,则塞到优先队列(堆)中,优先队列按照节点的运算优先级从高到低。这样每次从优先队列中`pop`取出优先级最高的节点,然后从双向链表中取出它的左右节点,放到自己的`left`和`right`属性上,然后继续从优先队列`pop`,重复这个过程,直到队列中没有元素。此时哨兵节点后面紧邻的一个节点就是`AstTree`根节点。这个优化方式非常工程化,代码也很简单,这里不展开了,感兴趣的自己简单实现下,他的复杂度是`O(nlogn)`。 +这个方法非常的简单,对应全量代码在`24.11/parser_test3.mjs`。对于理解表达式解析非常有用,具有很好的教学意义,但他还是有一些问题。主要是算法的复杂度不太理想,当前的复杂度是`O(n^2)`,还可以进行简单的优化。一种纯工程的优化方式就是,把`AstNode[]`数组改为双向链表,即给`AstNode`加两个属性`prev`和`next`。遍历`token`,将其append到双向链表尾部(最开始放个哨兵节点),并且如果当前节点是运算符,则塞到优先队列(堆)中,优先队列按照节点的运算优先级从高到低。这样每次从优先队列中`pop`取出优先级最高的节点,然后从双向链表中取出它的左右节点,放到自己的`left`和`right`属性上,然后继续从优先队列`pop`,重复这个过程,直到队列中没有元素。此时哨兵节点后面紧邻的一个节点就是`AstTree`根节点。这个优化方式非常工程化,代码也很简单,这里不展开了,感兴趣的自己简单实现下,他的复杂度是`O(nlogn)`。 +### 2.1.2 Shunting Yard逆波兰 
但是这个复杂度还不太够,目标是`O(n)`,我们上面思想是从最高优先级开始合并,依次到最低优先级,所以潜在就是有排序的,排序就意味着不可能超过`O(nlogn)`,如果要想达到`O(n)`,那其实就是遍历一遍,而不要排序,不排序,怎么保证高优先级的先合并呢?好这里其实有一个思维定式,就是我们一直想着让最高优先级的最先合并,其实一个表达式的节点合并,并不需要最高优先级的先合并,举个例子:`1 + 2 + 3 * 4`,一定需要先`(3*4)`然后再`(1+2)`最后`(1+2)+(3*4)`吗?不,其实读取到第二个加号的时候,就可以先`(1+2)`了,然后读取到乘号的时候`(3*4)`最后`(1+2)+(3*4)`。其实是当我们读取到优先级小于等于前一个符号优先级的时候,就可以把前一个符号进行合并了。也就是我们不需要先找到最高优先级的,只要发现后一个符号优先级小于等于前一个,那前一个就可以进行合并操作。 +上面的思路还把`full`这个额外字段去掉了,代码更整洁了,另外还希望把`parseExpression`这个函数的入参中的`end`给去掉,这也是一个冗余的字段,可以通过是否到了`tokens`结束位置,或者遇到了无法处理的`token`就结束表达式的识别。修改后代码如下: ```js -function parseExpression(tokens, start, end) { +function parseExpression(tokens, start) { var nodes = []; var opNodes = []; - for (var i = start; i < end; i++) { + for (var i = start; i < tokens.length; i++) { var token = tokens[i]; if (token.type === LEX.NUMBER) { nodes.push(new NumberAstNode(token.value)); } else if (token.type === LEX.PLUS || token.type === LEX.MINUS || token.type === LEX.MULTIPLY || token.type === LEX.DIVIDE) { var node = new InfixOperatorAstNode(token); - if (opNodes.length > 0 && node.precedence <= opNodes[opNodes.length - 1].precedence) { + while (opNodes.length > 0 && node.precedence <= opNodes[opNodes.length - 1].precedence) { var opNode = opNodes.pop(); var opIndex = nodes.indexOf(opNode); opNode.left = nodes[opIndex - 1]; @@ -320,48 +502,69 @@ function parseExpression(tokens, start, end) { nodes.splice(opIndex - 1, 3, opNode); } nodes.push(node); + opNodes.push(node); } else { - throw new Error("unexpected token type: " + token.type); + // 无法识别的token结束表达式识别 + break; } } + // 遍历完之后,opNode是单调增的优先级,挨着融合即可,或者也可以整合到for循环中,用一个优先级为0的EOF哨兵节点 + // 可以减少下面重复代码,但是为了更好理解,我就把这段代码摘出来放到下面了 + while (opNodes.length > 0) { + var opNode = opNodes.pop(); + var opIndex = nodes.indexOf(opNode); + opNode.left = nodes[opIndex - 1]; + opNode.right = nodes[opIndex + 1]; + nodes.splice(opIndex - 1, 3, opNode); + } return nodes[0]; } -``` +``` +插一句:删除`end`后,其实需要对`parseXXXSentence`进行整改,例如当遇到`var a = 1 + 
1千万;`这个非法语句的时候,`start`指向第一个1,`end`指向分号,需要解析的部分为`1 + 1千万`,然后`千`这里无法解析就报错了。而如果去掉`end`参数,会导致解析`1 + 1`后遇到`千`就返回了,并且不报错,最终的结果实际是`var a = 1 + 1`,为了能够处理非法语句,就需要在`parseVarSentence`中判断表达式解析完之后,指针是不是指向`;`。 + 这样我们只需要遍历一遍就得到结果了,不过这个代码稍微有点问题`var opIndex = nodes.indexOf(opNode);`这一行的复杂度是`O(n)`导致最终复杂度还是`O(n^2)`,我们需要把nodes从数组改成双向链表,不过双向链表会使`AstNode`代码变复杂,我就不展示代码了,只需要直到改成双向链表其实复杂度就会降低到`O(n)`。而不展示代码的另一个原因是,还有一种更简单的写法如下,因为四则运算都是左右二元的运算符,所以上一个运算符一定位于当前`nodes`数组的倒数第二个位置,所以可以如下简化,此时复杂度`O(n)` ```js -function parseExpression(tokens, start, end) { +function parseExpression(tokens, start) { var nodes = []; var opNodes = []; - for (var i = start; i < end; i++) { + for (var i = start; i < tokens.length; i++) { var token = tokens[i]; if (token.type === LEX.NUMBER) { nodes.push(new NumberAstNode(token.value)); } else if (token.type === LEX.PLUS || token.type === LEX.MINUS || token.type === LEX.MULTIPLY || token.type === LEX.DIVIDE) { var node = new InfixOperatorAstNode(token); - if (opNodes.length > 0 && node.precedence <= opNodes[opNodes.length - 1].precedence) { + while (opNodes.length > 0 && node.precedence <= opNodes[opNodes.length - 1].precedence) { var opNode = opNodes.pop(); - opNode.right = nodes[nodes.length - 1]; - opNode.left = nodes[nodes.length - 3]; - nodes.pop(); - nodes.pop(); - nodes.pop(); - nodes.unshift(opNode); + // opNode一定是倒数第二个元素,所以就可以简化成下面这样 + opNode.right = nodes.pop(); + nodes.pop(); + opNode.left = nodes.pop(); + nodes.push(opNode); } nodes.push(node); + opNodes.push(node); } else { - throw new Error("unexpected token type: " + token.type); + break; } } + while (opNodes.length > 0) { + var opNode = opNodes.pop(); + // opNode一定是倒数第二个元素,所以就可以简化成下面这样 + opNode.right = nodes.pop(); + nodes.pop(); + opNode.left = nodes.pop(); + nodes.push(opNode); + } return nodes[0]; } ``` 
-这其实就是逆波兰表达式的思想,只不过逆波兰表达式,是把`nodes`这个数组在开始的时候只存放`NumberAstNode`,而`opNodes`这个是用栈实现的,存放的是`InfixOperatorAstNode`,遇到优先级小于等于栈顶的运算符就出栈,放到`nodes`的最后,形成后缀表达式,我们这里不是为了产生逆波兰表达式,所以直接把需要操作的节点给合并了,但是都是用到了一个核心思想:**当新的操作符优先级小于等于栈顶操作符,则栈顶操作符可以弹出并进行合并**,本质上就是单调栈的思想。 +这其实就是逆波兰表达式的思想,当然逆波兰表达式的标准解法中,一个栈只用来存储数字,另一个只用来存储操作符,比我们上面做法要精炼,但是都是用到了一个核心思想:**当新的操作符优先级小于等于栈顶操作符,则栈顶操作符可以弹出并进行合并**,本质上就是单调栈的思想。 基于这种思想进行语法分析,其实又叫`Shunting Yard`算法,是一种最简单朴素的表达式解析的方法,他的优点就是非常简单容易理解,缺点则是: - 1 右结合是默认不支持的,`a=b=1`的场景下,上面栈的顺序,始终会在遇到第二个等号的时候,先计算`a=b`,需要做一些改动,即`<=`这里需要根据操作符进行调整是`<=`还是`<` - 2 对于复杂语法,例如函数调用,三目运算符,等,需要额外的修改才行。 - 3 对于上下文敏感的语法,基本无能为力。 - +### 2.1.3 Pratt算法 所以现代编程语言的解析器基本不适用`Shunting Yard`算法,而是采用了`Pratt`算法。我们再来聊一下`Pratt`的实现思想。`Pratt`也是基于运算符优先级,他基于每个双目操作符,在被遍历到的时候: - 当前运算符优先级比之前的更高,则`cur.left=两个运算符中间的节点` - 当前运算符优先级比之前的更低,则`pre.right=两个运算符中间的节点` @@ -396,24 +599,762 @@ function parseExpression(tokens, start, end) { ![img](https://i.imgur.com/SRIOM9r.png) -那来实现一下: +那来实现一下,完整代码在`24.11/parser_test4.mjs`: ```js -function parseExpression(tokens, start, end, precedence, currentIndex = {index: start}) { - var numNode = new NumberAstNode(tokens[start].value); - var result = numNode; +// 参数中添加了优先级 +function parseExpression(tokens, start, precedence=0) { + // 因为用到递归,并且因为有递归,所以start这个下标的位置需要用引用类型 + // 这样递归更深中的移动,也会在上层改变start的值,所以进入前简单处理下start如果是数字,修改为对象类型 + if (start.index === undefined) { + return parseExpression(tokens, {index:start}, precedence); + } + var leftNode = new NumberAstNode(tokens[start.index].value); + while (start.index < tokens.length - 1 && isValidInfixOperator(tokens[start.index + 1])) { + var opNode = new InfixOperatorAstNode(tokens[start.index + 1]); + if (opNode.precedence <= precedence) { + return leftNode; + } else { + opNode.left = leftNode; + start.index += 2; + opNode.right = parseExpression(tokens, start, opNode.precedence); + leftNode = opNode; + } + } + return leftNode; +} +function isValidInfixOperator(token) { + return token.type === LEX.PLUS || token.type === LEX.MINUS || 
token.type === LEX.MULTIPLY || token.type === LEX.DIVIDE; +} +``` +这个代码行数不多,但是因为有循环和递归,并且`dfs`中是有返回值的情况是比较难理解的,所以我们即使上面陈述这么多,还是会觉得云里雾里。我们可以把上述代码改为无返回值的`dfs`形式,这种比较好理解一点: +```js +function parseExpression(tokens, start, preNode) { + // 第一次调用的时候,preNode也是不传的,我们自己构造一个哨兵节点 + if (start.index === undefined) { + preNode = new InfixOperatorAstNode(''); preNode.precedence = 0; + parseExpression(tokens, {index:start},preNode) + return preNode.right; + } + // 我们把变量名改为mid,每次循环中的处理,就是为了决定mid这个中间元素的归属 + // 如果当前操作符优先级比前一个要高,则归当前opNode的left + // 否则,mid归前一个的right,前一个咋来的呢,是递归传进来的preNode + var mid = new NumberAstNode(tokens[start.index].value); + var precedence = preNode.precedence; - while (currentIndex.index < end - 1) { - var opNode = new InfixOperatorAstNode(tokens[currentIndex.index + 1]); + // start数值参数换成对象的原因是,下面的递归会改动这个指针. + // 而指针的移动是不可逆的,所以要传指针,而不是传值。 + while (start.index < tokens.length - 1 && isValidInfixOperator(tokens[start.index + 1])) { + var opNode = new InfixOperatorAstNode(tokens[start.index + 1]); if (opNode.precedence <= precedence) { - currentIndex.index = start; // currentIndex是用对象类型,记录的上下文在所有的dfs中是一个单例 - return result; + preNode.right = mid; // pre赢得mid,pre左右都填充了 + return; } else { - opNode.left = result; - opNode.right = parseExpression(tokens, currentIndex.index + 2, end, opNode.precedence, currentIndex); - result = opNode; + opNode.left = mid; // opNode赢得mid + start.index += 2; // 指针往后移动2个(每次移动2个),一个数字一个符号 + parseExpression(tokens, start, opNode); // opNode作为preNode,指针往后移动 + mid = opNode; // opNode的right在递归中填充完毕,此时他作为下一任mid + } + } + preNode.right = mid; +} +``` +改成上面的无返回值形式,配合代码的注释更好理解了。 +### 2.1.4 栈版本的Pratt算法 +但因为有递归还是没有很平面化,我们用栈的方式能更好的理解这个算法。如下图,我们每次遍历两个元素,一个是中间元素mid,一个是操作符。这里的中间元素一定是数字,或者left,right都已填充的操作符节点。`while`循环,每次只需要准备栈顶元素、`mid`中间元素、当前操作符元素,栈顶和当前优先级pk,决定mid是归谁。 + +![img](https://i.imgur.com/G9UAr99.png) + 
+每次循环要做的事情,就是将栈顶元素(一定是right为空的操作符节点)和当前操作符进行优先级PK,优先级更高的获得中间元素,如果是当前操作符赢了,那么要将中间元素挂在自己左边(因为位置上就是在自己左边),否则中间元素挂在栈顶元素的右边。如果是后者的话,栈顶元素的right就填充好了,此时他不能再留在栈中了,就弹出来作为下一任中间元素,继续循环。下一次循环中,当前遍历的指针不动,只是中间元素换成了弹出的栈顶元素,而当前栈顶元素也要换人。而循环结束的条件就是,左边是空栈(前面没有未完成的操作符),右边是`EOF`(后面也没有未完成的操作符),直接返回mid(中间的就是结果了)。该版本的代码在`24.11/parser_test5.mjs`: +```js +function parseExpression(tokens, start) { + var stack = []; + var i = start, mid = null; + while (true) { + // 每个循环,准备好栈顶优先级、中间元素、当前操作符 + var stackTopPrecedence = stack.length == 0? 0: stack[stack.length - 1].precedence; + mid = mid == null ? new NumberAstNode(tokens[i++].value) : mid; + var opNode = getEofOrInfixNode(tokens, i); + // 结束循环的条件 + if (opNode.precedence == 0 && stackTopPrecedence == 0)return mid; + // 栈顶操作符赢得mid:弹出栈顶,填充right,并作为新的mid; NULL是EOF是最低优先级 + if (opNode.precedence <= stackTopPrecedence) { + var top = stack.pop(); + top.right = mid; + mid = top; + } + // 当前操作符赢得mid:塞入栈中,继续向后走 + else { + opNode.left = mid; + stack.push(opNode); + i++; + mid = null; // 往后走取新的mid + } + } +} +function getEofOrInfixNode(tokens, index) { + var eof = new InfixOperatorAstNode('EOF'); + eof.precedence = 0; + if (index >= tokens.length) return eof + var token = tokens[index]; + if (precedenceMap[token.value] == null) { + return eof; + } + return new InfixOperatorAstNode(tokens[index]); +} +``` +这样的单层循环,理解起来是不是比递归要简单多了,其实本质上是一样的,只是用栈模拟了递归的过程,这样对于`pratt`的解析,我们用了多种代码的形式,有了非常深刻的理解。 +## 2.2 其他语法 +上面花了大量篇幅来讲四则运算的语法分析,而实际上表达式除了四则运算符,还有其他的很多形式,正如上面提到的,我们还需要整理代码来适配: +- 数字、字符串、布尔值、null、变量,这些单个值就是表达式,比如`1`、`"hello"`、`true`、`null`、`a`等。 +- 前缀操作符 + 另一个表达式形成新的表达式,例如`-1`、`!true`等。 +- 另一个表达式 + 后缀操作符,例如`a++`、`a--`等。 +- 二元操作符 + 左右两个表达式,例如`1 + 2`、`a > b`、`x = 1`等。 +- 函数调用,例如`add(a, b)`,注意函数的每个参数也需要是表达式。 +- 括号(组group)包住子表达式,例如`(1 + 2)`。 +- 函数声明,在很多解释型语言中,函数声明也是表达式该表达式返回值就是函数本身,例如`var add = function(a, b) { return a + b; }`。 + +### 2.2.1 字面量类型 
+对于第一种字面量类型的,我们判断`token`的类型即可,四则运算代码中都是数字的`Node`,只需要判断`token`类型转换成对应的`NumberNode`,`StringNode`,`BooleanNode`,`NullNode`,`IdentifierNode`即可;以下为各种`AstNode`代码。 +```js +// 基础类型 +class AstNode { +} +// 数字字面量 +class NumberAstNode extends AstNode { + constructor(token) { + super(); + this.token = token; + } + + toString() { + return this.token.value; + } +} +// 变量名/函数名字面量 +class IdentifierAstNode extends AstNode { + constructor(token) { + super(); + this.token = token; + } + + toString() { + return this.token.value; + } +} +// null字面量 +class NullAstNode extends AstNode { + toString() { + return "null"; + } +} + +// 字符串字面量 +class StringAstNode extends AstNode { + constructor(token) { + super(); + this.token = token; + } + toString() { + return this.token.value; + } +} +// boolean字面量 +class BooleanAstNode extends AstNode { + constructor(token) { + super(); + this.token = token; + } + toString() { + return this.token.value; + } +} +// 中缀操作符节点 +class InfixOperatorAstNode extends AstNode { + constructor(token) { + super(); + this.op = token; + this.left = null; + this.right = null; + this.precedence = precedenceMap[token.value]; + } + toString() { + return `(${this.left.toString()} ${this.op.value} ${this.right.toString()})`; + } +} +// 前缀操作符 +class PrefixOperatorAstNode extends AstNode { + constructor(token, right) { + super(false); + this.op = token; + this.right = right; + } + toString() { + return `(${this.op.value} ${this.right.toString()})`; + } +} +// 后缀操作符 +class PostfixOperatorAstNode extends AstNode { + constructor(token, left) { + super(false); + this.op = token; + this.left = left; + } + toString() { + return `(${this.left.toString()} ${this.op.value})`; + } +} +// 函数声明 +class FunctionDeclarationAstNode extends AstNode { + constructor(nameToken, params, body) { + super(); + this.name = name == null ? null :new IdentifierAstNode(nameToken); + this.params = params; + this.body = body; + } + toString() { + return `function${this.name ? 
' ' + this.name.toString() : ''}(${this.params.join(',')})${this.body.map(it=>it.toString()).join('\n')}`; + } +} +// 函数调用 +class FunctionCallAstNode extends AstNode { + constructor(nameToken, args) { + super(); + this.name = new IdentifierAstNode(nameToken); + this.args = args; // args是ast数组 + } + toString() { + return `${this.name.toString()}(${this.args.map(it=>it.toString()).join(',')})` + } +} +// 分组节点 +class GroupAstNode extends AstNode { + constructor(exp) { + super(); + this.exp = exp; + } + toString() { + // 因为小括号已经在运算符的toString中使用了,这里为了更好的凸显使用中文中括号 + return `【${this.exp.toString()}】` + } +} +``` +在`parseExpression`函数中的修改,主要是`new NumberAstNode(xxx)`这一行,需要判断当前`token`的类型,然后创建对应的`AstNode`。这里暂不展示代码。可以直接查看`parser_test6.mjs`。 +### 2.2.2 单元操作符 +前缀运算符有`++`、`--`、`+`、`-`、`!`、`~`。同一个符号出现时可能有多重身份,例如`-`可以是前缀,也可能是中缀。这就需要一些额外的判断,如果前一个`token`是括号三兄弟`([{`或者位于开始位置或者是其他中缀运算符,那么这个位置`-`就是前缀运算符,否则就是中缀运算符。而对于`++`这种可能是前缀,也可能是后缀的运算符,也可按照上述判断是否是前缀,如果不是前缀,就是后缀运算符。以上面基于栈的`pratt`算法为基础进行改造: +```diff ++ // 后缀>前缀>中缀 ++ const prefixPrecedenceMap = { ++ '-': 100, ++ '!': 100, ++ '~': 100, ++ '+': 100, ++ '++': 100, ++ '--': 100 ++ } +... +- mid = mid == null ? new NumberAstNode(tokens[i++].value) : mid; ++ mid = mid == null ? nextAstNode() : mid; +...
++ // 这个函数的定义放到parseExpression函数里面 ++ function nextAstNode() { ++ var token = tokens[i]; ++ var node = null; ++ switch (token.type) { ++ case LEX.NUMBER: ++ node = new NumberAstNode(tokens[i++]); ++ break; ++ case LEX.STRING: ++ node = new StringAstNode(tokens[i++]); ++ break; ++ case LEX.BOOLEAN: ++ node = new BooleanAstNode(tokens[i++]); ++ break; ++ case LEX.NULL: ++ node = new NullAstNode(tokens[i++]); ++ break; ++ // 遇到前缀运算符 ++ case LEX.PLUS: ++ case LEX.MINUS: ++ case LEX.INCREMENT: ++ case LEX.DECREMENT: ++ case LEX.NOT: ++ case LEX.BIT_NOT: ++ // 前缀后面递归解析一元节点(前缀后面一定是个一元节点) ++ // 并且前缀操作符都是右结合的,所以递归结合(递归=栈=后来的先结合) ++ node = new PrefixOperatorAstNode(tokens[i++], nextUnaryNode()); ++ break; ++ default: ++ throw new Error('unexpected token in getNode: ' + token.type); ++ } ++ // 后缀,都是左结合的,所以遇到就合并 ++ while (tokens[i].type == LEX.INCREMENT || tokens[i].type == LEX.DECREMENT) { ++ node = new PostfixOperatorAstNode(tokens[i++], node); ++ } ++ return node; ++ } +``` +到此对应的全量代码在`24.11/parser_test6.mjs` + +### 2.2.3 分组类型 +`(1 + 2) * 3`这里的小括号就是分组,需要在`nextUnaryNode`函数中添加`LPAREN`左括号的识别,识别到之后就则需要递归进行表达式的解析,代码类似这样: +```js +function nextAstNode() { + //... + case LEX.LPAREN: + node = new GroupAstNode(parseExpression(tokens, i + 1)); + break; + //... 
+} +``` +但是这里有个问题,在调用`parseExpression`函数结束的时候,`i`实际上没有任何变化`(1 + 2)*3`的例子中`i`位于左括号,而递归`parseExpression`运行结束后,`i`还是0,这样就死循环了,问题其实出在`parseExpression`函数的`start`是值传递的,导致运行结束后,上层的`i`值是没有变的。有四种方式可以让上层感知变量的变化: +- 1 上面用过的`start = {index: i}`,用对象,即引用传递 +- 2 用返回值,在返回值中带一个属性`endPos`,递归调用结束后,令`i = res.endPos` +- 3 用更大范围的全局变量,比如把`parseExpression` `parseXXX`....所有的方法都放到一个`class`中,`i`作为`class`一个属性,这样就成为一个所有函数看来的全局变量了,这也是一种面向对象的思想,状态`i`作为对象的属性,是非常推荐的一种代码写法,其实我们应该在一开始就用对象来封装的,但是上来用对象封装很多人会觉得不适,好像被强加了一些代码风格,会质疑为什么要用对象,不用可不可以,所以我们从文章开始到现在,都没有使用对象的方式,在这个过程中大家能深刻的感受到了全局变量`i`传递的时候的不方便,甚至我们被逼的在函数中定义函数,来利用上一级函数的全局变量`i`,但是到这里我们终于发现,原来用`class`封装的方式可以更好的维护代码了,我们就可以带着之前的理解来迎接`class`形式的代码了。 +- 4 就`js`可以传入闭包,例如`parseExpression(tokens, i + 1, (endPos)=> i = endPos)`,然后`parseExpression(tokens, start, endCallBack=()=>{})`的`return`的这一行改成`endCallBack(i); return mid;`,闭包本质上是传入了一个含有外部`i`引用类型。所以其实和方法1是一个道理,没什么特殊的 + +我们使用方法3,然后将之前的所有的方法,都平级放到`class Parser`中,全量代码参考`24.11/parser_test7.mjs`,这里我们把`Parser`类列出: +```js +class Parser { + constructor(tokens) { + this.tokens = tokens; + this.cursor = 0; + } + // 语法解析,把tokens转换为sentences + parse() { + var tokens = this.tokens; + var sentences = []; + for (;;) { + var token = tokens[this.cursor]; + var sentence = null; + if (token.type === LEX.SEMICOLON) { + continue; + } else if (token.type === LEX.EOF) { + break; + } if (token.type === LEX.VAR) { + sentence = this.parseVarSentence(); + } else if (token.type === LEX.RETURN) { + sentence = this.parseReturnSentence(); + } else if (token.type === LEX.LBRACE) { + sentence = this.parseBlockSentence(); + } else { + sentence = this.parseExpressionStatement(); + } + sentences.push(sentence); + } + return sentences; + } + + // 从i开始转换成var语句,校验是不是var xx = xxx;格式,然后需要解析表达式parseExpression函数。 + parseVarSentence() { + var tokens = this.tokens; + assert (tokens[this.cursor].type === LEX.VAR); + assert (tokens[this.cursor + 1].type === LEX.IDENTIFIER); + assert (tokens[this.cursor + 2].type === LEX.ASSIGN); + var name = new
IdentifierAstNode(tokens[this.cursor + 1]); + for (var j = this.cursor + 3; j < tokens.length; j++) { + if (tokens[j].type === LEX.SEMICOLON || tokens[j].type === LEX.EOF) { + var value = this.parseExpression(this.cursor = this.cursor + 3); + return new VarSentence(name, value); + } + } + } + // 从i开始转换成var语句,校验是不是var xx = xxx;格式,然后需要解析表达式parseExpression函数。 + parseVarSentence() { + var tokens = this.tokens; + assert (tokens[this.cursor].type === LEX.VAR); + assert (tokens[this.cursor + 1].type === LEX.IDENTIFIER); + assert (tokens[this.cursor + 2].type === LEX.ASSIGN); + var name = new IdentifierAstNode(tokens[this.cursor + 1]); + for (var j = this.cursor + 3; j < tokens.length; j++) { + if (tokens[j].type === LEX.SEMICOLON || tokens[j].type === LEX.EOF) { + this.cursor = this.cursor + 3 + var value = this.parseExpression(); + assert(tokens[this.cursor].type === LEX.SEMICOLON || tokens[this.cursor] == LEX>EOF); + this.cursor ++; + return new VarSentence(name, value); + } + } + } + // 与var语句类似 + parseReturnSentence() { + var tokens = this.tokens; + assert (tokens[this.cursor].type === LEX.RETURN); + for (var j = this.cursor + 1; j < tokens.length; j++) { + if (tokens[j].type === LEX.SEMICOLON || tokens[j].type === LEX.EOF) { + this.cursor += 1; + var value = this.parseExpression(); + assert(tokens[this.cursor].type === LEX.SEMICOLON || tokens[this.cursor] == LEX>EOF); + this.cursor ++; + return new ReturnSentence(value); + } + } + } + // 转换为表达式语句 + parseExpressionStatement() { + var tokens = this.tokens; + for (var j = this.cursor; j < tokens.length; j++) { + if (tokens[j].type === LEX.SEMICOLON || tokens[j].type === LEX.EOF) { + var expression = this.parseExpression(); + assert(tokens[this.cursor].type === LEX.SEMICOLON || tokens[this.cursor] == LEX>EOF); + this.cursor ++; + return new ExpressionStatement(expression); + } + } + } + // 转换为块语句,块语句中包含一个语句数组 + parseBlockSentence() { + var tokens = this.tokens; + var braceCount = 0; + for (var j = this.cursor; j < 
tokens.length; j++) { + if (tokens[j].type == LEX.LBRACE) braceCount++; + if (tokens[j].type == LEX.RBRACE) braceCount--; + if (braceCount == 0) { + return new BlockSentence(parse(tokens.slice(this.cursor + 1, this.cursor = j))); + } + } + throw new Error("brace not close for block sentence") + } + + // 表达式解析,解析下一个表达式,遇到无法识别的字符会结束 + parseExpression() { + var tokens = this.tokens; + var stack = []; + var mid = null; + while (true) { + // 每个循环,准备好栈顶优先级、中间元素、当前操作符 + var stackTopPrecedence = stack.length == 0? 0: stack[stack.length - 1].precedence; + mid = mid == null ? this.nextUnaryNode() : mid; + var opNode = this.getEofOrInfixNode(tokens, this.cursor); + // 结束循环的条件 + if (opNode.precedence == 0 && stackTopPrecedence == 0)return mid; + // 栈顶操作符赢得mid:弹出栈顶,填充right,并作为新的mid; NULL是EOF是最低优先级 + if (opNode.precedence <= stackTopPrecedence) { + var top = stack.pop(); + top.right = mid; + mid = top; + } + // 当前操作符赢得mid:塞入栈中,继续向后走 + else { + opNode.left = mid; + stack.push(opNode); + this.cursor++; + mid = null; // 往后走取新的mid + } + } + } + nextUnaryNode() { + var tokens = this.tokens; + var node = null; + switch (tokens[this.cursor].type) { + case LEX.NUMBER: + node = new NumberAstNode(tokens[this.cursor++]); + break; + case LEX.STRING: + node = new StringAstNode(tokens[this.cursor++]); + break; + case LEX.BOOLEAN: + node = new BooleanAstNode(tokens[this.cursor++]); + break; + case LEX.NULL: + node = new NullAstNode(tokens[this.cursor++]); + break; + case LEX.IDENTIFIER: + node = new IdentifierAstNode(tokens[this.cursor++]); + break; + // 遇到前缀运算符 + case LEX.PLUS: + case LEX.MINUS: + case LEX.INCREMENT: + case LEX.DECREMENT: + case LEX.NOT: + case LEX.BIT_NOT: + // 前缀后面递归解析一元节点(前缀后面一定是个一元节点) + // 并且前缀操作符都是右结合的,所以可以直接递归。 + node = new PrefixOperatorAstNode(tokens[this.cursor++], this.nextUnaryNode()); + break; + // 分组 + case LEX.LPAREN: + // 递归解析(后面的即可,因为遇到)的时候,parseExpression无法识别,就会结束解析 + this.cursor++; + // GroupAstNode其实可有可无 + node = new GroupAstNode(this.parseExpression()); + 
assert(tokens[this.cursor++].type == LEX.RPAREN, "group not closed"); + break; + default: + throw new Error('unexpected token in nextUnary: ' + tokens[this.cursor].type); + } + // 后缀操作符,后缀操作符都是左结合的,并且后缀操作符的优先级比前缀都要高 + while (tokens[this.cursor].type == LEX.INCREMENT || tokens[this.cursor].type == LEX.DECREMENT) { + node = new PostfixOperatorAstNode(tokens[this.cursor++], node); + } + return node; + } + getEofOrInfixNode(tokens, index) { + var eof = new InfixOperatorAstNode('EOF'); + eof.precedence = 0; + if (index >= tokens.length) return eof + var token = tokens[index]; + if (precedenceMap[token.value] == null) { + return eof; + } + return new InfixOperatorAstNode(tokens[index]); + } + +} +``` +另外有了全局的地方记录`cursor`的值,我们也可以修改一元(前缀/后缀)操作符的代码了,前缀操作符目前是`node = new PrefixOperatorAstNode(tokens[this.cursor++], this.nextUnaryNode());`来实现的,其实递归的`this.nextUnaryNode()`也可以改成递归`parseExpression()`。即我们对一元和二元操作符一视同仁,只不过前缀操作符`left`节点是已经填充好的,后缀则是`right`是填充好的。如下也是一种思路,全量代码在`24.11/parser_test8.mjs` +```js +const prefixPrecedenceMap = { + '-': 100, + '!': 100, + '~': 100, + '+': 100, + '++': 100, + '--': 100 +} +const postfixPrecedenceMap = { + '++': 200, + '--': 200 +} +... + // 遇到前缀运算符 + case LEX.PLUS: + case LEX.MINUS: + case LEX.INCREMENT: + case LEX.DECREMENT: + case LEX.NOT: + case LEX.BIT_NOT: + // 使用parseExpression函数递归,但是要传递当前符号的优先级 + node = new PrefixOperatorAstNode(tokens[this.cursor], this.parseExpression(prefixPrecedenceMap[tokens[this.cursor++].value])); + break; + // 分组 + case LEX.LPAREN: + // 递归解析(后面的即可,因为遇到)的时候,parseExpression无法识别,就会结束解析 + this.cursor++; + node = new GroupAstNode(this.parseExpression()); + assert(tokens[this.cursor++].type == LEX.RPAREN, "group not closed"); + break; +.... + + + // 然后修改parseExpression函数,使其接受一个参数,代表前置符号的优先级 + parseExpression(precedence = 0) { + var tokens = this.tokens; + var stack = []; + var mid = null; + while (true) { + // 此时栈为空的时候默认看到的就是上下文传进来的优先级 + var stackTopPrecedence = stack.length == 0 ?
precedence: stack[stack.length - 1].precedence; + mid = mid == null ? this.nextUnaryNode() : mid; + var opNode = this.getEofOrInfixNode(tokens, this.cursor); + // 结束循环的条件改为,当前操作符优先级<=上下文优先级 并且 栈为空 + // 这样首先是能兼容为0的情况,其次前缀操作符优先级是比中缀高的,所以前缀操作符传进来的时候一定是遇到中缀就结束 + if (opNode.precedence <= precedence && stackTopPrecedence == precedence) return mid; + if (opNode.precedence <= stackTopPrecedence) { + var top = stack.pop(); + top.right = mid; + mid = top; + } + else { + opNode.left = mid; + stack.push(opNode); + this.cursor++; + mid = null; + } + } + } +``` +测试没有问题: + +![img](https://i.imgur.com/qW3EHbB.png) + +但是我还想再改一下代码,因为在分组和前缀表达式中,都使用了递归调用`parseExpression`,分组的比较容易接受,因为分组内是一个全新的表达式解析的过程,但是前缀和后缀表达式其实是一种特殊的二元表达式,我们其实可以把前缀和后缀表达式的处理合并到`while`循环中。 + +首先把这俩递归都删掉,改成直接返回一个`PrefixOperatorAstNode`,这里面的表达式字段是`null`,就像遇到二元操作符的时候`right`也是null是一样的。 +```js + // 遇到前缀运算符 + case LEX.PLUS: + case LEX.MINUS: + case LEX.INCREMENT: + case LEX.DECREMENT: + case LEX.NOT: + case LEX.BIT_NOT: + node = new PrefixOperatorAstNode(tokens[this.cursor++], null); + break; + //... + // 删掉后缀操作符处理的部分 +``` +然后在`while`循环中,判断出遇到了前缀操作符,直接当做set过left的二元操作符一样的处理,push到stack中即可。 +```js + parseExpression() { + var tokens = this.tokens; + var stack = []; + var mid = null; + while (true) { + var stackTopPrecedence = stack.length == 0 ? 0: stack[stack.length - 1].precedence; + mid = mid == null ? 
this.nextUnaryNode() : mid; + // 如果是next返回的不完整前缀表达式,相当于left填充过的二元操作符,直接塞到stack + if (mid instanceof PrefixOperatorAstNode && mid.right == null) { + stack.push(mid); + mid = null; + continue; + } + // 这里get到的除了中缀还有可能是后缀运算符,修改下方法名 + var opNode = this.getEofOrInfixNodeOrPostNode(tokens, this.cursor); + if (opNode.precedence == 0 && stackTopPrecedence == 0) + return mid; + // 如果是后缀运算符,直接填充left,然后继续,因为后缀表达式一定跟在IDENTIFIER节点之后,所以mid一定是ident,直接填充left即可 + if (opNode instanceof PostfixOperatorAstNode) { + opNode.left = mid; + mid = opNode; + this.cursor++; + } else if (opNode.precedence <= stackTopPrecedence) { + var top = stack.pop(); + top.right = mid; + mid = top; + } else { + opNode.left = mid; + stack.push(opNode); + this.cursor++; + mid = null; + } + } + } + getEofOrInfixNodeOrPostNode(tokens, index) { + var eof = new InfixOperatorAstNode('EOF'); + eof.precedence = 0; + if (index >= tokens.length) return eof + var token = tokens[index]; + if (precedenceMap[token.value] == null && postfixPrecedenceMap[token.value] == null) { + return eof; + } + if (token.type == LEX.INCREMENT || token.type == LEX.DECREMENT) { + return new PostfixOperatorAstNode(tokens[index], null); + } + return new InfixOperatorAstNode(tokens[index]); + } +``` + +结果也是符合预期的: + +![img](https://i.imgur.com/mjNZG7s.png) + +### 2.2.4 中缀运算符 +我们上面代码其实都是从中缀运算符为主心骨展开的代码,所以默认就是支持中缀运算符的,不过前面的中缀取值只有四则运算,这里可以丰富下: +```js +const precedenceMap = { + '=': 10, + '||': 11, '&&': 12, '^': 13, + '==': 14, '!=': 14, + '<': 15, '<=': 15, '>': 15, '>=': 15, + '<<': 16, '>>': 16, '>>>': 16, + '+': 17, '-': 17, + '*': 18, '/': 18, '%': 18, +} +``` +### 2.2.5 函数调用 + + +### 2.2.6 函数声明 + +- todo if语句 for语句 while语句的支持。 + + + +下面是没整理好的代码与思路。 + +先来看简单的函数声明,函数声明的格式有两种,一种是`function name(a,b,c){}`,一种是`function(a,b,c){}`,后者是匿名函数。 +```js +class FunctionDeclarationAstNode extends AstNode { + constructor(name, params, body) { + super(); + this.name = name == null ? 
null :new IdentifierAstNode(name); + this.params = params; + this.body = body; + } + toString() { + return `function${this.name ? ' ' + this.name.toString() : ''}(${this.params.join(',')})${this.body.map(it=>it.toString()).join('\n')}`; + } +} + +function parseExpression(tokens, start, end) { + //.... + function parseFunctionDeclaration() { + assert(tokens[i].type == LEX.FUNCTION); + // 1 函数名识别,null为匿名函数 + var name = tokens[i + 1].type == LEX.IDENTIFIER ? tokens[i++] : null; + assert(tokens[i + 1].type == LEX.LPATTEN); + // 2 参数识别,格式就是括号内,identifier,逗号,..循环..右括号结束 + var params = []; + for (var j = i + 2; j < end; j+=2) { + assert(tokens[j].type == LEX.IDENTIFIER); + assert(tokens[j+1].type == LEX.COMMA); + params.push(tokens[j]); + // 右括号结束参数部分 + if (tokens[j].type == LEX.RPATTEN) { + i = j + 1; + break; + } + } + // 3 body识别,按照大括号识别即可,注意有可能有大括号嵌套,所以要记录左大括号出现的数量,当右大括号出现,数量减一。数量为0,就是函数body结束 + assert(tokens[i].type == LEX.LBRACE); + var braceCount = 1; + for (var j = i + 1; j < end; j++) { + if (tokens[j].type == LEX.LBRACE) braceCount++; + if (tokens[j].type == LEX.RBRACE) braceCount--; + // 函数结束 + if (braceCount == 0) { + var body = parseSentences(tokens, i, j + 1); + i = j + 1; + return new FunctionDeclarationAstNode(name, params, body); + } } } - return result; } + ``` -这个代码行数不多,但是因为有循环和递归,所以比较难理解。 \ No newline at end of file +函数调用,函数调用主要就是将每个参数进行表达式的解析,参数是用逗号分开的`func1(a,b,c)`一般找到一个逗号或者`)`就可以切成一个参数了,但是考虑到有可能参数中嵌套函数,`func1(func2(a,b), c))`,此时用逗号直接切分是不行的,简单的方法考虑到逗号只出现在函数的参数和数组中,而这两种情况都是有`()`和`[]`包裹的,所以可以在没有括号包裹的情况下,出现逗号或者出现`)`,认为是参数的切分标志。代码如下: +```js +class FunctionCallAstNode extends AstNode { + constructor(name, args) { + super(); + this.name = new IdentifierAstNode(name); + this.args = args; // args是ast数组 + } + toString() { + return `${this.name.toString()}(${this.args.map(it=>it.toString()).join(',')})` + } +} + +function parseExpresstion(tokens, start, end, precedence=0) { + // ...... 
+ function parseFunctionCall() { + assert(tokens[i].type == LEX.IDENTIFIER); // 函数名 + assert(tokens[i + 1].type == LEX.LPAREN); // 左括号 + var nameTk = tokens[i]; + i = i + 2; // 此时i位于第一个参数的start位置 + + // 识别参数要找逗号来隔开每个参数的表达式,分别去递归解析 + var args = []; + var innerPattern = 0, innerBracket = 0; + for (var j = i; j < end; j++) { + if (tokens[j].type == LEX.LPAREN) innerPattern++; + if (tokens[j].type == LEX.RPAREN) innerPattern--; + if (tokens[j].type == LEX.LBRACE) innerBracket++; + if (tokens[j].type == LEX.RBRACE) innerBracket--; + // 最后一个参数 + if (innerPattern == -1) { + args.push(parseExpression(tokens, i, j)); + i = j + 1; + return new FunctionCallAstNode(nameTk, args); + } + // 出现逗号,并且不在内部的()或者[]中,说明是参数的结束 + if (tokens[j].type == LEX.COMMA && innerPattern == 0) { + args.push(parseExpression(tokens, i, j)); + i = j + 1; + } + } + throw new Error("unexpected end of expression"); + } +} +``` \ No newline at end of file