-
Notifications
You must be signed in to change notification settings - Fork 0
/
tokenise.ts
98 lines (84 loc) · 2.31 KB
/
tokenise.ts
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
export type {Token};
export {TokenType, TokenIterator, tokenise};
/**
 * A single lexical token: its classification plus the exact source text
 * that was matched for it.
 */
interface Token {
  /** Which kind of token this is. */
  type: TokenType;
  /** The matched source text, unmodified. */
  raw: string;
}
/**
 * Token classifications.
 *
 * The numeric values matter: TOKEN_TYPES is keyed by them, and TOKEN_REGEX
 * is built by enumerating those keys in ascending order, so a member with a
 * lower value takes precedence when several patterns could match at the
 * same position. The values below are the ones the members would receive
 * implicitly from their declaration order.
 */
enum TokenType {
  /** Run of whitespace; matched but never yielded by the tokeniser. */
  SPACE = 0,
  /** One of the reserved words listed in TOKEN_TYPES. */
  ID = 1,
  /** A letter followed by any number of word characters. */
  VAR = 2,
  /** Integer or decimal literal. */
  NUM = 3,
  /** Arithmetic or comparison operator. */
  OP = 4,
  /** Double-quoted string. */
  STR = 5,
  /** `(` or `[`. */
  OPEN_BRACKET = 6,
  /** `)` or `]`. */
  CLOSE_BRACKET = 7,
  /** `,` */
  COMMA = 8,
  /** `.` */
  PERIOD = 9,
  /** `:` */
  COLON = 10,
  /** `;` */
  SEMICOLON = 11,
  /** Any single character not claimed by an earlier type. */
  OTHER = 12,
  /** Sentinel used for the end-of-input token. */
  END = 13,
}
/**
 * The pattern recognised for each token type.
 *
 * Keys are the numeric enum values, and integer-like keys are enumerated in
 * ascending numeric order regardless of how they are listed here. Entries
 * are therefore written in TokenType order so the listing mirrors the order
 * in which the combined regex tries them.
 */
const TOKEN_TYPES: Record<TokenType, RegExp> = {
  // Whitespace is matched so the scanner can step over it.
  [TokenType.SPACE]: /\s+/,
  // Reserved words (case-sensitive).
  // NOTE(review): there is no word boundary, and ID is tried before VAR, so a
  // longer word that starts with a keyword (e.g. `format`) splits into
  // ID `for` + VAR `mat` — confirm this is intended.
  [TokenType.ID]: /(?:Type|Set|Let|Do|if|for|times|step|part)/,
  [TokenType.VAR]: /[A-Za-z]\w*/,
  // Decimal with optional integer part (`.5`, `1.5`) or a plain integer.
  [TokenType.NUM]: /(?:[0-9]*[.][0-9]+|[0-9]+)/,
  // Two-character comparisons are listed first so they win over `<`, `>`, `=`.
  [TokenType.OP]: /(?:!=|[><]=|[-+*/^=<>])/,
  // Oddly, this greedy behaviour for double quotes is correct: a string runs
  // from the first quote to the LAST quote remaining on the line.
  [TokenType.STR]: /".*"/,
  [TokenType.OPEN_BRACKET]: /[([]/,
  [TokenType.CLOSE_BRACKET]: /[\])]/,
  [TokenType.COMMA]: /,/,
  [TokenType.PERIOD]: /[.]/,
  [TokenType.COLON]: /:/,
  [TokenType.SEMICOLON]: /;/,
  // Catch-all for any single character nothing above claimed.
  [TokenType.OTHER]: /./,
  // Placeholder so the record is total. It is tried last, and any text it
  // could match is already claimed by VAR (or OTHER), so the scanner never
  // produces END from input.
  [TokenType.END]: /impossible/,
};
// Build TOKEN_TYPES into one alternation of named groups, one group per token
// type. Object.entries yields the numeric enum keys as strings in ascending
// order, so alternatives are tried in TokenType order; each key is mapped back
// to its member name (via the enum's reverse mapping) to label the group.
const TOKEN_REGEX = Object.entries(TOKEN_TYPES)
  .map(([key, pattern]) => {
    const groupName = TokenType[Number(key)];
    return `(?<${groupName}>${pattern.source})`;
  })
  .join('|');
/**
 * Wraps an iterator with one item of lookahead.
 *
 * The first item is pulled from the underlying iterator as soon as the
 * wrapper is constructed. Once the underlying iterator reports completion,
 * `terminal` is returned in place of an item, on that call and on every
 * later one.
 */
class TokenIterator<T> {
  /** The underlying source of items. */
  it: Iterator<T>;
  /** The buffered lookahead item: what `peek()` / the next `next()` returns. */
  state: T;
  /** Value handed out once the underlying iterator is exhausted. */
  terminal: T;

  /**
   * @param it - Iterator to read from; its first item is consumed immediately.
   * @param terminal - Value to return after `it` is exhausted.
   */
  constructor(it: Iterator<T>, terminal: T) {
    this.it = it;
    this.terminal = terminal;
    // Prime the lookahead slot (needs `it` and `terminal` to be set first).
    this.state = this._next();
  }

  /**
   * Pulls one item from the underlying iterator.
   *
   * @returns The item, or `terminal` if the iterator is done.
   */
  _next(): T {
    const step = this.it.next();
    if (step.done) {
      return this.terminal;
    }
    return step.value;
  }

  /**
   * Consumes the buffered item and refills the lookahead slot.
   *
   * @returns The item that was buffered before this call.
   */
  next(): T {
    const current = this.state;
    this.state = this._next();
    return current;
  }

  /**
   * @returns The buffered item, without consuming it.
   */
  peek(): T {
    return this.state;
  }
}
/**
 * Tokenises one line of source text.
 *
 * @param s - The line to tokenise.
 * @returns A lookahead iterator over the line's tokens. After the last token
 *   it yields an END token with an empty `raw`, indefinitely.
 */
function tokenise(s: string): TokenIterator<Token> {
  const tokens = _tokenise(s);
  const endOfInput: Token = {type: TokenType.END, raw: ''};
  return new TokenIterator(tokens, endOfInput);
}
/**
 * Lazily scans a line into tokens.
 *
 * The line is trimmed first. An empty line, or one that starts or ends with
 * `*` (comment form), produces no tokens. Whitespace between tokens is
 * matched but not yielded.
 *
 * @param s - The line to scan.
 * @returns An iterator over the non-whitespace tokens, in source order.
 */
function* _tokenise(s: string): Iterator<Token> {
  const line = s.trim();
  const isComment = line.startsWith('*') || line.endsWith('*');
  if (line === '' || isComment) {
    // comment form (or nothing to scan)
    return;
  }

  // Sticky: each exec must match exactly at lastIndex, so the scan advances
  // token by token with no gaps and stops when exec returns null at the end.
  const scanner = new RegExp(TOKEN_REGEX, 'y');
  for (;;) {
    const match = scanner.exec(line);
    if (!match || !match.groups) {
      break;
    }
    // The named group that participated in the match identifies the token type.
    for (const [groupName, raw] of Object.entries(match.groups)) {
      if (raw === undefined) {
        continue;
      }
      const type = TokenType[groupName as keyof typeof TokenType];
      if (type === TokenType.SPACE) {
        continue;
      }
      yield {type, raw};
    }
  }
}