-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathparser.mly
180 lines (151 loc) · 6.69 KB
/
parser.mly
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
(* The first section of the grammar definition, called the *header*,
is the part that appears below between %{ and %}. It is code
that will simply be copied literally into the generated parser.ml.
Here we use it just to open the Ast module so that, later on
in the grammar definition, we can write expressions like
[Int i] instead of [Ast.Int i]. *)
%{
open Ast
%}
(* The next section of the grammar definition, called the *declarations*,
first declares all the lexical *tokens* of the language. These are
all the kinds of tokens we can expect to read from the token stream.
Note that each of these is just a descriptive name---nothing so far
says that LPAREN really corresponds to '(', for example. The tokens
that have a <type> annotation appearing in them are declaring that
they will carry some additional data along with them. In the
case of INT, that's an OCaml int. In the case of ID, that's
an OCaml string. *)
%token <int> INT
%token <string> ID
%token PLUS
%token MINUS
%token TIMES
%token DIVIDED
%token LPAREN
%token RPAREN
%token LBRACE
%token RBRACE
%token LET
%token EQUALS
%token IN
%token PROC
%token ISZERO
%token IF
%token THEN
%token ELSE
%token LETREC
%token SET
%token BEGIN
%token END
%token NEWREF
%token DEREF
%token SETREF
%token SEMICOLON
%token INTTYPE
%token BOOLTYPE
%token UNITTYPE
%token ARROW
%token REFTYPE
%token COMMA
%token COLON
%token EOF
(* After declaring the tokens, we have to provide some additional information
about precedence and associativity. The following declarations say that
PLUS is left associative, that IN is not associative, and that PLUS
has higher precedence than IN (because PLUS appears on a line after IN).
Because PLUS is left associative, "1+2+3" will parse as "(1+2)+3"
and not as "1+(2+3)".
Because PLUS has higher precedence than IN, "let x=1 in x+2" will
parse as "let x=1 in (x+2)" and not as "(let x=1 in x)+2". *)
%nonassoc IN ELSE EQUALS /* lowest precedence */
%right ARROW
%left PLUS MINUS
%left TIMES DIVIDED /* highest precedence */
(*%nonassoc UMINUS /* highest precedence */*)
(* After declaring associativity and precedence, we need to declare what
the starting point is for parsing the language. The following
declaration says to start with a rule (defined below) named [prog].
The declaration also says that parsing a [prog] will return an OCaml
value of type [Ast.expr]. *)
%start <Ast.prog> prog
(* The following %% ends the declarations section of the grammar definition. *)
%%
(* Now begins the *rules* section of the grammar definition. This is a list
of rules that are essentially in Backus-Naur Form (BNF), although where in
BNF we would write "::=" these rules simply write ":".
The format of a rule is
name:
| production { action }
| production { action }
| etc.
;
The *production* is the sequence of *symbols* that the rule matches.
A symbol is either a token or the name of another rule.
The *action* is the OCaml value to return if a match occurs.
Each production can *bind* the value carried by a symbol and use
that value in its action. This is perhaps best understood by example... *)
(* The first rule, named [prog], has just a single production. It says
that a [prog] is an [expr] followed by [EOF] (which stands for end-of-file).
EOF is a token returned by the lexer when it reaches the end of the token stream.
The first part of the production, [e=expr], says to match an [expr] and bind
the resulting value to [e]. The action simply says to return that value [e]. *)
prog:
| e = expr; EOF { AProg e }
;
(* The second rule, named [expr], has productions for integers, variables,
addition expressions, let expressions, and parenthesized expressions.
- The first production, [i = INT], says to match an [INT] token, bind the
resulting OCaml [int] value to [i], and return AST node [Int i].
- The second production, [x = ID], says to match an [ID] token, bind the
resulting OCaml [string] value to [x], and return AST node [Var x].
- The third production, [e1 = expr; PLUS; e2 = expr], says to match
an [expr] followed by a [PLUS] token followed by another [expr].
The first [expr] is bound to [e1] and the second to [e2]. The AST
node returned is [Add(e1,e2)].
- The fourth production, [LET; x = ID; EQUALS; e1 = expr; IN; e2 = expr],
says to match a [LET] token followed by an [ID] token followed by
an [EQUALS] token followed by an [expr] followed by an [IN] token
followed by another [expr]. The string carried by the [ID] is bound
to [x], and the two expressions are bound to [e1] and [e2]. The AST
node returned is [Let(x,e1,e2)].
- The fifth production, [LPAREN; e = expr; RPAREN] says to match an
[LPAREN] token followed by an [expr] followed by an [RPAREN]. The
expression is bound to [e] and returned. *)
expr:
| i = INT { Int i }
| x = ID { Var x }
| LPAREN; RPAREN { Unit }
| e1 = expr; PLUS; e2 = expr { Add(e1,e2) }
| e1 = expr; MINUS; e2 = expr { Sub(e1,e2) }
| e1 = expr; TIMES; e2 = expr { Mul(e1,e2) }
| e1 = expr; DIVIDED; e2 = expr { Div(e1,e2) }
| LET; x = ID; EQUALS; e1 = expr; IN; e2 = expr { Let(x,e1,e2) }
| LETREC; x = ID; LPAREN; y = ID; COLON; targ=texpr; RPAREN; COLON
; tr=texpr; EQUALS; e1 = expr; IN; e2 = expr { Letrec(tr,x,y,targ,e1,e2) }
| LETREC; x = ID; LPAREN; y = ID; RPAREN; EQUALS; e1 = expr; IN; e2 = expr { LetrecUntyped(x,y,e1,e2) }
| PROC; LPAREN; x = ID; COLON; t=texpr; RPAREN; LBRACE; e = expr; RBRACE { Proc(x,t,e) }
| PROC; LPAREN; x = ID; RPAREN; LBRACE; e = expr; RBRACE { ProcUntyped(x,e) }
| LPAREN; e1 = expr; e2 = expr; RPAREN { App(e1,e2) }
| ISZERO; LPAREN; e = expr; RPAREN { IsZero(e) }
| NEWREF; LPAREN; e = expr; RPAREN { NewRef(e) }
| DEREF; LPAREN; e = expr; RPAREN { DeRef(e) }
| SETREF; LPAREN; e1 = expr; COMMA; e2 = expr; RPAREN { SetRef(e1,e2) }
| IF; e1 = expr; THEN; e2 = expr; ELSE; e3 = expr { ITE(e1,e2,e3) }
| SET; x = ID; EQUALS; e = expr { Set(x,e) }
| BEGIN; es = exprs; END { BeginEnd(es) }
| LPAREN; e = expr; RPAREN {e}
(* | MINUS e = expr %prec UMINUS { SubExp(IntExp 0,e) }*)
| LPAREN; MINUS e = expr; RPAREN { Sub(Int 0, e) }
;
exprs:
es = separated_list(SEMICOLON, expr) { es } ;
texpr:
| x = ID { VarType x }
| INTTYPE { IntType }
| BOOLTYPE { BoolType }
| UNITTYPE { UnitType }
| t1 = texpr; ARROW; t2 = texpr { FuncType(t1,t2) }
| LPAREN; t1 = texpr; RPAREN { t1 }
| REFTYPE; LPAREN; t1 = texpr; RPAREN { RefType(t1) }
(* And that's the end of the grammar definition. *)