-
Notifications
You must be signed in to change notification settings - Fork 2
/
GrammarProcess.c
366 lines (349 loc) · 13.6 KB
/
GrammarProcess.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
#include <string.h>
#include "include.h"
Token terminal_table[MAX_TERMINAL_NUM + 1] = {0};
char non_terminal_table[MAX_NON_TERMINAL_NUM + 1] = {0};
int terminal_nums = 0;
int non_terminal_nums = 1; // 非终结符的第一个[0]是保留的起始符号位
// lex -> id
int get_id(const char *lex)
{
// 是终结符的情况
if (is_operator(lex) || is_keyword(lex) ||
strcmp(lex, "id") == 0 || strcmp(lex, "digits") == 0 || strcmp(lex, "$") == 0)
{
// 终结符占用<0的部分,没有0,所以从1开始
// 有意义的下标区间[1, terminal_nums],并且最后一个是$,占用[terminal_nums]位置
for (int i = 1; i <= terminal_nums; ++i)
{
if (strcmp(terminal_table[i].str, lex) == 0)
return -i;
}
strcpy(terminal_table[++terminal_nums].str, lex);
return -terminal_nums;
}
else // 是非终结符的情况
{ // 非终结符的第一个[0]是保留的起始符号位
for (int i = 1; i < non_terminal_nums; ++i)
{
if (non_terminal_table[i] == lex[0])
return i;
}
non_terminal_table[non_terminal_nums] = lex[0];
return non_terminal_nums++;
}
}
// id -> lex
void get_lex(const int id, char temp[MAX_TOKEN_LEN])
{
if (id > 0)
{
temp[0] = non_terminal_table[id];
temp[1] = '\0';
}
else if (id < 0)
strcpy(temp, terminal_table[-id].str);
else
strcpy(temp, "START");
}
void processGrammar(FILE *fp, pGrammar grammar)
{
Production start_production = {.id = 0, .production = 0, .gen_nums = 1, .generative = {1}};
Production production_buffer[1024];
production_buffer[0] = start_production;
int now_production_num = 1;
char token[100];
int state = 0; //表示识别状态,0未识别符号; >0正在识别符号
int prod = 1; //正在识别的是左侧还是右侧
int read_production = 0; //当前是否读入了产生式
while (1)
{
char ch = (char) fgetc(fp);
if (feof(fp))
break;
if (ch == '#')
{
while (ch != '\n' && !feof(fp))
ch = (char) fgetc(fp);
if (feof(fp))
break;
}
else if (ch == '@')
{
prod = 0;
}
else if (ch == '\n' || ch == ' ' || ch == '\r' || ch == '\t')
{
if (state != 0)
{
token[state] = '\0';
if (prod)
{
production_buffer[now_production_num].production = get_id(token);
read_production = 1;
}
else
{
production_buffer[now_production_num].generative[production_buffer[now_production_num].gen_nums++] = get_id(token);
}
state = 0;
}
if (ch == '\n')
{
if (prod == 1 && read_production == 1)
{
print_error("Not a Correct Grammar, please recheck your grammar file, exit");
exit(-1);
}
else if (prod == 0)
{
production_buffer[now_production_num].id = now_production_num;
++now_production_num;
if (now_production_num >= 1024)
{
print_error("Too many productions, exit.");
exit(-1);
}
prod = 1;
read_production = 0;
production_buffer[now_production_num].gen_nums = 0;
}
}
}
else
token[state++] = ch;
}
fclose(fp);
get_id("$"); // 将$加入终级符表,$位于最后一个终极符
grammar->production_nums = now_production_num;
grammar->productions = (pProduction) malloc(sizeof(Production) * now_production_num);
grammar->item_nums = terminal_nums + non_terminal_nums;
grammar->follow_nums = non_terminal_nums;
memcpy(grammar->productions, production_buffer, sizeof(Production) * now_production_num);
}
// 用于判断一个串是否可以推出空
int productEpsilon(const Grammar grammar, const int* string, int str_nums)
{
// 遍历串中的每一个元素
for (int i = 0; i < str_nums; ++i)
{
int j; // 遍历元素的first集,看是否有空串
for (j = 0; j < grammar.firsts[string[i] + terminal_nums].item_nums; ++j)
if (grammar.firsts[string[i] + terminal_nums].items[j] == 0)
break;
// 如果这个元素的first集没有空串,那么整个串就不能推出空
if (j == grammar.firsts[string[i] + terminal_nums].item_nums)
return 0;
}
return 1;
}
// 用于合并两个集合,结果放入a中。b中的epsilon不会加入a中。返回值是新增的元素个数
// epsilon_pos用于记录b中epsilon的位置。传入NULL表示不记录
// epsilon_val用于表示是否在first集中加入空串,生成follow集时用0
int mergeSet(int *a, int* a_size, const int *b, int b_size, int* epsilon_pos, int epsilon_val)
{
int pos = *a_size;
int epsilon_pos_ = -1;
for (int i = 0; i < b_size; ++i)
{
if (b[i] == 0) // b[i]是epsilon,不能加入first,follow集合。返回值为epsilon的位置
{
epsilon_pos_ = i;
if (epsilon_val==0) {continue;}
}
int j;
for (j = 0; j < *a_size; ++j)
if (a[j] == b[i])
break;
if (j == *a_size)
a[pos++] = b[i];
}
int ret = pos - *a_size;
*a_size = pos;
if (epsilon_pos != NULL)
*epsilon_pos = epsilon_pos_;
return ret;
}
void calcFIRSTSet(pGrammar grammar)
{
grammar->firsts = (pFIRST) malloc(sizeof(FIRST) * grammar->item_nums);
// 初始化非终极符的first集
for (int i = 0; i < non_terminal_nums; ++i)
{
grammar->firsts[i + terminal_nums].items = malloc(sizeof(int) * terminal_nums);
grammar->firsts[i + terminal_nums].item_nums = 0;
}
// 初始化终极符的first集(其本身)
for (int i = -terminal_nums; i < 0; ++i)
{
grammar->firsts[i + terminal_nums].items = (int*) malloc(sizeof(int) * 1);
grammar->firsts[i + terminal_nums].item_nums = 1;
grammar->firsts[i + terminal_nums].items[0] = i;
}
// 循环求first集直到不再变化
while (1)
{
int changed = 0;
for (int i = 0; i < grammar->production_nums; ++i)
{
// 产生式左部
int alpha = grammar->productions[i].production;
// 产生式为空处理
if(grammar->productions[i].gen_nums == 0)
{
int epsilon_set = 0;
changed += mergeSet(grammar->firsts[alpha + terminal_nums].items,
&grammar->firsts[alpha + terminal_nums].item_nums,
&epsilon_set, 1, NULL, 1);
continue;
}
// 产生式右部不为空
for (int k = 0; k < grammar->productions[i].gen_nums; ++k)
{
int xk = grammar->productions[i].generative[k];
int epsilon_pos;
changed += mergeSet(grammar->firsts[alpha + terminal_nums].items,
&grammar->firsts[alpha + terminal_nums].item_nums,
grammar->firsts[xk + terminal_nums].items,
grammar->firsts[xk + terminal_nums].item_nums, &epsilon_pos, 0);
// xk的first集含空串
if (epsilon_pos != -1)
{
// 判断是否是最后一个符号
if(k == grammar->productions[i].gen_nums - 1)
{
int epsilon_set = 0;
changed += mergeSet(grammar->firsts[alpha + terminal_nums].items,
&grammar->firsts[alpha + terminal_nums].item_nums,
&epsilon_set, 1, NULL, 1);
}
}
// xk的first集不含空串,则跳过该产生式
else
break;
}
}
if (changed == 0)
break;
}
// 重新为first集分配合适的空间大小
for (int i = 0; i < non_terminal_nums; ++i)
{
grammar->firsts[i + terminal_nums].items = (int*) realloc(grammar->firsts[i + terminal_nums].items,
sizeof(int) * grammar->firsts[i + terminal_nums].item_nums);
}
}
void calcFOLLOWSet(pGrammar grammar)
{
grammar->follows = (pFIRST) malloc(sizeof(FIRST) * grammar->follow_nums);
for (int i = 0; i < grammar->follow_nums; ++i)
{
grammar->follows[i].items = (int*) malloc(sizeof(int) * grammar->item_nums);
grammar->follows[i].item_nums = 0;
}
grammar->follows[0].items[0] = -terminal_nums; // 起始符号的follow集合中加入$
grammar->follows[0].item_nums = 1;
int changed = 1; // 记录每次循环follow集合中新增元素的个数
while (changed != 0) // 当follow集合不再变化时退出
{
changed = 0;
// 对于每一条产生式遍历
for (int i = 0; i < grammar->production_nums; ++i)
{
// 对于每一个产生式右侧的符号遍历
for (int j = 0; j < grammar->productions[i].gen_nums; ++j)
{
int ch = grammar->productions[i].generative[j];
if (ch <= 0) // 终极符号没有follow集合,开始符号follow集合固定为$
continue;
int k; // 对于每一个产生式右侧的符号的所有后继符号遍历
for (k = j + 1; k < grammar->productions[i].gen_nums; ++k)
{
int next = grammar->productions[i].generative[k];
if (next < 0) // 下一个符号是终极符号,停止遍历并将其加入到follow集合中
{
changed += mergeSet(grammar->follows[ch].items,
&grammar->follows[ch].item_nums,
&next, 1, NULL, 0);
break;
}
else
{
// 下一个符号是非终极符号,将其first集合加入到follow集合中
int epsilon_pos;
changed += mergeSet(grammar->follows[ch].items,
&grammar->follows[ch].item_nums,
grammar->firsts[next + terminal_nums].items,
grammar->firsts[next + terminal_nums].item_nums,
&epsilon_pos, 0);
// 后继符号的first集合中没有epsilon,停止遍历
if (-1 == epsilon_pos)
break;
}
}
// j已经是最后一个符号,或者j后面的符号都是非终极符号并且FIRST集合均包含epsilon
if (j + 1 == grammar->productions[i].gen_nums || k == grammar->productions[i].gen_nums)
{
// 把产生式的左侧符号的follow集合加入到j的follow集合中
// 如果产生式的左侧符号和右侧符号相同,不需要加入
if (grammar->productions[i].production != ch)
changed += mergeSet(grammar->follows[ch].items, &grammar->follows[ch].item_nums,
grammar->follows[grammar->productions[i].production].items,
grammar->follows[grammar->productions[i].production].item_nums,
NULL, 0);
}
}
}
}
for (int i = 0; i < grammar->follow_nums; ++i)
grammar->follows[i].items = (int*) realloc(grammar->follows[i].items,
sizeof(int) * grammar->follows[i].item_nums);
}
Grammar generateGrammar(FILE* fp)
{
Grammar grammar;
processGrammar(fp, &grammar);
calcFIRSTSet(&grammar);
calcFOLLOWSet(&grammar);
return grammar;
}
void printGrammar(const Grammar grammar)
{
char temp[MAX_TOKEN_LEN];
printf("=====[Grammar]=====\n");
printf("===Productions===\n");
for (int i = 0; i < grammar.production_nums; ++i)
{
get_lex(grammar.productions[i].production, temp);
printf("%s -> ", temp);
for (int j = 0; j < grammar.productions[i].gen_nums; ++j)
{
get_lex(grammar.productions[i].generative[j], temp);
printf("%s ", temp);
}
printf("\n");
}
printf("\n===FIRST Sets===\n");
for (int i = 0; i < grammar.item_nums; ++i)
{
get_lex(i - terminal_nums, temp);
printf("%7s :", temp);
for (int j = 0; j < grammar.firsts[i].item_nums; ++j)
{
get_lex(grammar.firsts[i].items[j], temp);
printf(" %s", temp);
}
printf("\n");
}
printf("\n===FOLLOW Sets===\n");
for (int i = 0; i < grammar.follow_nums; ++i)
{
get_lex(i, temp);
printf("%7s :", temp);
for (int j = 0; j < grammar.follows[i].item_nums; ++j)
{
get_lex(grammar.follows[i].items[j], temp);
printf(" %s", temp);
}
printf("\n");
}
}