forked from ivoa-std/ADQL
-
Notifications
You must be signed in to change notification settings - Fork 0
/
bnf2html.pl
471 lines (416 loc) · 13.3 KB
/
bnf2html.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
#!/usr/bin/env perl
#
# @(#)$Id: bnf2html.pl,v 3.16 2017/11/14 06:53:22 jleffler Exp $
#
# Convert SQL-92, SQL-99 BNF plain text file into hyperlinked HTML.
use strict;
use warnings;
use POSIX qw(strftime);
#use Data::Dumper;
use constant debug => 0;
my(%rules); # Indexed by rule names w/o angle-brackets; each entry is a ref to a hash.
my(%keywords); # Index by keywords; each entry is a ref to a hash.
my(%names); # Indexed by rule names w/o angle-brackets; each entry is a ref to an array of line numbers
sub top
{
print "<p><a href='#top'>Top</a></p>\n\n";
}
# Usage: add_rule_name(\%names, $rulename, $.);
sub add_rule_name
{
my($reflist, $lhs, $line) = @_;
#print "\nrulename = $lhs; line = $line\n";
if (defined ${$reflist}{$lhs})
{
#print Data::Dumper->Dump([ ${$reflist}{$lhs} ], qw[ ${$reflist}{$lhs} ]);
#print Data::Dumper->Dump([ \@{${$reflist}{$lhs}} ], qw[ \@{${$reflist}{$lhs}} ]);
my @lines = @{${$reflist}{$lhs}};
print STDERR "\n$0: Rule <$lhs> at line $line already seen at line(s) ", join(", ", @lines), "\n\n";
}
else
{
${$reflist}{$lhs} = [];
}
push @{${$reflist}{$lhs}}, $line;
}
# Usage: add_entry(\%keywords, $keyword, $rule);
# Usage: add_entry(\%rules, $rhs, $rule);
sub add_entry
{
my($reflist, $lhs, $rhs) = @_;
${$reflist}{$lhs} = {} unless defined ${$reflist}{$lhs};
${$reflist}{$lhs}{$rhs} = 1;
}
sub add_refs
{
my($def, $tail) = @_;
print "\n<!-- ADD REFS ($def) ($tail) -->\n" if debug;
return if $tail =~ m/^!!/;
return if $tail =~ m/^&(?:lt|gt|amp);$/;
while ($tail)
{
$tail =~ s/^\s*//;
if ($tail =~ m%^\<([-:/\w\s]+)\>%)
{
print "<!-- Rule - LHS: $def - RHS $1 -->\n" if debug;
add_entry(\%rules, $1, $def);
$tail =~ s%^\<([-:/\w\s]+)\>%%;
}
elsif ($tail =~ m%^([-:/\w]+)%)
{
my($token) = $1;
print "<!-- KyWd - LHS: $def - RHS $token -->\n" if debug;
add_entry(\%keywords, $token, $def) if $token =~ m%[[:alpha:]][[:alpha:]]% || $token eq 'C';
$tail =~ s%^[-:/\w]+%%;
}
else
{
# Otherwise, it is punctuation (such as the BNF metacharacters).
$tail =~ s%^[^-:/\w]%%;
}
}
}
# NB: webcode replaces tabs with blanks!
# open( my $WEBCODE, "-|", "webcode @ARGV") or die "$!";
# Replace the above lines to avoid dependency on webcode
# (which is a small C utility). Output is the same, at least for
# the current version of the ADQL BNF.
# Could also do this using regexs in perl.
my $escape_html = "sed -e's/&/\\&/g' -e's/</\\</g' -e's/>/\\>/g'";
open( my $WEBCODE, "-|", "$escape_html @ARGV") or die "$!";
# Read first line of file - use as title in head and in H1 heading in body
$_ = <$WEBCODE>;
exit 0 unless defined($_);
chomp;
# Is it wicked to use double quoting with single quotes, as in qq'text'?
# It is used quite extensively in this script - beware!
print qq'<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN">\n';
print "<!-- Generated HTML - Modify at your own peril! -->\n";
print "<html>\n<head>\n";
print "<title> $_ </title>\n</head>\n<body>\n\n";
print "<h1> $_ </h1>\n\n";
print qq'<a name="top"> </a>\n';
print "<br>\n";
print qq'<a href="#xref-rules"> Cross-Reference: rules </a>\n';
print "<br>\n";
print qq'<a href="#xref-keywords"> Cross-Reference: keywords </a>\n';
print "<br>\n";
sub rcs_id
{
my($id) = @_;
$id =~ s%^(@\(#\))?\$[I]d: %%o;
$id =~ s% \$$%%o;
$id =~ s%,v % %o;
$id =~ s%\w+ Exp( \w+)?$%%o;
my(@words) = split / /, $id;
my($version) = "file $words[0] version $words[1] dated $words[2] $words[3]";
return $version;
}
sub iso8601_format
{
my($tm) = @_;
my $today = strftime("%Y-%m-%d %H:%M:%S+00:00", gmtime($tm));
return($today);
}
# Print hrefs for non-terminals and keywords.
# Also substitute /* Nothing */ for an absence of productions between alternatives.
sub print_tail
{
my($tail, $tcount) = @_;
while ($tail)
{
my($newtail);
if ($tail =~ m%^\s+%)
{
my($spaces) = $&;
$newtail = $';
print "<!-- print_tail: SPACES = '$spaces', NEWTAIL = '$newtail' -->\n" if debug;
$spaces =~ s% {4,8}% %g;
print $spaces;
# Spaces are not a token - don't count them!
}
elsif ($tail =~ m%^'[^']*'% || $tail =~ m%^"[^"]*"% || $tail =~ m%^!!.*$%)
{
# Quoted literal - print and ignore.
# Or meta-expression...
my($quote) = $&;
$newtail = $';
print "<!-- print_tail: QUOTE = <$quote>, NEWTAIL = '$newtail' -->\n" if debug;
$quote =~ s%!!.*%<font color="red"> $quote </font>%;
print $quote;
$tcount++;
}
elsif ($tail =~ m%^\<([-:/\w\s]+)\>%)
{
my($nonterm) = $&;
$newtail = $';
print "<!-- print_tail: NONTERM = '$nonterm', NEWTAIL = '$newtail' -->\n" if debug;
$nonterm =~ s%\<([-:/\w\s]+)\>%<a href='#$1'>\<$1\></a>%;
print " $nonterm";
$tcount++;
}
elsif ($tail =~ m%^[\w_]([-._\w]*[\w_])?%)
{
# Keyword
my($keyword) = $&;
$newtail = $';
print "<!-- print_tail: KEYWORD = '$keyword', NEWTAIL = '$newtail' -->\n" if debug;
print(($keyword =~ m/^\d\d+$/) ? $keyword : qq' <a href="#xref-$keyword"> $keyword </a>');
$tcount++;
}
else
{
# Metacharacter, string literal, etc.
$tail =~ m%\S+%;
my($symbol) = $&;
$newtail = $';
print "<!-- print_tail: SYMBOL = '$symbol', NEWTAIL = '$newtail' -->\n" if debug;
if ($symbol eq '|')
{
print "<font color=red>/* Nothing */</font> " if $tcount == 0;
$tcount = 0;
}
else
{
$symbol =~ s%...omitted...%<font color=red>/* $& */</font>%i;
$tcount++;
}
print " $symbol";
}
$tail = $newtail;
}
return($tcount);
}
sub undo_web_coding
{
my($line) = @_;
$line =~ s%>%>%g;
$line =~ s%<%<%g;
$line =~ s%&%&%g;
return $line;
}
my $hr_count = 0;
my $tcount = 0; # Ick!
my $def; # Current rule
# Don't forget - the input has been web-encoded!
while (<$WEBCODE>)
{
chomp;
next if /^===*$/o;
s/\s+$//o; # Remove trailing white space
if (/^$/)
{
print "\n";
}
elsif (/^---*$/)
{
print "<hr>\n";
}
elsif (/^--@@\s*(.*)$/)
{
my $comment = undo_web_coding($1);
print "<!-- $comment -->\n";
}
elsif (/^@.#..Id:/)
{
# Convert what(1) string identifier into version information
my $id = '$Id: bnf2html.pl,v 3.16 2017/11/14 06:53:22 jleffler Exp $';
my($v1) = rcs_id($_);
my $v2 = rcs_id($id);
print "<p><font color=green><i><small>\n";
print "Derived from $v1\n";
my $today = iso8601_format(time);
print "<br>\n";
print "Generated on $today by $v2\n";
print "</small></i></font></p>\n";
}
elsif (/\s+::=/)
{
# Definition line
$def = $_;
$def =~ s%\<([-:/()\w\s]+)\>.*%$1%;
my($tail) = $_;
$tail =~ s%.*::=\s*%%;
print qq'<p><a href="#xref-$def" name="$def"> <$def> </a> ::=';
$tcount = 0;
add_rule_name(\%names, $def, $.);
if ($def eq "vertical bar")
{
# Needs special case attention to avoid a /* Nothing */ comment appearing.
# Problem pointed out by Jens Odborg ([email protected]) 2016-04-14.
# This builds knowledge of the SQL language definition into this script;
# ugly, but trying to fix it in the print_tail function is probably worse.
print " |";
}
elsif ($tail)
{
add_refs($def, $tail);
print " ";
$tcount = print_tail($tail, $tcount);
}
print "\n";
}
elsif (/^\s/)
{
# Expansion line
add_refs($def, $_);
print "<br>";
$tcount = print_tail($_, $tcount);
}
elsif (m/^--[\/]?(\w+)/)
{
# Pseudo-directive line in lower-case
# Print a 'Top' link before <hr> tags except first.
top if /--hr/ && $hr_count++ > 0;
s%--(/?[a-z][a-z\d]*)%<$1>%;
s%\<([-:/\w\s]+)\>%<a href='#$1'>\<$1\></a>%g;
print "$_\n";
}
elsif (m%^--##%)
{
$_ = undo_web_coding($_);
s%^--##\s*%%;
print "$_\n";
}
elsif (m/^--%start\s+(\w+)/)
{
# Designated start symbol
my $start = $1;
print qq'<p><b>Start symbol: </b> <a href="#$start"> $start </a></p>\n';
}
else
{
# Anything unrecognized passed through unchanged!
print "$_\n";
}
}
close $WEBCODE;
# Print index of initial letters for keywords.
sub print_index_key
{
my($prefix, @keys) = @_;
my %letters = ();
foreach my $keyword (@keys)
{
my $initial = uc substr $keyword, 0, 1;
$letters{$initial} = 1;
}
foreach my $letter ('A' .. 'Z')
{
if (defined($letters{$letter}))
{
print qq'<a href="#$prefix-$letter"> $letter </a>\n';
}
else
{
print qq'$letter\n';
}
}
print "\n";
}
### Generate cross-reference tables
{
print "<br>\n\n";
print "<hr>\n";
print qq'<a name="xref-rules"></a>\n';
print "<h2> Cross-Reference Table: Rules </h2>\n";
print_index_key("rules", keys %rules);
print "<table border=1>\n";
print "<tr> <th> Rule (non-terminal) </th> <th> Rules using it </th> </tr>\n";
my %letters = ();
foreach my $rule (sort { uc $a cmp uc $b } keys %rules)
{
my $initial = uc substr $rule, 0, 1;
my $label = "";
if (!defined($letters{$initial}))
{
$letters{$initial} = 1;
$label = qq'<a name="rules-$initial"> </a>';
}
print qq'<tr> <td> $label <a href="#$rule" name="xref-$rule"> $rule </a> </td>\n <td> ';
my $pad = "";
foreach my $ref (sort { uc $a cmp uc $b } keys %{$rules{$rule}})
{
print qq'$pad<a href="#$ref"> <$ref> </a>\n';
$pad = " ";
}
print " </td>\n</tr>\n";
}
print "</table>\n";
print "<br>\n";
top;
}
{
print "<hr>\n";
print qq'<a name="xref-keywords"></a>\n';
print "<h2> Cross-Reference Table: Keywords </h2>\n";
print_index_key("keywords", keys %keywords);
print "<table border=1>\n";
print "<tr> <th> Keyword </th> <th> Rules using it </th> </tr>\n";
my %letters = ();
foreach my $keyword (sort { uc $a cmp uc $b } keys %keywords)
{
my $initial = uc substr $keyword, 0, 1;
my $label = "";
if (!defined($letters{$initial}))
{
$letters{$initial} = 1;
$label = qq'<a name="keywords-$initial"> </a>';
}
print qq'<tr> <td> $label <a name="xref-$keyword"> </a> $keyword </td>\n <td> ';
my $pad = "";
foreach my $ref (sort { uc $a cmp uc $b } keys %{$keywords{$keyword}})
{
print qq'$pad<a href="#$ref"> <$ref> </a>\n';
$pad = " ";
}
print " </td>\n</tr>\n";
}
print "</table>\n";
print "<br>\n";
top;
print "<hr>\n";
}
printf "%s\n", q'Please send feedback to Jonathan Leffler:';
printf "%s\n", q'<a href="mailto:[email protected]"> [email protected] </a>.';
print "\n</body>\n</html>\n";
__END__
=pod
=head1 PROGRAM
bnf2html - Convert (ISO SQL) BNF Notation to Hyperlinked HTML
=head1 SYNTAX
bnf2html [file ...]
=head1 DESCRIPTION
The bnf2html filters the annotated BNF (Backus-Naur Form) from its input
files and converts it into HTML on standard output.
The HTML is heavily hyperlinked.
Each rule (LHS) links to a table of other rules where it is used on the
RHS.
Similarly, each symbol on the RHS is linked to the rule that defines it.
Thus, it is possible to find where items are used and defined quite
easily.
=head1 INPUT FORMAT
This script is adapted to the BNF notation using in the SQL standard
(ISO/IEC 9075:2003, for example).
It also takes various forms of annotations.
The first line of the file is used as the title in the head section.
It is also used as the text for a H1 header at the top of the body.
Lines consisting of two or more equal signs are ignored.
Lines consisting of two or more dashes are converted to a horizontal
rule.
Lines starting with the SCCS identification string '@(#)' are used to
print version information about the file converted and the script doing
the converting.
Lines containing space, colon, colon, equals are treated as rules.
Lines starting with white space are treated as continuations of a rule.
Lines starting dash, dash, (optionally a slash) and then one or more tag
letters are converted into an HTML start or end tag.
Any line starting dash, dash, hash, hash has any HTML entities
introduced by the WEBCODE program removed.
The should be at most one line starting '--%start'; this indicates the
start symbol for the bnf2yacc converter, but is effectively ignored by
bnf2html.
Any other line is passed through verbatim.
=head1 AUTHOR
Jonathan Leffler <[email protected]>
=cut