-
Notifications
You must be signed in to change notification settings - Fork 5
/
strudy.js
421 lines (368 loc) · 16.9 KB
/
strudy.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
#!/usr/bin/env node
/**
* The spec analyzer takes a relative path to a crawl report or to a folder that
* contains an `index.json` file that is the crawl report and creates a report
* that contains, for each spec, a list of potential anomalies.
*
* Provided Strudy was installed as a global package, the spec analyzer can be
* called directly through:
*
* `strudy --help`
*
* If Strudy was not installed as a global package, call:
*
* `node strudy.js --help`
*
* @module strudy
*/
import { Command, InvalidArgumentError } from 'commander';
import { constants as fsConstants } from 'node:fs';
import fs from 'node:fs/promises';
import path from 'node:path';
import satisfies from 'semver/functions/satisfies.js';
import packageContents from './package.json' with { type: 'json' };
import study from './src/lib/study.js';
import loadJSON from './src/lib/load-json.js';
import { expandCrawlResult } from 'reffy';
import matter from 'gray-matter';
// Emit a warning when the running Node.js version does not match the
// range declared in package.json's "engines" field. Execution continues
// regardless: the warning just explains potential crashes upfront.
const { version, engines } = packageContents;
const requiredNode = engines?.node;
if (requiredNode && !satisfies(process.version, requiredNode)) {
  console.warn(`
[WARNING] Node.js ${process.version} detected but Strudy needs Node.js ${requiredNode}.
Please consider upgrading Node.js if the program crashes!`);
}
/**
 * Tell whether a file (or folder) exists and can be read.
 *
 * @param {string} file - Path to check.
 * @returns {Promise<boolean>} Resolves with true when the path is readable,
 *   false otherwise (including when it does not exist).
 */
async function exists(file) {
  return fs.access(file, fsConstants.R_OK)
    .then(() => true)
    .catch(() => false);
}
/**
 * Commander option parser: convert a raw CLI string to a base-10 integer.
 *
 * @param {string} value - Raw option value from the command line.
 * @returns {number} The parsed integer.
 * @throws {InvalidArgumentError} When the value cannot be parsed as a number
 *   (commander reports it as a user-facing option error).
 */
function myParseInt(value) {
  const parsedValue = Number.parseInt(value, 10);
  // Number.isNaN avoids the implicit type coercion of the global isNaN
  if (Number.isNaN(parsedValue)) {
    throw new InvalidArgumentError('Not a number.');
  }
  return parsedValue;
}
// Top-level CLI definition. Subcommands (e.g. "inspect") register
// themselves on this instance further down. Commander setters return the
// command itself, so the chain below assigns the same Command object.
const program = new Command()
  .name('strudy')
  .description('Analyzes a crawl report generated by Reffy to detect anomalies in specifications')
  .version(version);
program
  .command('inspect')
  .alias('study')
  .argument('<crawl>', 'Path/URL to crawl report')
  .option('-cc, --cc <names...>', 'people to Cc in issues that may need help')
  .option('-f, --format <format>', 'report markdown or json', 'markdown')
  .option('-i, --issues <folder>', 'report issues as markdown files in the given folder')
  .option('-m, --max <max>', 'maximum number of issue files to create/update', myParseInt, 0)
  .option('-s, --spec <specs...>', 'restrict analysis to given specs', ['all'])
  .option('--sort <sort>', 'key(s) to use to sort the structured report', 'default')
  .option('--structure <structure>', 'report structure', 'type+spec')
  .option('--tr <trreport>', 'path/URL to crawl report on published specs')
  .option('--update-mode <mode>', 'what issue files to update', 'new')
  .option('-w, --what <what...>', 'what to analyze', ['all'])
  .showHelpAfterError('(run with --help for usage information)')
  .addHelpText('after', `
Minimal usage example:
  To study a crawl report in current folder:
    $ strudy inspect .
Description:
  Analyzes a crawl report generated by Reffy and create a report with potential
  anomalies in each of the specs contained in the crawl report.
  Depending on command options, the report is either written to the console as
  a serialized JSON object or as a markdown report (see the --format option),
  or written to individual issues files in a folder (see the --issues option).
Argument:
  <crawl>
    Path to the crawl report to analyze. If the path leads to a folder, Strudy
    will look for an "ed/index.json" file under that folder first (if it exists,
    it will also look for a possible "tr/index.json" file to set the --tr option),
    then for an "index.json" file.
Usage notes for some of the options:
  -cc, --cc <names...>
    Lists people to copy in issues with a "Cc" message so that they get notified.
    This is helpful to follow issues that may warrant further discussion and
    guidance.
    Each name should be a GitHub handle, such as "tidoust" or "dontcallmedom".
    The handle may start with a "@" (code will add it as prefix automatically
    otherwise).
    The "Cc" message will only be added to anomalies that are not obvious to fix:
    for example, it will be set for anomalies about algorithms and Web IDL, but
    not for broken links or references to discontinued specs (see "cc" flag in
    the definitions of anomalies in src/lib/study.js).
    The option is ignored if the --issues option is not set.
  -f, --format <format>
    Tell Strudy to return a report in the specified format. Format may be one of
    "markdown" (default when option is not set) or "json".
    The --format option cannot be set to "json" if the --issues option is set.
  -i, --issues <folder>
    Tell Strudy to report the anomalies in anomaly files in the given folder.
    An anomaly file gets created for and named after keys at the first level of
    the report (see --structure option).
    Anomaly files are in markdown. The --format option must be set to "markdown",
    or not set at all.
    Anomaly files start with metadata, used to convert the file to a GitHub issue
    and track the resolution of the issue afterwards: "Repo" sets the repository
    for the issue, "Title" the title of the issue, and "Tracked" the URL of the
    issue, once created.
    Existing anomaly files in the folder are preserved by default, set the
    --update-mode option to change that behavior.
  -m, --max <max>
    Maximum number of issue files to add or update. Defaults to 0, which means
    "no limit".
    This setting should only be useful when combined with --issues to create
    issue files in batches. It may also be set in the absence of --issues, in
    which case it restricts the number of entries at the first level of the
    report (see --structure).
  -s, --spec <specs...>
    Valid spec values are spec shortnames. Use "all" to include all specs. This
    is equivalent to not setting the option at all.
    For instance:
      $ strudy inspect . --spec picture-in-picture
    The analysis skips discontinued specs that may appear in the crawl result by
    default. To force an analysis on a discontinued spec, mention its shortname
    explicitly. You may combine that shortname with the value "all" to analyze
    all non-discontinued specs plus the ones explicitly listed with their
    shortnames.
    For instance:
      $ strudy inspect . --spec all --spec tracking-dnt
  --sort <sort>
    Specifies the key(s) to use to sort each level in the structured report.
    Use "/" to separate levels. See --structure for details on the possible
    report structure.
    Possible keys:
    "default"  follow the natural order of the underlying structures, e.g.
               return specs in the order in which they appear in the initial
               list, anomalies in extraction order (which usually follows the
               document order)
    "name"     sort entries by the name. For a "spec" level, the name is the
               spec's shortname. For a "type" level, the name is the anomaly
               type name. For a "type+spec" level, the name is the name of the
               file that would be created if --issues is set, meaning the spec's
               shortname completed with the anomaly type name.
    "title"    sort entries by their title. For a "spec" level, the title is the
               spec's title. For the final level, the title is the anomaly
               message. Etc.
    If the --sort value contains more levels than there are in the structured
    report, additional keys are ignored. If the value contains fewer levels than
    there are in the structured report, the default order is used for unspecified
    levels.
    For example, if the structure is "type/spec", the --sort option could be:
    "default"           to use the default order at all levels
    "default/title"     to use the default order for the root level, and to sort
                        specs by title
    "name/title/title"  to sort anomaly types by names, specs by title, and
                        anomalies by message.
    Sort is always ascending.
  --structure <type>
    Describes the hierarchy in the report(s) that Strudy returns. Possible values:
    "flat"             no level, report anomalies one by one
    "type+spec"        one level with one entry per type and spec (default)
    "group+spec/type"  first level per group and spec, second level per type
    "spec/type"        first level per spec, second level per type
    "spec/group/type"  first level per spec, second level per group, third level
                       per type
    "type/spec"        first level per type, second level per spec
    "group/type/spec"  first level per group, second level per type, third level
                       per spec
    "group/spec/type"  first level per group, second level per spec, third level
                       per type
    Last level contains the actual list of anomalies.
    Note: an anomaly always has a "type". Related anomaly types are grouped in an
    anomaly "group". For example, "brokenLinks" and "datedUrls" both belong to
    the "backrefs" group (also see the --what option).
  --tr <trreport>
    Useful for Strudy to refine its broken link analysis when crawl report
    contains info about latest Editor's Drafts.
    A spec that references terms defined in a second spec for which the /TR
    version lags behind the Editor's Draft may have issues of the form "The term
    exists in the /TR version but no longer exists in the Editor's Draft".
    Note that if <crawl> is a link to a folder, the tool will automatically look
    for the TR crawl report in a "tr" subfolder and set <trreport> itself.
  --update-mode <mode>
    Tell Strudy what issue files to update when --issues is set and an issue file
    already exists for the issue at hand. Possible values are:
    "new"        (default) preserve existing files
    "old"        preserve existing files but get rid of old ones for which
                 study reveals no more issue
    "untracked"  same as "old" but also update existing files that do not
                 have a "Tracked" URL
    "tracked"    same as "old" but also update existing files that have a
                 "Tracked" URL
    "all"        update all existing files, deleting them when needed
    Strudy will always create new issue files, the mode only changes the behavior
    for existing issue files.
    The --issues option must be set.
  -w, --what <what...>
    Tell Strudy which anomalies to analyze. Values can be the names of anomaly
    types or the name of anomaly groups. The value "all" (default) tells Strudy
    to analyze and report on all possible anomalies.
    The list of anomaly types and groups will likely evolve over time, see actual
    list in src/lib/study.js.
    Examples:
    "-w algorithms -w backrefs" to study algorithms and references to other specs
    "-w unknownSpecs" to study links to unknown specs
`)
  .action(async (report, options) => {
    // Check option combinations upfront so that invalid invocations fail fast
    // with exit code 2 before any file gets read.
    if (options.format && !['json', 'markdown'].includes(options.format)) {
      console.error(`Unsupported --format option "${options.format}".
Format must be one of "json" or "markdown".`);
      process.exit(2);
    }
    if (options.format !== 'markdown' && options.issues) {
      console.error(`The --format option can only be set to "markdown" when --issues is used.`);
      process.exit(2);
    }
    if (options.updateMode && !['new', 'old', 'untracked', 'tracked', 'all'].includes(options.updateMode)) {
      console.error(`Unsupported --update-mode option "${options.updateMode}"`);
      process.exit(2);
    }
    if (options.updateMode !== 'new' && !options.issues) {
      console.error('The --update-mode option can only be set when --issues is set');
      process.exit(2);
    }
    if (options.issues && !await exists(options.issues)) {
      console.error(`Could not find/access the folder to store anomalies: ${options.issues}`);
      process.exit(2);
    }

    // Load (and expand) the crawl results. When <crawl> is a folder, prefer
    // an "ed" subfolder (Editor's Drafts crawl) and auto-detect a sibling
    // "tr" subfolder unless --tr was set explicitly.
    let edReportFile = report;
    let trReportFile = options.tr;
    if (!report.endsWith('.json')) {
      if (await exists(path.join(report, 'ed'))) {
        edReportFile = path.join(report, 'ed');
        if (!trReportFile && await exists(path.join(report, 'tr'))) {
          trReportFile = path.join(report, 'tr');
        }
      }
      edReportFile = path.join(edReportFile, 'index.json');
    }
    if (!await exists(edReportFile)) {
      console.error(`Could not find/access crawl/study report: ${report}`);
      process.exit(2);
    }
    if (trReportFile) {
      if (!trReportFile.endsWith('.json')) {
        trReportFile = path.join(trReportFile, 'index.json');
      }
      if (!await exists(trReportFile)) {
        console.error(`Could not find/access TR crawl report: ${options.tr}`);
        process.exit(2);
      }
    }
    let edReport = await loadJSON(edReportFile);
    edReport = await expandCrawlResult(edReport, path.dirname(edReportFile));
    let trReport;
    if (trReportFile) {
      trReport = await loadJSON(trReportFile);
      trReport = await expandCrawlResult(trReport, path.dirname(trReportFile));
    }

    // Create a structured anomaly report out of the crawl report
    const anomaliesReport = await study(edReport.results, {
      what: options.what,
      structure: options.structure,
      sort: options.sort,
      format: options.format === 'json' ?
        'json' :
        (options.issues ? 'issue' : 'full'),
      trResults: trReport?.results ?? [],
      specs: options.spec,
      cc: (options.issues ? options.cc : null)
    });

    // Output the structured anomaly report
    if (options.format === 'json') {
      // Caller wants a JSON report. We'll just trim the number of anomalies
      // in the first level to the requested maximum as needed
      if (options.max > 0) {
        anomaliesReport.results = anomaliesReport.results.slice(0, options.max);
      }
      console.log(JSON.stringify(anomaliesReport, null, 2));
    }
    else if (options.issues) {
      // Caller wants to add/update issue files in the provided folder.
      // Issue files are formatted with the gray-matter library to save useful
      // metadata as front matter in the file.
      const issueUrl = /^https:\/\/github\.com\/([^/]+)\/([^/]+)\/(issues|pull)\/(\d+)$/;
      let reported = 0;
      for (const entry of anomaliesReport.results) {
        const filename = path.join(options.issues, `${entry.name}.md`);
        let existingReport;
        let tracked = 'N/A';
        if (await exists(filename)) {
          if (options.updateMode === 'new' ||
              options.updateMode === 'old') {
            console.warn(`- skip ${entry.name}, file already exists`);
            continue;
          }
          existingReport = matter(await fs.readFile(filename, 'utf-8'));
          tracked = existingReport.data.Tracked ?? 'N/A';
          if ((options.updateMode === 'tracked' && !tracked.match(issueUrl)) ||
              (options.updateMode === 'untracked' && tracked.match(issueUrl))) {
            console.warn(`- skip ${entry.name}, file already exists, with Tracked="${tracked}"`);
            continue;
          }
        }
        const content = `
${entry.content}
<sub>This issue was detected and reported semi-automatically by [Strudy](https://github.com/w3c/strudy/) based on data collected in [webref](https://github.com/w3c/webref/).</sub>`;
        // Note from @tidoust: One day, I'll understand how to set up Git and
        // code so that all line endings end up being "\n" even on Windows
        // machines. In the meantime, note that local issue files may well
        // contain "\r\n" on Windows machines.
        if (existingReport?.content.replace(/\r\n/g, '\n').trim() === content.trim()) {
          console.warn(`- skip ${entry.name}, file already exists, no change`);
          continue;
        }
        const issueReport = matter(content);
        issueReport.data = {
          Title: entry.title,
          Tracked: tracked
        };
        if (entry.spec) {
          // The entry's spec may not be part of the crawl results (e.g. when
          // the analysis was restricted with --spec), hence the "?." guard.
          const spec = edReport.results.find(spec => spec.url === entry.spec.url);
          if (spec?.nightly?.repository) {
            issueReport.data.Repo = spec.nightly.repository;
          }
        }
        console.warn(`- ${existingReport ? 'update' : 'add'} ${entry.name}`);
        const filecontent = issueReport.stringify();
        await fs.writeFile(filename, filecontent, 'utf-8');
        reported += 1;
        if (options.max > 0 && reported >= options.max) {
          break;
        }
      }
      if (options.updateMode !== 'new') {
        // Drop issue files for which the study no longer reports anomalies
        const reportFiles = await fs.readdir(options.issues);
        const todelete = reportFiles.filter(file =>
          anomaliesReport.looksGood.some(name => file === `${name}.md`));
        for (const file of todelete) {
          const filename = path.join(options.issues, file);
          console.warn(`- delete ${file}, no more anomalies detected`);
          await fs.rm(filename, { force: true });
        }
      }
    }
    else {
      // Caller wants a markdown report written to the console.
      // The anomalies report should already be a "full" one (so only one
      // result item at the first level).
      const content = anomaliesReport.results[0].content;
      let reported = 0;
      for (const entry of content) {
        console.log(entry);
        reported += 1;
        if (options.max > 0 && reported >= options.max) {
          break;
        }
      }
    }
  });
program.parseAsync(process.argv);