-
Notifications
You must be signed in to change notification settings - Fork 5
/
strudy.js
421 lines (368 loc) · 16.9 KB
/
strudy.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
#!/usr/bin/env node
/**
* The spec analyzer takes a relative path to a crawl report or to a folder that
* contains an `index.json` file that is the crawl report and creates a report
* that contains, for each spec, a list of potential anomalies.
*
* Provided Strudy was installed as a global package, the spec analyzer can be
* called directly through:
*
* `strudy --help`
*
* If Strudy was not installed as a global package, call:
*
* `node strudy.js --help`
*
* @module strudy
*/
import { Command, InvalidArgumentError } from 'commander';
import { constants as fsConstants } from 'node:fs';
import fs from 'node:fs/promises';
import path from 'node:path';
import satisfies from 'semver/functions/satisfies.js';
import packageContents from './package.json' with { type: 'json' };
import study from './src/lib/study.js';
import loadJSON from './src/lib/load-json.js';
import { expandCrawlResult } from 'reffy';
import matter from 'gray-matter';
// Emit a warning when the running Node.js version does not match the
// range declared in package.json's "engines" field. Execution continues
// regardless: the warning just explains potential crashes upfront.
const { version, engines } = packageContents;
const requiredNode = engines?.node;
if (requiredNode && !satisfies(process.version, requiredNode)) {
  console.warn(`
[WARNING] Node.js ${process.version} detected but Strudy needs Node.js ${requiredNode}.
Please consider upgrading Node.js if the program crashes!`);
}
/**
 * Tell whether a file (or folder) exists and can be read.
 *
 * @param {string} file - Path to check.
 * @returns {Promise<boolean>} Resolves with true when the path is readable,
 *   false otherwise (including when it does not exist).
 */
async function exists(file) {
  return fs.access(file, fsConstants.R_OK)
    .then(() => true)
    .catch(() => false);
}
/**
 * Commander option parser: convert a raw CLI string to a base-10 integer.
 *
 * @param {string} value - Raw option value from the command line.
 * @returns {number} The parsed integer.
 * @throws {InvalidArgumentError} When the value cannot be parsed as a number
 *   (commander reports it as a user-facing option error).
 */
function myParseInt(value) {
  const parsedValue = Number.parseInt(value, 10);
  // Number.isNaN avoids the implicit type coercion of the global isNaN
  if (Number.isNaN(parsedValue)) {
    throw new InvalidArgumentError('Not a number.');
  }
  return parsedValue;
}
// Top-level CLI definition. Subcommands (e.g. "inspect") register
// themselves on this instance further down. Commander setters return the
// command itself, so the chain below assigns the same Command object.
const program = new Command()
  .name('strudy')
  .description('Analyzes a crawl report generated by Reffy to detect anomalies in specifications')
  .version(version);
program
  .command('inspect')
  .alias('study')
  .argument('<crawl>', 'Path/URL to crawl report')
  .option('-cc, --cc <names...>', 'people to Cc in issues that may need help')
  .option('-f, --format <format>', 'report markdown or json', 'markdown')
  .option('-i, --issues <folder>', 'report issues as markdown files in the given folder')
  .option('-m, --max <max>', 'maximum number of issue files to create/update', myParseInt, 0)
  .option('-s, --spec <specs...>', 'restrict analysis to given specs', ['all'])
  .option('--sort <sort>', 'key(s) to use to sort the structured report', 'default')
  .option('--structure <structure>', 'report structure', 'type+spec')
  .option('--tr <trreport>', 'path/URL to crawl report on published specs')
  .option('--update-mode <mode>', 'what issue files to update', 'new')
  .option('-w, --what <what...>', 'what to analyze', ['all'])
  .showHelpAfterError('(run with --help for usage information)')
  .addHelpText('after', `
Minimal usage example:
  To study a crawl report in current folder:
    $ strudy inspect .
Description:
  Analyzes a crawl report generated by Reffy and create a report with potential
  anomalies in each of the specs contained in the crawl report.
  Depending on command options, the report is either written to the console as
  a serialized JSON object or as a markdown report (see the --format option),
  or written to individual issues files in a folder (see the --issues option).
Argument:
  <crawl>
    Path to the crawl report to analyze. If the path leads to a folder, Strudy
    will look for an "ed/index.json" file under that folder first (if it exists,
    it will also look for a possible "tr/index.json" file to set the --tr option),
    then for an "index.json" file.
Usage notes for some of the options:
  -cc, --cc <names...>
    Lists people to copy in issues with a "Cc" message so that they get notified.
    This is helpful to follow issues that may warrant further discussion and
    guidance.
    Each name should be a GitHub handle, such as "tidoust" or "dontcallmedom".
    The handle may start with a "@" (code will add it as prefix automatically
    otherwise).
    The "Cc" message will only be added to anomalies that are not obvious to fix:
    for example, it will be set for anomalies about algorithms and Web IDL, but
    not for broken links or references to discontinued specs (see "cc" flag in
    the definitions of anomalies in src/lib/study.js).
    The option is ignored if the --issues option is not set.
  -f, --format <format>
    Tell Strudy to return a report in the specified format. Format may be one of
    "markdown" (default when option is not set) or "json".
    The --format option cannot be set to "json" if the --issues option is set.
  -i, --issues <folder>
    Tell Strudy to report the anomalies in anomaly files in the given folder.
    An anomaly file gets created for and named after keys at the first level of
    the report (see --structure option).
    Anomaly files are in markdown. The --format option must be set to "markdown",
    or not set at all.
    Anomaly files start with metadata, used to convert the file to a GitHub issue
    and track the resolution of the issue afterwards: "Repo" sets the repository
    for the issue, "Title" the title of the issue, and "Tracked" the URL of the
    issue, once created.
    Existing anomaly files in the folder are preserved by default, set the
    --update-mode option to change that behavior.
  -m, --max <max>
    Maximum number of issue files to add or update. Defaults to 0, which means
    "no limit".
    This setting should only be useful when combined with --issues to create
    issue files in batches. It may also be set in the absence of --issues, in
    which case it restricts the number of entries at the first level of the
    report (see --structure).
  -s, --spec <specs...>
    Valid spec values are spec shortnames. Use "all" to include all specs. This
    is equivalent to not setting the option at all.
    For instance:
      $ strudy inspect . --spec picture-in-picture
    The analysis skips discontinued specs that may appear in the crawl result by
    default. To force an analysis on a discontinued spec, mention its shortname
    explicitly. You may combine that shortname with the value "all" to analyze
    all non-discontinued specs plus the ones explicitly listed with their
    shortnames.
    For instance:
      $ strudy inspect . --spec all --spec tracking-dnt
  --sort <sort>
    Specifies the key(s) to use to sort each level in the structured report.
    Use "/" to separate levels. See --structure for details on the possible
    report structure.
    Possible keys:
    "default"  follow the natural order of the underlying structures, e.g.
               return specs in the order in which they appear in the initial
               list, anomalies in extraction order (which usually follows the
               document order)
    "name"     sort entries by the name. For a "spec" level, the name is the
               spec's shortname. For a "type" level, the name is the anomaly
               type name. For a "type+spec" level, the name is the name of the
               file that would be created if --issues is set, meaning the spec's
               shortname completed with the anomaly type name.
    "title"    sort entries by their title. For a "spec" level, the title is the
               spec's title. For the final level, the title is the anomaly
               message. Etc.
    If the --sort value contains more levels than there are in the structured
    report, additional keys are ignored. If the value contains fewer levels than
    there are in the structured report, the default order is used for unspecified
    levels.
    For example, if the structure is "type/spec", the --sort option could be:
    "default"           to use the default order at all levels
    "default/title"     to use the default order for the root level, and to sort
                        specs by title
    "name/title/title"  to sort anomaly types by names, specs by title, and
                        anomalies by message.
    Sort is always ascending.
  --structure <type>
    Describes the hierarchy in the report(s) that Strudy returns. Possible values:
    "flat"             no level, report anomalies one by one
    "type+spec"        one level with one entry per type and spec (default)
    "group+spec/type"  first level per group and spec, second level per type
    "spec/type"        first level per spec, second level per type
    "spec/group/type"  first level per spec, second level per group, third level
                       per type
    "type/spec"        first level per type, second level per spec
    "group/type/spec"  first level per group, second level per type, third level
                       per spec
    "group/spec/type"  first level per group, second level per spec, third level
                       per type
    Last level contains the actual list of anomalies.
    Note: an anomaly always has a "type". Related anomaly types are grouped in an
    anomaly "group". For example, "brokenLinks" and "datedUrls" both belong to
    the "backrefs" group (also see the --what option).
  --tr <trreport>
    Useful for Strudy to refine its broken link analysis when crawl report
    contains info about latest Editor's Drafts.
    A spec that references terms defined in a second spec for which the /TR
    version lags behind the Editor's Draft may have issues of the form "The term
    exists in the /TR version but no longer exists in the Editor's Draft".
    Note that if <crawl> is a link to a folder, the tool will automatically look
    for the TR crawl report in a "tr" subfolder and set <trreport> itself.
  --update-mode <mode>
    Tell Strudy what issue files to update when --issues is set and an issue file
    already exists for the issue at hand. Possible values are:
    "new"        (default) preserve existing files
    "old"        preserve existing files but get rid of old ones for which
                 study reveals no more issue
    "untracked"  same as "old" but also update existing files that do not
                 have a "Tracked" URL
    "tracked"    same as "old" but also update existing files that have a
                 "Tracked" URL
    "all"        update all existing files, deleting them when needed
    Strudy will always create new issue files, the mode only changes the behavior
    for existing issue files.
    The --issues option must be set.
  -w, --what <what...>
    Tell Strudy which anomalies to analyze. Values can be the names of anomaly
    types or the name of anomaly groups. The value "all" (default) tells Strudy
    to analyze and report on all possible anomalies.
    The list of anomaly types and groups will likely evolve over time, see actual
    list in src/lib/study.js.
    Examples:
    "-w algorithms -w backrefs" to study algorithms and references to other specs
    "-w unknownSpecs" to study links to unknown specs
`)
  .action(async (report, options) => {
    // Check option combinations upfront so that invalid invocations fail fast
    // with exit code 2 before any file gets read.
    if (options.format && !['json', 'markdown'].includes(options.format)) {
      console.error(`Unsupported --format option "${options.format}".
Format must be one of "json" or "markdown".`);
      process.exit(2);
    }
    if (options.format !== 'markdown' && options.issues) {
      console.error(`The --format option can only be set to "markdown" when --issues is used.`);
      process.exit(2);
    }
    if (options.updateMode && !['new', 'old', 'untracked', 'tracked', 'all'].includes(options.updateMode)) {
      console.error(`Unsupported --update-mode option "${options.updateMode}"`);
      process.exit(2);
    }
    if (options.updateMode !== 'new' && !options.issues) {
      console.error('The --update-mode option can only be set when --issues is set');
      process.exit(2);
    }
    if (options.issues && !await exists(options.issues)) {
      console.error(`Could not find/access the folder to store anomalies: ${options.issues}`);
      process.exit(2);
    }

    // Load (and expand) the crawl results. When <crawl> is a folder, prefer
    // an "ed" subfolder (Editor's Drafts crawl) and auto-detect a sibling
    // "tr" subfolder unless --tr was set explicitly.
    let edReportFile = report;
    let trReportFile = options.tr;
    if (!report.endsWith('.json')) {
      if (await exists(path.join(report, 'ed'))) {
        edReportFile = path.join(report, 'ed');
        if (!trReportFile && await exists(path.join(report, 'tr'))) {
          trReportFile = path.join(report, 'tr');
        }
      }
      edReportFile = path.join(edReportFile, 'index.json');
    }
    if (!await exists(edReportFile)) {
      console.error(`Could not find/access crawl/study report: ${report}`);
      process.exit(2);
    }
    if (trReportFile) {
      if (!trReportFile.endsWith('.json')) {
        trReportFile = path.join(trReportFile, 'index.json');
      }
      if (!await exists(trReportFile)) {
        console.error(`Could not find/access TR crawl report: ${options.tr}`);
        process.exit(2);
      }
    }
    let edReport = await loadJSON(edReportFile);
    edReport = await expandCrawlResult(edReport, path.dirname(edReportFile));
    let trReport;
    if (trReportFile) {
      trReport = await loadJSON(trReportFile);
      trReport = await expandCrawlResult(trReport, path.dirname(trReportFile));
    }

    // Create a structured anomaly report out of the crawl report
    const anomaliesReport = await study(edReport.results, {
      what: options.what,
      structure: options.structure,
      sort: options.sort,
      format: options.format === 'json' ?
        'json' :
        (options.issues ? 'issue' : 'full'),
      trResults: trReport?.results ?? [],
      specs: options.spec,
      cc: (options.issues ? options.cc : null)
    });

    // Output the structured anomaly report
    if (options.format === 'json') {
      // Caller wants a JSON report. We'll just trim the number of anomalies
      // in the first level to the requested maximum as needed
      if (options.max > 0) {
        anomaliesReport.results = anomaliesReport.results.slice(0, options.max);
      }
      console.log(JSON.stringify(anomaliesReport, null, 2));
    }
    else if (options.issues) {
      // Caller wants to add/update issue files in the provided folder.
      // Issue files are formatted with the gray-matter library to save useful
      // metadata as front matter in the file.
      const issueUrl = /^https:\/\/github\.com\/([^/]+)\/([^/]+)\/(issues|pull)\/(\d+)$/;
      let reported = 0;
      for (const entry of anomaliesReport.results) {
        const filename = path.join(options.issues, `${entry.name}.md`);
        let existingReport;
        let tracked = 'N/A';
        if (await exists(filename)) {
          if (options.updateMode === 'new' ||
              options.updateMode === 'old') {
            console.warn(`- skip ${entry.name}, file already exists`);
            continue;
          }
          existingReport = matter(await fs.readFile(filename, 'utf-8'));
          tracked = existingReport.data.Tracked ?? 'N/A';
          if ((options.updateMode === 'tracked' && !tracked.match(issueUrl)) ||
              (options.updateMode === 'untracked' && tracked.match(issueUrl))) {
            console.warn(`- skip ${entry.name}, file already exists, with Tracked="${tracked}"`);
            continue;
          }
        }
        const content = `
${entry.content}
<sub>This issue was detected and reported semi-automatically by [Strudy](https://github.com/w3c/strudy/) based on data collected in [webref](https://github.com/w3c/webref/).</sub>`;
        // Note from @tidoust: One day, I'll understand how to set up Git and
        // code so that all line endings end up being "\n" even on Windows
        // machines. In the meantime, note that local issue files may well
        // contain "\r\n" on Windows machines.
        if (existingReport?.content.replace(/\r\n/g, '\n').trim() === content.trim()) {
          console.warn(`- skip ${entry.name}, file already exists, no change`);
          continue;
        }
        const issueReport = matter(content);
        issueReport.data = {
          Title: entry.title,
          Tracked: tracked
        };
        if (entry.spec) {
          // The entry's spec may not be part of the crawl results (e.g. when
          // the analysis was restricted with --spec), hence the "?." guard.
          const spec = edReport.results.find(spec => spec.url === entry.spec.url);
          if (spec?.nightly?.repository) {
            issueReport.data.Repo = spec.nightly.repository;
          }
        }
        console.warn(`- ${existingReport ? 'update' : 'add'} ${entry.name}`);
        const filecontent = issueReport.stringify();
        await fs.writeFile(filename, filecontent, 'utf-8');
        reported += 1;
        if (options.max > 0 && reported >= options.max) {
          break;
        }
      }
      if (options.updateMode !== 'new') {
        // Drop issue files for which the study no longer reports anomalies
        const reportFiles = await fs.readdir(options.issues);
        const todelete = reportFiles.filter(file =>
          anomaliesReport.looksGood.some(name => file === `${name}.md`));
        for (const file of todelete) {
          const filename = path.join(options.issues, file);
          console.warn(`- delete ${file}, no more anomalies detected`);
          await fs.rm(filename, { force: true });
        }
      }
    }
    else {
      // Caller wants a markdown report written to the console.
      // The anomalies report should already be a "full" one (so only one
      // result item at the first level).
      const content = anomaliesReport.results[0].content;
      let reported = 0;
      for (const entry of content) {
        console.log(entry);
        reported += 1;
        if (options.max > 0 && reported >= options.max) {
          break;
        }
      }
    }
  });
program.parseAsync(process.argv);