From d00cac552dca4b5db0802f450425cc29c4f3d826 Mon Sep 17 00:00:00 2001 From: Arnau Casau <47946624+arnaucasau@users.noreply.github.com> Date: Fri, 5 Jan 2024 19:01:18 +0100 Subject: [PATCH] First refactor of sphinxHtmlToMarkdown.ts (#584) Part of https://github.com/Qiskit/documentation/issues/223. First small refactor of the `sphinxHtmlToMarkdown.ts` script. This PR divides the `sphinxHtmlToMarkdown` function into several helper functions, reducing its length and making it possible to add tests for different parts of the code. Changes in this PR: - Renamed the `$page` variable to `$` to reduce the noise introduced by the number of calls we had and to follow the [Cheerio](https://cheerio.js.org/docs/basics/loading#load) convention. - Moved the code that finds all the images into a helper function (`loadImages`). - Grouped similar transformations of the HTML (applied before its conversion to markdown) into a helper function called `preprocessHtml`. - Moved the HTML-to-markdown conversion into a separate function (`generateMarkdownFile`) to simplify the `sphinxHtmlToMarkdown` function (to be divided further in the future). - Refactored the recursive search of methods, adding early returns and merging some parts. - Moved the unified plugin to the `unifiedParser.ts` file in order to abstract the transformation, which can now be called from other files. That script can group together all the unified plugins, such as the link checker, the mergeClassMembers.ts, and this one. In this PR, the code has been copied almost identically to how it was before, except for some small adjustments, such as in the unified plugin, where we now use an anonymous function and have merged the two visitors. More changes will be introduced in a follow-up. 
--------- Co-authored-by: Eric Arellano <14852634+Eric-Arellano@users.noreply.github.com> --- scripts/lib/sphinx/PythonObjectMeta.ts | 18 +- scripts/lib/sphinx/sphinxHtmlToMarkdown.ts | 422 +++++++++++---------- 2 files changed, 242 insertions(+), 198 deletions(-) diff --git a/scripts/lib/sphinx/PythonObjectMeta.ts b/scripts/lib/sphinx/PythonObjectMeta.ts index f46066b2f51..b51396d1b39 100644 --- a/scripts/lib/sphinx/PythonObjectMeta.ts +++ b/scripts/lib/sphinx/PythonObjectMeta.ts @@ -10,15 +10,17 @@ // copyright notice, and modified files need to carry a notice indicating // that they have been altered from the originals. +export type PythonApiType = + | "class" + | "method" + | "property" + | "attribute" + | "module" + | "function" + | "exception"; + export type PythonObjectMeta = { python_api_name?: string; - python_api_type?: - | "class" - | "method" - | "property" - | "attribute" - | "module" - | "function" - | "exception"; + python_api_type?: PythonApiType; hardcoded_frontmatter?: string; }; diff --git a/scripts/lib/sphinx/sphinxHtmlToMarkdown.ts b/scripts/lib/sphinx/sphinxHtmlToMarkdown.ts index 3dbd046c50b..5dc83d4b3ae 100644 --- a/scripts/lib/sphinx/sphinxHtmlToMarkdown.ts +++ b/scripts/lib/sphinx/sphinxHtmlToMarkdown.ts @@ -10,19 +10,19 @@ // copyright notice, and modified files need to carry a notice indicating // that they have been altered from the originals. 
-import { load } from "cheerio"; import { unified } from "unified"; import rehypeParse from "rehype-parse"; import rehypeRemark from "rehype-remark"; import remarkStringify from "remark-stringify"; import remarkGfm from "remark-gfm"; import { last, first, without, initial, tail } from "lodash"; +import { CheerioAPI, Cheerio, load } from "cheerio"; import { defaultHandlers, Handle, toMdast, all } from "hast-util-to-mdast"; import { toText } from "hast-util-to-text"; import remarkMath from "remark-math"; import remarkMdx from "remark-mdx"; import { SphinxToMdResult } from "./SphinxToMdResult"; -import { PythonObjectMeta } from "./PythonObjectMeta"; +import { PythonObjectMeta, PythonApiType } from "./PythonObjectMeta"; import { getLastPartFromFullIdentifier, removePrefix, @@ -42,7 +42,6 @@ export async function sphinxHtmlToMarkdown(options: { baseSourceUrl?: string; releaseNotesTitle?: string; }): Promise { - const images: Array<{ src: string; dest: string }> = []; const { html, url, @@ -53,130 +52,11 @@ export async function sphinxHtmlToMarkdown(options: { const meta: PythonObjectMeta = {}; const isReleaseNotes = url.endsWith("release_notes.html") ? 
true : false; - const $page = load(html); - const main = $page(`[role='main']`); - const $main = $page(main); - - // remove html extensions in relative links - $main.find("a").each((_, link) => { - const $link = $page(link); - const href = $link.attr("href"); - if (href && !href.startsWith("http")) { - $link.attr("href", href.replaceAll(".html", "")); - } - }); - - if (isReleaseNotes && releaseNotesTitle) { - // Replace heading with custom heading - $page("h1").html(releaseNotesTitle); - } - - $main - .find("img") - .toArray() - .forEach((el) => { - const $img = $page(el); - - const imageUrl = new URL($img.attr("src")!, url); - const src = imageUrl.toString(); - - const filename = last(src.split("/")); - const dest = `${imageDestination}/${filename}`; - - $img.attr("src", dest); - - if (isReleaseNotes) { - // Release notes links should point to the current version - $img.attr("src", dest.replace(/[0-9].*\//, "")); - } - - images.push({ src, dest: dest }); - }); - - // remove permalink links - $main.find('a[title="Permalink to this headline"]').remove(); - $main.find('a[title="Permalink to this heading"]').remove(); - $main.find('a[title="Permalink to this definition"]').remove(); - $main.find('a[title="Link to this heading"]').remove(); - $main.find('a[title="Link to this definition"]').remove(); - - // remove download source code - $main.find("p > a.reference.download.internal").closest("p").remove(); - - // handle tabs, use heading for the summary and remove the blockquote - $main.find(".sd-summary-title").each((_, quote) => { - const $quote = $page(quote); - $quote.replaceWith(`

${$quote.html()}

`); - }); - - $main.find(".sd-card-body blockquote").each((_, quote) => { - const $quote = $page(quote); - $quote.replaceWith($quote.children()); - }); - - // add language class to code blocks - $main.find("pre").each((_, pre) => { - const $pre = $page(pre); - $pre.replaceWith( - `
${$pre.html()}
`, - ); - }); - - // replace source links - $main.find("a").each((_, a) => { - const $a = $page(a); - const href = $a.attr("href"); - if (href?.startsWith("http:")) return; - if (href?.includes(`/_modules/`)) { - //_modules/qiskit_ibm_runtime/ibm_backend - const match = href?.match(/_modules\/(.*?)(#|$)/); - if (match) { - const newHref = `${baseSourceUrl ?? ""}${match[1]}.py`; - $a.attr("href", newHref); - } - } - }); + const $ = load(html); + const $main = $(`[role='main']`); + const images = loadImages($, $main, url, imageDestination, isReleaseNotes); - // use titles for method and attribute headers - $main.find(".rubric").each((_, el) => { - const $el = $page(el); - $el.replaceWith(`

${$el.html()}

`); - }); - - // delete colons - $main.find(".colon").remove(); - - // translate type headings to titles - function findByText(selector: string, text: string) { - return $main - .find(selector) - .filter((i, el) => $page(el).text().trim() === text); - } - - $main - .find("dl.field-list.simple") - .toArray() - .map((dl) => { - const $dl = $page(dl); - - $dl - .find("dt") - .toArray() - .forEach((dt) => { - const $dt = $page(dt); - $dt.replaceWith(`${$dt.html()}`); - }); - - $dl - .find("dd") - .toArray() - .forEach((dd) => { - const $dd = $page(dd); - $dd.replaceWith(`
${$dd.html()}
`); - }); - - $dl.replaceWith(`
${$dl.html()}
`); - }); + preprocessHtml($, $main, baseSourceUrl, isReleaseNotes, releaseNotesTitle); let continueMapMembers = true; while (continueMapMembers) { @@ -192,12 +72,12 @@ export async function sphinxHtmlToMarkdown(options: { continue; } - const $dl = $page(dl); + const $dl = $(dl); const replacement = $dl .children() .toArray() .map((child) => { - const $child = $page(child); + const $child = $(child); $child.find(".viewcode-link").closest("a").remove(); const id = $dl.find("dt").attr("id") || ""; @@ -207,7 +87,7 @@ export async function sphinxHtmlToMarkdown(options: { meta.python_api_name = id; } - findByText("em.property", "class").remove(); + findByText($, $main, "em.property", "class").remove(); return `

${$child.html()}

`; } else if (child.name === "dt" && $dl.hasClass("property")) { if (!meta.python_api_type) { @@ -219,7 +99,7 @@ export async function sphinxHtmlToMarkdown(options: { } } - findByText("em.property", "property").remove(); + findByText($, $main, "em.property", "property").remove(); const signature = $child.find("em").text()?.replace(/^:\s+/, ""); if (signature.trim().length === 0) return; return `

${signature}

`; @@ -233,13 +113,13 @@ export async function sphinxHtmlToMarkdown(options: { } else { // Inline methods if (id) { - $page( - `

${getLastPartFromFullIdentifier(id)}

`, - ).insertBefore($dl); + $(`

${getLastPartFromFullIdentifier(id)}

`).insertBefore( + $dl, + ); } } - findByText("em.property", "method").remove(); + findByText($, $main, "em.property", "method").remove(); return `

${$child.html()}

`; } else if (child.name === "dt" && $dl.hasClass("attribute")) { if (!meta.python_api_type) { @@ -250,7 +130,7 @@ export async function sphinxHtmlToMarkdown(options: { $dl.siblings("h1").text(getLastPartFromFullIdentifier(id)); } - findByText("em.property", "attribute").remove(); + findByText($, $main, "em.property", "attribute").remove(); const signature = $child.find("em").text()?.replace(/^:\s+/, ""); if (signature.trim().length === 0) return; return `

${signature}

`; @@ -289,7 +169,7 @@ export async function sphinxHtmlToMarkdown(options: { meta.python_api_type = "function"; meta.python_api_name = id; } - findByText("em.property", "function").remove(); + findByText($, $main, "em.property", "function").remove(); return `

${$child.html()}

`; } else if (child.name === "dt" && $dl.hasClass("exception")) { if (!meta.python_api_type) { @@ -297,7 +177,7 @@ export async function sphinxHtmlToMarkdown(options: { meta.python_api_name = id; } - findByText("em.property", "exception").remove(); + findByText($, $main, "em.property", "exception").remove(); return `

${$child.html()}

`; } @@ -313,7 +193,7 @@ export async function sphinxHtmlToMarkdown(options: { .find("div.math") .toArray() .map((el) => { - const $el = $page(el); + const $el = $(el); $el.replaceWith(`
${$el.html()}
`); }); @@ -322,7 +202,7 @@ export async function sphinxHtmlToMarkdown(options: { const moduleIdWithPrefix = $main .find("span, section") .toArray() - .map((el) => $page(el).attr("id")) + .map((el) => $(el).attr("id")) .find((id) => id?.startsWith(modulePrefix)); if (moduleIdWithPrefix) { meta.python_api_type = "module"; @@ -335,8 +215,8 @@ export async function sphinxHtmlToMarkdown(options: { .find("h1,h2") .toArray() .forEach((el) => { - const $el = $page(el); - const $a = $page($el.find("a")); + const $el = $(el); + const $a = $($el.find("a")); const signature = $a.text(); $a.remove(); @@ -351,8 +231,148 @@ export async function sphinxHtmlToMarkdown(options: { } // convert to markdown - const mainHtml = main.html()!; + const markdown = await generateMarkdownFile($main.html()!, meta); + + return { markdown, meta, images, isReleaseNotes }; +} + +function loadImages( + $: CheerioAPI, + $main: Cheerio, + url: string, + imageDestination: string, + isReleaseNotes: boolean, +): Array<{ src: string; dest: string }> { + const images: Array<{ src: string; dest: string }> = []; + $main + .find("img") + .toArray() + .forEach((img) => { + const $img = $(img); + + const imageUrl = new URL($img.attr("src")!, url); + const src = imageUrl.toString(); + const filename = last(src.split("/")); + const dest = `${imageDestination}/${filename}`; + + $img.attr("src", dest); + + if (isReleaseNotes) { + // Release notes links should point to the current version + $img.attr("src", dest.replace(/[0-9].*\//, "")); + } + + images.push({ src, dest: dest }); + }); + + return images; +} + +function preprocessHtml( + $: CheerioAPI, + $main: Cheerio, + baseSourceUrl: string | undefined, + isReleaseNotes: boolean, + releaseNotesTitle: string | undefined, +): void { + // remove html extensions in relative links + $main.find("a").each((_, link) => { + const $link = $(link); + const href = $link.attr("href"); + if (href && !href.startsWith("http")) { + $link.attr("href", href.replaceAll(".html", 
"")); + } + }); + + // Custom heading for release notes + if (isReleaseNotes && releaseNotesTitle) { + $("h1").html(releaseNotesTitle); + } + + // remove permalink links + $main.find('a[title="Permalink to this headline"]').remove(); + $main.find('a[title="Permalink to this heading"]').remove(); + $main.find('a[title="Permalink to this definition"]').remove(); + $main.find('a[title="Link to this heading"]').remove(); + $main.find('a[title="Link to this definition"]').remove(); + + // remove download source code + $main.find("p > a.reference.download.internal").closest("p").remove(); + + // handle tabs, use heading for the summary and remove the blockquote + $main.find(".sd-summary-title").each((_, quote) => { + const $quote = $(quote); + $quote.replaceWith(`

${$quote.html()}

`); + }); + + $main.find(".sd-card-body blockquote").each((_, quote) => { + const $quote = $(quote); + $quote.replaceWith($quote.children()); + }); + + // add language class to code blocks + $main.find("pre").each((_, pre) => { + const $pre = $(pre); + $pre.replaceWith( + `
${$pre.html()}
`, + ); + }); + + // replace source links + $main.find("a").each((_, a) => { + const $a = $(a); + const href = $a.attr("href"); + if (href?.startsWith("http:")) return; + if (href?.includes(`/_modules/`)) { + //_modules/qiskit_ibm_runtime/ibm_backend + const match = href?.match(/_modules\/(.*?)(#|$)/); + if (match) { + const newHref = `${baseSourceUrl ?? ""}${match[1]}.py`; + $a.attr("href", newHref); + } + } + }); + + // use titles for method and attribute headers + $main.find(".rubric").each((_, el) => { + const $el = $(el); + $el.replaceWith(`

${$el.html()}

`); + }); + + // delete colons + $main.find(".colon").remove(); + + $main + .find("dl.field-list.simple") + .toArray() + .map((dl) => { + const $dl = $(dl); + + $dl + .find("dt") + .toArray() + .forEach((dt) => { + const $dt = $(dt); + $dt.replaceWith(`${$dt.html()}`); + }); + + $dl + .find("dd") + .toArray() + .forEach((dd) => { + const $dd = $(dd); + $dd.replaceWith(`
${$dd.html()}
`); + }); + + $dl.replaceWith(`
${$dl.html()}
`); + }); +} + +async function generateMarkdownFile( + mainHtml: string, + meta: PythonObjectMeta, +): Promise { const handlers: Record = { br(h, node: any) { return all(h, node); @@ -483,63 +503,55 @@ export async function sphinxHtmlToMarkdown(options: { handlers, }) .use(remarkStringify, remarkStringifyOptions) - .use(() => { - return (root: Root) => { - // merge contiguous emphasis - visit(root, "emphasis", (node, index, parent) => { - if (index === null || parent === null) return; - let nextIndex = index + 1; - while (parent.children[nextIndex]?.type === "emphasis") { - node.children.push( - ...((parent.children[nextIndex] as any).children ?? []), - ); - nextIndex++; - } - parent.children.splice(index + 1, nextIndex - (index + 1)); - }); + .use(() => (root: Root) => { + // merge contiguous emphasis + visit(root, "emphasis", (node, index, parent) => { + if (index === null || parent === null) return; + let nextIndex = index + 1; + while (parent.children[nextIndex]?.type === "emphasis") { + node.children.push( + ...((parent.children[nextIndex] as any).children ?? 
[]), + ); + nextIndex++; + } + parent.children.splice(index + 1, nextIndex - (index + 1)); // remove initial and trailing spaces from emphasis - visit(root, "emphasis", (node, index, parent) => { - if (index === null || parent === null) return; - const firstChild = first(node.children); - if (firstChild?.type === "text") { - const match = firstChild.value.match(/^\s+/); - if (match) { - if (match[0] === firstChild.value) { - node.children = tail(node.children); - } else { - firstChild.value = removePrefix(firstChild.value, match[0]); - } - parent.children.splice(index, 0, { - type: "text", - value: match[0], - }); + const firstChild = first(node.children); + if (firstChild?.type === "text") { + const match = firstChild.value.match(/^\s+/); + if (match) { + if (match[0] === firstChild.value) { + node.children = tail(node.children); + } else { + firstChild.value = removePrefix(firstChild.value, match[0]); } + parent.children.splice(index, 0, { + type: "text", + value: match[0], + }); } - const lastChild = last(node.children); - if (lastChild?.type === "text") { - const match = lastChild.value.match(/\s+$/); - if (match) { - if (match[0] === lastChild.value) { - node.children = initial(node.children); - } else { - lastChild.value = removeSuffix(lastChild.value, match[0]); - } - parent.children.splice(index + 1, 0, { - type: "text", - value: match[0], - }); + } + const lastChild = last(node.children); + if (lastChild?.type === "text") { + const match = lastChild.value.match(/\s+$/); + if (match) { + if (match[0] === lastChild.value) { + node.children = initial(node.children); + } else { + lastChild.value = removeSuffix(lastChild.value, match[0]); } + parent.children.splice(index + 1, 0, { + type: "text", + value: match[0], + }); } - }); - }; + } + }); }) .process(mainHtml); - let markdown = mdFile.toString(); - markdown = markdown.replaceAll(``, ""); - - return { markdown, meta, images, isReleaseNotes }; + return mdFile.toString().replaceAll(``, ""); } function 
buildAdmonition(options: { @@ -581,3 +593,33 @@ function buildSpanId(id: string): MdxJsxFlowElement { children: [], }; } + +/** + * Find the element that both matches the `selector` and whose content is the same as `text` + */ +function findByText( + $: CheerioAPI, + $main: Cheerio, + selector: string, + text: string, +): Cheerio { + return $main.find(selector).filter((i, el) => $(el).text().trim() === text); +} + +function getPythonApiType($dl: Cheerio): PythonApiType | undefined { + for (const className of [ + "function", + "class", + "exception", + "method", + "property", + "attribute", + "module", + ]) { + if ($dl.hasClass(className)) { + return className as PythonApiType; + } + } + + return undefined; +}