diff --git a/src/Downloader.ts b/src/Downloader.ts
index 21bf36dd..80dc95d3 100644
--- a/src/Downloader.ts
+++ b/src/Downloader.ts
@@ -1,6 +1,8 @@
-import deepmerge from 'deepmerge'
 import * as backoff from 'backoff'
 import { config } from './config.js'
+import { contains } from './util/index.js'
+import deepmerge from 'deepmerge'
+import * as domino from 'domino'
 import { default as imagemin } from 'imagemin'
 import imageminAdvPng from 'imagemin-advpng'
 import type { BackoffStrategy } from 'backoff'
@@ -293,10 +295,13 @@ class Downloader {
   }
 
   public async getArticle(
+    webp: boolean,
+    _moduleDependencies: any,
     articleId: string,
     articleDetailXId: RKVS,
     articleRenderer,
     articleUrl,
+    dump,
     articleDetail?: ArticleDetail,
     isMainPage?: boolean,
   ): Promise {
@@ -309,10 +314,13 @@ class Downloader {
     }
 
     return articleRenderer.render({
       data,
+      webp,
+      _moduleDependencies,
       articleId,
       articleDetailXId,
       articleDetail,
       isMainPage,
+      dump,
     })
   }
@@ -450,7 +458,7 @@ class Downloader {
           .buffer(resp.data, imageminOptions.get('webp').get(resp.headers['content-type']))
           .catch(async (err) => {
             if (/Unsupported color conversion request/.test(err.stderr)) {
-              return await (imagemin as any)
+              return (imagemin as any)
                 .buffer(await sharp(resp.data).toColorspace('srgb').toBuffer(), imageminOptions.get('webp').get(resp.headers['content-type']))
                 .catch(() => {
                   return resp.data
@@ -460,7 +468,7 @@ class Downloader {
         return data
       })
     } else {
-      return await (imagemin as any).buffer(resp.data, imageminOptions.get('default').get(resp.headers['content-type'])).catch(() => {
+      return (imagemin as any).buffer(resp.data, imageminOptions.get('default').get(resp.headers['content-type'])).catch(() => {
         return resp.data
       })
     }
@@ -600,6 +608,63 @@ class Downloader {
     call.on('backoff', this.backoffOptions.backoffHandler)
     call.start()
   }
+
+  public async getModuleDependencies(title: string) {
+    const genericJsModules = config.output.mw.js
+    const genericCssModules = config.output.mw.css
+    /* These vars will store the list of js and css dependencies for
+       the article we are downloading. */
+    let jsConfigVars = ''
+    let jsDependenciesList: string[] = []
+    let styleDependenciesList: string[] = []
+
+    const apiUrlDirector = new ApiURLDirector(MediaWiki.apiUrl.href)
+
+    const articleApiUrl = apiUrlDirector.buildArticleApiURL(title)
+
+    const articleData = await this.getJSON(articleApiUrl)
+
+    if (articleData.error) {
+      const errorMessage = `Unable to retrieve js/css dependencies for article '${title}': ${articleData.error.code}`
+      logger.error(errorMessage)
+
+      /* If article is missing (for example because it just has been deleted) */
+      if (articleData.error.code === 'missingtitle') {
+        return { jsConfigVars, jsDependenciesList, styleDependenciesList }
+      }
+
+      /* Something went wrong in modules retrieval at app level (no HTTP error) */
+      throw new Error(errorMessage)
+    }
+
+    const {
+      parse: { modules, modulescripts, modulestyles, headhtml },
+    } = articleData
+    jsDependenciesList = genericJsModules.concat(modules, modulescripts).filter((a) => a)
+    styleDependenciesList = [].concat(modules, modulestyles, genericCssModules).filter((a) => a)
+    styleDependenciesList = styleDependenciesList.filter((oneStyleDep) => !contains(config.filters.blackListCssModules, oneStyleDep))
+
+    logger.info(`Js dependencies of ${title} : ${jsDependenciesList}`)
+    logger.info(`Css dependencies of ${title} : ${styleDependenciesList}`)
+
+    // Saving, as a js module, the jsconfigvars that are set in the header of a wikipedia page
+    // the script below extracts the config with a regex executed on the page header returned from the api
+    const scriptTags = domino.createDocument(`${headhtml['*']}`).getElementsByTagName('script')
+    const regex = /mw\.config\.set\(\{.*?\}\);/gm
+    // eslint-disable-next-line @typescript-eslint/prefer-for-of
+    for (let i = 0; i < scriptTags.length; i += 1) {
+      if (scriptTags[i].text.includes('mw.config.set')) {
+        jsConfigVars = regex.exec(scriptTags[i].text)[0] || ''
+        jsConfigVars = `(window.RLQ=window.RLQ||[]).push(function() {${jsConfigVars}});`
+      } else if (scriptTags[i].text.includes('RLCONF') || scriptTags[i].text.includes('RLSTATE') || scriptTags[i].text.includes('RLPAGEMODULES')) {
+        jsConfigVars = scriptTags[i].text
+      }
+    }
+
+    jsConfigVars = jsConfigVars.replace('nosuchaction', 'view') // to replace the wgAction config that is set to 'nosuchaction' from api but should be 'view'
+
+    return { jsConfigVars, jsDependenciesList, styleDependenciesList }
+  }
 }
 
 export default Downloader
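Reviewer note: a minimal sketch of how the reworked `getArticle` signature and the new `getModuleDependencies` helper are meant to be wired together. Only the argument order of `getArticle` and the shape returned by `getModuleDependencies` come from this diff; the wrapper function, its loose `any` types and the hard-coded `webp`/`isMainPage` values are illustrative assumptions.

```ts
import Downloader from './Downloader.js'
import RedisStore from './RedisStore.js'

// Hypothetical call site (not part of this changeset).
async function renderArticle(downloader: Downloader, articleRenderer: any, dump: any, articleId: string, articleUrl: string, articleDetail: any): Promise<any> {
  // Fetch the js/css module dependencies once per article:
  // { jsConfigVars, jsDependenciesList, styleDependenciesList }
  const _moduleDependencies = await downloader.getModuleDependencies(articleDetail.title)

  // Hand them, together with the webp flag and the dump, to the new getArticle signature.
  return downloader.getArticle(
    true, // webp
    _moduleDependencies,
    articleId,
    RedisStore.articleDetailXId,
    articleRenderer,
    articleUrl,
    dump,
    articleDetail,
    false, // isMainPage
  )
}
```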
diff --git a/src/RedisStore.ts b/src/RedisStore.ts
index 8206e27a..027d80af 100644
--- a/src/RedisStore.ts
+++ b/src/RedisStore.ts
@@ -4,57 +4,89 @@ import RedisKvs from './util/RedisKvs.js'
 import * as logger from './Logger.js'
 
 class RedisStore implements RS {
-  private readonly _client: RedisClientType
-  private storesReady: boolean
-
-  private _filesToDownloadXPath: RKVS
-  private _filesToRetryXPath: RKVS
-  private _articleDetailXId: RKVS
-  private _redirectsXId: RKVS
-
-  constructor(redisPath: string, opts?: any) {
-    const options = { ...opts }
-    const quitOnError = !(options.quitOnError === false)
-    delete options.quitOnError
-
-    if (redisPath.startsWith('/') || redisPath.startsWith('./')) {
-      options.socket = {
-        ...options.socket,
-        path: redisPath,
-      }
-    } else {
-      options.url = redisPath
-    }
+  private static instance: RedisStore
 
-    this._client = createClient(options)
+  #client: RedisClientType
+  #storesReady: boolean
+  #filesToDownloadXPath: RKVS
+  #filesToRetryXPath: RKVS
+  #articleDetailXId: RKVS
+  #redirectsXId: RKVS
+
+  public get client() {
+    return this.#client
+  }
+
+  public get filesToDownloadXPath(): RKVS {
+    return this.#filesToDownloadXPath
+  }
+
+  public get filesToRetryXPath(): RKVS {
+    return this.#filesToRetryXPath
+  }
+
+  public get articleDetailXId(): RKVS {
+    return this.#articleDetailXId
+  }
+
+  public get redirectsXId(): RKVS {
+    return this.#redirectsXId
+  }
+
+  public static getInstance(): RedisStore {
+    if (!RedisStore.instance) {
+      RedisStore.instance = new RedisStore()
+    }
+    return RedisStore.instance
+  }
 
-    this._client.on('error', (err) => {
-      if (quitOnError) {
-        logger.error('Redis Client Error', err)
-        process.exit(3)
+  public setOptions(redisPath: string, opts?: any): void {
+    if (RedisStore.instance) {
+      const options = { ...opts }
+      const quitOnError = !(options.quitOnError === false)
+      delete options.quitOnError
+
+      if (redisPath.startsWith('/') || redisPath.startsWith('./')) {
+        options.socket = {
+          ...options.socket,
+          path: redisPath,
+        }
+      } else {
+        options.url = redisPath
       }
-    })
+
+      this.#client = createClient(options)
+
+      this.#client.on('error', (err) => {
+        if (quitOnError) {
+          logger.error('Redis Client Error', err)
+          process.exit(3)
+        }
+      })
+    } else {
+      throw new Error('Redis store has not been instantiated before setting options')
+    }
   }
 
   public async connect(populateStores = true) {
-    if (this._client.isOpen) {
+    if (this.#client.isOpen) {
       return
     }
 
-    await this._client.connect()
+    await this.#client.connect()
 
     if (populateStores) {
       await this.checkForExistingStores()
       await this.populateStores()
-      this.storesReady = true
+      this.#storesReady = true
     }
   }
 
   public async close() {
-    if (this._client.isReady && this.storesReady) {
+    if (this.#client.isReady && this.#storesReady) {
       logger.log('Flushing Redis DBs')
-      await Promise.all([this._filesToDownloadXPath.flush(), this._filesToRetryXPath.flush(), this._articleDetailXId.flush(), this._redirectsXId.flush()])
+      await Promise.all([this.#filesToDownloadXPath.flush(), this.#filesToRetryXPath.flush(), this.#articleDetailXId.flush(), this.#redirectsXId.flush()])
     }
-    if (this._client.isOpen) {
-      await this._client.quit()
+    if (this.#client.isOpen) {
+      await this.#client.quit()
     }
   }
 
@@ -62,12 +94,12 @@ class RedisStore implements RS {
     const patterns = ['*-media', '*-media-retry', '*-detail', '*-redirect']
     let keys: string[] = []
     for (const pattern of patterns) {
-      keys = keys.concat(await this._client.keys(pattern))
+      keys = keys.concat(await this.#client.keys(pattern))
     }
 
     keys.forEach(async (key) => {
       try {
-        const length = await this._client.hLen(key)
+        const length = await this.#client.hLen(key)
         const time = new Date(Number(key.slice(0, key.indexOf('-'))))
         logger.error(`Found store from previous run from ${time} that is still in redis: ${key} with length ${length}`)
       } catch {
@@ -77,19 +109,19 @@ class RedisStore implements RS {
   }
 
   private async populateStores() {
-    this._filesToDownloadXPath = new RedisKvs(this._client, `${Date.now()}-media`, {
+    this.#filesToDownloadXPath = new RedisKvs(this.#client, `${Date.now()}-media`, {
       u: 'url',
       n: 'namespace',
       m: 'mult',
       w: 'width',
     })
-    this._filesToRetryXPath = new RedisKvs(this._client, `${Date.now()}-media-retry`, {
+    this.#filesToRetryXPath = new RedisKvs(this.#client, `${Date.now()}-media-retry`, {
       u: 'url',
       n: 'namespace',
       m: 'mult',
       w: 'width',
     })
-    this._articleDetailXId = new RedisKvs(this._client, `${Date.now()}-detail`, {
+    this.#articleDetailXId = new RedisKvs(this.#client, `${Date.now()}-detail`, {
       s: 'subCategories',
       c: 'categories',
       p: 'pages',
@@ -101,35 +133,16 @@ class RedisStore implements RS {
       m: 'missing',
       n: 'title',
     })
-    this._redirectsXId = new RedisKvs(this._client, `${Date.now()}-redirect`, {
+    this.#redirectsXId = new RedisKvs(this.#client, `${Date.now()}-redirect`, {
       t: 'targetId',
       n: 'title',
     })
   }
 
   public createRedisKvs(...args: [string, KVS?]): RKVS {
-    return new RedisKvs(this._client, ...args)
-  }
-
-  public get client() {
-    return this._client
-  }
-
-  public get filesToDownloadXPath(): RKVS {
-    return this._filesToDownloadXPath
-  }
-
-  public get filesToRetryXPath(): RKVS {
-    return this._filesToRetryXPath
-  }
-
-  public get articleDetailXId(): RKVS {
-    return this._articleDetailXId
-  }
-
-  public get redirectsXId(): RKVS {
-    return this._redirectsXId
+    return new RedisKvs(this.#client, ...args)
   }
 }
 
-export default RedisStore
+const rs = RedisStore.getInstance()
+export default rs as RedisStore
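Reviewer note: `RedisStore` is no longer instantiated by callers; it is a lazily created singleton that is configured once and shared. A short usage sketch mirroring the call sites updated in `mwoffliner.lib.ts` below (the Redis URL is a placeholder):

```ts
import RedisStore from './RedisStore.js'

// Configure the shared instance once, then connect (connect() also populates the KVS stores by default).
// quitOnError defaults to true; pass { quitOnError: false } to keep the process alive on client errors.
RedisStore.setOptions('redis://localhost:6379')
await RedisStore.connect()

// The key/value stores are now exposed as getters on the singleton.
const { articleDetailXId, filesToDownloadXPath, filesToRetryXPath, redirectsXId } = RedisStore

// ... scraping work ...

// Flushes the stores (when they were populated) and closes the client.
await RedisStore.close()
```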
diff --git a/src/mwoffliner.lib.ts b/src/mwoffliner.lib.ts
index 9e956304..e6e0a19a 100644
--- a/src/mwoffliner.lib.ts
+++ b/src/mwoffliner.lib.ts
@@ -215,9 +215,9 @@ async function execute(argv: any) {
 
   await downloader.setBaseUrls()
 
-  const redisStore = new RedisStore(argv.redis || config.defaults.redisPath)
-  await redisStore.connect()
-  const { articleDetailXId, filesToDownloadXPath, filesToRetryXPath, redirectsXId } = redisStore
+  RedisStore.setOptions(argv.redis || config.defaults.redisPath)
+  await RedisStore.connect()
+  const { articleDetailXId, filesToDownloadXPath, filesToRetryXPath, redirectsXId } = RedisStore
 
   // Output directory
   const outputDirectory = path.isAbsolute(_outputDirectory || '') ? _outputDirectory : path.join(process.cwd(), _outputDirectory || 'out')
@@ -236,12 +236,12 @@ async function execute(argv: any) {
 
   process.on('SIGTERM', async () => {
     logger.log('SIGTERM')
-    await redisStore.close()
+    await RedisStore.close()
    process.exit(128 + 15)
   })
   process.on('SIGINT', async () => {
     logger.log('SIGINT')
-    await redisStore.close()
+    await RedisStore.close()
     process.exit(128 + 2)
   })
@@ -290,13 +290,13 @@ async function execute(argv: any) {
 
   logger.info('Getting article ids')
   let stime = Date.now()
-  await getArticleIds(downloader, redisStore, mainPage, articleList ? articleListLines : null, articleListToIgnore ? articleListToIgnoreLines : null)
+  await getArticleIds(downloader, mainPage, articleList ? articleListLines : null, articleListToIgnore ? articleListToIgnoreLines : null)
   logger.log(`Got ArticleIDs in ${(Date.now() - stime) / 1000} seconds`)
 
   if (MediaWiki.getCategories) {
-    await getCategoriesForArticles(articleDetailXId, downloader, redisStore)
+    await getCategoriesForArticles(articleDetailXId, downloader)
 
-    while ((await trimUnmirroredPages(downloader, redisStore)) > 0) {
+    while ((await trimUnmirroredPages(downloader)) > 0) {
       // Remove unmirrored pages, categories, subCategories
      // trimUnmirroredPages returns number of modified articles
     }
@@ -406,7 +406,7 @@ async function execute(argv: any) {
   logger.log(`Found [${stylesheetsToGet.length}] stylesheets to download`)
 
   logger.log('Downloading stylesheets and populating media queue')
-  const { finalCss } = await getAndProcessStylesheets(downloader, redisStore, stylesheetsToGet)
+  const { finalCss } = await getAndProcessStylesheets(downloader, stylesheetsToGet)
   logger.log('Downloaded stylesheets')
 
   const article = new ZimArticle({ url: `${config.output.dirs.mediawiki}/style.css`, data: finalCss, ns: '-' })
@@ -420,7 +420,7 @@ async function execute(argv: any) {
 
   logger.log('Getting articles')
   stime = Date.now()
-  const { jsModuleDependencies, cssModuleDependencies } = await saveArticles(zimCreator, downloader, redisStore, dump)
+  const { jsModuleDependencies, cssModuleDependencies } = await saveArticles(zimCreator, downloader, dump)
   logger.log(`Fetching Articles finished in ${(Date.now() - stime) / 1000} seconds`)
 
   logger.log(`Found [${jsModuleDependencies.size}] js module dependencies`)
@@ -641,7 +641,7 @@ async function execute(argv: any) {
   }
 
   MediaWiki.reset()
-  redisStore.close()
+  RedisStore.close()
 
   return dumps
 }
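Reviewer note: the new `abstract.renderer.ts` below also introduces a small discriminated union for selecting a renderer. A type-level sketch of what it accepts and rejects (variable names are illustrative):

```ts
import type { RendererBuilderOptions } from './renderers/abstract.renderer.js'

// Accepted: a generic render type with no renderer name.
const auto: RendererBuilderOptions = { renderType: 'auto' }

// Accepted: forcing one concrete renderer by name.
const forced: RendererBuilderOptions = { renderType: 'specific', renderName: 'WikimediaMobile' }

// Rejected by the compiler: renderName is typed `never` outside the 'specific' branch.
// const invalid: RendererBuilderOptions = { renderType: 'desktop', renderName: 'VisualEditor' }
```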
diff --git a/src/renderers/abstract.renderer.ts b/src/renderers/abstract.renderer.ts
new file mode 100644
index 00000000..d85d879c
--- /dev/null
+++ b/src/renderers/abstract.renderer.ts
@@ -0,0 +1,757 @@
+import * as domino from 'domino'
+import * as logger from '../Logger.js'
+import * as QueryStringParser from 'querystring'
+import htmlMinifier from 'html-minifier'
+import MediaWiki from '../MediaWiki.js'
+import RedisStore from '../RedisStore.js'
+import DOMUtils from '../DOMUtils.js'
+import DU from '../DOMUtils.js'
+import { config } from '../config.js'
+import { Dump } from '../Dump.js'
+import { rewriteUrlsOfDoc } from '../util/rewriteUrls.js'
+import { footerTemplate, htmlTemplateCode } from '../Templates.js'
+import {
+  getFullUrl,
+  getMediaBase,
+  getMimeType,
+  getRelativeFilePath,
+  isWebpCandidateImageMimeType,
+  interpolateTranslationString,
+  genCanonicalLink,
+  genHeaderScript,
+  genHeaderCSSLink,
+  encodeArticleIdForZimHtmlUrl,
+} from '../util/misc.js'
+
+type renderType = 'auto' | 'desktop' | 'mobile' | 'specific'
+type renderName = 'VisualEditor' | 'WikimediaDesktop' | 'WikimediaMobile'
+
+interface RendererBuilderOptionsBase {
+  renderType: renderType
+}
+
+interface RendererBuilderOptionsCommon {
+  renderType: renderType
+  renderName?: never
+}
+
+interface RendererBuilderOptionsSpecific extends RendererBuilderOptionsBase {
+  renderType: 'specific'
+  renderName: renderName
+}
+
+export type RendererBuilderOptions = RendererBuilderOptionsCommon | RendererBuilderOptionsSpecific
+
+export interface RenderOpts {
+  data?: any
+  webp: boolean
+  _moduleDependencies: any
+  articleId?: string
+  articleDetailXId?: RKVS
+  articleDetail?: ArticleDetail
+  isMainPage?: boolean
+  dump: Dump
+}
+
+export interface RenderSingleOutput {
+  articleId: string
+  displayTitle: string
+  html: string
+  mediaDependencies: any
+  subtitles: any
+}
+
+export type RenderOutput = RenderSingleOutput[]
+
+export abstract class Renderer {
+  protected async treatVideo(
+    dump: Dump,
+    srcCache: KVS,
+    articleId: string,
+    videoEl: DominoElement,
+    webp: boolean,
+  ): Promise<{ mediaDependencies: string[]; subtitles: string[] }> {
+    const mediaDependencies: string[] = []
+    const subtitles: string[] = []
+
+    if (dump.nopic || dump.novid || dump.nodet) {
+      DOMUtils.deleteNode(videoEl)
+      return { mediaDependencies, subtitles }
+    }
+
+    this.adjustVideoElementAttributes(videoEl)
+
+    const chosenVideoSourceEl = this.chooseBestVideoSource(videoEl)
+
+    if (!chosenVideoSourceEl) {
+      logger.warn(`Unable to find an appropriate video/audio source for a media element in article '${articleId}'`)
+      DOMUtils.deleteNode(videoEl)
+      return { mediaDependencies, subtitles }
+    }
+
+    this.handleVideoPoster(videoEl, articleId, webp, mediaDependencies, srcCache)
+    this.updateVideoSrc(chosenVideoSourceEl, articleId, srcCache, mediaDependencies)
+
+    const trackElements = Array.from(videoEl.querySelectorAll('track'))
+    for (const track of trackElements) {
+      subtitles.push(await this.treatSubtitle(track, articleId))
+    }
+
+    return { mediaDependencies, subtitles }
+  }
+
+  private adjustVideoElementAttributes(videoEl: DominoElement): void {
+    if (videoEl.getAttribute('height') && videoEl.getAttribute('height') < 40) {
+      videoEl.setAttribute('height', '40')
+    }
+    videoEl.setAttribute('controls', '40')
+  }
+
+  private chooseBestVideoSource(videoEl: DominoElement): DominoElement | null {
+    /* Choose the best fitting resolution video node */
+    const videoSourceEls: any[] = Array.from(videoEl.children).filter((child: any) => child.tagName === 'SOURCE')
+    const videoDisplayedWidth = Number(videoEl.getAttribute('width'))
+    let bestWidthDiff = 424242
+    let chosenVideoSourceEl: DominoElement
+    videoSourceEls.forEach((videoSourceEl: DominoElement) => {
+      // Ignore non-webm && non-audio sources
+      const videoSourceType = videoSourceEl.getAttribute('type')
+      if (!videoSourceEl.getAttribute('src').endsWith('.webm') && !videoSourceType.startsWith('audio')) {
+        DOMUtils.deleteNode(videoSourceEl)
+        return
+      }
+
+      // Handle audio content
+      if (videoSourceType.startsWith('audio/ogg')) {
+        chosenVideoSourceEl = videoSourceEl
+        return
+      } else if (videoSourceType.startsWith('audio')) {
+        DOMUtils.deleteNode(videoSourceEl)
+        return
+      }
+
+      // If undefined displayed width, then take the best resolution
+      const videoSourceElWidth = Number(videoSourceEl.getAttribute('data-file-width') || videoSourceEl.getAttribute('data-width') || 0)
+      if (!videoDisplayedWidth) {
+        const chosenVideoSourceElWidth = chosenVideoSourceEl ? chosenVideoSourceEl.getAttribute('data-file-width') || chosenVideoSourceEl.getAttribute('data-width') || 0 : 0
+        if (videoSourceElWidth > chosenVideoSourceElWidth || (videoSourceElWidth === chosenVideoSourceElWidth && videoSourceEl.getAttribute('src').endsWith('.vp9.webm'))) {
+          DOMUtils.deleteNode(chosenVideoSourceEl)
+          chosenVideoSourceEl = videoSourceEl
+          return
+        }
+      }
+
+      // Otherwise, choose the source with the better (smaller) width diff
+      else {
+        const widthDiff = Number(videoSourceElWidth - videoDisplayedWidth)
+
+        // If no source has been picked so far, just take this one
+        if (!chosenVideoSourceEl) {
+          chosenVideoSourceEl = videoSourceEl
+          bestWidthDiff = widthDiff
+          return
+        }
+
+        // Resolution of source is higher than displayed resolution
+        else if (widthDiff >= 0) {
+          if (bestWidthDiff < 0 || widthDiff < bestWidthDiff || (widthDiff === bestWidthDiff && videoSourceEl.getAttribute('src').endsWith('.vp9.webm'))) {
+            DOMUtils.deleteNode(chosenVideoSourceEl)
+            chosenVideoSourceEl = videoSourceEl
+            bestWidthDiff = widthDiff
+            return
+          }
+        }
+
+        // Resolution of source is smaller than displayed resolution
+        else {
+          if (widthDiff > bestWidthDiff || (widthDiff === bestWidthDiff && videoSourceEl.getAttribute('src').endsWith('.vp9.webm'))) {
+            DOMUtils.deleteNode(chosenVideoSourceEl)
+            chosenVideoSourceEl = videoSourceEl
+            bestWidthDiff = widthDiff
+            return
+          }
+        }
+      }
+
+      // Delete all other nodes
+      DOMUtils.deleteNode(videoSourceEl)
+    })
+
+    return chosenVideoSourceEl
+  }
+
+  private handleVideoPoster(videoEl: DominoElement, articleId: string, webp: boolean, mediaDependencies: string[], srcCache: KVS): void {
+    const posterUrl = videoEl.getAttribute('poster')
+    if (posterUrl) {
+      const videoPosterUrl = getFullUrl(posterUrl, MediaWiki.baseUrl)
+      const newVideoPosterUrl = getRelativeFilePath(articleId, getMediaBase(videoPosterUrl, true), 'I')
+
+      if (posterUrl) {
+        videoEl.setAttribute('poster', isWebpCandidateImageMimeType(webp, getMimeType(newVideoPosterUrl)) ? newVideoPosterUrl + '.webp' : newVideoPosterUrl)
+      }
+      videoEl.removeAttribute('resource')
+
+      if (!srcCache.hasOwnProperty(videoPosterUrl)) {
+        srcCache[videoPosterUrl] = true
+        mediaDependencies.push(videoPosterUrl)
+      }
+    }
+  }
+
+  private updateVideoSrc(chosenVideoSourceEl: DominoElement, articleId: string, srcCache: KVS, mediaDependencies: string[]): void {
+    /* Download content, but avoid duplicate calls */
+    const sourceUrl = getFullUrl(chosenVideoSourceEl.getAttribute('src'), MediaWiki.baseUrl)
+    if (!srcCache.hasOwnProperty(sourceUrl)) {
+      srcCache[sourceUrl] = true
+      mediaDependencies.push(sourceUrl)
+    }
+
+    /* Set new URL for the video element */
+    const fileBase = getMediaBase(sourceUrl, true)
+    chosenVideoSourceEl.setAttribute('src', getRelativeFilePath(articleId, fileBase, 'I'))
+  }
+
+  protected async treatSubtitle(trackEle: DominoElement, articleId: string): Promise {
+    const subtitleSourceUrl = getFullUrl(trackEle.getAttribute('src'), MediaWiki.baseUrl)
+    const { title, lang } = QueryStringParser.parse(subtitleSourceUrl) as { title: string; lang: string }
+    // The source URL we get from the MediaWiki article is in srt format, so we switch it to vtt, the standard subtitle format for the track src attribute.
+    const vttFormatUrl = new URL(subtitleSourceUrl)
+    vttFormatUrl.searchParams.set('trackformat', 'vtt')
+    trackEle.setAttribute('src', `${getRelativeFilePath(articleId, title, '-')}-${lang}.vtt`)
+    return vttFormatUrl.href
+  }
+
+  private treatImageFrames(dump: Dump, parsoidDoc: DominoElement, imageNode: DominoElement) {
+    const image = imageNode.getElementsByTagName('img')[0] || imageNode.getElementsByTagName('video')[0]
+
+    if (!this.shouldKeepNode(dump, imageNode, image)) {
+      DOMUtils.deleteNode(imageNode)
+      return
+    }
+
+    const descriptions = imageNode.getElementsByTagName('figcaption')
+    const description = descriptions.length > 0 ? descriptions[0] : undefined
+    const imageWidth = parseInt(image.getAttribute('width'), 10)
+
+    const thumbDiv = this.makeThumbDiv(dump, parsoidDoc, imageNode)
+
+    const thumbinnerDiv = parsoidDoc.createElement('div')
+    thumbinnerDiv.setAttribute('class', 'thumbinner')
+    thumbinnerDiv.setAttribute('style', `width:${imageWidth + 2}px`)
+
+    const thumbcaptionDiv = parsoidDoc.createElement('div')
+    thumbcaptionDiv.setAttribute('class', 'thumbcaption')
+    const autoAlign = dump.mwMetaData.textDir === 'ltr' ? 'left' : 'right'
+    thumbcaptionDiv.setAttribute('style', `text-align: ${autoAlign}`)
+    if (description) {
+      thumbcaptionDiv.innerHTML = description.innerHTML
+    }
+
+    thumbinnerDiv.appendChild(this.isStillLinked(image) ? image.parentNode : image)
+    thumbinnerDiv.appendChild(thumbcaptionDiv)
+    thumbDiv.appendChild(thumbinnerDiv)
+
+    imageNode.parentNode.replaceChild(thumbDiv, imageNode)
+  }
+
+  private async treatImage(dump: Dump, srcCache: KVS, articleId: string, img: DominoElement, webp: boolean): Promise<{ mediaDependencies: string[] }> {
+    const mediaDependencies: string[] = []
+
+    if (!this.shouldKeepImage(dump, img)) {
+      DOMUtils.deleteNode(img)
+      return { mediaDependencies }
+    }
+
+    /* Remove image link */
+    const linkNode = img.parentNode
+    if (linkNode.tagName === 'A') {
+      /* Check if the target is mirrored */
+      const href = linkNode.getAttribute('href') || ''
+      const title = MediaWiki.extractPageTitleFromHref(href)
+      const keepLink = title && (await RedisStore.articleDetailXId.exists(title))
+
+      /* Under certain conditions it seems possible to have
+       * parentNode == undefined; in that case it seems preferable
+       * to remove the whole link+content rather than keep a wrong
+       * link. See for example this url
+       * http://parsoid.wmflabs.org/ko/%EC%9D%B4%ED%9C%98%EC%86%8C */
+      if (!keepLink) {
+        if (linkNode.parentNode) {
+          linkNode.parentNode.replaceChild(img, linkNode)
+        } else {
+          DOMUtils.deleteNode(img)
+          return { mediaDependencies }
+        }
+      }
+    }
+
+    /* Rewrite image src attribute */
+    const src = getFullUrl(img.getAttribute('src'), MediaWiki.baseUrl)
+    let newSrc: string
+    try {
+      const resourceNamespace = 'I'
+      const slashesInUrl = articleId.split('/').length - 1
+      const upStr = '../'.repeat(slashesInUrl + 1)
+      newSrc = `${upStr}${resourceNamespace}/` + getMediaBase(src, true)
+      /* Download image, but avoid duplicate calls */
+      if (!srcCache.hasOwnProperty(src)) {
+        srcCache[src] = true
+        mediaDependencies.push(src)
+      }
+
+      /* Change image source attribute to point to the local image */
+      img.setAttribute('src', isWebpCandidateImageMimeType(webp, getMimeType(src)) ? newSrc + '.webp' : newSrc)
+
+      /* Remove useless 'resource' attribute */
+      img.removeAttribute('resource')
+
+      /* Remove srcset */
+      img.removeAttribute('srcset')
+    } catch (err) {
+      DOMUtils.deleteNode(img)
+    }
+
+    /* Add lazy loading */
+    img.setAttribute('loading', 'lazy')
+
+    return { mediaDependencies }
+  }
+
+  private shouldKeepImage(dump: Dump, img: DominoElement) {
+    const imageNodeClass = img.getAttribute('class') || ''
+    const src = img.getAttribute('src')
+    return (
+      (!dump.nopic || imageNodeClass.includes('mwe-math-fallback-image-inline') || img.getAttribute('typeof') === 'mw:Extension/math') &&
+      src &&
+      !src.includes('./Special:FilePath/')
+    )
+  }
+
+  protected async treatMedias(parsoidDoc: DominoElement, dump: Dump, articleId: string, webp: boolean) {
+    let mediaDependencies: string[] = []
+    let subtitles: string[] = []
+    /* Clean/rewrite image tags */
+    const imgs = Array.from(parsoidDoc.getElementsByTagName('img'))
+    const videos: DominoElement = Array.from(parsoidDoc.querySelectorAll('video, audio'))
+    const srcCache: KVS = {}
+
+    for (const videoEl of videos) {
+      //