Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

1881/modularization article treatment - Rearchitecture MWOffliner HTML/CSS/JS scraping (part #2) #1886

Merged
merged 9 commits into from
Sep 7, 2023
71 changes: 68 additions & 3 deletions src/Downloader.ts
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
import deepmerge from 'deepmerge'
import * as backoff from 'backoff'
import { config } from './config.js'
import { contains } from './util/index.js'
import deepmerge from 'deepmerge'
import * as domino from 'domino'
import { default as imagemin } from 'imagemin'
import imageminAdvPng from 'imagemin-advpng'
import type { BackoffStrategy } from 'backoff'
Expand Down Expand Up @@ -293,10 +295,13 @@
}

public async getArticle(
webp: boolean,
_moduleDependencies: any,
articleId: string,
articleDetailXId: RKVS<ArticleDetail>,
articleRenderer,
articleUrl,
dump,
articleDetail?: ArticleDetail,
isMainPage?: boolean,
): Promise<any> {
Expand All @@ -309,10 +314,13 @@

return articleRenderer.render({
data,
webp,
VadimKovalenkoSNF marked this conversation as resolved.
Show resolved Hide resolved
_moduleDependencies,
articleId,
articleDetailXId,
articleDetail,
isMainPage,
dump,
})
}

Expand Down Expand Up @@ -450,7 +458,7 @@
.buffer(resp.data, imageminOptions.get('webp').get(resp.headers['content-type']))
.catch(async (err) => {
if (/Unsupported color conversion request/.test(err.stderr)) {
return await (imagemin as any)
return (imagemin as any)

Check warning on line 461 in src/Downloader.ts

View check run for this annotation

Codecov / codecov/patch

src/Downloader.ts#L461

Added line #L461 was not covered by tests
.buffer(await sharp(resp.data).toColorspace('srgb').toBuffer(), imageminOptions.get('webp').get(resp.headers['content-type']))
.catch(() => {
return resp.data
Expand All @@ -460,7 +468,7 @@
return data
})
} else {
return await (imagemin as any).buffer(resp.data, imageminOptions.get('default').get(resp.headers['content-type'])).catch(() => {
return (imagemin as any).buffer(resp.data, imageminOptions.get('default').get(resp.headers['content-type'])).catch(() => {

Check warning on line 471 in src/Downloader.ts

View check run for this annotation

Codecov / codecov/patch

src/Downloader.ts#L471

Added line #L471 was not covered by tests
return resp.data
})
}
Expand Down Expand Up @@ -600,6 +608,63 @@
call.on('backoff', this.backoffOptions.backoffHandler)
call.start()
}

/**
 * Collects the JS and CSS module dependencies, plus the JS config vars,
 * of one article by querying the article API URL built for `title`.
 *
 * @param title - Title of the article whose dependencies are requested.
 * @returns An object holding `jsConfigVars` (a script source string, possibly
 *   empty), `jsDependenciesList` and `styleDependenciesList`. All three are
 *   empty when the API reports the article as `missingtitle`.
 * @throws Error when the API response carries an error other than `missingtitle`.
 */
public async getModuleDependencies(title: string) {
  const genericJsModules = config.output.mw.js
  const genericCssModules = config.output.mw.css
  /* These vars will store the list of js and css dependencies for
     the article we are downloading. */
  let jsConfigVars = ''
  let jsDependenciesList: string[] = []
  let styleDependenciesList: string[] = []

  const apiUrlDirector = new ApiURLDirector(MediaWiki.apiUrl.href)

  const articleApiUrl = apiUrlDirector.buildArticleApiURL(title)

  const articleData = await this.getJSON<any>(articleApiUrl)

  if (articleData.error) {
    // NOTE: per the Codecov annotations on this change, this error path is not covered by tests.
    const errorMessage = `Unable to retrieve js/css dependencies for article '${title}': ${articleData.error.code}`
    logger.error(errorMessage)

    /* If article is missing (for example because it just has been deleted) */
    if (articleData.error.code === 'missingtitle') {
      return { jsConfigVars, jsDependenciesList, styleDependenciesList }
    }

    /* Something went wrong in modules retrieval at app level (no HTTP error) */
    throw new Error(errorMessage)
  }

  const {
    parse: { modules, modulescripts, modulestyles, headhtml },
  } = articleData
  // `.filter((a) => a)` drops undefined/empty entries coming from absent API fields
  jsDependenciesList = genericJsModules.concat(modules, modulescripts).filter((a) => a)
  styleDependenciesList = [].concat(modules, modulestyles, genericCssModules).filter((a) => a)
  styleDependenciesList = styleDependenciesList.filter((oneStyleDep) => !contains(config.filters.blackListCssModules, oneStyleDep))

  logger.info(`Js dependencies of ${title} : ${jsDependenciesList}`)
  logger.info(`Css dependencies of ${title} : ${styleDependenciesList}`)

  // Saving, as a js module, the jsconfigvars that are set in the header of a wikipedia page
  // the script below extracts the config with a regex executed on the page header returned from the api
  const scriptTags = domino.createDocument(`${headhtml['*']}</body></html>`).getElementsByTagName('script')
  // Deliberately NOT global: a /g regex keeps `lastIndex` between exec() calls, so reusing it
  // on the next script tag could start scanning mid-string and miss a valid match.
  const regex = /mw\.config\.set\(\{.*?\}\);/m
  // eslint-disable-next-line @typescript-eslint/prefer-for-of
  for (let i = 0; i < scriptTags.length; i += 1) {
    const scriptText = scriptTags[i].text
    if (scriptText.includes('mw.config.set')) {
      // exec() returns null when the call does not fit the pattern; fall back to an
      // empty body instead of dereferencing null.
      const match = regex.exec(scriptText)
      const configSetCall = match ? match[0] : ''
      jsConfigVars = `(window.RLQ=window.RLQ||[]).push(function() {${configSetCall}});`
    } else if (scriptText.includes('RLCONF') || scriptText.includes('RLSTATE') || scriptText.includes('RLPAGEMODULES')) {
      jsConfigVars = scriptText
    }
  }

  jsConfigVars = jsConfigVars.replace('nosuchaction', 'view') // to replace the wgAction config that is set to 'nosuchaction' from api but should be 'view'

  return { jsConfigVars, jsDependenciesList, styleDependenciesList }
}
}

export default Downloader
137 changes: 75 additions & 62 deletions src/RedisStore.ts
Original file line number Diff line number Diff line change
Expand Up @@ -4,70 +4,102 @@
import * as logger from './Logger.js'

class RedisStore implements RS {
private readonly _client: RedisClientType
private storesReady: boolean

private _filesToDownloadXPath: RKVS<FileDetail>
private _filesToRetryXPath: RKVS<FileDetail>
private _articleDetailXId: RKVS<ArticleDetail>
private _redirectsXId: RKVS<ArticleRedirect>

constructor(redisPath: string, opts?: any) {
const options = { ...opts }
const quitOnError = !(options.quitOnError === false)
delete options.quitOnError

if (redisPath.startsWith('/') || redisPath.startsWith('./')) {
options.socket = {
...options.socket,
path: redisPath,
}
} else {
options.url = redisPath
}
private static instance: RedisStore

this._client = createClient(options)
#client: RedisClientType
#storesReady: boolean
#filesToDownloadXPath: RKVS<FileDetail>
#filesToRetryXPath: RKVS<FileDetail>
#articleDetailXId: RKVS<ArticleDetail>
#redirectsXId: RKVS<ArticleRedirect>

public get client() {
return this.#client

Check warning on line 17 in src/RedisStore.ts

View check run for this annotation

Codecov / codecov/patch

src/RedisStore.ts#L16-L17

Added lines #L16 - L17 were not covered by tests
}

public get filesToDownloadXPath(): RKVS<FileDetail> {
return this.#filesToDownloadXPath
}

public get filesToRetryXPath(): RKVS<FileDetail> {
return this.#filesToRetryXPath
}

public get articleDetailXId(): RKVS<ArticleDetail> {
return this.#articleDetailXId
}

public get redirectsXId(): RKVS<ArticleRedirect> {
return this.#redirectsXId
}

public static getInstance(): RedisStore {
if (!RedisStore.instance) {
RedisStore.instance = new RedisStore()
}
return RedisStore.instance
}

this._client.on('error', (err) => {
if (quitOnError) {
logger.error('Redis Client Error', err)
process.exit(3)
public setOptions(redisPath: string, opts?: any): void {
if (RedisStore.instance) {
const options = { ...opts }
const quitOnError = !(options.quitOnError === false)
delete options.quitOnError

if (redisPath.startsWith('/') || redisPath.startsWith('./')) {
options.socket = {

Check warning on line 50 in src/RedisStore.ts

View check run for this annotation

Codecov / codecov/patch

src/RedisStore.ts#L50

Added line #L50 was not covered by tests
...options.socket,
path: redisPath,
}
} else {
options.url = redisPath
}
})

this.#client = createClient(options)

this.#client.on('error', (err) => {
if (quitOnError) {
logger.error('Redis Client Error', err)
process.exit(3)

Check warning on line 63 in src/RedisStore.ts

View check run for this annotation

Codecov / codecov/patch

src/RedisStore.ts#L62-L63

Added lines #L62 - L63 were not covered by tests
}
})
} else {
throw new Error('Redis store has not been instantiated before setting options')

Check warning on line 67 in src/RedisStore.ts

View check run for this annotation

Codecov / codecov/patch

src/RedisStore.ts#L66-L67

Added lines #L66 - L67 were not covered by tests
}
}

public async connect(populateStores = true) {
if (this._client.isOpen) {
if (this.#client.isOpen) {
return
}
await this._client.connect()
await this.#client.connect()
if (populateStores) {
await this.checkForExistingStores()
await this.populateStores()
this.storesReady = true
this.#storesReady = true
}
}

public async close() {
if (this._client.isReady && this.storesReady) {
if (this.#client.isReady && this.#storesReady) {
logger.log('Flushing Redis DBs')
await Promise.all([this._filesToDownloadXPath.flush(), this._filesToRetryXPath.flush(), this._articleDetailXId.flush(), this._redirectsXId.flush()])
await Promise.all([this.#filesToDownloadXPath.flush(), this.#filesToRetryXPath.flush(), this.#articleDetailXId.flush(), this.#redirectsXId.flush()])
}
if (this._client.isOpen) {
await this._client.quit()
if (this.#client.isOpen) {
await this.#client.quit()
}
}

public async checkForExistingStores() {
const patterns = ['*-media', '*-media-retry', '*-detail', '*-redirect']
let keys: string[] = []
for (const pattern of patterns) {
keys = keys.concat(await this._client.keys(pattern))
keys = keys.concat(await this.#client.keys(pattern))
}

keys.forEach(async (key) => {
try {
const length = await this._client.hLen(key)
const length = await this.#client.hLen(key)

Check warning on line 102 in src/RedisStore.ts

View check run for this annotation

Codecov / codecov/patch

src/RedisStore.ts#L102

Added line #L102 was not covered by tests
const time = new Date(Number(key.slice(0, key.indexOf('-'))))
logger.error(`Found store from previous run from ${time} that is still in redis: ${key} with length ${length}`)
} catch {
Expand All @@ -77,19 +109,19 @@
}

private async populateStores() {
this._filesToDownloadXPath = new RedisKvs(this._client, `${Date.now()}-media`, {
this.#filesToDownloadXPath = new RedisKvs(this.#client, `${Date.now()}-media`, {
u: 'url',
n: 'namespace',
m: 'mult',
w: 'width',
})
this._filesToRetryXPath = new RedisKvs(this._client, `${Date.now()}-media-retry`, {
this.#filesToRetryXPath = new RedisKvs(this.#client, `${Date.now()}-media-retry`, {
u: 'url',
n: 'namespace',
m: 'mult',
w: 'width',
})
this._articleDetailXId = new RedisKvs(this._client, `${Date.now()}-detail`, {
this.#articleDetailXId = new RedisKvs(this.#client, `${Date.now()}-detail`, {
s: 'subCategories',
c: 'categories',
p: 'pages',
Expand All @@ -101,35 +133,16 @@
m: 'missing',
n: 'title',
})
this._redirectsXId = new RedisKvs(this._client, `${Date.now()}-redirect`, {
this.#redirectsXId = new RedisKvs(this.#client, `${Date.now()}-redirect`, {
t: 'targetId',
n: 'title',
})
}

public createRedisKvs(...args: [string, KVS<string>?]): RKVS<any> {
return new RedisKvs(this._client, ...args)
}

public get client() {
return this._client
}

public get filesToDownloadXPath(): RKVS<FileDetail> {
return this._filesToDownloadXPath
}

public get filesToRetryXPath(): RKVS<FileDetail> {
return this._filesToRetryXPath
}

public get articleDetailXId(): RKVS<ArticleDetail> {
return this._articleDetailXId
}

public get redirectsXId(): RKVS<ArticleRedirect> {
return this._redirectsXId
return new RedisKvs(this.#client, ...args)

Check warning on line 143 in src/RedisStore.ts

View check run for this annotation

Codecov / codecov/patch

src/RedisStore.ts#L143

Added line #L143 was not covered by tests
}
}

export default RedisStore
const rs = RedisStore.getInstance()
export default rs as RedisStore
22 changes: 11 additions & 11 deletions src/mwoffliner.lib.ts
Original file line number Diff line number Diff line change
Expand Up @@ -62,186 +62,186 @@

const packageJSON = JSON.parse(readFileSync(path.join(__dirname, '../package.json'), 'utf8'))

async function execute(argv: any) {
/* ********************************* */
/* CUSTOM VARIABLE SECTION ********* */
/* ********************************* */

const {
speed: _speed,
adminEmail,
verbose,
minifyHtml,
keepEmptyParagraphs,
mwUrl,
mwWikiPath,
mwApiPath,
mwRestApiPath,
mwModulePath,
mwDomain,
mwUsername,
mwPassword,
requestTimeout,
customMainPage,
customZimTitle,
customZimDescription,
customZimLongDescription,
customZimTags,
customZimLanguage,
withoutZimFullTextIndex,
webp,
format,
filenamePrefix,
resume,
publisher: _publisher,
outputDirectory: _outputDirectory,
addNamespaces: _addNamespaces,
customZimFavicon,
optimisationCacheUrl,
customFlavour,
} = argv

let { articleList, articleListToIgnore } = argv

if (verbose) logger.setVerboseLevel(verbose)

logger.log(`Starting mwoffliner v${packageJSON.version}...`)

// TODO: Move it to sanitize method
if (articleList) articleList = String(articleList)
if (articleListToIgnore) articleListToIgnore = String(articleListToIgnore)
const publisher = _publisher || config.defaults.publisher

// TODO: Move it to sanitize method
/* HTTP user-agent string */
// const adminEmail = argv.adminEmail;
if (!isValidEmail(adminEmail)) {
throw new Error(`Admin email [${adminEmail}] is not valid`)
}

// TODO: Move it to sanitize method
/* Number of parallel requests. To secure stability and avoid HTTP
429 errors, no more than MAX_CPU_CORES can be considered */
if (_speed && isNaN(_speed)) {
throw new Error('speed is not a number, please give a number value to --speed')
}
const cpuCount = Math.min(os.cpus().length, MAX_CPU_CORES)
const speed = Math.max(1, Math.round(cpuCount * (_speed || 1)))

/* Check Node.js version */
const nodeVersionSatisfiesPackage = semver.satisfies(process.version, packageJSON.engines.node)
if (!nodeVersionSatisfiesPackage) {
logger.warn(`***********\n\n\tCurrent node version is [${process.version}]. We recommend [${packageJSON.engines.node}]\n\n***********`)
}

/* Instantiate custom flavour module */
logger.info(`Using custom flavour: ${customFlavour || 'no'}`)
const customProcessor = customFlavour ? new (await import(customFlavour))() : null

let s3Obj
// Check for S3 creds
if (optimisationCacheUrl) {
// Decompose the url with path and other S3 creds
const s3UrlObj = urlParser.parse(optimisationCacheUrl)
const queryReader = QueryStringParser.parse(s3UrlObj.query)
const s3Url = (s3UrlObj.protocol || 'https:') + '//' + (s3UrlObj.host || '') + (s3UrlObj.pathname || '')
s3Obj = new S3(s3Url, queryReader)
await s3Obj.initialise().then(() => {
logger.log('Successfully logged in S3')
})
}

// Extract S3 obj to pass to downloader class
const s3 = s3Obj ? s3Obj : {}

/* Wikipedia/... URL; Normalize by adding trailing / as necessary */
MediaWiki.base = mwUrl
MediaWiki.getCategories = !!argv.getCategories
MediaWiki.apiPath = mwApiPath
MediaWiki.restApiPath = mwRestApiPath
MediaWiki.modulePathOpt = mwModulePath
MediaWiki.domain = mwDomain
MediaWiki.password = mwPassword
MediaWiki.username = mwUsername
MediaWiki.wikiPath = mwWikiPath

/* Download helpers; TODO: Merge with something else / expand this. */
const downloader = new Downloader({
uaString: `${config.userAgent} (${adminEmail})`,
speed,
reqTimeout: requestTimeout * 1000 || config.defaults.requestTimeout,
optimisationCacheUrl,
s3,
webp,
})

/* perform login */
await MediaWiki.login(downloader)

/* Get MediaWiki Info */
let mwMetaData
try {
mwMetaData = await MediaWiki.getMwMetaData(downloader)
} catch (err) {
logger.error('FATAL - Failed to get MediaWiki Metadata')
throw err
}

const metaDataRequiredKeys = {
Creator: mwMetaData.creator,
Description: customZimDescription || mwMetaData.subTitle,
Language: customZimLanguage || mwMetaData.langIso3,
Publisher: publisher,
Title: customZimTitle || mwMetaData.title,
'Illustration_48x48@1': await getIllustrationMetadata(),
}
validateMetadata(metaDataRequiredKeys)

// Sanitizing main page
let mainPage = articleList ? '' : mwMetaData.mainPage

if (customMainPage) {
mainPage = customMainPage
const mainPageUrl = MediaWiki.webUrl + encodeURIComponent(mainPage)
if (!(await checkApiAvailability(mainPageUrl))) {
throw new Error(`customMainPage doesn't return 200 status code for url ${mainPageUrl}`)
}
}

MediaWiki.apiCheckArticleId = mwMetaData.mainPage
await MediaWiki.hasCoordinates(downloader)
await MediaWiki.hasWikimediaDesktopRestApi()
await MediaWiki.hasVisualEditorApi()

await downloader.setBaseUrls()

const redisStore = new RedisStore(argv.redis || config.defaults.redisPath)
await redisStore.connect()
const { articleDetailXId, filesToDownloadXPath, filesToRetryXPath, redirectsXId } = redisStore
RedisStore.setOptions(argv.redis || config.defaults.redisPath)
await RedisStore.connect()
const { articleDetailXId, filesToDownloadXPath, filesToRetryXPath, redirectsXId } = RedisStore

// Output directory
const outputDirectory = path.isAbsolute(_outputDirectory || '') ? _outputDirectory : path.join(process.cwd(), _outputDirectory || 'out')
await mkdirPromise(outputDirectory)
logger.log(`Using output directory ${outputDirectory}`)

// Temporary directory
const tmpDirectory = await getTmpDirectory()
logger.log(`Using temporary directory ${tmpDirectory}`)

process.on('exit', async (code) => {
logger.log(`Exiting with code [${code}]`)
logger.log(`Deleting temporary directory [${tmpDirectory}]`)
rimraf.sync(tmpDirectory)
})

process.on('SIGTERM', async () => {
logger.log('SIGTERM')
await redisStore.close()
await RedisStore.close()

Check warning on line 239 in src/mwoffliner.lib.ts

View check run for this annotation

Codecov / codecov/patch

src/mwoffliner.lib.ts#L239

Added line #L239 was not covered by tests
process.exit(128 + 15)
})
process.on('SIGINT', async () => {
logger.log('SIGINT')
await redisStore.close()
await RedisStore.close()

Check warning on line 244 in src/mwoffliner.lib.ts

View check run for this annotation

Codecov / codecov/patch

src/mwoffliner.lib.ts#L244

Added line #L244 was not covered by tests
process.exit(128 + 2)
})

Expand Down Expand Up @@ -290,13 +290,13 @@

logger.info('Getting article ids')
let stime = Date.now()
await getArticleIds(downloader, redisStore, mainPage, articleList ? articleListLines : null, articleListToIgnore ? articleListToIgnoreLines : null)
await getArticleIds(downloader, mainPage, articleList ? articleListLines : null, articleListToIgnore ? articleListToIgnoreLines : null)
logger.log(`Got ArticleIDs in ${(Date.now() - stime) / 1000} seconds`)

if (MediaWiki.getCategories) {
await getCategoriesForArticles(articleDetailXId, downloader, redisStore)
await getCategoriesForArticles(articleDetailXId, downloader)

Check warning on line 297 in src/mwoffliner.lib.ts

View check run for this annotation

Codecov / codecov/patch

src/mwoffliner.lib.ts#L297

Added line #L297 was not covered by tests

while ((await trimUnmirroredPages(downloader, redisStore)) > 0) {
while ((await trimUnmirroredPages(downloader)) > 0) {

Check warning on line 299 in src/mwoffliner.lib.ts

View check run for this annotation

Codecov / codecov/patch

src/mwoffliner.lib.ts#L299

Added line #L299 was not covered by tests
// Remove unmirrored pages, categories, subCategories
// trimUnmirroredPages returns number of modified articles
}
Expand Down Expand Up @@ -406,7 +406,7 @@
logger.log(`Found [${stylesheetsToGet.length}] stylesheets to download`)

logger.log('Downloading stylesheets and populating media queue')
const { finalCss } = await getAndProcessStylesheets(downloader, redisStore, stylesheetsToGet)
const { finalCss } = await getAndProcessStylesheets(downloader, stylesheetsToGet)
logger.log('Downloaded stylesheets')

const article = new ZimArticle({ url: `${config.output.dirs.mediawiki}/style.css`, data: finalCss, ns: '-' })
Expand All @@ -420,7 +420,7 @@

logger.log('Getting articles')
stime = Date.now()
const { jsModuleDependencies, cssModuleDependencies } = await saveArticles(zimCreator, downloader, redisStore, dump)
const { jsModuleDependencies, cssModuleDependencies } = await saveArticles(zimCreator, downloader, dump)
logger.log(`Fetching Articles finished in ${(Date.now() - stime) / 1000} seconds`)

logger.log(`Found [${jsModuleDependencies.size}] js module dependencies`)
Expand Down Expand Up @@ -641,7 +641,7 @@
}

MediaWiki.reset()
redisStore.close()
RedisStore.close()

return dumps
}

Check notice on line 647 in src/mwoffliner.lib.ts

View check run for this annotation

codefactor.io / CodeFactor

src/mwoffliner.lib.ts#L65-L647

Complex Method
Expand Down
Loading
Loading