Skip to content

Commit

Permalink
refactor(parser): La fonction linkParser est maintenant dans /config
Browse files Browse the repository at this point in the history
  • Loading branch information
remillc committed Dec 20, 2023
1 parent d72f71e commit 59a0dc6
Show file tree
Hide file tree
Showing 2 changed files with 24 additions and 22 deletions.
21 changes: 21 additions & 0 deletions config/linkParser.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
/**
 * Builds the list of link records for the current document: one record per
 * `a[href]` element, followed by one record per `img[src]` element.
 *
 * Reads the global `document`, so it must run where a DOM is available.
 * NOTE(review): everything is deliberately kept inside the function body in
 * case the harvester serializes this function to run it in the page — confirm
 * against the harvester's `setLinkParser` before extracting helpers.
 *
 * @returns {Array<{url: string, text: string | null, urlData: string | null, isNavigationRequest: boolean}>}
 *   Anchor records first (`isNavigationRequest: true`), then image records
 *   (`isNavigationRequest: false`), each group in `querySelectorAll` order.
 */
export function linkParser() {
    const records = []

    const anchors = /** @type {NodeListOf<HTMLAnchorElement>} */ (document.querySelectorAll('a[href]'))
    for (const anchor of Array.from(anchors)) {
        const parent = anchor.parentNode

        // Skip anchors that sit directly inside another anchor within a rss module.
        // NOTE(review): the intent appears to be "skip every link inside
        // .s-lg-rss", but the parent-is-an-anchor requirement makes this much
        // narrower — confirm before relaxing it.
        if (parent && parent instanceof HTMLAnchorElement && anchor.closest('.s-lg-rss')) {
            continue
        }

        const url = anchor.href
        // NOTE(review): elements matched by 'a[href]' are not expected to report
        // an 'IMG' tagName, so the alt branch looks unreachable — kept as is.
        const label = anchor.tagName === 'IMG' ? anchor.getAttribute('alt') : anchor.innerText
        // Raw attribute value as written in the markup (`href` above is the property).
        const rawHref = anchor.getAttribute('href')

        records.push({
            url,
            text: label,
            urlData: rawHref,
            isNavigationRequest: true
        })
    }

    const images = /** @type {NodeListOf<HTMLImageElement>} */ (document.querySelectorAll('img[src]'))
    for (const image of Array.from(images)) {
        const url = image.src
        // First non-empty value among alt and title, otherwise null.
        const label = image.alt || image.title || null
        const rawSrc = image.getAttribute('src')

        records.push({
            url,
            text: label,
            urlData: rawSrc,
            isNavigationRequest: false
        })
    }

    return records
}
25 changes: 3 additions & 22 deletions harvest.js
Original file line number Diff line number Diff line change
@@ -1,14 +1,15 @@
import readline from 'readline'
import yargs from 'yargs'
import { hideBin } from 'yargs/helpers'
import { console, inspect } from 'corvee-core'
import { Harvester } from 'corvee-harvester'
import { fetchGuides } from './lib/fetch-guides.js'
import { saveBrowsingContexts, saveReportCodes, saveRecords, saveSystemInfo } from './utils/index.js'
import pageSnippetPlugin from './plugins/page-snippet.js'
import { console, inspect } from 'corvee-core'

import { harvesterConfig } from './config/index.js'
import { savePageSnippets } from './utils/save-page-snippets.js'
// Relative specifier on purpose: a bare 'config/...' specifier would be looked
// up as an installed package by Node's ES module resolver.
import { linkParser } from './config/linkParser.js'

const today = new Date();
const year = today.getFullYear();
Expand Down Expand Up @@ -75,27 +76,7 @@ async function harvest() {

harvester.addPlugins(pageSnippetPlugin)

// Registers the link parser used by the harvester. The callback reads the
// global `document` and returns one record per `a[href]` element followed by
// one record per `img[src]` element.
// NOTE(review): because it relies on `document`, this callback presumably runs
// in the page context — confirm against the harvester's `setLinkParser`.
harvester.setLinkParser(function linkParser() {
return Array
.from(/** @type {NodeListOf<HTMLAnchorElement>} */(document.querySelectorAll('a[href]')))
// Exclude those inside a rss module
// NOTE(review): only anchors whose direct parent is itself an anchor AND that
// have a `.s-lg-rss` ancestor are dropped — confirm this narrow condition is intended.
.filter(link => !(link.parentNode && link.parentNode instanceof HTMLAnchorElement && link.closest('.s-lg-rss')))
.map(link => ({
// Value of the `href` property, as opposed to the raw attribute stored in urlData.
url: link.href,
// NOTE(review): elements matched by 'a[href]' are not expected to report an
// 'IMG' tagName, so the alt branch looks unreachable — confirm.
text: link.tagName === 'IMG' ? link.getAttribute('alt') : link.innerText,
// Raw `href` attribute value as written in the markup.
urlData: link.getAttribute('href'),
isNavigationRequest: true
}))
// Image records are appended after all anchor records.
.concat(Array
.from(/** @type {NodeListOf<HTMLImageElement>} */(document.querySelectorAll('img[src]')))
.map(img => ({
url: img.src,
// First non-empty value among alt and title, otherwise null.
text: img.alt || img.title || null,
// Raw `src` attribute value as written in the markup.
urlData: img.getAttribute('src'),
isNavigationRequest: false
}))
)
})
harvester.setLinkParser(linkParser)

await harvester.addUrl(links);

Expand Down

0 comments on commit 59a0dc6

Please sign in to comment.