Skip to content

Commit

Permalink
feat(harvester): Exclusion des nouvelles du moissonnage
Browse files: browse the repository at this point in the history
  • Loading branch information
remillc committed Jan 4, 2024
1 parent bfa2d96 commit b4e4509
Showing 1 changed file with 8 additions and 5 deletions.
13 changes: 8 additions & 5 deletions config/harvester.js
Original file line number | Diff line number | Diff line change
Expand Up @@ -9,13 +9,15 @@ export const harvesterConfig = {
checkExtern: true,
fetchLinksOnce: true,
getPerfData: false,

// URLs matching the given regular expressions / strings will be ignored and not checked.
ignore: [
...adressesSimplifiees,
//
// /^https:\/\/bib\.umontreal\.ca\/[^#?]/,
/^https:\/\/bib\.umontreal\.ca\/activites/i,
/^https:\/\/bib\.umontreal\.ca\/communications\/nouvelles/i,
/^https:\/\/bib\.umontreal\.ca\/les\-bibliotheques\-udem\/nouvelles/i,
'https://www.bib.umontreal.ca/une-question',
/^https:\/\/unequestion\.bib\.umontreal\.ca\//i,
/^https:\/\/www\.bib\.umontreal\.ca\/ideale/i,
Expand Down Expand Up @@ -65,11 +67,11 @@ export const harvesterConfig = {
/^https:\/\/news\.google\.com\//i, //
],
internLinks: [
/https?:\/\/[^\/]*bib\.umontreal\.ca(:\d+)?(\/.*)?/,
/https?:\/\/atrium\.umontreal\.ca(\/.*)?/,
/https:\/\/umontreal\.on\.worldcat\.org(\/.*)?/,
/https:\/\/umontreal\.account\.worldcat\.org(\/.*)?/,
/https:\/\/87128\.account\.worldcat\.org(\/.*)?/,
/^https?:\/\/[^\/]*bib\.umontreal\.ca(:\d+)?(\/.*)?/,
// /^https?:\/\/atrium\.umontreal\.ca(\/.*)?/,
/^https:\/\/umontreal\.on\.worldcat\.org(\/.*)?/,
/^https:\/\/umontreal\.account\.worldcat\.org(\/.*)?/,
// /^https:\/\/87128\.account\.worldcat\.org(\/.*)?/,
// 'http://localhost[.*]'
],
linkParserDelay: false,
Expand All @@ -79,6 +81,7 @@ export const harvesterConfig = {
// maxRequestsPerCrawl: 50,
navigationOnly: true,
// navigationTimeoutSecs: 5,

// Check but do not recurse into URLs matching the given strings / regular expressions.
noFollow: [
/^https:\/\/umontreal\.on\.worldcat\.org/,
Expand Down

0 comments on commit b4e4509

Please sign in to comment.