Skip to content

Commit

Permalink
Update definitions - May of 2024 (#408)
Browse files Browse the repository at this point in the history
* chore!: change license to MIT

* chore: update versions

* fix: compiler errors and linter violations

* feature: allow for element matching to find page ID

* feature: split up element matching into textcontent and href

* fix: use puppeteer-extra to reduce misses

* chore: fix definitions

* fix: remove CBS cookieclicker

* fix: account for shadow root in Parool paywall

* fix: selector fixes for various media
  • Loading branch information
fdebijl authored May 20, 2024
1 parent cde76a0 commit e5ea71c
Show file tree
Hide file tree
Showing 20 changed files with 1,968 additions and 2,141 deletions.
12 changes: 12 additions & 0 deletions .editorconfig
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
root = true

[*]
end_of_line = lf
charset = utf-8
trim_trailing_whitespace = true
insert_final_newline = true
indent_style = space
indent_size = 2

[*.{diff,md}]
trim_trailing_whitespace = false
31 changes: 3 additions & 28 deletions .eslintrc.json
Original file line number Diff line number Diff line change
@@ -1,31 +1,6 @@
{
"extends": [
"plugin:@typescript-eslint/recommended"
],
"parser": "@typescript-eslint/parser",
"parserOptions": {
"ecmaVersion": 2018,
"sourceType": "module",
"ecmaFeatures": {
"legacyDecorators": true
}
},
"extends": "@fdebijl",
"rules": {
"no-console": 0,
"no-plusplus": 0,
"no-await-in-loop": 0,
"max-len": [
2,
{
"code": 250
}
],
"object-curly-newline": 0,
"no-trailing-spaces": 1,
"@typescript-eslint/camelcase": 0,
"prefer-const": 1
},
"env": {
"node": true
"@typescript-eslint/no-explicit-any": 0
}
}
}
4 changes: 1 addition & 3 deletions .github/FUNDING.yml
Original file line number Diff line number Diff line change
@@ -1,3 +1 @@
# These are supported funding model platforms

github: [opentitles]
github: [opentitles]
11 changes: 7 additions & 4 deletions .github/workflows/status.yml
Original file line number Diff line number Diff line change
Expand Up @@ -4,19 +4,22 @@ on:
pull_request:
types: [assigned, opened, synchronize, reopened]

env:
NODE_VERSION: 20

jobs:
validate:
name: Validate
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v3
- uses: actions/checkout@v4
with:
# Checkout the head ref instead of the PR branch that github creates.
ref: ${{ github.head_ref }}
- name: Setup Node.js v14
uses: actions/setup-node@v3
- name: Setup Node
uses: actions/setup-node@v4
with:
node-version: 14
node-version: ${{ env.NODE_VERSION }}
- name: Install and build
run: |
npm ci
Expand Down
682 changes: 21 additions & 661 deletions LICENSE

Large diffs are not rendered by default.

4 changes: 4 additions & 0 deletions index.d.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
/**
 * Ambient module declaration for `puppeteer-extra-plugin-stealth`.
 *
 * The default export is typed as `any`, so the compiler accepts the import
 * but performs no type checking on how the imported value is used.
 *
 * NOTE(review): presumably needed because the package provides no typings
 * the compiler can resolve — confirm before replacing with stricter types.
 */
declare module 'puppeteer-extra-plugin-stealth' {
  const stealthPlugin: any;
  export default stealthPlugin;
}
74 changes: 59 additions & 15 deletions media.json
Original file line number Diff line number Diff line change
Expand Up @@ -111,7 +111,10 @@
"page_id_location": "url",
"page_id_query": "",
"match_domains": ["telegraaf.nl"],
"title_query": ["h1.ArticleTitleBlock__title"]
"title_query": [
"h1.ArticleTitleBlock__title",
".Article__titleWrapper h1"
]
},
{
"name": "NUnl",
Expand Down Expand Up @@ -188,7 +191,13 @@
"page_id_location": "url",
"page_id_query": "",
"match_domains": ["volkskrant.nl"],
"title_query": [".artstyle__header-title", "h1.title", "h1.artstyle__header-title", ".h1-headline"]
"title_query": [
".artstyle__header-title",
"h1.title",
"h1.artstyle__header-title",
".h1-headline",
"[data-test-id=article-title]"
]
},
{
"name": "RTL",
Expand All @@ -208,7 +217,12 @@
"page_id_location": "url",
"page_id_query": "",
"match_domains": ["rtlnieuws.nl", "bright.nl"],
"title_query": ["div.article-title-width > h1.node-title", ".video-content > h1", "h1.article__title"]
"title_query": [
"div.article-title-width > h1.node-title",
".video-content > h1",
"h1.article__title",
"[data-testid=hero-heading]"
]
},
{
"name": "Trouw",
Expand All @@ -220,7 +234,10 @@
"page_id_location": "url",
"page_id_query": "",
"match_domains": ["trouw.nl"],
"title_query": ["h1.artstyle__header-title"]
"title_query": [
"h1.artstyle__header-title",
"[data-test-id=article-title]"
]
},
{
"name": "Parool",
Expand All @@ -243,7 +260,10 @@
"page_id_location": "url",
"page_id_query": "",
"match_domains": ["parool.nl"],
"title_query": [".artstyle__header-title"]
"title_query": [
".artstyle__header-title",
"[data-test-id=article-title]"
]
},
{
"name": "Limburger",
Expand All @@ -259,7 +279,7 @@
"page_id_location": "url",
"page_id_query": "",
"match_domains": ["limburger.nl"],
"title_query": ["div > header.article__header > h1"]
"title_query": ["div > header > h1"]
},
{
"name": "FD",
Expand All @@ -271,7 +291,10 @@
"page_id_location": "url",
"page_id_query": "",
"match_domains": ["fd.nl"],
"title_query": [".head.full.social-quotable > h1"]
"title_query": [
".head.full.social-quotable > h1",
"h1.heading"
]
},
{
"name": "HVNL",
Expand Down Expand Up @@ -387,7 +410,6 @@
"cnn_topstories",
"cnn_world",
"cnn_us",
"cnn_allpolitics",
"cnn_tech",
"cnn_health",
"cnn_showbiz",
Expand Down Expand Up @@ -441,12 +463,13 @@
"match_domains": ["cnn.com", "edition.cnn.com"],
"title_query": [
".l-container > h1.pg-headline",
"h1.Article__title",
"h1.PageHead__title",
"h1.Article__title",
"h1.PageHead__title",
"h1.article-title",
"h1.pg-headline",
".pg-rail .el__video-collection__meta-wrapper > h1.media__video-headline",
"h1.headline__text"
"h1.headline__text",
".headline > h1"
]
},
{
Expand Down Expand Up @@ -504,7 +527,8 @@
"title_query": [
"h1[itemprop=\"headline\"]",
"header h1",
"h1[data-testid=\"headline\"]"
"h1[data-testid=\"headline\"]",
".article-headline > h1"
]
},
{
Expand Down Expand Up @@ -659,7 +683,10 @@
"page_id_location": "url",
"page_id_query": "",
"match_domains": ["time.com"],
"title_query": ["h1.headline"]
"title_query": [
"h1.headline",
"main h1"
]
}
],
"uk": [
Expand All @@ -679,11 +706,28 @@
"page_id_query": "",
"match_domains": ["theguardian.com"],
"title_query": [
"h1[itemprop=\"headline\"]",
"h1 > span",
"h1[itemprop=\"headline\"]",
"h1 > span",
"div[data-gu-name=\"headline\"] h1"
]
}
],
"be": [
{
"name": "VRT",
"prefix": "https://www.vrt.be/vrtnws/",
"suffix": ".rss.articles.xml",
"feeds": [
"nl",
"en"
],
"id_container": "id",
"id_mask": "p\\.[a-zA-Z0-9]{9}$",
"page_id_location": "element_href",
"page_id_query": "link[rel=shortlink]",
"match_domains": ["vrt.be"],
"title_query": [".vrt-title"]
}
]
}
}
Loading

0 comments on commit e5ea71c

Please sign in to comment.