Skip to content

Commit

Permalink
Add 4 new crawlers. Add scraper as a catch-all word (JayBizzle#359)
Browse files Browse the repository at this point in the history
* Add 4 new user agents

* Add the word scraper as a generic user agent flag

* Build crawler JSON and TXT export files
  • Loading branch information
MaxGiting authored Feb 16, 2020
1 parent 3e496b7 commit 608ec84
Show file tree
Hide file tree
Showing 5 changed files with 15 additions and 21 deletions.
2 changes: 1 addition & 1 deletion raw/Crawlers.json

Large diffs are not rendered by default.

14 changes: 5 additions & 9 deletions raw/Crawlers.txt
Original file line number Diff line number Diff line change
Expand Up @@ -89,7 +89,6 @@ AportWorm\/
AppBeat\/
AppEngine-Google
AppleSyndication
AppStoreScraperZ
Aprc\/[0-9]
Arachmo
arachnode
Expand Down Expand Up @@ -382,6 +381,7 @@ GigablastOpenSource
GIS-LABS
github-camo
github\.com
Goldfire Server
Go [\d\.]* package http
Go http package
Go-Ahead-Got-It
Expand Down Expand Up @@ -421,7 +421,6 @@ GoogleProducer
GoogleSites
Google-Transparency-Report
Gookey
GoScraper
GoSpotCheck
gosquared-thumbnailer
Gotit
Expand Down Expand Up @@ -688,6 +687,7 @@ Morfeus Fucking Scanner
Morning Paper
MovableType
mowser
Mr\.4x3 Powered
Mrcgiguy
MS Web Services Client Protocol
MSFrontPage
Expand Down Expand Up @@ -757,7 +757,6 @@ oegp
Offline Explorer
Offline Navigator
OgScrper
og-scraper
okhttp
omgili
OMSC
Expand All @@ -772,7 +771,6 @@ Optimizer
Orbiter
OrgProbe\/
orion-semantics
OSPScraper
Outlook-Express
Outlook-iOS
ow\.ly
Expand All @@ -783,6 +781,7 @@ Page Valet
page_verifier
page scorer
page2rss
PageFreezer
PageGrabber
PagePeeker
PageScorer
Expand Down Expand Up @@ -836,6 +835,7 @@ PostmanRuntime
PostPost
postrank
PowerPoint\/
Prebid
Priceonomics Analysis Engine
PrintFriendly
PritTorrent
Expand All @@ -850,7 +850,6 @@ PRTG Network Monitor
pshtt, https scanning
PTST
PTST\/[0-9]+
Pulsepoint XT3 web scraper
Pump
python-httpx
Python-httplib2
Expand Down Expand Up @@ -910,7 +909,6 @@ scooter
ScoutJet
ScoutURLMonitor
ScrapeBox Page Scanner
SimpleScraper
Scrapy
Screaming
ScreenShotService
Expand Down Expand Up @@ -1030,7 +1028,6 @@ summify
SuperHTTP
Surphace Scout
Suzuran
SwiteScraper
Symfony BrowserKit
Symfony2 BrowserKit
SynHttpClient-Built
Expand Down Expand Up @@ -1119,7 +1116,6 @@ VB Project
vBSEO
VCI
via ggpht\.com GoogleImageProxy
VidibleScraper
Virusdie
visionutils
vkShare
Expand Down Expand Up @@ -1272,4 +1268,4 @@ ZnHTTP
Zombie\.js
Zoom\.Mac
ZyBorg
[a-z0-9\-_]*(bot|crawl|archiver|transcoder|spider|uptime|validator|fetcher|cron|checker|reader|extractor|monitoring|analyzer)
[a-z0-9\-_]*(bot|crawl|archiver|transcoder|spider|uptime|validator|fetcher|cron|checker|reader|extractor|monitoring|analyzer|scraper)
14 changes: 5 additions & 9 deletions src/Fixtures/Crawlers.php
Original file line number Diff line number Diff line change
Expand Up @@ -110,7 +110,6 @@ class Crawlers extends AbstractProvider
'AppBeat\/',
'AppEngine-Google',
'AppleSyndication',
'AppStoreScraperZ',
'Aprc\/[0-9]',
'Arachmo',
'arachnode',
Expand Down Expand Up @@ -403,6 +402,7 @@ class Crawlers extends AbstractProvider
'GIS-LABS',
'github-camo',
'github\.com',
'Goldfire Server',
'Go [\d\.]* package http',
'Go http package',
'Go-Ahead-Got-It',
Expand Down Expand Up @@ -442,7 +442,6 @@ class Crawlers extends AbstractProvider
'GoogleSites',
'Google-Transparency-Report',
'Gookey',
'GoScraper',
'GoSpotCheck',
'gosquared-thumbnailer',
'Gotit',
Expand Down Expand Up @@ -709,6 +708,7 @@ class Crawlers extends AbstractProvider
'Morning Paper',
'MovableType',
'mowser',
'Mr\.4x3 Powered',
'Mrcgiguy',
'MS Web Services Client Protocol',
'MSFrontPage',
Expand Down Expand Up @@ -778,7 +778,6 @@ class Crawlers extends AbstractProvider
'Offline Explorer',
'Offline Navigator',
'OgScrper',
'og-scraper',
'okhttp',
'omgili',
'OMSC',
Expand All @@ -793,7 +792,6 @@ class Crawlers extends AbstractProvider
'Orbiter',
'OrgProbe\/',
'orion-semantics',
'OSPScraper',
'Outlook-Express',
'Outlook-iOS',
'ow\.ly',
Expand All @@ -804,6 +802,7 @@ class Crawlers extends AbstractProvider
'page_verifier',
'page scorer',
'page2rss',
'PageFreezer',
'PageGrabber',
'PagePeeker',
'PageScorer',
Expand Down Expand Up @@ -857,6 +856,7 @@ class Crawlers extends AbstractProvider
'PostPost',
'postrank',
'PowerPoint\/',
'Prebid',
'Priceonomics Analysis Engine',
'PrintFriendly',
'PritTorrent',
Expand All @@ -871,7 +871,6 @@ class Crawlers extends AbstractProvider
'pshtt, https scanning',
'PTST ',
'PTST\/[0-9]+',
'Pulsepoint XT3 web scraper',
'Pump',
'python-httpx',
'Python-httplib2',
Expand Down Expand Up @@ -931,7 +930,6 @@ class Crawlers extends AbstractProvider
'ScoutJet',
'ScoutURLMonitor',
'ScrapeBox Page Scanner',
'SimpleScraper',
'Scrapy',
'Screaming',
'ScreenShotService',
Expand Down Expand Up @@ -1051,7 +1049,6 @@ class Crawlers extends AbstractProvider
'SuperHTTP',
'Surphace Scout',
'Suzuran',
'SwiteScraper',
'Symfony BrowserKit',
'Symfony2 BrowserKit',
'SynHttpClient-Built',
Expand Down Expand Up @@ -1140,7 +1137,6 @@ class Crawlers extends AbstractProvider
'vBSEO',
'VCI',
'via ggpht\.com GoogleImageProxy',
'VidibleScraper',
'Virusdie',
'visionutils',
'vkShare',
Expand Down Expand Up @@ -1293,6 +1289,6 @@ class Crawlers extends AbstractProvider
'Zombie\.js',
'Zoom\.Mac',
'ZyBorg',
'[a-z0-9\-_]*(bot|crawl|archiver|transcoder|spider|uptime|validator|fetcher|cron|checker|reader|extractor|monitoring|analyzer)',
'[a-z0-9\-_]*(bot|crawl|archiver|transcoder|spider|uptime|validator|fetcher|cron|checker|reader|extractor|monitoring|analyzer|scraper)',
);
}
4 changes: 4 additions & 0 deletions tests/crawlers.txt
Original file line number Diff line number Diff line change
Expand Up @@ -3855,3 +3855,7 @@ Google-Ads-Overview Mozilla/5.0 (Linux; U; Android 2.3.4; generic) AppleWebKit/5
adreview/1.0
Google-speakr
Google-speakr,gzip(gfe)
PageFreezer
Prebid.js Scraper
Mr.4x3 Powered
Goldfire Server
2 changes: 0 additions & 2 deletions tests/devices.txt
Original file line number Diff line number Diff line change
Expand Up @@ -45654,7 +45654,6 @@ Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Trident/4.0; SLCC2; .NET CLR
Mozilla/5.0 (Linux; Android 6.0.1; HTC One A9 Build/MMB29M; wv) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/54.0.2840.68 Mobile Safari/537.36
Mozilla/5.0 (Linux; Android 5.1; MEDION E5004 Build/LRX21M) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.81 Mobile Safari/537.36
Opera/9.80 (J2ME/MIDP; Opera Mini/4.5.40312/37.8552; U; en) Presto/2.12.423 Version/12.16
Goldfire Server
Mozilla/5.0 (iPhone; CPU iPhone OS 10_2 like Mac OS X) AppleWebKit/602.3.12 (KHTML, like Gecko) Mobile/14C92 (4298171760)
Mozilla/5.0 (iPad; CPU OS 9_3_5 like Mac OS X) AppleWebKit/601.1.46 (KHTML, like Gecko) Mobile/13G36 [FBAN/FBIOS;FBAV/63.0.0.37.140;FBBV/37606630;FBRV/37606630;FBDV/iPad4,2;FBMD/iPad;FBSN/iPhone OS;FBSV/9.3.5;FBSS/2;FBCR/MEO;FBID/tablet;FBLC/en_GB;FBOP/5]
Mozilla/5.0 (iPhone; CPU iPhone OS 10_2 like Mac OS X) AppleWebKit/602.3.12 (KHTML, like Gecko) Mobile/14C92 (5668673232)
Expand Down Expand Up @@ -153723,7 +153722,6 @@ Mozilla/5.0 (iPhone; CPU iPhone OS 11_2_6 like Mac OS X) AppleWebKit/604.5.6 (KH
Mozilla/5.0 (Linux; Android 8.0.0; F8331 Build/41.3.A.0.401) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.109 Mobile Safari/537.36
Mozilla/5.0 (iPhone; CPU iPhone OS 10_3 like Mac OS X) AppleWebKit/603.1.30 (KHTML, like Gecko) Mobile/14E277 [FBAN/FBIOS;FBAV/154.0.0.34.386;FBBV/87041355;FBDV/iPhone7,1;FBMD/iPhone;FBSN/iOS;FBSV/10.3;FBSS/3;FBCR/Digi;FBID/phone;FBLC/en_GB;FBOP/5;FBRV/88069422]
Mozilla/5.0 (iPad; CPU OS 11_1_2 like Mac OS X) AppleWebKit/604.1.34 (KHTML, like Gecko) GSA/47.1.192149458 Mobile/15B202 Safari/604.1
Mr.4x3 Powered
Mozilla/5.0 (iPhone; CPU iPhone OS 11_0_2 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) Mobile/15A421 [FBAN/FBIOS;FBAV/141.0.0.51.91;FBBV/71695290;FBDV/iPhone7,2;FBMD/iPhone;FBSN/iOS;FBSV/11.0.2;FBSS/2;FBCR/EE;FBID/phone;FBLC/en_GB;FBOP/5;FBRV/72701227]
Mozilla/5.0 (Linux; Android 7.1.1; MIX 2 Build/NMF26X) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.137 Mobile Safari/537.36
Mozilla/5.0 (Linux; Android 7.1.1; Moto G (5S) Plus Build/NPSS26.116-61-8) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.158 Mobile Safari/537.36
Expand Down

0 comments on commit 608ec84

Please sign in to comment.