Skip to content

Commit

Permalink
Add 3 User Agents & Clean up regexs (#311)
Browse files (browse the repository at this point in the history)
* Add three new useragents

* Escape catch all regex characters

* Export user agents
  • Loading branch information
MaxGiting authored Dec 17, 2018
1 parent 50e06c8 commit 00799aa
Show file tree
Hide file tree
Showing 4 changed files with 33 additions and 24 deletions.
2 changes: 1 addition & 1 deletion raw/Crawlers.json

Large diffs are not rendered by default.

25 changes: 14 additions & 11 deletions raw/Crawlers.txt
Original file line number | Diff line number | Diff line change
Expand Up @@ -92,7 +92,7 @@ arachnode
Arachnophilia
aria2
Arukereso
asafaweb.com
asafaweb\.com
AskQuickly
Ask Jeeves
ASPSeek
Expand All @@ -106,6 +106,7 @@ axios\/
B-l-i-t-z-B-O-T
Backlink-Ceck
backlink-check
BacklinkHttpStatus
BackStreet
BackWeb
Bad-Neighborhood
Expand Down Expand Up @@ -477,7 +478,7 @@ httrack
huaweisymantec
HubSpot
Humanlinks
HyperZbozi.cz Feeder
HyperZbozi\.cz Feeder
i2kconnect\/
Iblog
ichiro
Expand Down Expand Up @@ -613,7 +614,7 @@ LYT\.SR
mabontland
Mag-Net
MagpieRSS
Mail.Ru
Mail\.Ru
MailChimp
Majestic12
makecontact\/
Expand Down Expand Up @@ -643,7 +644,7 @@ Microsoft\ Data\ Access
MIDown\ tool
MIIxpc
Mindjet
Miniature.io\/
Miniature\.io\/
Miniflux
Mister\ PiX
mixdata dot com
Expand Down Expand Up @@ -812,7 +813,7 @@ PritTorrent\/[0-9]
Prlog
probethenet
Project 25499
Promotion_Tools_www.searchenginepromotionhelp.com
Promotion_Tools_www\.searchenginepromotionhelp\.com
prospectb2b
Protopage
ProWebWalker
Expand All @@ -830,7 +831,7 @@ Qirina Hurdler
QQDownload
QrafterPro
Qseero
Qualidator.com SiteAnalyzer
Qualidator\.com SiteAnalyzer
QueryN\ Metasearch
queuedriver
Quora Link Preview
Expand Down Expand Up @@ -896,7 +897,7 @@ Semrush
sentry\/
SEO Browser
Seo Servis
seo-nastroj.cz
seo-nastroj\.cz
seo4ajax
Seobility
SEOCentro
Expand Down Expand Up @@ -988,7 +989,7 @@ Statastico\/
StatusCake
Steeler
Stratagems Kumo
Stroke.cz
Stroke\.cz
StudioFACA
StumbleUpon
suchen
Expand Down Expand Up @@ -1035,14 +1036,15 @@ Tiny Tiny RSS
TLSProbe\/
Toata
topster
touche.com
Traackr.com
touche\.com
Traackr\.com
tracemyfile
TrapitAgent
Trendiction
Trendsmap
trendspottr\.com
truwoGPS
TryJsoup
TulipChain
Turingos
Turnitin
Expand All @@ -1069,7 +1071,7 @@ Upflow
Uptimia
URL Verifier
URLChecker
URLitor.com
URLitor\.com
urlresolver
Urlstat
UrlTrends Ranking Updater
Expand Down Expand Up @@ -1231,5 +1233,6 @@ Zeus
zgrab
ZnajdzFoto
Zombie\.js
Zoom\.Mac
ZyBorg
[a-z0-9\-_]*(bot|crawl|archiver|transcoder|spider|uptime|validator|fetcher|cron)
25 changes: 14 additions & 11 deletions src/Fixtures/Crawlers.php
Original file line number | Diff line number | Diff line change
Expand Up @@ -113,7 +113,7 @@ class Crawlers extends AbstractProvider
'Arachnophilia',
'aria2',
'Arukereso',
'asafaweb.com',
'asafaweb\.com',
'AskQuickly',
'Ask Jeeves',
'ASPSeek',
Expand All @@ -127,6 +127,7 @@ class Crawlers extends AbstractProvider
'B-l-i-t-z-B-O-T',
'Backlink-Ceck',
'backlink-check',
'BacklinkHttpStatus',
'BackStreet',
'BackWeb',
'Bad-Neighborhood',
Expand Down Expand Up @@ -498,7 +499,7 @@ class Crawlers extends AbstractProvider
'huaweisymantec',
'HubSpot ',
'Humanlinks',
'HyperZbozi.cz Feeder',
'HyperZbozi\.cz Feeder',
'i2kconnect\/',
'Iblog',
'ichiro',
Expand Down Expand Up @@ -634,7 +635,7 @@ class Crawlers extends AbstractProvider
'mabontland',
'Mag-Net',
'MagpieRSS',
'Mail.Ru',
'Mail\.Ru',
'MailChimp',
'Majestic12',
'makecontact\/',
Expand Down Expand Up @@ -664,7 +665,7 @@ class Crawlers extends AbstractProvider
'MIDown\ tool',
'MIIxpc',
'Mindjet',
'Miniature.io\/',
'Miniature\.io\/',
'Miniflux',
'Mister\ PiX',
'mixdata dot com',
Expand Down Expand Up @@ -833,7 +834,7 @@ class Crawlers extends AbstractProvider
'Prlog',
'probethenet',
'Project 25499',
'Promotion_Tools_www.searchenginepromotionhelp.com',
'Promotion_Tools_www\.searchenginepromotionhelp\.com',
'prospectb2b',
'Protopage',
'ProWebWalker',
Expand All @@ -851,7 +852,7 @@ class Crawlers extends AbstractProvider
'QQDownload',
'QrafterPro',
'Qseero',
'Qualidator.com SiteAnalyzer',
'Qualidator\.com SiteAnalyzer',
'QueryN\ Metasearch',
'queuedriver',
'Quora Link Preview',
Expand Down Expand Up @@ -917,7 +918,7 @@ class Crawlers extends AbstractProvider
'sentry\/',
'SEO Browser',
'Seo Servis',
'seo-nastroj.cz',
'seo-nastroj\.cz',
'seo4ajax',
'Seobility',
'SEOCentro',
Expand Down Expand Up @@ -1009,7 +1010,7 @@ class Crawlers extends AbstractProvider
'StatusCake',
'Steeler',
'Stratagems Kumo',
'Stroke.cz',
'Stroke\.cz',
'StudioFACA',
'StumbleUpon',
'suchen',
Expand Down Expand Up @@ -1056,14 +1057,15 @@ class Crawlers extends AbstractProvider
'TLSProbe\/',
'Toata',
'topster',
'touche.com',
'Traackr.com',
'touche\.com',
'Traackr\.com',
'tracemyfile',
'TrapitAgent',
'Trendiction',
'Trendsmap',
'trendspottr\.com',
'truwoGPS',
'TryJsoup',
'TulipChain',
'Turingos',
'Turnitin',
Expand All @@ -1090,7 +1092,7 @@ class Crawlers extends AbstractProvider
'Uptimia',
'URL Verifier',
'URLChecker',
'URLitor.com',
'URLitor\.com',
'urlresolver',
'Urlstat',
'UrlTrends Ranking Updater',
Expand Down Expand Up @@ -1252,6 +1254,7 @@ class Crawlers extends AbstractProvider
'zgrab',
'ZnajdzFoto',
'Zombie\.js',
'Zoom\.Mac',
'ZyBorg',
'[a-z0-9\-_]*(bot|crawl|archiver|transcoder|spider|uptime|validator|fetcher|cron)',
);
Expand Down
5 changes: 4 additions & 1 deletion tests/crawlers.txt
Original file line number | Diff line number | Diff line change
Expand Up @@ -3403,4 +3403,7 @@ GRequests/0.10
DAP/NetHTTP
Hadi Agent
Canary%20Mail/397 CFNetwork/893.13.1 Darwin/17.4.0 (x86_64)
Sendsay.Ru/1.0; https://Sendsay.Ru/; ask@sendsay.ru
Sendsay.Ru/1.0; https://Sendsay.Ru/; ask@sendsay.ru
Mozilla/5.0 (Zoom.Mac 10.8.5 x86)
Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36 TryJsoup/1.0 (+http://try.jsoup.org/)
Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0 ; BacklinkHttpStatus)

0 comments on commit 00799aa

Please sign in to comment.