From fd9ea8abc583f1eb1e1ddcf3fe6fe58606461d6f Mon Sep 17 00:00:00 2001 From: Gabriel Jablonski Date: Thu, 19 Sep 2024 15:23:18 -0300 Subject: [PATCH 1/2] Update Amazon bot match to cover `Amazonbot` user agent (#156) As seen in the [Amazon developer docs](https://developer.amazon.com/amazonbot), `Amazonbot` is also a user agent used by Amazon crawlers. Did my best to follow existing code, but please let me know if there are any changes that should be made. --- README.md | 1 + lib/legitbot/amazon.rb | 5 +++-- test/amazon_test.rb | 27 +++++++++++++++++++++++++-- test/lib/dns_server_mock.rb | 6 ++++++ 4 files changed, 35 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 5ce400e..9d97558 100644 --- a/README.md +++ b/README.md @@ -50,6 +50,7 @@ end ## Supported - [Ahrefs](https://ahrefs.com/robot) +- [Amazonbot](https://developer.amazon.com/amazonbot) - [Amazon AdBot](https://adbot.amazon.com/index.html) - [Applebot](https://support.apple.com/en-us/119829) - [Baidu spider](http://help.baidu.com/question?prod_en=master&class=498&id=1000973) diff --git a/lib/legitbot/amazon.rb b/lib/legitbot/amazon.rb index fe20bc2..317415a 100644 --- a/lib/legitbot/amazon.rb +++ b/lib/legitbot/amazon.rb @@ -2,9 +2,10 @@ module Legitbot # :nodoc: # https://adbot.amazon.com/index.html + # https://developer.amazon.com/amazonbot class Amazon < BotMatch - domains 'amazonadbot.com.' + domains 'amazon.', 'amazonadbot.com.' end - rule Legitbot::Amazon, %w[AmazonAdBot] + rule Legitbot::Amazon, %w[Amazonbot AmazonAdBot] end diff --git a/test/amazon_test.rb b/test/amazon_test.rb index 7a33a8c..052b0f2 100644 --- a/test/amazon_test.rb +++ b/test/amazon_test.rb @@ -30,7 +30,7 @@ def test_malicious_ua refute_predicate bot, :valid? end - def test_valid_ua + def test_user_agent1 bot = Legitbot.bot( 'Mozilla/5.0 (compatible; AmazonAdBot/1.0; +https://adbot.amazon.com)', '54.166.7.90' @@ -40,7 +40,19 @@ def test_valid_ua assert_predicate bot, :valid? 
end - def test_valid_name + # rubocop:disable Layout/LineLength + def test_user_agent2 + bot = Legitbot.bot( + 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/600.2.5 (KHTML\, like Gecko) Version/8.0.2 Safari/600.2.5 (Amazonbot/0.1; +https://developer.amazon.com/support/amazonbot)', + '52.70.240.171' + ) + + assert bot + assert_predicate bot, :valid? + end + # rubocop:enable Layout/LineLength + + def test_valid_name1 bot = Legitbot.bot( 'Mozilla/5.0 (compatible; AmazonAdBot/1.0; +https://adbot.amazon.com)', '54.166.7.90' @@ -49,6 +61,17 @@ def test_valid_name assert_equal :amazon, bot.detected_as end + # rubocop:disable Layout/LineLength + def test_valid_name2 + bot = Legitbot.bot( + 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/600.2.5 (KHTML\, like Gecko) Version/8.0.2 Safari/600.2.5 (Amazonbot/0.1; +https://developer.amazon.com/support/amazonbot)', + '52.70.240.171' + ) + + assert_equal :amazon, bot.detected_as + end + # rubocop:enable Layout/LineLength + def test_fake_name bot = Legitbot.bot( 'Mozilla/5.0 (compatible; AmazonAdBot/1.0; +https://adbot.amazon.com)', diff --git a/test/lib/dns_server_mock.rb b/test/lib/dns_server_mock.rb index 99adeca..fd0dc04 100644 --- a/test/lib/dns_server_mock.rb +++ b/test/lib/dns_server_mock.rb @@ -29,6 +29,12 @@ '54.166.7.90' => { ptr: %w[crawler-54-166-7-90.amazonadbot.com] }, + '52-70-240-171.crawl.amazonbot.amazon' => { + a: %w[52.70.240.171] + }, + '52.70.240.171' => { + ptr: %w[52-70-240-171.crawl.amazonbot.amazon] + }, # Apple '17-58-98-60.applebot.apple.com' => { From 13d53f6861a00badf22629597b32b9308951feec Mon Sep 17 00:00:00 2001 From: Gabriel Jablonski Date: Thu, 19 Sep 2024 15:28:42 -0300 Subject: [PATCH 2/2] Handle DataForSEO bots (#157) DataForSEO is a crawler that is responsible for >50% of bot requests on a website I manage (>1.3M requests from a single IP address in the past few months), so handling it with `legitbot` seems like a good idea. 
The bot specs are available here: https://dataforseo.com/dataforseo-bot Let me know if any changes are needed. --- README.md | 1 + lib/legitbot.rb | 1 + lib/legitbot/dataforseo.rb | 10 +++++++ test/dataforseo_test.rb | 60 +++++++++++++++++++++++++++++++++++++ test/lib/dns_server_mock.rb | 8 +++++ 5 files changed, 80 insertions(+) create mode 100644 lib/legitbot/dataforseo.rb create mode 100644 test/dataforseo_test.rb diff --git a/README.md b/README.md index 9d97558..b401261 100644 --- a/README.md +++ b/README.md @@ -55,6 +55,7 @@ end - [Applebot](https://support.apple.com/en-us/119829) - [Baidu spider](http://help.baidu.com/question?prod_en=master&class=498&id=1000973) - [Bingbot](https://blogs.bing.com/webmaster/2012/08/31/how-to-verify-that-bingbot-is-bingbot/) +- [DataForSEO](https://dataforseo.com/dataforseo-bot) - [DuckDuckGo bot](https://duckduckgo.com/duckduckbot) - [Google crawlers](https://support.google.com/webmasters/answer/1061943) - [IAS](https://integralads.com/ias-privacy-data-management/policies/site-indexing-policy/) diff --git a/lib/legitbot.rb b/lib/legitbot.rb index 957fb02..b9fc249 100644 --- a/lib/legitbot.rb +++ b/lib/legitbot.rb @@ -8,6 +8,7 @@ require_relative 'legitbot/apple' require_relative 'legitbot/baidu' require_relative 'legitbot/bing' +require_relative 'legitbot/dataforseo' require_relative 'legitbot/duckduckgo' require_relative 'legitbot/facebook' require_relative 'legitbot/google' diff --git a/lib/legitbot/dataforseo.rb b/lib/legitbot/dataforseo.rb new file mode 100644 index 0000000..3ce2da2 --- /dev/null +++ b/lib/legitbot/dataforseo.rb @@ -0,0 +1,10 @@ +# frozen_string_literal: true + +module Legitbot # :nodoc: + # https://dataforseo.com/dataforseo-bot + class DataForSEO < BotMatch + domains 'dataforseo.com.' 
+ end + + rule Legitbot::DataForSEO, %w[DataForSeoBot] +end diff --git a/test/dataforseo_test.rb b/test/dataforseo_test.rb new file mode 100644 index 0000000..cd105ca --- /dev/null +++ b/test/dataforseo_test.rb @@ -0,0 +1,60 @@ +# frozen_string_literal: true + +require_relative 'test_helper' + +class DataForSEOTest < Minitest::Test + include Minitest::Hooks + include DnsServerMock + + def test_malicious_ip + ip = '149.210.164.47' + match = Legitbot::DataForSEO.new ip + + refute_predicate match, :valid? + end + + def test_valid_ip + ip = '136.243.228.176' + match = Legitbot::DataForSEO.new ip + + assert_predicate match, :valid? + end + + def test_malicious_ua + bot = Legitbot.bot( + 'Mozilla/5.0 (compatible; DataForSeoBot; +https://dataforseo.com/dataforseo-bot)', + '149.210.164.47' + ) + + assert bot + refute_predicate bot, :valid? + end + + def test_valid_ua + bot = Legitbot.bot( + 'Mozilla/5.0 (compatible; DataForSeoBot; +https://dataforseo.com/dataforseo-bot)', + '136.243.228.176' + ) + + assert bot + assert_predicate bot, :valid? + end + + def test_valid_name + bot = Legitbot.bot( + 'Mozilla/5.0 (compatible; DataForSeoBot; +https://dataforseo.com/dataforseo-bot)', + '136.243.228.176' + ) + + assert_equal :dataforseo, bot.detected_as + end + + def test_fake_name + bot = Legitbot.bot( + 'Mozilla/5.0 (compatible; DataForSeoBot; +https://dataforseo.com/dataforseo-bot)', + '81.1.172.108' + ) + + assert_equal :dataforseo, bot.detected_as + end +end diff --git a/test/lib/dns_server_mock.rb b/test/lib/dns_server_mock.rb index fd0dc04..4d99eba 100644 --- a/test/lib/dns_server_mock.rb +++ b/test/lib/dns_server_mock.rb @@ -44,6 +44,14 @@ ptr: %w[17-58-98-60.applebot.apple.com] }, + # DataForSEO + 'crawling-gateway-136-243-228-176.dataforseo.com' => { + a: %w[136.243.228.176] + }, + '136.243.228.176' => { + ptr: %w[crawling-gateway-136-243-228-176.dataforseo.com] + }, + # Google 'crawl-66-249-64-141.googlebot.com' => { a: %w[66.249.64.141]