From fb719a606fcdd96998162508fdedbf9e67630e1b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Lautaro=20Nahuel=20De=20Le=C3=B3n?=
Date: Mon, 8 Jun 2015 23:39:57 -0300
Subject: [PATCH] Added a filter-by-css option to Varnisher::Spider

This can be used to crawl a specific area of a page instead of the
entire page, in order to optimise the Varnish warm-up strategy.
---
 bin/varnisher           |   5 ++
 lib/varnisher/spider.rb |   4 +-
 test/data/spider.html   |   4 ++
 test/spider_test.rb     | 112 ++++++++++++++++++++++++++++-------------
 4 files changed, 88 insertions(+), 37 deletions(-)

diff --git a/bin/varnisher b/bin/varnisher
index 58f9e25..f049367 100755
--- a/bin/varnisher
+++ b/bin/varnisher
@@ -93,6 +93,11 @@ Main {
     description 'When given, /foo?foo=bar and /foo?foo=baz will be treated as the same resource.'
   }
 
+  option('f', 'filter-by-css') {
+    argument :optional
+    description 'When given, only the area of the page matching this CSS selector will be crawled for links.'
+  }
+
   def run
     target = params['target'].value
 
diff --git a/lib/varnisher/spider.rb b/lib/varnisher/spider.rb
index 494240a..a6b7db5 100644
--- a/lib/varnisher/spider.rb
+++ b/lib/varnisher/spider.rb
@@ -121,7 +121,9 @@ def find_links(doc, uri)
     #
     # @return [Array] An array of strings
     def get_anchors(doc)
-      doc.xpath('//a[@href]').map { |e| e['href'] }
+      css_selectors = Varnisher.options['filter-by-css'] || 'html'
+      xpath_selectors = css_selectors != 'html' ? 'a[@href]' : '//a[@href]'
+      doc.css(css_selectors).xpath(xpath_selectors).map { |e| e['href'] }
     end
 
     # Given an HTML document, will return all the URLs that exist in
diff --git a/test/data/spider.html b/test/data/spider.html
index 4b84c12..308b127 100644
--- a/test/data/spider.html
+++ b/test/data/spider.html
@@ -12,5 +12,9 @@
+  <div class="important-container">
+    <a href="sir">Page-relative link</a>
+    <a href="/zar">Hostname-relative link</a>
+  </div>
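Note for reviewers: below is a rough, standalone Nokogiri sketch of the scoping behaviour that get_anchors gains with this patch. It is not part of the change itself; the HTML, selector and hrefs in it are made up for illustration.

    require 'nokogiri'

    html = <<-HTML
      <html><body>
        <a href="/outside">Outside</a>
        <div class="important-container">
          <a href="/inside">Inside</a>
        </div>
      </body></html>
    HTML

    doc = Nokogiri::HTML(html)

    # No filter: the 'html' default plus an absolute XPath matches every anchor.
    puts doc.css('html').xpath('//a[@href]').map { |e| e['href'] }.inspect
    # => ["/outside", "/inside"]

    # With a filter: only anchors inside the matching element are returned.
    puts doc.css('div.important-container').xpath('a[@href]').map { |e| e['href'] }.inspect
    # => ["/inside"]

One consequence of the relative 'a[@href]' expression is that only anchors that are direct children of the matched element are picked up; anchors nested deeper inside the filtered area would need './/a[@href]'. The test fixture below keeps its links as direct children of the filtered div, so the tests are unaffected.
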
diff --git a/test/spider_test.rb b/test/spider_test.rb
index 9c7337d..75b509c 100644
--- a/test/spider_test.rb
+++ b/test/spider_test.rb
@@ -7,56 +7,96 @@
         :body => File.new(File.dirname(__FILE__) + "/data/spider.html"),
         :status => 200
       )
+  end
 
-    Varnisher.options = { 'quiet' => true }
+  describe "with standard options" do
+    before do
+      Varnisher.options = { 'quiet' => true }
 
-    @spider = Varnisher::Spider.new('http://www.example.com/foo')
-    @spider.crawl_page(URI.parse('http://www.example.com/foo'))
-  end
+      @spider = Varnisher::Spider.new('http://www.example.com/foo')
+      @spider.crawl_page(URI.parse('http://www.example.com/foo'))
+    end
 
-  it "visits the first page" do
-    assert @spider.visited.include?('http://www.example.com/foo')
-  end
+    it "visits the first page" do
+      assert @spider.visited.include?('http://www.example.com/foo')
+    end
 
-  it "extracts page-relative links" do
-    assert_includes @spider.to_visit, URI.parse('http://www.example.com/bar')
-  end
+    it "extracts page-relative links" do
+      assert_includes @spider.to_visit, URI.parse('http://www.example.com/bar')
+    end
 
-  it "extracts hostname-relative links" do
-    assert_includes @spider.to_visit, URI.parse('http://www.example.com/baz')
-  end
+    it "extracts hostname-relative links" do
+      assert_includes @spider.to_visit, URI.parse('http://www.example.com/baz')
+    end
 
-  it "extracts absolute URLs" do
-    assert_includes @spider.to_visit, URI.parse('http://www.example.com/foo/bar')
-  end
+    it "extracts absolute URLs" do
+      assert_includes @spider.to_visit, URI.parse('http://www.example.com/foo/bar')
+    end
 
-  it "ignores URLs on different hostnames" do
-    refute_includes @spider.to_visit, URI.parse('http://www.example.net/foo')
-  end
+    it "ignores URLs on different hostnames" do
+      refute_includes @spider.to_visit, URI.parse('http://www.example.net/foo')
+    end
 
-  it "reads URLs from comments" do
-    assert_includes @spider.to_visit, URI.parse('http://www.example.com/commented')
-  end
+    it "reads URLs from comments" do
+      assert_includes @spider.to_visit, URI.parse('http://www.example.com/commented')
+    end
+
+    it "ignores external URLs in comments" do
+      refute_includes @spider.to_visit, URI.parse('http://www.example.net/commented')
+    end
+
+    it "crawls all queued pages" do
+      stub_request(:any, /www.example.com.*/)
+        .to_return(:status => 200)
 
-  it "ignores external URLs in comments" do
-    refute_includes @spider.to_visit, URI.parse('http://www.example.net/commented')
+      @spider.run
+
+      expected_urls = [
+        'http://www.example.com/foo',
+        'http://www.example.com/bar',
+        'http://www.example.com/baz',
+        'http://www.example.com/foo/bar',
+      ]
+
+      expected_urls.each do |url|
+        assert_requested :get, url
+      end
+    end
   end
 
-  it "crawls all queued pages" do
-    stub_request(:any, /www.example.com.*/)
-      .to_return(:status => 200)
+  describe 'with options' do
+    describe 'with css-selectors' do
+      before do
+        Varnisher.options = { 'quiet' => true, 'filter-by-css' => 'div.important-container' }
+
+        @spider = Varnisher::Spider.new('http://www.example.com/foo')
+        @spider.crawl_page(URI.parse('http://www.example.com/foo'))
+      end
+
+      it 'extracts urls inside the css class selector (includes urls from comments)' do
+        expected_urls = [
+          URI.parse('http://www.example.com/sir'),
+          URI.parse('http://www.example.com/zar'),
+          URI.parse('http://www.example.com/commented')
+        ]
 
-    @spider.run
+        expected_urls.each do |url|
+          assert_includes @spider.to_visit, url
+        end
+      end
 
-    expected_urls = [
-      'http://www.example.com/foo',
-      'http://www.example.com/bar',
-      'http://www.example.com/baz',
-      'http://www.example.com/foo/bar',
-    ]
+      it 'ignores urls outside the css class selector' do
+        non_expected_urls = [
+          URI.parse('http://www.example.com/bar'),
+          URI.parse('http://www.example.com/baz'),
+          URI.parse('http://www.example.com/foo/bar'),
+          URI.parse('http://www.example.net/foo')
+        ]
 
-    expected_urls.each do |url|
-      assert_requested :get, url
+        non_expected_urls.each do |url|
+          refute_includes @spider.to_visit, url
+        end
+      end
     end
   end
 end
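
For completeness, a rough usage sketch, again not part of the patch: driving the spider from Ruby with the new option set, mirroring the new test setup above. The require lines are assumptions; everything else follows the tests.

    require 'uri'
    require 'varnisher'

    Varnisher.options = { 'quiet' => true, 'filter-by-css' => 'div.important-container' }

    spider = Varnisher::Spider.new('http://www.example.com/foo')
    spider.crawl_page(URI.parse('http://www.example.com/foo'))

    # spider.to_visit now only holds links found inside div.important-container,
    # plus any URLs read from HTML comments, which are not filtered.
    spider.run  # performs the actual warm-up crawl of the queued URLs

From the command line the same behaviour is exposed through the new -f/--filter-by-css option added to bin/varnisher.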