Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Added a filter-by-css option in Varnisher::Spider #19

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions bin/varnisher
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,11 @@ Main {
description 'When given, /foo?foo=bar and /foo?foo=baz will be treated as the same resource.'
}

# Optional CSS selector used to restrict crawling. When supplied, the
# spider only follows anchors found inside elements matching this
# selector; when omitted, the entire page is scanned.
# Consumed via Varnisher.options['filter-by-css'] in the spider.
option('f', 'filter-by-css') {
argument :optional
description 'When given, it will be used to crawl some specific area of the page.'
}

def run
target = params['target'].value

Expand Down
4 changes: 3 additions & 1 deletion lib/varnisher/spider.rb
Original file line number Diff line number Diff line change
Expand Up @@ -121,7 +121,9 @@ def find_links(doc, uri)
#
# @return [Array] An array of strings
def get_anchors(doc)
  # Scope defaults to the document root ('html') so that, without a
  # filter, every anchor on the page is considered.
  scope = Varnisher.options['filter-by-css'] || 'html'

  # './/a[@href]' selects anchors at *any depth within* the scoped
  # nodes. A bare relative 'a[@href]' would match only direct children
  # (missing nested anchors inside the filtered container), while an
  # absolute '//a[@href]' would search the whole document and escape
  # the CSS filter entirely.
  doc.css(scope).xpath('.//a[@href]').map { |e| e['href'] }
end

# Given an HTML document, will return all the URLs that exist in
Expand Down
4 changes: 4 additions & 0 deletions test/data/spider.html
Original file line number Diff line number Diff line change
Expand Up @@ -12,5 +12,9 @@

<!-- http://www.example.com/commented -->
<!-- http://www.example.net/commented -->
<div class="important-container">
<a href="sir">Page-relative link</a>
<a href="/zar">Hostname-relative link</a>
</div>
</body>
</html>
112 changes: 76 additions & 36 deletions test/spider_test.rb
Original file line number Diff line number Diff line change
Expand Up @@ -7,56 +7,96 @@
:body => File.new(File.dirname(__FILE__) + "/data/spider.html"),
:status => 200
)
end

Varnisher.options = { 'quiet' => true }
describe "with standard options" do
before do
Varnisher.options = { 'quiet' => true }

@spider = Varnisher::Spider.new('http://www.example.com/foo')
@spider.crawl_page(URI.parse('http://www.example.com/foo'))
end
@spider = Varnisher::Spider.new('http://www.example.com/foo')
@spider.crawl_page(URI.parse('http://www.example.com/foo'))
end

it "visits the first page" do
assert @spider.visited.include?('http://www.example.com/foo')
end
it "visits the first page" do
assert @spider.visited.include?('http://www.example.com/foo')
end

it "extracts page-relative links" do
assert_includes @spider.to_visit, URI.parse('http://www.example.com/bar')
end
it "extracts page-relative links" do
assert_includes @spider.to_visit, URI.parse('http://www.example.com/bar')
end

it "extracts hostname-relative links" do
assert_includes @spider.to_visit, URI.parse('http://www.example.com/baz')
end
it "extracts hostname-relative links" do
assert_includes @spider.to_visit, URI.parse('http://www.example.com/baz')
end

it "extracts absolute URLs" do
assert_includes @spider.to_visit, URI.parse('http://www.example.com/foo/bar')
end
it "extracts absolute URLs" do
assert_includes @spider.to_visit, URI.parse('http://www.example.com/foo/bar')
end

it "ignores URLs on different hostnames" do
refute_includes @spider.to_visit, URI.parse('http://www.example.net/foo')
end
it "ignores URLs on different hostnames" do
refute_includes @spider.to_visit, URI.parse('http://www.example.net/foo')
end

it "reads URLs from comments" do
assert_includes @spider.to_visit, URI.parse('http://www.example.com/commented')
end
it "reads URLs from comments" do
assert_includes @spider.to_visit, URI.parse('http://www.example.com/commented')
end

it "ignores external URLs in comments" do
refute_includes @spider.to_visit, URI.parse('http://www.example.net/commented')
end

it "crawls all queued pages" do
stub_request(:any, /www.example.com.*/)
.to_return(:status => 200)

it "ignores external URLs in comments" do
refute_includes @spider.to_visit, URI.parse('http://www.example.net/commented')
@spider.run

expected_urls = [
'http://www.example.com/foo',
'http://www.example.com/bar',
'http://www.example.com/baz',
'http://www.example.com/foo/bar',
]

expected_urls.each do |url|
assert_requested :get, url
end
end
end

it "crawls all queued pages" do
stub_request(:any, /www.example.com.*/)
.to_return(:status => 200)
describe 'with options' do
describe 'with css-selectors' do
before do
Varnisher.options = { 'quiet' => true, 'filter-by-css' => 'div.important-container' }

@spider = Varnisher::Spider.new('http://www.example.com/foo')
@spider.crawl_page(URI.parse('http://www.example.com/foo'))
end

it 'extracts urls inside the css class selector (includes urls from comments)' do
expected_urls = [
URI.parse('http://www.example.com/sir'),
URI.parse('http://www.example.com/zar'),
URI.parse('http://www.example.com/commented')
]

@spider.run
expected_urls.each do |url|
assert_includes @spider.to_visit, url
end
end

expected_urls = [
'http://www.example.com/foo',
'http://www.example.com/bar',
'http://www.example.com/baz',
'http://www.example.com/foo/bar',
]
it 'ignores urls outside the css class selector' do
non_expected_urls = [
URI.parse('http://www.example.com/bar'),
URI.parse('http://www.example.com/baz'),
URI.parse('http://www.example.com/foo/bar'),
URI.parse('http://www.example.net/foo')
]

expected_urls.each do |url|
assert_requested :get, url
non_expected_urls.each do |url|
refute_includes @spider.to_visit, url
end
end
end
end
end