diff --git a/bin/varnisher b/bin/varnisher
index 58f9e25..f049367 100755
--- a/bin/varnisher
+++ b/bin/varnisher
@@ -93,6 +93,11 @@ Main {
     description 'When given, /foo?foo=bar and /foo?foo=baz will be treated as the same resource.'
   }
 
+  option('f', 'filter-by-css') {
+    argument :optional
+    description 'When given, it will be used to crawl some specific area of the page.'
+  }
+
   def run
     target = params['target'].value
 
diff --git a/lib/varnisher/spider.rb b/lib/varnisher/spider.rb
index 494240a..a6b7db5 100644
--- a/lib/varnisher/spider.rb
+++ b/lib/varnisher/spider.rb
@@ -121,7 +121,11 @@ def find_links(doc, uri)
     #
     # @return [Array] An array of strings
     def get_anchors(doc)
-      doc.xpath('//a[@href]').map { |e| e['href'] }
+      # Restrict the crawl to the nodes matched by --filter-by-css; with no
+      # filter, fall back to the root <html> element (the whole page).
+      css_selectors = Varnisher.options['filter-by-css'] || 'html'
+      # './/' walks every descendant of each matched node; a bare 'a[@href]'
+      # would only see anchors that are direct children of the match.
+      doc.css(css_selectors).xpath('.//a[@href]').map { |e| e['href'] }
     end
 
     # Given an HTML document, will return all the URLs that exist in
diff --git a/test/data/spider.html b/test/data/spider.html
index 4b84c12..308b127 100644
--- a/test/data/spider.html
+++ b/test/data/spider.html
@@ -12,5 +12,9 @@
+