diff --git a/CHANGELOG.md b/CHANGELOG.md index 65a2b2b..82dee02 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,10 @@ All notable changes to `mixed-content-scanner-cli` will be documented in this file +## 1.2.0 - 2018-03-22 + +- Add `--ignore-robots` option. + ## 1.1.2 - 2018-03-02 - update `spatie/mixed-content-scanner` to 3.0 diff --git a/README.md b/README.md index 5a5eb7d..31cbc63 100644 --- a/README.md +++ b/README.md @@ -100,6 +100,14 @@ Of course you can also combine filters and ignores: mixed-content-scanner scan https://spatie.be --filter="^\/en" --ignore="opensource" ``` +### Ignoring robots + +By default, the crawler will respect robots data. You can ignore them, though, with the `--ignore-robots` option. + +```bash +mixed-content-scanner scan https://example.com --ignore-robots +``` + ## Changelog Please see [CHANGELOG](CHANGELOG.md) for more information what has changed recently. diff --git a/composer.json b/composer.json index d920397..027c330 100644 --- a/composer.json +++ b/composer.json @@ -21,7 +21,7 @@ ], "require": { "php": "^7.1", - "spatie/mixed-content-scanner": "^3.0", + "spatie/mixed-content-scanner": "^3.1", "symfony/console": "^4.0", "symfony/var-dumper": "^4.0" }, diff --git a/src/ScanCommand.php b/src/ScanCommand.php index 2f48972..592fbb8 100644 --- a/src/ScanCommand.php +++ b/src/ScanCommand.php @@ -3,6 +3,7 @@ namespace Spatie\MixedContentScannerCli; use GuzzleHttp\RequestOptions; +use Spatie\Crawler\Crawler; use Symfony\Component\Console\Command\Command; use Symfony\Component\Console\Input\InputOption; use Symfony\Component\Console\Style\SymfonyStyle; @@ -21,6 +22,7 @@ protected function configure() ->addArgument('url', InputArgument::REQUIRED, 'Which argument do you want to scan') ->addOption('filter', null, InputOption::VALUE_REQUIRED | InputOption::VALUE_IS_ARRAY, 'urls whose path pass the regex will be scanned') ->addOption('ignore', null, InputOption::VALUE_REQUIRED | InputOption::VALUE_IS_ARRAY, 'urls whose path pass the 
regex will not be scanned') + ->addOption('ignore-robots', null, InputOption::VALUE_NONE, 'Ignore robots.txt, robots meta tags and -headers.') ->addOption('verify-ssl', null, InputOption::VALUE_NONE, 'Verify the craweld urls have a valid certificate. If they do not an empty response will be the result of the crawl'); } @@ -40,7 +42,14 @@ protected function execute(InputInterface $input, OutputInterface $output) $input->getOption('ignore') ); + $ignoreRobots = $input->getOption('ignore-robots'); + (new MixedContentScanner($mixedContentLogger)) + ->configureCrawler(function (Crawler $crawler) use ($ignoreRobots) { + if ($ignoreRobots) { + $crawler->ignoreRobots(); + } + }) ->setCrawlProfile($crawlProfile) ->scan($scanUrl, $this->getClientOptions($input)); }