Skip to content

Commit

Permalink
Add --ignore-robots option.
Browse files Browse the repository at this point in the history
  • Loading branch information
brendt committed May 22, 2018
1 parent d1745d5 commit e754a67
Show file tree
Hide file tree
Showing 4 changed files with 22 additions and 1 deletion.
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,10 @@

All notable changes to `mixed-content-scanner-cli` will be documented in this file

## 1.2.0 - 2018-05-22

- Add `--ignore-robots` option.

## 1.1.2 - 2018-03-02

- update `spatie/mixed-content-scanner` to 3.0
Expand Down
8 changes: 8 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,14 @@ Of course you can also combine filters and ignores:
mixed-content-scanner scan https://spatie.be --filter="^\/en" --ignore="opensource"
```

### Ignoring robots

By default, the crawler will respect robots data (robots.txt, robots meta tags and headers). You can ignore that data with the `--ignore-robots` option.

```bash
mixed-content-scanner scan https://example.com --ignore-robots
```

## Changelog

Please see [CHANGELOG](CHANGELOG.md) for more information what has changed recently.
Expand Down
2 changes: 1 addition & 1 deletion composer.json
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
],
"require": {
"php": "^7.1",
"spatie/mixed-content-scanner": "^3.0",
"spatie/mixed-content-scanner": "^3.1",
"symfony/console": "^4.0",
"symfony/var-dumper": "^4.0"
},
Expand Down
9 changes: 9 additions & 0 deletions src/ScanCommand.php
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
namespace Spatie\MixedContentScannerCli;

use GuzzleHttp\RequestOptions;
use Spatie\Crawler\Crawler;
use Symfony\Component\Console\Command\Command;
use Symfony\Component\Console\Input\InputOption;
use Symfony\Component\Console\Style\SymfonyStyle;
Expand All @@ -21,6 +22,7 @@ protected function configure()
->addArgument('url', InputArgument::REQUIRED, 'Which argument do you want to scan')
->addOption('filter', null, InputOption::VALUE_REQUIRED | InputOption::VALUE_IS_ARRAY, 'urls whose path pass the regex will be scanned')
->addOption('ignore', null, InputOption::VALUE_REQUIRED | InputOption::VALUE_IS_ARRAY, 'urls whose path pass the regex will not be scanned')
->addOption('ignore-robots', null, InputOption::VALUE_NONE, 'Ignore robots.txt, robots meta tags and -headers.')
->addOption('verify-ssl', null, InputOption::VALUE_NONE, 'Verify the craweld urls have a valid certificate. If they do not an empty response will be the result of the crawl');
}

Expand All @@ -40,7 +42,14 @@ protected function execute(InputInterface $input, OutputInterface $output)
$input->getOption('ignore')
);

$ignoreRobots = $input->getOption('ignore-robots');

(new MixedContentScanner($mixedContentLogger))
->configureCrawler(function (Crawler $crawler) use ($ignoreRobots) {
if ($ignoreRobots) {
$crawler->ignoreRobots();
}
})
->setCrawlProfile($crawlProfile)
->scan($scanUrl, $this->getClientOptions($input));
}
Expand Down

0 comments on commit e754a67

Please sign in to comment.