-
Notifications
You must be signed in to change notification settings - Fork 2
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #81 from cfpb/feature/wpull
Implement new crawler based on wpull
- Loading branch information
Showing
12 changed files
with
652 additions
and
20 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,84 @@ | ||
import os | ||
import os.path | ||
|
||
import djclick as click | ||
from wpull.application.builder import Builder | ||
from wpull.application.options import AppArgumentParser | ||
|
||
from crawler import wpull_plugin | ||
|
||
|
||
@click.command()
@click.argument("start_url")
@click.argument("db_filename", type=click.Path())
@click.option(
    "--max-pages", type=int, help="Maximum number of pages to crawl", default=0
)
@click.option("--depth", type=int, help="Maximum crawl depth", default=0)
@click.option(
    "--recreate",
    is_flag=True,
    show_default=True,
    default=False,
    help="Overwrite SQLite database if it already exists",
)
@click.option("--resume", is_flag=True)
def command(start_url, db_filename, max_pages, depth, recreate, resume):
    """Crawl a website to a SQLite database.

    Crawls recursively starting at ``start_url``, writing results into
    ``db_filename``. wpull keeps its own crawl-progress state in a sibling
    ``<db_filename>.wpull.db`` file so that an interrupted crawl can be
    continued with ``--resume``.

    Returns the wpull exit status (0 on success), which djclick propagates
    as the process exit code.
    """
    if os.path.exists(db_filename):
        if not recreate and not resume:
            raise click.ClickException(
                f"File {db_filename} already exists, "
                "use --recreate to recreate "
                "or --resume to resume a previous crawl."
            )

        if recreate:
            os.remove(db_filename)

    wpull_progress_filename = f"{db_filename}.wpull.db"
    click.echo(
        f"Storing crawl progress in {wpull_progress_filename}, use --resume to resume."
    )

    # Start with a clean slate unless the user explicitly asked to resume.
    # BUG FIX: the original called os.path.remove(), which does not exist
    # (AttributeError at runtime); the correct function is os.remove().
    if not resume and os.path.exists(wpull_progress_filename):
        os.remove(wpull_progress_filename)

    arg_parser = AppArgumentParser()
    args = arg_parser.parse_args(
        [
            start_url,
            "--quiet",
            "--recursive",
            "--delete-after",
            "--no-robots",
            "--wait=0.5",
            "--random-wait",
            "--dns-timeout=5",
            "--connect-timeout=5",
            "--read-timeout=30",
            "--session-timeout=30",
            "--span-hosts",
            "--link-extractors=html",
            "--follow-tags=a",
            "--user-agent=CFPB website indexer",
            "--no-check-certificate",
            f"--level={depth}",
            f"--plugin-script={wpull_plugin.__file__}",
            f"--plugin-args={db_filename},{max_pages}",
            f"--database={wpull_progress_filename}",
        ]
    )
    builder = Builder(args)
    app = builder.build()

    # This is required due to the use of async code in wpull. Unfortunately
    # wpull hooks aren't called in a way that allows us to wrap Django database
    # calls with sync_to_async. This is only safe because we only download one
    # URL at a time.
    # https://docs.djangoproject.com/en/3.2/topics/async/#async-safety
    os.environ["DJANGO_ALLOW_ASYNC_UNSAFE"] = "true"

    exit_status = app.run_sync()
    click.echo(f"done, exiting with status {exit_status}")
    return exit_status
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,110 @@ | ||
from operator import attrgetter | ||
from unittest.mock import patch | ||
|
||
import lxml.etree | ||
|
||
from django.test import SimpleTestCase | ||
|
||
from crawler.models import Error, Page, Redirect | ||
|
||
|
||
class PageTests(SimpleTestCase):
    """Unit tests for Page.from_html parsing of crawled HTML."""

    def test_from_html_no_title_returns_none(self):
        # A page without a <title> element yields no Page at all.
        markup = "<html><head></head><body>This page has no title.</body></html>"
        self.assertIsNone(
            Page.from_html("https://example.com/", markup, "example.com")
        )

    def check_from_html(self):
        # Shared assertions, exercised both with the default parser and with
        # the etree fallback parser (see test_from_html_etree_fallback_parser).
        html = """
<html lang="en">
    <head><title>Test page</title></head>
    <body>
        <script>Ignore me!</script>
        <div class="m-links">Links</div>
        <div><a href="/page/">A regular link on the same domain.</a></div>
        <div class="a-external-link">
            <a href="/external-site/?ext_url=https%3A%2F%2Fexample.org%2F">
                An external link pointing to another domain
            </a>
            <a href="/external-site/">
                An external link missing its target
            </a>
            <a href="https://example.org/external-site/">
                A link on another domain that also uses /external-site/
            </a>
        </div>
    </body>
</html>
""".strip()

        page = Page.from_html("https://example.com/", html, "example.com")

        # <script> content is excluded from the extracted text.
        expected_text = (
            "Links "
            "A regular link on the same domain. "
            "An external link pointing to another domain "
            "An external link missing its target "
            "A link on another domain that also uses /external-site/"
        )

        self.assertEqual(str(page), "https://example.com/")
        self.assertEqual(page.title, "Test page")
        self.assertEqual(page.language, "en")
        self.assertEqual(page.html, html)
        self.assertEqual(page.text, expected_text)
        self.assertCountEqual(
            page.components.values_list("class_name", flat=True),
            ["a-external-link", "m-links"],
        )
        self.assertCountEqual(
            page.links.values_list("href", flat=True),
            [
                "/external-site/",
                "/page/",
                "https://example.org/",
                "https://example.org/external-site/",
            ],
        )

    def test_from_html(self):
        self.check_from_html()

    def test_from_html_etree_fallback_parser(self):
        # Force the primary parser to fail so the fallback code path runs.
        with patch(
            "lxml.html.fromstring",
            side_effect=lxml.etree.ParserError("testing parser error"),
        ):
            self.check_from_html()

    def test_from_html_no_body(self):
        # NOTE(review): the <title> tag is deliberately left unclosed here;
        # the parser is expected to recover the title anyway.
        html = '<html lang="en"><head><title>Test page with no body</head></html>'
        page = Page.from_html("https://example.com/", html, "example.com")
        self.assertEqual(str(page), "https://example.com/")
        self.assertEqual(page.title, "Test page with no body")
        self.assertEqual(page.language, "en")
        self.assertEqual(page.html, html)
        self.assertIsNone(page.text)
|
||
|
||
class ErrorTests(SimpleTestCase):
    """String representations of crawl errors and redirects."""

    def test_error_str(self):
        error = Error(url="/not-found/", status_code=404)
        self.assertEqual(str(error), "/not-found/ 404 !")

    def test_error_str_with_referrer(self):
        # Redirect formatting includes the referrer and the target location.
        redirect = Redirect(
            url="/redirect/",
            referrer="/source/",
            status_code=301,
            location="/destination/",
        )
        self.assertEqual(
            str(redirect),
            "/redirect/ (from /source/) 301 -> /destination/",
        )
Oops, something went wrong.