Merge pull request #81 from cfpb/feature/wpull
Implement new crawler based on wpull
chosak authored Nov 2, 2023
2 parents af5d01c + bf51eb8 commit bcd66f0
Showing 12 changed files with 652 additions and 20 deletions.
4 changes: 2 additions & 2 deletions README.md
@@ -166,7 +166,7 @@ yarn build
Create a Python virtual environment and install required packages:

```
-python3.8 -m venv venv
+python3.6 -m venv venv
source venv/bin/activate
pip install -r requirements/base.txt
```
@@ -248,7 +248,7 @@ under the `/sample/src` subdirectory.
To regenerate these files, first serve the sample website locally:

```
-python -m http.server -d ./sample/src
+cd ./sample/src && python -m http.server
```

This starts the sample website running at http://localhost:8000.
84 changes: 84 additions & 0 deletions crawler/management/commands/crawl.py
@@ -0,0 +1,84 @@
import os
import os.path

import djclick as click
from wpull.application.builder import Builder
from wpull.application.options import AppArgumentParser

from crawler import wpull_plugin


@click.command()
@click.argument("start_url")
@click.argument("db_filename", type=click.Path())
@click.option(
    "--max-pages", type=int, help="Maximum number of pages to crawl", default=0
)
@click.option("--depth", type=int, help="Maximum crawl depth", default=0)
@click.option(
    "--recreate",
    is_flag=True,
    show_default=True,
    default=False,
    help="Overwrite SQLite database if it already exists",
)
@click.option("--resume", is_flag=True)
def command(start_url, db_filename, max_pages, depth, recreate, resume):
"""Crawl a website to a SQLite database."""
if os.path.exists(db_filename):
if not recreate and not resume:
raise click.ClickException(
f"File {db_filename} already exists, "
"use --recreate to recreate "
"or --resume to resume a previous crawl."
)

if recreate:
os.remove(db_filename)

wpull_progress_filename = f"{db_filename}.wpull.db"
click.echo(
f"Storing crawl progress in {wpull_progress_filename}, use --resume to resume."
)

if not resume and os.path.exists(wpull_progress_filename):
os.path.remove(wpull_progress_filename)

arg_parser = AppArgumentParser()
args = arg_parser.parse_args(
[
start_url,
"--quiet",
"--recursive",
"--delete-after",
"--no-robots",
"--wait=0.5",
"--random-wait",
"--dns-timeout=5",
"--connect-timeout=5",
"--read-timeout=30",
"--session-timeout=30",
"--span-hosts",
"--link-extractors=html",
"--follow-tags=a",
"--user-agent=CFPB website indexer",
"--no-check-certificate",
f"--level={depth}",
f"--plugin-script={wpull_plugin.__file__}",
f"--plugin-args={db_filename},{max_pages}",
f"--database={wpull_progress_filename}",
]
)
builder = Builder(args)
app = builder.build()

# This is required due to the use of async code in wpull. Unfortunately
# wpull hooks aren't called in a way that allows us to wrap Django database
# calls with sync_to_async. This is only safe because we only download one
# URL at a time.
# https://docs.djangoproject.com/en/3.2/topics/async/#async-safety
os.environ["DJANGO_ALLOW_ASYNC_UNSAFE"] = "true"

exit_status = app.run_sync()
click.echo(f"done, exiting with status {exit_status}")
return exit_status
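
For orientation, djclick exposes this as a regular Django management command, so a crawl might be started along these lines; the start URL, database filename, and option values below are placeholders for illustration, not values taken from this change:

```
python manage.py crawl https://example.com/ crawl.sqlite3 --max-pages=100 --depth=2

# An interrupted crawl against the same database can be picked up later with:
python manage.py crawl https://example.com/ crawl.sqlite3 --resume
```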
121 changes: 120 additions & 1 deletion crawler/models.py
@@ -1,4 +1,10 @@
import lxml.etree
import lxml.html.soupparser
import re
from urllib import parse

from django.db import models
from django.utils import timezone

from modelcluster.models import ClusterableModel
from modelcluster.fields import ParentalManyToManyField
@@ -35,6 +41,105 @@ class Page(Request, ClusterableModel):
    components = ParentalManyToManyField(Component, related_name="pages")
    links = ParentalManyToManyField(Link, related_name="links")

    def __str__(self):
        return self.url

    HTML_COMPONENT_SEARCH = re.compile(r"(?:(?:class=\")|\s)((?:o|m|a)-[\w\-]*)")
    HTML_EXTERNAL_SITE = re.compile("/external-site/")
    HTML_WHITESPACE = re.compile(r"\s+")

    @classmethod
    def from_html(
        cls,
        url,
        html,
        internal_link_host,
    ):
        try:
            tree = lxml.html.fromstring(html)
        except lxml.etree.ParserError:
            # https://bugs.launchpad.net/lxml/+bug/1949271
            tree = lxml.html.soupparser.fromstring(html)

        title_tag = tree.find(".//title")
        title = title_tag.text.strip() if title_tag is not None else None
        language = tree.find(".").get("lang")

        if title is None:
            return

        body = cls._get_cleaned_body_from_tree(tree)

        if body is not None:
            text = cls.HTML_WHITESPACE.sub(" ", body.text_content()).strip()
        else:
            text = None

        page = Page(
            timestamp=timezone.now(),
            url=url,
            title=title,
            language=language,
            html=html,
            text=text,
        )

        if body is None:
            return page

        hrefs = list(
            set(
                href
                for element, attribute, href, pos in body.iterlinks()
                if "a" == element.tag and "href" == attribute
            )
        )

        # Remove any external link URL wrapping.
        for i, href in enumerate(hrefs):
            parsed_href = parse.urlparse(href)
            if not cls.HTML_EXTERNAL_SITE.match(parsed_href.path):
                continue

            if parsed_href.netloc and internal_link_host != parsed_href.netloc:
                continue

            ext_url = parse.parse_qs(parsed_href.query).get("ext_url")
            if ext_url:
                hrefs[i] = ext_url[0]

        page.links = [Link(href=href) for href in sorted(hrefs)]

        body_html = lxml.etree.tostring(body, encoding="unicode")

        class_names = set(cls.HTML_COMPONENT_SEARCH.findall(body_html))
        page.components = [
            Component(class_name=class_name) for class_name in sorted(class_names)
        ]

        return page

    @staticmethod
    def _get_cleaned_body_from_tree(tree):
        """Extract page body without header, footer, images, or scripts."""
        body = tree.find("./body")

        if body is not None:
            drop_element_selectors = [
                ".o-header",
                ".o-footer",
                ".skip-nav",
                "img",
                "script",
                "style",
            ]

            for drop_element_selector in drop_element_selectors:
                for element in body.cssselect(drop_element_selector):
                    element.drop_tree()

        return body


class ErrorBase(Request):
    status_code = models.PositiveIntegerField(db_index=True)
@@ -43,10 +148,24 @@ class ErrorBase(Request):
    class Meta(Request.Meta):
        abstract = True

    def __str__(self):
        s = self.url

        if self.referrer:
            s += f" (from {self.referrer})"

        s += f" {self.status_code}"

        return s


class Error(ErrorBase):
-    pass
+    def __str__(self):
+        return super().__str__() + " !"


class Redirect(ErrorBase):
    location = models.TextField(db_index=True)

    def __str__(self):
        return super().__str__() + f" -> {self.location}"
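
As a rough usage sketch of the new `Page.from_html` classmethod: the URL, hostname, and markup below are invented for illustration, and the tests added later in this change exercise the same behavior more thoroughly:

```python
from crawler.models import Page

# Minimal, illustrative input; a real crawl passes the fetched page's URL and body.
html = (
    '<html lang="en"><head><title>About us</title></head>'
    '<body><div class="m-hero"><a href="/careers/">Careers</a></div></body></html>'
)

page = Page.from_html("https://example.com/about/", html, "example.com")

print(page.title)     # "About us"
print(page.language)  # "en"
print(page.text)      # "Careers" (header, footer, script, and style content is dropped)
# page.links and page.components are populated from the parsed <body>.
```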
Empty file added crawler/tests/__init__.py
Empty file.
110 changes: 110 additions & 0 deletions crawler/tests/test_models.py
@@ -0,0 +1,110 @@
from operator import attrgetter
from unittest.mock import patch

import lxml.etree

from django.test import SimpleTestCase

from crawler.models import Error, Page, Redirect


class PageTests(SimpleTestCase):
    def test_from_html_no_title_returns_none(self):
        self.assertIsNone(
            Page.from_html(
                "https://example.com/",
                "<html><head></head><body>This page has no title.</body></html>",
                "example.com",
            )
        )

    def check_from_html(self):
        html = """
<html lang="en">
<head><title>Test page</title></head>
<body>
<script>Ignore me!</script>
<div class="m-links">Links</div>
<div><a href="/page/">A regular link on the same domain.</a></div>
<div class="a-external-link">
<a href="/external-site/?ext_url=https%3A%2F%2Fexample.org%2F">
An external link pointing to another domain
</a>
<a href="/external-site/">
An external link missing its target
</a>
<a href="https://example.org/external-site/">
A link on another domain that also uses /external-site/
</a>
</div>
</body>
</html>
        """.strip()

        page = Page.from_html("https://example.com/", html, "example.com")
        self.assertEqual(str(page), "https://example.com/")
        self.assertEqual(page.title, "Test page")
        self.assertEqual(page.language, "en")
        self.assertEqual(page.html, html)
        self.assertEqual(
            page.text,
            (
                "Links "
                "A regular link on the same domain. "
                "An external link pointing to another domain "
                "An external link missing its target "
                "A link on another domain that also uses /external-site/"
            ),
        )
        self.assertCountEqual(
            page.components.values_list("class_name", flat=True),
            ["a-external-link", "m-links"],
        )
        self.assertCountEqual(
            page.links.values_list("href", flat=True),
            [
                "/external-site/",
                "/page/",
                "https://example.org/",
                "https://example.org/external-site/",
            ],
        )

    def test_from_html(self):
        self.check_from_html()

    def test_from_html_etree_fallback_parser(self):
        with patch(
            "lxml.html.fromstring",
            side_effect=lxml.etree.ParserError("testing parser error"),
        ):
            self.check_from_html()

    def test_from_html_no_body(self):
        html = '<html lang="en"><head><title>Test page with no body</head></html>'
        page = Page.from_html("https://example.com/", html, "example.com")
        self.assertEqual(str(page), "https://example.com/")
        self.assertEqual(page.title, "Test page with no body")
        self.assertEqual(page.language, "en")
        self.assertEqual(page.html, html)
        self.assertIsNone(page.text)


class ErrorTests(SimpleTestCase):
    def test_error_str(self):
        self.assertEqual(
            str(Error(url="/not-found/", status_code=404)), "/not-found/ 404 !"
        )

    def test_error_str_with_referrer(self):
        self.assertEqual(
            str(
                Redirect(
                    url="/redirect/",
                    referrer="/source/",
                    status_code=301,
                    location="/destination/",
                )
            ),
            "/redirect/ (from /source/) 301 -> /destination/",
        )
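
These tests use Django's `SimpleTestCase`, so no database setup is needed. Assuming the project's standard Django test runner (the exact invocation is an assumption, not part of this change), something like the following would exercise them:

```
python manage.py test crawler
```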