Merge pull request #81 from cfpb/feature/wpull
Implement new crawler based on wpull
chosak authored Nov 2, 2023
2 parents af5d01c + bf51eb8 commit bcd66f0
Showing 12 changed files with 652 additions and 20 deletions.
4 changes: 2 additions & 2 deletions README.md
@@ -166,7 +166,7 @@ yarn build
Create a Python virtual environment and install required packages:

```
-python3.8 -m venv venv
+python3.6 -m venv venv
source venv/bin/activate
pip install -r requirements/base.txt
```
@@ -248,7 +248,7 @@ under the `/sample/src` subdirectory.
To regenerate these files, first serve the sample website locally:

```
-python -m http.server -d ./sample/src
+cd ./sample/src && python -m http.server
```

This starts the sample website running at http://localhost:8000.
84 changes: 84 additions & 0 deletions crawler/management/commands/crawl.py
@@ -0,0 +1,84 @@
import os
import os.path

import djclick as click
from wpull.application.builder import Builder
from wpull.application.options import AppArgumentParser

from crawler import wpull_plugin


@click.command()
@click.argument("start_url")
@click.argument("db_filename", type=click.Path())
@click.option(
    "--max-pages", type=int, help="Maximum number of pages to crawl", default=0
)
@click.option("--depth", type=int, help="Maximum crawl depth", default=0)
@click.option(
    "--recreate",
    is_flag=True,
    show_default=True,
    default=False,
    help="Overwrite SQLite database if it already exists",
)
@click.option("--resume", is_flag=True)
def command(start_url, db_filename, max_pages, depth, recreate, resume):
"""Crawl a website to a SQLite database."""
if os.path.exists(db_filename):
if not recreate and not resume:
raise click.ClickException(
f"File {db_filename} already exists, "
"use --recreate to recreate "
"or --resume to resume a previous crawl."
)

if recreate:
os.remove(db_filename)

wpull_progress_filename = f"{db_filename}.wpull.db"
click.echo(
f"Storing crawl progress in {wpull_progress_filename}, use --resume to resume."
)

if not resume and os.path.exists(wpull_progress_filename):
os.path.remove(wpull_progress_filename)

arg_parser = AppArgumentParser()
args = arg_parser.parse_args(
[
start_url,
"--quiet",
"--recursive",
"--delete-after",
"--no-robots",
"--wait=0.5",
"--random-wait",
"--dns-timeout=5",
"--connect-timeout=5",
"--read-timeout=30",
"--session-timeout=30",
"--span-hosts",
"--link-extractors=html",
"--follow-tags=a",
"--user-agent=CFPB website indexer",
"--no-check-certificate",
f"--level={depth}",
f"--plugin-script={wpull_plugin.__file__}",
f"--plugin-args={db_filename},{max_pages}",
f"--database={wpull_progress_filename}",
]
)
builder = Builder(args)
app = builder.build()

# This is required due to the use of async code in wpull. Unfortunately
# wpull hooks aren't called in a way that allows us to wrap Django database
# calls with sync_to_async. This is only safe because we only download one
# URL at a time.
# https://docs.djangoproject.com/en/3.2/topics/async/#async-safety
os.environ["DJANGO_ALLOW_ASYNC_UNSAFE"] = "true"

exit_status = app.run_sync()
click.echo(f"done, exiting with status {exit_status}")
return exit_status
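
For orientation, djclick exposes this as a regular Django management command, so a crawl might be started along these lines; the start URL, database filename, and option values below are placeholders for illustration, not values taken from this change:

```
python manage.py crawl https://example.com/ crawl.sqlite3 --max-pages=100 --depth=2

# An interrupted crawl against the same database can be picked up later with:
python manage.py crawl https://example.com/ crawl.sqlite3 --resume
```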
121 changes: 120 additions & 1 deletion crawler/models.py
@@ -1,4 +1,10 @@
import lxml.etree
import lxml.html.soupparser
import re
from urllib import parse

from django.db import models
from django.utils import timezone

from modelcluster.models import ClusterableModel
from modelcluster.fields import ParentalManyToManyField
@@ -35,6 +41,105 @@ class Page(Request, ClusterableModel):
    components = ParentalManyToManyField(Component, related_name="pages")
    links = ParentalManyToManyField(Link, related_name="links")

    def __str__(self):
        return self.url

    HTML_COMPONENT_SEARCH = re.compile(r"(?:(?:class=\")|\s)((?:o|m|a)-[\w\-]*)")
    HTML_EXTERNAL_SITE = re.compile("/external-site/")
    HTML_WHITESPACE = re.compile(r"\s+")

    @classmethod
    def from_html(
        cls,
        url,
        html,
        internal_link_host,
    ):
        try:
            tree = lxml.html.fromstring(html)
        except lxml.etree.ParserError:
            # https://bugs.launchpad.net/lxml/+bug/1949271
            tree = lxml.html.soupparser.fromstring(html)

        title_tag = tree.find(".//title")
        title = title_tag.text.strip() if title_tag is not None else None
        language = tree.find(".").get("lang")

        if title is None:
            return

        body = cls._get_cleaned_body_from_tree(tree)

        if body is not None:
            text = cls.HTML_WHITESPACE.sub(" ", body.text_content()).strip()
        else:
            text = None

        page = Page(
            timestamp=timezone.now(),
            url=url,
            title=title,
            language=language,
            html=html,
            text=text,
        )

        if body is None:
            return page

        hrefs = list(
            set(
                href
                for element, attribute, href, pos in body.iterlinks()
                if "a" == element.tag and "href" == attribute
            )
        )

        # Remove any external link URL wrapping.
        for i, href in enumerate(hrefs):
            parsed_href = parse.urlparse(href)
            if not cls.HTML_EXTERNAL_SITE.match(parsed_href.path):
                continue

            if parsed_href.netloc and internal_link_host != parsed_href.netloc:
                continue

            ext_url = parse.parse_qs(parsed_href.query).get("ext_url")
            if ext_url:
                hrefs[i] = ext_url[0]

        page.links = [Link(href=href) for href in sorted(hrefs)]

        body_html = lxml.etree.tostring(body, encoding="unicode")

        class_names = set(cls.HTML_COMPONENT_SEARCH.findall(body_html))
        page.components = [
            Component(class_name=class_name) for class_name in sorted(class_names)
        ]

        return page

    @staticmethod
    def _get_cleaned_body_from_tree(tree):
        """Extract page body without header, footer, images, or scripts."""
        body = tree.find("./body")

        if body is not None:
            drop_element_selectors = [
                ".o-header",
                ".o-footer",
                ".skip-nav",
                "img",
                "script",
                "style",
            ]

            for drop_element_selector in drop_element_selectors:
                for element in body.cssselect(drop_element_selector):
                    element.drop_tree()

        return body


class ErrorBase(Request):
    status_code = models.PositiveIntegerField(db_index=True)
@@ -43,10 +148,24 @@ class ErrorBase(Request):
    class Meta(Request.Meta):
        abstract = True

    def __str__(self):
        s = self.url

        if self.referrer:
            s += f" (from {self.referrer})"

        s += f" {self.status_code}"

        return s


class Error(ErrorBase):
-    pass
+    def __str__(self):
+        return super().__str__() + " !"


class Redirect(ErrorBase):
    location = models.TextField(db_index=True)

    def __str__(self):
        return super().__str__() + f" -> {self.location}"
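
As a rough usage sketch of the new `Page.from_html` classmethod: the URL, hostname, and markup below are invented for illustration, and the tests added later in this change exercise the same behavior more thoroughly:

```python
from crawler.models import Page

# Minimal, illustrative input; a real crawl passes the fetched page's URL and body.
html = (
    '<html lang="en"><head><title>About us</title></head>'
    '<body><div class="m-hero"><a href="/careers/">Careers</a></div></body></html>'
)

page = Page.from_html("https://example.com/about/", html, "example.com")

print(page.title)     # "About us"
print(page.language)  # "en"
print(page.text)      # "Careers" (header, footer, script, and style content is dropped)
# page.links and page.components are populated from the parsed <body>.
```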
Empty file added crawler/tests/__init__.py
Empty file.
110 changes: 110 additions & 0 deletions crawler/tests/test_models.py
@@ -0,0 +1,110 @@
from operator import attrgetter
from unittest.mock import patch

import lxml.etree

from django.test import SimpleTestCase

from crawler.models import Error, Page, Redirect


class PageTests(SimpleTestCase):
    def test_from_html_no_title_returns_none(self):
        self.assertIsNone(
            Page.from_html(
                "https://example.com/",
                "<html><head></head><body>This page has no title.</body></html>",
                "example.com",
            )
        )

    def check_from_html(self):
        html = """
<html lang="en">
<head><title>Test page</title></head>
<body>
<script>Ignore me!</script>
<div class="m-links">Links</div>
<div><a href="/page/">A regular link on the same domain.</a></div>
<div class="a-external-link">
<a href="/external-site/?ext_url=https%3A%2F%2Fexample.org%2F">
An external link pointing to another domain
</a>
<a href="/external-site/">
An external link missing its target
</a>
<a href="https://example.org/external-site/">
A link on another domain that also uses /external-site/
</a>
</div>
</body>
</html>
        """.strip()

        page = Page.from_html("https://example.com/", html, "example.com")
        self.assertEqual(str(page), "https://example.com/")
        self.assertEqual(page.title, "Test page")
        self.assertEqual(page.language, "en")
        self.assertEqual(page.html, html)
        self.assertEqual(
            page.text,
            (
                "Links "
                "A regular link on the same domain. "
                "An external link pointing to another domain "
                "An external link missing its target "
                "A link on another domain that also uses /external-site/"
            ),
        )
        self.assertCountEqual(
            page.components.values_list("class_name", flat=True),
            ["a-external-link", "m-links"],
        )
        self.assertCountEqual(
            page.links.values_list("href", flat=True),
            [
                "/external-site/",
                "/page/",
                "https://example.org/",
                "https://example.org/external-site/",
            ],
        )

    def test_from_html(self):
        self.check_from_html()

    def test_from_html_etree_fallback_parser(self):
        with patch(
            "lxml.html.fromstring",
            side_effect=lxml.etree.ParserError("testing parser error"),
        ):
            self.check_from_html()

    def test_from_html_no_body(self):
        html = '<html lang="en"><head><title>Test page with no body</head></html>'
        page = Page.from_html("https://example.com/", html, "example.com")
        self.assertEqual(str(page), "https://example.com/")
        self.assertEqual(page.title, "Test page with no body")
        self.assertEqual(page.language, "en")
        self.assertEqual(page.html, html)
        self.assertIsNone(page.text)


class ErrorTests(SimpleTestCase):
    def test_error_str(self):
        self.assertEqual(
            str(Error(url="/not-found/", status_code=404)), "/not-found/ 404 !"
        )

    def test_error_str_with_referrer(self):
        self.assertEqual(
            str(
                Redirect(
                    url="/redirect/",
                    referrer="/source/",
                    status_code=301,
                    location="/destination/",
                )
            ),
            "/redirect/ (from /source/) 301 -> /destination/",
        )
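
These tests use Django's `SimpleTestCase`, so no database setup is needed. Assuming the project's standard Django test runner (the exact invocation is an assumption, not part of this change), something like the following would exercise them:

```
python manage.py test crawler
```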