Merge pull request #105 from cfpb/feature/multi-crawl
Store multiple crawls in a single database
Showing 29 changed files with 989 additions and 230 deletions.
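The fixture added below shows how the new schema supports storing multiple crawls in one database: every `crawler.page` and `crawler.error` record carries a `crawl` foreign key pointing at a `crawler.crawl` row. A minimal sketch of that relationship, with field names taken from the fixture data; the actual model definitions in `crawler/models.py` are not part of this excerpt and may differ:

```python
# Sketch only: relationships inferred from the fixture data added in this
# commit, not the repository's actual crawler/models.py (not shown here).
from django.db import models


class Crawl(models.Model):
    started = models.DateTimeField()
    status = models.CharField(max_length=32)
    config = models.JSONField()  # {"start_url": ..., "max_pages": ..., "depth": ...}
    failure_message = models.TextField(null=True, blank=True)


class Component(models.Model):
    class_name = models.TextField()


class Link(models.Model):
    href = models.TextField()


class Page(models.Model):
    # The crawl foreign key is what lets many crawls share one database:
    # every captured page (and error) belongs to exactly one Crawl row.
    crawl = models.ForeignKey(Crawl, on_delete=models.CASCADE)
    timestamp = models.DateTimeField()
    url = models.TextField()
    title = models.TextField()
    language = models.CharField(max_length=8)
    html = models.TextField()
    text = models.TextField()
    components = models.ManyToManyField(Component)
    links = models.ManyToManyField(Link)
```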
@@ -5,3 +5,5 @@ htmlcov
 sample
 venv
 viewer/static
+
+crawler/fixtures/sample.json
@@ -0,0 +1,157 @@
[
  {
    "model": "crawler.crawl",
    "pk": 1,
    "fields": {
      "started": "2024-09-11T16:41:20.036Z",
      "status": "Finished",
      "config": {
        "start_url": "http://localhost:8000",
        "max_pages": 0,
        "depth": 0
      },
      "failure_message": null
    }
  },
  {
    "model": "crawler.component",
    "pk": 1,
    "fields": {
      "class_name": "o-sample"
    }
  },
  {
    "model": "crawler.link",
    "pk": 1,
    "fields": {
      "href": "./file.xlsx"
    }
  },
  {
    "model": "crawler.link",
    "pk": 2,
    "fields": {
      "href": "/child/"
    }
  },
  {
    "model": "crawler.link",
    "pk": 3,
    "fields": {
      "href": "/child/?foo=bar"
    }
  },
  {
    "model": "crawler.link",
    "pk": 4,
    "fields": {
      "href": "/child/?page=2"
    }
  },
  {
    "model": "crawler.link",
    "pk": 5,
    "fields": {
      "href": "/child/?page=2&foo=bar"
    }
  },
  {
    "model": "crawler.link",
    "pk": 6,
    "fields": {
      "href": "https://example.com/"
    }
  },
  {
    "model": "crawler.link",
    "pk": 7,
    "fields": {
      "href": "https://example.com/file.xlsx"
    }
  },
  {
    "model": "crawler.link",
    "pk": 8,
    "fields": {
      "href": "https://example.org/"
    }
  },
  {
    "model": "crawler.link",
    "pk": 9,
    "fields": {
      "href": "/"
    }
  },
  {
    "model": "crawler.page",
    "pk": 1,
    "fields": {
      "crawl": 1,
      "timestamp": "2024-09-11T16:41:20.227Z",
      "url": "http://localhost:8000/",
      "title": "Sample homepage",
      "language": "en",
      "html": "<!DOCTYPE html>\n<html lang=\"en\">\n <head>\n <title>Sample homepage</title>\n <meta charset=\"utf-8\" />\n <meta http-equiv=\"Content-type\" content=\"text/html; charset=utf-8\" />\n <meta name=\"viewport\" content=\"width=device-width, initial-scale=1\" />\n </head>\n <body>\n <h1>Sample homepage</h1>\n <p>This is sample content.</p>\n <div class=\"o-sample\">This is a sample component.</div>\n <p><a href=\"/child/\">This is a link to a child page.</a></p>\n <p><a href=\"https://example.com/\">This is a link somewhere else.</a></p>\n <p><a href=\"/external-site/?ext_url=https%3A%2F%2Fexample.org%2F\" data-pretty-href=\"https://example.org/\">This is an obfuscated link somewhere else.</a></p>\n <p><a href=\"/external-site/?ext_url=https%3A%2F%2Fexample.org%2F\" data-pretty-href=\"https://example.org/\">This is another obfuscated link some\n where else.</a></p>\n <p><a href=\"./file.xlsx\">This links to a file.</a></p>\n <p><a href=\"https://example.com/file.xlsx\">This links to a file somewhere else.</a></p>\n <p><a href=\"/child/?page=2\">This link has a page query string parameter.</a></p> <p><a href=\"/child/?foo=bar\">This link has a non-page query string parameter.</a></p>\n <p><a href=\"/child/?page=2&foo=bar\">This link has multiple query string parameters.</a></p>\n </body>\n</html>\n",
      "text": "Sample homepage This is sample content. This is a sample component. This is a link to a child page. This is a link somewhere else. This is an obfuscated link somewhere else. This is another obfuscated link some where else. This links to a file. This links to a file somewhere else. This link has a page query string parameter. This link has a non-page query string parameter. This link has multiple query string parameters.",
      "components": [
        1
      ],
      "links": [
        1,
        2,
        3,
        4,
        5,
        6,
        7,
        8
      ]
    }
  },
  {
    "model": "crawler.page",
    "pk": 2,
    "fields": {
      "crawl": 1,
      "timestamp": "2024-09-11T16:41:20.679Z",
      "url": "http://localhost:8000/child/?page=2",
      "title": "Sample child page",
      "language": "en",
      "html": "<!DOCTYPE html>\n<html lang=\"en\">\n <head>\n <title>Sample child page</title>\n <meta charset=\"utf-8\" />\n <meta http-equiv=\"Content-type\" content=\"text/html; charset=utf-8\" />\n <meta name=\"viewport\" content=\"width=device-width, initial-scale=1\" />\n </head>\n <body>\n <h1>Sample child page</h1>\n <p>This is sample content.</p>\n <p><a href=\"/\">This is a link to the homepage.</a></p>\n </body>\n</html>\n",
      "text": "Sample child page This is sample content. This is a link to the homepage.",
      "components": [],
      "links": [
        9
      ]
    }
  },
  {
    "model": "crawler.page",
    "pk": 3,
    "fields": {
      "crawl": 1,
      "timestamp": "2024-09-11T16:41:23.003Z",
      "url": "http://localhost:8000/child/",
      "title": "Sample child page",
      "language": "en",
      "html": "<!DOCTYPE html>\n<html lang=\"en\">\n <head>\n <title>Sample child page</title>\n <meta charset=\"utf-8\" />\n <meta http-equiv=\"Content-type\" content=\"text/html; charset=utf-8\" />\n <meta name=\"viewport\" content=\"width=device-width, initial-scale=1\" />\n </head>\n <body>\n <h1>Sample child page</h1>\n <p>This is sample content.</p>\n <p><a href=\"/\">This is a link to the homepage.</a></p>\n </body>\n</html>\n",
      "text": "Sample child page This is sample content. This is a link to the homepage.",
      "components": [],
      "links": [
        9
      ]
    }
  },
  {
    "model": "crawler.error",
    "pk": 1,
    "fields": {
      "crawl": 1,
      "timestamp": "2024-09-11T16:41:22.353Z",
      "url": "https://example.com/file.xlsx",
      "status_code": 404,
      "referrer": "http://localhost:8000/"
    }
  }
]
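Because this is ordinary Django serialization output living in `crawler/fixtures/`, it can be loaded into a test database with the standard fixture machinery. A short usage sketch; the `Crawl` import path and the default `page_set`/`error_set` reverse accessors are assumptions, since the model code is not shown in this diff:

```python
# Sketch of exercising the fixture in a Django test; model import path and
# reverse accessor names (page_set, error_set) are assumed, not confirmed.
from django.test import TestCase

from crawler.models import Crawl


class SampleFixtureTests(TestCase):
    fixtures = ["sample.json"]  # resolved from crawler/fixtures/

    def test_single_crawl_owns_all_records(self):
        crawl = Crawl.objects.get(pk=1)
        self.assertEqual(crawl.status, "Finished")
        self.assertEqual(crawl.page_set.count(), 3)   # the three pages above
        self.assertEqual(crawl.error_set.count(), 1)  # the one 404 error above
```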
@@ -1,82 +1,15 @@
import os
import os.path

import djclick as click
from wpull.application.builder import Builder
from wpull.application.options import AppArgumentParser

from crawler import wpull_plugin
from crawler.models import CrawlConfig
from crawler.wpull.crawler import WpullCrawler


@click.command()
@click.argument("start_url")
@click.argument("db_filename", type=click.Path())
@click.option(
    "--max-pages", type=int, help="Maximum number of pages to crawl", default=0
)
@click.option("--depth", type=int, help="Maximum crawl depth", default=0)
@click.option(
    "--recreate",
    is_flag=True,
    show_default=True,
    default=False,
    help="Overwrite SQLite database if it already exists",
)
@click.option("--resume", is_flag=True)
def command(start_url, db_filename, max_pages, depth, recreate, resume):
    """Crawl a website to a SQLite database."""
    if os.path.exists(db_filename):
        if not recreate and not resume:
            raise click.ClickException(
                f"File {db_filename} already exists, "
                "use --recreate to recreate "
                "or --resume to resume a previous crawl."
            )

        if recreate:
            os.remove(db_filename)

    wpull_progress_filename = f"{db_filename}.wpull.db"
    click.echo(
        f"Storing crawl progress in {wpull_progress_filename}, use --resume to resume."
    )

    if not resume and os.path.exists(wpull_progress_filename):
        os.remove(wpull_progress_filename)

    arg_parser = AppArgumentParser()
    args = arg_parser.parse_args(
        [
            start_url,
            "--quiet",
            "--recursive",
            "--delete-after",
            "--no-robots",
            "--wait=0.5",
            "--random-wait",
            "--dns-timeout=5",
            "--connect-timeout=5",
            "--read-timeout=30",
            "--session-timeout=30",
            "--span-hosts",
            "--link-extractors=html",
            "--follow-tags=a",
            "--user-agent=CFPB website indexer",
            "--no-check-certificate",
            f"--level={depth}",
            f"--plugin-script={wpull_plugin.__file__}",
            f"--plugin-args={db_filename},{max_pages}",
            f"--database={wpull_progress_filename}",
        ]
    )
    builder = Builder(args)
    app = builder.build()

    # This is required due to the use of async code in wpull. Unfortunately
    # wpull hooks aren't called in a way that allows us to wrap Django database
    # calls with sync_to_async. This is only safe because we only download one
    # URL at a time.
    # https://docs.djangoproject.com/en/3.2/topics/async/#async-safety
    os.environ["DJANGO_ALLOW_ASYNC_UNSAFE"] = "true"

    return app.run_sync()
def command(start_url, max_pages, depth):
    config = CrawlConfig(start_url=start_url, max_pages=max_pages, depth=depth)
    return WpullCrawler().crawl(config)
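With this change the management command becomes a thin wrapper: the database-file handling, wpull argument plumbing, and async-safety workaround presumably move behind `WpullCrawler`, and the command only assembles a `CrawlConfig`. The same entry point can be driven programmatically, for example from `manage.py shell`; the keyword arguments below simply mirror the call above and the `config` values stored on the sample crawl in the fixture:

```python
# Usage sketch: drives the new crawler entry point directly, e.g. from
# `manage.py shell`. Keyword arguments mirror the call in the command above
# and the "config" values stored on the sample crawl in the fixture.
from crawler.models import CrawlConfig
from crawler.wpull.crawler import WpullCrawler

config = CrawlConfig(start_url="http://localhost:8000", max_pages=0, depth=0)
WpullCrawler().crawl(config)
```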