Merge pull request #105 from cfpb/feature/multi-crawl
Store multiple crawls in a single database
Showing 29 changed files with 989 additions and 230 deletions.
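The fixture added below shows how the new schema supports storing multiple crawls in one database: every `crawler.page` and `crawler.error` record carries a `crawl` foreign key pointing at a `crawler.crawl` row. A minimal sketch of that relationship, with field names taken from the fixture data; the actual model definitions in `crawler/models.py` are not part of this excerpt and may differ:

```python
# Sketch only: relationships inferred from the fixture data added in this
# commit, not the repository's actual crawler/models.py (not shown here).
from django.db import models


class Crawl(models.Model):
    started = models.DateTimeField()
    status = models.CharField(max_length=32)
    config = models.JSONField()  # {"start_url": ..., "max_pages": ..., "depth": ...}
    failure_message = models.TextField(null=True, blank=True)


class Component(models.Model):
    class_name = models.TextField()


class Link(models.Model):
    href = models.TextField()


class Page(models.Model):
    # The crawl foreign key is what lets many crawls share one database:
    # every captured page (and error) belongs to exactly one Crawl row.
    crawl = models.ForeignKey(Crawl, on_delete=models.CASCADE)
    timestamp = models.DateTimeField()
    url = models.TextField()
    title = models.TextField()
    language = models.CharField(max_length=8)
    html = models.TextField()
    text = models.TextField()
    components = models.ManyToManyField(Component)
    links = models.ManyToManyField(Link)
```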
@@ -5,3 +5,5 @@ htmlcov
 sample
 venv
 viewer/static
+
+crawler/fixtures/sample.json
@@ -0,0 +1,157 @@
[
  {
    "model": "crawler.crawl",
    "pk": 1,
    "fields": {
      "started": "2024-09-11T16:41:20.036Z",
      "status": "Finished",
      "config": {
        "start_url": "http://localhost:8000",
        "max_pages": 0,
        "depth": 0
      },
      "failure_message": null
    }
  },
  {
    "model": "crawler.component",
    "pk": 1,
    "fields": {
      "class_name": "o-sample"
    }
  },
  {
    "model": "crawler.link",
    "pk": 1,
    "fields": {
      "href": "./file.xlsx"
    }
  },
  {
    "model": "crawler.link",
    "pk": 2,
    "fields": {
      "href": "/child/"
    }
  },
  {
    "model": "crawler.link",
    "pk": 3,
    "fields": {
      "href": "/child/?foo=bar"
    }
  },
  {
    "model": "crawler.link",
    "pk": 4,
    "fields": {
      "href": "/child/?page=2"
    }
  },
  {
    "model": "crawler.link",
    "pk": 5,
    "fields": {
      "href": "/child/?page=2&foo=bar"
    }
  },
  {
    "model": "crawler.link",
    "pk": 6,
    "fields": {
      "href": "https://example.com/"
    }
  },
  {
    "model": "crawler.link",
    "pk": 7,
    "fields": {
      "href": "https://example.com/file.xlsx"
    }
  },
  {
    "model": "crawler.link",
    "pk": 8,
    "fields": {
      "href": "https://example.org/"
    }
  },
  {
    "model": "crawler.link",
    "pk": 9,
    "fields": {
      "href": "/"
    }
  },
  {
    "model": "crawler.page",
    "pk": 1,
    "fields": {
      "crawl": 1,
      "timestamp": "2024-09-11T16:41:20.227Z",
      "url": "http://localhost:8000/",
      "title": "Sample homepage",
      "language": "en",
      "html": "<!DOCTYPE html>\n<html lang=\"en\">\n <head>\n <title>Sample homepage</title>\n <meta charset=\"utf-8\" />\n <meta http-equiv=\"Content-type\" content=\"text/html; charset=utf-8\" />\n <meta name=\"viewport\" content=\"width=device-width, initial-scale=1\" />\n </head>\n <body>\n <h1>Sample homepage</h1>\n <p>This is sample content.</p>\n <div class=\"o-sample\">This is a sample component.</div>\n <p><a href=\"/child/\">This is a link to a child page.</a></p>\n <p><a href=\"https://example.com/\">This is a link somewhere else.</a></p>\n <p><a href=\"/external-site/?ext_url=https%3A%2F%2Fexample.org%2F\" data-pretty-href=\"https://example.org/\">This is an obfuscated link somewhere else.</a></p>\n <p><a href=\"/external-site/?ext_url=https%3A%2F%2Fexample.org%2F\" data-pretty-href=\"https://example.org/\">This is another obfuscated link some\n where else.</a></p>\n <p><a href=\"./file.xlsx\">This links to a file.</a></p>\n <p><a href=\"https://example.com/file.xlsx\">This links to a file somewhere else.</a></p>\n <p><a href=\"/child/?page=2\">This link has a page query string parameter.</a></p> <p><a href=\"/child/?foo=bar\">This link has a non-page query string parameter.</a></p>\n <p><a href=\"/child/?page=2&foo=bar\">This link has multiple query string parameters.</a></p>\n </body>\n</html>\n",
      "text": "Sample homepage This is sample content. This is a sample component. This is a link to a child page. This is a link somewhere else. This is an obfuscated link somewhere else. This is another obfuscated link some where else. This links to a file. This links to a file somewhere else. This link has a page query string parameter. This link has a non-page query string parameter. This link has multiple query string parameters.",
      "components": [
        1
      ],
      "links": [
        1,
        2,
        3,
        4,
        5,
        6,
        7,
        8
      ]
    }
  },
  {
    "model": "crawler.page",
    "pk": 2,
    "fields": {
      "crawl": 1,
      "timestamp": "2024-09-11T16:41:20.679Z",
      "url": "http://localhost:8000/child/?page=2",
      "title": "Sample child page",
      "language": "en",
      "html": "<!DOCTYPE html>\n<html lang=\"en\">\n <head>\n <title>Sample child page</title>\n <meta charset=\"utf-8\" />\n <meta http-equiv=\"Content-type\" content=\"text/html; charset=utf-8\" />\n <meta name=\"viewport\" content=\"width=device-width, initial-scale=1\" />\n </head>\n <body>\n <h1>Sample child page</h1>\n <p>This is sample content.</p>\n <p><a href=\"/\">This is a link to the homepage.</a></p>\n </body>\n</html>\n",
      "text": "Sample child page This is sample content. This is a link to the homepage.",
      "components": [],
      "links": [
        9
      ]
    }
  },
  {
    "model": "crawler.page",
    "pk": 3,
    "fields": {
      "crawl": 1,
      "timestamp": "2024-09-11T16:41:23.003Z",
      "url": "http://localhost:8000/child/",
      "title": "Sample child page",
      "language": "en",
      "html": "<!DOCTYPE html>\n<html lang=\"en\">\n <head>\n <title>Sample child page</title>\n <meta charset=\"utf-8\" />\n <meta http-equiv=\"Content-type\" content=\"text/html; charset=utf-8\" />\n <meta name=\"viewport\" content=\"width=device-width, initial-scale=1\" />\n </head>\n <body>\n <h1>Sample child page</h1>\n <p>This is sample content.</p>\n <p><a href=\"/\">This is a link to the homepage.</a></p>\n </body>\n</html>\n",
      "text": "Sample child page This is sample content. This is a link to the homepage.",
      "components": [],
      "links": [
        9
      ]
    }
  },
  {
    "model": "crawler.error",
    "pk": 1,
    "fields": {
      "crawl": 1,
      "timestamp": "2024-09-11T16:41:22.353Z",
      "url": "https://example.com/file.xlsx",
      "status_code": 404,
      "referrer": "http://localhost:8000/"
    }
  }
]
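Because this is ordinary Django serialization output living in `crawler/fixtures/`, it can be loaded into a test database with the standard fixture machinery. A short usage sketch; the `Crawl` import path and the default `page_set`/`error_set` reverse accessors are assumptions, since the model code is not shown in this diff:

```python
# Sketch of exercising the fixture in a Django test; model import path and
# reverse accessor names (page_set, error_set) are assumed, not confirmed.
from django.test import TestCase

from crawler.models import Crawl


class SampleFixtureTests(TestCase):
    fixtures = ["sample.json"]  # resolved from crawler/fixtures/

    def test_single_crawl_owns_all_records(self):
        crawl = Crawl.objects.get(pk=1)
        self.assertEqual(crawl.status, "Finished")
        self.assertEqual(crawl.page_set.count(), 3)   # the three pages above
        self.assertEqual(crawl.error_set.count(), 1)  # the one 404 error above
```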
@@ -1,82 +1,15 @@
import os
import os.path

import djclick as click
from wpull.application.builder import Builder
from wpull.application.options import AppArgumentParser

from crawler import wpull_plugin
from crawler.models import CrawlConfig
from crawler.wpull.crawler import WpullCrawler


@click.command()
@click.argument("start_url")
@click.argument("db_filename", type=click.Path())
@click.option(
    "--max-pages", type=int, help="Maximum number of pages to crawl", default=0
)
@click.option("--depth", type=int, help="Maximum crawl depth", default=0)
@click.option(
    "--recreate",
    is_flag=True,
    show_default=True,
    default=False,
    help="Overwrite SQLite database if it already exists",
)
@click.option("--resume", is_flag=True)
def command(start_url, db_filename, max_pages, depth, recreate, resume):
    """Crawl a website to a SQLite database."""
    if os.path.exists(db_filename):
        if not recreate and not resume:
            raise click.ClickException(
                f"File {db_filename} already exists, "
                "use --recreate to recreate "
                "or --resume to resume a previous crawl."
            )

        if recreate:
            os.remove(db_filename)

    wpull_progress_filename = f"{db_filename}.wpull.db"
    click.echo(
        f"Storing crawl progress in {wpull_progress_filename}, use --resume to resume."
    )

    if not resume and os.path.exists(wpull_progress_filename):
        os.remove(wpull_progress_filename)

    arg_parser = AppArgumentParser()
    args = arg_parser.parse_args(
        [
            start_url,
            "--quiet",
            "--recursive",
            "--delete-after",
            "--no-robots",
            "--wait=0.5",
            "--random-wait",
            "--dns-timeout=5",
            "--connect-timeout=5",
            "--read-timeout=30",
            "--session-timeout=30",
            "--span-hosts",
            "--link-extractors=html",
            "--follow-tags=a",
            "--user-agent=CFPB website indexer",
            "--no-check-certificate",
            f"--level={depth}",
            f"--plugin-script={wpull_plugin.__file__}",
            f"--plugin-args={db_filename},{max_pages}",
            f"--database={wpull_progress_filename}",
        ]
    )
    builder = Builder(args)
    app = builder.build()

    # This is required due to the use of async code in wpull. Unfortunately
    # wpull hooks aren't called in a way that allows us to wrap Django database
    # calls with sync_to_async. This is only safe because we only download one
    # URL at a time.
    # https://docs.djangoproject.com/en/3.2/topics/async/#async-safety
    os.environ["DJANGO_ALLOW_ASYNC_UNSAFE"] = "true"

    return app.run_sync()
def command(start_url, max_pages, depth):
    config = CrawlConfig(start_url=start_url, max_pages=max_pages, depth=depth)
    return WpullCrawler().crawl(config)
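With this change the management command becomes a thin wrapper: the database-file handling, wpull argument plumbing, and async-safety workaround presumably move behind `WpullCrawler`, and the command only assembles a `CrawlConfig`. The same entry point can be driven programmatically, for example from `manage.py shell`; the keyword arguments below simply mirror the call above and the `config` values stored on the sample crawl in the fixture:

```python
# Usage sketch: drives the new crawler entry point directly, e.g. from
# `manage.py shell`. Keyword arguments mirror the call in the command above
# and the "config" values stored on the sample crawl in the fixture.
from crawler.models import CrawlConfig
from crawler.wpull.crawler import WpullCrawler

config = CrawlConfig(start_url="http://localhost:8000", max_pages=0, depth=0)
WpullCrawler().crawl(config)
```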