From 94e87da4ec77965b0859e42ab147ce22e7bd0b12 Mon Sep 17 00:00:00 2001 From: James McKinney <26463+jpmckinney@users.noreply.github.com> Date: Sat, 20 Jul 2024 21:17:07 -0400 Subject: [PATCH] feat: get_spider_list() (used by Schedule and ListSpiders) supports the [settings] section. Add Root._config (hack) #526. chore: Root has the correct default for runner, if it were somehow not configured. --- docs/api.rst | 2 + docs/news.rst | 1 + integration_tests/test_webservice.py | 3 +- pyproject.toml | 4 + scrapyd/webservice.py | 27 +++++-- scrapyd/website.py | 25 +++--- tests/__init__.py | 5 ++ tests/conftest.py | 14 +++- .../localproject/__init__.py | 0 .../localproject/settings.py | 0 .../localproject/spiders/__init__.py | 0 .../localproject/spiders/example.py | 0 .../{localproject => filesystem}/scrapy.cfg | 0 tests/test_webservice.py | 81 +++++++++++++------ tests/test_website.py | 5 +- 15 files changed, 119 insertions(+), 48 deletions(-) rename tests/fixtures/{localproject => filesystem}/localproject/__init__.py (100%) rename tests/fixtures/{localproject => filesystem}/localproject/settings.py (100%) rename tests/fixtures/{localproject => filesystem}/localproject/spiders/__init__.py (100%) rename tests/fixtures/{localproject => filesystem}/localproject/spiders/example.py (100%) rename tests/fixtures/{localproject => filesystem}/scrapy.cfg (100%) diff --git a/docs/api.rst b/docs/api.rst index 19ad1887..a7ff89f0 100644 --- a/docs/api.rst +++ b/docs/api.rst @@ -201,6 +201,8 @@ listspiders.json Get the spiders in a version of a project. +.. note:: If :ref:`the project is in a Python module rather than a Python egg`, don't set the ``version`` parameter. 
+ Supported request methods ``GET`` Parameters diff --git a/docs/news.rst b/docs/news.rst index f7dc285c..403d8228 100644 --- a/docs/news.rst +++ b/docs/news.rst @@ -36,6 +36,7 @@ Web UI API ^^^ +- The :ref:`schedule.json` and :ref:`listspiders.json` webservices support Scrapy projects stored as Python modules, using the previously undocumented :ref:`[settings]` section. - Clarify error messages, for example: - ``'project' parameter is required``, instead of ``'project'`` (KeyError) diff --git a/integration_tests/test_webservice.py b/integration_tests/test_webservice.py index 888bcf4d..31b24a44 100644 --- a/integration_tests/test_webservice.py +++ b/integration_tests/test_webservice.py @@ -46,7 +46,7 @@ def test_options(webservice, method): assert response.headers["Allow"] == f"OPTIONS, HEAD, {method}" -# cancel.json, status.json and listjobs.json will error with "project '%b' not found" on directory traversal attempts. +# Cancel, Status, ListJobs and ListSpiders will error with "project '%b' not found" on directory traversal attempts. # The egg storage (in get_project_list, called by get_spider_queues, called by QueuePoller, used by these webservices) # would need to find a project like "../project" (which is impossible with the default eggstorage) to not error. 
@pytest.mark.parametrize( @@ -76,7 +76,6 @@ def test_project_directory_traversal(webservice, method, params): ("webservice", "method", "params"), [ ("schedule", "post", {"spider": "s"}), - ("listspiders", "get", {}), ], ) def test_project_directory_traversal_runner(webservice, method, params): diff --git a/pyproject.toml b/pyproject.toml index 583ce628..0ca7d33d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -30,6 +30,10 @@ ignore = [ "DTZ005", # `datetime.datetime.now()` called without a `tz` argument "DTZ006", # `datetime.datetime.fromtimestamp()` called without a `tz` argument "DTZ007", # Naive datetime constructed using `datetime.datetime.strptime()` without %z + + # https://github.com/scrapy/scrapyd/issues/526 + "FIX002", + "SLF001", ] [tool.ruff.lint.flake8-builtins] diff --git a/scrapyd/webservice.py b/scrapyd/webservice.py index 1eba8db7..13e71835 100644 --- a/scrapyd/webservice.py +++ b/scrapyd/webservice.py @@ -57,7 +57,7 @@ def wrapper(self, txrequest, *args, **kwargs): return decorator -def get_spider_list(project, runner=None, pythonpath=None, version=None): +def get_spider_list(project, runner=None, pythonpath=None, version=None, config=None): """Return the spider list from the given project, using the given runner""" # UtilsCache uses JsonSqliteDict, which encodes the project's value as JSON, but JSON allows only string keys, @@ -72,16 +72,24 @@ def get_spider_list(project, runner=None, pythonpath=None, version=None): except KeyError: pass + settings = {} if config is None else dict(config.items("settings", default=[])) + + # runner should always be set. 
if runner is None: - runner = Config().get("runner") + runner = Config().get("runner", "scrapyd.runner") env = os.environ.copy() env["PYTHONIOENCODING"] = "UTF-8" env["SCRAPY_PROJECT"] = project + # TODO(jpmckinney): Remove + # https://github.com/scrapy/scrapyd/commit/17520a32d19726dc4b09611ff732a9ff3fa8b6ea if pythonpath: env["PYTHONPATH"] = pythonpath if version: env["SCRAPYD_EGG_VERSION"] = version + if project in dict(settings): + env["SCRAPY_SETTINGS_MODULE"] = settings[project] + pargs = [sys.executable, "-m", runner, "list", "-s", "LOG_STDOUT=0"] proc = Popen(pargs, stdout=PIPE, stderr=PIPE, env=env) out, err = proc.communicate() @@ -196,7 +204,7 @@ def render_POST(self, txrequest, project, spider, version, jobid, priority, sett raise error.Error(code=http.OK, message=b"version '%b' not found" % version.encode()) raise error.Error(code=http.OK, message=b"project '%b' not found" % project.encode()) - spiders = get_spider_list(project, version=version, runner=self.root.runner) + spiders = get_spider_list(project, version=version, runner=self.root.runner, config=self.root._config) if spider not in spiders: raise error.Error(code=http.OK, message=b"spider '%b' not found" % spider.encode()) @@ -251,9 +259,11 @@ def render_POST(self, txrequest, project, version, egg): ) self.root.eggstorage.put(BytesIO(egg), project, version) - spiders = get_spider_list(project, version=version, runner=self.root.runner) self.root.update_projects() + + spiders = get_spider_list(project, version=version, runner=self.root.runner, config=self.root._config) UtilsCache.invalid_cache(project) + return { "node_name": self.root.nodename, "status": "ok", @@ -280,12 +290,13 @@ class ListSpiders(WsResource): @param("project") @param("_version", dest="version", required=False, default=None) def render_GET(self, txrequest, project, version): - if self.root.eggstorage.get(project, version) == (None, None): - if version: - raise error.Error(code=http.OK, message=b"version '%b' not found" % 
version.encode()) + if project not in self.root.scheduler.queues: raise error.Error(code=http.OK, message=b"project '%b' not found" % project.encode()) - spiders = get_spider_list(project, version=version, runner=self.root.runner) + if version and self.root.eggstorage.get(project, version) == (None, None): + raise error.Error(code=http.OK, message=b"version '%b' not found" % version.encode()) + + spiders = get_spider_list(project, version=version, runner=self.root.runner, config=self.root._config) return {"node_name": self.root.nodename, "status": "ok", "spiders": spiders} diff --git a/scrapyd/website.py b/scrapyd/website.py index 45fc11e2..a8aac5e9 100644 --- a/scrapyd/website.py +++ b/scrapyd/website.py @@ -128,22 +128,27 @@ def _getFilesAndDirectories(self, directory): class Root(resource.Resource): def __init__(self, config, app): resource.Resource.__init__(self) + + logs_dir = config.get("logs_dir") + items_dir = config.get("items_dir") + + self.app = app + # TODO(jpmckinney): Make Config a Component + # https://github.com/scrapy/scrapyd/issues/526 + self._config = config self.debug = config.getboolean("debug", False) - self.runner = config.get("runner") + self.runner = config.get("runner", "scrapyd.runner") self.prefix_header = config.get("prefix_header") - logsdir = config.get("logs_dir") - itemsdir = config.get("items_dir") - self.local_items = itemsdir and (urlparse(itemsdir).scheme.lower() in ["", "file"]) - self.app = app + self.local_items = items_dir and (urlparse(items_dir).scheme.lower() in ["", "file"]) self.nodename = config.get("node_name", socket.gethostname()) + self.putChild(b"", Home(self, self.local_items)) - if logsdir: - self.putChild(b"logs", File(logsdir.encode("ascii", "ignore"), "text/plain")) + if logs_dir: + self.putChild(b"logs", File(logs_dir.encode("ascii", "ignore"), "text/plain")) if self.local_items: - self.putChild(b"items", File(itemsdir, "text/plain")) + self.putChild(b"items", File(items_dir, "text/plain")) 
self.putChild(b"jobs", Jobs(self, self.local_items)) - services = config.items("services", ()) - for service_name, service_path in services: + for service_name, service_path in config.items("services", default=[]): service_cls = load_object(service_path) self.putChild(service_name.encode(), service_cls(self)) diff --git a/tests/__init__.py b/tests/__init__.py index 30673c5f..8722edb5 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -6,5 +6,10 @@ def get_egg_data(basename): return pkgutil.get_data("tests", f"fixtures/{basename}.egg") +def has_settings(root): + # https://github.com/scrapy/scrapyd/issues/526 + return root._config.cp.has_section("settings") + + def root_add_version(root, project, version, basename): root.eggstorage.put(io.BytesIO(get_egg_data(basename)), project, version) diff --git a/tests/conftest.py b/tests/conftest.py index 7dd5a3d6..5c3a3b4c 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,3 +1,6 @@ +import os.path +import shutil + import pytest from twisted.web import http from twisted.web.http import Request @@ -8,6 +11,8 @@ from scrapyd.website import Root from tests import root_add_version +BASEDIR = os.path.abspath(os.path.dirname(__file__)) + @pytest.fixture() def txrequest(): @@ -19,22 +24,25 @@ def txrequest(): # Use this fixture when testing the Scrapyd web UI or API or writing configuration files. @pytest.fixture() def chdir(monkeypatch, tmpdir): - return monkeypatch.chdir(tmpdir) + monkeypatch.chdir(tmpdir) + return tmpdir @pytest.fixture( params=[ None, (Config.SECTION, "items_dir", "items"), - ("settings", "localproject", "tests.fixtures.localbot.settings"), + ("settings", "localproject", "localproject.settings"), ], ids=["default", "items_dir", "settings"], ) def root(request, chdir): config = Config() if request.param: - if request.param[0] != Config.SECTION: + if request.param[0] == "settings": config.cp.add_section(request.param[0]) + # Copy the local files to be in the Python path. 
+ shutil.copytree(os.path.join(BASEDIR, "fixtures", "filesystem"), os.path.join(chdir), dirs_exist_ok=True) config.cp.set(*request.param) return Root(config, application(config)) diff --git a/tests/fixtures/localproject/localproject/__init__.py b/tests/fixtures/filesystem/localproject/__init__.py similarity index 100% rename from tests/fixtures/localproject/localproject/__init__.py rename to tests/fixtures/filesystem/localproject/__init__.py diff --git a/tests/fixtures/localproject/localproject/settings.py b/tests/fixtures/filesystem/localproject/settings.py similarity index 100% rename from tests/fixtures/localproject/localproject/settings.py rename to tests/fixtures/filesystem/localproject/settings.py diff --git a/tests/fixtures/localproject/localproject/spiders/__init__.py b/tests/fixtures/filesystem/localproject/spiders/__init__.py similarity index 100% rename from tests/fixtures/localproject/localproject/spiders/__init__.py rename to tests/fixtures/filesystem/localproject/spiders/__init__.py diff --git a/tests/fixtures/localproject/localproject/spiders/example.py b/tests/fixtures/filesystem/localproject/spiders/example.py similarity index 100% rename from tests/fixtures/localproject/localproject/spiders/example.py rename to tests/fixtures/filesystem/localproject/spiders/example.py diff --git a/tests/fixtures/localproject/scrapy.cfg b/tests/fixtures/filesystem/scrapy.cfg similarity index 100% rename from tests/fixtures/localproject/scrapy.cfg rename to tests/fixtures/filesystem/scrapy.cfg diff --git a/tests/test_webservice.py b/tests/test_webservice.py index 56fc8ee5..2b880f94 100644 --- a/tests/test_webservice.py +++ b/tests/test_webservice.py @@ -11,7 +11,7 @@ from scrapyd.jobstorage import Job from scrapyd.txapp import application from scrapyd.webservice import UtilsCache, get_spider_list -from tests import get_egg_data, root_add_version +from tests import get_egg_data, has_settings, root_add_version job1 = Job("p1", "s1", end_time=datetime.datetime(2001, 2, 
3, 4, 5, 6, 7)) @@ -21,9 +21,8 @@ def app(chdir): return application -def has_settings(root): - # The configuration is not guaranteed to be accessible here, but it is for now. - return root.scheduler.config.cp.has_section("settings") +def get_local_projects(root): + return ["localproject"] if has_settings(root) else [] def add_test_version(app, project, version, basename): @@ -154,32 +153,44 @@ def test_daemonstatus(txrequest, root_with_egg): @pytest.mark.parametrize( - ("extra_args", "spiders"), + ("args", "spiders", "run_only_if_has_settings"), [ - ({}, ["spider1", "spider2", "spider3"]), - ({b"_version": [b"r1"]}, ["spider1", "spider2"]), + ({b"project": [b"myproject"]}, ["spider1", "spider2", "spider3"], False), + ({b"project": [b"myproject"], b"_version": [b"r1"]}, ["spider1", "spider2"], False), + ({b"project": [b"localproject"]}, ["example"], True), ], ) -def test_list_spiders(txrequest, root, extra_args, spiders): +def test_list_spiders(txrequest, root, args, spiders, run_only_if_has_settings): + if run_only_if_has_settings and not has_settings(root): + pytest.skip("[settings] section is not set") + UtilsCache.invalid_cache("myproject") # test_get_spider_list fills cache root_add_version(root, "myproject", "r1", "mybot") root_add_version(root, "myproject", "r2", "mybot2") + root.update_projects() expected = {"status": "ok", "spiders": spiders} - assert_content(txrequest, root, "listspiders", {b"project": [b"myproject"], **extra_args}, expected) + assert_content(txrequest, root, "listspiders", args, expected) @pytest.mark.parametrize( - ("args", "param"), + ("args", "param", "run_only_if_has_settings"), [ - ({b"project": [b"nonexistent"]}, "project"), - ({b"project": [b"myproject"], b"_version": [b"nonexistent"]}, "version"), + ({b"project": [b"nonexistent"]}, "project", False), + ({b"project": [b"myproject"], b"_version": [b"nonexistent"]}, "version", False), + ({b"project": [b"localproject"], b"_version": [b"nonexistent"]}, "version", True), ], ) -def 
test_list_spiders_nonexistent(txrequest, root, args, param): +def test_list_spiders_nonexistent(txrequest, root, args, param, run_only_if_has_settings): + if run_only_if_has_settings and not has_settings(root): + pytest.skip("[settings] section is not set") + + UtilsCache.invalid_cache("myproject") # test_get_spider_list fills cache + root_add_version(root, "myproject", "r1", "mybot") root_add_version(root, "myproject", "r2", "mybot2") + root.update_projects() txrequest.args = args.copy() with pytest.raises(error.Error) as exc: @@ -200,14 +211,12 @@ def test_list_versions_nonexistent(txrequest, root): def test_list_projects(txrequest, root_with_egg): - expected = {"status": "ok", "projects": ["quotesbot"]} - if has_settings(root_with_egg): - expected["projects"].append("localproject") + expected = {"status": "ok", "projects": ["quotesbot", *get_local_projects(root_with_egg)]} assert_content(txrequest, root_with_egg, "listprojects", {}, expected) def test_list_projects_empty(txrequest, root): - expected = {"status": "ok", "projects": []} + expected = {"status": "ok", "projects": get_local_projects(root)} assert_content(txrequest, root, "listprojects", {}, expected) @@ -237,6 +246,8 @@ def test_list_jobs_finished(txrequest, root_with_egg): def test_delete_version(txrequest, root): + projects = get_local_projects(root) + root_add_version(root, "myproject", "r1", "mybot") root_add_version(root, "myproject", "r2", "mybot2") root.update_projects() @@ -258,7 +269,7 @@ def test_delete_version(txrequest, root): txrequest.args = {} content = root.children[b"listprojects.json"].render_GET(txrequest) - assert content["projects"] == ["myproject"] + assert content["projects"] == ["myproject", *projects] # Delete another version. 
txrequest.args = {b"project": [b"myproject"], b"version": [b"r1"]} @@ -269,7 +280,7 @@ txrequest.args = {} content = root.children[b"listprojects.json"].render_GET(txrequest) - assert content["projects"] == [] # "myproject" if root.update_projects() weren't celled + assert content["projects"] == [*projects] # "myproject" if root.update_projects() weren't called @@ -289,13 +300,15 @@ def test_delete_version_nonexistent(txrequest, root_with_egg, args, message): def test_delete_project(txrequest, root_with_egg): + projects = get_local_projects(root_with_egg) + txrequest.args = {b"project": [b"quotesbot"]} content = root_with_egg.children[b"listspiders.json"].render_GET(txrequest) assert content["spiders"] == ["toscrape-css", "toscrape-xpath"] txrequest.args = {} content = root_with_egg.children[b"listprojects.json"].render_GET(txrequest) - assert content["projects"] == ["quotesbot"] + assert content["projects"] == ["quotesbot", *projects] # Delete the project. 
txrequest.args = {b"project": [b"quotesbot"]} @@ -311,7 +324,7 @@ def test_delete_project(txrequest, root_with_egg): txrequest.args = {} content = root_with_egg.children[b"listprojects.json"].render_GET(txrequest) - assert content["projects"] == [] # "quotesbot" if root.update_projects() weren't celled + assert content["projects"] == [*projects] # "quotesbot" if root.update_projects() weren't called def test_delete_project_nonexistent(txrequest, root): @@ -323,7 +336,7 @@ assert exc.value.message == b"project 'nonexistent' not found" -def test_addversion(txrequest, root): +def test_add_version(txrequest, root): txrequest.args = {b"project": [b"quotesbot"], b"version": [b"0.1"], b"egg": [get_egg_data("quotesbot")]} eggstorage = root.app.getComponent(IEggStorage) @@ -342,7 +355,7 @@ assert no_version == "0_1" -def test_addversion_same(txrequest, root): +def test_add_version_same(txrequest, root): txrequest.args = {b"project": [b"quotesbot"], b"version": [b"0.1"], b"egg": [get_egg_data("quotesbot")]} eggstorage = root.app.getComponent(IEggStorage) @@ -402,6 +415,27 @@ def test_schedule_nonexistent_spider(txrequest, root_with_egg): assert exc.value.message == b"spider 'nonexistent' not found" +# Cancel, Status, ListJobs and ListSpiders error with "project '%b' not found" on directory traversal attempts. +# The egg storage (in get_project_list, called by get_spider_queues, called by QueuePoller, used by these webservices) +# would need to find a project like "../project" (which is impossible with the default eggstorage) to not error. 
+@pytest.mark.parametrize( + ("method", "basename", "args"), + [ + ("POST", "cancel", {b"project": [b"../p"], b"job": [b"aaa"]}), + ("GET", "status", {b"project": [b"../p"], b"job": [b"aaa"]}), + ("GET", "listspiders", {b"project": [b"../p"]}), + ("GET", "listjobs", {b"project": [b"../p"]}), + ], +) +def test_project_directory_traversal_notfound(txrequest, root, method, basename, args): + txrequest.args = args.copy() + with pytest.raises(error.Error) as exc: + getattr(root.children[b"%b.json" % basename.encode()], f"render_{method}")(txrequest) + + assert exc.value.status == b"200" + assert exc.value.message == b"project '../p' not found" + + @pytest.mark.parametrize( ("endpoint", "attach_egg", "method"), [ @@ -430,7 +464,6 @@ def test_project_directory_traversal(txrequest, root, endpoint, attach_egg, meth ("endpoint", "method"), [ (b"schedule.json", "render_POST"), - (b"listspiders.json", "render_GET"), ], ) def test_project_directory_traversal_runner(txrequest, root, endpoint, method): diff --git a/tests/test_website.py b/tests/test_website.py index a6cb7cac..8baa7e21 100644 --- a/tests/test_website.py +++ b/tests/test_website.py @@ -1,3 +1,6 @@ +from tests import has_settings + + def test_render_jobs(txrequest, root_with_egg): content = root_with_egg.children[b"jobs"].render(txrequest) expect_headers = { @@ -23,7 +26,7 @@ def test_render_home(txrequest, root_with_egg): content = root_with_egg.children[b""].render_GET(txrequest) expect_headers = { b"Content-Type": [b"text/html; charset=utf-8"], - b"Content-Length": [b"714"], + b"Content-Length": [b"736" if has_settings(root_with_egg) else b"714"], } if root_with_egg.local_items: expect_headers[b"Content-Length"] = [b"751"]