From a5a8f42a6a259bde5adcf658e6390499ea33d193 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Iv=C3=A1n=20de=20Prado?= Date: Mon, 29 Nov 2021 16:50:32 +0100 Subject: [PATCH 01/34] meta module --- setup.py | 1 + tests/po_lib/__init__.py | 27 ++++ tests/po_lib/a_module.py | 14 ++ tests/po_lib/an_empty_module.py | 0 tests/po_lib/an_empty_package/__init__.py | 0 tests/po_lib/nested_package/__init__.py | 13 ++ .../po_lib/nested_package/a_nested_module.py | 19 +++ tests/test_decorators.py | 78 +++++++++++ web_poet/__init__.py | 3 +- web_poet/meta.py | 130 ++++++++++++++++++ 10 files changed, 284 insertions(+), 1 deletion(-) create mode 100644 tests/po_lib/__init__.py create mode 100644 tests/po_lib/a_module.py create mode 100644 tests/po_lib/an_empty_module.py create mode 100644 tests/po_lib/an_empty_package/__init__.py create mode 100644 tests/po_lib/nested_package/__init__.py create mode 100644 tests/po_lib/nested_package/a_nested_module.py create mode 100644 tests/test_decorators.py create mode 100644 web_poet/meta.py diff --git a/setup.py b/setup.py index d702e6d7..6ada64e8 100644 --- a/setup.py +++ b/setup.py @@ -22,6 +22,7 @@ install_requires=( 'attrs', 'parsel', + 'url-matcher @ git+ssh://git@github.com/zytedata/url-matcher@main#egg=url-matcher', ), classifiers=( 'Development Status :: 2 - Pre-Alpha', diff --git a/tests/po_lib/__init__.py b/tests/po_lib/__init__.py new file mode 100644 index 00000000..d66e5065 --- /dev/null +++ b/tests/po_lib/__init__.py @@ -0,0 +1,27 @@ +from url_matcher import Patterns + +from web_poet.meta import handle_urls + + +class POTopLevelOverriden1: + ... + + +class POTopLevelOverriden2: + ... + + +# This first annotation is ignored. 
A single annotation per namespace per class is allowed +@handle_urls("example.com", POTopLevelOverriden1) +@handle_urls("example.com", POTopLevelOverriden1, exclude="/*.jpg|", priority=300) +class POTopLevel1: + expected_overrides = POTopLevelOverriden1 + expected_patterns = Patterns(["example.com"], ["/*.jpg|"], priority=300) + + +# The second annotation is for a different namespace +@handle_urls("example.com", POTopLevelOverriden2) +@handle_urls("example.org", POTopLevelOverriden2, namespace="secondary") +class POTopLevel2: + expected_overrides = POTopLevelOverriden2 + expected_patterns = Patterns(["example.com"]) diff --git a/tests/po_lib/a_module.py b/tests/po_lib/a_module.py new file mode 100644 index 00000000..b3d41d3c --- /dev/null +++ b/tests/po_lib/a_module.py @@ -0,0 +1,14 @@ +from url_matcher import Patterns + +from web_poet.meta import handle_urls + + +class POModuleOverriden: + ... + + +@handle_urls("example.com", overrides=POModuleOverriden) +class POModule(object): + expected_overrides = POModuleOverriden + expected_patterns = Patterns(["example.com"]) + diff --git a/tests/po_lib/an_empty_module.py b/tests/po_lib/an_empty_module.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/po_lib/an_empty_package/__init__.py b/tests/po_lib/an_empty_package/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/po_lib/nested_package/__init__.py b/tests/po_lib/nested_package/__init__.py new file mode 100644 index 00000000..e8a8795d --- /dev/null +++ b/tests/po_lib/nested_package/__init__.py @@ -0,0 +1,13 @@ +from url_matcher import Patterns + +from web_poet.meta import handle_urls + + +class PONestedPkgOverriden: + ... 
+ + +@handle_urls(include=["example.com", "example.org"], exclude=["/*.jpg|"], overrides=PONestedPkgOverriden) +class PONestedPkg(object): + expected_overrides = PONestedPkgOverriden + expected_patterns = Patterns(["example.com", "example.org"], ["/*.jpg|"]) diff --git a/tests/po_lib/nested_package/a_nested_module.py b/tests/po_lib/nested_package/a_nested_module.py new file mode 100644 index 00000000..9526d752 --- /dev/null +++ b/tests/po_lib/nested_package/a_nested_module.py @@ -0,0 +1,19 @@ +from url_matcher import Patterns + +from web_poet.meta import handle_urls + + +class PONestedModuleOverriden: + ... + + +class PONestedModuleOverridenSecondary: + ... + + +@handle_urls(include=["example.com", "example.org"], exclude=["/*.jpg|"], overrides=PONestedModuleOverriden) +@handle_urls("example.com", PONestedModuleOverridenSecondary, namespace="secondary") +class PONestedModule(object): + expected_overrides = PONestedModuleOverriden + expected_patterns = Patterns(include=["example.com", "example.org"], exclude=["/*.jpg|"]) + diff --git a/tests/test_decorators.py b/tests/test_decorators.py new file mode 100644 index 00000000..203effbd --- /dev/null +++ b/tests/test_decorators.py @@ -0,0 +1,78 @@ +import sys +from pathlib import Path + +import pytest +from url_matcher import Patterns + +from po_lib import POTopLevel1, POTopLevel2, POTopLevelOverriden1, POTopLevelOverriden2 +from po_lib.a_module import POModule +from po_lib.nested_package import PONestedPkg +from po_lib.nested_package.a_nested_module import PONestedModule, PONestedModuleOverridenSecondary +from web_poet.meta import find_page_object_overrides + + +POS = {POTopLevel1, POTopLevel2, POModule, PONestedPkg, PONestedModule} + + +@pytest.fixture(autouse=True) +def run_before_and_after_tests(tmpdir): + """Fixture to execute asserts before and after a test is run in this module""" + + # Ensuring po_lib is in the packages path + tests_path = str(Path(__file__).absolute().parent) + sys.path.append(tests_path) + + 
yield # this is where the testing happens + + # Cleaning up path + del sys.path[-1] + + +def test_list_page_objects_from_pkg(): + """Tests that metadata is extracted properly from the po_lib package""" + pos = find_page_object_overrides("po_lib") + assert pos.keys() == POS + + for po, spec in pos.items(): + assert spec.overrides == po.expected_overrides, po + assert spec.patterns == po.expected_patterns, po + + +def test_list_page_objects_from_module(): + pos = find_page_object_overrides("po_lib.a_module") + assert len(pos) == 1 + spec = pos[POModule] + assert spec.patterns == POModule.expected_patterns + assert spec.overrides == POModule.expected_overrides + + +def test_list_page_objects_from_empty_module(): + pos = find_page_object_overrides("po_lib.an_empty_module") + assert len(pos) == 0 + + +def test_list_page_objects_from_empty_pkg(): + pos = find_page_object_overrides("po_lib.an_empty_package") + assert len(pos) == 0 + + +def test_list_page_objects_from_unknown_module(): + with pytest.raises(ImportError): + find_page_object_overrides("po_lib.unknown_module") + + +def test_list_page_objects_from_namespace(): + pos = find_page_object_overrides("po_lib", namespace="secondary") + assert len(pos) == 2 + + potop2 = pos[POTopLevel2] + assert potop2.patterns == Patterns(["example.org"]) + assert potop2.overrides == POTopLevelOverriden2 + + pones = pos[PONestedModule] + assert pones.patterns == Patterns(["example.com"]) + assert pones.overrides == PONestedModuleOverridenSecondary + + +def test_list_page_objects_from_empty_namespace(): + assert find_page_object_overrides("po_lib", namespace="foo") == {} diff --git a/web_poet/__init__.py b/web_poet/__init__.py index e5bf1f54..cb17f8d7 100644 --- a/web_poet/__init__.py +++ b/web_poet/__init__.py @@ -1,2 +1,3 @@ from .pages import WebPage, ItemPage, ItemWebPage, Injectable -from .page_inputs import ResponseData \ No newline at end of file +from .page_inputs import ResponseData +from .meta import handle_urls \ No newline 
at end of file diff --git a/web_poet/meta.py b/web_poet/meta.py new file mode 100644 index 00000000..ba165138 --- /dev/null +++ b/web_poet/meta.py @@ -0,0 +1,130 @@ +import importlib +import importlib.util +import pkgutil +import sys +from dataclasses import dataclass +from typing import Iterable, Union, List, Callable, Dict + +from url_matcher import Patterns + + +HANDLE_URLS_NAMESPACES_KEY = "_handle_urls_namespaces_" + + +@dataclass(frozen=True) +class HandleUrlsSpec: + patterns: Patterns + overrides: Callable + + +def _as_list(value: Union[str, Iterable[str], None]) -> List[str]: + """ + >>> _as_list(None) + [] + >>> _as_list("foo") + ['foo'] + >>> _as_list(["foo", "bar"]) + ['foo', 'bar'] + """ + if value is None: + return [] + if isinstance(value, str): + return [value] + return list(value) + + +def handle_urls(include: Union[str, Iterable[str]], + overrides: Callable, + *, + exclude: Union[str, Iterable[str], None] = None, + priority: int = 500, + namespace: str = "", + ): + """ + Class decorator that indicates that the decorated Page Object should be used instead of the overridden one + for a particular set the URLs. + + Which Page Object is overridden is determined by the `overrides` parameter. + + Over which URLs the overridden happens is determined by the `include`, `exclude` and `priority` parameters. + See the documentation of the `url-matcher` package for more information about them. + + Different namespaces can be used to create different groups of annotations. The default namespace is the empty + string. + + For the example, the following Page Object is decorated with the `handle_urls` decorator: + + .. code-block:: python + + @handle_urls("example.com", overrides=ProductPageObject) + class ExampleComProductPage(ItemPage): + ... + + The annotation indicates that the `ExampleComProductPage` Page Object should be used + instead of the `ProductPageObject` Page Object for all the URLs whose domain is `example.com`. 
+ + :param include: Defines the URLs that should be handled by the overridden Page Object. + :param overrides: The Page Object that should be replaced by the annotated one. + :param exclude: Defines URLs over which the override should not happen. + :param priority: The priority in case of conflicting annotations. + """ + + def wrapper(cls): + module = sys.modules[cls.__module__] + if not hasattr(module, HANDLE_URLS_NAMESPACES_KEY): + setattr(module, HANDLE_URLS_NAMESPACES_KEY, {}) + + handle_urls_dict = getattr(module, HANDLE_URLS_NAMESPACES_KEY) + spec = HandleUrlsSpec( + patterns=Patterns( + include=_as_list(include), + exclude=_as_list(exclude), + priority=priority), + overrides=overrides + ) + namespace_dict = handle_urls_dict.setdefault(namespace, {}) + if cls not in namespace_dict: + # If it was already defined, we don't want to override it + namespace_dict[cls] = spec + return cls + + return wrapper + + +def walk_modules(module: str) -> Iterable[type]: + """ + Return all modules from a module recursively. Note that this will import all the modules and submodules. + It returns the provided module as well. + """ + def onerror(mod): + raise + + spec = importlib.util.find_spec(module) + if not spec: + raise ImportError(f"Module {module} not found") + mod = importlib.import_module(spec.name) + yield mod + if spec.submodule_search_locations: + for info in pkgutil.walk_packages(spec.submodule_search_locations, f"{spec.name}.", onerror): + mod = importlib.import_module(info.name) + yield mod + + +def find_page_object_overrides(module: str, namespace: str = "") -> Dict[Callable, HandleUrlsSpec]: + """ + Find all the Page Objects overrides in the given module/package and it submodules. + + Only the page objects that have been decorated with the `handle_urls` decorator will be returned. + + Note that this will import the module and its submodules. 
+ + :param module: The module or package to search in + :param namespace: Only return page objects in this namespace + :return: Return a dictionary with all the page objects where the key is the page object type and the value is its + associated :py:class:`web_poet.decorators.HandleUrlsSpec` metadata. + """ + page_objects = {} + for module in walk_modules(module): + handle_urls_dict = getattr(module, HANDLE_URLS_NAMESPACES_KEY, {}) + page_objects.update(handle_urls_dict.get(namespace) or {}) + return page_objects From ec80b691d0e63ffe2e740b38734c2676aaa5d1d1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Iv=C3=A1n=20de=20Prado?= Date: Mon, 29 Nov 2021 17:36:57 +0100 Subject: [PATCH 02/34] CMD for listing overrides --- setup.py | 1 + web_poet/__main__.py | 33 +++++++++++++++++++++++++++++++++ 2 files changed, 34 insertions(+) create mode 100644 web_poet/__main__.py diff --git a/setup.py b/setup.py index 6ada64e8..c8e66aed 100644 --- a/setup.py +++ b/setup.py @@ -23,6 +23,7 @@ 'attrs', 'parsel', 'url-matcher @ git+ssh://git@github.com/zytedata/url-matcher@main#egg=url-matcher', + 'tabulate', ), classifiers=( 'Development Status :: 2 - Pre-Alpha', diff --git a/web_poet/__main__.py b/web_poet/__main__.py new file mode 100644 index 00000000..88c3c49e --- /dev/null +++ b/web_poet/__main__.py @@ -0,0 +1,33 @@ +import argparse + +import tabulate + +from web_poet.meta import find_page_object_overrides + + +def qualified_name(cls: callable) -> str: + return f"{cls.__module__}.{cls.__name__}" + + +def main(): + parser = argparse.ArgumentParser(description= + "Tool that list the Page Object overrides from a package or module recursively") + parser.add_argument( + "module", + metavar="PKG_OR_MODULE", + type=str, + help="A package or module to list overrides from", + ) + parser.add_argument( + "--namespace", "-n", metavar="NAMESPACE", type=str, help="Namespace to list overrides from", + default="" + ) + args = parser.parse_args() + table = [("Use this", "instead of that", "for URL 
patterns", "else these URL patterns", "with priority")] + table += [(qualified_name(po), qualified_name(meta.overrides), meta.patterns.include, meta.patterns.exclude, meta.patterns.priority) + for po, meta in find_page_object_overrides(args.module, args.namespace).items()] + print(tabulate.tabulate(table, headers="firstrow")) + + +if __name__ == "__main__": + main() \ No newline at end of file From 308bd1d3340d522e845e3630a236d16ae8208349 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Iv=C3=A1n=20de=20Prado?= Date: Tue, 30 Nov 2021 10:29:03 +0100 Subject: [PATCH 03/34] Refactoring with better names and structures and meta inclusion --- tests/po_lib/__init__.py | 4 +- tests/po_lib/a_module.py | 5 +- tests/po_lib/nested_package/__init__.py | 3 +- .../po_lib/nested_package/a_nested_module.py | 3 +- .../{test_decorators.py => test_overrides.py} | 58 +++++++++++-------- web_poet/__init__.py | 2 +- web_poet/__main__.py | 15 ++--- web_poet/{meta.py => overrides.py} | 47 +++++++++------ 8 files changed, 82 insertions(+), 55 deletions(-) rename tests/{test_decorators.py => test_overrides.py} (50%) rename web_poet/{meta.py => overrides.py} (70%) diff --git a/tests/po_lib/__init__.py b/tests/po_lib/__init__.py index d66e5065..8e1ca325 100644 --- a/tests/po_lib/__init__.py +++ b/tests/po_lib/__init__.py @@ -1,6 +1,6 @@ from url_matcher import Patterns -from web_poet.meta import handle_urls +from web_poet import handle_urls class POTopLevelOverriden1: @@ -17,6 +17,7 @@ class POTopLevelOverriden2: class POTopLevel1: expected_overrides = POTopLevelOverriden1 expected_patterns = Patterns(["example.com"], ["/*.jpg|"], priority=300) + expected_meta = {} # The second annotation is for a different namespace @@ -25,3 +26,4 @@ class POTopLevel1: class POTopLevel2: expected_overrides = POTopLevelOverriden2 expected_patterns = Patterns(["example.com"]) + expected_meta = {} diff --git a/tests/po_lib/a_module.py b/tests/po_lib/a_module.py index b3d41d3c..88dfe6ca 100644 --- 
a/tests/po_lib/a_module.py +++ b/tests/po_lib/a_module.py @@ -1,14 +1,15 @@ from url_matcher import Patterns -from web_poet.meta import handle_urls +from web_poet import handle_urls class POModuleOverriden: ... -@handle_urls("example.com", overrides=POModuleOverriden) +@handle_urls("example.com", overrides=POModuleOverriden, extra_arg="foo") class POModule(object): expected_overrides = POModuleOverriden expected_patterns = Patterns(["example.com"]) + expected_meta = {"extra_arg": "foo"} diff --git a/tests/po_lib/nested_package/__init__.py b/tests/po_lib/nested_package/__init__.py index e8a8795d..49eaeb36 100644 --- a/tests/po_lib/nested_package/__init__.py +++ b/tests/po_lib/nested_package/__init__.py @@ -1,6 +1,6 @@ from url_matcher import Patterns -from web_poet.meta import handle_urls +from web_poet import handle_urls class PONestedPkgOverriden: @@ -11,3 +11,4 @@ class PONestedPkgOverriden: class PONestedPkg(object): expected_overrides = PONestedPkgOverriden expected_patterns = Patterns(["example.com", "example.org"], ["/*.jpg|"]) + expected_meta = {} diff --git a/tests/po_lib/nested_package/a_nested_module.py b/tests/po_lib/nested_package/a_nested_module.py index 9526d752..dc866a69 100644 --- a/tests/po_lib/nested_package/a_nested_module.py +++ b/tests/po_lib/nested_package/a_nested_module.py @@ -1,6 +1,6 @@ from url_matcher import Patterns -from web_poet.meta import handle_urls +from web_poet import handle_urls class PONestedModuleOverriden: @@ -16,4 +16,5 @@ class PONestedModuleOverridenSecondary: class PONestedModule(object): expected_overrides = PONestedModuleOverriden expected_patterns = Patterns(include=["example.com", "example.org"], exclude=["/*.jpg|"]) + expected_meta = {} diff --git a/tests/test_decorators.py b/tests/test_overrides.py similarity index 50% rename from tests/test_decorators.py rename to tests/test_overrides.py index 203effbd..293e2f1e 100644 --- a/tests/test_decorators.py +++ b/tests/test_overrides.py @@ -4,11 +4,11 @@ import pytest 
from url_matcher import Patterns -from po_lib import POTopLevel1, POTopLevel2, POTopLevelOverriden1, POTopLevelOverriden2 +from po_lib import POTopLevel1, POTopLevel2, POTopLevelOverriden2 from po_lib.a_module import POModule from po_lib.nested_package import PONestedPkg from po_lib.nested_package.a_nested_module import PONestedModule, PONestedModuleOverridenSecondary -from web_poet.meta import find_page_object_overrides +from web_poet.overrides import find_page_object_overrides POS = {POTopLevel1, POTopLevel2, POModule, PONestedPkg, PONestedModule} @@ -30,30 +30,32 @@ def run_before_and_after_tests(tmpdir): def test_list_page_objects_from_pkg(): """Tests that metadata is extracted properly from the po_lib package""" - pos = find_page_object_overrides("po_lib") - assert pos.keys() == POS + rules = find_page_object_overrides("po_lib") + assert {po.use for po in rules} == POS - for po, spec in pos.items(): - assert spec.overrides == po.expected_overrides, po - assert spec.patterns == po.expected_patterns, po + for rule in rules: + assert rule.instead_of == rule.use.expected_overrides, rule.use + assert rule.for_patterns == rule.use.expected_patterns, rule.use + assert rule.meta == rule.use.expected_meta, rule.use def test_list_page_objects_from_module(): - pos = find_page_object_overrides("po_lib.a_module") - assert len(pos) == 1 - spec = pos[POModule] - assert spec.patterns == POModule.expected_patterns - assert spec.overrides == POModule.expected_overrides + rules = find_page_object_overrides("po_lib.a_module") + assert len(rules) == 1 + rule = rules[0] + assert rule.use == POModule + assert rule.for_patterns == POModule.expected_patterns + assert rule.instead_of == POModule.expected_overrides def test_list_page_objects_from_empty_module(): - pos = find_page_object_overrides("po_lib.an_empty_module") - assert len(pos) == 0 + rules = find_page_object_overrides("po_lib.an_empty_module") + assert len(rules) == 0 def test_list_page_objects_from_empty_pkg(): - pos = 
find_page_object_overrides("po_lib.an_empty_package") - assert len(pos) == 0 + rules = find_page_object_overrides("po_lib.an_empty_package") + assert len(rules) == 0 def test_list_page_objects_from_unknown_module(): @@ -62,17 +64,23 @@ def test_list_page_objects_from_unknown_module(): def test_list_page_objects_from_namespace(): - pos = find_page_object_overrides("po_lib", namespace="secondary") - assert len(pos) == 2 + rules = find_page_object_overrides("po_lib", namespace="secondary") + assert len(rules) == 2 + rule_for = {po.use: po for po in rules} - potop2 = pos[POTopLevel2] - assert potop2.patterns == Patterns(["example.org"]) - assert potop2.overrides == POTopLevelOverriden2 + potop2 = rule_for[POTopLevel2] + assert potop2.for_patterns == Patterns(["example.org"]) + assert potop2.instead_of == POTopLevelOverriden2 - pones = pos[PONestedModule] - assert pones.patterns == Patterns(["example.com"]) - assert pones.overrides == PONestedModuleOverridenSecondary + pones = rule_for[PONestedModule] + assert pones.for_patterns == Patterns(["example.com"]) + assert pones.instead_of == PONestedModuleOverridenSecondary def test_list_page_objects_from_empty_namespace(): - assert find_page_object_overrides("po_lib", namespace="foo") == {} + assert find_page_object_overrides("po_lib", namespace="foo") == [] + + +def test_cmd(): + from web_poet.__main__ import main + main(["po_lib"]) \ No newline at end of file diff --git a/web_poet/__init__.py b/web_poet/__init__.py index cb17f8d7..c19cd6b1 100644 --- a/web_poet/__init__.py +++ b/web_poet/__init__.py @@ -1,3 +1,3 @@ from .pages import WebPage, ItemPage, ItemWebPage, Injectable from .page_inputs import ResponseData -from .meta import handle_urls \ No newline at end of file +from .overrides import handle_urls \ No newline at end of file diff --git a/web_poet/__main__.py b/web_poet/__main__.py index 88c3c49e..2af4ea89 100644 --- a/web_poet/__main__.py +++ b/web_poet/__main__.py @@ -1,15 +1,16 @@ import argparse +from typing 
import Callable import tabulate -from web_poet.meta import find_page_object_overrides +from web_poet.overrides import find_page_object_overrides -def qualified_name(cls: callable) -> str: +def qualified_name(cls: Callable) -> str: return f"{cls.__module__}.{cls.__name__}" -def main(): +def main(args=None): parser = argparse.ArgumentParser(description= "Tool that list the Page Object overrides from a package or module recursively") parser.add_argument( @@ -22,10 +23,10 @@ def main(): "--namespace", "-n", metavar="NAMESPACE", type=str, help="Namespace to list overrides from", default="" ) - args = parser.parse_args() - table = [("Use this", "instead of that", "for URL patterns", "else these URL patterns", "with priority")] - table += [(qualified_name(po), qualified_name(meta.overrides), meta.patterns.include, meta.patterns.exclude, meta.patterns.priority) - for po, meta in find_page_object_overrides(args.module, args.namespace).items()] + args = parser.parse_args(args) + table = [("Use this", "instead of that", "for URL patterns", "else these URL patterns", "with priority", "meta")] + table += [(qualified_name(rule.use), qualified_name(rule.instead_of), rule.for_patterns.include, rule.for_patterns.exclude, rule.for_patterns.priority, rule.meta) + for rule in find_page_object_overrides(args.module, args.namespace)] print(tabulate.tabulate(table, headers="firstrow")) diff --git a/web_poet/meta.py b/web_poet/overrides.py similarity index 70% rename from web_poet/meta.py rename to web_poet/overrides.py index ba165138..329d4174 100644 --- a/web_poet/meta.py +++ b/web_poet/overrides.py @@ -2,19 +2,28 @@ import importlib.util import pkgutil import sys -from dataclasses import dataclass -from typing import Iterable, Union, List, Callable, Dict +from dataclasses import dataclass, field +from typing import Iterable, Union, List, Callable, Dict, Any from url_matcher import Patterns -HANDLE_URLS_NAMESPACES_KEY = "_handle_urls_namespaces_" +OVERRIDES_NAMESPACES_KEY = 
"_overrides_namespaces_" @dataclass(frozen=True) class HandleUrlsSpec: patterns: Patterns overrides: Callable + meta: Dict[str, Any] = field(default_factory=dict) + + +@dataclass(frozen=True) +class OverrideRule: + for_patterns: Patterns + use: Callable + instead_of: Callable + meta: Dict[str, Any] = field(default_factory=dict) def _as_list(value: Union[str, Iterable[str], None]) -> List[str]: @@ -39,6 +48,7 @@ def handle_urls(include: Union[str, Iterable[str]], exclude: Union[str, Iterable[str], None] = None, priority: int = 500, namespace: str = "", + **kwargs: Dict[str, Any] ): """ Class decorator that indicates that the decorated Page Object should be used instead of the overridden one @@ -46,7 +56,7 @@ def handle_urls(include: Union[str, Iterable[str]], Which Page Object is overridden is determined by the `overrides` parameter. - Over which URLs the overridden happens is determined by the `include`, `exclude` and `priority` parameters. + Over which URLs the override happens is determined by the `include`, `exclude` and `priority` parameters. See the documentation of the `url-matcher` package for more information about them. Different namespaces can be used to create different groups of annotations. The default namespace is the empty @@ -61,26 +71,29 @@ class ExampleComProductPage(ItemPage): ... The annotation indicates that the `ExampleComProductPage` Page Object should be used - instead of the `ProductPageObject` Page Object for all the URLs whose domain is `example.com`. + instead of the `ProductPageObject` Page Object for all the URLs whose top level domain is `example.com`. + + Any extra parameters are stored as meta information that can be later used. :param include: Defines the URLs that should be handled by the overridden Page Object. :param overrides: The Page Object that should be replaced by the annotated one. :param exclude: Defines URLs over which the override should not happen. - :param priority: The priority in case of conflicting annotations. 
+ :param priority: The resolution priority in case of conflicting annotations. """ def wrapper(cls): module = sys.modules[cls.__module__] - if not hasattr(module, HANDLE_URLS_NAMESPACES_KEY): - setattr(module, HANDLE_URLS_NAMESPACES_KEY, {}) + if not hasattr(module, OVERRIDES_NAMESPACES_KEY): + setattr(module, OVERRIDES_NAMESPACES_KEY, {}) - handle_urls_dict = getattr(module, HANDLE_URLS_NAMESPACES_KEY) + handle_urls_dict = getattr(module, OVERRIDES_NAMESPACES_KEY) spec = HandleUrlsSpec( patterns=Patterns( include=_as_list(include), exclude=_as_list(exclude), priority=priority), - overrides=overrides + overrides=overrides, + meta=kwargs, ) namespace_dict = handle_urls_dict.setdefault(namespace, {}) if cls not in namespace_dict: @@ -110,21 +123,21 @@ def onerror(mod): yield mod -def find_page_object_overrides(module: str, namespace: str = "") -> Dict[Callable, HandleUrlsSpec]: +def find_page_object_overrides(module: str, namespace: str = "") -> List[OverrideRule]: """ Find all the Page Objects overrides in the given module/package and it submodules. - Only the page objects that have been decorated with the `handle_urls` decorator will be returned. + The page objects that have been decorated with the `handle_urls` decorator will be returned. Note that this will import the module and its submodules. :param module: The module or package to search in - :param namespace: Only return page objects in this namespace - :return: Return a dictionary with all the page objects where the key is the page object type and the value is its - associated :py:class:`web_poet.decorators.HandleUrlsSpec` metadata. + :param namespace: Only return page objects overrides in this namespace + :return: Return a list of :py:class:`web_poet.overrides.OverrideRule` metadata. 
""" page_objects = {} for module in walk_modules(module): - handle_urls_dict = getattr(module, HANDLE_URLS_NAMESPACES_KEY, {}) + handle_urls_dict = getattr(module, OVERRIDES_NAMESPACES_KEY, {}) page_objects.update(handle_urls_dict.get(namespace) or {}) - return page_objects + return [OverrideRule(for_patterns=spec.patterns, use=po, instead_of=spec.overrides, meta=spec.meta) + for po, spec in page_objects.items()] From aa8000dcc956cec3ddaeba7702a7eff7b3c581ea Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Iv=C3=A1n=20de=20Prado?= Date: Tue, 30 Nov 2021 10:43:47 +0100 Subject: [PATCH 04/34] docstring --- tests/po_lib/__init__.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tests/po_lib/__init__.py b/tests/po_lib/__init__.py index 8e1ca325..d94653ee 100644 --- a/tests/po_lib/__init__.py +++ b/tests/po_lib/__init__.py @@ -1,3 +1,6 @@ +""" +This package is just for overrides testing purposes. +""" from url_matcher import Patterns from web_poet import handle_urls From a2d5cb67d7863a0a71e2647e21c9a6fc63e390f7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Iv=C3=A1n=20de=20Prado?= Date: Tue, 30 Nov 2021 10:48:01 +0100 Subject: [PATCH 05/34] Fix url_matcher dep --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index c8e66aed..579e633c 100644 --- a/setup.py +++ b/setup.py @@ -22,7 +22,7 @@ install_requires=( 'attrs', 'parsel', - 'url-matcher @ git+ssh://git@github.com/zytedata/url-matcher@main#egg=url-matcher', + 'url-matcher @ git+https://git@github.com/zytedata/url-matcher@main#egg=url-matcher', 'tabulate', ), classifiers=( From 1f1f410db28f2ff0a8870b2d0c5decd43bd91238 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Iv=C3=A1n=20de=20Prado?= Date: Tue, 30 Nov 2021 10:59:15 +0100 Subject: [PATCH 06/34] Fix CI tests --- tests/test_overrides.py | 41 ++++++++++++----------------------------- 1 file changed, 12 insertions(+), 29 deletions(-) diff --git a/tests/test_overrides.py b/tests/test_overrides.py index 293e2f1e..1064a5da 100644 --- 
a/tests/test_overrides.py +++ b/tests/test_overrides.py @@ -1,36 +1,19 @@ -import sys -from pathlib import Path - import pytest from url_matcher import Patterns -from po_lib import POTopLevel1, POTopLevel2, POTopLevelOverriden2 -from po_lib.a_module import POModule -from po_lib.nested_package import PONestedPkg -from po_lib.nested_package.a_nested_module import PONestedModule, PONestedModuleOverridenSecondary +from tests.po_lib import POTopLevel1, POTopLevel2, POTopLevelOverriden2 +from tests.po_lib.a_module import POModule +from tests.po_lib.nested_package import PONestedPkg +from tests.po_lib.nested_package.a_nested_module import PONestedModule, PONestedModuleOverridenSecondary from web_poet.overrides import find_page_object_overrides POS = {POTopLevel1, POTopLevel2, POModule, PONestedPkg, PONestedModule} -@pytest.fixture(autouse=True) -def run_before_and_after_tests(tmpdir): - """Fixture to execute asserts before and after a test is run in this module""" - - # Ensuring po_lib is in the packages path - tests_path = str(Path(__file__).absolute().parent) - sys.path.append(tests_path) - - yield # this is where the testing happens - - # Cleaning up path - del sys.path[-1] - - def test_list_page_objects_from_pkg(): """Tests that metadata is extracted properly from the po_lib package""" - rules = find_page_object_overrides("po_lib") + rules = find_page_object_overrides("tests.po_lib") assert {po.use for po in rules} == POS for rule in rules: @@ -40,7 +23,7 @@ def test_list_page_objects_from_pkg(): def test_list_page_objects_from_module(): - rules = find_page_object_overrides("po_lib.a_module") + rules = find_page_object_overrides("tests.po_lib.a_module") assert len(rules) == 1 rule = rules[0] assert rule.use == POModule @@ -49,22 +32,22 @@ def test_list_page_objects_from_module(): def test_list_page_objects_from_empty_module(): - rules = find_page_object_overrides("po_lib.an_empty_module") + rules = find_page_object_overrides("tests.po_lib.an_empty_module") assert 
len(rules) == 0 def test_list_page_objects_from_empty_pkg(): - rules = find_page_object_overrides("po_lib.an_empty_package") + rules = find_page_object_overrides("tests.po_lib.an_empty_package") assert len(rules) == 0 def test_list_page_objects_from_unknown_module(): with pytest.raises(ImportError): - find_page_object_overrides("po_lib.unknown_module") + find_page_object_overrides("tests.po_lib.unknown_module") def test_list_page_objects_from_namespace(): - rules = find_page_object_overrides("po_lib", namespace="secondary") + rules = find_page_object_overrides("tests.po_lib", namespace="secondary") assert len(rules) == 2 rule_for = {po.use: po for po in rules} @@ -78,9 +61,9 @@ def test_list_page_objects_from_namespace(): def test_list_page_objects_from_empty_namespace(): - assert find_page_object_overrides("po_lib", namespace="foo") == [] + assert find_page_object_overrides("tests.po_lib", namespace="foo") == [] def test_cmd(): from web_poet.__main__ import main - main(["po_lib"]) \ No newline at end of file + main(["tests.po_lib"]) \ No newline at end of file From bdb8987b27287c9c869a5536bab3222b1260b110 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Iv=C3=A1n=20de=20Prado?= Date: Tue, 30 Nov 2021 11:18:21 +0100 Subject: [PATCH 07/34] Make mypy happy again --- CHANGELOG.rst | 5 +++++ tests/po_lib/__init__.py | 17 +++++++++++++---- tests/po_lib/a_module.py | 5 +++-- tests/po_lib/nested_package/__init__.py | 5 +++-- tests/po_lib/nested_package/a_nested_module.py | 5 +++-- tox.ini | 1 + web_poet/overrides.py | 6 +++--- 7 files changed, 31 insertions(+), 13 deletions(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index b6a215a0..09dd1608 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -2,6 +2,11 @@ Changelog ========= +TBR +------------------ + +* ``handle_urls`` decorator and ``find_page_object_overrides`` function added. 
+ 0.1.1 (2021-06-02) ------------------ diff --git a/tests/po_lib/__init__.py b/tests/po_lib/__init__.py index d94653ee..05612595 100644 --- a/tests/po_lib/__init__.py +++ b/tests/po_lib/__init__.py @@ -1,11 +1,20 @@ """ This package is just for overrides testing purposes. """ +from collections import Callable +from typing import Dict, Any + from url_matcher import Patterns from web_poet import handle_urls +class POBase: + expected_overrides: Callable + expected_patterns: Patterns + expected_meta: Dict[str, Any] + + class POTopLevelOverriden1: ... @@ -17,16 +26,16 @@ class POTopLevelOverriden2: # This first annotation is ignored. A single annotation per namespace per class is allowed @handle_urls("example.com", POTopLevelOverriden1) @handle_urls("example.com", POTopLevelOverriden1, exclude="/*.jpg|", priority=300) -class POTopLevel1: +class POTopLevel1(POBase): expected_overrides = POTopLevelOverriden1 expected_patterns = Patterns(["example.com"], ["/*.jpg|"], priority=300) - expected_meta = {} + expected_meta = {} # type: ignore # The second annotation is for a different namespace @handle_urls("example.com", POTopLevelOverriden2) @handle_urls("example.org", POTopLevelOverriden2, namespace="secondary") -class POTopLevel2: +class POTopLevel2(POBase): expected_overrides = POTopLevelOverriden2 expected_patterns = Patterns(["example.com"]) - expected_meta = {} + expected_meta = {} # type: ignore diff --git a/tests/po_lib/a_module.py b/tests/po_lib/a_module.py index 88dfe6ca..0dcf04c6 100644 --- a/tests/po_lib/a_module.py +++ b/tests/po_lib/a_module.py @@ -1,5 +1,6 @@ from url_matcher import Patterns +from tests.po_lib import POBase from web_poet import handle_urls @@ -8,8 +9,8 @@ class POModuleOverriden: @handle_urls("example.com", overrides=POModuleOverriden, extra_arg="foo") -class POModule(object): +class POModule(POBase): expected_overrides = POModuleOverriden expected_patterns = Patterns(["example.com"]) - expected_meta = {"extra_arg": "foo"} + expected_meta = 
{"extra_arg": "foo"} # type: ignore diff --git a/tests/po_lib/nested_package/__init__.py b/tests/po_lib/nested_package/__init__.py index 49eaeb36..537a995d 100644 --- a/tests/po_lib/nested_package/__init__.py +++ b/tests/po_lib/nested_package/__init__.py @@ -1,5 +1,6 @@ from url_matcher import Patterns +from tests.po_lib import POBase from web_poet import handle_urls @@ -8,7 +9,7 @@ class PONestedPkgOverriden: @handle_urls(include=["example.com", "example.org"], exclude=["/*.jpg|"], overrides=PONestedPkgOverriden) -class PONestedPkg(object): +class PONestedPkg(POBase): expected_overrides = PONestedPkgOverriden expected_patterns = Patterns(["example.com", "example.org"], ["/*.jpg|"]) - expected_meta = {} + expected_meta = {} # type: ignore diff --git a/tests/po_lib/nested_package/a_nested_module.py b/tests/po_lib/nested_package/a_nested_module.py index dc866a69..7d9536b3 100644 --- a/tests/po_lib/nested_package/a_nested_module.py +++ b/tests/po_lib/nested_package/a_nested_module.py @@ -1,5 +1,6 @@ from url_matcher import Patterns +from tests.po_lib import POBase from web_poet import handle_urls @@ -13,8 +14,8 @@ class PONestedModuleOverridenSecondary: @handle_urls(include=["example.com", "example.org"], exclude=["/*.jpg|"], overrides=PONestedModuleOverriden) @handle_urls("example.com", PONestedModuleOverridenSecondary, namespace="secondary") -class PONestedModule(object): +class PONestedModule(POBase): expected_overrides = PONestedModuleOverriden expected_patterns = Patterns(include=["example.com", "example.org"], exclude=["/*.jpg|"]) - expected_meta = {} + expected_meta = {} # type: ignore diff --git a/tox.ini b/tox.ini index 836537c6..2339e2b5 100644 --- a/tox.ini +++ b/tox.ini @@ -15,6 +15,7 @@ commands = [testenv:mypy] deps = mypy + types-tabulate commands = mypy --ignore-missing-imports web_poet tests diff --git a/web_poet/overrides.py b/web_poet/overrides.py index 329d4174..2043500a 100644 --- a/web_poet/overrides.py +++ b/web_poet/overrides.py @@ -48,7 +48,7 
@@ def handle_urls(include: Union[str, Iterable[str]], exclude: Union[str, Iterable[str], None] = None, priority: int = 500, namespace: str = "", - **kwargs: Dict[str, Any] + **kwargs ): """ Class decorator that indicates that the decorated Page Object should be used instead of the overridden one @@ -104,7 +104,7 @@ def wrapper(cls): return wrapper -def walk_modules(module: str) -> Iterable[type]: +def walk_modules(module: str) -> Iterable: """ Return all modules from a module recursively. Note that this will import all the modules and submodules. It returns the provided module as well. @@ -135,7 +135,7 @@ def find_page_object_overrides(module: str, namespace: str = "") -> List[Overrid :param namespace: Only return page objects overrides in this namespace :return: Return a list of :py:class:`web_poet.overrides.OverrideRule` metadata. """ - page_objects = {} + page_objects: Dict[Callable, HandleUrlsSpec] = {} for module in walk_modules(module): handle_urls_dict = getattr(module, OVERRIDES_NAMESPACES_KEY, {}) page_objects.update(handle_urls_dict.get(namespace) or {}) From a3e3eea9b4c99aa51c525ab4cf814b00f53e79a1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Iv=C3=A1n=20de=20Prado?= Date: Tue, 30 Nov 2021 11:39:44 +0100 Subject: [PATCH 08/34] Documentation fixed --- docs/api_reference.rst | 10 ++++++++++ docs/requirements.txt | 16 ++++++++-------- web_poet/overrides.py | 16 +++++++++------- 3 files changed, 27 insertions(+), 15 deletions(-) diff --git a/docs/api_reference.rst b/docs/api_reference.rst index 011f878e..235979c6 100644 --- a/docs/api_reference.rst +++ b/docs/api_reference.rst @@ -45,3 +45,13 @@ Mixins .. autoclass:: web_poet.mixins.ResponseShortcutsMixin :members: :no-special-members: + + +Overrides +========= + +.. autofunction:: web_poet.handle_urls + +.. 
automodule:: web_poet.overrides + :members: + :exclude-members: handle_urls \ No newline at end of file diff --git a/docs/requirements.txt b/docs/requirements.txt index 119c8afe..5374ffd8 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -1,8 +1,8 @@ -Sphinx==3.0.3 -sphinx-rtd-theme==0.4.3 -sphinxcontrib-applehelp==1.0.2 -sphinxcontrib-devhelp==1.0.2 -sphinxcontrib-htmlhelp==1.0.3 -sphinxcontrib-jsmath==1.0.1 -sphinxcontrib-qthelp==1.0.3 -sphinxcontrib-serializinghtml==1.1.4 +Sphinx +sphinx-rtd-theme +sphinxcontrib-applehelp +sphinxcontrib-devhelp +sphinxcontrib-htmlhelp +sphinxcontrib-jsmath +sphinxcontrib-qthelp +sphinxcontrib-serializinghtml diff --git a/web_poet/overrides.py b/web_poet/overrides.py index 2043500a..5ed58984 100644 --- a/web_poet/overrides.py +++ b/web_poet/overrides.py @@ -13,6 +13,7 @@ @dataclass(frozen=True) class HandleUrlsSpec: + """Meta information used by the :py:func:`web_poet.handle_urls` decorator""" patterns: Patterns overrides: Callable meta: Dict[str, Any] = field(default_factory=dict) @@ -20,6 +21,7 @@ class HandleUrlsSpec: @dataclass(frozen=True) class OverrideRule: + """A single override rule. Specify when a page object should be used instead of another""" for_patterns: Patterns use: Callable instead_of: Callable @@ -54,15 +56,15 @@ def handle_urls(include: Union[str, Iterable[str]], Class decorator that indicates that the decorated Page Object should be used instead of the overridden one for a particular set the URLs. - Which Page Object is overridden is determined by the `overrides` parameter. + Which Page Object is overridden is determined by the ``overrides`` parameter. - Over which URLs the override happens is determined by the `include`, `exclude` and `priority` parameters. - See the documentation of the `url-matcher` package for more information about them. + Over which URLs the override happens is determined by the ``include``, ``exclude`` and ``priority`` parameters. 
+ See the documentation of the ``url-matcher`` package for more information about them. Different namespaces can be used to create different groups of annotations. The default namespace is the empty string. - For the example, the following Page Object is decorated with the `handle_urls` decorator: + For the example, the following Page Object is decorated with the ``handle_urls`` decorator: .. code-block:: python @@ -70,8 +72,8 @@ def handle_urls(include: Union[str, Iterable[str]], class ExampleComProductPage(ItemPage): ... - The annotation indicates that the `ExampleComProductPage` Page Object should be used - instead of the `ProductPageObject` Page Object for all the URLs whose top level domain is `example.com`. + The annotation indicates that the ``ExampleComProductPage`` Page Object should be used + instead of the ``ProductPageObject`` Page Object for all the URLs whose top level domain is ``example.com``. Any extra parameters are stored as meta information that can be later used. @@ -127,7 +129,7 @@ def find_page_object_overrides(module: str, namespace: str = "") -> List[Overrid """ Find all the Page Objects overrides in the given module/package and it submodules. - The page objects that have been decorated with the `handle_urls` decorator will be returned. + The page objects that have been decorated with the ``handle_urls`` decorator will be returned. Note that this will import the module and its submodules. 
From ef9945b37e4e266e342e90b23b7193d33b9e3c30 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Iv=C3=A1n=20de=20Prado?= Date: Tue, 30 Nov 2021 11:51:52 +0100 Subject: [PATCH 09/34] Minor changes --- web_poet/__main__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/web_poet/__main__.py b/web_poet/__main__.py index 2af4ea89..5de9f12d 100644 --- a/web_poet/__main__.py +++ b/web_poet/__main__.py @@ -24,7 +24,7 @@ def main(args=None): default="" ) args = parser.parse_args(args) - table = [("Use this", "instead of that", "for URL patterns", "else these URL patterns", "with priority", "meta")] + table = [("Use this", "instead of", "for the URL patterns", "except for the patterns", "with priority", "meta")] table += [(qualified_name(rule.use), qualified_name(rule.instead_of), rule.for_patterns.include, rule.for_patterns.exclude, rule.for_patterns.priority, rule.meta) for rule in find_page_object_overrides(args.module, args.namespace)] print(tabulate.tabulate(table, headers="firstrow")) From f6fdac42ec8ce8339ffa35ba0c03c323a2ba4107 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Iv=C3=A1n=20de=20Prado?= Date: Wed, 1 Dec 2021 13:01:50 +0100 Subject: [PATCH 10/34] url-matcher has now been released. 
--- docs/conf.py | 1 + setup.py | 2 +- web_poet/overrides.py | 3 ++- 3 files changed, 4 insertions(+), 2 deletions(-) diff --git a/docs/conf.py b/docs/conf.py index 353e5968..09dfab08 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -192,4 +192,5 @@ intersphinx_mapping = { 'python': ('https://docs.python.org/3', None, ), 'scrapy': ('https://docs.scrapy.org/en/latest', None, ), + 'url-matcher': ('https://url-matcher.readthedocs.io/en/stable/', None, ), } diff --git a/setup.py b/setup.py index 579e633c..11f8cb79 100644 --- a/setup.py +++ b/setup.py @@ -22,7 +22,7 @@ install_requires=( 'attrs', 'parsel', - 'url-matcher @ git+https://git@github.com/zytedata/url-matcher@main#egg=url-matcher', + 'url-matcher', 'tabulate', ), classifiers=( diff --git a/web_poet/overrides.py b/web_poet/overrides.py index 5ed58984..78e8a235 100644 --- a/web_poet/overrides.py +++ b/web_poet/overrides.py @@ -59,7 +59,8 @@ def handle_urls(include: Union[str, Iterable[str]], Which Page Object is overridden is determined by the ``overrides`` parameter. Over which URLs the override happens is determined by the ``include``, ``exclude`` and ``priority`` parameters. - See the documentation of the ``url-matcher`` package for more information about them. + See the documentation of the `url-matcher `_ + package for more information about them. Different namespaces can be used to create different groups of annotations. The default namespace is the empty string. From ba52ce09e983d7f1589fe84b7ab74d13a37e005b Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal Date: Mon, 20 Dec 2021 13:57:56 +0800 Subject: [PATCH 11/34] add entry point for CLI command --- CHANGELOG.rst | 1 + setup.py | 1 + 2 files changed, 2 insertions(+) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 28b37e16..cbe4c204 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -6,6 +6,7 @@ TBR ------------------ * ``handle_urls`` decorator and ``find_page_object_overrides`` function added. 
+* new CLI tool for displaying all available Page Objects: ``web_poet `` * removed support for Python 3.6 * added support for Python 3.10 diff --git a/setup.py b/setup.py index 8a2f4a1d..86e103bd 100644 --- a/setup.py +++ b/setup.py @@ -14,6 +14,7 @@ author='Scrapinghub', author_email='info@scrapinghub.com', url='https://github.com/scrapinghub/web-poet', + entry_points={'console_scripts': ['web_poet = web_poet.__main__:main']}, packages=find_packages( exclude=( 'tests', From ba616261d4ea24bd4588faf44bc7ef3b8bcc5201 Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal Date: Mon, 20 Dec 2021 14:15:39 +0800 Subject: [PATCH 12/34] fix import which fails tests --- tests/po_lib/__init__.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tests/po_lib/__init__.py b/tests/po_lib/__init__.py index 05612595..fc27e4d6 100644 --- a/tests/po_lib/__init__.py +++ b/tests/po_lib/__init__.py @@ -1,8 +1,7 @@ """ This package is just for overrides testing purposes. """ -from collections import Callable -from typing import Dict, Any +from typing import Dict, Any, Callable from url_matcher import Patterns From f5cffefd521e17c419ef2db004d0fbc717417aa8 Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal Date: Mon, 20 Dec 2021 21:38:20 +0800 Subject: [PATCH 13/34] refactor namespace to be classes instead --- tests/po_lib/__init__.py | 11 +- .../po_lib/nested_package/a_nested_module.py | 4 +- tests/test_overrides.py | 10 +- web_poet/__init__.py | 2 +- web_poet/__main__.py | 39 ++- web_poet/overrides.py | 232 +++++++++++++----- 6 files changed, 222 insertions(+), 76 deletions(-) diff --git a/tests/po_lib/__init__.py b/tests/po_lib/__init__.py index fc27e4d6..03f426e3 100644 --- a/tests/po_lib/__init__.py +++ b/tests/po_lib/__init__.py @@ -5,7 +5,7 @@ from url_matcher import Patterns -from web_poet import handle_urls +from web_poet import handle_urls, PageObjectRegistry class POBase: @@ -22,7 +22,10 @@ class POTopLevelOverriden2: ... -# This first annotation is ignored. 
A single annotation per namespace per class is allowed +secondary_registry = PageObjectRegistry(name="secondary") + + +# This first annotation is ignored. A single annotation per registry is allowed @handle_urls("example.com", POTopLevelOverriden1) @handle_urls("example.com", POTopLevelOverriden1, exclude="/*.jpg|", priority=300) class POTopLevel1(POBase): @@ -31,9 +34,9 @@ class POTopLevel1(POBase): expected_meta = {} # type: ignore -# The second annotation is for a different namespace +# The second annotation is for a different registry @handle_urls("example.com", POTopLevelOverriden2) -@handle_urls("example.org", POTopLevelOverriden2, namespace="secondary") +@secondary_registry.handle_urls("example.org", POTopLevelOverriden2) class POTopLevel2(POBase): expected_overrides = POTopLevelOverriden2 expected_patterns = Patterns(["example.com"]) diff --git a/tests/po_lib/nested_package/a_nested_module.py b/tests/po_lib/nested_package/a_nested_module.py index 7d9536b3..bc2424fe 100644 --- a/tests/po_lib/nested_package/a_nested_module.py +++ b/tests/po_lib/nested_package/a_nested_module.py @@ -1,6 +1,6 @@ from url_matcher import Patterns -from tests.po_lib import POBase +from tests.po_lib import POBase, secondary_registry from web_poet import handle_urls @@ -13,7 +13,7 @@ class PONestedModuleOverridenSecondary: @handle_urls(include=["example.com", "example.org"], exclude=["/*.jpg|"], overrides=PONestedModuleOverriden) -@handle_urls("example.com", PONestedModuleOverridenSecondary, namespace="secondary") +@secondary_registry.handle_urls("example.com", PONestedModuleOverridenSecondary) class PONestedModule(POBase): expected_overrides = PONestedModuleOverriden expected_patterns = Patterns(include=["example.com", "example.org"], exclude=["/*.jpg|"]) diff --git a/tests/test_overrides.py b/tests/test_overrides.py index 1064a5da..53765334 100644 --- a/tests/test_overrides.py +++ b/tests/test_overrides.py @@ -46,8 +46,8 @@ def test_list_page_objects_from_unknown_module(): 
find_page_object_overrides("tests.po_lib.unknown_module") -def test_list_page_objects_from_namespace(): - rules = find_page_object_overrides("tests.po_lib", namespace="secondary") +def test_list_page_objects_from_imported_registry(): + rules = find_page_object_overrides("tests.po_lib", registry="secondary") assert len(rules) == 2 rule_for = {po.use: po for po in rules} @@ -60,10 +60,10 @@ def test_list_page_objects_from_namespace(): assert pones.instead_of == PONestedModuleOverridenSecondary -def test_list_page_objects_from_empty_namespace(): - assert find_page_object_overrides("tests.po_lib", namespace="foo") == [] +def test_list_page_objects_from_non_existing_registry(): + assert find_page_object_overrides("tests.po_lib", registry="not-exist") == [] def test_cmd(): from web_poet.__main__ import main - main(["tests.po_lib"]) \ No newline at end of file + main(["tests.po_lib"]) diff --git a/web_poet/__init__.py b/web_poet/__init__.py index c19cd6b1..586a65c6 100644 --- a/web_poet/__init__.py +++ b/web_poet/__init__.py @@ -1,3 +1,3 @@ from .pages import WebPage, ItemPage, ItemWebPage, Injectable from .page_inputs import ResponseData -from .overrides import handle_urls \ No newline at end of file +from .overrides import handle_urls, PageObjectRegistry diff --git a/web_poet/__main__.py b/web_poet/__main__.py index 5de9f12d..4fa2cf24 100644 --- a/web_poet/__main__.py +++ b/web_poet/__main__.py @@ -11,8 +11,9 @@ def qualified_name(cls: Callable) -> str: def main(args=None): - parser = argparse.ArgumentParser(description= - "Tool that list the Page Object overrides from a package or module recursively") + parser = argparse.ArgumentParser( + description="Tool that list the Page Object overrides from a package or module recursively" + ) parser.add_argument( "module", metavar="PKG_OR_MODULE", @@ -20,15 +21,37 @@ def main(args=None): help="A package or module to list overrides from", ) parser.add_argument( - "--namespace", "-n", metavar="NAMESPACE", type=str, help="Namespace 
to list overrides from", - default="" + "--registry", + "-n", + metavar="REGISTRY_NAME", + type=str, + help="Registry name to list overrides from", + default="default", ) args = parser.parse_args(args) - table = [("Use this", "instead of", "for the URL patterns", "except for the patterns", "with priority", "meta")] - table += [(qualified_name(rule.use), qualified_name(rule.instead_of), rule.for_patterns.include, rule.for_patterns.exclude, rule.for_patterns.priority, rule.meta) - for rule in find_page_object_overrides(args.module, args.namespace)] + table = [ + ( + "Use this", + "instead of", + "for the URL patterns", + "except for the patterns", + "with priority", + "meta", + ) + ] + table += [ + ( + qualified_name(rule.use), + qualified_name(rule.instead_of), + rule.for_patterns.include, + rule.for_patterns.exclude, + rule.for_patterns.priority, + rule.meta, + ) + for rule in find_page_object_overrides(args.module, registry=args.registry) + ] print(tabulate.tabulate(table, headers="firstrow")) if __name__ == "__main__": - main() \ No newline at end of file + main() diff --git a/web_poet/overrides.py b/web_poet/overrides.py index 78e8a235..d551b373 100644 --- a/web_poet/overrides.py +++ b/web_poet/overrides.py @@ -1,5 +1,6 @@ import importlib import importlib.util +import warnings import pkgutil import sys from dataclasses import dataclass, field @@ -7,13 +8,15 @@ from url_matcher import Patterns - -OVERRIDES_NAMESPACES_KEY = "_overrides_namespaces_" +# Used by ``PageObjectRegistry`` to declare itself in a module so that it's +# easily discovered by ``find_page_object_overrides()`` later on. +REGISTRY_MODULE_ANCHOR = "_registry_module_anchor_" @dataclass(frozen=True) class HandleUrlsSpec: """Meta information used by the :py:func:`web_poet.handle_urls` decorator""" + patterns: Patterns overrides: Callable meta: Dict[str, Any] = field(default_factory=dict) @@ -22,6 +25,7 @@ class HandleUrlsSpec: @dataclass(frozen=True) class OverrideRule: """A single override rule. 
Specify when a page object should be used instead of another""" + for_patterns: Patterns use: Callable instead_of: Callable @@ -44,67 +48,161 @@ def _as_list(value: Union[str, Iterable[str], None]) -> List[str]: return list(value) -def handle_urls(include: Union[str, Iterable[str]], - overrides: Callable, - *, - exclude: Union[str, Iterable[str], None] = None, - priority: int = 500, - namespace: str = "", - **kwargs - ): - """ - Class decorator that indicates that the decorated Page Object should be used instead of the overridden one - for a particular set the URLs. +class PageObjectRegistry: + """This contains the mapping rules that associates the Page Objects available + for a given URL matching rule. + + Different Registry classes can be used to create different groups of + annotations. Here's an example usage: + + .. code-block:: python + + from web_poet import PageObjectRegistry + + main_registry = PageObjectRegistry(name="main") + secondary_registry = PageObjectRegistry(name="secondary") + + @main_registry.handle_urls("example.com", overrides=ProductPageObject) + @secondary_registry.handle_urls("example.com", overrides=ProductPageObject) + class ExampleComProductPage(ItemPage): + ... + + The annotation indicates that the ``ExampleComProductPage`` + Page Object should be used instead of the ``ProductPageObject`` Page + Object for all the URLs whose top level domain is ``example.com``. + + Moreover, this rule is available for the two (2) registries we've declared. + This could be useful in cases wherein you want to categorize the rules by + ``PageObjectRegistry``. They could each be accessed via: + + .. code-block:: python - Which Page Object is overridden is determined by the ``overrides`` parameter. + from web_poet import find_page_object_overrides - Over which URLs the override happens is determined by the ``include``, ``exclude`` and ``priority`` parameters. - See the documentation of the `url-matcher `_ - package for more information about them. 
+ po_path = "my_scrapy_project.page_objects" - Different namespaces can be used to create different groups of annotations. The default namespace is the empty - string. + rules_main = find_page_object_overrides(po_path, registry="main") + rules_secondary = find_page_object_overrides(po_path, registry="secondary") - For the example, the following Page Object is decorated with the ``handle_urls`` decorator: + However, ``web-poet`` already contains a default Registry named ``"default"``. + It can be directly accessed via: .. code-block:: python + from web_poet import handle_urls, find_page_object_overrides + @handle_urls("example.com", overrides=ProductPageObject) class ExampleComProductPage(ItemPage): ... - The annotation indicates that the ``ExampleComProductPage`` Page Object should be used - instead of the ``ProductPageObject`` Page Object for all the URLs whose top level domain is ``example.com``. + # The `registry` is already set to 'default' + find_page_object_overrides("my_scrapy_project.page_objects") - Any extra parameters are stored as meta information that can be later used. + Notice that there was no need to directly use the ``PageObjectRegistry`` as + the convenience functions would suffice. In addition, if you need to organize + your Page Objects in your Scrapy project, a single (1) instance of the + ``PageObjectRegistry`` would work, as long as you organize your files + into modules. The rules could then be accessed like: - :param include: Defines the URLs that should be handled by the overridden Page Object. - :param overrides: The Page Object that should be replaced by the annotated one. - :param exclude: Defines URLs over which the override should not happen. - :param priority: The resolution priority in case of conflicting annotations. 
+ * ``find_page_object_overrides("my_scrapy_project.page_objects.site_A")`` + * ``find_page_object_overrides("my_scrapy_project.page_objects.site_B")`` """ - def wrapper(cls): + def __init__(self, name: str = ""): + self.name = name + self.data: Dict[Callable, HandleUrlsSpec] = {} + + def _declare_registry_in_module(self, cls): + """This allows the Registry to be easily discovered later on by + ``find_page_object_overrides()`` by explicitly declaring its presence + on the given module. + """ + module = sys.modules[cls.__module__] - if not hasattr(module, OVERRIDES_NAMESPACES_KEY): - setattr(module, OVERRIDES_NAMESPACES_KEY, {}) - - handle_urls_dict = getattr(module, OVERRIDES_NAMESPACES_KEY) - spec = HandleUrlsSpec( - patterns=Patterns( - include=_as_list(include), - exclude=_as_list(exclude), - priority=priority), - overrides=overrides, - meta=kwargs, - ) - namespace_dict = handle_urls_dict.setdefault(namespace, {}) - if cls not in namespace_dict: + if not hasattr(module, REGISTRY_MODULE_ANCHOR): + registries = {self.name: self} + else: + registries = getattr(module, REGISTRY_MODULE_ANCHOR) + registries[self.name] = self + + setattr(module, REGISTRY_MODULE_ANCHOR, registries) + + def handle_urls( + self, + include: Union[str, Iterable[str]], + overrides: Callable, + *, + exclude: Union[str, Iterable[str], None] = None, + priority: int = 500, + **kwargs, + ): + """ + Class decorator that indicates that the decorated Page Object should be + used instead of the overridden one for a particular set the URLs. + + Which Page Object is overridden is determined by the ``overrides`` + parameter. + + Over which URLs the override happens is determined by the ``include``, + ``exclude`` and ``priority`` parameters. See the documentation of the + `url-matcher `_ package for more + information about them. + + Any extra parameters are stored as meta information that can be later used. + + :param include: Defines the URLs that should be handled by the overridden Page Object. 
+ :param overrides: The Page Object that should be replaced by the annotated one. + :param exclude: Defines URLs over which the override should not happen. + :param priority: The resolution priority in case of conflicting annotations. + """ + + def wrapper(cls): + self._declare_registry_in_module(cls) + + spec = HandleUrlsSpec( + patterns=Patterns( + include=_as_list(include), + exclude=_as_list(exclude), + priority=priority, + ), + overrides=overrides, + meta=kwargs, + ) # If it was already defined, we don't want to override it - namespace_dict[cls] = spec - return cls + if cls not in self.data: + self.data[cls] = spec + else: + warnings.warn( + f"Multiple @handle_urls annotations with the same 'overrides' " + f"are ignored in the same Registry. Ignoring duplicate " + f"annotation on '{include}' for {cls}." + ) + + return cls + + return wrapper - return wrapper + def get_data_from_module(self, module: str) -> Dict[Callable, HandleUrlsSpec]: + """Returns the override mappings that were declared using ``handle_urls`` + in a specific module. + + This is useful if you've organized your Page Objects into multiple + submodules in your project. + """ + return { + cls: spec + for cls, spec in self.data.items() + if cls.__module__.startswith(module.__name__) + } + + def __repr__(self) -> str: + return f"PageObjectRegistry(name='{self.name}')" + + +# For ease of use, we'll create a default registry so that users can simply +# use its `handles_url()` method directly by `from web_poet import handles_url` +default_registry = PageObjectRegistry(name="default") +handle_urls = default_registry.handle_urls def walk_modules(module: str) -> Iterable: @@ -112,7 +210,8 @@ def walk_modules(module: str) -> Iterable: Return all modules from a module recursively. Note that this will import all the modules and submodules. It returns the provided module as well. 
""" - def onerror(mod): + + def onerror(_): raise spec = importlib.util.find_spec(module) @@ -121,26 +220,47 @@ def onerror(mod): mod = importlib.import_module(spec.name) yield mod if spec.submodule_search_locations: - for info in pkgutil.walk_packages(spec.submodule_search_locations, f"{spec.name}.", onerror): + for info in pkgutil.walk_packages( + spec.submodule_search_locations, f"{spec.name}.", onerror + ): mod = importlib.import_module(info.name) yield mod -def find_page_object_overrides(module: str, namespace: str = "") -> List[OverrideRule]: +def find_page_object_overrides( + module: str, registry: str = "default" +) -> List[OverrideRule]: """ - Find all the Page Objects overrides in the given module/package and it submodules. + Find all the Page Objects overrides in the given module/package and its + submodules. - The page objects that have been decorated with the ``handle_urls`` decorator will be returned. + The Page Objects that have been decorated with the ``handle_urls`` decorator + from the specified Registry ``name`` will be returned. - Note that this will import the module and its submodules. + Note that this will explore the `module` and traverse its `submodules`. :param module: The module or package to search in - :param namespace: Only return page objects overrides in this namespace + :param registry: Only return page objects overrides in this registry :return: Return a list of :py:class:`web_poet.overrides.OverrideRule` metadata. 
""" + page_objects: Dict[Callable, HandleUrlsSpec] = {} for module in walk_modules(module): - handle_urls_dict = getattr(module, OVERRIDES_NAMESPACES_KEY, {}) - page_objects.update(handle_urls_dict.get(namespace) or {}) - return [OverrideRule(for_patterns=spec.patterns, use=po, instead_of=spec.overrides, meta=spec.meta) - for po, spec in page_objects.items()] + handle_urls_dict = getattr(module, REGISTRY_MODULE_ANCHOR, {}) + + # A module could have multiple non-default PageObjectRegistry instances + registry = handle_urls_dict.get(registry) + if not registry: + continue + + page_objects.update(registry.get_data_from_module(module)) + + return [ + OverrideRule( + for_patterns=spec.patterns, + use=po, + instead_of=spec.overrides, + meta=spec.meta, + ) + for po, spec in page_objects.items() + ] From c3579b9ad5e63e5d41ee4ee91e989cb5115ea3bb Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal Date: Mon, 20 Dec 2021 22:09:24 +0800 Subject: [PATCH 14/34] fix failing mypy tests after refactoring --- tests/test_overrides.py | 4 ++-- web_poet/__main__.py | 2 +- web_poet/overrides.py | 18 +++++++++--------- 3 files changed, 12 insertions(+), 12 deletions(-) diff --git a/tests/test_overrides.py b/tests/test_overrides.py index 53765334..d8f3dc08 100644 --- a/tests/test_overrides.py +++ b/tests/test_overrides.py @@ -47,7 +47,7 @@ def test_list_page_objects_from_unknown_module(): def test_list_page_objects_from_imported_registry(): - rules = find_page_object_overrides("tests.po_lib", registry="secondary") + rules = find_page_object_overrides("tests.po_lib", registry_name="secondary") assert len(rules) == 2 rule_for = {po.use: po for po in rules} @@ -61,7 +61,7 @@ def test_list_page_objects_from_imported_registry(): def test_list_page_objects_from_non_existing_registry(): - assert find_page_object_overrides("tests.po_lib", registry="not-exist") == [] + assert find_page_object_overrides("tests.po_lib", registry_name="not-exist") == [] def test_cmd(): diff --git a/web_poet/__main__.py 
b/web_poet/__main__.py index 4fa2cf24..f8d0d5d5 100644 --- a/web_poet/__main__.py +++ b/web_poet/__main__.py @@ -48,7 +48,7 @@ def main(args=None): rule.for_patterns.priority, rule.meta, ) - for rule in find_page_object_overrides(args.module, registry=args.registry) + for rule in find_page_object_overrides(args.module, registry_name=args.registry) ] print(tabulate.tabulate(table, headers="firstrow")) diff --git a/web_poet/overrides.py b/web_poet/overrides.py index d551b373..a8119b77 100644 --- a/web_poet/overrides.py +++ b/web_poet/overrides.py @@ -81,8 +81,8 @@ class ExampleComProductPage(ItemPage): po_path = "my_scrapy_project.page_objects" - rules_main = find_page_object_overrides(po_path, registry="main") - rules_secondary = find_page_object_overrides(po_path, registry="secondary") + rules_main = find_page_object_overrides(po_path, registry_name="main") + rules_secondary = find_page_object_overrides(po_path, registry_name="secondary") However, ``web-poet`` already contains a default Registry named ``"default"``. It can be directly accessed via: @@ -192,7 +192,7 @@ def get_data_from_module(self, module: str) -> Dict[Callable, HandleUrlsSpec]: return { cls: spec for cls, spec in self.data.items() - if cls.__module__.startswith(module.__name__) + if cls.__module__.startswith(module) } def __repr__(self) -> str: @@ -228,7 +228,7 @@ def onerror(_): def find_page_object_overrides( - module: str, registry: str = "default" + module: str, registry_name: str = "default" ) -> List[OverrideRule]: """ Find all the Page Objects overrides in the given module/package and its @@ -240,20 +240,20 @@ def find_page_object_overrides( Note that this will explore the `module` and traverse its `submodules`. :param module: The module or package to search in - :param registry: Only return page objects overrides in this registry + :param registry_name: Only return page objects overrides in this registry :return: Return a list of :py:class:`web_poet.overrides.OverrideRule` metadata. 
""" page_objects: Dict[Callable, HandleUrlsSpec] = {} - for module in walk_modules(module): - handle_urls_dict = getattr(module, REGISTRY_MODULE_ANCHOR, {}) + for mod in walk_modules(module): + handle_urls_dict = getattr(mod, REGISTRY_MODULE_ANCHOR, {}) # A module could have multiple non-default PageObjectRegistry instances - registry = handle_urls_dict.get(registry) + registry = handle_urls_dict.get(registry_name) if not registry: continue - page_objects.update(registry.get_data_from_module(module)) + page_objects.update(registry.get_data_from_module(mod.__name__)) return [ OverrideRule( From 531752f0e415908949f7742a24fff276c850fdfd Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal Date: Tue, 21 Dec 2021 11:40:08 +0800 Subject: [PATCH 15/34] update tests to improve coverage --- tests/test_overrides.py | 17 +++++++++++++---- web_poet/__main__.py | 2 +- web_poet/overrides.py | 4 ++-- 3 files changed, 16 insertions(+), 7 deletions(-) diff --git a/tests/test_overrides.py b/tests/test_overrides.py index d8f3dc08..07d13375 100644 --- a/tests/test_overrides.py +++ b/tests/test_overrides.py @@ -4,8 +4,11 @@ from tests.po_lib import POTopLevel1, POTopLevel2, POTopLevelOverriden2 from tests.po_lib.a_module import POModule from tests.po_lib.nested_package import PONestedPkg -from tests.po_lib.nested_package.a_nested_module import PONestedModule, PONestedModuleOverridenSecondary -from web_poet.overrides import find_page_object_overrides +from tests.po_lib.nested_package.a_nested_module import ( + PONestedModule, + PONestedModuleOverridenSecondary, +) +from web_poet.overrides import find_page_object_overrides, PageObjectRegistry POS = {POTopLevel1, POTopLevel2, POModule, PONestedPkg, PONestedModule} @@ -65,5 +68,11 @@ def test_list_page_objects_from_non_existing_registry(): def test_cmd(): - from web_poet.__main__ import main - main(["tests.po_lib"]) + from web_poet.__main__ import main + + assert main(["tests.po_lib"]) is None + + +def test_registry_repr(): + registry = 
PageObjectRegistry(name="test") + assert "name='test'" in str(registry) diff --git a/web_poet/__main__.py b/web_poet/__main__.py index f8d0d5d5..fc5fe3cc 100644 --- a/web_poet/__main__.py +++ b/web_poet/__main__.py @@ -54,4 +54,4 @@ def main(args=None): if __name__ == "__main__": - main() + main() # pragma: no cover diff --git a/web_poet/overrides.py b/web_poet/overrides.py index a8119b77..eb30f7a3 100644 --- a/web_poet/overrides.py +++ b/web_poet/overrides.py @@ -211,8 +211,8 @@ def walk_modules(module: str) -> Iterable: It returns the provided module as well. """ - def onerror(_): - raise + def onerror(err): + raise err # pragma: no cover spec = importlib.util.find_spec(module) if not spec: From 7495b5843be09f8f58c81b46e430d302e822172d Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal Date: Tue, 21 Dec 2021 13:54:38 +0800 Subject: [PATCH 16/34] add missing import for find_page_object_overrides --- web_poet/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/web_poet/__init__.py b/web_poet/__init__.py index 586a65c6..ee3462bd 100644 --- a/web_poet/__init__.py +++ b/web_poet/__init__.py @@ -1,3 +1,3 @@ from .pages import WebPage, ItemPage, ItemWebPage, Injectable from .page_inputs import ResponseData -from .overrides import handle_urls, PageObjectRegistry +from .overrides import handle_urls, find_page_object_overrides, PageObjectRegistry From 0a0ee12f5106f61047ee563768d8a35359324052 Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal Date: Tue, 21 Dec 2021 14:12:29 +0800 Subject: [PATCH 17/34] add docs for overrides --- docs/api_reference.rst | 6 +- docs/index.rst | 1 + docs/intro/overrides.rst | 122 +++++++++++++++++++++++++++++++++++++++ docs/intro/tutorial.rst | 2 +- 4 files changed, 129 insertions(+), 2 deletions(-) create mode 100644 docs/intro/overrides.rst diff --git a/docs/api_reference.rst b/docs/api_reference.rst index 235979c6..e4d06484 100644 --- a/docs/api_reference.rst +++ b/docs/api_reference.rst @@ -1,3 +1,5 @@ +.. 
_`api-reference`: + ============= API Reference ============= @@ -47,6 +49,8 @@ Mixins :no-special-members: +.. _`api-overrides`: + Overrides ========= @@ -54,4 +58,4 @@ Overrides .. automodule:: web_poet.overrides :members: - :exclude-members: handle_urls \ No newline at end of file + :exclude-members: handle_urls diff --git a/docs/index.rst b/docs/index.rst index d6d2e269..db4d852d 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -33,6 +33,7 @@ and the motivation behind ``web-poet``, start with :ref:`from-ground-up`. intro/tutorial intro/from-ground-up + intro/overrides .. toctree:: :caption: Reference diff --git a/docs/intro/overrides.rst b/docs/intro/overrides.rst new file mode 100644 index 00000000..9e264a52 --- /dev/null +++ b/docs/intro/overrides.rst @@ -0,0 +1,122 @@ +.. _`intro-overrides`: + +Overrides +========= + +Overrides contains mapping rules to associate which URLs a particular +Page Object would be used. The URL matching rules is handled by another library +called `url-matcher `_. + +Using such matching rules establishes the core concept of Overrides wherein +its able to use specific Page Objects in lieu of the original one. + +This enables ``web-poet`` to be used effectively by other frameworks like +`scrapy-poet `_. + +Example Use Case +---------------- + +Let's explore an example use case for the Overrides concept. + +Suppose we're using Page Objects for our broadcrawl project which explores +eCommerce websites to discover product pages. It wouldn't be entirely possible +for us to create parsers for all websites since we don't know which sites we're +going to crawl beforehand. + +However, we could at least create a generic Page Object to support parsing of +some fields in well-known locations of product information like ````. +This enables our broadcrawler to at least parse some useful information. Let's +call such Page Object to be ``GenericProductPage``. 
+ +Assuming that one of our project requirements is to fully support parsing of the +`top 3 eCommerce websites`, then we'd need to create a Page Object for each one +to parse more specific fields. + +Here's where the Overrides concept comes in: + + 1. The ``GenericProductPage`` is used to parse all eCommerce product pages + `by default`. + 2. Whenever one of our declared URL rules matches with a given page URL, + then the Page Object associated with that rule `overrides (or replaces)` + the default ``GenericProductPage``. + +This enables us to fine tune our parsing logic `(which are abstracted away for +each Page Object)` depending on the page we're parsing. + +Let's see this in action by creating Page Objects below. + + +Creating Overrides +------------------ + +Let's take a look at how the following code is structured: + +.. code-block:: python + + from web_poet import handle_urls + from web_poet.pages import ItemWebPage + + class GenericProductPage(ItemWebPage): + def to_item(self): + return {"product title": self.css("title::text").get()} + + @handle_urls("example.com", overrides=GenericProductPage) + class ExampleProductPage(ItemWebPage): + def to_item(self): + ... # more specific parsing + + @handle_urls("anotherexample.com", overrides=GenericProductPage, exclude="/digital-goods/") + class AnotherExampleProductPage(ItemWebPage): + def to_item(self): + ... # more specific parsing + + @handle_urls(["dualexample.com", "dualexample.net"], overrides=GenericProductPage) + class DualExampleProductPage(ItemWebPage): + def to_item(self): + ... # more specific parsing + +The code above declares that: + + - For sites that matches the ``example.com`` pattern, ``ExampleProductPage`` + would be used instead of ``GenericProductPage``. + - The same is true for ``YetAnotherExampleProductPage`` where it is used + instead of ``GenericProductPage`` for two URLs: ``dualexample.com`` and + ``dualexample.net``. 
+ - However, ``AnotherExampleProductPage`` is only used instead of ``GenericProductPage`` + when we're parsing pages from ``anotherexample.com`` which doesn't contain + ``/digital-goods/`` in its URL path. + +The override mechanism that ``web-poet`` offers could also still be further +customized. You can read some of the specific parameters and alternative ways +to organize the rules via the :ref:`Overrides API section <api-overrides>`. + + +Viewing all available Overrides +------------------------------- + +A convenience function is available discover and retrieve all rules from your +project. Make sure to check out :ref:`Overrides API section <api-overrides>` +to see the other functionalities of ``find_page_object_overrides``. + +.. code-block:: + + from web_poet import find_page_object_overrides + + rules = find_page_object_overrides("my_project.page_objects") + + print(len(rules)) # 3 + + print(rules[0]) # OverrideRule(for_patterns=Patterns(include=['example.com'], exclude=[], priority=500), use=<class 'my_project.page_objects.ExampleProductPage'>, instead_of=<class 'my_project.page_objects.GenericProductPage'>, meta={}) + + +A handy CLI tool is also available at your disposal to quickly see the available +Override rules in a given module in your project. For example, invoking something +like ``web_poet my_project.page_objects`` would produce the following: + +.. 
code-block:: + + Use this instead of for the URL patterns except for the patterns with priority meta + ---------------------------------------------------- ------------------------------------------ -------------------------------------- ------------------------- --------------- ------ + my_project.page_objects.ExampleProductPage my_project.page_objects.GenericProductPage ['example.com'] [] 500 {} + my_project.page_objects.AnotherExampleProductPage my_project.page_objects.GenericProductPage ['anotherexample.com'] ['/digital-goods/'] 500 {} + my_project.page_objects.DualExampleProductPage my_project.page_objects.GenericProductPage ['dualexample.com', 'dualexample.net'] [] 500 {} diff --git a/docs/intro/tutorial.rst b/docs/intro/tutorial.rst index 5afbbd6c..9b8f7c01 100644 --- a/docs/intro/tutorial.rst +++ b/docs/intro/tutorial.rst @@ -131,4 +131,4 @@ As you can see, it's possible to use web-poet with built-in libraries such as `scrapy-poet <https://scrapy-poet.readthedocs.io>`_. If you want to understand the idea behind web-poet better, -check the :ref:`from-ground-up` tutorial. \ No newline at end of file +check the :ref:`from-ground-up` tutorial. 
From 46d40e7cd5c989e80803cf220f8016581ed5059c Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal <kevinoxy@gmail.com> Date: Wed, 22 Dec 2021 21:46:24 +0800 Subject: [PATCH 18/34] refactor by removing the need for find_page_object_overrides() --- CHANGELOG.rst | 5 +- docs/intro/overrides.rst | 11 ++- tests/po_lib/__init__.py | 3 +- tests/po_sub_lib/__init__.py | 25 ++++++ tests/test_overrides.py | 48 +++++++---- web_poet/__init__.py | 2 +- web_poet/__main__.py | 14 +-- web_poet/overrides.py | 159 +++++++++++------------------------ 8 files changed, 122 insertions(+), 145 deletions(-) create mode 100644 tests/po_sub_lib/__init__.py diff --git a/CHANGELOG.rst b/CHANGELOG.rst index cbe4c204..f5c28b1c 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -5,8 +5,9 @@ Changelog TBR ------------------ -* ``handle_urls`` decorator and ``find_page_object_overrides`` function added. -* new CLI tool for displaying all available Page Objects: ``web_poet <path>`` +* added a ``PageObjectRegistry`` class which has the ``handle_urls`` decorator + to write override rules. +* new CLI tool for displaying all available Page Objects: ``web_poet <module>`` * removed support for Python 3.6 * added support for Python 3.10 diff --git a/docs/intro/overrides.rst b/docs/intro/overrides.rst index 9e264a52..7fc7a24e 100644 --- a/docs/intro/overrides.rst +++ b/docs/intro/overrides.rst @@ -96,16 +96,19 @@ Viewing all available Overrides A convenience function is available discover and retrieve all rules from your project. Make sure to check out :ref:`Overrides API section <api-overrides>` -to see the other functionalities of ``find_page_object_overrides``. +to see the other functionalities. .. 
code-block:: - from web_poet import find_page_object_overrides + from web_poet import default_registry - rules = find_page_object_overrides("my_project.page_objects") + # Retrieves all rules that were registered in the registry + rules = default_registry.get_overrides() - print(len(rules)) # 3 + # Or, we could also filter out the rules by the module they were defined in + rules = default_registry.get_overrides_from_module("my_project.page_objects") + print(len(rules)) # 3 print(rules[0]) # OverrideRule(for_patterns=Patterns(include=['example.com'], exclude=[], priority=500), use=<class 'my_project.page_objects.ExampleProductPage'>, instead_of=<class 'my_project.page_objects.GenericProductPage'>, meta={}) diff --git a/tests/po_lib/__init__.py b/tests/po_lib/__init__.py index 03f426e3..2925a668 100644 --- a/tests/po_lib/__init__.py +++ b/tests/po_lib/__init__.py @@ -5,6 +5,7 @@ from url_matcher import Patterns +from .. import po_sub_lib # NOTE: this module contains a PO with @handle_rules from web_poet import handle_urls, PageObjectRegistry @@ -22,7 +23,7 @@ class POTopLevelOverriden2: ... -secondary_registry = PageObjectRegistry(name="secondary") +secondary_registry = PageObjectRegistry() # This first annotation is ignored. A single annotation per registry is allowed diff --git a/tests/po_sub_lib/__init__.py b/tests/po_sub_lib/__init__.py new file mode 100644 index 00000000..e74ffab3 --- /dev/null +++ b/tests/po_sub_lib/__init__.py @@ -0,0 +1,25 @@ +"""This package is being used by tests/po_lib to validate some behaviors on +external depedencies. +""" +from typing import Dict, Any, Callable + +from url_matcher import Patterns + +from web_poet import handle_urls + + +class POBase: + expected_overrides: Callable + expected_patterns: Patterns + expected_meta: Dict[str, Any] + + +class POSubLibOverriden: + ... 
+ + +@handle_urls("sub_example.com", POSubLibOverriden) +class POSubLib(POBase): + expected_overrides = POSubLibOverriden + expected_patterns = Patterns(["sub_example.com"]) + expected_meta = {} # type: ignore diff --git a/tests/test_overrides.py b/tests/test_overrides.py index 07d13375..59cb3139 100644 --- a/tests/test_overrides.py +++ b/tests/test_overrides.py @@ -1,24 +1,45 @@ import pytest from url_matcher import Patterns -from tests.po_lib import POTopLevel1, POTopLevel2, POTopLevelOverriden2 +from tests.po_sub_lib import POSubLib +from tests.po_lib import POTopLevel1, POTopLevel2, POTopLevelOverriden2, secondary_registry from tests.po_lib.a_module import POModule from tests.po_lib.nested_package import PONestedPkg from tests.po_lib.nested_package.a_nested_module import ( PONestedModule, PONestedModuleOverridenSecondary, ) -from web_poet.overrides import find_page_object_overrides, PageObjectRegistry +from web_poet.overrides import PageObjectRegistry, default_registry POS = {POTopLevel1, POTopLevel2, POModule, PONestedPkg, PONestedModule} +def test_list_page_objects_all(): + rules = default_registry.get_overrides() + + page_objects = {po.use for po in rules} + + # Ensure that ALL Override Rules are returned as long as the given + # registry's @handle_urls annotation was used. 
+ assert page_objects == POS.union({POSubLib}) + for rule in rules: + assert rule.instead_of == rule.use.expected_overrides, rule.use + assert rule.for_patterns == rule.use.expected_patterns, rule.use + assert rule.meta == rule.use.expected_meta, rule.use + + def test_list_page_objects_from_pkg(): """Tests that metadata is extracted properly from the po_lib package""" - rules = find_page_object_overrides("tests.po_lib") - assert {po.use for po in rules} == POS + rules = default_registry.get_overrides_from_module("tests.po_lib") + page_objects = {po.use for po in rules} + + # Ensure that the "tests.po_lib", which imports another module named + # "tests.po_sub_lib" which contains @handle_urls decorators, does not + # retrieve the override rules from the external package. + assert POSubLib not in page_objects + assert page_objects == POS for rule in rules: assert rule.instead_of == rule.use.expected_overrides, rule.use assert rule.for_patterns == rule.use.expected_patterns, rule.use @@ -26,7 +47,7 @@ def test_list_page_objects_from_pkg(): def test_list_page_objects_from_module(): - rules = find_page_object_overrides("tests.po_lib.a_module") + rules = default_registry.get_overrides_from_module("tests.po_lib.a_module") assert len(rules) == 1 rule = rules[0] assert rule.use == POModule @@ -35,22 +56,22 @@ def test_list_page_objects_from_module(): def test_list_page_objects_from_empty_module(): - rules = find_page_object_overrides("tests.po_lib.an_empty_module") + rules = default_registry.get_overrides_from_module("tests.po_lib.an_empty_module") assert len(rules) == 0 def test_list_page_objects_from_empty_pkg(): - rules = find_page_object_overrides("tests.po_lib.an_empty_package") + rules = default_registry.get_overrides_from_module("tests.po_lib.an_empty_package") assert len(rules) == 0 def test_list_page_objects_from_unknown_module(): with pytest.raises(ImportError): - find_page_object_overrides("tests.po_lib.unknown_module") + 
default_registry.get_overrides_from_module("tests.po_lib.unknown_module") def test_list_page_objects_from_imported_registry(): - rules = find_page_object_overrides("tests.po_lib", registry_name="secondary") + rules = secondary_registry.get_overrides_from_module("tests.po_lib") assert len(rules) == 2 rule_for = {po.use: po for po in rules} @@ -63,16 +84,7 @@ def test_list_page_objects_from_imported_registry(): assert pones.instead_of == PONestedModuleOverridenSecondary -def test_list_page_objects_from_non_existing_registry(): - assert find_page_object_overrides("tests.po_lib", registry_name="not-exist") == [] - - def test_cmd(): from web_poet.__main__ import main assert main(["tests.po_lib"]) is None - - -def test_registry_repr(): - registry = PageObjectRegistry(name="test") - assert "name='test'" in str(registry) diff --git a/web_poet/__init__.py b/web_poet/__init__.py index ee3462bd..f4366d05 100644 --- a/web_poet/__init__.py +++ b/web_poet/__init__.py @@ -1,3 +1,3 @@ from .pages import WebPage, ItemPage, ItemWebPage, Injectable from .page_inputs import ResponseData -from .overrides import handle_urls, find_page_object_overrides, PageObjectRegistry +from .overrides import handle_urls, PageObjectRegistry, default_registry diff --git a/web_poet/__main__.py b/web_poet/__main__.py index fc5fe3cc..16b231b8 100644 --- a/web_poet/__main__.py +++ b/web_poet/__main__.py @@ -1,9 +1,11 @@ +"""Returns all Override Rules from the default registry.""" + import argparse from typing import Callable import tabulate -from web_poet.overrides import find_page_object_overrides +from web_poet import default_registry def qualified_name(cls: Callable) -> str: @@ -20,14 +22,6 @@ def main(args=None): type=str, help="A package or module to list overrides from", ) - parser.add_argument( - "--registry", - "-n", - metavar="REGISTRY_NAME", - type=str, - help="Registry name to list overrides from", - default="default", - ) args = parser.parse_args(args) table = [ ( @@ -48,7 +42,7 @@ def 
main(args=None): rule.for_patterns.priority, rule.meta, ) - for rule in find_page_object_overrides(args.module, registry_name=args.registry) + for rule in default_registry.get_overrides_from_module(args.module) ] print(tabulate.tabulate(table, headers="firstrow")) diff --git a/web_poet/overrides.py b/web_poet/overrides.py index eb30f7a3..405670af 100644 --- a/web_poet/overrides.py +++ b/web_poet/overrides.py @@ -2,29 +2,16 @@ import importlib.util import warnings import pkgutil -import sys from dataclasses import dataclass, field from typing import Iterable, Union, List, Callable, Dict, Any from url_matcher import Patterns -# Used by ``PageObjectRegistry`` to declare itself in a module so that it's -# easily discovered by ``find_page_object_overrides()`` later on. -REGISTRY_MODULE_ANCHOR = "_registry_module_anchor_" - - -@dataclass(frozen=True) -class HandleUrlsSpec: - """Meta information used by the :py:func:`web_poet.handle_urls` decorator""" - - patterns: Patterns - overrides: Callable - meta: Dict[str, Any] = field(default_factory=dict) - @dataclass(frozen=True) class OverrideRule: - """A single override rule. Specify when a page object should be used instead of another""" + """A single override rule that specifies when a page object should be used + instead of another.""" for_patterns: Patterns use: Callable @@ -59,8 +46,8 @@ class PageObjectRegistry: from web_poet import PageObjectRegistry - main_registry = PageObjectRegistry(name="main") - secondary_registry = PageObjectRegistry(name="secondary") + main_registry = PageObjectRegistry() + secondary_registry = PageObjectRegistry() @main_registry.handle_urls("example.com", overrides=ProductPageObject) @secondary_registry.handle_urls("example.com", overrides=ProductPageObject) @@ -77,55 +64,38 @@ class ExampleComProductPage(ItemPage): .. 
code-block:: python - from web_poet import find_page_object_overrides - - po_path = "my_scrapy_project.page_objects" + rules_main = main_registry.get_overrides() + rules_secondary = main_registry.get_overrides() - rules_main = find_page_object_overrides(po_path, registry_name="main") - rules_secondary = find_page_object_overrides(po_path, registry_name="secondary") - - However, ``web-poet`` already contains a default Registry named ``"default"``. - It can be directly accessed via: + On the other hand, ``web-poet`` already provides a default Registry named + ``default_registry`` for convenience. It can be directly accessed via: .. code-block:: python - from web_poet import handle_urls, find_page_object_overrides + from web_poet import handle_urls, default_registry @handle_urls("example.com", overrides=ProductPageObject) class ExampleComProductPage(ItemPage): ... - # The `registry` is already set to 'default' - find_page_object_overrides("my_scrapy_project.page_objects") - - Notice that there was no need to directly use the ``PageObjectRegistry`` as - the convenience functions would suffice. In addition, if you need to organize - your Page Objects in your Scrapy project, a single (1) instance of the - ``PageObjectRegistry`` would work, as long as you organize your files - into modules. The rules could then be accessed like: + override_rules = default_registry.get_overrides() - * ``find_page_object_overrides("my_scrapy_project.page_objects.site_A")`` - * ``find_page_object_overrides("my_scrapy_project.page_objects.site_B")`` - """ + Notice that the ``handle_urls`` that we've imported is a part of + ``default_registry``. This provides a shorter and quicker way to interact + with the built-in default Registry. 
- def __init__(self, name: str = ""): - self.name = name - self.data: Dict[Callable, HandleUrlsSpec] = {} + In addition, if you need to organize your Page Objects in your project, a + single (1) default instance of the ``PageObjectRegistry`` would work, as + long as you organize your files into modules. - def _declare_registry_in_module(self, cls): - """This allows the Registry to be easily discovered later on by - ``find_page_object_overrides()`` by explicitly declaring its presence - on the given module. - """ + The rules could then be accessed using this method: - module = sys.modules[cls.__module__] - if not hasattr(module, REGISTRY_MODULE_ANCHOR): - registries = {self.name: self} - else: - registries = getattr(module, REGISTRY_MODULE_ANCHOR) - registries[self.name] = self + * ``default_registry.get_overrides_from_module("my_scrapy_project.page_objects.site_A")`` + * ``default_registry.get_overrides_from_module("my_scrapy_project.page_objects.site_B")`` + """ - setattr(module, REGISTRY_MODULE_ANCHOR, registries) + def __init__(self): + self.data: Dict[Callable, OverrideRule] = {} def handle_urls( self, @@ -157,20 +127,19 @@ def handle_urls( """ def wrapper(cls): - self._declare_registry_in_module(cls) - - spec = HandleUrlsSpec( - patterns=Patterns( + rule = OverrideRule( + for_patterns=Patterns( include=_as_list(include), exclude=_as_list(exclude), priority=priority, ), - overrides=overrides, + use=cls, + instead_of=overrides, meta=kwargs, ) # If it was already defined, we don't want to override it if cls not in self.data: - self.data[cls] = spec + self.data[cls] = rule else: warnings.warn( f"Multiple @handle_urls annotations with the same 'overrides' " @@ -182,33 +151,44 @@ def wrapper(cls): return wrapper - def get_data_from_module(self, module: str) -> Dict[Callable, HandleUrlsSpec]: - """Returns the override mappings that were declared using ``handle_urls`` + def get_overrides(self) -> List[OverrideRule]: + """Returns all override rules that were declared 
using ``@handle_urls``.""" + return list(self.data.values()) + + def get_overrides_from_module(self, module: str) -> List[OverrideRule]: + """Returns the override rules that were declared using ``@handle_urls`` in a specific module. This is useful if you've organized your Page Objects into multiple - submodules in your project. + submodules in your project as you can filter them easily. """ + rules: Dict[Callable, OverrideRule] = {} + + for mod in walk_modules(module): + # Dict ensures that no duplicates are collected and returned. + rules.update(self._filter_from_module(mod.__name__)) + + return list(rules.values()) + + def _filter_from_module(self, module: str) -> Dict[Callable, OverrideRule]: return { - cls: spec - for cls, spec in self.data.items() + cls: rule + for cls, rule in self.data.items() if cls.__module__.startswith(module) } - def __repr__(self) -> str: - return f"PageObjectRegistry(name='{self.name}')" - # For ease of use, we'll create a default registry so that users can simply # use its `handles_url()` method directly by `from web_poet import handles_url` -default_registry = PageObjectRegistry(name="default") +default_registry = PageObjectRegistry() handle_urls = default_registry.handle_urls def walk_modules(module: str) -> Iterable: - """ - Return all modules from a module recursively. Note that this will import all the modules and submodules. - It returns the provided module as well. + """Return all modules from a module recursively. + + Note that this will import all the modules and submodules. It returns the + provided module as well. """ def onerror(err): @@ -225,42 +205,3 @@ def onerror(err): ): mod = importlib.import_module(info.name) yield mod - - -def find_page_object_overrides( - module: str, registry_name: str = "default" -) -> List[OverrideRule]: - """ - Find all the Page Objects overrides in the given module/package and its - submodules. 
- - The Page Objects that have been decorated with the ``handle_urls`` decorator - from the specified Registry ``name`` will be returned. - - Note that this will explore the `module` and traverse its `submodules`. - - :param module: The module or package to search in - :param registry_name: Only return page objects overrides in this registry - :return: Return a list of :py:class:`web_poet.overrides.OverrideRule` metadata. - """ - - page_objects: Dict[Callable, HandleUrlsSpec] = {} - for mod in walk_modules(module): - handle_urls_dict = getattr(mod, REGISTRY_MODULE_ANCHOR, {}) - - # A module could have multiple non-default PageObjectRegistry instances - registry = handle_urls_dict.get(registry_name) - if not registry: - continue - - page_objects.update(registry.get_data_from_module(mod.__name__)) - - return [ - OverrideRule( - for_patterns=spec.patterns, - use=po, - instead_of=spec.overrides, - meta=spec.meta, - ) - for po, spec in page_objects.items() - ] From 495642bcbcf86a2a17613bfe3068a19a97006cbc Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal <kevinoxy@gmail.com> Date: Thu, 23 Dec 2021 20:42:33 +0800 Subject: [PATCH 19/34] add docs about using multiple PageObjectRegistries --- docs/intro/overrides.rst | 44 +++++++++++++++++++++++++++++++++++++++- 1 file changed, 43 insertions(+), 1 deletion(-) diff --git a/docs/intro/overrides.rst b/docs/intro/overrides.rst index 7fc7a24e..7e013dcc 100644 --- a/docs/intro/overrides.rst +++ b/docs/intro/overrides.rst @@ -86,10 +86,45 @@ The code above declares that: when we're parsing pages from ``anotherexample.com`` which doesn't contain ``/digital-goods/`` in its URL path. -The override mechanism that ``web-poet`` offers could also still be further +The override mechanism that ``web-poet`` offers could still be further customized. You can read some of the specific parameters and alternative ways to organize the rules via the :ref:`Overrides API section <api-overrides>`. 
+To demonstrate another alternative way to declare the Override rules, see the +code example below: + +.. code-block:: python + + from web_poet.pages import ItemWebPage + from web_poet import PageObjectRegistry + + primary_registry = PageObjectRegistry() + secondary_registry = PageObjectRegistry() + + class GenericProductPage(ItemWebPage): + def to_item(self): + return {"product title": self.css("title::text").get()} + + @primary_registry.handle_urls("example.com", overrides=GenericProductPage) + class ExampleProductPage(ItemWebPage): + def to_item(self): + ... # more specific parsing + + @secondary_registry.handle_urls("anotherexample.com", overrides=GenericProductPage, exclude="/digital-goods/") + class AnotherExampleProductPage(ItemWebPage): + def to_item(self): + ... # more specific parsing + + @primary_registry.handle_urls(["dualexample.com", "dualexample.net"], overrides=GenericProductPage) + @secondary_registry.handle_urls(["dualexample.com", "dualexample.net"], overrides=GenericProductPage) + class DualExampleProductPage(ItemWebPage): + def to_item(self): + +If you need more control over the Registry, you could instantiate your very +own :class:`~.PageObjectRegistry` and use its ``@handle_urls`` to annotate and +register the rules. This might benefit you in certain project use cases where you +need more organizational control over your rules. + Viewing all available Overrides ------------------------------- @@ -111,6 +146,13 @@ to see the other functionalities. print(len(rules)) # 3 print(rules[0]) # OverrideRule(for_patterns=Patterns(include=['example.com'], exclude=[], priority=500), use=<class 'my_project.page_objects.ExampleProductPage'>, instead_of=<class 'my_project.page_objects.GenericProductPage'>, meta={}) +.. note:: + + Notice in the code sample above where we could filter out the Override rules + per module via :meth:`~.PageObjectRegistry.get_overrides_from_module`. 
This + could also offer another alternative way to organize your Page Object rules + using only the ``default_registry``. There's no need to declare multiple + :class:`~.PageObjectRegistry` instances and use multiple annotations. A handy CLI tool is also available at your disposal to quickly see the available Override rules in a given module in your project. For example, invoking something From 75593edb29378299093dc80ae97c9e3fa483ee13 Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal <kevinoxy@gmail.com> Date: Tue, 4 Jan 2022 14:54:59 +0800 Subject: [PATCH 20/34] add docs regarding organizing Page Object Overrides --- docs/intro/overrides.rst | 226 ++++++++++++++++++++++++++++++++++++++- 1 file changed, 223 insertions(+), 3 deletions(-) diff --git a/docs/intro/overrides.rst b/docs/intro/overrides.rst index 7e013dcc..613080fe 100644 --- a/docs/intro/overrides.rst +++ b/docs/intro/overrides.rst @@ -125,7 +125,6 @@ own :class:`~.PageObjectRegistry` and use its ``@handle_urls`` to annotate and register the rules. This might benefit you in certain project use cases where you need more organizational control over your rules. - Viewing all available Overrides ------------------------------- @@ -133,12 +132,12 @@ A convenience function is available discover and retrieve all rules from your project. Make sure to check out :ref:`Overrides API section <api-overrides>` to see the other functionalities. -.. code-block:: +.. 
code-block:: python from web_poet import default_registry # Retrieves all rules that were registered in the registry - rules = default_registry.get_overrides() + rules = default_registry.get_overrides() # Or, we could also filter out the rules by the module they were defined in rules = default_registry.get_overrides_from_module("my_project.page_objects") @@ -165,3 +164,224 @@ like ``web_poet my_project.page_objects`` would produce the following: my_project.page_objects.ExampleProductPage my_project.page_objects.GenericProductPage ['example.com'] [] 500 {} my_project.page_objects.AnotherExampleProductPage my_project.page_objects.GenericProductPage ['anotherexample.com'] ['/digital-goods/'] 500 {} my_project.page_objects.DualExampleProductPage my_project.page_objects.GenericProductPage ['dualexample.com', 'dualexample.net'] [] 500 {} + +Organizing Page Object Overrides +-------------------------------- + +After tackling the two (2) different approaches from the previous chapters on how +to declare overrides, we can now explore how to organize them in our projects. +Although it's mostly up to the developer which override declaration method to +use. Yet, we'll present some approaches depending on the situation. + +To put this thought into action, let's suppose we are tasked to create a Page +Object Project with overrides for eCommerce websites. + +Package-based Approach +~~~~~~~~~~~~~~~~~~~~~~ + +Using the **package-based** approach, we might organize them into something like: + +.. code-block:: + + my_page_obj_project + ├── cool_gadget_site + | ├── us + | | ├── __init__.py + | | ├── products.py + | | └── product_listings.py + | ├── fr + | | ├── __init__.py + | | ├── products.py + | | └── product_listings.py + | └── __init__.py + └── furniture_shop + ├── __init__.py + ├── products.py + └── product_listings.py + +Assuming that we've declared the Page Objects in each of the modules to use the +``default_registry`` like: + +.. 
code-block:: python + + # my_page_obj_project/cool_gadget_site/us/products.py + + from web_poet import handle_urls # remember that this uses the default_registry + from web_poet.pages import ItemWebPage + + @handle_urls("coolgadgetsite.com", overrides=GenericProductPage) + class CoolGadgetUsSiteProductPage(ItemWebPage): + def to_item(self): + ... # parsers here + +Then we could easily retrieve all Page Objects per subpackage or module like this: + +.. code-block:: python + + from web_poet import default_registry + + # We can do it per website. + rules = default_registry.get_overrides_from_module("my_page_obj_project.cool_gadget_site") + rules = default_registry.get_overrides_from_module("my_page_obj_project.furniture_site") + + # It can also drill down to the country domains on a given site. + rules = default_registry.get_overrides_from_module("my_page_obj_project.cool_gadget_site.us") + rules = default_registry.get_overrides_from_module("my_page_obj_project.cool_gadget_site.fr") + + # or even drill down further to the specific module. + rules = default_registry.get_overrides_from_module("my_page_obj_project.cool_gadget_site.us.products") + rules = default_registry.get_overrides_from_module("my_page_obj_project.cool_gadget_site.us.product_listings") + + # Or simply all of Override rules ever declared. + rules = default_registry.get_overrides() + +Multiple Registry Approach +~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The **package-based** approach heavily relies on how the developer organizes the +files into intuitive hierarchies depending on the nature of the project. There +might be cases that for some reason, a developer would want to use a **flat +hierarchy** like this: + +.. 
code-block:: + + my_page_obj_project + ├── __init__.py + ├── cool_gadget_site_us_products.py + ├── cool_gadget_site_us_product_listings.py + ├── cool_gadget_site_fr_products.py + ├── cool_gadget_site_fr_product_listings.py + ├── furniture_shop_products.py + └── furniture_shop_product_listings.py + +As such, calling ``default_registry.get_overrides_from_module()`` would not work +on projects with a **flat hierarchy**. Thus, we can organize them using our own +instances of the :class:`~.PageObjectRegistry` instead: + +.. code-block:: python + + # my_page_obj_project/__init__.py + + from web_poet import PageObjectRegistry + + cool_gadget_registry = PageObjectRegistry() + cool_gadget_us_registry = PageObjectRegistry() + cool_gadget_fr_registry = PageObjectRegistry() + furniture_shop_registry = PageObjectRegistry() + +After declaring the :class:`~.PageObjectRegistry` instances, they can be imported +in each of the Page Object packages like so: + +.. code-block:: python + + # my_page_obj_project/cool_gadget_site_us_products.py + + from . import cool_gadget_registry, cool_gadget_us_registry + from web_poet.pages import ItemWebPage + + @cool_gadget_registry.handle_urls("coolgadgetsite.com", overrides=GenericProductPage) + @cool_gadget_us_registry.handle_urls("coolgadgetsite.com", overrides=GenericProductPage) + class CoolGadgetSiteProductPage(ItemWebPage): + def to_item(self): + ... # parsers here + +Retrieving the rules would simply be: + +.. code-block:: python + + from my_page_obj_project import ( + cool_gadget_registry, + cool_gadget_us_registry, + cool_gadget_fr_registry, + furniture_shop_registry, + ) + + rules = cool_gadget_registry.get_overrides() + rules = cool_gadget_us_registry.get_overrides() + rules = cool_gadget_fr_registry.get_overrides() + rules = furniture_shop_registry.get_overrides() + +Developers can create as many :class:`~.PageObjectRegistry` instances as they want +in order to satisfy their organization and classification needs. 
+ +Mixed Approach +~~~~~~~~~~~~~~ + +Developers are free to choose whichever approach would best fit their particular +use case. They can even mix both approaches together to handle some particular +cases. + +For instance, going back to our **package-based** approach organized as: + +.. code-block:: + + my_page_obj_project + ├── cool_gadget_site + | ├── us + | | ├── __init__.py + | | ├── products.py + | | └── product_listings.py + | ├── fr + | | ├── __init__.py + | | ├── products.py + | | └── product_listings.py + | └── __init__.py + └── furniture_shop + ├── __init__.py + ├── products.py + └── product_listings.py + +Suppose we'd want to get all the rules for all of the listings, then one way to +retrieve such rules would be: + +.. code-block:: python + + from web_poet import default_registry + + product_listing_rules = [ + default_registry.get_overrides_from_module("my_page_obj_project.cool_gadget_site.us.product_listings") + + default_registry.get_overrides_from_module("my_page_obj_project.cool_gadget_site.fr.product_listings") + + default_registry.get_overrides_from_module("my_page_obj_project.furniture_shop.product_listings") + ] + +On the other hand, we can also create another :class:`~.PageObjectRegistry` instance +that we'll be using aside from the ``default_registry`` to help us better organize +our Override Rules. + +.. code-block:: python + + # my_page_obj_project/__init__.py + + from web_poet import PageObjectRegistry + + product_listings_registry = PageObjectRegistry() + +Using the additional registry instance above, we'll use it to provide another +annotation for the Page Objects in each of the ``product_listings.py`` modules. +For example: + +.. code-block:: python + + # my_page_obj_project/cool_gadget_site_us_product_listings.py + + from . 
import product_listings_registry + from web_poet import handle_urls # remember that this uses the default_registry + from web_poet.pages import ItemWebPage + + @product_listings_registry.handle_urls("coolgadgetsite.com", overrides=GenericProductPage) + @handle_urls("coolgadgetsite.com", overrides=GenericProductPage) + class CoolGadgetSiteProductPage(ItemWebPage): + def to_item(self): + ... # parsers here + +Retrieving all of the Product Listing Override rules would simply be: + +.. code-block:: python + + from my_page_obj_project import product_listings_registry + + # Getting all of the override rules for product listings. + rules = product_listings_registry.get_overrides() + + # We can also filter it down further on a per site basis if needed. + rules = product_listings_registry.get_overrides_from_module("my_page_obj_project.cool_gadget_site") From 0a2d7792af1ae079c5db001ca36e93b0d598dc31 Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal <kevinoxy@gmail.com> Date: Thu, 6 Jan 2022 11:28:54 +0800 Subject: [PATCH 21/34] update override docs to showcase url-matcher patterns --- docs/intro/overrides.rst | 38 +++++++++++++++++++++++--------------- 1 file changed, 23 insertions(+), 15 deletions(-) diff --git a/docs/intro/overrides.rst b/docs/intro/overrides.rst index 613080fe..aa915279 100644 --- a/docs/intro/overrides.rst +++ b/docs/intro/overrides.rst @@ -70,21 +70,28 @@ Let's take a look at how the following code is structured: def to_item(self): ... # more specific parsing - @handle_urls(["dualexample.com", "dualexample.net"], overrides=GenericProductPage) + @handle_urls(["dualexample.com/shop/?product=*", "dualexample.net/store/?pid=*"], overrides=GenericProductPage) class DualExampleProductPage(ItemWebPage): def to_item(self): ... 
# more specific parsing The code above declares that: - - For sites that matches the ``example.com`` pattern, ``ExampleProductPage`` + - For sites that match the ``example.com`` pattern, ``ExampleProductPage`` would be used instead of ``GenericProductPage``. - - The same is true for ``YetAnotherExampleProductPage`` where it is used - instead of ``GenericProductPage`` for two URLs: ``dualexample.com`` and - ``dualexample.net``. - - However, ``AnotherExampleProductPage`` is only used instead of ``GenericProductPage`` - when we're parsing pages from ``anotherexample.com`` which doesn't contain - ``/digital-goods/`` in its URL path. + - The same is true for ``DualExampleProductPage`` where it is used + instead of ``GenericProductPage`` for two URL patterns which works as: + + - **(match)** https://www.dualexample.com/shop/electronics/?product=123 + - **(match)** https://www.dualexample.com/shop/books/paperback/?product=849 + - (NO match) https://www.dualexample.com/on-sale/books/?product=923 + - **(match)** https://www.dualexample.net/store/kitchen/?pid=776 + - **(match)** https://www.dualexample.net/store/?pid=892 + - (NO match) https://www.dualexample.net/new-offers/fitness/?pid=892 + + - On the other hand, ``AnotherExampleProductPage`` is only used instead of + ``GenericProductPage`` when we're parsing pages from ``anotherexample.com`` + which doesn't contain ``/digital-goods/`` in its URL path. The override mechanism that ``web-poet`` offers could still be further customized. You can read some of the specific parameters and alternative ways @@ -115,10 +122,11 @@ code example below: def to_item(self): ... 
# more specific parsing - @primary_registry.handle_urls(["dualexample.com", "dualexample.net"], overrides=GenericProductPage) - @secondary_registry.handle_urls(["dualexample.com", "dualexample.net"], overrides=GenericProductPage) + @primary_registry.handle_urls(["dualexample.com/shop/?product=*", "dualexample.net/store/?pid=*"], overrides=GenericProductPage) + @secondary_registry.handle_urls(["dualexample.com/shop/?product=*", "dualexample.net/store/?pid=*"], overrides=GenericProductPage) class DualExampleProductPage(ItemWebPage): def to_item(self): + ... # more specific parsing If you need more control over the Registry, you could instantiate your very own :class:`~.PageObjectRegistry` and use its ``@handle_urls`` to annotate and @@ -159,11 +167,11 @@ like ``web_poet my_project.page_objects`` would produce the following: .. code-block:: - Use this instead of for the URL patterns except for the patterns with priority meta - ---------------------------------------------------- ------------------------------------------ -------------------------------------- ------------------------- --------------- ------ - my_project.page_objects.ExampleProductPage my_project.page_objects.GenericProductPage ['example.com'] [] 500 {} - my_project.page_objects.AnotherExampleProductPage my_project.page_objects.GenericProductPage ['anotherexample.com'] ['/digital-goods/'] 500 {} - my_project.page_objects.DualExampleProductPage my_project.page_objects.GenericProductPage ['dualexample.com', 'dualexample.net'] [] 500 {} + Use this instead of for the URL patterns except for the patterns with priority meta + ---------------------------------------------------- ------------------------------------------ -------------------------------------- ------------------------- --------------- ------ + my_project.page_objects.ExampleProductPage my_project.page_objects.GenericProductPage ['example.com'] [] 500 {} + my_project.page_objects.AnotherExampleProductPage 
my_project.page_objects.GenericProductPage ['anotherexample.com'] ['/digital-goods/'] 500 {} + my_project.page_objects.DualExampleProductPage my_project.page_objects.GenericProductPage ['dualexample.com/shop/?product=*', 'dualexample.net/store/?pid=*'] [] 500 {} Organizing Page Object Overrides -------------------------------- From c000cbc9d36f1c3022c22d27d67d83e4d86b41c7 Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal <kevinoxy@gmail.com> Date: Thu, 6 Jan 2022 11:48:08 +0800 Subject: [PATCH 22/34] rename get_overrides_from_module into get_overrides_from --- docs/intro/overrides.rst | 26 +++++++++++++------------- tests/test_overrides.py | 12 ++++++------ web_poet/__main__.py | 2 +- web_poet/overrides.py | 6 +++--- 4 files changed, 23 insertions(+), 23 deletions(-) diff --git a/docs/intro/overrides.rst b/docs/intro/overrides.rst index aa915279..0e86392f 100644 --- a/docs/intro/overrides.rst +++ b/docs/intro/overrides.rst @@ -148,7 +148,7 @@ to see the other functionalities. rules = default_registry.get_overrides() # Or, we could also filter out the rules by the module they were defined in - rules = default_registry.get_overrides_from_module("my_project.page_objects") + rules = default_registry.get_overrides_from("my_project.page_objects") print(len(rules)) # 3 print(rules[0]) # OverrideRule(for_patterns=Patterns(include=['example.com'], exclude=[], priority=500), use=<class 'my_project.page_objects.ExampleProductPage'>, instead_of=<class 'my_project.page_objects.GenericProductPage'>, meta={}) @@ -156,7 +156,7 @@ to see the other functionalities. .. note:: Notice in the code sample above where we could filter out the Override rules - per module via :meth:`~.PageObjectRegistry.get_overrides_from_module`. This + per module via :meth:`~.PageObjectRegistry.get_overrides_from`. This could also offer another alternative way to organize your Page Object rules using only the ``default_registry``. 
There's no need to declare multiple :class:`~.PageObjectRegistry` instances and use multiple annotations. @@ -229,16 +229,16 @@ Then we could easily retrieve all Page Objects per subpackage or module like thi from web_poet import default_registry # We can do it per website. - rules = default_registry.get_overrides_from_module("my_page_obj_project.cool_gadget_site") - rules = default_registry.get_overrides_from_module("my_page_obj_project.furniture_site") + rules = default_registry.get_overrides_from("my_page_obj_project.cool_gadget_site") + rules = default_registry.get_overrides_from("my_page_obj_project.furniture_site") # It can also drill down to the country domains on a given site. - rules = default_registry.get_overrides_from_module("my_page_obj_project.cool_gadget_site.us") - rules = default_registry.get_overrides_from_module("my_page_obj_project.cool_gadget_site.fr") + rules = default_registry.get_overrides_from("my_page_obj_project.cool_gadget_site.us") + rules = default_registry.get_overrides_from("my_page_obj_project.cool_gadget_site.fr") # or even drill down further to the specific module. - rules = default_registry.get_overrides_from_module("my_page_obj_project.cool_gadget_site.us.products") - rules = default_registry.get_overrides_from_module("my_page_obj_project.cool_gadget_site.us.product_listings") + rules = default_registry.get_overrides_from("my_page_obj_project.cool_gadget_site.us.products") + rules = default_registry.get_overrides_from("my_page_obj_project.cool_gadget_site.us.product_listings") # Or simply all of Override rules ever declared. rules = default_registry.get_overrides() @@ -262,7 +262,7 @@ hierarchy** like this: ├── furniture_shop_products.py └── furniture_shop_product_listings.py -As such, calling ``default_registry.get_overrides_from_module()`` would not work +As such, calling ``default_registry.get_overrides_from()`` would not work on projects with a **flat hierarchy**. 
Thus, we can organize them using our own instances of the :class:`~.PageObjectRegistry` instead: @@ -347,9 +347,9 @@ retrieve such rules would be: from web_poet import default_registry product_listing_rules = [ - default_registry.get_overrides_from_module("my_page_obj_project.cool_gadget_site.us.product_listings") - + default_registry.get_overrides_from_module("my_page_obj_project.cool_gadget_site.fr.product_listings") - + default_registry.get_overrides_from_module("my_page_obj_project.furniture_shop.product_listings") + default_registry.get_overrides_from("my_page_obj_project.cool_gadget_site.us.product_listings") + + default_registry.get_overrides_from("my_page_obj_project.cool_gadget_site.fr.product_listings") + + default_registry.get_overrides_from("my_page_obj_project.furniture_shop.product_listings") ] On the other hand, we can also create another :class:`~.PageObjectRegistry` instance @@ -392,4 +392,4 @@ Retrieving all of the Product Listing Override rules would simply be: rules = product_listings_registry.get_overrides() # We can also filter it down further on a per site basis if needed. 
- rules = product_listings_registry.get_overrides_from_module("my_page_obj_project.cool_gadget_site") + rules = product_listings_registry.get_overrides_from("my_page_obj_project.cool_gadget_site") diff --git a/tests/test_overrides.py b/tests/test_overrides.py index 59cb3139..3be33c01 100644 --- a/tests/test_overrides.py +++ b/tests/test_overrides.py @@ -31,7 +31,7 @@ def test_list_page_objects_all(): def test_list_page_objects_from_pkg(): """Tests that metadata is extracted properly from the po_lib package""" - rules = default_registry.get_overrides_from_module("tests.po_lib") + rules = default_registry.get_overrides_from("tests.po_lib") page_objects = {po.use for po in rules} # Ensure that the "tests.po_lib", which imports another module named @@ -47,7 +47,7 @@ def test_list_page_objects_from_pkg(): def test_list_page_objects_from_module(): - rules = default_registry.get_overrides_from_module("tests.po_lib.a_module") + rules = default_registry.get_overrides_from("tests.po_lib.a_module") assert len(rules) == 1 rule = rules[0] assert rule.use == POModule @@ -56,22 +56,22 @@ def test_list_page_objects_from_module(): def test_list_page_objects_from_empty_module(): - rules = default_registry.get_overrides_from_module("tests.po_lib.an_empty_module") + rules = default_registry.get_overrides_from("tests.po_lib.an_empty_module") assert len(rules) == 0 def test_list_page_objects_from_empty_pkg(): - rules = default_registry.get_overrides_from_module("tests.po_lib.an_empty_package") + rules = default_registry.get_overrides_from("tests.po_lib.an_empty_package") assert len(rules) == 0 def test_list_page_objects_from_unknown_module(): with pytest.raises(ImportError): - default_registry.get_overrides_from_module("tests.po_lib.unknown_module") + default_registry.get_overrides_from("tests.po_lib.unknown_module") def test_list_page_objects_from_imported_registry(): - rules = secondary_registry.get_overrides_from_module("tests.po_lib") + rules = 
secondary_registry.get_overrides_from("tests.po_lib") assert len(rules) == 2 rule_for = {po.use: po for po in rules} diff --git a/web_poet/__main__.py b/web_poet/__main__.py index 16b231b8..c3b7ff1b 100644 --- a/web_poet/__main__.py +++ b/web_poet/__main__.py @@ -42,7 +42,7 @@ def main(args=None): rule.for_patterns.priority, rule.meta, ) - for rule in default_registry.get_overrides_from_module(args.module) + for rule in default_registry.get_overrides_from(args.module) ] print(tabulate.tabulate(table, headers="firstrow")) diff --git a/web_poet/overrides.py b/web_poet/overrides.py index 405670af..ebbed976 100644 --- a/web_poet/overrides.py +++ b/web_poet/overrides.py @@ -90,8 +90,8 @@ class ExampleComProductPage(ItemPage): The rules could then be accessed using this method: - * ``default_registry.get_overrides_from_module("my_scrapy_project.page_objects.site_A")`` - * ``default_registry.get_overrides_from_module("my_scrapy_project.page_objects.site_B")`` + * ``default_registry.get_overrides_from("my_scrapy_project.page_objects.site_A")`` + * ``default_registry.get_overrides_from("my_scrapy_project.page_objects.site_B")`` """ def __init__(self): @@ -155,7 +155,7 @@ def get_overrides(self) -> List[OverrideRule]: """Returns all override rules that were declared using ``@handle_urls``.""" return list(self.data.values()) - def get_overrides_from_module(self, module: str) -> List[OverrideRule]: + def get_overrides_from(self, module: str) -> List[OverrideRule]: """Returns the override rules that were declared using ``@handle_urls`` in a specific module. 
From 10dff5b15f8d13947fb52307d0f139673f184c59 Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal <kevinoxy@gmail.com> Date: Thu, 6 Jan 2022 13:29:16 +0800 Subject: [PATCH 23/34] fix bug where module substring paths are not filtered out correctly --- tests/po_lib/__init__.py | 2 +- tests/{po_sub_lib => po_lib_sub}/__init__.py | 8 ++++---- tests/test_overrides.py | 8 ++++---- web_poet/overrides.py | 11 +++++++++-- 4 files changed, 18 insertions(+), 11 deletions(-) rename tests/{po_sub_lib => po_lib_sub}/__init__.py (75%) diff --git a/tests/po_lib/__init__.py b/tests/po_lib/__init__.py index 2925a668..3f0632d3 100644 --- a/tests/po_lib/__init__.py +++ b/tests/po_lib/__init__.py @@ -5,7 +5,7 @@ from url_matcher import Patterns -from .. import po_sub_lib # NOTE: this module contains a PO with @handle_rules +from .. import po_lib_sub # NOTE: this module contains a PO with @handle_rules from web_poet import handle_urls, PageObjectRegistry diff --git a/tests/po_sub_lib/__init__.py b/tests/po_lib_sub/__init__.py similarity index 75% rename from tests/po_sub_lib/__init__.py rename to tests/po_lib_sub/__init__.py index e74ffab3..33850f5c 100644 --- a/tests/po_sub_lib/__init__.py +++ b/tests/po_lib_sub/__init__.py @@ -14,12 +14,12 @@ class POBase: expected_meta: Dict[str, Any] -class POSubLibOverriden: +class POLibSubOverriden: ... 
-@handle_urls("sub_example.com", POSubLibOverriden) -class POSubLib(POBase): - expected_overrides = POSubLibOverriden +@handle_urls("sub_example.com", POLibSubOverriden) +class POLibSub(POBase): + expected_overrides = POLibSubOverriden expected_patterns = Patterns(["sub_example.com"]) expected_meta = {} # type: ignore diff --git a/tests/test_overrides.py b/tests/test_overrides.py index 3be33c01..833adb00 100644 --- a/tests/test_overrides.py +++ b/tests/test_overrides.py @@ -1,7 +1,7 @@ import pytest from url_matcher import Patterns -from tests.po_sub_lib import POSubLib +from tests.po_lib_sub import POLibSub from tests.po_lib import POTopLevel1, POTopLevel2, POTopLevelOverriden2, secondary_registry from tests.po_lib.a_module import POModule from tests.po_lib.nested_package import PONestedPkg @@ -22,7 +22,7 @@ def test_list_page_objects_all(): # Ensure that ALL Override Rules are returned as long as the given # registry's @handle_urls annotation was used. - assert page_objects == POS.union({POSubLib}) + assert page_objects == POS.union({POLibSub}) for rule in rules: assert rule.instead_of == rule.use.expected_overrides, rule.use assert rule.for_patterns == rule.use.expected_patterns, rule.use @@ -35,9 +35,9 @@ def test_list_page_objects_from_pkg(): page_objects = {po.use for po in rules} # Ensure that the "tests.po_lib", which imports another module named - # "tests.po_sub_lib" which contains @handle_urls decorators, does not + # "tests.po_lib_sub" which contains @handle_urls decorators, does not # retrieve the override rules from the external package. 
- assert POSubLib not in page_objects + assert POLibSub not in page_objects assert page_objects == POS for rule in rules: diff --git a/web_poet/overrides.py b/web_poet/overrides.py index ebbed976..92bd9fbc 100644 --- a/web_poet/overrides.py +++ b/web_poet/overrides.py @@ -174,12 +174,19 @@ def _filter_from_module(self, module: str) -> Dict[Callable, OverrideRule]: return { cls: rule for cls, rule in self.data.items() - if cls.__module__.startswith(module) + + # A "." is added at the end to prevent incorrect matching on cases + # where package names are substrings of one another. For example, + # if module = "my_project.po_lib", then it filters like so: + # - "my_project.po_lib_sub.POLibSub" (filtered out) + # - "my_project.po_lib.POTopLevel1" (accepted) + # - "my_project.po_lib.nested_package.PONestedPkg" (accepted) + if cls.__module__.startswith(module + ".") or cls.__module__ == module } # For ease of use, we'll create a default registry so that users can simply -# use its `handles_url()` method directly by `from web_poet import handles_url` +# use its `handle_urls()` method directly by `from web_poet import handle_urls` default_registry = PageObjectRegistry() handle_urls = default_registry.handle_urls From daa3ff9bdfa754bacfd8a20862f68acb12a3fa61 Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal <kevinoxy@gmail.com> Date: Thu, 6 Jan 2022 16:13:23 +0800 Subject: [PATCH 24/34] create consume_modules() to properly load annotations in get_overrides() --- docs/intro/overrides.rst | 36 +++++++++++++-- tests/test_overrides.py | 23 +++++++++- tests_extra/__init__.py | 5 +++ .../po_lib_sub_not_imported/__init__.py | 28 ++++++++++++ web_poet/__init__.py | 2 +- web_poet/overrides.py | 44 ++++++++++++++++++- 6 files changed, 131 insertions(+), 7 deletions(-) create mode 100644 tests_extra/__init__.py create mode 100644 tests_extra/po_lib_sub_not_imported/__init__.py diff --git a/docs/intro/overrides.rst b/docs/intro/overrides.rst index 0e86392f..72311813 100644 --- 
a/docs/intro/overrides.rst +++ b/docs/intro/overrides.rst @@ -161,6 +161,31 @@ to see the other functionalities. using only the ``default_registry``. There's no need to declare multiple :class:`~.PageObjectRegistry` instances and use multiple annotations. +.. warning:: + + :meth:`~.PageObjectRegistry.get_overrides` relies on the fact that all essential + packages/modules that contain the :meth:`~.PageObjectRegistry.handle_urls` + annotations are properly loaded. + + Thus, for cases like importing Page Objects from another external package, you'd + need to properly load all :meth:`~.PageObjectRegistry.handle_urls` annotations + from the external module. This ensures that the external Page Objects have + their annotations properly loaded. + + This can be done via the function named :func:`~.web_poet.overrides.consume_modules`. + Here's an example: + + .. code-block:: python + + from web_poet import default_registry, consume_modules + + consume_modules("external_package_A.po", "another_ext_package.lib") + rules = default_registry.get_overrides() + + **NOTE**: :func:`~.web_poet.overrides.consume_modules` must be called before + :meth:`~.PageObjectRegistry.get_overrides` for the imports to properly load. + + A handy CLI tool is also available at your disposal to quickly see the available Override rules in a given module in your project. For example, invoking something like ``web_poet my_project.page_objects`` would produce the following: @@ -226,7 +251,7 @@ Then we could easily retrieve all Page Objects per subpackage or module like thi .. code-block:: python - from web_poet import default_registry + from web_poet import default_registry, consume_modules # We can do it per website. 
rules = default_registry.get_overrides_from("my_page_obj_project.cool_gadget_site") @@ -236,11 +261,16 @@ Then we could easily retrieve all Page Objects per subpackage or module like thi rules = default_registry.get_overrides_from("my_page_obj_project.cool_gadget_site.us") rules = default_registry.get_overrides_from("my_page_obj_project.cool_gadget_site.fr") - # or even drill down further to the specific module. + # Or even drill down further to the specific module. rules = default_registry.get_overrides_from("my_page_obj_project.cool_gadget_site.us.products") rules = default_registry.get_overrides_from("my_page_obj_project.cool_gadget_site.us.product_listings") - # Or simply all of Override rules ever declared. + # Or simply all of the Override rules ever declared. + rules = default_registry.get_overrides() + + # Lastly, you'd need to properly load external packages/modules for the + # @handle_urls annotation to be correctly read. + consume_modules("external_package_A.po", "another_ext_package.lib") rules = default_registry.get_overrides() Multiple Registry Approach diff --git a/tests/test_overrides.py b/tests/test_overrides.py index 833adb00..43cb50c5 100644 --- a/tests/test_overrides.py +++ b/tests/test_overrides.py @@ -9,6 +9,7 @@ PONestedModule, PONestedModuleOverridenSecondary, ) +from web_poet import consume_modules from web_poet.overrides import PageObjectRegistry, default_registry @@ -17,9 +18,19 @@ def test_list_page_objects_all(): rules = default_registry.get_overrides() - page_objects = {po.use for po in rules} + # Note that the 'tests_extra.po_lib_sub_not_imported.POLibSubNotImported' + # Page Object is not included here since it was never imported anywhere in + # our test package. It would only be included if we run any of the following + # below. (Note that they should run before `get_overrides` is called.) 
+ # - from tests_extra import po_lib_sub_not_imported + # - import tests_extra.po_lib_sub_not_imported + # - web_poet.consume_modules("tests_extra") + # Merely having `import tests_extra` won't work since the subpackages and + # modules need to be traversed and imported as well. + assert all(["po_lib_sub_not_imported" not in po.__module__ for po in page_objects]) + # Ensure that ALL Override Rules are returned as long as the given # registry's @handle_urls annotation was used. assert page_objects == POS.union({POLibSub}) @@ -29,6 +40,16 @@ def test_list_page_objects_all(): assert rule.meta == rule.use.expected_meta, rule.use +def test_list_page_objects_all_consume_modules(): + """A test similar to the one above but calls ``consume_modules()`` to properly + load the @handle_urls annotations from other modules/packages. + """ + consume_modules("tests_extra") + rules = default_registry.get_overrides() + page_objects = {po.use for po in rules} + assert any(["po_lib_sub_not_imported" in po.__module__ for po in page_objects]) + + def test_list_page_objects_from_pkg(): """Tests that metadata is extracted properly from the po_lib package""" rules = default_registry.get_overrides_from("tests.po_lib") diff --git a/tests_extra/__init__.py b/tests_extra/__init__.py new file mode 100644 index 00000000..62c40098 --- /dev/null +++ b/tests_extra/__init__.py @@ -0,0 +1,5 @@ +""" +This test package was created separately to see the behavior of retrieving the +Override rules declared on a registry where @handle_urls is defined on another +package. +""" diff --git a/tests_extra/po_lib_sub_not_imported/__init__.py b/tests_extra/po_lib_sub_not_imported/__init__.py new file mode 100644 index 00000000..8f68f79b --- /dev/null +++ b/tests_extra/po_lib_sub_not_imported/__init__.py @@ -0,0 +1,28 @@ +""" +This package is quite similar to tests/po_lib_sub in terms of code contents. 
+ +What we're ultimately trying to test here is to see if the `default_registry` +captures the rules annotated in this module if it was not imported. +""" +from typing import Dict, Any, Callable + +from url_matcher import Patterns + +from web_poet import handle_urls + + +class POBase: + expected_overrides: Callable + expected_patterns: Patterns + expected_meta: Dict[str, Any] + + +class POLibSubOverridenNotImported: + ... + + +@handle_urls("sub_example_not_imported.com", POLibSubOverridenNotImported) +class POLibSubNotImported(POBase): + expected_overrides = POLibSubOverridenNotImported + expected_patterns = Patterns(["sub_example_not_imported.com"]) + expected_meta = {} # type: ignore diff --git a/web_poet/__init__.py b/web_poet/__init__.py index f4366d05..30c29b94 100644 --- a/web_poet/__init__.py +++ b/web_poet/__init__.py @@ -1,3 +1,3 @@ from .pages import WebPage, ItemPage, ItemWebPage, Injectable from .page_inputs import ResponseData -from .overrides import handle_urls, PageObjectRegistry, default_registry +from .overrides import handle_urls, PageObjectRegistry, default_registry, consume_modules diff --git a/web_poet/overrides.py b/web_poet/overrides.py index 92bd9fbc..4fcb057d 100644 --- a/web_poet/overrides.py +++ b/web_poet/overrides.py @@ -2,7 +2,9 @@ import importlib.util import warnings import pkgutil +from collections import deque from dataclasses import dataclass, field +from types import ModuleType from typing import Iterable, Union, List, Callable, Dict, Any from url_matcher import Patterns @@ -164,7 +166,7 @@ def get_overrides_from(self, module: str) -> List[OverrideRule]: """ rules: Dict[Callable, OverrideRule] = {} - for mod in walk_modules(module): + for mod in walk_module(module): # Dict ensures that no duplicates are collected and returned. 
rules.update(self._filter_from_module(mod.__name__)) @@ -191,7 +193,7 @@ def _filter_from_module(self, module: str) -> Dict[Callable, OverrideRule]: handle_urls = default_registry.handle_urls -def walk_modules(module: str) -> Iterable: +def walk_module(module: str) -> Iterable: """Return all modules from a module recursively. Note that this will import all the modules and submodules. It returns the @@ -212,3 +214,41 @@ def onerror(err): ): mod = importlib.import_module(info.name) yield mod + + +def consume_modules(*modules: str) -> None: + """A quick wrapper for :func:`~.walk_module` to efficiently consume the + generator and recursively load all packages/modules. + + This function is essential to be run before calling :meth:`~.PageObjectRegistry.get_overrides` + from the :class:`~.PageObjectRegistry`. It essentially ensures that the + ``@handle_urls`` are properly acknowledged for modules/packages that are not + imported. + + Let's take a look at an example: + + .. code-block:: python + + # my_page_obj_project/load_rules.py + + from web_poet import default_registry, consume_modules + + consume_modules("other_external_pkg.po", "another_pkg.lib") + rules = default_registry.get_overrides() + + For this case, the Override rules are coming from: + + - ``my_page_obj_project`` `(since it's the same module as the file above)` + - ``other_external_pkg.po`` + - ``another_pkg.lib`` + + So if the ``default_registry`` had other ``@handle_urls`` annotations outside + of the packages/modules list above, then the Override rules won't be returned. + """ + + for module in modules: + gen = walk_module(module) + + # Inspired by itertools recipe: https://docs.python.org/3/library/itertools.html + # Using a deque() results in a tiny bit of performance improvement over list(). 
+ deque(gen, maxlen=0) From 3b05c075d1c49058139fb36ec7ec9982f3ab41a2 Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal <kevinoxy@gmail.com> Date: Thu, 6 Jan 2022 19:53:33 +0800 Subject: [PATCH 25/34] update get_overrides_from to accept an arbitrary number of str inputs --- docs/intro/overrides.rst | 28 +++++++++++++++++----------- tests/test_overrides.py | 17 ++++++++++++++++- web_poet/overrides.py | 11 ++++++----- 3 files changed, 39 insertions(+), 17 deletions(-) diff --git a/docs/intro/overrides.rst b/docs/intro/overrides.rst index 72311813..1166ceaf 100644 --- a/docs/intro/overrides.rst +++ b/docs/intro/overrides.rst @@ -254,16 +254,16 @@ Then we could easily retrieve all Page Objects per subpackage or module like thi from web_poet import default_registry, consume_modules # We can do it per website. - rules = default_registry.get_overrides_from("my_page_obj_project.cool_gadget_site") - rules = default_registry.get_overrides_from("my_page_obj_project.furniture_site") + rules_gadget = default_registry.get_overrides_from("my_page_obj_project.cool_gadget_site") + rules_furniture = default_registry.get_overrides_from("my_page_obj_project.furniture_site") # It can also drill down to the country domains on a given site. - rules = default_registry.get_overrides_from("my_page_obj_project.cool_gadget_site.us") - rules = default_registry.get_overrides_from("my_page_obj_project.cool_gadget_site.fr") + rules_gadget_us = default_registry.get_overrides_from("my_page_obj_project.cool_gadget_site.us") + rules_gadget_fr = default_registry.get_overrides_from("my_page_obj_project.cool_gadget_site.fr") # Or even drill down further to the specific module. 
- rules = default_registry.get_overrides_from("my_page_obj_project.cool_gadget_site.us.products") - rules = default_registry.get_overrides_from("my_page_obj_project.cool_gadget_site.us.product_listings") + rules_gadget_us_products = default_registry.get_overrides_from("my_page_obj_project.cool_gadget_site.us.products") + rules_gadget_us_listings = default_registry.get_overrides_from("my_page_obj_project.cool_gadget_site.us.product_listings") # Or simply all of the Override rules ever declared. rules = default_registry.get_overrides() @@ -273,6 +273,12 @@ Then we could easily retrieve all Page Objects per subpackage or module like thi consume_modules("external_package_A.po", "another_ext_package.lib") rules = default_registry.get_overrides() +.. warning:: + + Remember to consider calling :func:`~.web_poet.overrides.consume_modules` + when using :meth:`~.PageObjectRegistry.get_overrides` in case you have some + external package containing Page Objects of interest. + Multiple Registry Approach ~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -376,11 +382,11 @@ retrieve such rules would be: from web_poet import default_registry - product_listing_rules = [ - default_registry.get_overrides_from("my_page_obj_project.cool_gadget_site.us.product_listings") - + default_registry.get_overrides_from("my_page_obj_project.cool_gadget_site.fr.product_listings") - + default_registry.get_overrides_from("my_page_obj_project.furniture_shop.product_listings") - ] + product_listing_rules = default_registry.get_overrrides_from( + "my_page_obj_project.cool_gadget_site.us.product_listings", + "my_page_obj_project.cool_gadget_site.fr.product_listings", + "my_page_obj_project.furniture_shop.product_listings" + ) On the other hand, we can also create another :class:`~.PageObjectRegistry` instance that we'll be using aside from the ``default_registry`` to help us better organize diff --git a/tests/test_overrides.py b/tests/test_overrides.py index 43cb50c5..de9185a6 100644 --- a/tests/test_overrides.py +++ 
b/tests/test_overrides.py @@ -67,7 +67,7 @@ def test_list_page_objects_from_pkg(): assert rule.meta == rule.use.expected_meta, rule.use -def test_list_page_objects_from_module(): +def test_list_page_objects_from_single(): rules = default_registry.get_overrides_from("tests.po_lib.a_module") assert len(rules) == 1 rule = rules[0] @@ -75,6 +75,21 @@ def test_list_page_objects_from_module(): assert rule.for_patterns == POModule.expected_patterns assert rule.instead_of == POModule.expected_overrides +def test_list_page_objects_from_multiple(): + rules = default_registry.get_overrides_from( + "tests.po_lib.a_module", + "tests.po_lib.nested_package.a_nested_module" + ) + assert len(rules) == 2 + + assert rules[0].use == POModule + assert rules[0].for_patterns == POModule.expected_patterns + assert rules[0].instead_of == POModule.expected_overrides + + assert rules[1].use == PONestedModule + assert rules[1].for_patterns == PONestedModule.expected_patterns + assert rules[1].instead_of == PONestedModule.expected_overrides + def test_list_page_objects_from_empty_module(): rules = default_registry.get_overrides_from("tests.po_lib.an_empty_module") diff --git a/web_poet/overrides.py b/web_poet/overrides.py index 4fcb057d..b2c44419 100644 --- a/web_poet/overrides.py +++ b/web_poet/overrides.py @@ -157,18 +157,19 @@ def get_overrides(self) -> List[OverrideRule]: """Returns all override rules that were declared using ``@handle_urls``.""" return list(self.data.values()) - def get_overrides_from(self, module: str) -> List[OverrideRule]: + def get_overrides_from(self, *pkgs_or_modules: str) -> List[OverrideRule]: """Returns the override rules that were declared using ``@handle_urls`` - in a specific module. + in a specific modules/packages. This is useful if you've organized your Page Objects into multiple submodules in your project as you can filter them easily. """ + # Dict ensures that no duplicates are collected and returned. 
rules: Dict[Callable, OverrideRule] = {} - for mod in walk_module(module): - # Dict ensures that no duplicates are collected and returned. - rules.update(self._filter_from_module(mod.__name__)) + for item in pkgs_or_modules: + for mod in walk_module(item): + rules.update(self._filter_from_module(mod.__name__)) return list(rules.values()) From f626efc660bd1b9bb4c3fb122198d3fbe17f60b6 Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal <kevinoxy@gmail.com> Date: Thu, 6 Jan 2022 20:13:14 +0800 Subject: [PATCH 26/34] add more warning docs to get_overrides() to use consume_modules() --- docs/intro/overrides.rst | 3 +++ web_poet/overrides.py | 12 +++++++++++- 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/docs/intro/overrides.rst b/docs/intro/overrides.rst index 1166ceaf..cfed26cf 100644 --- a/docs/intro/overrides.rst +++ b/docs/intro/overrides.rst @@ -279,6 +279,9 @@ Then we could easily retrieve all Page Objects per subpackage or module like thi when using :meth:`~.PageObjectRegistry.get_overrides` in case you have some external package containing Page Objects of interest. + This enables the :meth:`~.PageObjectRegistry.handle_urls` that annotates + the external Page Objects to be properly loadeded. + Multiple Registry Approach ~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/web_poet/overrides.py b/web_poet/overrides.py index b2c44419..b6539001 100644 --- a/web_poet/overrides.py +++ b/web_poet/overrides.py @@ -154,7 +154,17 @@ def wrapper(cls): return wrapper def get_overrides(self) -> List[OverrideRule]: - """Returns all override rules that were declared using ``@handle_urls``.""" + """Returns all override rules that were declared using ``@handle_urls``. + + .. warning:: + + Remember to consider calling :func:`~.web_poet.overrides.consume_modules` + when using :meth:`~.PageObjectRegistry.get_overrides` in case you have + some external package containing Page Objects of interest. 
+ + This enables the :meth:`~.PageObjectRegistry.handle_urls` that annotates + the external Page Objects to be properly loadeded. + """ return list(self.data.values()) def get_overrides_from(self, *pkgs_or_modules: str) -> List[OverrideRule]: From bd3a88eacc0cdf69f7e94d1d428a88872a9baf50 Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal <kevinoxy@gmail.com> Date: Fri, 7 Jan 2022 19:46:02 +0800 Subject: [PATCH 27/34] enable ease of combining external Page Object packages --- docs/intro/overrides.rst | 172 ++++++++++++++++++++++++++++++++++++++- tests/test_overrides.py | 6 ++ web_poet/overrides.py | 28 +++++-- 3 files changed, 199 insertions(+), 7 deletions(-) diff --git a/docs/intro/overrides.rst b/docs/intro/overrides.rst index cfed26cf..09920b3a 100644 --- a/docs/intro/overrides.rst +++ b/docs/intro/overrides.rst @@ -316,7 +316,7 @@ instances of the :class:`~.PageObjectRegistry` instead: cool_gadget_fr_registry = PageObjectRegistry() furniture_shop_registry = PageObjectRegistry() -After declaring the :class:`~.PageObjectRegistry` instances, they can be imported +After declaring the :class:`~.PageObjectRegistry` instances, they can be used in each of the Page Object packages like so: .. code-block:: python @@ -432,3 +432,173 @@ Retrieving all of the Product Listing Override rules would simply be: # We can also filter it down further on a per site basis if needed. rules = product_listings_registry.get_overrides_from("my_page_obj_project.cool_gadget_site") + +Using Overrides from External Packages +-------------------------------------- + +Developers have the option to import existing Page Objects alongside the Override +Rules attached to them. This section aims to showcase different ways you can +play with the Registries to manipulate the Override Rules according to your needs. + +Let's suppose we have the following use case before us: + + - An external Python package named ``ecommerce_page_objects`` is available + which contains Page Objects for common websites. 
It's using the + ``default_registry`` from **web-poet**. + - Another similar package named ``gadget_sites_page_objects`` is available + for more specific websites. It's using its own registry named + ``gadget_registry``. + - Your project's objectives is to handle as much eCommerce websites as you + can. Thus, you'd want to use the already available packages above and + perhaps improve on them or create new Page Objects for new websites. + +Assuming that you'd want to **use all existing Override rules from the external +packages** in your project, you can do it like: + +.. code-block:: python + + import ecommerce_page_objects + import gadget_sites_page_objects + from web_poet import PageObjectRegistry, consume_modules, default_registry + + consume_modules("ecommerce_page_objects", "gadget_sites_page_objects") + + combined_registry = PageObjectRegistry() + combined_registry.data = { + # Since ecommerce_page_objects is using web_poet.default_registry, then + # it functions like a global registry which we can access as: + **default_registry.data, + + **gadget_sites_page_objects.gadget_registry.data, + } + + combined_rules = combined_registry.get_overrides() + + # The combined_rules would be as follows: + # 1. OverrideRule(for_patterns=Patterns(include=['site_1.com'], exclude=[], priority=500), use=<class 'ecommerce_page_objects.site_1.EcomSite1'>, instead_of=<class 'ecommerce_page_objects.EcomGenericPage'>, meta={}) + # 2. OverrideRule(for_patterns=Patterns(include=['site_2.com'], exclude=[], priority=500), use=<class 'ecommerce_page_objects.site_2.EcomSite2'>, instead_of=<class 'ecommerce_page_objects.EcomGenericPage'>, meta={}) + # 3. OverrideRule(for_patterns=Patterns(include=['site_2.com'], exclude=[], priority=500), use=<class 'gadget_sites_page_objects.site_2.GadgetSite2'>, instead_of=<class 'gadget_sites_page_objects.GadgetGenericPage'>, meta={}) + # 4. 
OverrideRule(for_patterns=Patterns(include=['site_3.com'], exclude=[], priority=500), use=<class 'gadget_sites_page_objects.site_3.GadgetSite3'>, instead_of=<class 'gadget_sites_page_objects.GadgetGenericPage'>, meta={}) + +.. note:: + + Note that ``registry.get_overrides() == list(registry.data.values())``. We're + using ``registry.data`` for these cases so that we can easily look up specific + Page Objects using the ``dict``'s key. Otherwise, it may become a problem on + large cases with lots of Override rules. + +.. note:: + + If you don't need the entire data contents of Registries, then you can opt + to use :meth:`~.PageObjectRegistry.data_from` to easily filter them out + per package/module. + + Here's an example: + + .. code-block:: python + + default_registry.data_from("ecommerce_page_objects.site_1", "ecommerce_page_objects.site_2") + +As you can see in the example above, we can easily combine the data from multiple +different registries as it simply follows a ``Dict[Callable, OverrideRule]`` +structure. There won't be any duplication or clashes of ``dict`` keys between +registries of different external packages since the keys are the Page Object +classes intended to be used. From our example above, the ``dict`` keys from a +given ``data`` registry attribute would be: + + 1. ``<class 'ecommerce_page_objects.site_1.EcomSite1'>`` + 2. ``<class 'ecommerce_page_objects.site_2.EcomSite2'>`` + 3. ``<class 'gadget_sites_page_objects.site_2.GadgetSite2'>`` + 4. ``<class 'gadget_sites_page_objects.site_3.GadgetSite3'>`` + +As you might've observed, combining the two Registries above may result in a +conflict for the Override rules for **#2** and **#3**: + +.. code-block:: python + + # 2. OverrideRule(for_patterns=Patterns(include=['site_2.com'], exclude=[], priority=500), use=<class 'ecommerce_page_objects.site_2.EcomSite2'>, instead_of=<class 'ecommerce_page_objects.EcomGenericPage'>, meta={}) + # 3. 
OverrideRule(for_patterns=Patterns(include=['site_2.com'], exclude=[], priority=500), use=<class 'gadget_sites_page_objects.site_2.GadgetSite2'>, instead_of=<class 'gadget_sites_page_objects.GadgetGenericPage'>, meta={}) + +The `url-matcher`_ library is the one responsible breaking such conflicts. It's +specifically discussed in this section: `rules-conflict-resolution +<https://url-matcher.readthedocs.io/en/stable/intro.html#rules-conflict-resolution>`_. + +However, it's technically **NOT** a conflict, **yet**, since: + + - ``ecommerce_page_objects.site_2.EcomSite2`` would only be used in **site_2.com** + if ``ecommerce_page_objects.EcomGenericPage`` is to be replaced. + - The same case with ``gadget_sites_page_objects.site_2.GadgetSite2`` wherein + it's only going to be utilized for **site_2.com** if the following is to be + replaced: ``gadget_sites_page_objects.GadgetGenericPage``. + +It would be only become a conflict if the **#2** and **#3** Override Rules for +**site_2.com** both intend to replace the same Page Object. In fact, none of the +Override Rules above would ever be used if your project never intends to use the +following Page Objects *(since there's nothing to override)*. You can import +these Page Objects into your project and use them so they can be overridden: + + - ``ecommerce_page_objects.EcomGenericPage`` + - ``gadget_sites_page_objects.GadgetGenericPage`` + +However, let's assume that you want to create your own generic Page Object and +only intend to use it instead of the ones above. We can easily replace them like: + +.. code-block:: python + + class ImprovedEcommerceGenericPage: + def to_item(self): + ... # different type of generic parsers + + for _, rule in combined_registry.data.items(): + rule.instead_of = ImprovedEcommerceGenericPage + + updated_rules = combined_registry.get_overrides() + + # The updated_rules would be as follows: + # 1. 
OverrideRule(for_patterns=Patterns(include=['site_1.com'], exclude=[], priority=500), use=<class 'ecommerce_page_objects.site_1.EcomSite1'>, instead_of=<class 'my_project.ImprovedEcommerceGenericPage'>, meta={}) + # 2. OverrideRule(for_patterns=Patterns(include=['site_2.com'], exclude=[], priority=500), use=<class 'ecommerce_page_objects.site_2.EcomSite2'>, instead_of=<class 'my_project.ImprovedEcommerceGenericPage'>, meta={}) + # 3. OverrideRule(for_patterns=Patterns(include=['site_2.com'], exclude=[], priority=500), use=<class 'gadget_sites_page_objects.site_2.GadgetSite2'>, instead_of=<class 'my_project.ImprovedEcommerceGenericPage'>, meta={}) + # 4. OverrideRule(for_patterns=Patterns(include=['site_3.com'], exclude=[], priority=500), use=<class 'gadget_sites_page_objects.site_3.GadgetSite3'>, instead_of=<class 'my_project.ImprovedEcommerceGenericPage'>, meta={}) + +Now, **#2** and **#3** have a conflict since they now both intend to replace +``ImprovedEcommerceGenericPage``. As mentioned earlier, the `url-matcher`_ +would be the one to resolve such conflicts. + +However, it would help prevent future confusion if we could remove the source of +ambiguity in our Override Rules. + +Suppose, we prefer ``gadget_sites_page_objects.site_2.GadgetSite2`` more than +``ecommerce_page_objects.site_2.EcomSite2``. As such, we could remove the latter: + +.. code-block:: python + + del combined_registry.data[ecommerce_page_objects.site_2.EcomSite2] + + updated_rules = combined_registry.get_overrides() + + # The newly updated_rules would be as follows: + # 1. OverrideRule(for_patterns=Patterns(include=['site_1.com'], exclude=[], priority=500), use=<class 'ecommerce_page_objects.site_1.EcomSite1'>, instead_of=<class 'my_project.ImprovedEcommerceGenericPage'>, meta={}) + # 2. 
OverrideRule(for_patterns=Patterns(include=['site_2.com'], exclude=[], priority=500), use=<class 'ecommerce_page_objects.site_2.EcomSite2'>, instead_of=<class 'my_project.ImprovedEcommerceGenericPage'>, meta={}) + # 3. OverrideRule(for_patterns=Patterns(include=['site_3.com'], exclude=[], priority=500), use=<class 'gadget_sites_page_objects.site_3.GadgetSite3'>, instead_of=<class 'my_project.ImprovedEcommerceGenericPage'>, meta={}) + +As discussed before, the Registry's data is structured simply as +``Dict[Callable, OverrideRule]`` for which we can easily manipulate it via ``dict`` +operations. + +Now, suppose we want to improve ``ecommerce_page_objects.site_1.EcomSite1`` +from **#1** above by perhaps adding/fixing fields. We can do that by: + +.. code-block:: python + + class ImprovedEcomSite1(ecommerce_page_objects.site_1.EcomSite1): + def to_item(self): + ... # replace and improve some of the parsers here + + combined_registry.data[ecommerce_page_objects.site_1.EcomSite1].use = ImprovedEcomSite1 + + updated_rules = combined_registry.get_overrides() + + # The newly updated_rules would be as follows: + # 1. OverrideRule(for_patterns=Patterns(include=['site_1.com'], exclude=[], priority=500), use=<class 'my_project.ImprovedEcomSite1'>, instead_of=<class 'my_project.ImprovedEcommerceGenericPage'>, meta={}) + # 2. OverrideRule(for_patterns=Patterns(include=['site_2.com'], exclude=[], priority=500), use=<class 'gadget_sites_page_objects.site_2.GadgetSite2'>, instead_of=<class 'my_project.ImprovedEcommerceGenericPage'>, meta={}) + # 3. 
OverrideRule(for_patterns=Patterns(include=['site_3.com'], exclude=[], priority=500), use=<class 'gadget_sites_page_objects.site_3.GadgetSite3'>, instead_of=<class 'my_project.ImprovedEcommerceGenericPage'>, meta={}) diff --git a/tests/test_overrides.py b/tests/test_overrides.py index de9185a6..a3aaf7a0 100644 --- a/tests/test_overrides.py +++ b/tests/test_overrides.py @@ -119,6 +119,12 @@ def test_list_page_objects_from_imported_registry(): assert pones.for_patterns == Patterns(["example.com"]) assert pones.instead_of == PONestedModuleOverridenSecondary +def test_registry_data_from(): + data = default_registry.data_from("tests.po_lib.nested_package") + + assert len(data) == 2 + assert PONestedModule in data + assert PONestedPkg in data def test_cmd(): from web_poet.__main__ import main diff --git a/web_poet/overrides.py b/web_poet/overrides.py index b6539001..ea068a2e 100644 --- a/web_poet/overrides.py +++ b/web_poet/overrides.py @@ -10,7 +10,7 @@ from url_matcher import Patterns -@dataclass(frozen=True) +@dataclass class OverrideRule: """A single override rule that specifies when a page object should be used instead of another.""" @@ -97,7 +97,7 @@ class ExampleComProductPage(ItemPage): """ def __init__(self): - self.data: Dict[Callable, OverrideRule] = {} + self._data: Dict[Callable, OverrideRule] = {} def handle_urls( self, @@ -140,8 +140,8 @@ def wrapper(cls): meta=kwargs, ) # If it was already defined, we don't want to override it - if cls not in self.data: - self.data[cls] = rule + if cls not in self._data: + self._data[cls] = rule else: warnings.warn( f"Multiple @handle_urls annotations with the same 'overrides' " @@ -165,7 +165,7 @@ def get_overrides(self) -> List[OverrideRule]: This enables the :meth:`~.PageObjectRegistry.handle_urls` that annotates the external Page Objects to be properly loadeded. 
""" - return list(self.data.values()) + return list(self._data.values()) def get_overrides_from(self, *pkgs_or_modules: str) -> List[OverrideRule]: """Returns the override rules that were declared using ``@handle_urls`` @@ -186,7 +186,7 @@ def get_overrides_from(self, *pkgs_or_modules: str) -> List[OverrideRule]: def _filter_from_module(self, module: str) -> Dict[Callable, OverrideRule]: return { cls: rule - for cls, rule in self.data.items() + for cls, rule in self._data.items() # A "." is added at the end to prevent incorrect matching on cases # where package names are substrings of one another. For example, @@ -197,6 +197,22 @@ def _filter_from_module(self, module: str) -> Dict[Callable, OverrideRule]: if cls.__module__.startswith(module + ".") or cls.__module__ == module } + @property + def data(self) -> Dict[Callable, OverrideRule]: + return self._data # pragma: no cover + + @data.setter + def data(self, value: Dict[Callable, OverrideRule]) -> None: + self._data = value # pragma: no cover + + def data_from(self, *pkgs_or_modules: str) -> Dict[Callable, OverrideRule]: + """Return ``data`` values that are filtered by package/module.""" + + results = {} + for item in pkgs_or_modules: + results.update(self._filter_from_module(item)) + return results + # For ease of use, we'll create a default registry so that users can simply # use its `handle_urls()` method directly by `from web_poet import handle_urls` From 0cbeb0b22f49cf9bcdc3e16372dfacda5c457471 Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal <kevinoxy@gmail.com> Date: Wed, 12 Jan 2022 14:31:33 +0800 Subject: [PATCH 28/34] refactor get_overrides() to have a simpler interface with consume_modules() --- docs/intro/overrides.rst | 60 +++++++++++++++++----------- tests/test_overrides.py | 36 ++++++++++------- web_poet/__main__.py | 2 +- web_poet/overrides.py | 84 ++++++++++++++++++++++++---------------- 4 files changed, 111 insertions(+), 71 deletions(-) diff --git a/docs/intro/overrides.rst 
b/docs/intro/overrides.rst index 09920b3a..7f068793 100644 --- a/docs/intro/overrides.rst +++ b/docs/intro/overrides.rst @@ -148,7 +148,7 @@ to see the other functionalities. rules = default_registry.get_overrides() # Or, we could also filter out the rules by the module they were defined in - rules = default_registry.get_overrides_from("my_project.page_objects") + rules = default_registry.get_overrides(filters="my_project.page_objects") print(len(rules)) # 3 print(rules[0]) # OverrideRule(for_patterns=Patterns(include=['example.com'], exclude=[], priority=500), use=<class 'my_project.page_objects.ExampleProductPage'>, instead_of=<class 'my_project.page_objects.GenericProductPage'>, meta={}) @@ -156,10 +156,10 @@ to see the other functionalities. .. note:: Notice in the code sample above where we could filter out the Override rules - per module via :meth:`~.PageObjectRegistry.get_overrides_from`. This - could also offer another alternative way to organize your Page Object rules - using only the ``default_registry``. There's no need to declare multiple - :class:`~.PageObjectRegistry` instances and use multiple annotations. + per module via the ``filters`` param. This could also offer another alternative + way to organize your Page Object rules using only the ``default_registry``. + There's no need to declare multiple :class:`~.PageObjectRegistry` instances + and use multiple annotations. .. warning:: @@ -182,8 +182,12 @@ to see the other functionalities. consume_modules("external_package_A.po", "another_ext_package.lib") rules = default_registry.get_overrides() - **NOTE**: :func:`~.web_poet.overrides.consume_modules` must be called before - :meth:`~.PageObjectRegistry.get_overrides` for the imports to properly load. 
+ # Fortunately, `get_overrides()` provides a shortcut for the lines above: + rules = default_registry.get_overrides(consume=["external_package_A.po", "another_ext_package.lib"]) + + **NOTE**: :func:`~.web_poet.overrides.consume_modules` or the ``consume`` param + of :meth:`~.PageObjectRegistry.get_overrides` must be used for the imports to properly load. + Most especially if you intend to use Page Objects from externally imported packages. A handy CLI tool is also available at your disposal to quickly see the available @@ -254,16 +258,16 @@ Then we could easily retrieve all Page Objects per subpackage or module like thi from web_poet import default_registry, consume_modules # We can do it per website. - rules_gadget = default_registry.get_overrides_from("my_page_obj_project.cool_gadget_site") - rules_furniture = default_registry.get_overrides_from("my_page_obj_project.furniture_site") + rules_gadget = default_registry.get_overrides(filters="my_page_obj_project.cool_gadget_site") + rules_furniture = default_registry.get_overrides(filters="my_page_obj_project.furniture_site") # It can also drill down to the country domains on a given site. - rules_gadget_us = default_registry.get_overrides_from("my_page_obj_project.cool_gadget_site.us") - rules_gadget_fr = default_registry.get_overrides_from("my_page_obj_project.cool_gadget_site.fr") + rules_gadget_us = default_registry.get_overrides(filters="my_page_obj_project.cool_gadget_site.us") + rules_gadget_fr = default_registry.get_overrides(filters="my_page_obj_project.cool_gadget_site.fr") # Or even drill down further to the specific module.
- rules_gadget_us_products = default_registry.get_overrides_from("my_page_obj_project.cool_gadget_site.us.products") - rules_gadget_us_listings = default_registry.get_overrides_from("my_page_obj_project.cool_gadget_site.us.product_listings") + rules_gadget_us_products = default_registry.get_overrides(filters="my_page_obj_project.cool_gadget_site.us.products") + rules_gadget_us_listings = default_registry.get_overrides(filters="my_page_obj_project.cool_gadget_site.us.product_listings") # Or simply all of the Override rules ever declared. rules = default_registry.get_overrides() @@ -273,11 +277,16 @@ Then we could easily retrieve all Page Objects per subpackage or module like thi consume_modules("external_package_A.po", "another_ext_package.lib") rules = default_registry.get_overrides() + # Remember, a shortcut for consuming imports would be: + rules = default_registry.get_overrides(consume=["external_package_A.po", "another_ext_package.lib"]) + + .. warning:: Remember to consider calling :func:`~.web_poet.overrides.consume_modules` - when using :meth:`~.PageObjectRegistry.get_overrides` in case you have some - external package containing Page Objects of interest. + or the ``consume`` param of :meth:`~.PageObjectRegistry.get_overrides` for the + imports to properly load. Most especially if you intend to use Page Objects + from externally imported packages. This enables the :meth:`~.PageObjectRegistry.handle_urls` that annotates the external Page Objects to be properly loadeded. @@ -301,9 +310,9 @@ hierarchy** like this: ├── furniture_shop_products.py └── furniture_shop_product_listings.py -As such, calling ``default_registry.get_overrides_from()`` would not work -on projects with a **flat hierarchy**. Thus, we can organize them using our own -instances of the :class:`~.PageObjectRegistry` instead: +As such, calling ``default_registry.get_overrides()`` with a ``filters`` parameter +would not effectively work on projects with a **flat hierarchy**.
Thus, we can +organize them using our own instances of the :class:`~.PageObjectRegistry` instead: .. code-block:: python @@ -385,10 +394,12 @@ retrieve such rules would be: from web_poet import default_registry - product_listing_rules = default_registry.get_overrrides_from( - "my_page_obj_project.cool_gadget_site.us.product_listings", - "my_page_obj_project.cool_gadget_site.fr.product_listings", - "my_page_obj_project.furniture_shop.product_listings" + product_listing_rules = default_registry.get_overrides( + filters=[ + "my_page_obj_project.cool_gadget_site.us.product_listings", + "my_page_obj_project.cool_gadget_site.fr.product_listings", + "my_page_obj_project.furniture_shop.product_listings", + ] ) On the other hand, we can also create another :class:`~.PageObjectRegistry` instance @@ -431,7 +442,7 @@ Retrieving all of the Product Listing Override rules would simply be: rules = product_listings_registry.get_overrides() # We can also filter it down further on a per site basis if needed.
- rules = product_listings_registry.get_overrides_from("my_page_obj_project.cool_gadget_site") + rules = product_listings_registry.get_overrides(filters="my_page_obj_project.cool_gadget_site") Using Overrides from External Packages -------------------------------------- @@ -461,6 +472,9 @@ packages** in your project, you can do it like: import gadget_sites_page_objects from web_poet import PageObjectRegistry, consume_modules, default_registry + # We're using `consume_modules()` here instead of the `consume` param of + # `PageObjectRegistry.get_overrides()` since we need to access the `data` + # attribute of the registry even before calling `PageObjectRegistry.get_overrides()` consume_modules("ecommerce_page_objects", "gadget_sites_page_objects") combined_registry = PageObjectRegistry() diff --git a/tests/test_overrides.py b/tests/test_overrides.py index a3aaf7a0..9cb488d9 100644 --- a/tests/test_overrides.py +++ b/tests/test_overrides.py @@ -2,14 +2,18 @@ from url_matcher import Patterns from tests.po_lib_sub import POLibSub -from tests.po_lib import POTopLevel1, POTopLevel2, POTopLevelOverriden2, secondary_registry +from tests.po_lib import ( + POTopLevel1, + POTopLevel2, + POTopLevelOverriden2, + secondary_registry, +) from tests.po_lib.a_module import POModule from tests.po_lib.nested_package import PONestedPkg from tests.po_lib.nested_package.a_nested_module import ( PONestedModule, PONestedModuleOverridenSecondary, ) -from web_poet import consume_modules from web_poet.overrides import PageObjectRegistry, default_registry @@ -40,19 +44,18 @@ def test_list_page_objects_all(): assert rule.meta == rule.use.expected_meta, rule.use -def test_list_page_objects_all_consume_modules(): +def test_list_page_objects_all_consume(): """A test similar to the one above but calls ``consume_modules()`` to properly load the @handle_urls annotations from other modules/packages. 
""" - consume_modules("tests_extra") - rules = default_registry.get_overrides() + rules = default_registry.get_overrides(consume="tests_extra") page_objects = {po.use for po in rules} assert any(["po_lib_sub_not_imported" in po.__module__ for po in page_objects]) def test_list_page_objects_from_pkg(): """Tests that metadata is extracted properly from the po_lib package""" - rules = default_registry.get_overrides_from("tests.po_lib") + rules = default_registry.get_overrides(filters="tests.po_lib") page_objects = {po.use for po in rules} # Ensure that the "tests.po_lib", which imports another module named @@ -68,17 +71,20 @@ def test_list_page_objects_from_pkg(): def test_list_page_objects_from_single(): - rules = default_registry.get_overrides_from("tests.po_lib.a_module") + rules = default_registry.get_overrides(filters="tests.po_lib.a_module") assert len(rules) == 1 rule = rules[0] assert rule.use == POModule assert rule.for_patterns == POModule.expected_patterns assert rule.instead_of == POModule.expected_overrides + def test_list_page_objects_from_multiple(): - rules = default_registry.get_overrides_from( - "tests.po_lib.a_module", - "tests.po_lib.nested_package.a_nested_module" + rules = default_registry.get_overrides( + filters=[ + "tests.po_lib.a_module", + "tests.po_lib.nested_package.a_nested_module", + ] ) assert len(rules) == 2 @@ -92,22 +98,22 @@ def test_list_page_objects_from_multiple(): def test_list_page_objects_from_empty_module(): - rules = default_registry.get_overrides_from("tests.po_lib.an_empty_module") + rules = default_registry.get_overrides(filters="tests.po_lib.an_empty_module") assert len(rules) == 0 def test_list_page_objects_from_empty_pkg(): - rules = default_registry.get_overrides_from("tests.po_lib.an_empty_package") + rules = default_registry.get_overrides(filters="tests.po_lib.an_empty_package") assert len(rules) == 0 def test_list_page_objects_from_unknown_module(): with pytest.raises(ImportError): - 
default_registry.get_overrides_from("tests.po_lib.unknown_module") + default_registry.get_overrides(filters="tests.po_lib.unknown_module") def test_list_page_objects_from_imported_registry(): - rules = secondary_registry.get_overrides_from("tests.po_lib") + rules = secondary_registry.get_overrides(filters="tests.po_lib") assert len(rules) == 2 rule_for = {po.use: po for po in rules} @@ -119,6 +125,7 @@ def test_list_page_objects_from_imported_registry(): assert pones.for_patterns == Patterns(["example.com"]) assert pones.instead_of == PONestedModuleOverridenSecondary + def test_registry_data_from(): data = default_registry.data_from("tests.po_lib.nested_package") @@ -126,6 +133,7 @@ def test_registry_data_from(): assert PONestedModule in data assert PONestedPkg in data + def test_cmd(): from web_poet.__main__ import main diff --git a/web_poet/__main__.py b/web_poet/__main__.py index c3b7ff1b..f1787293 100644 --- a/web_poet/__main__.py +++ b/web_poet/__main__.py @@ -42,7 +42,7 @@ def main(args=None): rule.for_patterns.priority, rule.meta, ) - for rule in default_registry.get_overrides_from(args.module) + for rule in default_registry.get_overrides(filters=args.module) ] print(tabulate.tabulate(table, headers="firstrow")) diff --git a/web_poet/overrides.py b/web_poet/overrides.py index ea068a2e..78d1ae60 100644 --- a/web_poet/overrides.py +++ b/web_poet/overrides.py @@ -5,14 +5,16 @@ from collections import deque from dataclasses import dataclass, field from types import ModuleType -from typing import Iterable, Union, List, Callable, Dict, Any +from typing import Iterable, Optional, Union, List, Callable, Dict, Any from url_matcher import Patterns +Strings = Union[str, Iterable[str]] + @dataclass class OverrideRule: - """A single override rule that specifies when a page object should be used + """A single override rule that specifies when a Page Object should be used instead of another.""" for_patterns: Patterns @@ -21,7 +23,7 @@ class OverrideRule: meta: Dict[str, 
Any] = field(default_factory=dict) -def _as_list(value: Union[str, Iterable[str], None]) -> List[str]: +def _as_list(value: Optional[Strings]) -> List[str]: """ >>> _as_list(None) [] @@ -92,8 +94,8 @@ class ExampleComProductPage(ItemPage): The rules could then be accessed using this method: - * ``default_registry.get_overrides_from("my_scrapy_project.page_objects.site_A")`` - * ``default_registry.get_overrides_from("my_scrapy_project.page_objects.site_B")`` + * ``default_registry.get_overrides(filters="my_scrapy_project.page_objects.site_A")`` + * ``default_registry.get_overrides(filters="my_scrapy_project.page_objects.site_B")`` """ def __init__(self): @@ -101,10 +103,10 @@ def __init__(self): def handle_urls( self, - include: Union[str, Iterable[str]], + include: Strings, overrides: Callable, *, - exclude: Union[str, Iterable[str], None] = None, + exclude: Optional[Strings] = None, priority: int = 500, **kwargs, ): @@ -153,41 +155,46 @@ def wrapper(cls): return wrapper - def get_overrides(self) -> List[OverrideRule]: - """Returns all override rules that were declared using ``@handle_urls``. + def get_overrides( + self, consume: Optional[Strings] = None, filters: Optional[Strings] = None + ) -> List[OverrideRule]: + """Returns all Override Rules that were declared using ``@handle_urls``. + + :param consume: packages/modules that need to be imported so that it can + properly load the :meth:`~.PageObjectRegistry.handle_urls` annotations. + :param filters: packages/modules that are of interest can be declared + here to easily extract the rules from them. Use this when you need + to pinpoint specific rules. .. warning:: - Remember to consider calling :func:`~.web_poet.overrides.consume_modules` - when using :meth:`~.PageObjectRegistry.get_overrides` in case you have - some external package containing Page Objects of interest. 
+ Remember to consider using the ``consume`` parameter to properly load + the :meth:`~.PageObjectRegistry.handle_urls` from external Page + Objects - This enables the :meth:`~.PageObjectRegistry.handle_urls` that annotates - the external Page Objects to be properly loadeded. + The ``consume`` parameter provides a convenient shortcut for calling + :func:`~.web_poet.overrides.consume_modules`. """ - return list(self._data.values()) + if consume: + consume_modules(*_as_list(consume)) - def get_overrides_from(self, *pkgs_or_modules: str) -> List[OverrideRule]: - """Returns the override rules that were declared using ``@handle_urls`` - in a specific modules/packages. + if not filters: + return list(self._data.values()) - This is useful if you've organized your Page Objects into multiple - submodules in your project as you can filter them easily. - """ - # Dict ensures that no duplicates are collected and returned. - rules: Dict[Callable, OverrideRule] = {} + else: + # Dict ensures that no duplicates are collected and returned. + rules: Dict[Callable, OverrideRule] = {} - for item in pkgs_or_modules: - for mod in walk_module(item): - rules.update(self._filter_from_module(mod.__name__)) + for item in _as_list(filters): + for mod in walk_module(item): + rules.update(self._filter_from_module(mod.__name__)) - return list(rules.values()) + return list(rules.values()) def _filter_from_module(self, module: str) -> Dict[Callable, OverrideRule]: return { cls: rule for cls, rule in self._data.items() - # A "." is added at the end to prevent incorrect matching on cases # where package names are substrings of one another. For example, # if module = "my_project.po_lib", then it filters like so: @@ -247,10 +254,9 @@ def consume_modules(*modules: str) -> None: """A quick wrapper for :func:`~.walk_module` to efficiently consume the generator and recursively load all packages/modules. 
- This function is essential to be run before calling :meth:`~.PageObjectRegistry.get_overrides` - from the :class:`~.PageObjectRegistry`. It essentially ensures that the - ``@handle_urls`` are properly acknowledged for modules/packages that are not - imported. + This function is essential to be run before attempting to retrieve all + :meth:`~.PageObjectRegistry.handle_urls` annotations from :class:`~.PageObjectRegistry` + to ensure that they are properly acknowledge by importing them in runtime. Let's take a look at an example: @@ -270,7 +276,19 @@ def consume_modules(*modules: str) -> None: - ``another_pkg.lib`` So if the ``default_registry`` had other ``@handle_urls`` annotations outside - of the packages/modules list above, then the Override rules won't be returned. + of the packages/modules listed above, then the Override rules won't be returned. + + .. note:: + + :meth:`~.PageObjectRegistry.get_overrides` provides a shortcut for this + using its ``consume`` parameter. Thus, the code example above could be + shortened even further by: + + .. 
code-block:: python + + from web_poet import default_registry + + rules = default_registry.get_overrides(consume=["other_external_pkg.po", "another_pkg.lib"]) """ for module in modules: From de5563ab967010ae03bab93b3cc276c9ab5f8ee4 Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal <kevinoxy@gmail.com> Date: Thu, 13 Jan 2022 15:26:23 +0800 Subject: [PATCH 29/34] introduce concept of 'registry_pool' to access all PageObjectRegistry instances --- docs/intro/overrides.rst | 79 +++++++++++++++++++++++++++++++++------- tests/po_lib/__init__.py | 2 +- tests/test_overrides.py | 33 +++++++++++++++-- web_poet/__init__.py | 14 ++++++- web_poet/__main__.py | 60 ++++++++++++++++++++++-------- web_poet/overrides.py | 69 +++++++++++++++++++++++++++++------ 6 files changed, 212 insertions(+), 45 deletions(-) diff --git a/docs/intro/overrides.rst b/docs/intro/overrides.rst index 7f068793..bbfac9b0 100644 --- a/docs/intro/overrides.rst +++ b/docs/intro/overrides.rst @@ -105,8 +105,8 @@ code example below: from web_poet.pages import ItemWebPage from web_poet import PageObjectRegistry - primary_registry = PageObjectRegistry() - secondary_registry = PageObjectRegistry() + primary_registry = PageObjectRegistry(name="primary") + secondary_registry = PageObjectRegistry(name="secondary") class GenericProductPage(ItemWebPage): def to_item(self): @@ -196,11 +196,22 @@ like ``web_poet my_project.page_objects`` would produce the following: .. 
code-block:: - Use this instead of for the URL patterns except for the patterns with priority meta - ---------------------------------------------------- ------------------------------------------ -------------------------------------- ------------------------- --------------- ------ - my_project.page_objects.ExampleProductPage my_project.page_objects.GenericProductPage ['example.com'] [] 500 {} - my_project.page_objects.AnotherExampleProductPage my_project.page_objects.GenericProductPage ['anotherexample.com'] ['/digital-goods/'] 500 {} - my_project.page_objects.DualExampleProductPage my_project.page_objects.GenericProductPage ['dualexample.com/shop/?product=*', 'dualexample.net/store/?pid=*'] [] 500 {} + Registry Use this instead of for the URL patterns except for the patterns with priority meta + --------- ---------------------------------------------------- ------------------------------------------ ------------------------------------------------------------------- ------------------------- --------------- ------ + default my_project.page_objects.ExampleProductPage my_project.page_objects.GenericProductPage ['example.com'] [] 500 {} + default my_project.page_objects.AnotherExampleProductPage my_project.page_objects.GenericProductPage ['anotherexample.com'] ['/digital-goods/'] 500 {} + default my_project.page_objects.DualExampleProductPage my_project.page_objects.GenericProductPage ['dualexample.com/shop/?product=*', 'dualexample.net/store/?pid=*'] [] 500 {} + +You can also filter them via the **name** of :class:`~.PageObjectRegistry`. For example, +invoking ``web_poet my_project.page_objects --registry_name=custom`` would produce +something like: + +.. 
code-block:: + + Registry Use this instead of for the URL patterns except for the patterns with priority meta + ---------- ---------------------------------------------------- ------------------------------------------ ---------------------- ------------------------- --------------- ------ + custom my_project.page_objects.CustomProductPage my_project.page_objects.GenericProductPage ['example.com'] [] 500 {} + custom my_project.page_objects.AnotherCustomProductPage my_project.page_objects.GenericProductPage ['anotherexample.com'] ['/digital-goods/'] 500 {} Organizing Page Object Overrides -------------------------------- @@ -320,10 +331,52 @@ organize them using our own instances of the :class:`~.PageObjectRegistry` inste from web_poet import PageObjectRegistry - cool_gadget_registry = PageObjectRegistry() - cool_gadget_us_registry = PageObjectRegistry() - cool_gadget_fr_registry = PageObjectRegistry() - furniture_shop_registry = PageObjectRegistry() + cool_gadget_registry = PageObjectRegistry(name="cool_gadget") + cool_gadget_us_registry = PageObjectRegistry(name="cool_gadget_us") + cool_gadget_fr_registry = PageObjectRegistry(name="cool_gadget_fr") + furniture_shop_registry = PageObjectRegistry(name="furniture_shop") + +Note that you can access all of the :class:`~.PageObjectRegistry` that were +ever instantiated via ``web_poet.registry_pool`` which is simply a mapping +structured as ``Dict[str, PageObjectRegistry]``: + +.. code-block:: python + + from web_poet import registry_pool + + print(registry_pool) + # { + # 'default': <web_poet.overrides.PageObjectRegistry object at 0x7f47d654d8b0>, + # 'cool_gadget' = <my_page_obj_project.PageObjectRegistry object at 0x7f47d654382a>, + # 'cool_gadget_us' = <my_page_obj_project.PageObjectRegistry object at 0xb247d65433c3>, + # 'cool_gadget_fr' = <my_page_obj_project.PageObjectRegistry object at 0xd93746549dea>, + # 'furniture_shop' = <my_page_obj_project.PageObjectRegistry object at 0x82n78654441b> + # } + +.. 
warning:: + + Please be aware that there might be some :class:`~.PageObjectRegistry` + that are not available, most especially if you're using them from external + packages. + + Thus, it's imperative to use :func:`~.web_poet.overrides.consume_modules` + beforehand: + + .. code-block:: python + + from web_poet import registry_pool, consume_modules + + consume_modules("external_pkg") + + print(registry_pool) + # { + # 'default': <web_poet.overrides.PageObjectRegistry object at 0x7f47d654d8b0>, + # 'cool_gadget' = <my_page_obj_project.PageObjectRegistry object at 0x7f47d654382a>, + # 'cool_gadget_us' = <my_page_obj_project.PageObjectRegistry object at 0xb247d65433c3>, + # 'cool_gadget_fr' = <my_page_obj_project.PageObjectRegistry object at 0xd93746549dea>, + # 'furniture_shop' = <my_page_obj_project.PageObjectRegistry object at 0x82n78654441b>, + # 'ecommerce': <external_pkg.PageObjectRegistry object at 0xbc45d8328420> + # } After declaring the :class:`~.PageObjectRegistry` instances, they can be used in each of the Page Object packages like so: @@ -412,7 +465,7 @@ our Override Rules. from web_poet import PageObjectRegistry - product_listings_registry = PageObjectRegistry() + product_listings_registry = PageObjectRegistry(name="product_listings") Using the additional registry instance above, we'll use it to provide another annotation for the Page Objects in each of the ``product_listings.py`` module. 
@@ -477,7 +530,7 @@ packages** in your project, you can do it like: # attribute of the registry even before calling `PageObjectRegistry.get_overrides()` consume_modules("ecommerce_page_objects", "gadget_sites_page_objects") - combined_registry = PageObjectRegistry() + combined_registry = PageObjectRegistry(name="combined") combined_registry.data = { # Since ecommerce_page_objects is using web_poet.default_registry, then # it functions like a global registry which we can access as: diff --git a/tests/po_lib/__init__.py b/tests/po_lib/__init__.py index 3f0632d3..51e9631c 100644 --- a/tests/po_lib/__init__.py +++ b/tests/po_lib/__init__.py @@ -23,7 +23,7 @@ class POTopLevelOverriden2: ... -secondary_registry = PageObjectRegistry() +secondary_registry = PageObjectRegistry(name="secondary") # This first annotation is ignored. A single annotation per registry is allowed diff --git a/tests/test_overrides.py b/tests/test_overrides.py index 9cb488d9..dc06e379 100644 --- a/tests/test_overrides.py +++ b/tests/test_overrides.py @@ -1,3 +1,5 @@ +import argparse + import pytest from url_matcher import Patterns @@ -14,7 +16,7 @@ PONestedModule, PONestedModuleOverridenSecondary, ) -from web_poet.overrides import PageObjectRegistry, default_registry +from web_poet import PageObjectRegistry, default_registry, registry_pool POS = {POTopLevel1, POTopLevel2, POModule, PONestedPkg, PONestedModule} @@ -134,7 +136,32 @@ def test_registry_data_from(): assert PONestedPkg in data -def test_cmd(): +def test_registry_name_conflict(): + """Registries can only have a unique name.""" + + PageObjectRegistry("main") + + assert "main" in registry_pool + + with pytest.raises(ValueError): + PageObjectRegistry("main") + + +def test_cli_tool(): + """Ensure that CLI parameters returns the expected results. + + There's no need to check each specific OverrideRule below as we already have + extensive tests for those above. We can simply count how many rules there are + for a given registry. 
+ """ + from web_poet.__main__ import main - assert main(["tests.po_lib"]) is None + results = main(["tests"]) + assert len(results) == 6 + + results = main(["tests", "--registry_name=secondary"]) + assert len(results) == 2 + + results = main(["tests", "--registry_name=not_exist"]) + assert not results diff --git a/web_poet/__init__.py b/web_poet/__init__.py index 30c29b94..7c865e1e 100644 --- a/web_poet/__init__.py +++ b/web_poet/__init__.py @@ -1,3 +1,15 @@ +from typing import Dict + from .pages import WebPage, ItemPage, ItemWebPage, Injectable from .page_inputs import ResponseData -from .overrides import handle_urls, PageObjectRegistry, default_registry, consume_modules +from .overrides import ( + PageObjectRegistry, + consume_modules, + registry_pool, +) + + +# For ease of use, we'll create a default registry so that users can simply +# use its `handle_urls()` method directly by `from web_poet import handle_urls` +default_registry = PageObjectRegistry(name="default") +handle_urls = default_registry.handle_urls diff --git a/web_poet/__main__.py b/web_poet/__main__.py index f1787293..34978e67 100644 --- a/web_poet/__main__.py +++ b/web_poet/__main__.py @@ -1,18 +1,19 @@ """Returns all Override Rules from the default registry.""" import argparse -from typing import Callable +from typing import Callable, Optional, List import tabulate -from web_poet import default_registry +from web_poet import registry_pool, consume_modules, PageObjectRegistry +from web_poet.overrides import OverrideRule def qualified_name(cls: Callable) -> str: return f"{cls.__module__}.{cls.__name__}" -def main(args=None): +def parse_args(raw_args: Optional[List[str]] = None) -> argparse.Namespace: parser = argparse.ArgumentParser( description="Tool that list the Page Object overrides from a package or module recursively" ) @@ -22,19 +23,35 @@ def main(args=None): type=str, help="A package or module to list overrides from", ) - args = parser.parse_args(args) - table = [ - ( - "Use this", - 
"instead of", - "for the URL patterns", - "except for the patterns", - "with priority", - "meta", - ) + parser.add_argument( + "--registry_name", + default="default", + type=str, + help="Name of the registry to retrieve the rules from.", + ) + return parser.parse_args(args=raw_args) + + +def load_registry(args: argparse.Namespace) -> Optional[PageObjectRegistry]: + consume_modules(args.module) + registry = registry_pool.get(args.registry_name) + return registry + + +def display_table(registry_name: str, rules: List[OverrideRule]) -> None: + headers = [ + "Registry", + "Use this", + "instead of", + "for the URL patterns", + "except for the patterns", + "with priority", + "meta", ] - table += [ + + table = [ ( + registry_name, qualified_name(rule.use), qualified_name(rule.instead_of), rule.for_patterns.include, @@ -42,9 +59,20 @@ def main(args=None): rule.for_patterns.priority, rule.meta, ) - for rule in default_registry.get_overrides(filters=args.module) + for rule in rules ] - print(tabulate.tabulate(table, headers="firstrow")) + print(tabulate.tabulate(table, headers=headers)) + + +def main(raw_args: Optional[List[str]] = None) -> Optional[List[OverrideRule]]: + args = parse_args(raw_args) # pragma: no cover + registry = load_registry(args) + if not registry: + print(f"No registry named {args.registry_name} found.") + return None + rules = registry.get_overrides(filters=args.module) + display_table(registry.name, rules) + return rules # for ease of testing if __name__ == "__main__": diff --git a/web_poet/overrides.py b/web_poet/overrides.py index 78d1ae60..4f9d01ee 100644 --- a/web_poet/overrides.py +++ b/web_poet/overrides.py @@ -50,21 +50,26 @@ class PageObjectRegistry: from web_poet import PageObjectRegistry - main_registry = PageObjectRegistry() - secondary_registry = PageObjectRegistry() + main_registry = PageObjectRegistry(name="main") + secondary_registry = PageObjectRegistry(name="secondary") @main_registry.handle_urls("example.com", 
overrides=ProductPageObject) @secondary_registry.handle_urls("example.com", overrides=ProductPageObject) class ExampleComProductPage(ItemPage): ... + .. warning:: + + Each :class:`~.PageObjectRegistry` instance should have a unique **name** + value. Otherwise, a ``ValueError`` is raised. + The annotation indicates that the ``ExampleComProductPage`` Page Object should be used instead of the ``ProductPageObject`` Page Object for all the URLs whose top level domain is ``example.com``. Moreover, this rule is available for the two (2) registries we've declared. This could be useful in cases wherein you want to categorize the rules by - ``PageObjectRegistry``. They could each be accessed via: + :class:`~.PageObjectRegistry`. They could each be accessed via: .. code-block:: python @@ -89,18 +94,61 @@ class ExampleComProductPage(ItemPage): with the built-in default Registry. In addition, if you need to organize your Page Objects in your project, a - single (1) default instance of the ``PageObjectRegistry`` would work, as - long as you organize your files into modules. + single (1) default instance of the :class:`~.PageObjectRegistry` would work, + as long as you organize your files into modules. The rules could then be accessed using this method: * ``default_registry.get_overrides(filters="my_scrapy_project.page_objects.site_A")`` * ``default_registry.get_overrides(filters="my_scrapy_project.page_objects.site_B")`` + + Lastly, you can access all of the :class:`~.PageObjectRegistry` that were + ever instantiated via ``web_poet.registry_pool`` which is simply a mapping + structured as ``Dict[str, PageObjectRegistry]``: + + .. code-block:: python + + from web_poet import registry_pool + + print(registry_pool) + # { + # 'default': <web_poet.overrides.PageObjectRegistry object at 0x7f47d654d8b0>, + # 'main': <web_poet.overrides.PageObjectRegistry object at 0x7f47d525c3d0>, + # 'secondary': <web_poet.overrides.PageObjectRegistry object at 0x7f47d52024c0> + # } + + .. 
warning:: + + Please be aware that there might be some :class:`~.PageObjectRegistry` + that are not available, most especially if you're using them from external + packages. + + Thus, it's imperative to use :func:`~.web_poet.overrides.consume_modules` + beforehand: + + .. code-block:: python + + from web_poet import registry_pool, consume_modules + + consume_modules("external_pkg") + + print(registry_pool) + # { + # 'default': <web_poet.overrides.PageObjectRegistry object at 0x7f47d654d8b0>, + # 'main': <web_poet.overrides.PageObjectRegistry object at 0x7f47d525c3d0>, + # 'secondary': <web_poet.overrides.PageObjectRegistry object at 0x7f47d52024c0> + # 'ecommerce': <external_pkg.PageObjectRegistry object at 0x7f47d8328420> + # } """ - def __init__(self): + def __init__(self, name: str): self._data: Dict[Callable, OverrideRule] = {} + if name in registry_pool: + raise ValueError(f"A registry named '{name}' already exists.") + registry_pool[name] = self + self.name = name + def handle_urls( self, include: Strings, @@ -221,10 +269,9 @@ def data_from(self, *pkgs_or_modules: str) -> Dict[Callable, OverrideRule]: return results -# For ease of use, we'll create a default registry so that users can simply -# use its `handle_urls()` method directly by `from web_poet import handle_urls` -default_registry = PageObjectRegistry() -handle_urls = default_registry.handle_urls +# When the `PageObjectRegistry` class is instantiated, it records itself to +# this pool so that all instances can easily be accessed later on. +registry_pool: Dict[str, PageObjectRegistry] = {} def walk_module(module: str) -> Iterable: @@ -256,7 +303,7 @@ def consume_modules(*modules: str) -> None: This function is essential to be run before attempting to retrieve all :meth:`~.PageObjectRegistry.handle_urls` annotations from :class:`~.PageObjectRegistry` - to ensure that they are properly acknowledge by importing them in runtime. + to ensure that they are properly acknowledged by importing them in runtime. 
Let's take a look at an example: From e7cca693b77295a3f083d3bcc3585b1baa94101d Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal <kevinoxy@gmail.com> Date: Thu, 13 Jan 2022 17:55:00 +0800 Subject: [PATCH 30/34] implement __hash__() in OverrideRule to easily identify uniqueness --- docs/intro/overrides.rst | 4 ++++ tests/test_overrides.py | 24 ++++++++++++++++++++++++ web_poet/overrides.py | 15 ++++++++++++++- 3 files changed, 42 insertions(+), 1 deletion(-) diff --git a/docs/intro/overrides.rst b/docs/intro/overrides.rst index bbfac9b0..e65409f0 100644 --- a/docs/intro/overrides.rst +++ b/docs/intro/overrides.rst @@ -547,6 +547,10 @@ packages** in your project, you can do it like: # 3. OverrideRule(for_patterns=Patterns(include=['site_2.com'], exclude=[], priority=500), use=<class 'gadget_sites_page_objects.site_2.GadgetSite2'>, instead_of=<class 'gadget_sites_page_objects.GadgetGenericPage'>, meta={}) # 4. OverrideRule(for_patterns=Patterns(include=['site_3.com'], exclude=[], priority=500), use=<class 'gadget_sites_page_objects.site_3.GadgetSite3'>, instead_of=<class 'gadget_sites_page_objects.GadgetGenericPage'>, meta={}) + # If there are any duplicates when combining the OverrideRules, + # you could do the following to ensure uniqueness: + combined_rules = set(combined_registry) + .. note:: Note that ``registry.get_overrides() == list(registry.data.values())``. We're diff --git a/tests/test_overrides.py b/tests/test_overrides.py index dc06e379..a7532fe4 100644 --- a/tests/test_overrides.py +++ b/tests/test_overrides.py @@ -17,11 +17,35 @@ PONestedModuleOverridenSecondary, ) from web_poet import PageObjectRegistry, default_registry, registry_pool +from web_poet.overrides import OverrideRule POS = {POTopLevel1, POTopLevel2, POModule, PONestedPkg, PONestedModule} +def test_override_rule_uniqueness(): + """The same instance of an OverrideRule with the same attribute values should + have the same hash identity. 
+ """ + + patterns = Patterns(include=["example.com"], exclude=["example.com/blog"]) + + rule1 = OverrideRule( + for_patterns=patterns, + use=POTopLevel1, + instead_of=POTopLevelOverriden2, + meta={"key_1": 1} + ) + rule2 = OverrideRule( + for_patterns=patterns, + use=POTopLevel1, + instead_of=POTopLevelOverriden2, + meta={"key_2": 2} + ) + + assert hash(rule1) == hash(rule2) + + def test_list_page_objects_all(): rules = default_registry.get_overrides() page_objects = {po.use for po in rules} diff --git a/web_poet/overrides.py b/web_poet/overrides.py index 4f9d01ee..a9180599 100644 --- a/web_poet/overrides.py +++ b/web_poet/overrides.py @@ -15,13 +15,26 @@ @dataclass class OverrideRule: """A single override rule that specifies when a Page Object should be used - instead of another.""" + instead of another. + """ for_patterns: Patterns use: Callable instead_of: Callable meta: Dict[str, Any] = field(default_factory=dict) + def __hash__(self): + # TODO: Remove this when the following has been implemented: + # - https://github.com/zytedata/url-matcher/issues/3 + pattern_hash = hash( + ( + tuple(self.for_patterns.include), + tuple(self.for_patterns.exclude), + self.for_patterns.priority, + ) + ) + return hash((pattern_hash, self.use, self.instead_of)) + def _as_list(value: Optional[Strings]) -> List[str]: """ From eab277a01983175e98a9a08c43f7a5f20c9ac8c9 Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal <kevinoxy@gmail.com> Date: Mon, 17 Jan 2022 14:16:20 +0800 Subject: [PATCH 31/34] polish documentation with better examples and discussion --- docs/intro/overrides.rst | 233 ++++++++++++++++++++++----------------- web_poet/overrides.py | 34 +++++- 2 files changed, 163 insertions(+), 104 deletions(-) diff --git a/docs/intro/overrides.rst b/docs/intro/overrides.rst index e65409f0..7ba54393 100644 --- a/docs/intro/overrides.rst +++ b/docs/intro/overrides.rst @@ -40,15 +40,18 @@ Here's where the Overrides concept comes in: then the Page Object associated with that rule 
`overrides (or replaces)` the default ``GenericProductPage``. -This enables us to fine tune our parsing logic `(which are abstracted away for -each Page Object)` depending on the page we're parsing. +This enables us to conveniently declare which Page Object would be used for a +given webpage `(based on a page's URL pattern)`. -Let's see this in action by creating Page Objects below. +Let's see this in action by declaring the Overrides in the Page Objects below. Creating Overrides ------------------ +Using Default Registry +~~~~~~~~~~~~~~~~~~~~~~ + Let's take a look at how the following code is structured: .. code-block:: python @@ -56,20 +59,24 @@ Let's take a look at how the following code is structured: from web_poet import handle_urls from web_poet.pages import ItemWebPage + class GenericProductPage(ItemWebPage): def to_item(self): return {"product title": self.css("title::text").get()} + @handle_urls("example.com", overrides=GenericProductPage) class ExampleProductPage(ItemWebPage): def to_item(self): ... # more specific parsing + @handle_urls("anotherexample.com", overrides=GenericProductPage, exclude="/digital-goods/") class AnotherExampleProductPage(ItemWebPage): def to_item(self): ... 
# more specific parsing + @handle_urls(["dualexample.com/shop/?product=*", "dualexample.net/store/?pid=*"], overrides=GenericProductPage) class DualExampleProductPage(ItemWebPage): def to_item(self): @@ -82,94 +89,101 @@ The code above declares that: - The same is true for ``DualExampleProductPage`` where it is used instead of ``GenericProductPage`` for two URL patterns which works as: - - **(match)** https://www.dualexample.com/shop/electronics/?product=123 - - **(match)** https://www.dualexample.com/shop/books/paperback/?product=849 - - (NO match) https://www.dualexample.com/on-sale/books/?product=923 - - **(match)** https://www.dualexample.net/store/kitchen/?pid=776 - - **(match)** https://www.dualexample.net/store/?pid=892 - - (NO match) https://www.dualexample.net/new-offers/fitness/?pid=892 + - :sub:`(match) https://www.dualexample.com/shop/electronics/?product=123` + - :sub:`(match) https://www.dualexample.com/shop/books/paperback/?product=849` + - :sub:`(NO match) https://www.dualexample.com/on-sale/books/?product=923` + - :sub:`(match) https://www.dualexample.net/store/kitchen/?pid=776` + - :sub:`(match) https://www.dualexample.net/store/?pid=892` + - :sub:`(NO match) https://www.dualexample.net/new-offers/fitness/?pid=892` - On the other hand, ``AnotherExampleProductPage`` is only used instead of ``GenericProductPage`` when we're parsing pages from ``anotherexample.com`` - which doesn't contain ``/digital-goods/`` in its URL path. + that doesn't contain ``/digital-goods/`` in its URL path. -The override mechanism that ``web-poet`` offers could still be further -customized. You can read some of the specific parameters and alternative ways -to organize the rules via the :ref:`Overrides API section <api-overrides>`. +.. tip:: + + The URL patterns declared in the :func:`web_poet.handle_urls` can still be + further customized. 
You can read some of the specific parameters and + alternative ways in the :ref:`API section <api-overrides>` of + :func:`web_poet.handle_urls`. + +Using Multiple Registries +~~~~~~~~~~~~~~~~~~~~~~~~~ To demonstrate another alternative way to declare the Override rules, see the code example below: .. code-block:: python + from web_poet import handle_urls, PageObjectRegistry from web_poet.pages import ItemWebPage - from web_poet import PageObjectRegistry - primary_registry = PageObjectRegistry(name="primary") - secondary_registry = PageObjectRegistry(name="secondary") + + clothes_registry = PageObjectRegistry(name="clothes") + class GenericProductPage(ItemWebPage): def to_item(self): return {"product title": self.css("title::text").get()} - @primary_registry.handle_urls("example.com", overrides=GenericProductPage) - class ExampleProductPage(ItemWebPage): - def to_item(self): - ... # more specific parsing - @secondary_registry.handle_urls("anotherexample.com", overrides=GenericProductPage, exclude="/digital-goods/") - class AnotherExampleProductPage(ItemWebPage): - def to_item(self): - ... # more specific parsing - - @primary_registry.handle_urls(["dualexample.com/shop/?product=*", "dualexample.net/store/?pid=*"], overrides=GenericProductPage) - @secondary_registry.handle_urls(["dualexample.com/shop/?product=*", "dualexample.net/store/?pid=*"], overrides=GenericProductPage) + @handle_urls(["dualexample.com/shop/?product=*", "dualexample.net/store/?pid=*"], overrides=GenericProductPage) + @clothes_registry.handle_urls("dualexample.com/shop/?category=clothes&product=*", overrides=GenericProductPage) class DualExampleProductPage(ItemWebPage): def to_item(self): ... # more specific parsing +In the example above, we're splitting the Page Objects into two separate Registries. +As you may notice, ``DualExampleProductPage`` is being declared in both of them +but with a different URL pattern. 
+ If you need more control over the Registry, you could instantiate your very own :class:`~.PageObjectRegistry` and use its ``@handle_urls`` to annotate and register the rules. This might benefit you in certain project use cases where you need more organizational control over your rules. +Such an approach could be useful especially when you're publishing your Page +Objects as an external dependency. Other projects may use it and could import +a specific Registry containing the URL rules that they may need. + Viewing all available Overrides ------------------------------- -A convenience function is available discover and retrieve all rules from your -project. Make sure to check out :ref:`Overrides API section <api-overrides>` -to see the other functionalities. +A convenience function is available to discover and retrieve all :class:`~.OverrideRule` +from your project. Make sure to check out the :meth:`~.PageObjectRegistry.get_overrides` +API section to see other functionalities. .. code-block:: python from web_poet import default_registry - # Retrieves all rules that were registered in the registry + # Retrieves all OverrideRules that were registered in the registry rules = default_registry.get_overrides() - # Or, we could also filter out the rules by the module they were defined in + # Or, we could also filter out the OverrideRules by the module they were defined in rules = default_registry.get_overrides(filters="my_project.page_objects") print(len(rules)) # 3 - print(rules[0]) # OverrideRule(for_patterns=Patterns(include=['example.com'], exclude=[], priority=500), use=<class 'my_project.page_objects.ExampleProductPage'>, instead_of=<class 'my_project.page_objects.GenericProductPage'>, meta={}) + print(rules[0]) # OverrideRule(for_patterns=Patterns(include=['example.com'], exclude=[], priority=500), use=<class 'my_project.page_objects.ExampleProductPage'>, instead_of=<class 'my_project.page_objects.GenericProductPage'>, meta={}) ..
note:: Notice in the code sample above where we could filter out the Override rules per module via the ``filters`` param. This could also offer another alternative - way to organize your Page Object rules using only the ``default_registry``. - There's no need to declare multiple :class:`~.PageObjectRegistry` instances - and use multiple annotations. + way to organize your Page Object rules by module hierarchies in your project. + This only requires using the ``default_registry``. There's no need + to declare multiple :class:`~.PageObjectRegistry` instances and use multiple + annotations. .. warning:: :meth:`~.PageObjectRegistry.get_overrides` relies on the fact that all essential - packages/modules which contains the :meth:`~.PageObjectRegistry.handle_urls` + packages/modules which contain the :func:`web_poet.handle_urls` annotations are properly loaded. Thus, for cases like importing Page Objects from another external package, you'd - need to properly load all :meth:`~.PageObjectRegistry.handle_urls` annotations - from the external module. This ensures that the external Page Objects' have + need to properly load all :func:`web_poet.handle_urls` annotations + from the external module. This ensures that the external Page Objects have their annotations properly loaded. This can be done via the function named :func:`~.web_poet.overrides.consume_modules`. @@ -185,14 +199,9 @@ to see the other functionalities. # Fortunately, `get_overrides()` provides a shortcut for the lines above: rules = default_registry.get_overrides(consume=["external_package_A.po", "another_ext_package.lib"]) - **NOTE**: :func:`~.web_poet.overrides.consume_modules` or the ``consume`` param - of :meth:`~.PageObjectRegistry.get_overrides` for the imports to properly load. - Most especially if you intend to use Page Objects from externally imported packages.
- - A handy CLI tool is also available at your disposal to quickly see the available -Override rules in a given module in your project. For example, invoking something -like ``web_poet my_project.page_objects`` would produce the following: +:class:`~.OverrideRule` in a given module in your project. For example, invoking +something like ``web_poet my_project.page_objects`` would produce the following: .. code-block:: @@ -219,7 +228,7 @@ Organizing Page Object Overrides After tackling the two (2) different approaches from the previous chapters on how to declare overrides, we can now explore how to organize them in our projects. Although it's mostly up to the developer which override declaration method to -use. Yet, we'll present some approaches depending on the situation. +use. Yet, we'll present a few different approaches depending on the situation. To put this thought into action, let's suppose we are tasked to create a Page Object Project with overrides for eCommerce websites. @@ -248,7 +257,7 @@ Using the **package-based** approach, we might organize them into something like └── product_listings.py Assuming that we've declared the Page Objects in each of the modules to use the -``default_registry`` like: +``default_registry`` as something like: .. code-block:: python @@ -262,7 +271,8 @@ Assuming that we've declared the Page Objects in each of the modules to use the def to_item(self): ... # parsers here -Then we could easily retrieve all Page Objects per subpackage or module like this: +Then we could easily retrieve all :class:`~.OverrideRule` filtered per subpackage +or module like this: .. code-block:: python @@ -284,7 +294,7 @@ Then we could easily retrieve all Page Objects per subpackage or module like thi rules = default_registry.get_overrides() # Lastly, you'd need to properly load external packages/modules for the - # @handle_urls annotation to be correctly read. + # @handle_urls annotation to be correctly read. If there are any. 
consume_modules("external_package_A.po", "another_ext_package.lib") rules = default_registry.get_overrides() @@ -300,14 +310,14 @@ Then we could easily retrieve all Page Objects per subpackage or module like thi from externally imported packages. This enables the :meth:`~.PageObjectRegistry.handle_urls` that annotates - the external Page Objects to be properly loadeded. + the external Page Objects to be properly loaded. Multiple Registry Approach ~~~~~~~~~~~~~~~~~~~~~~~~~~ The **package-based** approach heavily relies on how the developer organizes the -files into intuitive hierarchies depending on the nature of the project. There -might be cases that for some reason, a developer would want to use a **flat +project modules into intuitive hierarchies depending on the nature of the project. +There might be cases that for some reason, a developer would want to use a **flat hierarchy** like this: .. code-block:: @@ -321,9 +331,10 @@ hierarchy** like this: ├── furniture_shop_products.py └── furniture_shop_product_listings.py -As such, calling ``default_registry.get_overrides()`` with a ``from`` parameter -would not effectively work on projects with a **flat hierarchy**. Thus, we can -organize them using our own instances of the :class:`~.PageObjectRegistry` instead: +As such, calling :meth:`~.PageObjectRegistry.get_overrides` with a ``from`` +filter parameter would not effectively work on projects with a **flat hierarchy**. +Thus, we can organize them using our own instances of the :class:`~.PageObjectRegistry` +instead: .. 
code-block:: python @@ -336,22 +347,29 @@ organize them using our own instances of the :class:`~.PageObjectRegistry` inste cool_gadget_fr_registry = PageObjectRegistry(name="cool_gadget_fr") furniture_shop_registry = PageObjectRegistry(name="furniture_shop") -Note that you can access all of the :class:`~.PageObjectRegistry` that were -ever instantiated via ``web_poet.registry_pool`` which is simply a mapping -structured as ``Dict[str, PageObjectRegistry]``: +.. tip:: -.. code-block:: python + Later on, you can access all of the :class:`~.PageObjectRegistry` that were + ever instantiated. This can be done via ``web_poet.registry_pool`` which + simply holds a mapping structured as ``Dict[str, PageObjectRegistry]``. + + So after declaring the :class:`~.PageObjectRegistry` instances above, we can + view them via: + + .. code-block:: python + + from web_poet import registry_pool - from web_poet import registry_pool + print(registry_pool) + # { + # 'default': <web_poet.overrides.PageObjectRegistry object at 0x7f47d654d8b0>, + # 'cool_gadget': <my_page_obj_project.PageObjectRegistry object at 0x7f47d654382a>, + # 'cool_gadget_us': <my_page_obj_project.PageObjectRegistry object at 0xb247d65433c3>, + # 'cool_gadget_fr': <my_page_obj_project.PageObjectRegistry object at 0xd93746549dea>, + # 'furniture_shop': <my_page_obj_project.PageObjectRegistry object at 0x82n78654441b> + # } - print(registry_pool) - # { - # 'default': <web_poet.overrides.PageObjectRegistry object at 0x7f47d654d8b0>, - # 'cool_gadget' = <my_page_obj_project.PageObjectRegistry object at 0x7f47d654382a>, - # 'cool_gadget_us' = <my_page_obj_project.PageObjectRegistry object at 0xb247d65433c3>, - # 'cool_gadget_fr' = <my_page_obj_project.PageObjectRegistry object at 0xd93746549dea>, - # 'furniture_shop' = <my_page_obj_project.PageObjectRegistry object at 0x82n78654441b> - # } + Notice that the ``default`` registry will always be present. ..
warning:: @@ -360,7 +378,11 @@ structured as ``Dict[str, PageObjectRegistry]``: packages. Thus, it's imperative to use :func:`~.web_poet.overrides.consume_modules` - beforehand: + beforehand. Not only that it helps us find the :meth:`~.PageObjectRegistry.handle_urls` + annotated in external packages, but also finds the instances of + :class:`~.PageObjectRegistry` as well. + + Here's an example: .. code-block:: python @@ -378,6 +400,9 @@ structured as ``Dict[str, PageObjectRegistry]``: # 'ecommerce': <external_pkg.PageObjectRegistry object at 0xbc45d8328420> # } + Notice that the ``external_pkg.PageObjectRegistry`` named **ecommerce** has + now been successfully discovered. + After declaring the :class:`~.PageObjectRegistry` instances, they can be used in each of the Page Object packages like so: @@ -388,6 +413,7 @@ in each of the Page Object packages like so: from . import cool_gadget_registry, cool_gadget_us_registry from web_poet.pages import ItemWebPage + @cool_gadget_registry.handle_urls("coolgadgetsite.com", overrides=GenericProductPage) @cool_gadget_us_registry.handle_urls("coolgadgetsite.com", overrides=GenericProductPage) class CoolGadgetSiteProductPage(ItemWebPage): @@ -440,8 +466,8 @@ For instance, going back to our **package-based** approach organized as: ├── products.py └── product_listings.py -Suppose we'd want to get all the rules for all of the listings, then one way to -retrieve such rules would be: +Suppose we'd want to get all the rules for all of the listings `(ignoring anything +else)`, then one way to retrieve such rules would be: .. code-block:: python @@ -457,7 +483,7 @@ retrieve such rules would be: On the other hand, we can also create another :class:`~.PageObjectRegistry` instance that we'll be using aside from the ``default_registry`` to help us better organize -our Override Rules. +our :class:`~.OverrideRule`. .. code-block:: python @@ -467,9 +493,9 @@ our Override Rules. 
product_listings_registry = PageObjectRegistry(name="product_listings") -Using the additional registry instance above, we'll use it to provide another -annotation for the Page Objects in each of the ``product_listings.py`` module. -For example: +Using the new **product_listings_registry** instance above, we'll use it to +provide another annotation for the Page Objects in each of the +``product_listings.py`` modules. For example: .. code-block:: python @@ -479,13 +505,14 @@ For example: from web_poet import handle_urls # remember that this uses the default_registry from web_poet.pages import ItemWebPage + @product_listings_registry.handle_urls("coolgadgetsite.com", overrides=GenericProductPage) @handle_urls("coolgadgetsite.com", overrides=GenericProductPage) class CoolGadgetSiteProductPage(ItemWebPage): def to_item(self): ... # parsers here -Retrieving all of the Product Listing Override rules would simply be: +Retrieving all of the Product Listing :class:`~.OverrideRule` would simply be: .. code-block:: python @@ -500,24 +527,25 @@ Retrieving all of the Product Listing Override rules would simply be: Using Overrides from External Packages -------------------------------------- -Developers have the option to import existing Page Objects alongside the Override -Rules attached to them. This section aims to showcase different ways you can -play with the Registries to manipulate the Override Rules according to your needs. +Developers have the option to import existing Page Objects alongside the +:class:`~.OverrideRule` attached to them. This section aims to showcase different +ways you can play with the Registries to manipulate the :class:`~.OverrideRule` +according to your needs. Let's suppose we have the following use case before us: - - An external Python package named ``ecommerce_page_objects`` is available + - An **external** Python package named ``ecommerce_page_objects`` is available which contains Page Objects for common websites.
It's using the ``default_registry`` from **web-poet**. - Another similar package named ``gadget_sites_page_objects`` is available - for more specific websites. It's using its own registry named + for even more specific websites. It's using its own registry named ``gadget_registry``. - Your project's objectives is to handle as much eCommerce websites as you can. Thus, you'd want to use the already available packages above and perhaps improve on them or create new Page Objects for new websites. -Assuming that you'd want to **use all existing Override rules from the external -packages** in your project, you can do it like: +Assuming that you'd want to **use all existing** :class:`~.OverrideRule` **from +the external packages** in your project, you can do it like: .. code-block:: python @@ -525,6 +553,7 @@ packages** in your project, you can do it like: import gadget_sites_page_objects from web_poet import PageObjectRegistry, consume_modules, default_registry + # We're using `consume_modules()` here instead of the `consume` param of # `PageObjectRegistry.get_overrides()` since we need to access the `data` # attribute of the registry even before calling `PageObjectRegistry.get_overrides()` @@ -536,6 +565,8 @@ packages** in your project, you can do it like: # it functions like a global registry which we can access as: **default_registry.data, + # External packages not using the web_poet.default_registry would need + # to have their own registry accessed. **gadget_sites_page_objects.gadget_registry.data, } @@ -549,16 +580,17 @@ packages** in your project, you can do it like: # If there are any duplicates when combining the OverrideRules, # you could do the following to ensure uniqueness: - combined_rules = set(combined_registry) + combined_rules = set(combined_rules) .. note:: - Note that ``registry.get_overrides() == list(registry.data.values())``. 
We're - using ``registry.data`` for these cases so that we can easily look up specific - Page Objects using the ``dict``'s key. Otherwise, it may become a problem on - large cases with lots of Override rules. + Note that ``registry.get_overrides() == list(registry.data.values())``. -.. note:: + We're using ``registry.data`` for these cases so that we can easily look up + specific Page Objects using the ``dict``'s key. Otherwise, it may become a + problem on large cases with lots of :class:`~.OverrideRule`. + +.. tip:: If you don't need the entire data contents of Registries, then you can opt to use :meth:`~.PageObjectRegistry.data_from` to easily filter them out @@ -574,8 +606,10 @@ As you can see in the example above, we can easily combine the data from multipl different registries as it simply follows a ``Dict[Callable, OverrideRule]`` structure. There won't be any duplication or clashes of ``dict`` keys between registries of different external packages since the keys are the Page Object -classes intended to be used. From our example above, the ``dict`` keys from a -given ``data`` registry attribute would be: +classes intended to be used. + +From our example above, the ``dict`` keys from a given ``data`` registry +attribute would be: 1. ``<class 'ecommerce_page_objects.site_1.EcomSite1'>`` 2. ``<class 'ecommerce_page_objects.site_2.EcomSite2'>`` @@ -583,7 +617,7 @@ given ``data`` registry attribute would be: 4. ``<class 'gadget_sites_page_objects.site_3.GadgetSite3'>`` As you might've observed, combining the two Registries above may result in a -conflict for the Override rules for **#2** and **#3**: +conflict for the :class:`~.OverrideRule` for **#2** and **#3**: .. code-block:: python @@ -602,11 +636,12 @@ However, it's technically **NOT** a conflict, **yet**, since: it's only going to be utilized for **site_2.com** if the following is to be replaced: ``gadget_sites_page_objects.GadgetGenericPage``. 
-It would be only become a conflict if the **#2** and **#3** Override Rules for -**site_2.com** both intend to replace the same Page Object. In fact, none of the -Override Rules above would ever be used if your project never intends to use the -following Page Objects *(since there's nothing to override)*. You can import -these Page Objects into your project and use them so they can be overridden: +It would only become a conflict if the **#2** and **#3** :class:`~.OverrideRule` +for **site_2.com** both `intend to replace the` **same** `Page Object`. In fact, +none of the :class:`~.OverrideRule` above would ever be used if your project never +intends to use the following Page Objects *(since there's nothing to override)*. +You can import these Page Objects into your project and use them so they can be +overridden: - ``ecommerce_page_objects.EcomGenericPage`` - ``gadget_sites_page_objects.GadgetGenericPage`` @@ -636,7 +671,7 @@ Now, **#2** and **#3** have a conflict since they now both intend to replace would be the one to resolve such conflicts. However, it would help prevent future confusion if we could remove the source of -ambiguity in our Override Rules. +ambiguity in our :class:`~.OverrideRule`. Suppose, we prefer ``gadget_sites_page_objects.site_2.GadgetSite2`` more than ``ecommerce_page_objects.site_2.EcomSite2``. As such, we could remove the latter: diff --git a/web_poet/overrides.py b/web_poet/overrides.py index a9180599..60dd6f91 100644 --- a/web_poet/overrides.py +++ b/web_poet/overrides.py @@ -16,6 +16,22 @@ class OverrideRule: """A single override rule that specifies when a Page Object should be used instead of another. + + This is instantiated when using the :func:`web_poet.handle_urls` decorator. + It's also being returned as a ``List[OverrideRule]`` when calling + :meth:`~.PageObjectRegistry.get_overrides`. + + You can access any of its attributes: + + * ``for_patterns: Patterns`` - contains the URL patterns associated + with this rule.
You can read the API documentation of the + `url-matcher <https://url-matcher.readthedocs.io/>`_ package for more + information. + * ``use: Callable`` - the Page Object that will be used. + * ``instead_of: Callable`` - the Page Object that will be **replaced**. + * ``meta: Dict[str, Any] = field(default_factory=dict)`` - Any other + information you may want to store. This doesn't do anything for now + but may be useful for future API updates. """ for_patterns: Patterns @@ -67,7 +83,7 @@ class PageObjectRegistry: secondary_registry = PageObjectRegistry(name="secondary") @main_registry.handle_urls("example.com", overrides=ProductPageObject) - @secondary_registry.handle_urls("example.com", overrides=ProductPageObject) + @secondary_registry.handle_urls("example.com/shop/?id=*", overrides=ProductPageObject) class ExampleComProductPage(ItemPage): ... @@ -219,7 +235,8 @@ def wrapper(cls): def get_overrides( self, consume: Optional[Strings] = None, filters: Optional[Strings] = None ) -> List[OverrideRule]: - """Returns all Override Rules that were declared using ``@handle_urls``. + """Returns a ``List`` of :class:`~.OverrideRule` that were declared using + ``@handle_urls``. :param consume: packages/modules that need to be imported so that it can properly load the :meth:`~.PageObjectRegistry.handle_urls` annotations. @@ -267,6 +284,9 @@ def _filter_from_module(self, module: str) -> Dict[Callable, OverrideRule]: @property def data(self) -> Dict[Callable, OverrideRule]: + """Return the ``Dict[Callable, OverrideRule]`` mapping that was + registered via :meth:`web_poet.handle_urls` annotations. + """ return self._data # pragma: no cover @data.setter @@ -274,7 +294,10 @@ def data(self, value: Dict[Callable, OverrideRule]) -> None: self._data = value # pragma: no cover def data_from(self, *pkgs_or_modules: str) -> Dict[Callable, OverrideRule]: - """Return ``data`` values that are filtered by package/module.""" + """Return ``data`` values that are filtered by package/module.
+ + This can be used in lieu of :meth:`PageObjectRegistry.data`. + """ results = {} for item in pkgs_or_modules: @@ -329,14 +352,15 @@ def consume_modules(*modules: str) -> None: consume_modules("other_external_pkg.po", "another_pkg.lib") rules = default_registry.get_overrides() - For this case, the Override rules are coming from: + For this case, the ``List`` of :class:`~.OverrideRule` are coming from: - ``my_page_obj_project`` `(since it's the same module as the file above)` - ``other_external_pkg.po`` - ``another_pkg.lib`` So if the ``default_registry`` had other ``@handle_urls`` annotations outside - of the packages/modules listed above, then the Override rules won't be returned. + of the packages/modules listed above, then the :class:`~.OverrideRule` won't + be returned. .. note:: From 38e56cd8249b7f88d677ed9edb34c25ae971622f Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal <kevinoxy@gmail.com> Date: Tue, 18 Jan 2022 19:40:09 +0800 Subject: [PATCH 32/34] add more tests when PageObjectRegistry is instantiated --- tests/test_overrides.py | 10 ++++++++-- web_poet/overrides.py | 3 +++ 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/tests/test_overrides.py b/tests/test_overrides.py index a7532fe4..72f795f5 100644 --- a/tests/test_overrides.py +++ b/tests/test_overrides.py @@ -161,14 +161,20 @@ def test_registry_data_from(): def test_registry_name_conflict(): - """Registries can only have a unique name.""" + """Registries can only have valid unique names.""" PageObjectRegistry("main") assert "main" in registry_pool with pytest.raises(ValueError): - PageObjectRegistry("main") + PageObjectRegistry("main") # a duplicate name + + with pytest.raises(TypeError): + PageObjectRegistry() + + with pytest.raises(ValueError): + PageObjectRegistry("") def test_cli_tool(): diff --git a/web_poet/overrides.py b/web_poet/overrides.py index 60dd6f91..259a24cd 100644 --- a/web_poet/overrides.py +++ b/web_poet/overrides.py @@ -173,6 +173,9 @@ class 
ExampleComProductPage(ItemPage): def __init__(self, name: str): self._data: Dict[Callable, OverrideRule] = {} + if not name: + raise ValueError("A registry should have a name.") + if name in registry_pool: raise ValueError(f"A registry named '{name}' already exists.") registry_pool[name] = self From bf0b3e5e04b3bbb2a8e25f9118d271727fc4f3a4 Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal <kevinoxy@gmail.com> Date: Thu, 20 Jan 2022 19:27:32 +0800 Subject: [PATCH 33/34] update PageObjectRegistry API for manipulating rules from different registries --- docs/intro/overrides.rst | 82 ++++++++++--------------------- tests/test_overrides.py | 87 +++++++++++++++++++++++++++++---- web_poet/overrides.py | 101 ++++++++++++++++++++++++++++++++------- 3 files changed, 189 insertions(+), 81 deletions(-) diff --git a/docs/intro/overrides.rst b/docs/intro/overrides.rst index 7ba54393..fa4c6eaa 100644 --- a/docs/intro/overrides.rst +++ b/docs/intro/overrides.rst @@ -553,22 +553,21 @@ the external packages** in your project, you can do it like: import gadget_sites_page_objects from web_poet import PageObjectRegistry, consume_modules, default_registry - # We're using `consume_modules()` here instead of the `consume` param of - # `PageObjectRegistry.get_overrides()` since we need to access the `data` - # attribute of the registry even before calling `PageObjectRegistry.get_overrides()` + # `PageObjectRegistry.get_overrides()` since we need to properly load all + # of the annotated rules from the registry. 
consume_modules("ecommerce_page_objects", "gadget_sites_page_objects") combined_registry = PageObjectRegistry(name="combined") - combined_registry.data = { + combined_registry.copy_overrides_from( # Since ecommerce_page_objects is using web_poet.default_registry, then - # it functions like a global registry which we can access as: - **default_registry.data, + # it functions like a global registry which we can access simply as: + default_registry, # External packages not using the web_poet.default_registry would need # to have their own registry accessed. - **gadget_sites_page_objects.gadget_registry.data, - } + gadget_sites_page_objects.gadget_registry + ) combined_rules = combined_registry.get_overrides() @@ -578,45 +577,12 @@ the external packages** in your project, you can do it like: # 3. OverrideRule(for_patterns=Patterns(include=['site_2.com'], exclude=[], priority=500), use=<class 'gadget_sites_page_objects.site_2.GadgetSite2'>, instead_of=<class 'gadget_sites_page_objects.GadgetGenericPage'>, meta={}) # 4. OverrideRule(for_patterns=Patterns(include=['site_3.com'], exclude=[], priority=500), use=<class 'gadget_sites_page_objects.site_3.GadgetSite3'>, instead_of=<class 'gadget_sites_page_objects.GadgetGenericPage'>, meta={}) - # If there are any duplicates when combining the OverrideRules, - # you could do the following to ensure uniqueness: - combined_rules = set(combined_rules) - -.. note:: - - Note that ``registry.get_overrides() == list(registry.data.values())``. +As you can see in the example above, we can easily combine the rules from multiple +different registries. There won't be any duplication of :class:`~.OverrideRule` +entries since :meth:`PageObjectRegistry.copy_overrides_from` already deduplicates +the rules. - We're using ``registry.data`` for these cases so that we can easily look up - specific Page Objects using the ``dict``'s key. Otherwise, it may become a - problem on large cases with lots of :class:`~.OverrideRule`. - -.. 
tip:: - - If you don't need the entire data contents of Registries, then you can opt - to use :meth:`~.PageObjectRegistry.data_from` to easily filter them out - per package/module. - - Here's an example: - - .. code-block:: python - - default_registry.data_from("ecommerce_page_objects.site_1", "ecommerce_page_objects.site_2") - -As you can see in the example above, we can easily combine the data from multiple -different registries as it simply follows a ``Dict[Callable, OverrideRule]`` -structure. There won't be any duplication or clashes of ``dict`` keys between -registries of different external packages since the keys are the Page Object -classes intended to be used. - -From our example above, the ``dict`` keys from a given ``data`` registry -attribute would be: - - 1. ``<class 'ecommerce_page_objects.site_1.EcomSite1'>`` - 2. ``<class 'ecommerce_page_objects.site_2.EcomSite2'>`` - 3. ``<class 'gadget_sites_page_objects.site_2.GadgetSite2'>`` - 4. ``<class 'gadget_sites_page_objects.site_3.GadgetSite3'>`` - -As you might've observed, combining the two Registries above may result in a +You might've observed that combining the two Registries above may result in a conflict for the :class:`~.OverrideRule` for **#2** and **#3**: .. code-block:: python @@ -651,12 +617,15 @@ only intend to use it instead of the ones above. We can easily replace them like .. code-block:: python + # Our new generic Page Object that we'd prefer instead of: + # - ecommerce_page_objects.EcomGenericPage + # - gadget_sites_page_objects.GadgetGenericPage class ImprovedEcommerceGenericPage: def to_item(self): ... 
# different type of generic parsers - for _, rule in combined_registry.data.items(): - rule.instead_of = ImprovedEcommerceGenericPage + for rule in combined_registry.get_overrides(): + combined_registry.replace_override(rule, instead_of=ImprovedEcommerceGenericPage) updated_rules = combined_registry.get_overrides() @@ -678,7 +647,8 @@ Suppose, we prefer ``gadget_sites_page_objects.site_2.GadgetSite2`` more than .. code-block:: python - del combined_registry.data[ecommerce_page_objects.site_2.EcomSite2] + rules = combined_registry.search_overrides(use=ecommerce_page_objects.site_2.EcomSite2) + combined_registry.remove_overrides(*rules) updated_rules = combined_registry.get_overrides() @@ -687,10 +657,6 @@ Suppose, we prefer ``gadget_sites_page_objects.site_2.GadgetSite2`` more than # 2. OverrideRule(for_patterns=Patterns(include=['site_2.com'], exclude=[], priority=500), use=<class 'ecommerce_page_objects.site_2.EcomSite2'>, instead_of=<class 'my_project.ImprovedEcommerceGenericPage'>, meta={}) # 3. OverrideRule(for_patterns=Patterns(include=['site_3.com'], exclude=[], priority=500), use=<class 'gadget_sites_page_objects.site_3.GadgetSite3'>, instead_of=<class 'my_project.ImprovedEcommerceGenericPage'>, meta={}) -As discussed before, the Registry's data is structured simply as -``Dict[Callable, OverrideRule]`` for which we can easily manipulate it via ``dict`` -operations. - Now, suppose we want to improve ``ecommerce_page_objects.site_1.EcomSite1`` from **#1** above by perhaps adding/fixing fields. We can do that by: @@ -700,11 +666,13 @@ from **#1** above by perhaps adding/fixing fields. We can do that by: def to_item(self): ... 
# replace and improve some of the parsers here - combined_registry.data[ecommerce_page_objects.site_1.EcomSite1].use = ImprovedEcomSite1 + rules = combined_registry.search_overrides(use=ecommerce_page_objects.site_1.EcomSite1) + for rule in rules: + combined_registry.replace_override(rule, use=ImprovedEcomSite1) updated_rules = combined_registry.get_overrides() # The newly updated_rules would be as follows: - # 1. OverrideRule(for_patterns=Patterns(include=['site_1.com'], exclude=[], priority=500), use=<class 'my_project.ImprovedEcomSite1'>, instead_of=<class 'my_project.ImprovedEcommerceGenericPage'>, meta={}) - # 2. OverrideRule(for_patterns=Patterns(include=['site_2.com'], exclude=[], priority=500), use=<class 'gadget_sites_page_objects.site_2.GadgetSite2'>, instead_of=<class 'my_project.ImprovedEcommerceGenericPage'>, meta={}) - # 3. OverrideRule(for_patterns=Patterns(include=['site_3.com'], exclude=[], priority=500), use=<class 'gadget_sites_page_objects.site_3.GadgetSite3'>, instead_of=<class 'my_project.ImprovedEcommerceGenericPage'>, meta={}) + # 1. OverrideRule(for_patterns=Patterns(include=['site_2.com'], exclude=[], priority=500), use=<class 'gadget_sites_page_objects.site_2.GadgetSite2'>, instead_of=<class 'my_project.ImprovedEcommerceGenericPage'>, meta={}) + # 2. OverrideRule(for_patterns=Patterns(include=['site_3.com'], exclude=[], priority=500), use=<class 'gadget_sites_page_objects.site_3.GadgetSite3'>, instead_of=<class 'my_project.ImprovedEcommerceGenericPage'>, meta={}) + # 3.
OverrideRule(for_patterns=Patterns(include=['site_1.com'], exclude=[], priority=500), use=<class 'my_project.ImprovedEcomSite1'>, instead_of=<class 'my_project.ImprovedEcommerceGenericPage'>, meta={}) diff --git a/tests/test_overrides.py b/tests/test_overrides.py index 72f795f5..567d18ff 100644 --- a/tests/test_overrides.py +++ b/tests/test_overrides.py @@ -1,4 +1,5 @@ import argparse +import dataclasses import pytest from url_matcher import Patterns @@ -152,14 +153,6 @@ def test_list_page_objects_from_imported_registry(): assert pones.instead_of == PONestedModuleOverridenSecondary -def test_registry_data_from(): - data = default_registry.data_from("tests.po_lib.nested_package") - - assert len(data) == 2 - assert PONestedModule in data - assert PONestedPkg in data - - def test_registry_name_conflict(): """Registries can only have valid unique names.""" @@ -177,6 +170,84 @@ def test_registry_name_conflict(): PageObjectRegistry("") +def test_registry_copy_overrides_from(): + combined_registry = PageObjectRegistry("combined") + combined_registry.copy_overrides_from(default_registry, secondary_registry) + + # Copying overrides from other PageObjectRegistries should have duplicate + # OverrideRules removed. 
+ combined_rule_count = combined_registry.get_overrides() + assert len(combined_rule_count) == 7 + + raw_count = len(default_registry.get_overrides()) + len(secondary_registry.get_overrides()) + assert len(combined_rule_count) < raw_count + + # Copying overrides again does not result in duplicates + combined_registry.copy_overrides_from(default_registry, secondary_registry) + combined_registry.copy_overrides_from(default_registry, secondary_registry) + combined_registry.copy_overrides_from(default_registry, secondary_registry) + assert len(combined_rule_count) == 7 + + +def test_registry_replace_override(): + registry = PageObjectRegistry("replace") + registry.copy_overrides_from(secondary_registry) + rules = registry.get_overrides() + + replacement_rule = registry.replace_override(rules[0], instead_of=POTopLevel1) + + new_rules = registry.get_overrides() + assert len(new_rules) == 2 + assert new_rules[-1].instead_of == POTopLevel1 # newly replace rules at the bottom + assert replacement_rule.instead_of == POTopLevel1 # newly replace rules at the bottom + + # Replacing a rule not in the registry would result in ValueError + rule_not_in_registry = dataclasses.replace(new_rules[0], instead_of=POTopLevelOverriden2) + with pytest.raises(ValueError): + registry.replace_override(rule_not_in_registry, instead_of=POTopLevel2) + + +def test_registry_search_overrides(): + registry = PageObjectRegistry("search") + registry.copy_overrides_from(secondary_registry) + + rules = registry.search_overrides(use=POTopLevel2) + assert len(rules) == 1 + assert rules[0].use == POTopLevel2 + + rules = registry.search_overrides(instead_of=POTopLevelOverriden2) + assert len(rules) == 1 + assert rules[0].instead_of == POTopLevelOverriden2 + + rules = registry.search_overrides( + instead_of=PONestedModuleOverridenSecondary, use=PONestedModule + ) + assert len(rules) == 1 + assert rules[0].instead_of == PONestedModuleOverridenSecondary + assert rules[0].use == PONestedModule + + # These rules 
don't exist + rules = registry.search_overrides(use=POTopLevel1) + assert len(rules) == 0 + + rules = registry.search_overrides(instead_of=POTopLevel1) + assert len(rules) == 0 + + +def test_registry_remove_overrides(): + registry = PageObjectRegistry("remove") + registry.copy_overrides_from(secondary_registry) + + rules = registry.get_overrides() + + registry.remove_overrides(*rules) + assert len(registry.get_overrides()) == 0 + + # Removing non-existing rules won't error out. + registry.remove_overrides(*rules) + assert len(registry.get_overrides()) == 0 + + def test_cli_tool(): """Ensure that CLI parameters returns the expected results. diff --git a/web_poet/overrides.py b/web_poet/overrides.py index 259a24cd..77bc7ee2 100644 --- a/web_poet/overrides.py +++ b/web_poet/overrides.py @@ -1,9 +1,12 @@ +from __future__ import annotations # https://www.python.org/dev/peps/pep-0563/ + import importlib import importlib.util import warnings import pkgutil from collections import deque -from dataclasses import dataclass, field +from dataclasses import dataclass, field, replace +from operator import attrgetter from types import ModuleType from typing import Iterable, Optional, Union, List, Callable, Dict, Any @@ -12,7 +15,7 @@ Strings = Union[str, Iterable[str]] -@dataclass +@dataclass(frozen=True) class OverrideRule: """A single override rule that specifies when a Page Object should be used instead of another. @@ -285,28 +288,94 @@ def _filter_from_module(self, module: str) -> Dict[Callable, OverrideRule]: if cls.__module__.startswith(module + ".") or cls.__module__ == module } - @property - def data(self) -> Dict[Callable, OverrideRule]: - """Return the ``Dict[Calalble, OverrideRule]`` mapping that were - registered via :meth:`web_poet.handle_urls` annotations. + def copy_overrides_from(self, *page_object_registries: PageObjectRegistry) -> None: + """Copies the :class:`OverrideRule` data from one or more + :class:`PageObjectRegistry` instances. 
+ + Any duplicate :class:`OverrideRule` are also removed. + """ + + for registry in page_object_registries: + for rule in registry.get_overrides(): + if rule.use not in self._data: + self._data[rule.use] = rule + + def replace_override(self, rule: OverrideRule, **kwargs) -> OverrideRule: + """Given a :class:`OverrideRule`, replace its attributes with the new + ones specified. + + If the supplied :class:`OverrideRule` instance does not belong in the + registry, a ``ValueError`` is raised. + + .. note:: + + Since :class:`OverrideRule` are frozen dataclasses, this method + removes the instance of the old rule completely and instead, creates + a new instance with the newly replaced attributes. + + The new instance of the :class:`OverrideRule` with the new specified + attributes is returned. """ - return self._data # pragma: no cover - @data.setter - def data(self, value: Dict[Callable, OverrideRule]) -> None: - self._data = value # pragma: no cover + if rule not in self._data.values(): + raise ValueError(f"The given rule is not present in {self}: {rule}") + + new_rule = replace(rule, **kwargs) + del self._data[rule.use] + self._data[new_rule.use] = new_rule - def data_from(self, *pkgs_or_modules: str) -> Dict[Callable, OverrideRule]: - """Return ``data`` values that are filtered by package/module. + return new_rule + + def search_overrides(self, **kwargs) -> List[OverrideRule]: + """Returns a list of :class:`OverrideRule` in the registry whose + attributes match all of the given keyword arguments. + + Sample usage: + + .. code-block:: python + + rules = registry.search_overrides(use=ProductPO, instead_of=GenericPO) + print(len(rules)) # 1 - This can be used in lieu of :meth:`PageObjectRegistry.data`. """ - results = {} - for item in pkgs_or_modules: - results.update(self._filter_from_module(item)) + # Short-circuit operation if "use" is the only search param used, since + # we know that it's being used as the dict key. 
+ if set(["use"]) == kwargs.keys(): + rule = self._data.get(kwargs["use"]) + if rule: + return [rule] + return [] + + getter = attrgetter(*kwargs.keys()) + + def matcher(rule: OverrideRule): + attribs = getter(rule) + if not isinstance(attribs, tuple): + attribs = tuple([attribs]) + if list(attribs) == list(kwargs.values()): + return True + return False + + results = [] + for rule in self.get_overrides(): + if matcher(rule): + results.append(rule) return results + def remove_overrides(self, *rules: OverrideRule) -> None: + """Given a list of :class:`OverrideRule`, remove them from the Registry. + + Non-existing rules won't pose an issue as no errors will be raised. + """ + + for rule in rules: + if rule.use in self._data: + del self._data[rule.use] + + def __repr__(self): + return f"PageObjectRegistry(name='{self.name}')" + # When the `PageObjectRegistry` class is instantiated, it records itself to # this pool so that all instances can easily be accessed later on. From d5a5d75b53ec5246f4ad8e307bbe9c425577ccfa Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal <kevinoxy@gmail.com> Date: Tue, 1 Feb 2022 15:21:23 +0800 Subject: [PATCH 34/34] update OverrideRule __hash__() implementation after url-matcher==0.2.0 update --- web_poet/overrides.py | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) diff --git a/web_poet/overrides.py b/web_poet/overrides.py index 77bc7ee2..f1b92548 100644 --- a/web_poet/overrides.py +++ b/web_poet/overrides.py @@ -43,16 +43,7 @@ class OverrideRule: meta: Dict[str, Any] = field(default_factory=dict) def __hash__(self): - # TODO: Remove this when the following has been implemented: - # - https://github.com/zytedata/url-matcher/issues/3 - pattern_hash = hash( - ( - tuple(self.for_patterns.include), - tuple(self.for_patterns.exclude), - self.for_patterns.priority, - ) - ) - return hash((pattern_hash, self.use, self.instead_of)) + return hash((self.for_patterns, self.use, self.instead_of)) def _as_list(value: Optional[Strings]) -> 
List[str]: