From 5a4f18dc79de61a0a703c65076c4a1e0274e780a Mon Sep 17 00:00:00 2001 From: Anton Burnashev Date: Wed, 10 Apr 2024 15:36:51 +0300 Subject: [PATCH 1/5] Extend the docstrings for the paginators --- dlt/sources/helpers/rest_client/paginators.py | 157 +++++++++++++++--- 1 file changed, 137 insertions(+), 20 deletions(-) diff --git a/dlt/sources/helpers/rest_client/paginators.py b/dlt/sources/helpers/rest_client/paginators.py index f34557bdfe..dbd9faa1f2 100644 --- a/dlt/sources/helpers/rest_client/paginators.py +++ b/dlt/sources/helpers/rest_client/paginators.py @@ -7,35 +7,48 @@ class BasePaginator(ABC): + """ + Base class for all paginator implementations. Paginators are used + to handle paginated responses from RESTful APIs. + + See `RESTClient.paginate()` for example usage. + """ def __init__(self) -> None: self._has_next_page = True @property def has_next_page(self) -> bool: - """ - Check if there is a next page available. + """Determines if there is a next page available. Returns: - bool: True if there is a next page available, False otherwise. + bool: True if a next page is available, otherwise False. """ return self._has_next_page @abstractmethod def update_state(self, response: Response) -> None: - """Update the paginator state based on the response. + """Updates the paginator's state based on the response from the API. + + This method should extract necessary pagination details (like next page + references) from the response and update the paginator's state + accordingly. Args: - response (Response): The response object from the API. + response (Response): The response object from the API request. """ ... @abstractmethod def update_request(self, request: Request) -> None: - """ - Update the request object with the next arguments for the API request. + """Updates the request object with arguments for fetching the next page. 
+ + This method should modify the request object to include necessary + details (like URLs or parameters) for requesting the next page based on + the current state of the paginator. Args: - request (Request): The request object to be updated. + request (Request): The request object to be updated for the next + page fetch. """ ... @@ -51,8 +64,31 @@ def update_request(self, request: Request) -> None: class OffsetPaginator(BasePaginator): - """A paginator that uses the 'offset' parameter for pagination.""" - + """A paginator that uses offset-based pagination strategy. + + This paginator is useful for APIs where pagination is controlled + through offset and limit query parameters and the total count of items + is returned in the response. + + Example: + Assuming an API at `https://api.example.com/items` supports offset + and limit for pagination, and includes the total count in its responses + e.g. (`{"items": [...], "total": 1000}`), we can create a client + with an `OffsetPaginator` like this: + + >>> from dlt.sources.helpers.rest_client import RESTClient + >>> client = RESTClient( + ... base_url="https://api.example.com", + ... paginator=OffsetPaginator( + ... initial_limit=100, + ... total_path="total" + ... ) + ... ) + >>> @dlt.resource + ... def get_items(): + ... for page in client.paginate("/items", params={"limit": 100}): + ... yield page + """ def __init__( self, initial_limit: int, @@ -61,6 +97,19 @@ def __init__( limit_param: str = "limit", total_path: jsonpath.TJsonPath = "total", ) -> None: + """ + Args: + initial_limit (int): The maximum number of items to retrieve + in each request. + initial_offset (int): The offset for the first request. + Defaults to 0. + offset_param (str): The query parameter name for the offset. + Defaults to 'offset'. + limit_param (str): The query parameter name for the limit. + Defaults to 'limit'. + total_path (jsonpath.TJsonPath): The JSONPath expression for + the total number of items. 
+ """ super().__init__() self.offset_param = offset_param self.limit_param = limit_param @@ -70,6 +119,7 @@ def __init__( self.limit = initial_limit def update_state(self, response: Response) -> None: + """Extracts the total count from the response and updates the offset.""" values = jsonpath.find_values(self.total_path, response.json()) total = values[0] if values else None @@ -90,6 +140,7 @@ def update_state(self, response: Response) -> None: self._has_next_page = False def update_request(self, request: Request) -> None: + """Updates the request with the offset and limit query parameters.""" if request.params is None: request.params = {} @@ -104,10 +155,22 @@ def __init__(self) -> None: @property def _next_reference(self) -> Optional[str]: + """The reference to the next page, such as a URL or a cursor. + + Returns: + Optional[str]: The reference to the next page if available, + otherwise None. + """ return self.__next_reference @_next_reference.setter def _next_reference(self, value: Optional[str]) -> None: + """Sets the reference to the next page and updates the availability + of the next page. + + Args: + value (Optional[str]): The reference to the next page. + """ self.__next_reference = value self._has_next_page = value is not None @@ -129,48 +192,98 @@ class HeaderLinkPaginator(BaseNextUrlPaginator): A good example of this is the GitHub API: https://docs.github.com/en/rest/guides/traversing-with-pagination + + Example: + >>> from dlt.sources.helpers.rest_client import RESTClient + >>> client = RESTClient( + ... base_url="https://api.github.com", + ... paginator=HeaderLinkPaginator() + ... ) + >>> @dlt.resource + ... def get_issues(): + ... for page in client.paginate("/repos/dlt-hub/dlt/issues"): + ... yield page """ def __init__(self, links_next_key: str = "next") -> None: """ Args: - links_next_key (str, optional): The key (rel ) in the 'Link' header + links_next_key (str, optional): The key (rel) in the 'Link' header that contains the next page URL. 
Defaults to 'next'. """ super().__init__() self.links_next_key = links_next_key def update_state(self, response: Response) -> None: + """Extracts the next page URL from the 'Link' header in the response.""" self._next_reference = response.links.get(self.links_next_key, {}).get("url") class JSONResponsePaginator(BaseNextUrlPaginator): - """A paginator that uses a specific key in the JSON response to find - the next page URL. + """Locates the next page URL within the JSON response body. The key + containing the URL can be specified using a JSON path. + + Example: + Suppose the JSON response from an API contains a 'pagination' object + with a 'next' key like this: + `{"items": [...], "pagination": {"next": "https://api.example.com/items?page=2"}}`. + + We can create a client with a `JSONResponsePaginator` this way: + + >>> from dlt.sources.helpers.rest_client import RESTClient + >>> client = RESTClient( + ... base_url="https://api.example.com", + ... paginator=JSONResponsePaginator(next_url_path="pagination.next") + ... ) + >>> @dlt.resource + ... def get_data(): + ... for page in client.paginate("/posts"): + ... yield page """ - def __init__( self, next_url_path: jsonpath.TJsonPath = "next", ): """ Args: - next_url_path: The JSON path to the key that contains the next page URL in the response. + next_url_path (jsonpath.TJsonPath): The JSON path to the key + containing the next page URL in the response body. Defaults to 'next'. """ super().__init__() self.next_url_path = jsonpath.compile_path(next_url_path) def update_state(self, response: Response) -> None: + """Extracts the next page URL from the JSON response.""" values = jsonpath.find_values(self.next_url_path, response.json()) self._next_reference = values[0] if values else None class JSONResponseCursorPaginator(BaseReferencePaginator): - """A paginator that uses a cursor query param to paginate. The cursor for the - next page is found in the JSON response. 
+ """Uses a cursor parameter for pagination, with the cursor value found in + the JSON response body. + + Example: + Suppose the JSON response from an API contains a 'cursors' object with + a 'next' key like this: + `{"items": [...], "cursors": {"next": "eyJpZCI6MjM0fQ"}}` and the API + expects a 'cursor' query parameter to fetch the next page. + + We can create a client with a `JSONResponseCursorPaginator` this way: + + >>> from dlt.sources.helpers.rest_client import RESTClient + >>> client = RESTClient( + ... base_url="https://api.example.com", + ... paginator=JSONResponseCursorPaginator( + ... cursor_path="cursors.next", + ... cursor_param="cursor" + ... ) + ... ) + >>> @dlt.resource + ... def get_data(): + ... for page in client.paginate("/posts"): + ... yield page """ - def __init__( self, cursor_path: jsonpath.TJsonPath = "cursors.next", @@ -178,18 +291,22 @@ def __init__( ): """ Args: - cursor_path: The JSON path to the key that contains the cursor in the response. - cursor_param: The name of the query parameter to be used in the request to get the next page. + cursor_path: The JSON path to the key that contains the cursor in + the response. + cursor_param: The name of the query parameter to be used in + the request to get the next page. 
""" super().__init__() self.cursor_path = jsonpath.compile_path(cursor_path) self.cursor_param = cursor_param def update_state(self, response: Response) -> None: + """Extracts the cursor value from the JSON response.""" values = jsonpath.find_values(self.cursor_path, response.json()) self._next_reference = values[0] if values else None def update_request(self, request: Request) -> None: + """Updates the request with the cursor query parameter.""" if request.params is None: request.params = {} From 3b4257fb55948b2c3070bb0e3b649b1cd210af05 Mon Sep 17 00:00:00 2001 From: Anton Burnashev Date: Wed, 10 Apr 2024 15:41:57 +0300 Subject: [PATCH 2/5] Format --- dlt/sources/helpers/rest_client/paginators.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/dlt/sources/helpers/rest_client/paginators.py b/dlt/sources/helpers/rest_client/paginators.py index dbd9faa1f2..1788905d82 100644 --- a/dlt/sources/helpers/rest_client/paginators.py +++ b/dlt/sources/helpers/rest_client/paginators.py @@ -13,6 +13,7 @@ class BasePaginator(ABC): See `RESTClient.paginate()` for example usage. """ + def __init__(self) -> None: self._has_next_page = True @@ -89,6 +90,7 @@ class OffsetPaginator(BasePaginator): ... for page in client.paginate("/items", params={"limit": 100}): ... yield page """ + def __init__( self, initial_limit: int, @@ -176,6 +178,16 @@ def _next_reference(self, value: Optional[str]) -> None: class BaseNextUrlPaginator(BaseReferencePaginator): + """ + A base paginator class for paginators that use a URL provided in the API + response to fetch the next page. For example, the URL can be found in HTTP + headers or in the JSON response. + + Subclasses should implement the `update_state` method to extract the next + page URL and set the `next_reference` attribute accordingly. + + See `HeaderLinkPaginator` and `JSONResponsePaginator` for examples. 
+ """ def update_request(self, request: Request) -> None: # Handle relative URLs if self._next_reference: @@ -240,6 +252,7 @@ class JSONResponsePaginator(BaseNextUrlPaginator): ... for page in client.paginate("/posts"): ... yield page """ + def __init__( self, next_url_path: jsonpath.TJsonPath = "next", @@ -284,6 +297,7 @@ class JSONResponseCursorPaginator(BaseReferencePaginator): ... for page in client.paginate("/posts"): ... yield page """ + def __init__( self, cursor_path: jsonpath.TJsonPath = "cursors.next", From d5c02c76e35bf2a55ea0fc418cfa6b1c9b6f35e3 Mon Sep 17 00:00:00 2001 From: Anton Burnashev Date: Wed, 10 Apr 2024 20:42:21 +0300 Subject: [PATCH 3/5] Add an example data to HeaderLinkPaginator --- dlt/sources/helpers/rest_client/paginators.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/dlt/sources/helpers/rest_client/paginators.py b/dlt/sources/helpers/rest_client/paginators.py index 1788905d82..035585a974 100644 --- a/dlt/sources/helpers/rest_client/paginators.py +++ b/dlt/sources/helpers/rest_client/paginators.py @@ -206,6 +206,21 @@ class HeaderLinkPaginator(BaseNextUrlPaginator): https://docs.github.com/en/rest/guides/traversing-with-pagination Example: + Consider an API response that includes 'Link' header: + + ... + Content-Type: application/json + Link: <https://api.example.com/items?page=2>; rel="next", <https://api.example.com/items?page=1>; rel="prev" + + { + "items": [...] + } + + In this scenario, the URL for the next page (`https://api.example.com/items?page=2`) + is identified by its relation type `rel="next"`. + `HeaderLinkPaginator` extracts this URL from the 'Link' header and uses it to + fetch the next page of results: + >>> from dlt.sources.helpers.rest_client import RESTClient >>> client = RESTClient( ... 
base_url="https://api.github.com", From b8771df7a90f75c766ba9a91f991d0ae74ad3cd7 Mon Sep 17 00:00:00 2001 From: Anton Burnashev Date: Thu, 11 Apr 2024 00:23:57 +0300 Subject: [PATCH 4/5] Fix examples formatting --- dlt/sources/helpers/rest_client/paginators.py | 186 +++++++++++------- 1 file changed, 112 insertions(+), 74 deletions(-) diff --git a/dlt/sources/helpers/rest_client/paginators.py b/dlt/sources/helpers/rest_client/paginators.py index 035585a974..c78734792f 100644 --- a/dlt/sources/helpers/rest_client/paginators.py +++ b/dlt/sources/helpers/rest_client/paginators.py @@ -71,24 +71,35 @@ class OffsetPaginator(BasePaginator): through offset and limit query parameters and the total count of items is returned in the response. - Example: - Assuming an API at `https://api.example.com/items` supports offset - and limit for pagination, and includes the total count in its responses - e.g. (`{"items": [...], "total": 1000}`), we can create a client - with an `OffsetPaginator` like this: - - >>> from dlt.sources.helpers.rest_client import RESTClient - >>> client = RESTClient( - ... base_url="https://api.example.com", - ... paginator=OffsetPaginator( - ... initial_limit=100, - ... total_path="total" - ... ) - ... ) - >>> @dlt.resource - ... def get_items(): - ... for page in client.paginate("/items", params={"limit": 100}): - ... 
yield page + For example, consider an API located at `https://api.example.com/items` + that supports pagination through offset and limit, and provides the total + item count in its responses, as shown below: + + { + "items": [...], + "total": 1000 + } + + To use `OffsetPaginator` with such an API, you can instantiate `RESTClient` + as follows: + + from dlt.sources.helpers.rest_client import RESTClient + + client = RESTClient( + base_url="https://api.example.com", + paginator=OffsetPaginator( + initial_limit=100, + total_path="total" + ) + ) + @dlt.resource + def get_items(): + for page in client.paginate("/items", params={"limit": 100}): + yield page + + Note that we pass the `limit` parameter in the initial request to the API. + The `OffsetPaginator` will automatically increment the offset for each + subsequent request until all items are fetched. """ def __init__( @@ -205,31 +216,32 @@ class HeaderLinkPaginator(BaseNextUrlPaginator): A good example of this is the GitHub API: https://docs.github.com/en/rest/guides/traversing-with-pagination - Example: - Consider an API response that includes 'Link' header: + For example, consider an API response that includes 'Link' header: ... Content-Type: application/json Link: <https://api.example.com/items?page=2>; rel="next", <https://api.example.com/items?page=1>; rel="prev" - { - "items": [...] - } - - In this scenario, the URL for the next page (`https://api.example.com/items?page=2`) - is identified by its relation type `rel="next"`. - `HeaderLinkPaginator` extracts this URL from the 'Link' header and uses it to - fetch the next page of results: - - >>> from dlt.sources.helpers.rest_client import RESTClient - >>> client = RESTClient( - ... base_url="https://api.github.com", - ... paginator=HeaderLinkPaginator() - ... ) - >>> @dlt.resource - ... def get_issues(): - ... for page in client.paginate("/repos/dlt-hub/dlt/issues"): - ... yield page + [ + {"id": 1, "name": "item1"}, + {"id": 2, "name": "item2"}, + ... 
+ ] + + In this scenario, the URL for the next page (`https://api.example.com/items?page=2`) + is identified by its relation type `rel="next"`. `HeaderLinkPaginator` extracts + this URL from the 'Link' header and uses it to fetch the next page of results: + + from dlt.sources.helpers.rest_client import RESTClient + client = RESTClient( + base_url="https://api.example.com", + paginator=HeaderLinkPaginator() + ) + + @dlt.resource + def get_issues(): + for page in client.paginate("/items"): + yield page """ def __init__(self, links_next_key: str = "next") -> None: @@ -250,22 +262,34 @@ class JSONResponsePaginator(BaseNextUrlPaginator): """Locates the next page URL within the JSON response body. The key containing the URL can be specified using a JSON path. - Example: - Suppose the JSON response from an API contains a 'pagination' object - with a 'next' key like this: - `{"items": [...], "pagination": {"next": "https://api.example.com/items?page=2"}}`. - - We can create a client with a `JSONResponsePaginator` this way: - - >>> from dlt.sources.helpers.rest_client import RESTClient - >>> client = RESTClient( - ... base_url="https://api.example.com", - ... paginator=JSONResponsePaginator(next_url_path="pagination.next") - ... ) - >>> @dlt.resource - ... def get_data(): - ... for page in client.paginate("/posts"): - ... yield page + For example, suppose the JSON response from an API contains data items + along with a 'pagination' object: + + { + "items": [ + {"id": 1, "name": "item1"}, + {"id": 2, "name": "item2"}, + ... + ], + "pagination": { + "next": "https://api.example.com/items?page=2" + } + } + + The link to the next page (`https://api.example.com/items?page=2`) is + located in the 'next' key of the 'pagination' object. 
You can use + `JSONResponsePaginator` to paginate through the API endpoint: + + from dlt.sources.helpers.rest_client import RESTClient + client = RESTClient( + base_url="https://api.example.com", + paginator=JSONResponsePaginator(next_url_path="pagination.next") + ) + + @dlt.resource + def get_data(): + for page in client.paginate("/posts"): + yield page """ def __init__( @@ -291,26 +315,40 @@ class JSONResponseCursorPaginator(BaseReferencePaginator): """Uses a cursor parameter for pagination, with the cursor value found in the JSON response body. - Example: - Suppose the JSON response from an API contains a 'cursors' object with - a 'next' key like this: - `{"items": [...], "cursors": {"next": "eyJpZCI6MjM0fQ"}}` and the API - expects a 'cursor' query parameter to fetch the next page. - - We can create a client with a `JSONResponseCursorPaginator` this way: - - >>> from dlt.sources.helpers.rest_client import RESTClient - >>> client = RESTClient( - ... base_url="https://api.example.com", - ... paginator=JSONResponseCursorPaginator( - ... cursor_path="cursors.next", - ... cursor_param="cursor" - ... ) - ... ) - >>> @dlt.resource - ... def get_data(): - ... for page in client.paginate("/posts"): - ... yield page + For example, suppose the JSON response from an API contains + a 'cursors' object: + + { + "items": [ + {"id": 1, "name": "item1"}, + {"id": 2, "name": "item2"}, + ... + ], + "cursors": { + "next": "aW1wb3J0IGFudGlncmF2aXR5" + } + } + + And the API endpoint expects a 'cursor' query parameter to fetch + the next page. So the URL for the next page would look + like `https://api.example.com/items?cursor=aW1wb3J0IGFudGlncmF2aXR5`. 
+ + You can paginate through this API endpoint using + `JSONResponseCursorPaginator`: + + from dlt.sources.helpers.rest_client import RESTClient + client = RESTClient( + base_url="https://api.example.com", + paginator=JSONResponseCursorPaginator( + cursor_path="cursors.next", + cursor_param="cursor" + ) + ) + + @dlt.resource + def get_data(): + for page in client.paginate("/posts"): + yield page """ def __init__( From 3fe37aeb9646662447503c87a25701fb03f3a459 Mon Sep 17 00:00:00 2001 From: Anton Burnashev Date: Thu, 25 Apr 2024 14:32:31 +0300 Subject: [PATCH 5/5] Extend BasePaginator and BaseReferencePaginator docstrings --- dlt/sources/helpers/rest_client/paginators.py | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/dlt/sources/helpers/rest_client/paginators.py b/dlt/sources/helpers/rest_client/paginators.py index c78734792f..a46558e4ab 100644 --- a/dlt/sources/helpers/rest_client/paginators.py +++ b/dlt/sources/helpers/rest_client/paginators.py @@ -7,8 +7,7 @@ class BasePaginator(ABC): - """ - Base class for all paginator implementations. Paginators are used + """A base class for all paginator implementations. Paginators are used to handle paginated responses from RESTful APIs. See `RESTClient.paginate()` for example usage. @@ -32,7 +31,8 @@ def update_state(self, response: Response) -> None: This method should extract necessary pagination details (like next page references) from the response and update the paginator's state - accordingly. + accordingly. It should also set the `_has_next_page` attribute to + indicate if there is a next page available. Args: response (Response): The response object from the API request. @@ -162,6 +162,15 @@ def update_request(self, request: Request) -> None: class BaseReferencePaginator(BasePaginator): + """A base paginator class for paginators that use a reference to the next + page, such as a URL or a cursor string. + + Subclasses should implement: + 1. 
`update_state` method to extract the next page reference and + set the `_next_reference` attribute accordingly. + 2. `update_request` method to update the request object with the next + page reference. + """ def __init__(self) -> None: super().__init__() self.__next_reference: Optional[str] = None @@ -195,7 +204,7 @@ class BaseNextUrlPaginator(BaseReferencePaginator): headers or in the JSON response. Subclasses should implement the `update_state` method to extract the next - page URL and set the `next_reference` attribute accordingly. + page URL and set the `_next_reference` attribute accordingly. See `HeaderLinkPaginator` and `JSONResponsePaginator` for examples. """