From 951b2bed014e0a869e8e71a78db198e916003f00 Mon Sep 17 00:00:00 2001 From: ldbiz Date: Thu, 4 May 2023 12:58:28 +0100 Subject: [PATCH 1/5] #43 fix regex --- ukwa_api/mementos/router.py | 15 --------------- ukwa_api/mementos/schemas.py | 7 +++++-- 2 files changed, 5 insertions(+), 17 deletions(-) diff --git a/ukwa_api/mementos/router.py b/ukwa_api/mementos/router.py index ecd58a0..87c4562 100644 --- a/ukwa_api/mementos/router.py +++ b/ukwa_api/mementos/router.py @@ -112,7 +112,6 @@ async def lookup_url( collapseToFirst: str = schemas.create_query_param_from_path(schemas.path_collapse), collapseToLast: str = schemas.create_query_param_from_path(schemas.path_collapse) - ): # Basic validation and derived parameters: @@ -120,23 +119,9 @@ async def lookup_url( raise HTTPException(status_code=400, detail="Timestamp required for Closest sort.") if sort.value != "closest" and closest: raise HTTPException(status_code=400, detail="Closest Sort required for Closest Timestamp.") - - - ALLOWED_CDX_FIELDS = ["timestamp", "statuscode"] - ALLOWED_LENGTHS = range(1, 15) - - valid_collapse_options = [f"{cdx_field}:{length}" for cdx_field in ALLOWED_CDX_FIELDS for length in ALLOWED_LENGTHS] - if collapseToFirst and collapseToLast: raise HTTPException(status_code=400, detail="Only one of collapseToFirst or collapseToLast can be specified") - if collapseToFirst: - if collapseToFirst not in valid_collapse_options: - raise HTTPException(status_code=400, detail="Invalid collapseToFirst option") - elif collapseToLast: - if collapseToLast not in valid_collapse_options: - raise HTTPException(status_code=400, detail="Invalid collapseToLast option") - # Only put through allowed parameters: params = { 'url': url, diff --git a/ukwa_api/mementos/schemas.py b/ukwa_api/mementos/schemas.py index 19aafa4..c8f3ef9 100644 --- a/ukwa_api/mementos/schemas.py +++ b/ukwa_api/mementos/schemas.py @@ -51,10 +51,13 @@ class LookupOutputType(str, Enum): in other words, return only the first/last row when of the series multiple consecutive rows have the same value for the supplied field. Example: "timestamp:4" will return a single row per year (YYYY are the first 4 digits).''', - regex="^(timestamp|statuscode):\d{1,2}$" # Allow 4-14 digits + # Allow 4-14 digits for timestamp, 1-3 for status code + # note that in this case we are expecting a timestamp (string) _length_, + # rather than an actual _timestamp_ (of varying length) so the timestamp regex is different + regex="^(timestamp(:(1[0-4]|[4-9]))?|(statuscode(:[1-3])?))?$" ) -# allows us to reuse the timestamp definition as a whole +# allows us to reuse a basic param definition as a whole # rather than having having to reference the attibutes each time def create_query_param_from_path(path: Path, alias: Optional[str] = None) -> Query: query_params = { From 122440837fce859697f4825ffc525229917bd376 Mon Sep 17 00:00:00 2001 From: ldbiz Date: Mon, 15 May 2023 12:44:40 +0100 Subject: [PATCH 2/5] #43 extend api coverage - add all cdx fields with pattern matching to constrain leading characters to collapse on --- ukwa_api/mementos/schemas.py | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/ukwa_api/mementos/schemas.py b/ukwa_api/mementos/schemas.py index c8f3ef9..3a962d8 100644 --- a/ukwa_api/mementos/schemas.py +++ b/ukwa_api/mementos/schemas.py @@ -45,16 +45,24 @@ class LookupOutputType(str, Enum): regex="^\d{4,14}$", # Allow 4-14 digits ) + +# note the pattern here is to specify the RANGE of leading characters or digits of the field value to be collapsed on +# not to specify otherwise the pattern of the field value itself. see description below for timestamp example path_collapse = Path( ..., description= '''CDX Field to collapse on, optionally with :number suffix to collapse on substring of field; in other words, return only the first/last row when of the series multiple consecutive rows have the same value for the supplied field. Example: "timestamp:4" - will return a single row per year (YYYY are the first 4 digits).''', - # Allow 4-14 digits for timestamp, 1-3 for status code - # note that in this case we are expecting a timestamp (string) _length_, - # rather than an actual _timestamp_ (of varying length) so the timestamp regex is different - regex="^(timestamp(:(1[0-4]|[4-9]))?|(statuscode(:[1-3])?))?$" + will return a single row per year (YYYY are the first 4 digits).''', + regex = ( + r"^(statuscode:([1-3])|digest:(?:[1-9]|[1-3][0-9]|40)|urlkey:(?:[1-9]|[1-2][0-9]|30)|" + r"timestamp:(1[0-4]|[4-9])|mimetype:([1-9][0-9]?)|" + r"original:([1-9][0-9]?)|redirecturl:([1-9][0-9]?)|" + r"filename:([1-9][0-9]?)|robotflags:([1-9])|" + r"offset:(?:[1-9]|1[0-2])|length:(?:[1-9]|1[0-2])|" + r"(urlkey|timestamp|original|mimetype|statuscode|digest|length|offset|filename|redirect|robotflags)?)$" + + ) ) # allows us to reuse a basic param definition as a whole From 4354645122c4dfe46be3effb94f6607784c94002 Mon Sep 17 00:00:00 2001 From: ldbiz Date: Tue, 16 May 2023 13:15:48 +0100 Subject: [PATCH 3/5] #43 extend api coverage - extend urlkey range allowed in Collapse param --- ukwa_api/mementos/schemas.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ukwa_api/mementos/schemas.py b/ukwa_api/mementos/schemas.py index 3a962d8..2672a22 100644 --- a/ukwa_api/mementos/schemas.py +++ b/ukwa_api/mementos/schemas.py @@ -55,7 +55,7 @@ class LookupOutputType(str, Enum): have the same value for the supplied field. Example: "timestamp:4" will return a single row per year (YYYY are the first 4 digits).''', regex = ( - r"^(statuscode:([1-3])|digest:(?:[1-9]|[1-3][0-9]|40)|urlkey:(?:[1-9]|[1-2][0-9]|30)|" + r"^(statuscode:([1-3])|digest:(?:[1-9]|[1-3][0-9]|40)|urlkey:([1-9][0-9]?)|" r"timestamp:(1[0-4]|[4-9])|mimetype:([1-9][0-9]?)|" r"original:([1-9][0-9]?)|redirecturl:([1-9][0-9]?)|" r"filename:([1-9][0-9]?)|robotflags:([1-9])|" From a10ff37b36ecaefe29b16450806187efba6ce148 Mon Sep 17 00:00:00 2001 From: ldbiz Date: Thu, 25 May 2023 13:51:13 +0100 Subject: [PATCH 4/5] #43 extend api coverage - clarification in comments --- ukwa_api/mementos/schemas.py | 1 + 1 file changed, 1 insertion(+) diff --git a/ukwa_api/mementos/schemas.py b/ukwa_api/mementos/schemas.py index 2672a22..dfac825 100644 --- a/ukwa_api/mementos/schemas.py +++ b/ukwa_api/mementos/schemas.py @@ -48,6 +48,7 @@ class LookupOutputType(str, Enum): # note the pattern here is to specify the RANGE of leading characters or digits of the field value to be collapsed on # not to specify otherwise the pattern of the field value itself. see description below for timestamp example +# it is more restrictive than the CDX endpoint, but meets sensible use cases following the CDX spec path_collapse = Path( ..., description= '''CDX Field to collapse on, optionally with :number suffix to collapse on substring of field; From 0e91f9a75ff509086db73a56e0875b7724ccb797 Mon Sep 17 00:00:00 2001 From: ldbiz Date: Fri, 2 Jun 2023 10:32:01 +0100 Subject: [PATCH 5/5] #43 extend api coverage - redirect fieldname incorrect --- ukwa_api/mementos/schemas.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ukwa_api/mementos/schemas.py b/ukwa_api/mementos/schemas.py index dfac825..1ba4f84 100644 --- a/ukwa_api/mementos/schemas.py +++ b/ukwa_api/mementos/schemas.py @@ -61,7 +61,7 @@ class LookupOutputType(str, Enum): r"original:([1-9][0-9]?)|redirecturl:([1-9][0-9]?)|" r"filename:([1-9][0-9]?)|robotflags:([1-9])|" r"offset:(?:[1-9]|1[0-2])|length:(?:[1-9]|1[0-2])|" - r"(urlkey|timestamp|original|mimetype|statuscode|digest|length|offset|filename|redirect|robotflags)?)$" + r"(urlkey|timestamp|original|mimetype|statuscode|digest|length|offset|filename|redirecturl|robotflags)?)$" ) )