diff --git a/ukwa_api/mementos/router.py b/ukwa_api/mementos/router.py index ecd58a0..87c4562 100644 --- a/ukwa_api/mementos/router.py +++ b/ukwa_api/mementos/router.py @@ -112,7 +112,6 @@ async def lookup_url( collapseToFirst: str = schemas.create_query_param_from_path(schemas.path_collapse), collapseToLast: str = schemas.create_query_param_from_path(schemas.path_collapse) - ): # Basic validation and derived parameters: @@ -120,23 +119,9 @@ async def lookup_url( raise HTTPException(status_code=400, detail="Timestamp required for Closest sort.") if sort.value != "closest" and closest: raise HTTPException(status_code=400, detail="Closest Sort required for Closest Timestamp.") - - - ALLOWED_CDX_FIELDS = ["timestamp", "statuscode"] - ALLOWED_LENGTHS = range(1, 15) - - valid_collapse_options = [f"{cdx_field}:{length}" for cdx_field in ALLOWED_CDX_FIELDS for length in ALLOWED_LENGTHS] - if collapseToFirst and collapseToLast: raise HTTPException(status_code=400, detail="Only one of collapseToFirst or collapseToLast can be specified") - if collapseToFirst: - if collapseToFirst not in valid_collapse_options: - raise HTTPException(status_code=400, detail="Invalid collapseToFirst option") - elif collapseToLast: - if collapseToLast not in valid_collapse_options: - raise HTTPException(status_code=400, detail="Invalid collapseToLast option") - # Only put through allowed parameters: params = { 'url': url, diff --git a/ukwa_api/mementos/schemas.py b/ukwa_api/mementos/schemas.py index 19aafa4..1ba4f84 100644 --- a/ukwa_api/mementos/schemas.py +++ b/ukwa_api/mementos/schemas.py @@ -45,16 +45,28 @@ class LookupOutputType(str, Enum): regex="^\d{4,14}$", # Allow 4-14 digits ) + +# note the pattern here is to specify the RANGE of leading characters or digits of the field value to be collapsed on +# not to specify otherwise the pattern of the field value itself. see description below for timestamp example +# it is more restrictive than the CDX endpoint, but meets sensible use cases following the CDX spec path_collapse = Path( ..., description= '''CDX Field to collapse on, optionally with :number suffix to collapse on substring of field; in other words, return only the first/last row when of the series multiple consecutive rows have the same value for the supplied field. Example: "timestamp:4" - will return a single row per year (YYYY are the first 4 digits).''', - regex="^(timestamp|statuscode):\d{1,2}$" # Allow 4-14 digits + will return a single row per year (YYYY are the first 4 digits).''', + regex = ( + r"^(statuscode:([1-3])|digest:(?:[1-9]|[1-3][0-9]|40)|urlkey:([1-9][0-9]?)|" + r"timestamp:(1[0-4]|[4-9])|mimetype:([1-9][0-9]?)|" + r"original:([1-9][0-9]?)|redirecturl:([1-9][0-9]?)|" + r"filename:([1-9][0-9]?)|robotflags:([1-9])|" + r"offset:(?:[1-9]|1[0-2])|length:(?:[1-9]|1[0-2])|" + r"(urlkey|timestamp|original|mimetype|statuscode|digest|length|offset|filename|redirecturl|robotflags)?)$" + + ) ) -# allows us to reuse the timestamp definition as a whole +# allows us to reuse a basic param definition as a whole # rather than having having to reference the attibutes each time def create_query_param_from_path(path: Path, alias: Optional[str] = None) -> Query: query_params = {