Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

43 Rework changes #47

Open
wants to merge 9 commits into
base: dev
Choose a base branch
from
15 changes: 0 additions & 15 deletions ukwa_api/mementos/router.py
Original file line number Diff line number Diff line change
Expand Up @@ -112,31 +112,16 @@ async def lookup_url(

collapseToFirst: str = schemas.create_query_param_from_path(schemas.path_collapse),
collapseToLast: str = schemas.create_query_param_from_path(schemas.path_collapse)

):

# Basic validation and derived parameters:
if sort.value == "closest" and not closest:
raise HTTPException(status_code=400, detail="Timestamp required for Closest sort.")
if sort.value != "closest" and closest:
raise HTTPException(status_code=400, detail="Closest Sort required for Closest Timestamp.")


ALLOWED_CDX_FIELDS = ["timestamp", "statuscode"]
ALLOWED_LENGTHS = range(1, 15)

valid_collapse_options = [f"{cdx_field}:{length}" for cdx_field in ALLOWED_CDX_FIELDS for length in ALLOWED_LENGTHS]

if collapseToFirst and collapseToLast:
raise HTTPException(status_code=400, detail="Only one of collapseToFirst or collapseToLast can be specified")

if collapseToFirst:
if collapseToFirst not in valid_collapse_options:
raise HTTPException(status_code=400, detail="Invalid collapseToFirst option")
elif collapseToLast:
if collapseToLast not in valid_collapse_options:
raise HTTPException(status_code=400, detail="Invalid collapseToLast option")

# Only put through allowed parameters:
params = {
'url': url,
Expand Down
18 changes: 15 additions & 3 deletions ukwa_api/mementos/schemas.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,16 +45,28 @@ class LookupOutputType(str, Enum):
regex="^\d{4,14}$", # Allow 4-14 digits
)


# Note: the pattern here specifies the RANGE (i.e. the allowed count) of leading characters or digits
# of the field value to collapse on; it does not otherwise specify the pattern of the field value itself.
# See the timestamp example in the description below.
# It is more restrictive than the CDX endpoint, but covers the sensible use cases that follow the CDX spec.
path_collapse = Path(
...,
description= '''CDX Field to collapse on, optionally with :number suffix to collapse on substring of field;
in other words, return only the first/last row when of the series multiple consecutive rows
have the same value for the supplied field. Example: "timestamp:4"
will return a single row per year (YYYY are the first 4 digits).''',
regex="^(timestamp|statuscode):\d{1,2}$" # Allow 4-14 digits
will return a single row per year (YYYY are the first 4 digits).''',
regex = (
r"^(statuscode:([1-3])|digest:(?:[1-9]|[1-3][0-9]|40)|urlkey:([1-9][0-9]?)|"
r"timestamp:(1[0-4]|[4-9])|mimetype:([1-9][0-9]?)|"
r"original:([1-9][0-9]?)|redirecturl:([1-9][0-9]?)|"
r"filename:([1-9][0-9]?)|robotflags:([1-9])|"
r"offset:(?:[1-9]|1[0-2])|length:(?:[1-9]|1[0-2])|"
r"(urlkey|timestamp|original|mimetype|statuscode|digest|length|offset|filename|redirecturl|robotflags)?)$"

)
)

# allows us to reuse the timestamp definition as a whole
# allows us to reuse a basic param definition as a whole
# rather than having to reference the attributes each time
def create_query_param_from_path(path: Path, alias: Optional[str] = None) -> Query:
query_params = {
Expand Down