Skip to content

Commit

Permalink
[app][rfct] update code to p3.6+
Browse files Browse the repository at this point in the history
  • Loading branch information
M3ssman committed Sep 27, 2024
1 parent 83ad5e6 commit 1b8435b
Show file tree
Hide file tree
Showing 3 changed files with 43 additions and 43 deletions.
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

[project]
name = "digiflow"
version = "5.2.6"
version = "5.3.6"
description = "Father's Little Digitization Workflow Helper"
readme = "README.md"
requires-python = ">=3.8"
Expand Down
80 changes: 40 additions & 40 deletions src/digiflow/digiflow_io.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,17 @@
import digiflow.common as dfc
import digiflow.digiflow_metadata as df_md

# appease pylint: silence c-extension-no-member warnings
# pylint:disable=c-extension-no-member

OAI_KWARG_FGROUP_IMG = "fgroup_images"
DEFAULT_FGROUP_IMG = "MAX"
OAI_KWARG_FGROUP_OCR = "fgroup_ocr"
DEFAULT_FGROUP_OCR = "FULLTEXT"
OAI_KWARG_POSTFUNC = "post_oai"
OAI_KWARG_REQUESTS = "request_kwargs"
REQUESTS_DEFAULT_TIMEOUT = 20


class LoadException(Exception):
"""Load of OAI Data failed"""
Expand All @@ -36,12 +47,6 @@ class ContentException(LoadException):
or even missing complete record"""


OAI_KWARG_FGROUP_IMG = "fgroup_images"
DEFAULT_FGROUP_IMG = "MAX"
OAI_KWARG_FGROUP_OCR = "fgroup_ocr"
DEFAULT_FGROUP_OCR = "FULLTEXT"
OAI_KWARG_POSTFUNC = "post_oai"
OAI_KWARG_REQUESTS = "request_kwargs"
class OAILoader:
"""
Load OAI Records with corresponding metadata
Expand Down Expand Up @@ -140,24 +145,22 @@ def _handle_load(self, res_url, res_path, post_func, force_load=False):
if stored_path:
if not force_load:
return None
else:
# force update:
# 1. rename existing data
file_name = os.path.basename(stored_path)
file_dir = os.path.dirname(stored_path)
mets_ctime = str(int(os.stat(stored_path).st_mtime))
bkp_mets = file_name.replace('mets', mets_ctime)
os.rename(stored_path, os.path.join(file_dir, bkp_mets))
# 2. download again anyway
data_path = self.load_resource(res_url, res_path, post_func)
if data_path:
self.store.put(data_path)
return res_path
else:
# force update:
# 1. rename existing data
file_name = os.path.basename(stored_path)
file_dir = os.path.dirname(stored_path)
mets_ctime = str(int(os.stat(stored_path).st_mtime))
bkp_mets = file_name.replace('mets', mets_ctime)
os.rename(stored_path, os.path.join(file_dir, bkp_mets))
# 2. download again anyway
data_path = self.load_resource(res_url, res_path, post_func)
if data_path:
self.store.put(data_path)
return res_path
data_path = self.load_resource(res_url, res_path, post_func)
if data_path:
self.store.put(data_path)
return res_path
else:
return self.load_resource(res_url, res_path, post_func)

Expand Down Expand Up @@ -202,7 +205,7 @@ def load_resource(self, url, path_local, post_func):
except LoadException as load_exc:
raise load_exc
except Exception as exc:
msg = "processing '{}': {}".format(url, exc)
msg = f"load {url} exception: {exc}"
raise RuntimeError(msg) from exc


Expand All @@ -212,11 +215,11 @@ class OAIFileSweeper:
parse *.mets.xml to identify files to be deleted
"""

def __init__(self, path_store, pattern='mets.xml', filegroups=['MAX', ]):
def __init__(self, path_store, pattern='mets.xml', filegroups=None):
self.work_dir = path_store
self.pattern = pattern
self.filegroups = filegroups if isinstance(filegroups, list)\
else list(filegroups)
else ["MAX"]

def sweep(self):
"""remove OAI-Resources from given dir, if any contained"""
Expand Down Expand Up @@ -258,13 +261,12 @@ def sweep(self):
len(list(_parent.iterdir())) == 0:
_parent.rmdir()
except PermissionError:
return 'cannot delete {} (insuff. permission)'\
.format(pth)
return (work_dir, total, "{} Mb".format(size >> 20))
return f"cannot delete {pth} due insuff. perm."
return (work_dir, total, f"{(size >> 20)} Mb")

def _get_files(self, mets_xml, filegroup):
xml_root = ET.parse(str(mets_xml)).getroot()
xpath = ".//mets:fileGrp[@USE='{}']/mets:file/mets:FLocat".format(filegroup)
xpath = f".//mets:fileGrp[@USE='{filegroup}']/mets:file/mets:FLocat"
locats = xml_root.findall(xpath, {"mets": "http://www.loc.gov/METS/"})
links = [xl.get('{http://www.w3.org/1999/xlink}href') for xl in locats]
return [Path(ln).stem for ln in links]
Expand Down Expand Up @@ -360,29 +362,29 @@ def request_resource(url: str, path_local: Path, **kwargs):
status = 0
result = None
try:
response = requests.get(url, **kwargs)
the_timeout = REQUESTS_DEFAULT_TIMEOUT
if "timeout" in kwargs:
the_timeout = kwargs["timeout"]
del kwargs["timeout"]
response = requests.get(url, timeout=the_timeout, **kwargs)
status = response.status_code
if status >= 400:
_inf = "url '{}' returned '{}'".format(url, status)
the_info = f"{url} status {status}"
if status < 500:
raise ClientError(_inf)
else:
raise ServerError(_inf)
raise ClientError(the_info)
raise ServerError(the_info)
if status == 200:
content_type = response.headers['Content-Type']

# textual xml data
if 'text' in content_type or 'xml' in content_type:
result = response.content
xml_root = ET.fromstring(result)
check_error = xml_root.find('.//error', xml_root.nsmap)
if check_error is not None:
msg = "the download of {} fails due to: '{}'".format(
url, check_error.text)
msg = f"request {url} failed due {check_error.text}"
raise LoadException(msg)
path_local = _sanitize_local_file_extension(
path_local, content_type)

# catch other content types by MIME sub_type
# split "<application|image>/<sub_type>"
elif content_type.split('/')[-1] in ['jpg', 'jpeg', 'pdf', 'png']:
Expand All @@ -391,15 +393,13 @@ def request_resource(url: str, path_local: Path, **kwargs):
if not isinstance(path_local, Path):
path_local = Path(path_local)
path_local.write_bytes(response.content)

# if we got this far, something unexpected has been returned
else:
msg = "download {} with unhandled content-type {}".format(
url, content_type)
msg = f"download {url} with unhandled content-type {content_type}"
raise ContentException(msg)
return (path_local, result, content_type)
except (OSError) as exc:
msg = "fail to download '{}' to '{}'".format(url, path_local)
msg = f"fail to download {url} to {path_local}"
raise RuntimeError(msg) from exc


Expand Down
4 changes: 2 additions & 2 deletions tests/test_digiflow_io.py
Original file line number Diff line number Diff line change
Expand Up @@ -432,7 +432,7 @@ def test_response_404(mock_requests: unittest.mock.Mock):

# assert
assert exc.typename == 'ClientError'
assert "url 'http://foo.bar' returned '417'" == exc.value.args[0]
assert "http://foo.bar status 417" == exc.value.args[0]


@unittest.mock.patch('requests.get')
Expand Down Expand Up @@ -494,7 +494,7 @@ def test_oai_load_exception_for_server_error(mock_504: unittest.mock.Mock, tmp_p
# assert
assert exc.typename == 'ServerError'
a_msg = exc.value.args[0]
assert a_msg == "url 'opendata.uni-halle.de/oai/dd?verb=GetRecord&metadataPrefix=mets&identifier=foo' returned '504'"
assert a_msg == "opendata.uni-halle.de/oai/dd?verb=GetRecord&metadataPrefix=mets&identifier=foo status 504"


def test_call_requests_kwargs_invalid_str(tmp_path):
Expand Down

0 comments on commit 1b8435b

Please sign in to comment.