Skip to content

Commit

Permalink
[app][feat] some generalities concerning SBB
Browse files Browse the repository at this point in the history
  • Loading branch information
M3ssman committed Oct 28, 2024
1 parent 156b00a commit b900bcf
Show file tree
Hide file tree
Showing 7 changed files with 352 additions and 37 deletions.
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

[project]
name = "digiflow"
version = "5.5.9"
version = "5.6.9"
description = "Father's Little Digitization Workflow Helper"
readme = "README.md"
requires-python = ">=3.8"
Expand Down
6 changes: 6 additions & 0 deletions src/digiflow/digiflow_generate.py
Original file line number Diff line number Diff line change
Expand Up @@ -313,6 +313,7 @@ def __init__(
raise RuntimeError(f"[DerivansManager] config missing: {path_configuration}!")
self.path_mets_file = path_mets_file
self.path_configuration = path_configuration
self.images = None

@abstractmethod
def init(self) -> None:
Expand Down Expand Up @@ -405,6 +406,8 @@ def start(self) -> DerivansResult:
cmd = f'{path_exec}{self.xargs} -jar {self.path_binary} {self.path_mets_file}'
if self.path_configuration:
cmd += f' -c {self.path_configuration}'
if self.images:
cmd += f" -i {self.images}"
# disable pylint since it is not able to recognize
# the output being created by the decorator
time_duration, label, result = self._execute_derivans(
Expand Down Expand Up @@ -508,6 +511,9 @@ def start(self) -> DerivansResult:
mounts.append(Mount(source=config_dir, target=DERIVANS_CNT_CONF_DIR, type='bind'))
command.append('-c')
command.append(target_config_file)
if self.images:
command.append("-i")
command.append(self.images)
if self._path_logging:
_log_dir = self._path_logging
mounts.append(Mount(source=_log_dir, target=DERIVANS_CNT_LOGG_DIR, type='bind'))
Expand Down
28 changes: 19 additions & 9 deletions src/digiflow/digiflow_io.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,8 @@
OAI_KWARG_REQUESTS = "request_kwargs"
REQUESTS_DEFAULT_TIMEOUT = 20

_MIME_TYPE_JPG = "image/jpg"


class LoadException(Exception):
"""Load of OAI Data failed"""
Expand Down Expand Up @@ -94,7 +96,8 @@ def _sanitize_kwargs(self, in_kwargs):
return top_dict

def load(self, record_identifier, local_dst, mets_digital_object_identifier=None,
skip_resources=False, force_update=False, metadata_format='mets') -> int:
skip_resources=False, force_update=False, metadata_format='mets',
use_file_id=False) -> int:
"""
load metadata from OAI with optional caching in-between
request additional linked resources if required
Expand Down Expand Up @@ -124,17 +127,20 @@ def load(self, record_identifier, local_dst, mets_digital_object_identifier=None

# get linked resources
for k in self.groups:
self.groups[k] = mets_reader.get_filegrp_links(group=k)
self.groups[k] = mets_reader.get_filegrp_info(group=k)

# if exist, download them too
post_func = None
for k, linked_res_urls in self.groups.items():
for k, file_entries in self.groups.items():
if k == self.key_ocr:
post_func = post_oai_store_ocr
for linked_res_url in linked_res_urls:
res_val_end = linked_res_url.split('/')[-1]
res_val_path = self._calculate_path(k, res_val_end)
if self._handle_load(linked_res_url, res_val_path, post_func):
for mets_file in file_entries:
res_url = mets_file.loc_url
url_final_token = res_url.split('/')[-1]
if use_file_id:
url_final_token = mets_file.file_id
local_path = self._calculate_path(mets_file.file_type, k, url_final_token)
if self._handle_load(res_url, local_path, post_func):
loaded += 1
return loaded

Expand Down Expand Up @@ -164,15 +170,19 @@ def _handle_load(self, res_url, res_path, post_func, force_load=False):
else:
return self.load_resource(res_url, res_path, post_func)

def _calculate_path(self, *args):
def _calculate_path(self, mime_type, *args):
"""
calculate final path depending on some heuristics which
fileGrp has been used - 'MAX' means images, not means 'xml'
"""
res_path = os.path.join(str(self.dir_local), os.sep.join(list(args)))
if mime_type == _MIME_TYPE_JPG and not res_path.endswith('.jpg'):
res_path += ".jpg"
if '/MAX/' in res_path and not res_path.endswith('.jpg'):
res_path += '.jpg'
elif '/FULLTEXT/' in res_path and not res_path.endswith('.xml'):
if "xml" in mime_type and not res_path.endswith('.xml'):
res_path += ".xml"
if '/FULLTEXT/' in res_path and not res_path.endswith('.xml'):
res_path += '.xml'
return res_path

Expand Down
69 changes: 43 additions & 26 deletions src/digiflow/digiflow_metadata.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,12 @@
"""Metadata module regarding METS/MODS"""

# -*- coding: utf-8 -*-

import abc
import dataclasses
import os
import time
import typing

from collections import (
defaultdict
Expand Down Expand Up @@ -131,6 +134,14 @@ def extract_mets(path_mets, the_data):
write_xml_file(mets_tree, path_mets, preamble=None)


@dataclasses.dataclass
class METSFileInfo:
    """Represents a single mets:fileGrp/mets:file entry"""
    # value of the ID attribute of the mets:file element
    file_id: str
    # value of the MIMETYPE attribute of the mets:file element
    file_type: str
    # link (xlink:href) taken from the mets:FLocat child element
    loc_url: str


class XMLProcessor(abc.ABC):
"""Basic XML-Processing"""

Expand Down Expand Up @@ -448,7 +459,7 @@ def report(self):
if self._report is None:
self._report = MetsReaderReport()
self._report.identifiers = self.get_identifiers()
self._report.system_identifiers = self._system_identifiers()
self._report.system_identifiers = self._ulb_digi_system_identifier()
self._report.languages = self.get_language_information()
outcome = self.get_type_and_hierarchy()
if outcome:
Expand Down Expand Up @@ -606,7 +617,7 @@ def _identifiers_from_prime_mods(self) -> dict:
_identifiers[rec.attrib['source']] = rec.text
return _identifiers

def _system_identifiers(self) -> dict:
def _ulb_digi_system_identifier(self) -> dict:
"""Determine system identifier(s)
* use METS-agent-information, if available
Expand All @@ -615,13 +626,14 @@ def _system_identifiers(self) -> dict:
than one system id present: actual plus
legacy _identifiers
"""
_idents = {}
ident_dict = {}
repositories = []
# inspect mets header
_mhdrs = self.xpath('//mets:metsHdr')
if len(_mhdrs) == 1:
_mhdr = _mhdrs[0]
_repos = self.xpath('mets:agent[@OTHERTYPE="REPOSITORY"]/mets:name/text()', _mhdr)
_repo = _repos[0] if len(_repos) == 1 else MARK_AGENT_LEGACY.split(':', maxsplit=1)[0]
repositories = self.xpath('mets:agent[@OTHERTYPE="REPOSITORY"]/mets:name/text()', _mhdr)
repo_one = repositories[0] if len(repositories) == 1 else MARK_AGENT_LEGACY.split(':', maxsplit=1)[0]
# legacy migrated record found?
_legacy_marks = self.xpath(f'//mets:note[contains(text(), "{MARK_AGENT_LEGACY}")]/text()')
if len(_legacy_marks) == 1:
Expand All @@ -630,38 +642,33 @@ def _system_identifiers(self) -> dict:
_legacy_ident = _id.rsplit(':', maxsplit=1)[1].strip()
_legacy_ident = _legacy_ident[2:] if _legacy_ident.startswith(
'md') else _legacy_ident
_idents[_repo] = _legacy_ident
ident_dict[repo_one] = _legacy_ident
else:
_idents[_repo] = _id
ident_dict[repo_one] = _id
_vls_marks = self.xpath(f'//mets:note[contains(text(), "{MARK_AGENT_VLID}")]/text()')
if len(_vls_marks) == 1:
_id = _vls_marks[0][len(MARK_AGENT_VLID):].strip()
_idents[_repo] = _id
ident_dict[repo_one] = _id
# legacy vls record which is not mapped by now?
if _repo and ('digital' in _repo or 'menadoc' in _repo) and _repo not in _idents:
if repo_one and ('digital' in repo_one or 'menadoc' in repo_one) and repo_one not in ident_dict:
_legacy_id = self.dmd_id[2:] if self.dmd_id.startswith('md') else self.dmd_id
_idents[_repo] = _legacy_id
ident_dict[repo_one] = _legacy_id
# legacy kitodo2 source _without_ OAI envelope
_creators = self.tree.xpath(
'//mets:agent[@OTHERTYPE="SOFTWARE" and @ROLE="CREATOR"]/mets:name', namespaces=dfc.XMLNS)
if len(_creators) == 1 and 'kitodo-ugh' in _creators[0].text.lower():
_idents[MARK_KITODO2] = None
ident_dict[MARK_KITODO2] = None
# kitodo3 metsDocumentID?
_doc_ids = self.xpath('//mets:metsDocumentID/text()', _mhdr)
if len(_doc_ids) == 1:
_idents[MARK_KITODO3] = _doc_ids[0]
ident_dict[MARK_KITODO3] = _doc_ids[0]
# once migrated, now hosted at opendata
_pres = self.tree.xpath(
viewer_pres = self.tree.xpath(
'.//dv:presentation[contains(./text(), "://opendata")]/text()', namespaces=dfc.XMLNS)
if len(_pres) == 1 and 'simple-search' not in _pres[0]:
_idents[_pres[0].split('/')[2]] = _pres[0]
if self._report and len(self._report.system_identifiers) > 0:
_idents = {**_idents, **self._report.system_identifiers}
if len(_idents) > 0:
return _idents

# we quit, no ideas left
raise RuntimeError(f"No System _identifiers n {self.tree.base}!")
if len(viewer_pres) == 1 and 'simple-search' not in viewer_pres[0]:
ident_dict[viewer_pres[0].split('/')[2]] = viewer_pres[0]
# return what has been learned
return {**ident_dict, **self.report.system_identifiers}

def _validate_identifier_types(self, _identifiers: dict):
"""Transform all known _identifiers to match configuration
Expand Down Expand Up @@ -696,12 +703,22 @@ def get_location_shelfs(self):
xpr_signature = f'.//mets:dmdSec[@ID="{self._prime_mods_id}"]//mods:shelfLocator/text()'
return self.tree.xpath(xpr_signature, namespaces=dfc.XMLNS)

def get_filegrp_links(self, group='MAX'):
"""Gather resource links for given filegroup"""
def get_filegrp_info(self, group='MAX') -> typing.List[METSFileInfo]:
"""Gather resource information for given filegroup
concerning URL, MIMETYPE and container ID"""

xpath = f'.//mets:fileGrp[@USE="{group}"]/mets:file/mets:FLocat'
resources = self.tree.findall(xpath, dfc.XMLNS)
return [res.attrib[XLINK_HREF] for res in resources]
the_files = self.tree.findall(xpath, dfc.XMLNS)
the_info = []
for a_file in the_files:
the_parent = a_file.getparent()
parent_id = the_parent.attrib["ID"]
the_type = the_parent.get("MIMETYPE", "image/jpg")
the_loc = a_file.attrib[XLINK_HREF]
the_info.append(METSFileInfo(file_id=parent_id,
file_type=the_type,
loc_url=the_loc))
return the_info

def get_invalid_physical_structs(self):
"""
Expand Down
Loading

0 comments on commit b900bcf

Please sign in to comment.