diff --git a/pyproject.toml b/pyproject.toml index 5605891..1efff4c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "digiflow" -version = "5.5.9" +version = "5.6.9" description = "Father's Little Digitization Workflow Helper" readme = "README.md" requires-python = ">=3.8" diff --git a/src/digiflow/digiflow_generate.py b/src/digiflow/digiflow_generate.py index 7fce664..4e914e6 100644 --- a/src/digiflow/digiflow_generate.py +++ b/src/digiflow/digiflow_generate.py @@ -313,6 +313,7 @@ def __init__( raise RuntimeError(f"[DerivansManager] config missing: {path_configuration}!") self.path_mets_file = path_mets_file self.path_configuration = path_configuration + self.images = None @abstractmethod def init(self) -> None: @@ -405,6 +406,8 @@ def start(self) -> DerivansResult: cmd = f'{path_exec}{self.xargs} -jar {self.path_binary} {self.path_mets_file}' if self.path_configuration: cmd += f' -c {self.path_configuration}' + if self.images: + cmd += f" -i {self.images}" # disable pylint due it is not able to recognize # output being created by decorator time_duration, label, result = self._execute_derivans( @@ -508,6 +511,9 @@ def start(self) -> DerivansResult: mounts.append(Mount(source=config_dir, target=DERIVANS_CNT_CONF_DIR, type='bind')) command.append('-c') command.append(target_config_file) + if self.images: + command.append("-i") + command.append(self.images) if self._path_logging: _log_dir = self._path_logging mounts.append(Mount(source=_log_dir, target=DERIVANS_CNT_LOGG_DIR, type='bind')) diff --git a/src/digiflow/digiflow_io.py b/src/digiflow/digiflow_io.py index 1aa4ba6..51f5121 100644 --- a/src/digiflow/digiflow_io.py +++ b/src/digiflow/digiflow_io.py @@ -26,6 +26,8 @@ OAI_KWARG_REQUESTS = "request_kwargs" REQUESTS_DEFAULT_TIMEOUT = 20 +_MIME_TYPE_JPG = "image/jpg" + class LoadException(Exception): """Load of OAI Data failed""" @@ -94,7 +96,8 @@ def _sanitize_kwargs(self, in_kwargs): return 
def _calculate_path(self, mime_type, *args):
    """
    Calculate the local target path of a resource to download.

    The path is ``self.dir_local`` joined with the segments in *args*
    (as called by ``load``: fileGrp name, then file name or file ID).
    At most ONE suffix is appended, and only if the path does not
    already end with it. The suffix is chosen by the first matching rule:

    * path contains '/MAX/'      => images, '.jpg'
    * path contains '/FULLTEXT/' => OCR data, '.xml'
    * MIMETYPE is a JPEG type    => '.jpg'
    * MIMETYPE contains 'xml'    => '.xml'

    The fileGrp heuristics take precedence since the MIMETYPE handed in
    may be just a fallback value and not what the METS really stated.

    :param mime_type: MIMETYPE of the METS file entry; ``None`` or an
        empty string only disables the MIMETYPE rules
    :param args: path segments below the local directory
    :return: the calculated path as string
    """

    res_path = os.path.join(str(self.dir_local), os.sep.join(args))
    # probe with forward slashes, otherwise the fileGrp markers
    # can never match on platforms where os.sep is not '/'
    probe = res_path.replace(os.sep, '/')
    # MIME types are case-insensitive; tolerate missing information
    mime = (mime_type or '').lower()
    if '/MAX/' in probe:
        suffix = '.jpg'
    elif '/FULLTEXT/' in probe:
        suffix = '.xml'
    # registered type is 'image/jpeg', 'image/jpg' is kept for
    # backward compatibility with the module constant
    elif mime in ('image/jpeg', 'image/jpg'):
        suffix = '.jpg'
    elif 'xml' in mime:
        suffix = '.xml'
    else:
        suffix = ''
    if suffix and not res_path.endswith(suffix):
        res_path += suffix
    return res_path
MARK_AGENT_LEGACY.split(':', maxsplit=1)[0] + repositories = self.xpath('mets:agent[@OTHERTYPE="REPOSITORY"]/mets:name/text()', _mhdr) + repo_one = repositories[0] if len(repositories) == 1 else MARK_AGENT_LEGACY.split(':', maxsplit=1)[0] # legacy migrated record found? _legacy_marks = self.xpath(f'//mets:note[contains(text(), "{MARK_AGENT_LEGACY}")]/text()') if len(_legacy_marks) == 1: @@ -630,38 +642,33 @@ def _system_identifiers(self) -> dict: _legacy_ident = _id.rsplit(':', maxsplit=1)[1].strip() _legacy_ident = _legacy_ident[2:] if _legacy_ident.startswith( 'md') else _legacy_ident - _idents[_repo] = _legacy_ident + ident_dict[repo_one] = _legacy_ident else: - _idents[_repo] = _id + ident_dict[repo_one] = _id _vls_marks = self.xpath(f'//mets:note[contains(text(), "{MARK_AGENT_VLID}")]/text()') if len(_vls_marks) == 1: _id = _vls_marks[0][len(MARK_AGENT_VLID):].strip() - _idents[_repo] = _id + ident_dict[repo_one] = _id # legacy vls record which is not mapped by now? - if _repo and ('digital' in _repo or 'menadoc' in _repo) and _repo not in _idents: + if repo_one and ('digital' in repo_one or 'menadoc' in repo_one) and repo_one not in ident_dict: _legacy_id = self.dmd_id[2:] if self.dmd_id.startswith('md') else self.dmd_id - _idents[_repo] = _legacy_id + ident_dict[repo_one] = _legacy_id # legacy kitodo2 source _without_ OAI envelope _creators = self.tree.xpath( '//mets:agent[@OTHERTYPE="SOFTWARE" and @ROLE="CREATOR"]/mets:name', namespaces=dfc.XMLNS) if len(_creators) == 1 and 'kitodo-ugh' in _creators[0].text.lower(): - _idents[MARK_KITODO2] = None + ident_dict[MARK_KITODO2] = None # kitodo3 metsDocumentID? 
def get_filegrp_info(self, group='MAX') -> typing.List[METSFileInfo]:
    """Collect one METSFileInfo per mets:FLocat of the given fileGrp

    For each mets:FLocat below a mets:file inside the fileGrp
    whose USE attribute equals *group*, gather
    * the ID attribute of the enclosing element
    * its MIMETYPE attribute, "image/jpg" if not present
    * the link target of the FLocat itself
    """

    # NOTE(review): the MIMETYPE fallback is applied for any
    # fileGrp, also for non-image ones - confirm this is intended
    locator_expr = f'.//mets:fileGrp[@USE="{group}"]/mets:file/mets:FLocat'
    locators = self.tree.findall(locator_expr, dfc.XMLNS)
    # pair each locator with its parent, the element
    # which carries ID and MIMETYPE
    pairs = ((loc, loc.getparent()) for loc in locators)
    return [
        METSFileInfo(file_id=holder.attrib["ID"],
                     file_type=holder.get("MIMETYPE", "image/jpg"),
                     loc_url=loc.attrib[XLINK_HREF])
        for loc, holder in pairs
    ]
def test_mets_reader_some_sbb_mets():
    """Read METS/MODS from SBB and check the
    type and languages of the resulting report"""

    sbb_mets = TEST_RES / "mets" / "SBB_PPN1000056597.xml"
    report = df.MetsReader(sbb_mets).report

    assert report.type == "monograph"
    assert report.languages == ["ger"]
@unittest.mock.patch("digiflow.requests.get")
def test_oai_load_opendata_file_identifier(
        mock_request_1981185920_36020, tmp_path):
    """Ensure OAI Loader switches behavior and names
    downloaded resources according to FILE@ID rather
    than the final segment of the resource URL
    """

    # arrange
    mock_request_1981185920_36020.side_effect = fixture_request_results
    record = df_r.Record('oai:opendata.uni-halle.de:1981185920/36020')
    local_id = record.local_identifier
    work_dir: Path = tmp_path / "WORKDIR" / local_id
    work_dir.mkdir(parents=True)
    storage_dir: Path = tmp_path / "STORE" / "dd" / local_id
    storage_dir.mkdir(parents=True)
    mets_path = work_dir / f"{local_id}.xml"

    # act
    loader = df_io.OAILoader(work_dir,
                             base_url=OAI_BASE_URL_OPENDATA,
                             group_images='MAX',
                             post_oai=df_md.extract_mets)
    loader.store = df_io.LocalStore(storage_dir, work_dir)
    n_loaded = loader.load(record.identifier, str(mets_path),
                           use_file_id=True)

    # assert
    assert n_loaded == 12
    assert mets_path.is_file()
    # images are named by the ID of their METS file entry
    for image_name in ("FILE_0001_MAX.jpg", "FILE_0011_MAX.jpg"):
        assert (work_dir / "MAX" / image_name).is_file()