Skip to content

Commit

Permalink
[app][rfct] move xml validation
Browse files Browse the repository at this point in the history
  • Loading branch information
M3ssman committed May 10, 2024
1 parent c791023 commit 50e6c38
Show file tree
Hide file tree
Showing 18 changed files with 852 additions and 328 deletions.
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

[project]
name = "digiflow"
version = "3.9.2"
version = "3.10.2"
description = "Father's Little Digitization Workflow Helper"
readme = "README.md"
requires-python = ">=3.8"
Expand Down
6 changes: 4 additions & 2 deletions src/digiflow/__init__.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@

from .digflow_identifier import *
from .digiflow_io import *
from .digiflow_metadata import *
from .digiflow_generate import *
from .digiflow_export import *
from .digiflow_validate import (
from .validate.metadata_xslt import (
DDB_IGNORE_RULES_BASIC,
DDB_IGNORE_RULES_MVW,
DDB_IGNORE_RULES_NEWSPAPERS,
Expand All @@ -23,7 +24,6 @@
LABEL_SCAN_VALIDATOR_COMPRESSION,
LABEL_SCAN_VALIDATOR_RESOLUTION,
LABEL_SCAN_VALIDATOR_FILEDATA,
UNSET_LABEL,
UNSET_NUMBR,
INVALID_LABEL_UNSET,
FSReadException,
Expand All @@ -45,3 +45,5 @@
group_can_write,
validate_tiff,
)

from .common import UNSET_LABEL, XMLNS
25 changes: 25 additions & 0 deletions src/digiflow/common.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
"""common constants"""


# Prefix -> namespace URI table shared across the package.
# Callers pass it as the second argument of find()/findall() so that
# prefixes such as 'mets:' in path expressions can be resolved.
# Note that the 'alto' prefix is bound to the ALTO v4 namespace URI only.
XMLNS = {
'alto': 'http://www.loc.gov/standards/alto/ns-v4#',
'dc': 'http://purl.org/dc/elements/1.1/',
'dv': 'http://dfg-viewer.de/',
'epicur': 'urn:nbn:de:1111-2004033116',
'marcxml': 'http://www.loc.gov/MARC21/slim',
'goobi': 'http://meta.goobi.org/v1.5.1/',
'mets': 'http://www.loc.gov/METS/',
'mix': 'http://www.loc.gov/mix/v20',
'mods': 'http://www.loc.gov/mods/v3',
'oai': 'http://www.openarchives.org/OAI/2.0/',
'oai_dc': 'http://www.openarchives.org/OAI/2.0/oai_dc/',
'ulb': 'https://bibliothek.uni-halle.de',
'vl': 'http://visuallibrary.net/vl',
'vlz': 'http://visuallibrary.net/vlz/1.0/',
'xlink': 'http://www.w3.org/1999/xlink',
'zvdd': 'http://zvdd.gdz-cms.de/',
}


# NOTE(review): presumably the fallback text used wherever a label has not
# been set -- confirm against the callers that import it.
UNSET_LABEL = 'n.a.'

7 changes: 4 additions & 3 deletions src/digiflow/digflow_identifier.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,9 +18,10 @@

from .digiflow_metadata import (
MetsReader,
XMLNS,
)

import digiflow.common as dfc


##################
# script constants
Expand Down Expand Up @@ -218,7 +219,7 @@ def enrich_urn_granular(xml_tree,
# if no granular urns exist at all, then all must get new urn ...
if not cnt_urn_fits:
# identify insertion point
phys_divs = xml_tree.findall('.//mets:structMap[@TYPE="PHYSICAL"]/mets:div/mets:div', XMLNS)
phys_divs = xml_tree.findall('.//mets:structMap[@TYPE="PHYSICAL"]/mets:div/mets:div', dfc.XMLNS)
# alert invalid data
if len(phys_divs) == 0:
raise GranularURNException(f"No phys pages in {xml_tree.base}!")
Expand Down Expand Up @@ -329,7 +330,7 @@ def insert_granular_urn(urn_type, phys_conts, main_urn, page_num=None, padd_left
def _get_phys_containers(xml_tree):
"""Get all physical containers that contain @ORDER attribute"""
return xml_tree.findall(
'.//mets:structMap[@TYPE="PHYSICAL"]//mets:div[@ORDER]', XMLNS)
'.//mets:structMap[@TYPE="PHYSICAL"]//mets:div[@ORDER]', dfc.XMLNS)


def enrich_urn_kitodo2(process_path:'str|Path', collection='1', system_id=KITODO2_ID, exemplar='1',
Expand Down
29 changes: 4 additions & 25 deletions src/digiflow/digiflow_io.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,11 +26,12 @@
from lxml import etree as ET

from .digiflow_metadata import (
XMLNS,
MetsReader,
write_xml_file,
)

import digiflow.common as dfc


####
#
Expand All @@ -53,28 +54,6 @@
STATETIME_FORMAT_ALT = '%Y-%m-%dT%H:%M:%SZ'


def post_oai_extract_metsdata(xml_tree):
    """Return the METS root element for the given XML root.

    The namespace of the passed element decides what happens:

    * METS namespace: the element is handed back unchanged.
    * OAI-PMH 2.0 namespace: the first nested ``mets:mets`` element is
      looked up and returned as root of its own tree.
    * anything else, or an OAI envelope without ``mets:mets``: ``None``.
    """

    root_ns = xml_tree.xpath('namespace-uri(.)')

    # already plain METS, nothing to unwrap
    if root_ns == 'http://www.loc.gov/METS/':
        return xml_tree

    # neither METS nor an OAI envelope
    if root_ns != 'http://www.openarchives.org/OAI/2.0/':
        return None

    mets_el = xml_tree.find('.//mets:mets', XMLNS)
    if mets_el is None:
        return None
    return ET.ElementTree(mets_el).getroot()


def post_oai_extract_mets(the_self, the_data):
    """Parse the response body, unwrap its METS part and store it.

    ``the_data`` is parsed as XML, passed through
    ``post_oai_extract_metsdata`` and the result is written to
    ``the_self.path_mets`` without a preamble.
    """

    mets_root = post_oai_extract_metsdata(ET.fromstring(the_data))
    write_xml_file(mets_root, the_self.path_mets, preamble=None)


def post_oai_store_ocr(path_local, the_data):
"""
Store OCR XML as it is
Expand Down Expand Up @@ -816,8 +795,8 @@ def load_resource(self, url, path_local, post_func):
# from test-data or *real* requests
if not isinstance(_snippet, str):
_snippet = _snippet.decode('utf-8')
if XMLNS['mets'] in _snippet or XMLNS['oai'] in _snippet:
data = post_func(self, data)
if dfc.XMLNS['mets'] in _snippet or dfc.XMLNS['oai'] in _snippet:
data = post_func(self.path_mets, data)
elif 'http://www.loc.gov/standards/alto' in _snippet:
data = post_func(local_path, data)
else:
Expand Down
Loading

0 comments on commit 50e6c38

Please sign in to comment.