From 723eef06dda70b35e6e50ada8435f75371ffef94 Mon Sep 17 00:00:00 2001 From: Uwe Hartwig Date: Fri, 24 May 2024 19:20:48 +0200 Subject: [PATCH] [app][rfct] mjr re-organization --- README.md | 23 ++-- pyproject.toml | 33 ++++++ requirements.txt | 6 - setup.cfg | 36 ------ src/digital_eval/VERSION | 1 - src/digital_eval/__init__.py | 6 +- src/digital_eval/cli.py | 189 ++++++++++++++----------------- src/digital_eval/evaluation.py | 197 +++++---------------------------- src/digital_eval/metrics.py | 172 +++++++++++++++++++++++++--- src/ocr_util/cli.py | 18 ++- tests/test_dict_metric.py | 37 +++---- tests/test_digital_eval_cli.py | 20 ++-- tests/test_ocr_evaluate.py | 94 ++++++++-------- tests/test_ocr_metrics.py | 50 ++++----- tests/test_ocr_metrics_base.py | 34 +++--- 15 files changed, 434 insertions(+), 482 deletions(-) delete mode 100644 requirements.txt delete mode 100644 setup.cfg delete mode 100644 src/digital_eval/VERSION diff --git a/README.md b/README.md index eff5d84..377f02c 100644 --- a/README.md +++ b/README.md @@ -7,16 +7,16 @@ Python3 Tool to report evaluation outcomes from mass digitalization workflows. ## Features -* match automatically groundtruth (i.e. reference data) and candidates by filename +* match groundtruth (i.e. reference data) and candidates by filename start * use geometric information to evaluate only specific frame (i.e. specific column or region from large page) of candidates (requires ALTO or PAGE format) -* aggregate evaluation outcome on domain range (with multiple subdomains) +* aggregate evaluation outcomes on domain range (with multiple subdomains) according to folder layout * choose from textual metrics based on characters or words plus common Information Retrieval -* choose between accuracy / error rate and different UTF-8 Python norms +* choose from different UTF-8 Python norms * formats: ALTO, PAGE or plain text for both groundtruth and candidates * speedup with parallel execution * additional OCR util: - * filter custom areas of single OCR files + * filter custom areas of single OCR files (ALTO format) ## Installation ```bash pip install digital-eval ``` ## Usage ### Metrics -Calculate similarity (`acc`) or difference (`err`) ratios between single reference/groundtruth and test/candidate item. +#### Edit-Distance based String Similarity -#### Edit-Distance based - -Character-based text string minus whitechars (`Cs`, `Characters`) or Letter-based (`Ls`, `Letters`) minus whites, -punctuation and digits. -Word/Token-based edit-distance of single tokens identified by whitespaces. +Calculate similarity for each reference/groundtruth and test/candidate pair. +Complete character-based text string (`Cs`, `Characters`) or Letter-based (`Ls`, `Letters`) minus whitespaces, +punctuation and common digits (Arabic, Persian). +Word/Token-based edit-distance of single tokens, identified by Word or String elements or whitespaces, depending on the data format. #### Set based @@ -141,8 +140,8 @@ digital-eval --help Contributions, suggestions and proposals welcome! -## Licence +## License Under terms of the [MIT license](https://opensource.org/licenses/MIT). -**NOTE**: This software depends on other packages that _may_ be licensed under different open source licenses. +**NOTE**: This software depends on packages that _may_ be licensed under different terms.
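[editor's note] For readers of the README hunk above: a minimal sketch of what the edit-distance based character similarity boils down to, using `rapidfuzz` (declared as a dependency in the pyproject.toml diff that follows) and Python's `unicodedata` normalization. This only illustrates the idea in the README text, it is not the exact implementation inside `digital_eval.metrics`; the helper name `char_similarity` is made up for this example.

```python
import string
import unicodedata

from rapidfuzz.distance import Levenshtein


def char_similarity(reference: str, candidate: str, codec: str = "NFC") -> float:
    """Illustrative sketch: normalized edit-distance similarity in percent,
    computed on unicode-normalized text with whitespace stripped out --
    roughly what the `Cs`/`Characters` metric described above measures."""
    drop_ws = str.maketrans("", "", string.whitespace)
    ref = unicodedata.normalize(codec, reference).translate(drop_ws)
    can = unicodedata.normalize(codec, candidate).translate(drop_ws)
    return 100 * Levenshtein.normalized_similarity(ref, can)


if __name__ == "__main__":
    # two near-identical strings yield a similarity close to 100
    print(f"{char_similarity('the lazy brown fox', 'the lazy brawn fox'):.2f}")
```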
diff --git a/pyproject.toml b/pyproject.toml index 638dd9c..31be912 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,3 +1,36 @@ [build-system] requires = ["setuptools>=61.0"] build-backend = "setuptools.build_meta" + +[tool.setuptools.dynamic] +version = {attr = "digital_eval.__version__"} + +[tool.setuptools.packages.find] +where = ["src"] + +[project] +name = "digital-eval" +dynamic = ["version"] +description = "Evaluate Digitalization Data" +readme = "README.md" +requires-python = ">=3.8" +authors = [{name = "Universitäts- und Landesbibliothek Sachsen-Anhalt",email = "development@bibliothek.uni-halle.de"}] +classifiers = [ + "Programming Language :: Python :: 3", + "License :: OSI Approved :: MIT License" +] +dependencies = [ + "rapidfuzz>3", + "nltk", + "requests", + "docker", + "numpy", + "digital-object==0.2.0", +] + +[project.urls] +Homepage = "https://github.com/ulb-sachsen-anhalt/digital-eval" + +[project.scripts] +digital-eval = "digital_eval.cli:start" +ocr-util = "ocr_util.cli:start" diff --git a/requirements.txt b/requirements.txt deleted file mode 100644 index da93842..0000000 --- a/requirements.txt +++ /dev/null @@ -1,6 +0,0 @@ -rapidfuzz -numpy -nltk -requests -docker -digital-object==0.2.0 \ No newline at end of file diff --git a/setup.cfg b/setup.cfg deleted file mode 100644 index a582a59..0000000 --- a/setup.cfg +++ /dev/null @@ -1,36 +0,0 @@ -[metadata] -name = digital-eval -version = file:src/digital_eval/VERSION -description = Evaluate Mass Digitalization Data -long_description = file:README.md -long_description_content_type = text/markdown -author = Universitäts- und Landesbibliothek Sachsen-Anhalt -author_email = development@bibliothek.uni-halle.de -maintainer = Uwe Hartwig -maintainer_email = uwe.hartwig@bibliothek.uni-halle.de -classifiers = - Programming Language :: Python :: 3 - License :: OSI Approved :: MIT License - Operating System :: OS Independent -project_urls = - Homepage = https://github.com/ulb-sachsen-anhalt/digital-eval - -[options] -python_requires = >=3.8 -package_dir = - =src -packages = find: -include_package_data = True -install_requires = - rapidfuzz>3 - numpy - nltk - shapely - -[options.packages.find] -where = src - -[options.entry_points] -console_scripts = - digital-eval = digital_eval.cli:start - ocr-util = ocr_util.cli:start diff --git a/src/digital_eval/VERSION b/src/digital_eval/VERSION deleted file mode 100644 index dc1e644..0000000 --- a/src/digital_eval/VERSION +++ /dev/null @@ -1 +0,0 @@ -1.6.0 diff --git a/src/digital_eval/__init__.py b/src/digital_eval/__init__.py index e0f0165..6a10bbd 100644 --- a/src/digital_eval/__init__.py +++ b/src/digital_eval/__init__.py @@ -1,6 +1,5 @@ -# -# provided API exports -# +"""digital eval main API""" +__version__ = '1.6.0' from .evaluation import ( Evaluator, find_groundtruth, @@ -18,4 +17,3 @@ MetricIRRec, MetricIRFM, ) - diff --git a/src/digital_eval/cli.py b/src/digital_eval/cli.py index a92a8a2..5e45e29 100644 --- a/src/digital_eval/cli.py +++ b/src/digital_eval/cli.py @@ -2,12 +2,9 @@ """OCR QA Evaluation CLI""" import argparse -import datetime as dt import os import sys -from typing import ( - List, Type -) +import typing import digital_eval as digev import digital_eval.dictionary_metrics.common as digev_cm @@ -46,29 +43,18 @@ } -def _get_info(): - here = os.path.abspath(os.path.dirname(__file__)) - _v = '' - _t = '' - _fp = os.path.join(here, 'VERSION') - with open(_fp) as fp: - _v = fp.read() - _t = dt.datetime.fromtimestamp(os.stat(_fp).st_mtime).strftime("%Y-%m-%d") - return 
f'v{_v}/{_t}' - - def _initialize_metrics( the_metrics, norm, -) -> List[digem.SimilarityMetric]: +) -> typing.List[digem.SimilarityMetric]: _tokens = the_metrics.split(',') try: - metric_objects: List[digem.SimilarityMetric] = [] + metric_objects: typing.List[digem.SimilarityMetric] = [] for m in _tokens: - clazz: Type[digem.SimilarityMetric] = METRIC_DICT[m] + clazz: typing.Type[digem.SimilarityMetric] = METRIC_DICT[m] if 'Dict' in m: norm = digem.UC_NORMALIZATION_NFKD - metric_inst: digem.SimilarityMetric = clazz(normalization=norm) + metric_inst: digem.SimilarityMetric = clazz(normalization=norm) metric_objects.append(metric_inst) return metric_objects except KeyError as _err: @@ -78,30 +64,66 @@ def _initialize_metrics( sys.exit(1) -######## -# MAIN # -######## -def _main( - path_candidates, - path_reference, - metrics, - utf8norm, - # calc, - xtra, - is_sequential=False, -): +######### +# START # +######### +def start_evaluation(parse_args: typing.Dict): + """Main workflow""" + + path_candidates = parse_args["candidates"] + path_reference = parse_args["reference"] + metrics: str = parse_args["metrics"] + utf8norm = parse_args["utf8"] + verbosity = parse_args["verbosity"] + is_seq = parse_args["sequential"] if "sequential" in parse_args else False + xtra = parse_args["extra"] if "extra" in parse_args else None + + if "language" in parse_args: + digem.MetricDictionary.LANGUAGE = parse_args["language"] + uses_lang_tool: bool = 'DictLT' in metrics or "DictionaryLangTool" in metrics + if uses_lang_tool: + lt_url: str = parse_args["lt_api_url"] if "lt_api_url" in parse_args else LanguageTool.DEFAULT_URL + LanguageTool.initialize(lt_url) + + # go on with basic validation + if not os.path.isdir(path_candidates): + print(f'[ERROR] input "{path_candidates}": invalid directory! exit!') + sys.exit(1) + if path_reference and not os.path.isdir(path_reference): + print(f'[ERROR] reference "{path_reference}": invalid directory! exit!') + sys.exit(1) + + # sanitize trailing slash + if not isinstance(path_candidates, str): + path_candidates = str(path_candidates) + if not isinstance(path_reference, str): + path_reference = str(path_reference) + path_candidates = path_candidates[:-1] if path_candidates.endswith('/') else path_candidates + path_reference = path_reference[:-1] if path_reference.endswith('/') else path_reference + + # if both candidates and reference provided: do domains match? 
+ if path_candidates and path_reference: + _base_can = os.path.basename(path_candidates) + _base_ref = os.path.basename(path_reference) + if _base_can != _base_ref: + print(f"[WARN ] start domains '{_base_can}' and '{_base_ref}' mismatch, summary might be inaccurate!") + + # some diagnostics + if verbosity >= 2: + args = f"{path_candidates}, {path_reference}, {verbosity}, {xtra}" + print(f'[DEBUG] called with {args}') + # create basic evaluator instance evaluator = digev.Evaluator( path_candidates, - verbosity=VERBOSITY, + verbosity=verbosity, extras=xtra, ) evaluator.metrics = _initialize_metrics(metrics, norm=utf8norm)#, calc=calc) - # evaluator.calc = calc - if VERBOSITY >= 1: - print(f"[DEBUG] text normalized using '{utf8norm}' values for '{metrics}'") + if verbosity >= 1: + print(f"[DEBUG] text normalized using '{utf8norm}' code points for '{metrics}'") - evaluator.is_sequential = is_sequential + evaluator.is_sequential = is_seq evaluator.domain_reference = path_reference # gather structure information @@ -122,7 +144,7 @@ def _main( n_diff = n_entries - len(gt_entries) gt_missing = set(gt_entries) ^ set(candidates) rnd_str = f" ({gt_missing})" if gt_missing else "" - if VERBOSITY >= 1: + if verbosity >= 1: print(f'[DEBUG] from "{n_entries}" filtered "{n_diff}" candidates missing groundtruth{rnd_str}') # trigger actual evaluation @@ -135,110 +157,67 @@ def _main( evaluator.eval_map() # serialize stdout report - if VERBOSITY >= 0: - digev.report_stdout(evaluator, VERBOSITY) - + if verbosity >= 0: + digev.report_stdout(evaluator, verbosity) + # for testing purposes - return evaluator.get_results() + eval_results = evaluator.get_results() + + # final clean-up + if uses_lang_tool: + LanguageTool.deinitialize() + + return eval_results def start(): - PARSER = argparse.ArgumentParser(description=f""" - Evaluate Mass Digital Data. ({_get_info()}) - """) - PARSER.add_argument( - "candidates", - help="Root Directory for evaluation candidates" - ) - PARSER.add_argument("-ref", "--reference", + """Wrap argparsing""" + parser = argparse.ArgumentParser(description=f"Evaluate Mass Digitalization Data {digev.__version__}") + parser.add_argument("candidates", + help="Root Directory for evaluation candidates" + ) + parser.add_argument("-ref", "--reference", required=False, help="Root directory for Reference/Groundtruth data (optional, but necessary for most metrics)" ) - PARSER.add_argument("-v", "--VERBOSITY", + parser.add_argument("-v", "--verbosity", action='count', default=DEFAULT_VERBOSITY, required=False, help=f"Verbosity flag. 
To increase, append multiple 'v's (optional; default: '{DEFAULT_VERBOSITY}')" ) - PARSER.add_argument("--metrics", + parser.add_argument("--metrics", default=DEFAULT_OCR_METRICS, required=False, help=f"List of metrics to use (optional, default: '{DEFAULT_OCR_METRICS}'; available: '{','.join(METRIC_DICT.keys())}')" ) - PARSER.add_argument("--utf8", + parser.add_argument("--utf8", default=DEFAULT_UTF8_NORM, required=False, help=f"UTF-8 Unicode Python Normalization (optional; default: '{DEFAULT_UTF8_NORM}'; available: 'NFC','NFKC','NFD','NFKD')", ) - PARSER.add_argument("-s", "--sequential", + parser.add_argument("-s", "--sequential", action='store_true', required=False, help="Execute calculations sequentially (optional; default: 'False')", ) - PARSER.add_argument("-x", "--extra", + parser.add_argument("-x", "--extra", required=False, help="pass additional information to evaluation, like 'ignore_geometry' (compare only text, ignore coords)" ) - PARSER.add_argument('-l', "--language", + parser.add_argument('-l', "--language", default=digev_cm.LANGUAGE_KEY_DEFAULT, choices=digev_cm.LANGUAGE_KEYS, required=False, help=f"Language code for LanguagTool according to ISO 639-2 (optional; default: '{digev_cm.LANGUAGE_KEY_DEFAULT}')", ) - PARSER.add_argument('-u', "--lt-api-url", + parser.add_argument('-u', "--lt-api-url", default=LanguageTool.DEFAULT_URL, required=False, help=f"Language Tool Api URL (optional; default: '{LanguageTool.DEFAULT_URL}')", ) - PARSER.set_defaults(sequential=False) - - ARGS = vars(PARSER.parse_args()) - path_candidates = ARGS["candidates"] - path_reference = ARGS["reference"] - global VERBOSITY - VERBOSITY = ARGS["VERBOSITY"] - IS_SEQUENTIAL = ARGS["sequential"] - xtra = ARGS["extra"] - metrics: str = ARGS["metrics"] - utf8norm = ARGS["utf8"] - digem.MetricDictionary.LANGUAGE = ARGS["language"] - lt_api_url = ARGS["lt_api_url"] - - uses_lang_tool: bool = 'DictLT' in metrics or "DictionaryLangTool" in metrics - - if uses_lang_tool: - lt_url: str = lt_api_url if LanguageTool.DEFAULT_URL != lt_api_url else LanguageTool.DEFAULT_URL - LanguageTool.initialize(lt_url) - # go on - # basic validation - if not os.path.isdir(path_candidates): - print(f'[ERROR] input "{path_candidates}": invalid directory! exit!') - sys.exit(1) - if path_reference and not os.path.isdir(path_reference): - print(f'[ERROR] reference "{path_reference}": invalid directory! exit!') - sys.exit(1) - - # sanitize trailing slash - path_candidates = path_candidates[:-1] if path_candidates.endswith('/') else path_candidates - path_reference = path_reference[:-1] if path_reference.endswith('/') else path_reference - - # if candidates and both reference provided: do domains match? 
- if path_candidates and path_reference: - _base_can = os.path.basename(path_candidates) - _base_ref = os.path.basename(path_reference) - if _base_can != _base_ref: - print(f"[WARN ] start domains '{_base_can}' and '{_base_ref}' mismatch, summary might be inaccurate!") - - # some diagnostics - if VERBOSITY >= 2: - args = f"{path_candidates}, {path_reference}, {VERBOSITY}, {xtra}" - print(f'[DEBUG] called with {args}') - - # here we go - _main(path_candidates, path_reference, metrics, utf8norm, xtra, is_sequential=IS_SEQUENTIAL) - - if uses_lang_tool: - LanguageTool.deinitialize() + main_args = vars(parser.parse_args()) + start_evaluation(main_args) if __name__ == "__main__": diff --git a/src/digital_eval/evaluation.py b/src/digital_eval/evaluation.py index ef7f3d1..19a3758 100644 --- a/src/digital_eval/evaluation.py +++ b/src/digital_eval/evaluation.py @@ -4,35 +4,23 @@ import concurrent.futures import copy +import datetime +import math +import multiprocessing import os import re import sys +import typing import xml.dom.minidom import xml.etree.ElementTree as ET -from datetime import ( - date -) -from math import ( - floor -) -from multiprocessing import ( - cpu_count -) + from pathlib import ( Path ) -from typing import ( - List, - Tuple, -) import numpy as np -from digital_object import ( - DigitalObjectTree, - DigitalObjectLevel, - to_digital_object, -) +import digital_eval.metrics as digem PAGE_2013 = 'http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15' XML_NS = {'alto': 'http://www.loc.gov/standards/alto/ns-v3#', @@ -77,7 +65,7 @@ def get_statistics(data_points): return (the_mean, the_deviation, the_median) -def gather_candidates(start_path) -> List[EvalEntry]: +def gather_candidates(start_path) -> typing.List[EvalEntry]: candidates = [] if os.path.isdir(start_path): for curr_dir, _, files in os.walk(start_path): @@ -118,10 +106,9 @@ def match_candidates(path_candidates, path_gt_file): '''Find candidates that match groundtruth''' if not os.path.isdir(path_candidates): - raise IOError('invalid ocr result path "{}"'.format(path_candidates)) + raise IOError(f'invalid ocr result path "{path_candidates}"') if not os.path.exists(path_gt_file): - raise IOError( - 'invalid groundtruth data path "{}"'.format(path_gt_file)) + raise IOError(f'invalid groundtruth data path "{path_gt_file}"') gt_filename = os.path.basename(path_gt_file) @@ -205,7 +192,7 @@ def get_bbox_data(file_path): '''Get Bounding Box Data from given resource, if any exists''' if not os.path.exists(file_path): - raise IOError('{} not existing!'.format(file_path)) + raise IOError(f'{file_path} not existing!') # 1: inspect filename file_name = os.path.basename(file_path) @@ -266,7 +253,7 @@ def get_bbox_data(file_path): return None -def _map_alto(e: ET.Element) -> Tuple[str, int, int, int, int]: +def _map_alto(e: ET.Element) -> typing.Tuple[str, int, int, int, int]: i = e.attrib['ID'] x0 = int(e.attrib['HPOS']) y0 = int(e.attrib['VPOS']) @@ -275,25 +262,14 @@ def _map_alto(e: ET.Element) -> Tuple[str, int, int, int, int]: return (i, x0, y0, x1, y1) -def _map_page2013(elem: ET.Element) -> Tuple[str, int, int, int, int]: +def _map_page2013(elem: ET.Element) -> typing.Tuple[str, int, int, int, int]: points = elem.attrib['points'].strip().split(' ') _xs = [int(p.split(',')[0]) for p in points] _ys = [int(p.split(',')[1]) for p in points] return (NOT_SET, min(_xs), min(_ys), max(_xs), max(_ys)) -def _get_line_digos_from_digo(digo: DigitalObjectTree, lines: List[DigitalObjectTree] = None) -> List[DigitalObjectTree]: - if 
lines is None: - lines = [] - if digo.level == DigitalObjectLevel.LINE and digo.transcription: - lines.append(digo) - return lines - for child in digo.children: - _get_line_digos_from_digo(child, lines) - return lines - - -def calculate_bounding_box(elements: List[ET.Element], map_func) -> Tuple[int, int, int, int]: +def calculate_bounding_box(elements: typing.List[ET.Element], map_func) -> typing.Tuple[int, int, int, int]: """Review element's points to get points for minimum (top-left) and maximum (bottom-right)""" @@ -305,99 +281,6 @@ def calculate_bounding_box(elements: List[ET.Element], map_func) -> Tuple[int, i return ((min(all_x1), min(all_y1)), (max(all_x2), max(all_y2))) -def digital_object_to_text(file_path, frame=None, oneliner=True) -> Tuple[str | List[str], int]: - """Wrap OCR-Data Comparison""" - - try: - top_digo: DigitalObjectTree = to_digital_object(file_path) - # explicit filter frame? - if not frame: - frame = top_digo.dimensions - elif len(frame) == 2: - frame = [[frame[0][0], frame[0][1]], - [frame[1][0], frame[0][1]], - [frame[1][0], frame[1][1]], - [frame[0][0], frame[1][1]]] - frame_digo = DigitalObjectTree() - frame_digo.dimensions = frame - filter_word_pieces(frame_digo, top_digo) - the_lines = _get_line_digos_from_digo(top_digo) - if oneliner: - return top_digo.transcription, len(the_lines) - else: - return [line.transcription for line in the_lines], len(the_lines) - except xml.parsers.expat.ExpatError as _: - with open(file_path, mode='r', encoding='utf-8') as fhandle: - text_lines = fhandle.readlines() - if oneliner: - text_lines = ' '.join([l.strip() for l in text_lines]) - return text_lines, len(text_lines) - except RuntimeError as exc: - raise RuntimeError(f"{file_path}: {exc}") from exc - - -def digital_object_to_dict_text(file_path: str, frame=None, oneliner=False) -> Tuple[str | List[str], int]: - line_texts: List[str] - len_lines: int - line_texts, len_lines = digital_object_to_text(file_path=file_path, frame=frame, oneliner=False) - non_empty_lines: List[str] = [line_text for line_text in line_texts if len(line_text) > 0] - lines_sanitized_wraps: List[str] = _sanitize_wraps(non_empty_lines) - lines_sanitized_chars: List[str] = _sanitize_chars(lines_sanitized_wraps) - text = ' '.join(lines_sanitized_chars) if oneliner else lines_sanitized_chars - return text, len_lines - - -_HYPHENS: List[str] = [ - "⸗", - "-", - "—", -] - - -def _sanitize_wraps(lines: List[str]) -> List[str]: - """Sanitize word wraps if - * last word token ends with '-', "⸗" or "—" - * another line following - * following line not empty - """ - - normalized_lines: List[str] = [] - for i, line in enumerate(lines): - if i < len(lines) - 1: - for hyphen in _HYPHENS: - if line.endswith(hyphen): - next_line = lines[i + 1] - if len(next_line.strip()) == 0: - # encountered empty next line, no merge possible - continue - next_line_tokens = next_line.split() - nextline_first_token = next_line_tokens.pop(0) - # join the rest of valid next line - lines[i + 1] = ' '.join(next_line_tokens) - line = line[:-1] + nextline_first_token - break - normalized_lines.append(line) - return normalized_lines - - -def _sanitize_chars(lines: List[str]) -> List[str]: - """Replace or remove nonrelevant chars for current german word error rate""" - - sanitized: List[str] = [] - for line in lines: - text = line.strip() - bad_chars = '0123456789“„"\'?!*.;:-=[]()|' - text = ''.join([c for c in text if c not in bad_chars]) - if '..' 
in text: - text = text.replace('..', '') - if ' ' in text: - text = text.replace(' ', ' ') - text = ' '.join([t for t in text.split() if len(t) > 1]) - sanitized.append(text) - - return sanitized - - def _get_groundtruth_from_filename(file_path) -> str: _file_name = os.path.basename(file_path) result = re.match(r'.*gt.(\w{3,}).xml$', _file_name) @@ -411,36 +294,6 @@ def _get_groundtruth_from_filename(file_path) -> str: return NOT_SET -def filter_word_pieces(frame, current) -> int: - _filtered = 0 - _tmp_stack = [] - _total_stack = [] - # stack all items - _total_stack.append(current) - _tmp_stack.append(current) - while _tmp_stack: - _current: DigitalObjectTree = _tmp_stack.pop() - if _current.children: - _tmp_stack += _current.children - _total_stack += _current.children - # now pick words - _words = [_p for _p in _total_stack if _p.level == DigitalObjectLevel.WORD] - - # check for each word piece - for _word in _words: - if _word not in frame: - _filtered += 1 - _uplete(_word) - return _filtered - - -def _uplete(curr: DigitalObjectTree): - if len(curr.children) == 0 and curr.level < DigitalObjectLevel.PAGE: - _pa: DigitalObjectTree = curr.parent - _pa.remove_children(curr) - _uplete(_pa) - - def _normalize_gt_type(label) -> str: if label.startswith('art'): return 'article' @@ -505,7 +358,7 @@ def __str__(self) -> str: _val = m.value _ref = m.n_ref if _ref > 10000: - _ref_fmt = f'{(floor(float(m.n_ref) / 1000)):>2}K+' + _ref_fmt = f'{(math.floor(float(m.n_ref) / 1000)):>2}K+' else: _ref_fmt = f'{m.n_ref:>4}' _raw = f'{m.label}:{_val:>5.2f}({_ref_fmt})' @@ -519,7 +372,7 @@ def __str__(self) -> str: return ', '.join(_raws) def __repr__(self) -> str: - return '{} {}'.format(self.gt_type, self.path_c) + return f'{self.gt_type} {self.path_c}' class Evaluator: @@ -552,17 +405,17 @@ def __init__( self.evaluation_map = {} self.text_mode = extras == EVAL_EXTRA_IGNORE_GEOMETRY self.is_sequential = False - self.metrics: List = [] + self.metrics: typing.List[digem.SimilarityMetric] = [] self.evaluation_report = {} - def eval_all(self, entries: List[EvalEntry], sequential=False) -> None: + def eval_all(self, entries: typing.List[EvalEntry], sequential=False) -> None: """evaluate all pairs groundtruth-candidate""" _entries = [] if sequential or self.is_sequential: _entries = [self._wrap_eval_entry(e) for e in entries] else: - cpus = cpu_count() + cpus = multiprocessing.cpu_count() n_executors = cpus // 2 if cpus > 3 else 1 if self.verbosity == 1: print(f"[DEBUG] use {n_executors} executors ({cpus}) to create evaluation data") @@ -714,10 +567,14 @@ def eval_map(self): # re-order self.evaluation_results = sorted(self.evaluation_results, key=lambda e: e.eval_key) - def aggregate(self, by_type=False, by_metrics=[0, 1, 2, 3]): + def aggregate(self, by_type=False, by_metrics=None): + """Aggregate item's metrics for domain/directory + and/or annotated type (if present)""" # precheck - having root dir self._check_aggregate_preconditions() + if by_metrics is None: + by_metrics = [0, 1, 2, 3] root_base = Path(self.domain_reference).parts[-1] @@ -780,16 +637,16 @@ def report_stdout(evaluator: Evaluator, verbosity): results = evaluator.get_results() _path_can = evaluator.domain_candidate _path_ref = evaluator.domain_reference - evaluation_date = date.today().isoformat() + evaluation_date = datetime.date.today().isoformat() print(f'[INFO ] Evaluation Summary (candidates: "{_path_can}" vs. 
reference: "{_path_ref}" ({evaluation_date})') for result in results: (gt_type, n_total, mean_total, med, _n_refs) = result.get_defaults() - add_stats = f', std: {result.std:.2f}, median: {med:.2f}' if n_total > 1 else '' - print(f'[INFO ] "{gt_type}"\t∅: {mean_total:.2f}\t{n_total} items, {_n_refs} refs{add_stats}') + add_stats = f', std: {result.std:5.2f}, median: {med:5.2f}' if n_total > 1 else '' + print(f'[INFO ] "{gt_type}"\t∅: {mean_total:5.2f}\t{n_total: 3d} items, {_n_refs:_} refs{add_stats}') if result.cleared_result: (_, n_t2, mean2, med2, n_c2) = result.cleared_result.get_defaults() ccr_std = result.cleared_result.std drops = n_total - n_t2 if drops > 0: print( - f'[INFO ] "{gt_type}"\t∅: {mean2:.2f}\t{n_t2} items (-{drops}), {n_c2} refs, std: {ccr_std:.2f}, median: {med2:.2f}') + f'[INFO ] "{gt_type}(-{drops})"\t∅: {mean2:5.2f}\t{n_t2: 3d} items, {n_c2:_} refs, std: {ccr_std:5.2f}, median: {med2:5.2f}') diff --git a/src/digital_eval/metrics.py b/src/digital_eval/metrics.py index 0861522..c829201 100644 --- a/src/digital_eval/metrics.py +++ b/src/digital_eval/metrics.py @@ -8,6 +8,7 @@ import string import typing import unicodedata +import xml.dom.minidom import nltk import nltk.corpus as nltk_corp @@ -18,9 +19,10 @@ ) import rapidfuzz.distance.Levenshtein as rfls +import digital_object as do + from digital_eval.dictionary_metrics.common import LANGUAGE_KEY_DEFAULT from digital_eval.dictionary_metrics.language_tool.LanguageTool import LanguageTool -from digital_eval.evaluation import digital_object_to_text, digital_object_to_dict_text # Python3 standard Unicode Normalization # @@ -32,7 +34,7 @@ # usual spatium and special control sequences WHITESPACES = string.whitespace -WHITESPACES_EXCLUDING_BLANK_CHARS = WHITESPACES[1:] +WHITESPACES_EXCL_BLANK_CHARS = WHITESPACES[1:] # punctuations # @@ -64,26 +66,26 @@ # filter mechanics # # via Python3 string translation maps -WHITESPACE_TRANSLATOR = str.maketrans('', '', WHITESPACES) -WHITESPACE_EXCLUDING_BLANK_CHARS_TRANSLATOR = str.maketrans('', '', WHITESPACES_EXCLUDING_BLANK_CHARS) -PUNCT_TRANLATOR = str.maketrans('', '', PUNCTUATIONS) -DIGIT_TRANSLATOR = str.maketrans('', '', DIGITS) +WHITESPACE_TRNSL = str.maketrans('', '', WHITESPACES) +WHITESPACE_EXCL_BLANK_CHARS_TRNSL = str.maketrans('', '', WHITESPACES_EXCL_BLANK_CHARS) +PUNCT_TRNSL = str.maketrans('', '', PUNCTUATIONS) +DIGIT_TRNSL = str.maketrans('', '', DIGITS) def _filter_whitespaces(a_str) -> str: - return a_str.translate(WHITESPACE_TRANSLATOR) + return a_str.translate(WHITESPACE_TRNSL) def _filter_whitespaces_excluding_blank_chars(a_str) -> str: - return a_str.translate(WHITESPACE_EXCLUDING_BLANK_CHARS_TRANSLATOR) + return a_str.translate(WHITESPACE_EXCL_BLANK_CHARS_TRNSL) def _filter_puncts(a_str) -> str: - return a_str.translate(PUNCT_TRANLATOR) + return a_str.translate(PUNCT_TRNSL) def _filter_digits(a_str) -> str: - return a_str.translate(DIGIT_TRANSLATOR) + return a_str.translate(DIGIT_TRNSL) def _tokenize(a_str) -> typing.List[str]: @@ -109,11 +111,13 @@ def _tokenize_to_sorted_set(a_str) -> typing.Set[str]: STOPWORDS_DEFAULT = ['german', 'english', 'arabic', 'russian'] -def get_stopwords(nltk_mappings=NLTK_STOPWORDS, languages=None) -> typing.Set[str]: +def get_stopwords(nltk_mappings=None, languages=None) -> typing.Set[str]: """Helper Function to gather NLTK stopword data * ensure stopwords files are locally available * extract them as set """ + if nltk_mappings is None: + nltk_mappings = NLTK_STOPWORDS try: for mapping in nltk_mappings: 
nltk_corp.stopwords.words(mapping) @@ -155,6 +159,140 @@ def transform_string(the_content): return the_content +def digital_object_to_dict_text(file_path: str, frame=None, oneliner=False) -> typing.Tuple: + line_texts: typing.List[str] + len_lines: int + line_texts, len_lines = digital_object_to_text(file_path=file_path, frame=frame, oneliner=False) + non_empty_lines: typing.List[str] = [line_text for line_text in line_texts if len(line_text) > 0] + lines_sanitized_wraps: typing.List[str] = _sanitize_wraps(non_empty_lines) + lines_sanitized_chars: typing.List[str] = _sanitize_chars(lines_sanitized_wraps) + text = ' '.join(lines_sanitized_chars) if oneliner else lines_sanitized_chars + return text, len_lines + + +def digital_object_to_text(file_path, frame=None, oneliner=True) -> typing.Tuple: + """Wrap OCR-Data Comparison""" + + try: + top_digo: do.DigitalObjectTree = do.to_digital_object(file_path) + # explicit filter frame? + if not frame: + frame = top_digo.dimensions + elif len(frame) == 2: + frame = [[frame[0][0], frame[0][1]], + [frame[1][0], frame[0][1]], + [frame[1][0], frame[1][1]], + [frame[0][0], frame[1][1]]] + frame_digo = do.DigitalObjectTree() + frame_digo.dimensions = frame + filter_word_pieces(frame_digo, top_digo) + the_lines = _get_line_digos_from_digo(top_digo) + if oneliner: + return top_digo.transcription, len(the_lines) + else: + return [line.transcription for line in the_lines], len(the_lines) + except xml.parsers.expat.ExpatError as _: + with open(file_path, mode='r', encoding='utf-8') as fhandle: + text_lines = fhandle.readlines() + if oneliner: + text_lines = ' '.join([l.strip() for l in text_lines]) + return text_lines, len(text_lines) + except RuntimeError as exc: + raise RuntimeError(f"{file_path}: {exc}") from exc + + +def filter_word_pieces(frame, current) -> int: + _filtered = 0 + _tmp_stack = [] + _total_stack = [] + # stack all items + _total_stack.append(current) + _tmp_stack.append(current) + while _tmp_stack: + _current: do.DigitalObjectTree = _tmp_stack.pop() + if _current.children: + _tmp_stack += _current.children + _total_stack += _current.children + # now pick words + _words = [_p for _p in _total_stack if _p.level == do.DigitalObjectLevel.WORD] + + # check for each word piece + for _word in _words: + if _word not in frame: + _filtered += 1 + _uplete(_word) + return _filtered + + +def _uplete(curr: do.DigitalObjectTree): + if len(curr.children) == 0 and curr.level < do.DigitalObjectLevel.PAGE: + _pa: do.DigitalObjectTree = curr.parent + _pa.remove_children(curr) + _uplete(_pa) + + +def _get_line_digos_from_digo(digo: do.DigitalObjectTree, lines: typing.List = None) -> typing.List[do.DigitalObjectTree]: + if lines is None: + lines = [] + if digo.level == do.DigitalObjectLevel.LINE and digo.transcription: + lines.append(digo) + return lines + for child in digo.children: + _get_line_digos_from_digo(child, lines) + return lines + + +_HYPHENS: typing.List[str] = [ + "⸗", + "-", + "—", +] + + +def _sanitize_wraps(lines: typing.List[str]) -> typing.List[str]: + """Sanitize word wraps if + * last word token ends with '-', "⸗" or "—" + * another line following + * following line not empty + """ + + normalized_lines: typing.List[str] = [] + for i, line in enumerate(lines): + if i < len(lines) - 1: + for hyphen in _HYPHENS: + if line.endswith(hyphen): + next_line = lines[i + 1] + if len(next_line.strip()) == 0: + # encountered empty next line, no merge possible + continue + next_line_tokens = next_line.split() + nextline_first_token = next_line_tokens.pop(0) 
+ # join the rest of valid next line + lines[i + 1] = ' '.join(next_line_tokens) + line = line[:-1] + nextline_first_token + break + normalized_lines.append(line) + return normalized_lines + + +def _sanitize_chars(lines: typing.List[str]) -> typing.List[str]: + """Replace or remove nonrelevant chars for current german word error rate""" + + sanitized: typing.List[str] = [] + for line in lines: + text = line.strip() + bad_chars = '0123456789“„"\'?!*.;:-=[]()|' + text = ''.join([c for c in text if c not in bad_chars]) + if '..' in text: + text = text.replace('..', '') + if ' ' in text: + text = text.replace(' ', ' ') + text = ' '.join([t for t in text.split() if len(t) > 1]) + sanitized.append(text) + + return sanitized + + class DigitalEvalMetricException(Exception): """Mark Exception during validation/calculating metrics""" @@ -163,7 +301,9 @@ def __init__(self, *args: object) -> None: class SimilarityMetric: - """Basic definition of a OCRDifferenceMetric""" + """Basic definition of OCR Similarity Metric, + expressed in percent (0 - 100) + """ def __init__( self, @@ -186,9 +326,6 @@ def __init__( self._data_candidate = None self.languages = None - def norm_percentual(self): - self._value = self._value * 100 - @property def reference(self): """Reference/Groundtruth data""" @@ -319,6 +456,7 @@ def __init__(self, precision=2, normalization=UC_NORMALIZATION_NFKD, preprocessings=preprocessings, ) self._label = 'DictLT' + self.diff = 0 def _forward(self): text: str = self._data_candidate @@ -476,7 +614,7 @@ def ir_fmeasure(reference_data, candidate_data) -> float: # diacritica to take care of -_COMBINING_SMALL_E = u'\u0364' +_COMBINING_SMALL_E = '\u0364' def _normalize_vocal_ligatures(a_string) -> str: """Replace vocal ligatures, which otherwise @@ -505,4 +643,4 @@ def _normalize_vocal_ligatures(a_string) -> str: _out.append(_c) # strip all combining e's anyway - return ''.join(_out).replace(_COMBINING_SMALL_E, '') \ No newline at end of file + return ''.join(_out).replace(_COMBINING_SMALL_E, '') diff --git a/src/ocr_util/cli.py b/src/ocr_util/cli.py index f116f71..1c7b9a3 100644 --- a/src/ocr_util/cli.py +++ b/src/ocr_util/cli.py @@ -4,19 +4,17 @@ import argparse import re from pathlib import PurePath -from typing import Final -from digital_object import DigitalObject, from_digital_objects -from digital_object.filter import PolygonFrameFilterUtil, PolygonFrameFilter +import digital_object as do +import digital_object.filter as dofi # script constants - -DEFAULT_VERBOSITY: int = 0 -SUB_CMD_FRAME: Final[str] = 'frame' +DEFAULT_VERBOSITY = 0 +SUB_CMD_FRAME = 'frame' def points_type(points: str) -> str: - match: re.Match = re.match(PolygonFrameFilterUtil.POINT_LIST_PATTERN, points) + match: re.Match = re.match(dofi.PolygonFrameFilterUtil.POINT_LIST_PATTERN, points) if not match: raise argparse.ArgumentTypeError(f"Invalid point coordinates: '{points}'") return points @@ -72,13 +70,13 @@ def start() -> None: points: str = args.points if verbosity > 1: print(f"[DEBUG] args: {input_ocr_file}, {output_ocr_file}, {points}, {verbosity}") - polygon_frame_filter: PolygonFrameFilter = PolygonFrameFilter( + polygon_frame_filter: dofi.PolygonFrameFilter = dofi.PolygonFrameFilter( input_ocr_file, points, verbosity ) - piece_result: DigitalObject = polygon_frame_filter.process() - file_result: PurePath = from_digital_objects(piece_result, output_ocr_file) + piece_result: do.DigitalObjectTree = polygon_frame_filter.process() + file_result: PurePath = do.from_digital_object(piece_result, output_ocr_file) if 
verbosity > 0: print('[INFO ] file_result', file_result) diff --git a/tests/test_dict_metric.py b/tests/test_dict_metric.py index 638ca73..501af91 100644 --- a/tests/test_dict_metric.py +++ b/tests/test_dict_metric.py @@ -1,8 +1,5 @@ -from digital_eval.evaluation import ( - digital_object_to_dict_text, - digital_object_to_text -) -from digital_eval.metrics import normalize_unicode, UC_NORMALIZATION_NFKD, _normalize_vocal_ligatures +import digital_eval.metrics as digem + from .conftest import TEST_RES_DIR @@ -10,12 +7,12 @@ def test_piece_to_dict_text_alto(): alto_path = f'{TEST_RES_DIR}/dict_metric/alto.xml' # act - alto_text_no_sanit, _ = digital_object_to_text(alto_path, oneliner=True) + alto_text_no_sanit, _ = digem.digital_object_to_text(alto_path, oneliner=True) alto_words_no_sanit = alto_text_no_sanit.split() - alto_text, _ = digital_object_to_dict_text(alto_path, oneliner=True) - alto_lines, alto_num_lines = digital_object_to_dict_text(alto_path, oneliner=False) - alto_lines_norm_vocal_ligatures = [_normalize_vocal_ligatures(line) for line in alto_lines] - alto_lines_norm = [normalize_unicode(line, UC_NORMALIZATION_NFKD) for line in alto_lines_norm_vocal_ligatures] + alto_text, _ = digem.digital_object_to_dict_text(alto_path, oneliner=True) + alto_lines, alto_num_lines = digem.digital_object_to_dict_text(alto_path, oneliner=False) + alto_lines_norm_vocal_ligatures = [digem._normalize_vocal_ligatures(line) for line in alto_lines] + alto_lines_norm = [digem.normalize_unicode(line, digem.UC_NORMALIZATION_NFKD) for line in alto_lines_norm_vocal_ligatures] alto_text_norm = " ".join(alto_lines_norm) alto_words = alto_text_norm.split() @@ -31,12 +28,12 @@ def test_piece_to_dict_text_page2019(): page_path = f'{TEST_RES_DIR}/dict_metric/page2019.xml' # act - page_text_no_sanit, _ = digital_object_to_text(page_path, oneliner=True) + page_text_no_sanit, _ = digem.digital_object_to_text(page_path, oneliner=True) page_words_no_sanit = page_text_no_sanit.split() - page_text, _ = digital_object_to_dict_text(page_path, oneliner=True) - page_lines, alto_num_lines = digital_object_to_dict_text(page_path, oneliner=False) - page_lines_norm_vocal_ligatures = [_normalize_vocal_ligatures(line) for line in page_lines] - page_lines_norm = [normalize_unicode(line, UC_NORMALIZATION_NFKD) for line in page_lines_norm_vocal_ligatures] + page_text, _ = digem.digital_object_to_dict_text(page_path, oneliner=True) + page_lines, alto_num_lines = digem.digital_object_to_dict_text(page_path, oneliner=False) + page_lines_norm_vocal_ligatures = [digem._normalize_vocal_ligatures(line) for line in page_lines] + page_lines_norm = [digem.normalize_unicode(line, digem.UC_NORMALIZATION_NFKD) for line in page_lines_norm_vocal_ligatures] page_text_norm = " ".join(page_lines_norm) page_words = page_text_norm.split() @@ -52,12 +49,12 @@ def test_piece_to_dict_text_page2013(): page_path = f'{TEST_RES_DIR}/dict_metric/page2013.xml' # act - page_text_no_sanit, _ = digital_object_to_text(page_path, oneliner=True) + page_text_no_sanit, _ = digem.digital_object_to_text(page_path, oneliner=True) page_words_no_sanit = page_text_no_sanit.split() - page_text, _ = digital_object_to_dict_text(page_path, oneliner=True) - page_lines, alto_num_lines = digital_object_to_dict_text(page_path, oneliner=False) - page_lines_norm_vocal_ligatures = [_normalize_vocal_ligatures(line) for line in page_lines] - page_lines_norm = [normalize_unicode(line, UC_NORMALIZATION_NFKD) for line in page_lines_norm_vocal_ligatures] + page_text, _ = 
digem.digital_object_to_dict_text(page_path, oneliner=True) + page_lines, alto_num_lines = digem.digital_object_to_dict_text(page_path, oneliner=False) + page_lines_norm_vocal_ligatures = [digem._normalize_vocal_ligatures(line) for line in page_lines] + page_lines_norm = [digem.normalize_unicode(line, digem.UC_NORMALIZATION_NFKD) for line in page_lines_norm_vocal_ligatures] page_text_norm = " ".join(page_lines_norm) page_words = page_text_norm.split() diff --git a/tests/test_digital_eval_cli.py b/tests/test_digital_eval_cli.py index ab15336..ccbfd80 100644 --- a/tests/test_digital_eval_cli.py +++ b/tests/test_digital_eval_cli.py @@ -5,7 +5,7 @@ from pathlib import Path -import digital_eval.cli as dival +import digital_eval.cli as dig from .conftest import TEST_RES_DIR @@ -25,7 +25,7 @@ def test_mwe_cli(tmp_path, capsys): """ # arrange - dival.VERBOSITY = 1 + dig.VERBOSITY = 1 src_candidates = TEST_RES_DIR / 'candidate' / 'frk_alto' src_reference = TEST_RES_DIR / 'groundtruth' / 'page' dst_candidates = tmp_path / 'candidate' / _DOMAIN_LABEL @@ -38,16 +38,18 @@ def test_mwe_cli(tmp_path, capsys): assert _DOMAIN_LABEL == tmp_reference.name # act - _results = dival._main(dst_candidates, dst_reference, - dival.DEFAULT_OCR_METRICS, dival.DEFAULT_UTF8_NORM, None) + cli_args = {"candidates": dst_candidates, "reference": dst_reference, + "metrics": dig.DEFAULT_OCR_METRICS, + "verbosity": 1, + "utf8": dig.DEFAULT_UTF8_NORM, + "sequential": True} + eval_results = dig.start_evaluation(cli_args) # assert + assert len(eval_results) == 4 captured = capsys.readouterr().out - assert captured.startswith("[DEBUG] text normalized using 'NFC'") - assert len(captured) == 1027 std_lines = captured.split('\n') assert len(std_lines) == 11 - assert std_lines[1].startswith('[DEBUG] from "5" filtered "3" candidates') + assert std_lines[0] == "[DEBUG] text normalized using 'NFC' code points for 'Cs,Ls'" + assert str(std_lines[1]).startswith('[DEBUG] from "5" filtered "3" candidates') assert std_lines[4] == "[DEBUG] [1667522809_J_0001_0002](art) [Cs:39.20(5309), Ls:38.54(4383)(- 0.66)]" - assert len(_results) == 4 - assert _results[0] diff --git a/tests/test_ocr_evaluate.py b/tests/test_ocr_evaluate.py index 42c1051..14a3917 100644 --- a/tests/test_ocr_evaluate.py +++ b/tests/test_ocr_evaluate.py @@ -12,22 +12,16 @@ approx ) -from digital_eval.evaluation import ( - EvalEntry, - Evaluator, - match_candidates, - digital_object_to_text, - get_bbox_data, - _get_groundtruth_from_filename, -) -from digital_eval.metrics import MetricIRFM, MetricIRPre, MetricIRRec, MetricChars, SimilarityMetric +import digital_eval.evaluation as digev +import digital_eval.metrics as digem + from .conftest import ( TEST_RES_DIR ) def test_match_candidates_alto_candidate_with_coords(): - actual_matches = match_candidates(f'{TEST_RES_DIR}/candidate/frk_alto', + actual_matches = digev.match_candidates(f'{TEST_RES_DIR}/candidate/frk_alto', f'{TEST_RES_DIR}/groundtruth/alto/1667522809_J_0073_0001_375x2050_2325x9550.xml') assert f'{TEST_RES_DIR}/candidate/frk_alto/1667522809_J_0073_0001_part.xml' == actual_matches[0] @@ -35,13 +29,13 @@ def test_match_candidates_alto_candidate_with_coords(): def test_match_candidates_both_txt_files(): path_candidates = f'{TEST_RES_DIR}/candidate/txt' path_gt = f'{TEST_RES_DIR}/groundtruth/txt/1246734.gt.txt' - actual_matches = match_candidates(path_candidates, path_gt) + actual_matches = digev.match_candidates(path_candidates, path_gt) assert f'{TEST_RES_DIR}/candidate/txt/OCR-Fraktur_1246734.txt' == 
actual_matches[0] def test_match_candidates_fails_no_groundtruth(): with pytest.raises(IOError) as exc: - match_candidates( + digev.match_candidates( f'{TEST_RES_DIR}/candidate/txt', './test/sresources/txt/no_gt.txt') assert "invalid groundtruth data path" in str(exc) @@ -49,7 +43,7 @@ def test_match_candidates_fails_no_groundtruth(): def test_match_candidates_fails_no_candidates(): with pytest.raises(IOError) as exc: - match_candidates( + digev.match_candidates( './text/no_results', f'{TEST_RES_DIR}/txt/gt/1246734.txt') assert "invalid ocr result path" in str(exc) @@ -60,7 +54,7 @@ def test_match_candidates_groundtruth_txt_candidate_alto(): path_gt = f'{TEST_RES_DIR}/groundtruth/txt/217745.gt.txt' # act - actual_matches = match_candidates(path_cd, path_gt) + actual_matches = digev.match_candidates(path_cd, path_gt) # assert assert actual_matches[0] == f'{TEST_RES_DIR}/candidate/ara_alto/217745.xml' @@ -74,8 +68,8 @@ def test_piece_to_text_alto_candidate_with_coords(): p2 = (6200, 3425) # act - _as_lines, _ = digital_object_to_text(alto_path, frame=(p1, p2), oneliner=False) - _gt_type = _get_groundtruth_from_filename(alto_path) + _as_lines, _ = digem.digital_object_to_text(alto_path, frame=(p1, p2), oneliner=False) + _gt_type = digev._get_groundtruth_from_filename(alto_path) # assert assert _gt_type == 'n.a.' @@ -91,8 +85,8 @@ def test_evaluate_single_alto_candidate_with_page_groundtruth(tmp_path): eval_domain.mkdir(parents=True) gt_domain = tmp_path / 'groundtruth' / '1667522809_J_0001' gt_domain.mkdir(parents=True) - evaluator = Evaluator(eval_domain) - evaluator.metrics = [MetricChars()] + evaluator = digev.Evaluator(eval_domain) + evaluator.metrics = [digem.MetricChars()] # required for directory-like aggregation evaluator.domain_reference = gt_domain _candidate_src = os.path.join(f'{TEST_RES_DIR}/candidate/frk_alto/1667522809_J_0001_0002.xml') @@ -102,7 +96,7 @@ def test_evaluate_single_alto_candidate_with_page_groundtruth(tmp_path): shutil.copy(_gt_src, _gt_dst) # act - eval_entry = EvalEntry(str(eval_domain / '1667522809_J_0001_0002.xml')) + eval_entry = digev.EvalEntry(str(eval_domain / '1667522809_J_0001_0002.xml')) eval_entry.path_g = _gt_dst evaluator.eval_all([eval_entry], sequential=True) evaluator.aggregate(by_type=True) @@ -136,8 +130,8 @@ def test_evaluate_page_groundtruth_with_itself(tmp_path): eval_domain.mkdir(parents=True) gt_domain = tmp_path / 'groundtruth' / '1667522809_J_0001' gt_domain.mkdir(parents=True) - evaluator = Evaluator(eval_domain) - evaluator.metrics = [MetricChars()] + evaluator = digev.Evaluator(eval_domain) + evaluator.metrics = [digem.MetricChars()] evaluator.domain_reference = gt_domain _candidate_src = os.path.join(f'{TEST_RES_DIR}/groundtruth/page/1667522809_J_0001_0002.art.gt.xml') _candidate_dst = str(eval_domain / '1667522809_J_0001_0002.xml') @@ -147,7 +141,7 @@ def test_evaluate_page_groundtruth_with_itself(tmp_path): shutil.copy(_gt_src, _gt_dst) # act - eval_entry = EvalEntry(str(eval_domain / '1667522809_J_0001_0002.xml')) + eval_entry = digev.EvalEntry(str(eval_domain / '1667522809_J_0001_0002.xml')) eval_entry.path_g = _gt_dst evaluator.eval_all([eval_entry], sequential=True) evaluator.aggregate(by_type=True) @@ -188,44 +182,44 @@ def test_evaluate_set_with_5_entries(tmp_path): path_dir_gt.mkdir() path_dir_c = tmp_path / 'media' / 'jpg' / 'odem' path_dir_c.mkdir(parents=True) - evaluator = Evaluator(path_dir_c) + evaluator = digev.Evaluator(path_dir_c) evaluator.domain_reference = path_dir_gt - _metric_ca1 = MetricChars() + _metric_ca1 = 
digem.MetricChars() _metric_ca1._value = 95.70 _metric_ca1._data_reference = 't' * 810 - _metric_ca2 = MetricChars() + _metric_ca2 = digem.MetricChars() _metric_ca2._value = 96.53 _metric_ca2._data_reference = 't' * 675 - _metric_ca3 = MetricChars() + _metric_ca3 = digem.MetricChars() _metric_ca3._value = 94.91 _metric_ca3._data_reference = 't' * 1395 - _metric_ca4 = MetricChars() + _metric_ca4 = digem.MetricChars() _metric_ca4._value = 94.40 _metric_ca4._data_reference = 't' * 1466 # outlier ! - _metric_ca5 = MetricChars() + _metric_ca5 = digem.MetricChars() _metric_ca5._value = 86.44 _metric_ca5._data_reference = 't' * 1520 - _metric_ca6 = MetricChars() + _metric_ca6 = digem.MetricChars() _metric_ca6._value = 93.44 _metric_ca6._data_reference = 't' * 1520 - entry1 = EvalEntry(path_dir_c / 'eng' / 'urn+nbn+de+gbv+3+1-135654-p0403-5_eng.xml') + entry1 = digev.EvalEntry(path_dir_c / 'eng' / 'urn+nbn+de+gbv+3+1-135654-p0403-5_eng.xml') entry1.path_g = str(path_dir_gt / 'eng' / 'urn+nbn+de+gbv+3+1-135654-p0403-5_eng.gt.xml') entry1.metrics = [_metric_ca1] - entry2 = EvalEntry(path_dir_c / 'ger' / 'urn+nbn+de+gbv+3+1-816198-p0493-2_ger.xml') + entry2 = digev.EvalEntry(path_dir_c / 'ger' / 'urn+nbn+de+gbv+3+1-816198-p0493-2_ger.xml') entry2.path_g = str('/data/ocr/groundtruth/odem/ger/urn+nbn+de+gbv+3+1-816198-p0493-2_ger.gt.xml') entry2.metrics = [_metric_ca2] - entry3 = EvalEntry(path_dir_c / 'ger' / 'urn+nbn+de+gbv+3+1-818383-p0034-5_ger.xml') + entry3 = digev.EvalEntry(path_dir_c / 'ger' / 'urn+nbn+de+gbv+3+1-818383-p0034-5_ger.xml') entry3.path_g = '/data/ocr/groundtruth/odem/ger/urn+nbn+de+gbv+3+1-818383-p0034-5_ger.gt.xml' entry3.metrics = [_metric_ca3] - entry4 = EvalEntry(path_dir_c / 'ger' / 'urn+nbn+de+gbv+3+1-822479-p1119-4_ger.xml') + entry4 = digev.EvalEntry(path_dir_c / 'ger' / 'urn+nbn+de+gbv+3+1-822479-p1119-4_ger.xml') entry4.path_g = '/data/ocr/groundtruth/odem/ger/urn+nbn+de+gbv+3+1-822479-p1119-4_ger.gt.xml' entry4.metrics = [_metric_ca4] - entry5 = EvalEntry(path_dir_c / 'ger' / 'urn+nbn+de+gbv+3+1-828020-p0173-6_ger.xml') + entry5 = digev.EvalEntry(path_dir_c / 'ger' / 'urn+nbn+de+gbv+3+1-828020-p0173-6_ger.xml') entry5.path_g = '/data/ocr/groundtruth/odem/ger/urn+nbn+de+gbv+3+1-828020-p0173-6_ger.gt.xml' entry5.metrics = [_metric_ca5] - entry6 = EvalEntry(path_dir_c / 'ger' / 'urn+nbn+de+gbv+3+1-125584-p0314-6_ger.xml') + entry6 = digev.EvalEntry(path_dir_c / 'ger' / 'urn+nbn+de+gbv+3+1-125584-p0314-6_ger.xml') entry6.path_g = '/data/ocr/groundtruth/odem/ger/urn+nbn+de+gbv+3+1-125584-p0314-6_ger.gt.xml' entry6.metrics = [_metric_ca6] evaluator.evaluation_entries = [entry1, entry2, entry3, entry4, entry5, entry6] @@ -250,7 +244,7 @@ def test_no_groundtruth_at_all(tmp_path): doesn't make any sense so far """ - evaluator = Evaluator(tmp_path) + evaluator = digev.Evaluator(tmp_path) evaluator.eval_all([]) with pytest.raises(RuntimeError) as err: @@ -271,12 +265,12 @@ def test_handle_exception_invalid_literal_for_int(): # arrange path_gt = f'{TEST_RES_DIR}/groundtruth/page/urn+nbn+de+gbv+3+1-792101-p0667-5_ger.gt.xml' - eval_entry = EvalEntry('dummy_candidate') + eval_entry = digev.EvalEntry('dummy_candidate') eval_entry.path_g = path_gt # act - evaluator = Evaluator('dummy_path') - evaluator.metrics = [SimilarityMetric()] + evaluator = digev.Evaluator('dummy_path') + evaluator.metrics = [digem.SimilarityMetric()] with pytest.raises(RuntimeError) as err: evaluator.eval_entry(eval_entry) @@ -295,10 +289,10 @@ def test_handle_empty_candidate_information_retrival(): # arrange 
path_gt = f'{TEST_RES_DIR}/groundtruth/page/urn+nbn+de+gbv+3+1-138193-p0904-0_ger.gt.xml' path_cd = f'{TEST_RES_DIR}/candidate/frk_page/urn+nbn+de+gbv+3+1-138193-p0904-0_ger.xml' - eval_entry = EvalEntry(path_cd) + eval_entry = digev.EvalEntry(path_cd) eval_entry.path_g = path_gt - evaluator = Evaluator('/data') - evaluator.metrics = [MetricIRPre(), MetricIRRec(), MetricIRFM()] + evaluator = digev.Evaluator('/data') + evaluator.metrics = [digem.MetricIRPre(), digem.MetricIRRec(), digem.MetricIRFM()] evaluator.verbosity = 1 # act @@ -323,12 +317,12 @@ def test_handle_table_text_groundtruth(): # arrange path_gt = f'{TEST_RES_DIR}/groundtruth/page/urn+nbn+de+gbv+3+1-126343-p0285-7_ger.gt.xml' path_cd = f'{TEST_RES_DIR}/candidate/frk_page/urn+nbn+de+gbv+3+1-126343-p0285-7_ger.xml' - eval_entry = EvalEntry(path_cd) + eval_entry = digev.EvalEntry(path_cd) eval_entry.path_g = path_gt # act - evaluator = Evaluator('/data') - evaluator.metrics = [MetricChars()] + evaluator = digev.Evaluator('/data') + evaluator.metrics = [digem.MetricChars()] evaluator._wrap_eval_entry(eval_entry) # assert / legacy: 5.825 , actual 4.0 @@ -343,7 +337,7 @@ def test_get_box_from_empty_page(): _path_gt = f'{TEST_RES_DIR}/groundtruth/page/urn+nbn+de+gbv+3+1-201080-p0034-8_ger.gt.xml' # act - _p1, _p2 = get_bbox_data(_path_gt) + _p1, _p2 = digev.get_bbox_data(_path_gt) # assert assert _p1 == (77, 58) @@ -359,7 +353,7 @@ def test_get_box_when_line_points_messy(): _path_gt = f'{TEST_RES_DIR}/groundtruth/page/rahbar-1771946695-00000040.xml' # act - _p1, _p2 = get_bbox_data(_path_gt) + _p1, _p2 = digev.get_bbox_data(_path_gt) # assert assert _p1 == (368, 619) @@ -375,12 +369,12 @@ def test_handle_exception_invalid_alto_xml(): # arrange path_gt = f'{TEST_RES_DIR}/candidate/frk_alto/1667522809_J_0001_0256_corrupt.xml' - eval_entry = EvalEntry('dummy_candidate') + eval_entry = digev.EvalEntry('dummy_candidate') eval_entry.path_g = path_gt # act - evaluator = Evaluator('dummy_path') - evaluator.metrics = [SimilarityMetric()] + evaluator = digev.Evaluator('dummy_path') + evaluator.metrics = [digem.SimilarityMetric()] with pytest.raises(ParseError) as err: evaluator.eval_entry(eval_entry) diff --git a/tests/test_ocr_metrics.py b/tests/test_ocr_metrics.py index 096b41c..80cbc12 100644 --- a/tests/test_ocr_metrics.py +++ b/tests/test_ocr_metrics.py @@ -5,7 +5,7 @@ import pytest -import digital_eval.metrics as deme +import digital_eval.metrics as digem # default reference THE_COMBINED_A_FOX = 'the á lazy brown fox jumps over the hump' @@ -20,7 +20,7 @@ def test_metric_unicode_normalization_textual_metric(): """ # arrange - char_metric = deme.MetricChars() + char_metric = digem.MetricChars() char_metric.reference = THE_LAZY_FOX char_metric.candidate = THE_COMBINED_A_FOX @@ -32,7 +32,7 @@ def test_metric_characters_from_empty_gt(): """Total un-similarity""" # arrange - _metric = deme.MetricChars() + _metric = digem.MetricChars() # _metric.preprocessings = [_filter_whitespaces] _metric.reference = '' _metric.candidate = THE_LAZY_FOX @@ -45,7 +45,7 @@ def test_metric_letter_from_empty_gt_and_empty_candidate(): """Behavor: Similarity of empty strings""" # arrange - _metric = deme.MetricLetters() + _metric = digem.MetricLetters() _metric.reference = '' _metric.candidate = '' @@ -57,7 +57,7 @@ def test_metric_words_with_only_slight_difference(): """simple word accurracy test""" # arrange - _metric = deme.MetricWords() + _metric = digem.MetricWords() _metric.reference = THE_LAZY_FOX _metric.candidate = THE_FOX_LAZY @@ -75,7 +75,7 @@ def 
test_metric_wa_with_identical_data(): """simple word similarity for similar inputs""" # arrange - _metric = deme.MetricWords() + _metric = digem.MetricWords() _metric.reference = THE_LAZY_FOX _metric.candidate = THE_LAZY_FOX @@ -87,7 +87,7 @@ def test_metric_bow_from_reasonable_input(): """simple bag of words test""" # arrange - _metric = deme.MetricBoW() + _metric = digem.MetricBoW() _metric.reference = THE_LAZY_FOX _metric.candidate = THE_FOX_LAZY @@ -99,7 +99,7 @@ def test_metric_bow_from_empty_gt_and_empty_candidate(): """how to handle empty data - means: no errors""" # arrange - _metric = deme.MetricBoW() + _metric = digem.MetricBoW() _metric.reference = '' _metric.candidate = '' @@ -117,7 +117,7 @@ def test_bow_ocrd_similarity_rate(): """ # arrange - _metric = deme.MetricBoW() + _metric = digem.MetricBoW() _metric.reference = "der Mann steht an der Ampel" _metric.candidate = "cer Mann fteht an der Ampel" @@ -135,7 +135,7 @@ def test_bow_ocrd_spec_similarity_rate_ref_contains_more_data(): """ # arrange - _metric = deme.MetricBoW() + _metric = digem.MetricBoW() _metric.reference = "der Mann steht an der roten Ampel" _metric.candidate = "cer Mann fteht an der Ampel" @@ -153,7 +153,7 @@ def test_bow_ocrd_spec_similarity_rate_ref_contains_less_data(): """ # arrange - _metric = deme.MetricBoW() + _metric = digem.MetricBoW() _metric.reference = "der Mann steht an der Ampel" _metric.candidate = "cer Mann fteht an der schönen roten Ampel" @@ -171,7 +171,7 @@ def test_metric_character_accuracy(): str2 = 'fthe lazy brown fox jumps ouer the hump' # arrange - char_metric = deme.MetricChars() + char_metric = digem.MetricChars() char_metric.reference = str1 char_metric.candidate = str2 @@ -187,7 +187,7 @@ def test_metric_bot_ident(): random.shuffle(list2) str2 = ' '.join(list2) - result = deme.bag_of_tokens(gt1.split(), str2.split()) + result = digem.bag_of_tokens(gt1.split(), str2.split()) assert result == 1.0 assert len(gt1.split()) == len(str2.split()) @@ -201,7 +201,7 @@ def test_metric_bot_candidate_with_only_repetitions(): str2 = "the dizzy brown fox fox fox jumps" # actsert - assert 0.83 == pytest.approx(deme.bag_of_tokens(gt1.split(), str2.split()), abs=1e-2) + assert 0.83 == pytest.approx(digem.bag_of_tokens(gt1.split(), str2.split()), abs=1e-2) def test_metric_bot_miss_tokens(): @@ -211,7 +211,7 @@ def test_metric_bot_miss_tokens(): str2 = "the brown fux jumps" # acsert - assert 0.66 == pytest.approx(deme.bag_of_tokens(gt1.split(), str2.split()), abs=1e-2) + assert 0.66 == pytest.approx(digem.bag_of_tokens(gt1.split(), str2.split()), abs=1e-2) def test_ir_metric_precision_fox(): @@ -219,7 +219,7 @@ def test_ir_metric_precision_fox(): having all tokens included (minus stopwords)""" # arrange - m_prec = deme.MetricIRPre() + m_prec = digem.MetricIRPre() m_prec.reference = THE_LAZY_FOX m_prec.candidate = THE_FOX_INPUT_IR @@ -236,7 +236,7 @@ def test_ir_metric_recall_fox(): (minus stoppwords)""" # arrange - m_prec = deme.MetricIRRec() + m_prec = digem.MetricIRRec() m_prec.reference = THE_LAZY_FOX m_prec.candidate = THE_FOX_INPUT_IR @@ -256,7 +256,7 @@ def test_ir_metrics_precision_english_poor_candidate(): a rather poor candidate""" # arrange - pre = deme.MetricIRPre() + pre = digem.MetricIRPre() pre.reference = THE_LAZY_FOX pre.candidate = IR_CANDIDATE_TEXT @@ -271,7 +271,7 @@ def test_ir_metrics_recall_english_poor_candidate(): a rather poor candidate""" # arrange - rec = deme.MetricIRRec() + rec = digem.MetricIRRec() rec.reference = THE_LAZY_FOX rec.candidate = IR_CANDIDATE_TEXT @@ -284,7 
+284,7 @@ def test_ir_metrics_fmeasure_english_poor_candidate(): a rather poor candidate""" # arrange - metric_fm = deme.MetricIRFM() + metric_fm = digem.MetricIRFM() metric_fm.reference = THE_LAZY_FOX metric_fm.candidate = IR_CANDIDATE_TEXT @@ -302,7 +302,7 @@ def test_ir_metrics_precision_german(): and very nice candidate precision""" # arrange - prec = deme.MetricIRPre(languages=['german']) + prec = digem.MetricIRPre(languages=['german']) prec.reference = IR_REFERENCE_TEXT_GERMAN prec.candidate = IR_CANDIDATE_TEXT_GERMAN @@ -315,7 +315,7 @@ def test_ir_metrics_recall_german(): and very nice candidate recall""" # arrange - rec = deme.MetricIRRec(languages=['german']) + rec = digem.MetricIRRec(languages=['german']) rec.reference = IR_REFERENCE_TEXT_GERMAN rec.candidate = IR_CANDIDATE_TEXT_GERMAN @@ -328,7 +328,7 @@ def test_ir_metrics_precision_german_poor_candidate(): and rather poor candidate""" # arrange - metric_pre = deme.MetricIRPre(languages=['german']) + metric_pre = digem.MetricIRPre(languages=['german']) metric_pre.reference = IR_CANDIDATE_TEXT_GERMAN metric_pre.candidate = IR_REFERENCE_TEXT_GERMAN_POOR @@ -341,7 +341,7 @@ def test_ir_metrics_recall_german_poor_candidate(): and rather poor candidate""" # arrange - metric_rec = deme.MetricIRRec(languages=['german']) + metric_rec = digem.MetricIRRec(languages=['german']) metric_rec.reference = IR_CANDIDATE_TEXT_GERMAN metric_rec.candidate = IR_REFERENCE_TEXT_GERMAN_POOR @@ -367,7 +367,7 @@ def test_metrics_token_based_more_gt_than_tc(): cand = "faule springt Fuchs Hecke".split() # act - m_word = deme.MetricWords() + m_word = digem.MetricWords() m_word._data_reference = gt1 m_word._data_candidate = cand diff --git a/tests/test_ocr_metrics_base.py b/tests/test_ocr_metrics_base.py index 4f86d56..75baad4 100644 --- a/tests/test_ocr_metrics_base.py +++ b/tests/test_ocr_metrics_base.py @@ -5,7 +5,7 @@ import pytest -import digital_eval.metrics as deme +import digital_eval.metrics as digem # default reference THE_COMBINED_A_FOX = 'the á lazy brown fox jumps over the hump' @@ -23,11 +23,11 @@ def test_metric_unicode_normalization_happens(): # arrange raw1 = 'the á lazy brown fox jumps over the hump' raw2 = THE_COMBINED_A_FOX - norm1 = deme.normalize_unicode(raw1, uc_norm_by=deme.UC_NORMALIZATION_NFKD) - norm2 = deme.normalize_unicode(raw2, uc_norm_by=deme.UC_NORMALIZATION_NFKD) + norm1 = digem.normalize_unicode(raw1, uc_norm_by=digem.UC_NORMALIZATION_NFKD) + norm2 = digem.normalize_unicode(raw2, uc_norm_by=digem.UC_NORMALIZATION_NFKD) # act - similarity = deme.levenshtein_norm(norm1, norm2) + similarity = digem.levenshtein_norm(norm1, norm2) assert 1.0 == pytest.approx(similarity, abs=1e-6) # assert @@ -54,14 +54,14 @@ def test_metric_unicode_normalization_not_happens(): # arrange raw1 = THE_LAZY_FOX raw2 = THE_COMBINED_A_FOX - norm1_nfc = deme.normalize_unicode(raw1, uc_norm_by=deme.UC_NORMALIZATION_DEFAULT) - norm1_nfkd = deme.normalize_unicode(raw1, uc_norm_by=deme.UC_NORMALIZATION_NFKD) - norm2_nfc = deme.normalize_unicode(raw2, uc_norm_by=deme.UC_NORMALIZATION_DEFAULT) - norm2_nfkd = deme.normalize_unicode(raw2, uc_norm_by=deme.UC_NORMALIZATION_NFKD) + norm1_nfc = digem.normalize_unicode(raw1, uc_norm_by=digem.UC_NORMALIZATION_DEFAULT) + norm1_nfkd = digem.normalize_unicode(raw1, uc_norm_by=digem.UC_NORMALIZATION_NFKD) + norm2_nfc = digem.normalize_unicode(raw2, uc_norm_by=digem.UC_NORMALIZATION_DEFAULT) + norm2_nfkd = digem.normalize_unicode(raw2, uc_norm_by=digem.UC_NORMALIZATION_NFKD) # act - sim_nfc = 
deme.levenshtein_norm(norm1_nfc, norm2_nfc) - sim_nfkd = deme.levenshtein_norm(norm1_nfkd, norm2_nfkd) + sim_nfc = digem.levenshtein_norm(norm1_nfc, norm2_nfc) + sim_nfkd = digem.levenshtein_norm(norm1_nfkd, norm2_nfkd) # assert assert 0.95 == sim_nfc @@ -72,7 +72,7 @@ def test_metric_calculate_character_edit_distance(): """explore edit-distance""" str1 = 'sthe lazy brown fox jumps overthe hump' str2 = 'fthe lazy brown fox jumps ouer the hump' - distance = deme.levenshtein_norm(str1, str2) + distance = digem.levenshtein_norm(str1, str2) assert 0.923 == pytest.approx(distance, 1e-4) @@ -84,7 +84,7 @@ def test_metric_bot_ident(): random.shuffle(list2) str2 = ' '.join(list2) - similarity = deme.bag_of_tokens(gt1.split(), str2.split()) + similarity = digem.bag_of_tokens(gt1.split(), str2.split()) assert similarity == 1.0 assert len(gt1.split()) == len(str2.split()) @@ -98,7 +98,7 @@ def test_metric_bot_candidate_with_only_repetitions(): str2 = "the dizzy brown fox fox fox jumps" # actsert - assert 0.833 == pytest.approx(deme.bag_of_tokens(gt1.split(), str2.split()), 1e-3) + assert 0.833 == pytest.approx(digem.bag_of_tokens(gt1.split(), str2.split()), 1e-3) def test_metric_bot_miss_tokens(): @@ -108,7 +108,7 @@ def test_metric_bot_miss_tokens(): str2 = "the brown fux jumps" # acsert - assert 0.66 == pytest.approx(deme.bag_of_tokens(gt1.split(), str2.split()), abs=1e-2) + assert 0.66 == pytest.approx(digem.bag_of_tokens(gt1.split(), str2.split()), abs=1e-2) def test_metrics_token_based_more_gt_than_tc(): @@ -129,7 +129,7 @@ def test_metrics_token_based_more_gt_than_tc(): cand = "faule springt Fuchs Hecke".split() # act - result = deme.levenshtein_norm(gt1, cand) + result = digem.levenshtein_norm(gt1, cand) # assert assert 0.2857 == pytest.approx(result, rel=1e-4) @@ -144,7 +144,7 @@ def test_metrics_token_based_equal(): cand = "der fahle Fuchs springt über die Hecke" # act - sim = deme.levenshtein_norm(gt1.split(), cand.split()) + sim = digem.levenshtein_norm(gt1.split(), cand.split()) # assert assert 1.0 == sim @@ -159,7 +159,7 @@ def test_metrics_token_based_no_test_candidate(): gt1 = "ein Dachs springt die Hecke" # act - diff = deme.levenshtein_norm(gt1.split(), [], inverse=True) + diff = digem.levenshtein_norm(gt1.split(), [], inverse=True) # assert assert diff == 1.0
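[editor's note] Since this patch replaces the old `_main(...)` signature with `start_evaluation(parse_args: typing.Dict)`, here is a short sketch of driving the workflow without argparse. The dict keys mirror what `start()` collects and what `tests/test_digital_eval_cli.py` passes; the directory paths below are placeholders, not paths from this repository.

```python
"""Hedged sketch: programmatic use of the refactored evaluation workflow."""
import digital_eval.cli as dig

cli_args = {
    "candidates": "/data/ocr/candidates/domain_a",   # placeholder path
    "reference": "/data/ocr/groundtruth/domain_a",   # placeholder path
    "metrics": dig.DEFAULT_OCR_METRICS,              # e.g. "Cs,Ls"
    "utf8": dig.DEFAULT_UTF8_NORM,                   # e.g. "NFC"
    "verbosity": 0,
    "sequential": True,
}

# prints the summary report to stdout and returns the aggregated results
results = dig.start_evaluation(cli_args)
```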