[app][fix] div by zero for tricky data
M3ssman committed Sep 9, 2022
1 parent 0827034 commit 98a9c24
Showing 7 changed files with 184 additions and 48 deletions.
51 changes: 35 additions & 16 deletions digital_eval/evaluation.py
@@ -3,11 +3,12 @@

import os
import re
import sys

import xml.dom.minidom
import xml.etree.ElementTree as ET
-from concurrent.futures import (
-    ProcessPoolExecutor
-)
+import concurrent.futures

from datetime import (
date
)
@@ -56,6 +57,10 @@
# mark unset values as 'not available'
NOT_SET = 'n.a.'

# maximum time the evaluation may take,
# where None means "no timeout"
EVAL_TIMEOUT = None


def strip_outliers_from(data_tuples, fence_ratio=1.5):
"""Determine a data set's outliers by interquartile range (IQR)
@@ -228,6 +233,7 @@ def get_bbox_data(file_path):
# rather brute force approach
# to recognize OCR formats inside
start_token = _handle.read(128)
_frame_points = None

# switch by estimated ocr format
if 'alto' in start_token:
@@ -254,12 +260,20 @@ def get_bbox_data(file_path):
doc_root = xml.dom.minidom.parse(file_path).documentElement
name_space = doc_root.namespaceURI
root_element = ET.parse(file_path).getroot()
-_xpr_coords = f'.//{{{name_space}}}TextLine/{{{name_space}}}Coords'
-raw_elements = root_element.findall(_xpr_coords)
-if not raw_elements:
-    raise RuntimeError(f"{file_path} missing {_xpr_coords} !")
-return extract_from_geometric_data(raw_elements, _map_page2013)

+# step one: read PAGE border coords
+_xpr_page_borders = f'{{{name_space}}}Page/{{{name_space}}}Border/{{{name_space}}}Coords'
+_page_coords = root_element.findall(_xpr_page_borders)
+if len(_page_coords) > 0:
+    _frame_points = extract_from_geometric_data(_page_coords, _map_page2013)
+# step two: if possible, go for sub-part geometry
+_xpr_line_coords = f'.//{{{name_space}}}TextLine/{{{name_space}}}Coords'
+_line_coords = root_element.findall(_xpr_line_coords)
+if len(_line_coords) > 0:
+    _frame_points = extract_from_geometric_data(_line_coords, _map_page2013)
+if _frame_points:
+    return _frame_points
+else:
+    raise RuntimeError(f"{file_path} missing page/line coords!")
return None
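
For illustration: the bounding box this fallback yields is just the min/max over all point pairs. A hypothetical sketch (bbox_from_points is not part of the code base; the real work happens in extract_from_geometric_data):

    # hypothetical helper, mirrors what the new PAGE fixture below exercises
    def bbox_from_points(points: str):
        pairs = [tuple(int(v) for v in pt.split(',')) for pt in points.split()]
        xs, ys = [p[0] for p in pairs], [p[1] for p in pairs]
        return (min(xs), min(ys)), (max(xs), max(ys))

    # Border coords taken from the new test fixture:
    assert bbox_from_points("101,60 77,2497 1982,2506 2012,58") == ((77, 58), (2012, 2506))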


@@ -516,26 +530,31 @@ def __init__(self, root_candidates, verbosity=0, extras=None, to_text_func=piece
MetricPre(), MetricRec(), MetricFM()]

def eval_all(self, entries: List[EvalEntry], sequential=False) -> None:
"""remove all paths where no groundtruth exists"""
"""evaluate all pairs groundtruth-candidate"""

if sequential or self.is_sequential:
for e in entries:
if e.path_g:
try:
self.eval_entry(e)
except Exception as exc:
print(f"[WARN ] '{exc}'")
print(f"[WARN ][{e.path_g}] {exc}")
else:
cpus = cpu_count()
n_executors = cpus - 1 if cpus > 3 else 1
if self.verbosity == 1:
print(f"[DEBUG] use {n_executors} executors ({cpus}) to create evaluation data")
_entries = []
-with ProcessPoolExecutor(max_workers=n_executors) as executor:
+with concurrent.futures.ProcessPoolExecutor(max_workers=n_executors) as executor:
try:
-    _entries = list(executor.map(self._wrap_eval_entry, entries, timeout=30))
+    _entries = list(executor.map(self._wrap_eval_entry, entries, timeout=EVAL_TIMEOUT))
+except concurrent.futures.TimeoutError:
+    print(f"[ERROR] takes longer than {EVAL_TIMEOUT}s to evaluate {len(entries)} entries!")
+    sys.exit(1)
except Exception as err:
-    print(f"[WARN ] '{err}' creating evaluation data")
+    print(f"[ERROR] '{err}' creating evaluation data!")
+    sys.exit(1)
if _entries:
_not_nones = [e for e in _entries if e is not None]
if self.verbosity == 1:
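
A note on the timeout semantics, as a minimal standalone sketch (not the project's code): Executor.map counts the timeout from the original call, and concurrent.futures.TimeoutError is raised while the result iterator is consumed, here inside list(...).

    import concurrent.futures
    import time

    def _work(seconds):
        time.sleep(seconds)
        return seconds

    if __name__ == '__main__':
        with concurrent.futures.ProcessPoolExecutor(max_workers=2) as executor:
            try:
                # raises TimeoutError if results aren't ready within 1s
                results = list(executor.map(_work, [0.1, 0.2], timeout=1))
            except concurrent.futures.TimeoutError:
                print('[ERROR] evaluation timed out')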
@@ -568,7 +587,7 @@ def _wrap_eval_entry(self, entry: EvalEntry):
try:
return self.eval_entry(entry)
except Exception as exc:
print(f"[WARN ] _wrap' {exc}'")
print(f"[WARN ][{entry.path_g}] _wrap {exc}")

def eval_entry(self, entry: EvalEntry) -> EvalEntry:
"""Create evaluation entry for matching pair of
@@ -586,7 +605,7 @@ def eval_entry(self, entry: EvalEntry) -> EvalEntry:
# load ground-truth text
(gt_type, txt_gt, _) = self.to_text_func(path_g, oneliner=True)
if not txt_gt:
raise RuntimeError(f"missing gt text from {path_g}!")
print(f"[WARN ] {path_g} contains no text")

# if text mode is enforced
# forget groundtruth coordinates
17 changes: 15 additions & 2 deletions digital_eval/metrics.py
@@ -74,6 +74,8 @@ def calc(self):
First, normalize text on UTF-8 level
"""

# the reference may legitimately be empty (e.g. groundtruth
# for an image-only page), hence no guard like:
# if self.input_reference:
self.data_reference = unicodedata.normalize(UC_NORMALIZATION, self.input_reference)
self.data_candidate = unicodedata.normalize(UC_NORMALIZATION, self.input_candidate)
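
For context, a minimal sketch of why this normalization matters (assuming a composing form such as NFC; the actual value of UC_NORMALIZATION is not visible in this diff):

    import unicodedata

    precomposed = '\u00e1'   # "á" as one code point
    combining = 'a\u0301'    # "a" plus combining acute accent
    assert precomposed != combining
    assert unicodedata.normalize('NFC', combining) == precomposed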

@@ -236,7 +238,7 @@ def bag_of_tokens(reference_tokens: List[str], candidate_tokens: List[str]) -> T
n_tokens_gt = len(reference_tokens)
diff_tokens =_diff(reference_tokens, candidate_tokens)
n_tokens_missed = len(diff_tokens)
-hit_rate = 100 * (n_tokens_gt - len(diff_tokens)) / n_tokens_gt
+hit_rate = _norm(n_tokens_gt, len(diff_tokens))
_len_ref = len(reference_tokens)
return (hit_rate, n_tokens_missed, _len_ref)

@@ -303,7 +305,18 @@ def ir_fmeasure(refrence_data, candidate_data) -> Tuple:


def _norm(reference, errs, scale_by=100) -> float:
-    '''Normalize outcome based on specific reference into range 0 - 100'''
+    """
+    Normalize an outcome into the range 0 - 100
+    * if there are more differences than the reference length => 0
+    * if both the reference length and errs equal zero => 100:
+      there was nothing to find, and nothing was falsely
+      detected (i.e. no false positives for an image page)
+    * otherwise scale by the reference length
+    """
+
+    if (reference - errs) < 0:
+        return 0
+    if reference == 0 and errs == 0:
+        return 100
return scale_by * ((reference - errs) / reference)
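
The two guards above are the actual division-by-zero fix. A quick sketch of the edge cases (assuming _norm is imported from digital_eval.metrics, where this diff defines it):

    from digital_eval.metrics import _norm

    assert _norm(0, 0) == 100     # empty reference, nothing falsely detected
    assert _norm(5, 7) == 0       # more differences than reference items
    assert _norm(100, 8) == 92.0  # regular case: scaled to the reference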
35 changes: 30 additions & 5 deletions digital_eval/model.py
@@ -122,7 +122,8 @@ def transcription(self):
def transcription(self, transcription):
"""Set textual transcription representing this piece"""
_transcription = PieceTranscription()
-_transcription.text = transcription
+if transcription is not None and len(transcription.strip()) > 0:
+    _transcription.text = transcription
self._transcriptions.append(_transcription)

def __contains__(self, other_piece) -> bool:
@@ -337,31 +338,55 @@ def _read_lines_page(page_lines, parent, ns) -> List:


def __from_page_text_element(element, parent, ns) -> Piece:
"""Most basic transformation from PAGE XML textual nodes"""
"""transformation from PAGE XML textual nodes
to generic pieces with specific transkription.
If on PAGE level Word creates word pieces, and
also inspects textual contents.
catches several *bad* data flavours regarding
coordinates, points and text content
* missing Coord node
* Coord exists, but misses attribute "points"
* Coord exists, attribute "points" exists, but
contains less than 3 point-pairs, thous only
forms a line and not even a triangle
* missing TextEquiv node
* TextEquiv exists, but no Unicode child
* Unicode exists, but lacking any text content
"""
_id = element.getAttribute('id')
_type, _local = ___map_piece_type(element)
_piece = Piece(_id)
_piece.level = _type
_piece.parent = parent

# inspect geometry
_coords = [n for n in element.childNodes if n.localName == 'Coords']
if len(_coords) < 1 or 'points' not in _coords[0].attributes:
raise RuntimeError(f"{_local}@ID={_id} invalid coordinate data")
_points = _coords[0].getAttribute('points').split()
-if len(_points) < 4:
+# invariant: at least 3 points needed, otherwise polygon area == zero
+if len(_points) < 3:
raise RuntimeError(f"{_local}@ID={_id} way too few points {_points}")
_piece.dimensions = [[int(_point.split(',')[0]),int(_point.split(',')[1])]
for _point in _points]

# pick text if on word level
if _type == PieceLevel.WORD:
_txt_eqs = [n for n in element.childNodes if n.localName == 'TextEquiv']
if len(_txt_eqs) != 1:
raise RuntimeError(f"{_local}@ID={_id} invalid txt node {_txt_eqs}")
-_content = _txt_eqs[0].getElementsByTagName(ns+'Unicode')[0].firstChild.nodeValue
+_first_unicode = _txt_eqs[0].getElementsByTagName(ns+'Unicode')[0]
+if not _first_unicode.firstChild:
+    raise RuntimeError(f"{_local}@ID={_id} empty unicode node!")
+_content = _first_unicode.firstChild.nodeValue
if not _content or not _content.strip():
raise RuntimeError(f"{_local}@ID={_id} invalid txt content!")
# only add content when not top-level piece
_piece.transcription = _content

return _piece
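
To illustrate one of the *bad* flavours listed above, a hypothetical probe (element names follow this diff; the snippet is not part of the test suite): a Word whose Coords holds only two point pairs fails the new three-point invariant.

    import xml.dom.minidom

    # two point pairs only: a line segment, no polygon area
    bad_word = xml.dom.minidom.parseString(
        '<Word id="w1">'
        '<Coords points="1,1 2,2"/>'
        '<TextEquiv><Unicode>ab</Unicode></TextEquiv>'
        '</Word>').documentElement
    points = bad_word.getElementsByTagName('Coords')[0].getAttribute('points').split()
    assert len(points) < 3  # __from_page_text_element raises RuntimeError here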


@@ -0,0 +1,37 @@
<?xml version="1.0" encoding="UTF-8"?>
<PcGts xmlns="http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" pcGtsId="PAGE_01" xsi:schemaLocation="http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15 http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15/pagecontent.xsd">
<Metadata>
<Creator>OCR-D/core 2.26.0</Creator>
<Created>2021-12-03T22:38:20</Created>
<LastChange>2021-12-03T22:38:20</LastChange>
<Comments/>
</Metadata>
<Page imageFilename="MAX/urn+nbn+de+gbv+3+1-201080_urn+nbn+de+gbv+3+1-201080-p0034-8_ger.png" imageHeight="2598" imageWidth="2104" type="content">
<AlternativeImage comments="binarized" filename="OCR-D-BINPAGE/OCR-D-BINPAGE_01-BIN_sauvola-ms-split.png"/>
<AlternativeImage comments="binarized,cropped" filename="OCR-D-SEG-PAGE-ANYOCR/OCR-D-SEG-PAGE-ANYOCR_01.IMG-CROP.png"/>
<AlternativeImage comments="binarized,cropped,despeckled" filename="OCR-D-DENOISE-OCROPY/OCR-D-DENOISE-OCROPY_01.IMG-DESPECK.png"/>
<AlternativeImage comments="binarized,cropped,despeckled,deskewed" filename="OCR-D-DESKEW-OCROPY/OCR-D-DESKEW-OCROPY_01.IMG-DESKEW.png"/>
<AlternativeImage comments="binarized,cropped,despeckled,deskewed,recropped,binarized,clipped" filename="OCR-D-SEG-BLOCK-TESSERACT/OCR-D-SEG-BLOCK-TESSERACT_01.IMG-BIN.png"/>
<Border>
<Coords points="101,60 77,2497 1982,2506 2012,58"/>
</Border>
<ReadingOrder>
<OrderedGroup id="reading-order">
<RegionRefIndexed index="0" regionRef="region0003"/>
</OrderedGroup>
</ReadingOrder>
<ImageRegion id="region0003" orientation="0.0">
<Coords points="147,59 147,2497 1982,2506 2012,58"/>
</ImageRegion>
<SeparatorRegion id="region0000">
<Coords points="2008,319 1995,319 1995,1083 1999,1083"/>
</SeparatorRegion>
<SeparatorRegion id="region0001">
<Coords points="1533,2475 1930,2475 1930,2492 1533,2492"/>
</SeparatorRegion>
<SeparatorRegion id="region0002">
<Coords points="193,355 206,355 206,915 193,915"/>
</SeparatorRegion>
</Page>
</PcGts>

This file was deleted.

38 changes: 15 additions & 23 deletions tests/test_ocr_evaluate.py
@@ -23,6 +23,7 @@
ocr_to_text,
piece_to_text,
filter_word_pieces,
get_bbox_data,
)

from digital_eval.model import (
@@ -558,29 +559,6 @@ def test_handle_empty_candidate():
assert eval_entry.metrics[6].value == 0.0


-def test_handle_exception_min_empty_slice():
-    """Handle evaluation exception:
-    min() arg is an empty sequence
-    results from empty GT data
-    """
-
-    # arrange
-    path_gt = f'{TEST_RES_DIR}/groundtruth/page/urn+nbn+de+gbv+3+1-792620-p1008-8_ger.gt.xml'
-    eval_entry = EvalEntry('dummy_candidate')
-    eval_entry.path_g = path_gt
-
-    # act
-    evaluator = Evaluator('/data')
-    with pytest.raises(RuntimeError) as err:
-        evaluator.eval_entry(eval_entry)
-
-    # assert
-    # split error message and check specific tokens
-    assert 'urn+nbn+de+gbv+3+1-792620-p1008-8_ger.gt.xml' in err.value.args[0]
-    assert ' missing ' in err.value.args[0]
-    assert 'TextLine' in err.value.args[0]

def test_handle_table_text_groundtruth():
"""Handle evaluation exception:
missing gt text from urn+nbn+de+gbv+3+1-126343-p0285-7_ger.gt.xml
@@ -599,3 +577,17 @@ def test_handle_table_text_groundtruth():
# assert / legacy: 5.825 , actual 6.008
_result_cca = eval_entry.metrics[0].value
assert _result_cca > 5.7 and _result_cca < 6.1


def test_get_box_from_empty_page():
"""How to deal with empty PAGE"""

# arrange
_path_gt = f'{TEST_RES_DIR}/groundtruth/page/urn+nbn+de+gbv+3+1-201080-p0034-8_ger.gt.xml'

# act
_p1, _p2 = get_bbox_data(_path_gt)

# assert
assert _p1 == (77, 58)
assert _p2 == (2012, 2506)
53 changes: 52 additions & 1 deletion tests/test_ocr_metrics.py
@@ -8,6 +8,9 @@
import pytest

from digital_eval.metrics import (
MetricBoW,
MetricCA,
MetricLA,
character_accuracy,
bag_of_tokens,
ir_fmeasure,
@@ -17,7 +20,7 @@
token_based,
)

-def test_metric_normalization():
+def test_metric_unicode_normalization():
"""Normalization required
raw1 has "á" as {U+00E0}
str2 has "á" as {U+0061}+{U+0301}
@@ -53,6 +56,54 @@ def test_metric_calculate_correctness():
assert 92.10 == pytest.approx(ccr, 0.001)


def test_metric_characters_from_empty_gt():
"""What happens when there's something reported"""

# arrange
_metric = MetricCA()
_metric.input_reference = ''
_metric.input_candidate = 'fthe lazy brown fox jumps ouer the hump'

# act
_metric.calc()

# assert
assert 39 == _metric.diff
assert 0 == _metric.value


def test_metric_letter_from_empty_gt_and_empty_candidate():
"""explore edit-distance"""

# arrange
_metric = MetricLA()
_metric.input_reference = ''
_metric.input_candidate = ''

# act
_metric.calc()

# assert
assert 0 == _metric.diff
assert 100 == _metric.value


def test_metric_bow_from_empty_gt_and_empty_candidate():
"""explore edit-distance"""

# arrange
_metric = MetricBoW()
_metric.input_reference = ''
_metric.input_candidate = ''

# act
_metric.calc()

# assert
assert 0 == _metric.diff
assert 100 == _metric.value


def test_metric_bot_ident():
"""BOW with identical tokens"""

