[app][fix] div by zero for tricky data
M3ssman committed Sep 9, 2022
1 parent 0827034 commit 98a9c24
Showing 7 changed files with 184 additions and 48 deletions.
51 changes: 35 additions & 16 deletions digital_eval/evaluation.py
@@ -3,11 +3,12 @@

import os
import re
import sys

import xml.dom.minidom
import xml.etree.ElementTree as ET
-from concurrent.futures import (
-    ProcessPoolExecutor
-)
+import concurrent.futures

from datetime import (
date
)
@@ -56,6 +57,10 @@
# mark unset values as 'not available'
NOT_SET = 'n.a.'

# maximum time the evaluation may take,
# where None means "no timeout"
EVAL_TIMEOUT = None


def strip_outliers_from(data_tuples, fence_ratio=1.5):
"""Determine a data set's outliers by interquartile range (IQR)
@@ -228,6 +233,7 @@ def get_bbox_data(file_path):
# rather brute force approach
# to recognize OCR formats inside
start_token = _handle.read(128)
_frame_points = None

# switch by estimated ocr format
if 'alto' in start_token:
@@ -254,12 +260,20 @@ def get_bbox_data(file_path):
doc_root = xml.dom.minidom.parse(file_path).documentElement
name_space = doc_root.namespaceURI
root_element = ET.parse(file_path).getroot()
-_xpr_coords = f'.//{{{name_space}}}TextLine/{{{name_space}}}Coords'
-raw_elements = root_element.findall(_xpr_coords)
-if not raw_elements:
-    raise RuntimeError(f"{file_path} missing {_xpr_coords} !")
-return extract_from_geometric_data(raw_elements, _map_page2013)

+# step one: read PAGE border coords
+_xpr_page_borders = f'{{{name_space}}}Page/{{{name_space}}}Border/{{{name_space}}}Coords'
+_page_coords = root_element.findall(_xpr_page_borders)
+if len(_page_coords) > 0:
+    _frame_points = extract_from_geometric_data(_page_coords, _map_page2013)
+# step two: if possible, go for sub-part geometry
+_xpr_line_coords = f'.//{{{name_space}}}TextLine/{{{name_space}}}Coords'
+_line_coords = root_element.findall(_xpr_line_coords)
+if len(_line_coords) > 0:
+    _frame_points = extract_from_geometric_data(_line_coords, _map_page2013)
+if _frame_points:
+    return _frame_points
+else:
+    raise RuntimeError(f"{file_path} missing page/line coords!")
return None
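
For illustration: the bounding box this fallback yields is just the min/max over all point pairs. A hypothetical sketch (bbox_from_points is not part of the code base; the real work happens in extract_from_geometric_data):

    # hypothetical helper, mirrors what the new PAGE fixture below exercises
    def bbox_from_points(points: str):
        pairs = [tuple(int(v) for v in pt.split(',')) for pt in points.split()]
        xs, ys = [p[0] for p in pairs], [p[1] for p in pairs]
        return (min(xs), min(ys)), (max(xs), max(ys))

    # Border coords taken from the new test fixture:
    assert bbox_from_points("101,60 77,2497 1982,2506 2012,58") == ((77, 58), (2012, 2506))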


@@ -516,26 +530,31 @@ def __init__(self, root_candidates, verbosity=0, extras=None, to_text_func=piece
MetricPre(), MetricRec(), MetricFM()]

def eval_all(self, entries: List[EvalEntry], sequential=False) -> None:
"""remove all paths where no groundtruth exists"""
"""evaluate all pairs groundtruth-candidate"""

if sequential or self.is_sequential:
for e in entries:
if e.path_g:
try:
self.eval_entry(e)
except Exception as exc:
print(f"[WARN ] '{exc}'")
print(f"[WARN ][{e.path_g}] {exc}")
else:
cpus = cpu_count()
n_executors = cpus - 1 if cpus > 3 else 1
if self.verbosity == 1:
print(f"[DEBUG] use {n_executors} executors ({cpus}) to create evaluation data")
_entries = []
-with ProcessPoolExecutor(max_workers=n_executors) as executor:
+with concurrent.futures.ProcessPoolExecutor(max_workers=n_executors) as executor:
try:
-    _entries = list(executor.map(self._wrap_eval_entry, entries, timeout=30))
+    _entries = list(executor.map(self._wrap_eval_entry, entries, timeout=EVAL_TIMEOUT))
+except concurrent.futures.TimeoutError:
+    print(f"[ERROR] takes longer than {EVAL_TIMEOUT}s to evaluate {len(entries)} entries!")
+    sys.exit(1)
except Exception as err:
-    print(f"[WARN ] '{err}' creating evaluation data")
+    print(f"[ERROR] '{err}' creating evaluation data!")
+    sys.exit(1)
if _entries:
_not_nones = [e for e in _entries if e is not None]
if self.verbosity == 1:
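
A note on the timeout semantics, as a minimal standalone sketch (not the project's code): Executor.map counts the timeout from the original call, and concurrent.futures.TimeoutError is raised while the result iterator is consumed, here inside list(...).

    import concurrent.futures
    import time

    def _work(seconds):
        time.sleep(seconds)
        return seconds

    if __name__ == '__main__':
        with concurrent.futures.ProcessPoolExecutor(max_workers=2) as executor:
            try:
                # raises TimeoutError if results aren't ready within 1s
                results = list(executor.map(_work, [0.1, 0.2], timeout=1))
            except concurrent.futures.TimeoutError:
                print('[ERROR] evaluation timed out')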
@@ -568,7 +587,7 @@ def _wrap_eval_entry(self, entry: EvalEntry):
try:
return self.eval_entry(entry)
except Exception as exc:
print(f"[WARN ] _wrap' {exc}'")
print(f"[WARN ][{entry.path_g}] _wrap {exc}")

def eval_entry(self, entry: EvalEntry) -> EvalEntry:
"""Create evaluation entry for matching pair of
@@ -586,7 +605,7 @@ def eval_entry(self, entry: EvalEntry) -> EvalEntry:
# load ground-truth text
(gt_type, txt_gt, _) = self.to_text_func(path_g, oneliner=True)
if not txt_gt:
raise RuntimeError(f"missing gt text from {path_g}!")
print(f"[WARN ] {path_g} contains no text")

# if text mode is enforced
# forget groundtruth coordinates
17 changes: 15 additions & 2 deletions digital_eval/metrics.py
@@ -74,6 +74,8 @@ def calc(self):
First, normalize text on UTF-8 level
"""

# the reference may legitimately be empty (e.g. groundtruth
# for an image-only page), hence no guard like:
# if self.input_reference:
self.data_reference = unicodedata.normalize(UC_NORMALIZATION, self.input_reference)
self.data_candidate = unicodedata.normalize(UC_NORMALIZATION, self.input_candidate)
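
For context, a minimal sketch of why this normalization matters (assuming a composing form such as NFC; the actual value of UC_NORMALIZATION is not visible in this diff):

    import unicodedata

    precomposed = '\u00e1'   # "á" as one code point
    combining = 'a\u0301'    # "a" plus combining acute accent
    assert precomposed != combining
    assert unicodedata.normalize('NFC', combining) == precomposed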

@@ -236,7 +238,7 @@ def bag_of_tokens(reference_tokens: List[str], candidate_tokens: List[str]) -> T
n_tokens_gt = len(reference_tokens)
diff_tokens =_diff(reference_tokens, candidate_tokens)
n_tokens_missed = len(diff_tokens)
-hit_rate = 100 * (n_tokens_gt - len(diff_tokens)) / n_tokens_gt
+hit_rate = _norm(n_tokens_gt, len(diff_tokens))
_len_ref = len(reference_tokens)
return (hit_rate, n_tokens_missed, _len_ref)

@@ -303,7 +305,18 @@ def ir_fmeasure(refrence_data, candidate_data) -> Tuple:


def _norm(reference, errs, scale_by=100) -> float:
-    '''Normalize outcome based on specific reference into range 0 - 100'''
+    """
+    Normalize an outcome into the range 0 - 100
+    * if there are more differences than the reference length => 0
+    * if both the reference length and errs equal zero => 100:
+      there was nothing to find, and nothing was falsely
+      detected (i.e. no false positives for an image page)
+    * otherwise scale by the reference length
+    """
+
+    if (reference - errs) < 0:
+        return 0
+    if reference == 0 and errs == 0:
+        return 100
return scale_by * ((reference - errs) / reference)
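
The two guards above are the actual division-by-zero fix. A quick sketch of the edge cases (assuming _norm is imported from digital_eval.metrics, where this diff defines it):

    from digital_eval.metrics import _norm

    assert _norm(0, 0) == 100     # empty reference, nothing falsely detected
    assert _norm(5, 7) == 0       # more differences than reference items
    assert _norm(100, 8) == 92.0  # regular case: scaled to the reference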
35 changes: 30 additions & 5 deletions digital_eval/model.py
@@ -122,7 +122,8 @@ def transcription(self):
def transcription(self, transcription):
"""Set textual transcription representing this piece"""
_transcription = PieceTranscription()
-_transcription.text = transcription
+if transcription is not None and len(transcription.strip()) > 0:
+    _transcription.text = transcription
self._transcriptions.append(_transcription)

def __contains__(self, other_piece) -> bool:
@@ -337,31 +338,55 @@ def _read_lines_page(page_lines, parent, ns) -> List:


def __from_page_text_element(element, parent, ns) -> Piece:
"""Most basic transformation from PAGE XML textual nodes"""
"""transformation from PAGE XML textual nodes
to generic pieces with specific transkription.
If on PAGE level Word creates word pieces, and
also inspects textual contents.
catches several *bad* data flavours regarding
coordinates, points and text content
* missing Coord node
* Coord exists, but misses attribute "points"
* Coord exists, attribute "points" exists, but
contains less than 3 point-pairs, thous only
forms a line and not even a triangle
* missing TextEquiv node
* TextEquiv exists, but no Unicode child
* Unicode exists, but lacking any text content
"""
_id = element.getAttribute('id')
_type, _local = ___map_piece_type(element)
_piece = Piece(_id)
_piece.level = _type
_piece.parent = parent

# inspect geometry
_coords = [n for n in element.childNodes if n.localName == 'Coords']
if len(_coords) < 1 or 'points' not in _coords[0].attributes:
raise RuntimeError(f"{_local}@ID={_id} invalid coordinate data")
_points = _coords[0].getAttribute('points').split()
-if len(_points) < 4:
+# invariant: at least 3 points needed, otherwise polygon area == zero
+if len(_points) < 3:
raise RuntimeError(f"{_local}@ID={_id} way too few points {_points}")
_piece.dimensions = [[int(_point.split(',')[0]),int(_point.split(',')[1])]
for _point in _points]

# pick text if on word level
if _type == PieceLevel.WORD:
_txt_eqs = [n for n in element.childNodes if n.localName == 'TextEquiv']
if len(_txt_eqs) != 1:
raise RuntimeError(f"{_local}@ID={_id} invalid txt node {_txt_eqs}")
-_content = _txt_eqs[0].getElementsByTagName(ns+'Unicode')[0].firstChild.nodeValue
+_first_unicode = _txt_eqs[0].getElementsByTagName(ns+'Unicode')[0]
+if not _first_unicode.firstChild:
+    raise RuntimeError(f"{_local}@ID={_id} empty unicode node!")
+_content = _first_unicode.firstChild.nodeValue
if not _content or not _content.strip():
raise RuntimeError(f"{_local}@ID={_id} invalid txt content!")
# only add content when not top-level piece
_piece.transcription = _content

return _piece
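
To illustrate one of the *bad* flavours listed above, a hypothetical probe (element names follow this diff; the snippet is not part of the test suite): a Word whose Coords holds only two point pairs fails the new three-point invariant.

    import xml.dom.minidom

    # two point pairs only: a line segment, no polygon area
    bad_word = xml.dom.minidom.parseString(
        '<Word id="w1">'
        '<Coords points="1,1 2,2"/>'
        '<TextEquiv><Unicode>ab</Unicode></TextEquiv>'
        '</Word>').documentElement
    points = bad_word.getElementsByTagName('Coords')[0].getAttribute('points').split()
    assert len(points) < 3  # __from_page_text_element raises RuntimeError here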


@@ -0,0 +1,37 @@
<?xml version="1.0" encoding="UTF-8"?>
<PcGts xmlns="http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" pcGtsId="PAGE_01" xsi:schemaLocation="http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15 http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15/pagecontent.xsd">
<Metadata>
<Creator>OCR-D/core 2.26.0</Creator>
<Created>2021-12-03T22:38:20</Created>
<LastChange>2021-12-03T22:38:20</LastChange>
<Comments/>
</Metadata>
<Page imageFilename="MAX/urn+nbn+de+gbv+3+1-201080_urn+nbn+de+gbv+3+1-201080-p0034-8_ger.png" imageHeight="2598" imageWidth="2104" type="content">
<AlternativeImage comments="binarized" filename="OCR-D-BINPAGE/OCR-D-BINPAGE_01-BIN_sauvola-ms-split.png"/>
<AlternativeImage comments="binarized,cropped" filename="OCR-D-SEG-PAGE-ANYOCR/OCR-D-SEG-PAGE-ANYOCR_01.IMG-CROP.png"/>
<AlternativeImage comments="binarized,cropped,despeckled" filename="OCR-D-DENOISE-OCROPY/OCR-D-DENOISE-OCROPY_01.IMG-DESPECK.png"/>
<AlternativeImage comments="binarized,cropped,despeckled,deskewed" filename="OCR-D-DESKEW-OCROPY/OCR-D-DESKEW-OCROPY_01.IMG-DESKEW.png"/>
<AlternativeImage comments="binarized,cropped,despeckled,deskewed,recropped,binarized,clipped" filename="OCR-D-SEG-BLOCK-TESSERACT/OCR-D-SEG-BLOCK-TESSERACT_01.IMG-BIN.png"/>
<Border>
<Coords points="101,60 77,2497 1982,2506 2012,58"/>
</Border>
<ReadingOrder>
<OrderedGroup id="reading-order">
<RegionRefIndexed index="0" regionRef="region0003"/>
</OrderedGroup>
</ReadingOrder>
<ImageRegion id="region0003" orientation="0.0">
<Coords points="147,59 147,2497 1982,2506 2012,58"/>
</ImageRegion>
<SeparatorRegion id="region0000">
<Coords points="2008,319 1995,319 1995,1083 1999,1083"/>
</SeparatorRegion>
<SeparatorRegion id="region0001">
<Coords points="1533,2475 1930,2475 1930,2492 1533,2492"/>
</SeparatorRegion>
<SeparatorRegion id="region0002">
<Coords points="193,355 206,355 206,915 193,915"/>
</SeparatorRegion>
</Page>
</PcGts>

This file was deleted.

38 changes: 15 additions & 23 deletions tests/test_ocr_evaluate.py
@@ -23,6 +23,7 @@
ocr_to_text,
piece_to_text,
filter_word_pieces,
get_bbox_data,
)

from digital_eval.model import (
@@ -558,29 +559,6 @@ def test_handle_empty_candidate():
assert eval_entry.metrics[6].value == 0.0


-def test_handle_exception_min_empty_slice():
-    """Handle evaluation exception:
-    min() arg is an empty sequence
-    results from empty GT data
-    """
-
-    # arrange
-    path_gt = f'{TEST_RES_DIR}/groundtruth/page/urn+nbn+de+gbv+3+1-792620-p1008-8_ger.gt.xml'
-    eval_entry = EvalEntry('dummy_candidate')
-    eval_entry.path_g = path_gt
-
-    # act
-    evaluator = Evaluator('/data')
-    with pytest.raises(RuntimeError) as err:
-        evaluator.eval_entry(eval_entry)
-
-    # assert
-    # split error message and check specific tokens
-    assert 'urn+nbn+de+gbv+3+1-792620-p1008-8_ger.gt.xml' in err.value.args[0]
-    assert ' missing ' in err.value.args[0]
-    assert 'TextLine' in err.value.args[0]

def test_handle_table_text_groundtruth():
"""Handle evaluation exception:
missing gt text from urn+nbn+de+gbv+3+1-126343-p0285-7_ger.gt.xml
@@ -599,3 +577,17 @@ def test_handle_table_text_groundtruth():
# assert / legacy: 5.825 , actual 6.008
_result_cca = eval_entry.metrics[0].value
assert _result_cca > 5.7 and _result_cca < 6.1


def test_get_box_from_empty_page():
"""How to deal with empty PAGE"""

# arrange
_path_gt = f'{TEST_RES_DIR}/groundtruth/page/urn+nbn+de+gbv+3+1-201080-p0034-8_ger.gt.xml'

# act
_p1, _p2 = get_bbox_data(_path_gt)

# assert
assert _p1 == (77, 58)
assert _p2 == (2012, 2506)
53 changes: 52 additions & 1 deletion tests/test_ocr_metrics.py
@@ -8,6 +8,9 @@
import pytest

from digital_eval.metrics import (
MetricBoW,
MetricCA,
MetricLA,
character_accuracy,
bag_of_tokens,
ir_fmeasure,
@@ -17,7 +20,7 @@
token_based,
)

-def test_metric_normalization():
+def test_metric_unicode_normalization():
"""Normalization required
raw1 has "á" as {U+00E0}
str2 has "á" as {U+0061}+{U+0301}
@@ -53,6 +56,54 @@ def test_metric_calculate_correctness():
assert 92.10 == pytest.approx(ccr, 0.001)


def test_metric_characters_from_empty_gt():
"""What happens when there's something reported"""

# arrange
_metric = MetricCA()
_metric.input_reference = ''
_metric.input_candidate = 'fthe lazy brown fox jumps ouer the hump'

# act
_metric.calc()

# assert
assert 39 == _metric.diff
assert 0 == _metric.value


def test_metric_letter_from_empty_gt_and_empty_candidate():
"""explore edit-distance"""

# arrange
_metric = MetricLA()
_metric.input_reference = ''
_metric.input_candidate = ''

# act
_metric.calc()

# assert
assert 0 == _metric.diff
assert 100 == _metric.value


def test_metric_bow_from_empty_gt_and_empty_candidate():
"""explore edit-distance"""

# arrange
_metric = MetricBoW()
_metric.input_reference = ''
_metric.input_candidate = ''

# act
_metric.calc()

# assert
assert 0 == _metric.diff
assert 100 == _metric.value


def test_metric_bot_ident():
"""BOW with identical tokens"""

