diff --git a/src/digital_eval/evaluation.py b/src/digital_eval/evaluation.py
index e35dbdd..ed7b1b4 100644
--- a/src/digital_eval/evaluation.py
+++ b/src/digital_eval/evaluation.py
@@ -16,7 +16,7 @@
     floor
 )
 from multiprocessing import (
-    cpu_count, Queue, Lock, Manager, Array
+    cpu_count
 )
 from pathlib import (
     Path
@@ -285,6 +285,17 @@ def _map_page2013(elem: ET.Element) -> Tuple[str, int, int, int, int]:
     return (NOT_SET, min(_xs), min(_ys), max(_xs), max(_ys))
 
 
+def _get_line_pieces_from_piece(piece: Piece, lines: List[Piece] = None) -> List[Piece]:
+    if lines is None:
+        lines = []
+    if piece.level == PieceLevel.LINE and piece.transcription:
+        lines.append(piece)
+        return lines
+    for child in piece.pieces:
+        _get_line_pieces_from_piece(child, lines)
+    return lines
+
+
 def calculate_bounding_box(elements: List[ET.Element], map_func) -> Tuple[int, int, int, int]:
     """Review element's points to get
     points for minimum (top-left) and maximum (bottom-right)"""
@@ -346,11 +357,7 @@ def piece_to_text(file_path, frame=None, oneliner=True) -> Tuple[str | List[str]
         frame_piece = Piece()
         frame_piece.dimensions = frame
         filter_word_pieces(frame_piece, top_piece)
-    the_lines = [l
-                 for r in top_piece.pieces
-                 for l in r.pieces
-                 if l.transcription and l.level == PieceLevel.LINE]
-    # print('!!!!!!!!!', oneliner, len(top_piece.transcription), frame, )
+    the_lines = _get_line_pieces_from_piece(top_piece)
     if oneliner:
         return top_piece.transcription, len(the_lines)
     else:
@@ -583,7 +590,7 @@ def __init__(
         self.evaluation_map = {}
         self.text_mode = extras == EVAL_EXTRA_IGNORE_GEOMETRY
         self.is_sequential = False
-        self.metrics = []
+        self.metrics: List = []
         self.evaluation_report = {}
 
     def eval_all(self, entries: List[EvalEntry], sequential=False) -> None:
@@ -636,6 +643,7 @@ def eval_entry(self, entry: EvalEntry) -> EvalEntry:
 
         # evaluate metric copies
         _current_metrics = []
+
        for _m in self.metrics:
            path_g = entry.path_g
 
@@ -661,6 +669,7 @@
 
            # read candidate data as text
            (txt_c, _) = to_text_func(path_c, coords, oneliner=True)
+
            if not txt_c:
                print(f"[WARN ] candidate '{path_c}' contains no text")
 
@@ -720,7 +729,7 @@ def eval_map(self):
 
        # if more than one single evaluation item
        # calculate additional statistics to reflect
-        # impact of outlying data sets 
+        # impact of outlying data sets
        # take CA and number of GT into account
        # also calculate statistics (mean, std)
        if len(data_points) > 1:
@@ -753,13 +762,13 @@ def aggregate(self, by_type=False, by_metrics=[0, 1, 2, 3]):
        # aggregate on each directory
        for _metrics_index in by_metrics:
            for ee in self.evaluation_entries:
-                # if we do not have all these different metrics set, 
+                # if we do not have all these different metrics set,
                # do of course not aggregate by non-existing index!
                if _metrics_index >= len(self.evaluation_entries[0].metrics):
                    continue
                path_key = f"{ee.metrics[_metrics_index].label}@{root_base}"
                # ATTENZIONE! works only when forehand
-                # the *real* attribute has been accessed 
+                # the *real* attribute has been accessed
                # *at least one time*
                # kept this way for testing reasons
                metric_value = ee.metrics[_metrics_index].value
diff --git a/src/digital_eval/metrics.py b/src/digital_eval/metrics.py
index 9fa2b4e..8bc8d42 100644
--- a/src/digital_eval/metrics.py
+++ b/src/digital_eval/metrics.py
@@ -410,7 +410,6 @@ def __init__(self, precision=2, normalization=UC_NORMALIZATION_NFKD, calc_func=a
        self._label = 'DictLT'
 
    def _forward(self):
-        # print('#####', len(self._data_reference), len(self._data_candidate))
        text: str = self._data_candidate
        text_list: List[str] = self._data_candidate.split()
        self._data_reference = text_list
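
Note (reviewer sketch, not part of the patch): the new _get_line_pieces_from_piece helper replaces the fixed two-level region-to-line comprehension in piece_to_text with a recursion that finds transcribed lines at any nesting depth. The sketch below isolates that traversal so it can be run and tested on its own; the helper body is copied from the patch, while the Piece and PieceLevel classes are simplified stand-ins assumed for illustration, not the real digital_eval implementations.

    from dataclasses import dataclass, field
    from enum import Enum
    from typing import List


    class PieceLevel(Enum):
        # simplified stand-in for digital_eval's piece hierarchy levels
        PAGE = 1
        REGION = 2
        LINE = 3
        WORD = 4


    @dataclass
    class Piece:
        # simplified stand-in: the real Piece also carries dimensions, ids, etc.
        level: PieceLevel = PieceLevel.PAGE
        transcription: str = ''
        pieces: List['Piece'] = field(default_factory=list)


    def _get_line_pieces_from_piece(piece: Piece, lines: List[Piece] = None) -> List[Piece]:
        """Collect transcribed LINE pieces from an arbitrarily nested piece tree."""
        if lines is None:
            lines = []
        if piece.level == PieceLevel.LINE and piece.transcription:
            # a transcribed line is a leaf here: no need to descend into its words
            lines.append(piece)
            return lines
        for child in piece.pieces:
            _get_line_pieces_from_piece(child, lines)
        return lines


    if __name__ == '__main__':
        # region -> line, the shape the old comprehension handled ...
        line_a = Piece(PieceLevel.LINE, 'the quick brown fox')
        region = Piece(PieceLevel.REGION, pieces=[line_a])
        # ... plus a line nested one region deeper, which only the recursion finds
        line_b = Piece(PieceLevel.LINE, 'jumps over the lazy dog')
        nested = Piece(PieceLevel.REGION, pieces=[Piece(PieceLevel.REGION, pieces=[line_b])])
        page = Piece(PieceLevel.PAGE, pieces=[region, nested])
        print(len(_get_line_pieces_from_piece(page)))  # prints 2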