From 2ed2c4f89ab4611d24e0a9328479124f88750ca1 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Tue, 13 Aug 2024 10:41:03 +0200 Subject: [PATCH 01/97] add executable property --- ocrd_cis/ocropy/binarize.py | 10 ++++++---- ocrd_cis/ocropy/clip.py | 8 +++++--- ocrd_cis/ocropy/denoise.py | 8 +++++--- ocrd_cis/ocropy/deskew.py | 6 +++++- ocrd_cis/ocropy/dewarp.py | 10 ++++++---- ocrd_cis/ocropy/recognize.py | 10 ++++++---- ocrd_cis/ocropy/resegment.py | 8 +++++--- ocrd_cis/ocropy/segment.py | 8 +++++--- ocrd_cis/ocropy/train.py | 6 +++++- 9 files changed, 48 insertions(+), 26 deletions(-) diff --git a/ocrd_cis/ocropy/binarize.py b/ocrd_cis/ocropy/binarize.py index 872185c3..7429d14a 100644 --- a/ocrd_cis/ocropy/binarize.py +++ b/ocrd_cis/ocropy/binarize.py @@ -28,8 +28,6 @@ #sys.path.append(os.path.dirname(os.path.abspath(__file__))) -TOOL = 'ocrd-cis-ocropy-binarize' - def binarize(pil_image, method='ocropy', maxskew=2, threshold=0.5, nrm=False, zoom=1.0): LOG = getLogger('processor.OcropyBinarize') LOG.debug('binarizing %dx%d image with method=%s', pil_image.width, pil_image.height, method) @@ -71,13 +69,17 @@ class OcropyBinarize(Processor): def __init__(self, *args, **kwargs): self.ocrd_tool = get_ocrd_tool() - kwargs['ocrd_tool'] = self.ocrd_tool['tools'][TOOL] + kwargs['ocrd_tool'] = self.ocrd_tool['tools'][self.executable] kwargs['version'] = self.ocrd_tool['version'] super(OcropyBinarize, self).__init__(*args, **kwargs) if hasattr(self, 'output_file_grp'): # processing context self.setup() - + + @property + def executable(self): + return 'ocrd-cis-ocropy-binarize' + def setup(self): self.logger = getLogger('processor.OcropyBinarize') if self.parameter['grayscale'] and self.parameter['method'] != 'ocropy': diff --git a/ocrd_cis/ocropy/clip.py b/ocrd_cis/ocropy/clip.py index a305f09e..919b26b0 100644 --- a/ocrd_cis/ocropy/clip.py +++ b/ocrd_cis/ocropy/clip.py @@ -31,16 +31,18 @@ pil2array, array2pil ) -TOOL = 'ocrd-cis-ocropy-clip' - class OcropyClip(Processor): def __init__(self, *args, **kwargs): self.ocrd_tool = get_ocrd_tool() - kwargs['ocrd_tool'] = self.ocrd_tool['tools'][TOOL] + kwargs['ocrd_tool'] = self.ocrd_tool['tools'][self.executable] kwargs['version'] = self.ocrd_tool['version'] super(OcropyClip, self).__init__(*args, **kwargs) + @property + def executable(self): + return 'ocrd-cis-ocropy-clip' + def process(self): """Clip text regions / lines of the workspace at intersections with neighbours. diff --git a/ocrd_cis/ocropy/denoise.py b/ocrd_cis/ocropy/denoise.py index cbbdf8cf..ac3c4dc5 100644 --- a/ocrd_cis/ocropy/denoise.py +++ b/ocrd_cis/ocropy/denoise.py @@ -19,16 +19,18 @@ # binarize, remove_noise) -TOOL = 'ocrd-cis-ocropy-denoise' - class OcropyDenoise(Processor): def __init__(self, *args, **kwargs): self.ocrd_tool = get_ocrd_tool() - kwargs['ocrd_tool'] = self.ocrd_tool['tools'][TOOL] + kwargs['ocrd_tool'] = self.ocrd_tool['tools'][self.executable] kwargs['version'] = self.ocrd_tool['version'] super(OcropyDenoise, self).__init__(*args, **kwargs) + @property + def executable(self): + return 'ocrd-cis-ocropy-denoise' + def process(self): """Despeckle the pages / regions / lines of the workspace. 
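This refactoring repeats across every file touched by this patch: the module-level TOOL constant is dropped and the tool name is exposed as an ``executable`` property, which the constructor then uses to look up its own ocrd-tool description. A minimal sketch of the resulting class shape — illustrative only, not part of the diff; the class name and tool name below are hypothetical:

    from ocrd import Processor
    from ocrd_cis import get_ocrd_tool

    class OcropyExample(Processor):

        def __init__(self, *args, **kwargs):
            self.ocrd_tool = get_ocrd_tool()
            # the property below replaces the former module-level TOOL constant
            kwargs['ocrd_tool'] = self.ocrd_tool['tools'][self.executable]
            kwargs['version'] = self.ocrd_tool['version']
            super().__init__(*args, **kwargs)

        @property
        def executable(self):
            return 'ocrd-cis-ocropy-example'
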
diff --git a/ocrd_cis/ocropy/deskew.py b/ocrd_cis/ocropy/deskew.py index 4ed04218..fe61fce3 100644 --- a/ocrd_cis/ocropy/deskew.py +++ b/ocrd_cis/ocropy/deskew.py @@ -34,10 +34,14 @@ class OcropyDeskew(Processor): def __init__(self, *args, **kwargs): ocrd_tool = get_ocrd_tool() - kwargs['ocrd_tool'] = ocrd_tool['tools'][TOOL] + kwargs['ocrd_tool'] = ocrd_tool['tools'][self.executable] kwargs['version'] = ocrd_tool['version'] super(OcropyDeskew, self).__init__(*args, **kwargs) + @property + def executable(self): + return 'ocrd-cis-ocropy-deskew' + def process(self): """Deskew the pages or regions of the workspace. diff --git a/ocrd_cis/ocropy/dewarp.py b/ocrd_cis/ocropy/dewarp.py index 7d3251bf..1bc4a805 100644 --- a/ocrd_cis/ocropy/dewarp.py +++ b/ocrd_cis/ocropy/dewarp.py @@ -24,8 +24,6 @@ #sys.path.append(os.path.dirname(os.path.abspath(__file__))) -TOOL = 'ocrd-cis-ocropy-dewarp' - class InvalidLine(Exception): """Line image does not allow dewarping and should be ignored.""" @@ -72,13 +70,17 @@ class OcropyDewarp(Processor): def __init__(self, *args, **kwargs): self.ocrd_tool = get_ocrd_tool() - kwargs['ocrd_tool'] = self.ocrd_tool['tools'][TOOL] + kwargs['ocrd_tool'] = self.ocrd_tool['tools'][self.executable] kwargs['version'] = self.ocrd_tool['version'] super(OcropyDewarp, self).__init__(*args, **kwargs) if hasattr(self, 'output_file_grp'): # processing context self.setup() - + + @property + def executable(self): + return 'ocrd-cis-ocropy-dewarp' + def setup(self): # defaults from ocrolib.lineest: self.lnorm = lineest.CenterNormalizer( diff --git a/ocrd_cis/ocropy/recognize.py b/ocrd_cis/ocropy/recognize.py index 74d858ab..5734aa92 100644 --- a/ocrd_cis/ocropy/recognize.py +++ b/ocrd_cis/ocropy/recognize.py @@ -30,8 +30,6 @@ check_line ) -TOOL = 'ocrd-cis-ocropy-recognize' - def resize_keep_ratio(image, baseheight=48): scale = baseheight / image.height wsize = round(image.width * scale) @@ -85,13 +83,17 @@ def __init__(self, *args, **kwargs): self.ocrd_tool = get_ocrd_tool() self.pad = 16 # ocropus-rpred default self.network = None # set in process - kwargs['ocrd_tool'] = self.ocrd_tool['tools'][TOOL] + kwargs['ocrd_tool'] = self.ocrd_tool['tools'][self.executable] kwargs['version'] = self.ocrd_tool['version'] super(OcropyRecognize, self).__init__(*args, **kwargs) if hasattr(self, 'output_file_grp'): # processing context self.setup() - + + @property + def executable(self): + return 'ocrd-cis-ocropy-recognize' + def setup(self): self.logger = getLogger('processor.OcropyRecognize') # from ocropus-rpred: diff --git a/ocrd_cis/ocropy/resegment.py b/ocrd_cis/ocropy/resegment.py index a337b5e0..2b1f73c3 100644 --- a/ocrd_cis/ocropy/resegment.py +++ b/ocrd_cis/ocropy/resegment.py @@ -46,16 +46,18 @@ diff_polygons ) -TOOL = 'ocrd-cis-ocropy-resegment' - class OcropyResegment(Processor): def __init__(self, *args, **kwargs): self.ocrd_tool = get_ocrd_tool() - kwargs['ocrd_tool'] = self.ocrd_tool['tools'][TOOL] + kwargs['ocrd_tool'] = self.ocrd_tool['tools'][self.executable] kwargs['version'] = self.ocrd_tool['version'] super().__init__(*args, **kwargs) + @property + def executable(self): + return 'ocrd-cis-ocropy-resegment' + def process(self): """Resegment lines of the workspace. 
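A note on one subtlety of this pattern: every constructor reads ``self.executable`` before ``super().__init__()`` has run. That is safe because a property is a class-level descriptor, so the lookup does not depend on any instance state. A small framework-free sketch (plain Python, illustrative only):

    class Base:
        def __init__(self, ocrd_tool=None):
            self.ocrd_tool = ocrd_tool

    class Tool(Base):
        def __init__(self):
            # the property resolves via the class before Base.__init__ runs
            super().__init__(ocrd_tool={'executable': self.executable})

        @property
        def executable(self):
            return 'ocrd-cis-ocropy-resegment'

    assert Tool().ocrd_tool['executable'] == 'ocrd-cis-ocropy-resegment'
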
diff --git a/ocrd_cis/ocropy/segment.py b/ocrd_cis/ocropy/segment.py index 49cb6776..1624597e 100644 --- a/ocrd_cis/ocropy/segment.py +++ b/ocrd_cis/ocropy/segment.py @@ -58,8 +58,6 @@ lines2regions ) -TOOL = 'ocrd-cis-ocropy-segment' - def masks2polygons(bg_labels, baselines, fg_bin, name, min_area=None, simplify=None, open_holes=False, reorder=True): """Convert label masks into polygon coordinates. @@ -248,10 +246,14 @@ class OcropySegment(Processor): def __init__(self, *args, **kwargs): self.ocrd_tool = get_ocrd_tool() - kwargs['ocrd_tool'] = self.ocrd_tool['tools'][TOOL] + kwargs['ocrd_tool'] = self.ocrd_tool['tools'][self.executable] kwargs['version'] = self.ocrd_tool['version'] super(OcropySegment, self).__init__(*args, **kwargs) + @property + def executable(self): + return 'ocrd-cis-ocropy-segment' + def process(self): """Segment pages into regions+lines, tables into cells+lines, or regions into lines. diff --git a/ocrd_cis/ocropy/train.py b/ocrd_cis/ocropy/train.py index d257a61f..46e9d258 100644 --- a/ocrd_cis/ocropy/train.py +++ b/ocrd_cis/ocropy/train.py @@ -32,13 +32,17 @@ class OcropyTrain(Processor): def __init__(self, *args, **kwargs): self.oldcwd = os.getcwd() ocrd_tool = get_ocrd_tool() - kwargs['ocrd_tool'] = ocrd_tool['tools']['ocrd-cis-ocropy-train'] + kwargs['ocrd_tool'] = ocrd_tool['tools'][self.executable] kwargs['version'] = ocrd_tool['version'] super(OcropyTrain, self).__init__(*args, **kwargs) if hasattr(self, 'input_file_grp'): # processing context self.setup() + @property + def executable(self): + return 'ocrd-cis-ocropy-train' + def setup(self): self.log = getLogger('processor.OcropyTrain') #print(self.parameter) From 61e6caf06ff479d4e6a8c59d85254d5a25fa79e4 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Tue, 13 Aug 2024 10:54:46 +0200 Subject: [PATCH 02/97] add setup method if missing --- ocrd_cis/ocropy/binarize.py | 10 ++++++---- ocrd_cis/ocropy/clip.py | 5 +++++ ocrd_cis/ocropy/denoise.py | 5 +++++ ocrd_cis/ocropy/deskew.py | 5 +++++ ocrd_cis/ocropy/dewarp.py | 4 +++- ocrd_cis/ocropy/recognize.py | 4 +++- ocrd_cis/ocropy/resegment.py | 5 +++++ ocrd_cis/ocropy/segment.py | 5 +++++ ocrd_cis/ocropy/train.py | 2 +- 9 files changed, 38 insertions(+), 7 deletions(-) diff --git a/ocrd_cis/ocropy/binarize.py b/ocrd_cis/ocropy/binarize.py index 7429d14a..f42ff2bd 100644 --- a/ocrd_cis/ocropy/binarize.py +++ b/ocrd_cis/ocropy/binarize.py @@ -68,6 +68,7 @@ def binarize(pil_image, method='ocropy', maxskew=2, threshold=0.5, nrm=False, zo class OcropyBinarize(Processor): def __init__(self, *args, **kwargs): + self.logger = getLogger('processor.OcropyBinarize') self.ocrd_tool = get_ocrd_tool() kwargs['ocrd_tool'] = self.ocrd_tool['tools'][self.executable] kwargs['version'] = self.ocrd_tool['version'] @@ -81,10 +82,11 @@ def executable(self): return 'ocrd-cis-ocropy-binarize' def setup(self): - self.logger = getLogger('processor.OcropyBinarize') - if self.parameter['grayscale'] and self.parameter['method'] != 'ocropy': - self.logger.critical('requested method %s does not support grayscale normalized output', - self.parameter['method']) + assert_file_grp_cardinality(self.input_file_grp, 1) + assert_file_grp_cardinality(self.output_file_grp, 1) + method = self.parameter['method'] + if self.parameter['grayscale'] and method != 'ocropy': + self.logger.critical(f'Requested method {method} does not support grayscale normalized output') raise Exception('only method=ocropy allows grayscale=true') def process(self): diff --git a/ocrd_cis/ocropy/clip.py 
b/ocrd_cis/ocropy/clip.py index 919b26b0..d11b8eae 100644 --- a/ocrd_cis/ocropy/clip.py +++ b/ocrd_cis/ocropy/clip.py @@ -34,6 +34,7 @@ class OcropyClip(Processor): def __init__(self, *args, **kwargs): + self.logger = getLogger('processor.OcropyClip') self.ocrd_tool = get_ocrd_tool() kwargs['ocrd_tool'] = self.ocrd_tool['tools'][self.executable] kwargs['version'] = self.ocrd_tool['version'] @@ -43,6 +44,10 @@ def __init__(self, *args, **kwargs): def executable(self): return 'ocrd-cis-ocropy-clip' + def setup(self): + assert_file_grp_cardinality(self.input_file_grp, 1) + assert_file_grp_cardinality(self.output_file_grp, 1) + def process(self): """Clip text regions / lines of the workspace at intersections with neighbours. diff --git a/ocrd_cis/ocropy/denoise.py b/ocrd_cis/ocropy/denoise.py index ac3c4dc5..fc1b582e 100644 --- a/ocrd_cis/ocropy/denoise.py +++ b/ocrd_cis/ocropy/denoise.py @@ -22,6 +22,7 @@ class OcropyDenoise(Processor): def __init__(self, *args, **kwargs): + self.logger = getLogger('processor.OcropyDenoise') self.ocrd_tool = get_ocrd_tool() kwargs['ocrd_tool'] = self.ocrd_tool['tools'][self.executable] kwargs['version'] = self.ocrd_tool['version'] @@ -31,6 +32,10 @@ def __init__(self, *args, **kwargs): def executable(self): return 'ocrd-cis-ocropy-denoise' + def setup(self): + assert_file_grp_cardinality(self.input_file_grp, 1) + assert_file_grp_cardinality(self.output_file_grp, 1) + def process(self): """Despeckle the pages / regions / lines of the workspace. diff --git a/ocrd_cis/ocropy/deskew.py b/ocrd_cis/ocropy/deskew.py index fe61fce3..1ffaec62 100644 --- a/ocrd_cis/ocropy/deskew.py +++ b/ocrd_cis/ocropy/deskew.py @@ -33,6 +33,7 @@ def deskew(pil_image, maxskew=2): class OcropyDeskew(Processor): def __init__(self, *args, **kwargs): + self.logger = getLogger('processor.OcropyDeskew') ocrd_tool = get_ocrd_tool() kwargs['ocrd_tool'] = ocrd_tool['tools'][self.executable] kwargs['version'] = ocrd_tool['version'] @@ -42,6 +43,10 @@ def __init__(self, *args, **kwargs): def executable(self): return 'ocrd-cis-ocropy-deskew' + def setup(self): + assert_file_grp_cardinality(self.input_file_grp, 1) + assert_file_grp_cardinality(self.output_file_grp, 1) + def process(self): """Deskew the pages or regions of the workspace. diff --git a/ocrd_cis/ocropy/dewarp.py b/ocrd_cis/ocropy/dewarp.py index 1bc4a805..89a62e11 100644 --- a/ocrd_cis/ocropy/dewarp.py +++ b/ocrd_cis/ocropy/dewarp.py @@ -69,6 +69,7 @@ def padvert(image, range_): class OcropyDewarp(Processor): def __init__(self, *args, **kwargs): + self.logger = getLogger('processor.OcropyDewarp') self.ocrd_tool = get_ocrd_tool() kwargs['ocrd_tool'] = self.ocrd_tool['tools'][self.executable] kwargs['version'] = self.ocrd_tool['version'] @@ -82,6 +83,8 @@ def executable(self): return 'ocrd-cis-ocropy-dewarp' def setup(self): + assert_file_grp_cardinality(self.input_file_grp, 1) + assert_file_grp_cardinality(self.output_file_grp, 1) # defaults from ocrolib.lineest: self.lnorm = lineest.CenterNormalizer( params=(self.parameter['range'], @@ -91,7 +94,6 @@ def setup(self): # dependency between smoothness # and extra params) 0.3)) - self.logger = getLogger('processor.OcropyDewarp') def process(self): """Dewarp the lines of the workspace. 
diff --git a/ocrd_cis/ocropy/recognize.py b/ocrd_cis/ocropy/recognize.py index 5734aa92..fdeaed27 100644 --- a/ocrd_cis/ocropy/recognize.py +++ b/ocrd_cis/ocropy/recognize.py @@ -80,6 +80,7 @@ def recognize(image, pad, network, check=True): class OcropyRecognize(Processor): def __init__(self, *args, **kwargs): + self.logger = getLogger('processor.OcropyRecognize') self.ocrd_tool = get_ocrd_tool() self.pad = 16 # ocropus-rpred default self.network = None # set in process @@ -95,7 +96,8 @@ def executable(self): return 'ocrd-cis-ocropy-recognize' def setup(self): - self.logger = getLogger('processor.OcropyRecognize') + assert_file_grp_cardinality(self.input_file_grp, 1) + assert_file_grp_cardinality(self.output_file_grp, 1) # from ocropus-rpred: self.network = load_object(self.get_model(), verbose=1) for x in self.network.walk(): diff --git a/ocrd_cis/ocropy/resegment.py b/ocrd_cis/ocropy/resegment.py index 2b1f73c3..d9a92390 100644 --- a/ocrd_cis/ocropy/resegment.py +++ b/ocrd_cis/ocropy/resegment.py @@ -49,6 +49,7 @@ class OcropyResegment(Processor): def __init__(self, *args, **kwargs): + self.logger = getLogger('processor.OcropyResegment') self.ocrd_tool = get_ocrd_tool() kwargs['ocrd_tool'] = self.ocrd_tool['tools'][self.executable] kwargs['version'] = self.ocrd_tool['version'] @@ -58,6 +59,10 @@ def __init__(self, *args, **kwargs): def executable(self): return 'ocrd-cis-ocropy-resegment' + def setup(self): + assert_file_grp_cardinality(self.input_file_grp, 1) + assert_file_grp_cardinality(self.output_file_grp, 1) + def process(self): """Resegment lines of the workspace. diff --git a/ocrd_cis/ocropy/segment.py b/ocrd_cis/ocropy/segment.py index 1624597e..7488eefe 100644 --- a/ocrd_cis/ocropy/segment.py +++ b/ocrd_cis/ocropy/segment.py @@ -245,6 +245,7 @@ def getx(xy): class OcropySegment(Processor): def __init__(self, *args, **kwargs): + self.logger = getLogger('processor.OcropySegment') self.ocrd_tool = get_ocrd_tool() kwargs['ocrd_tool'] = self.ocrd_tool['tools'][self.executable] kwargs['version'] = self.ocrd_tool['version'] @@ -254,6 +255,10 @@ def __init__(self, *args, **kwargs): def executable(self): return 'ocrd-cis-ocropy-segment' + def setup(self): + assert_file_grp_cardinality(self.input_file_grp, 1) + assert_file_grp_cardinality(self.output_file_grp, 1) + def process(self): """Segment pages into regions+lines, tables into cells+lines, or regions into lines. 
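A plausible reason for creating the logger in ``__init__`` rather than in ``setup()``: as the constructors above show, ``setup()`` is only called when the processor is instantiated with a processing context (the ``hasattr(self, 'output_file_grp')`` / ``hasattr(self, 'input_file_grp')`` guards), whereas the logger should be available unconditionally. A reduced sketch of that control flow — illustrative only, not the actual ocrd Processor base class:

    import logging

    class ExampleProcessor:
        def __init__(self, output_file_grp=None):
            # always available, even outside a processing context
            self.logger = logging.getLogger('processor.Example')
            if output_file_grp is not None:  # stands in for the hasattr() guard
                self.output_file_grp = output_file_grp
                self.setup()

        def setup(self):
            # only reached in the processing context
            self.logger.info('setting up for %s', self.output_file_grp)
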
diff --git a/ocrd_cis/ocropy/train.py b/ocrd_cis/ocropy/train.py index 46e9d258..25317c4d 100644 --- a/ocrd_cis/ocropy/train.py +++ b/ocrd_cis/ocropy/train.py @@ -30,6 +30,7 @@ def resize_keep_ratio(image, baseheight=48): class OcropyTrain(Processor): def __init__(self, *args, **kwargs): + self.log = getLogger('processor.OcropyTrain') self.oldcwd = os.getcwd() ocrd_tool = get_ocrd_tool() kwargs['ocrd_tool'] = ocrd_tool['tools'][self.executable] @@ -44,7 +45,6 @@ def executable(self): return 'ocrd-cis-ocropy-train' def setup(self): - self.log = getLogger('processor.OcropyTrain') #print(self.parameter) if 'model' in self.parameter: model = self.parameter['model'] From a0965c2aa7d6315f001606bc1c6043a020095ef9 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Tue, 13 Aug 2024 14:02:55 +0200 Subject: [PATCH 03/97] add self.logger wherever missing --- ocrd_cis/ocropy/clip.py | 20 +++--- ocrd_cis/ocropy/denoise.py | 16 ++--- ocrd_cis/ocropy/deskew.py | 14 ++-- ocrd_cis/ocropy/resegment.py | 74 +++++++++---------- ocrd_cis/ocropy/segment.py | 136 ++++++++++++++++++----------------- 5 files changed, 129 insertions(+), 131 deletions(-) diff --git a/ocrd_cis/ocropy/clip.py b/ocrd_cis/ocropy/clip.py index d11b8eae..4c0eebea 100644 --- a/ocrd_cis/ocropy/clip.py +++ b/ocrd_cis/ocropy/clip.py @@ -83,13 +83,12 @@ def process(self): # too. However, region-level clipping _must_ be run before region-level # deskewing, because that would make segments incomensurable with their # neighbours. - LOG = getLogger('processor.OcropyClip') level = self.parameter['level-of-operation'] assert_file_grp_cardinality(self.input_file_grp, 1) assert_file_grp_cardinality(self.output_file_grp, 1) for (n, input_file) in enumerate(self.input_files): - LOG.info("INPUT FILE %i / %s", n, input_file.pageId or input_file.ID) + self.logger.info("INPUT FILE %i / %s", n, input_file.pageId or input_file.ID) file_id = make_file_id(input_file, self.output_file_grp) pcgts = page_from_file(self.workspace.download_file(input_file)) @@ -105,7 +104,7 @@ def process(self): dpi = page_image_info.resolution if page_image_info.resolutionUnit == 'cm': dpi *= 2.54 - LOG.info('Page "%s" uses %f DPI', page_id, dpi) + self.logger.info('Page "%s" uses %f DPI', page_id, dpi) zoom = 300.0/dpi else: zoom = 1 @@ -127,7 +126,7 @@ def process(self): page.get_TableRegion() + page.get_UnknownRegion()) if not num_texts: - LOG.warning('Page "%s" contains no text regions', page_id) + self.logger.warning('Page "%s" contains no text regions', page_id) background = ImageStat.Stat(page_image) # workaround for Pillow#4925 if len(background.bands) > 1: @@ -158,7 +157,7 @@ def process(self): if level == 'region': if region.get_AlternativeImage(): # FIXME: This should probably be an exception (bad workflow configuration). 
- LOG.warning('Page "%s" region "%s" already contains image data: skipping', + self.logger.warning('Page "%s" region "%s" already contains image data: skipping', page_id, region.id) continue shape = prep(shapes[i]) @@ -176,7 +175,7 @@ def process(self): # level == 'line': lines = region.get_TextLine() if not lines: - LOG.warning('Page "%s" region "%s" contains no text lines', page_id, region.id) + self.logger.warning('Page "%s" region "%s" contains no text lines', page_id, region.id) continue region_image, region_coords = self.workspace.image_from_segment( region, page_image, page_coords, feature_selector='binarized') @@ -194,7 +193,7 @@ def process(self): for j, line in enumerate(lines): if line.get_AlternativeImage(): # FIXME: This should probably be an exception (bad workflow configuration). - LOG.warning('Page "%s" region "%s" line "%s" already contains image data: skipping', + self.logger.warning('Page "%s" region "%s" line "%s" already contains image data: skipping', page_id, region.id, line.id) continue shape = prep(shapes[j]) @@ -219,13 +218,12 @@ def process(self): local_filename=file_path, mimetype=MIMETYPE_PAGE, content=to_xml(pcgts)) - LOG.info('created file ID: %s, file_grp: %s, path: %s', + self.logger.info('created file ID: %s, file_grp: %s, path: %s', file_id, self.output_file_grp, out.local_filename) def process_segment(self, segment, segment_mask, segment_polygon, neighbours, background_image, parent_image, parent_coords, parent_bin, page_id, file_id): - LOG = getLogger('processor.OcropyClip') # initialize AlternativeImage@comments classes from parent, except # for those operations that can apply on multiple hierarchy levels: features = ','.join( @@ -237,7 +235,7 @@ def process_segment(self, segment, segment_mask, segment_polygon, neighbours, segment_bbox = bbox_from_polygon(segment_polygon) for neighbour, neighbour_mask in neighbours: if not np.any(segment_mask > neighbour_mask): - LOG.info('Ignoring enclosing neighbour "%s" of segment "%s" on page "%s"', + self.logger.info('Ignoring enclosing neighbour "%s" of segment "%s" on page "%s"', neighbour.id, segment.id, page_id) continue # find connected components that (only) belong to the neighbour: @@ -247,7 +245,7 @@ def process_segment(self, segment, segment_mask, segment_polygon, neighbours, num_foreground = np.count_nonzero(segment_mask * parent_bin) if not num_intruders: continue - LOG.debug('segment "%s" vs neighbour "%s": suppressing %d of %d pixels on page "%s"', + self.logger.debug('segment "%s" vs neighbour "%s": suppressing %d of %d pixels on page "%s"', segment.id, neighbour.id, num_intruders, num_foreground, page_id) # suppress in segment_mask so these intruders can stay in the neighbours # (are not removed from both sides) diff --git a/ocrd_cis/ocropy/denoise.py b/ocrd_cis/ocropy/denoise.py index fc1b582e..d6a4f7ff 100644 --- a/ocrd_cis/ocropy/denoise.py +++ b/ocrd_cis/ocropy/denoise.py @@ -57,13 +57,12 @@ def process(self): Produce a new output file by serialising the resulting hierarchy. 
""" - LOG = getLogger('processor.OcropyDenoise') level = self.parameter['level-of-operation'] assert_file_grp_cardinality(self.input_file_grp, 1) assert_file_grp_cardinality(self.output_file_grp, 1) for (n, input_file) in enumerate(self.input_files): - LOG.info("INPUT FILE %i / %s", n, input_file.pageId or input_file.ID) + self.logger.info("INPUT FILE %i / %s", n, input_file.pageId or input_file.ID) file_id = make_file_id(input_file, self.output_file_grp) pcgts = page_from_file(self.workspace.download_file(input_file)) @@ -80,7 +79,7 @@ def process(self): dpi = page_image_info.resolution if page_image_info.resolutionUnit == 'cm': dpi *= 2.54 - LOG.info('Page "%s" uses %f DPI', page_id, dpi) + self.logger.info('Page "%s" uses %f DPI', page_id, dpi) zoom = 300.0/dpi else: zoom = 1 @@ -91,7 +90,7 @@ def process(self): else: regions = page.get_AllRegions(classes=['Text'], order='reading-order') if not regions: - LOG.warning('Page "%s" contains no text regions', page_id) + self.logger.warning('Page "%s" contains no text regions', page_id) for region in regions: region_image, region_xywh = self.workspace.image_from_segment( region, page_image, page_xywh, @@ -102,7 +101,7 @@ def process(self): continue lines = region.get_TextLine() if not lines: - LOG.warning('Page "%s" region "%s" contains no text lines', page_id, region.id) + self.logger.warning('Page "%s" region "%s" contains no text lines', page_id, region.id) for line in lines: line_image, line_xywh = self.workspace.image_from_segment( line, region_image, region_xywh, @@ -121,15 +120,14 @@ def process(self): local_filename=file_path, mimetype=MIMETYPE_PAGE, content=to_xml(pcgts)) - LOG.info('created file ID: %s, file_grp: %s, path: %s', + self.logger.info('created file ID: %s, file_grp: %s, path: %s', file_id, self.output_file_grp, out.local_filename) def process_segment(self, segment, segment_image, segment_xywh, zoom, page_id, file_id): - LOG = getLogger('processor.OcropyDenoise') if not segment_image.width or not segment_image.height: - LOG.warning("Skipping '%s' with zero size", file_id) + self.logger.warning("Skipping '%s' with zero size", file_id) return - LOG.info("About to despeckle '%s'", file_id) + self.logger.info("About to despeckle '%s'", file_id) bin_image = remove_noise(segment_image, maxsize=self.parameter['noise_maxsize']/zoom*300/72) # in pt # update METS (add the image file): diff --git a/ocrd_cis/ocropy/deskew.py b/ocrd_cis/ocropy/deskew.py index 1ffaec62..63bb6b97 100644 --- a/ocrd_cis/ocropy/deskew.py +++ b/ocrd_cis/ocropy/deskew.py @@ -65,13 +65,12 @@ def process(self): Produce a new output file by serialising the resulting hierarchy. 
""" - LOG = getLogger('processor.OcropyDeskew') level = self.parameter['level-of-operation'] assert_file_grp_cardinality(self.input_file_grp, 1) assert_file_grp_cardinality(self.output_file_grp, 1) for (n, input_file) in enumerate(self.input_files): - LOG.info("INPUT FILE %i / %s", n, input_file.pageId or input_file.ID) + self.logger.info("INPUT FILE %i / %s", n, input_file.pageId or input_file.ID) file_id = make_file_id(input_file, self.output_file_grp) pcgts = page_from_file(self.workspace.download_file(input_file)) @@ -95,7 +94,7 @@ def process(self): else: # region regions = page.get_AllRegions(classes=['Text'], order='reading-order') if not regions: - LOG.warning('Page "%s" contains no text regions', page_id) + self.logger.warning('Page "%s" contains no text regions', page_id) for region in regions: # process region: region_image, region_coords = self.workspace.image_from_segment( @@ -118,23 +117,22 @@ def process(self): local_filename=file_path, mimetype=MIMETYPE_PAGE, content=to_xml(pcgts)) - LOG.info('created file ID: %s, file_grp: %s, path: %s', + self.logger.info('created file ID: %s, file_grp: %s, path: %s', file_id, self.output_file_grp, out.local_filename) def _process_segment(self, segment, segment_image, segment_coords, segment_id, page_id, file_id): - LOG = getLogger('processor.OcropyDeskew') if not segment_image.width or not segment_image.height: - LOG.warning("Skipping %s with zero size", segment_id) + self.logger.warning("Skipping %s with zero size", segment_id) return angle0 = segment_coords['angle'] # deskewing (w.r.t. top image) already applied to segment_image - LOG.info("About to deskew %s", segment_id) + self.logger.info("About to deskew %s", segment_id) angle = deskew(segment_image, maxskew=self.parameter['maxskew']) # additional angle to be applied # segment angle: PAGE orientation is defined clockwise, # whereas PIL/ndimage rotation is in mathematical direction: orientation = -(angle + angle0) orientation = 180 - (180 - orientation) % 360 # map to [-179.999,180] segment.set_orientation(orientation) # also removes all deskewed AlternativeImages - LOG.info("Found angle for %s: %.1f", segment_id, angle) + self.logger.info("Found angle for %s: %.1f", segment_id, angle) # delegate reflection, rotation and re-cropping to core: if isinstance(segment, PageType): segment_image, segment_coords, _ = self.workspace.image_from_page( diff --git a/ocrd_cis/ocropy/resegment.py b/ocrd_cis/ocropy/resegment.py index d9a92390..2261cf3e 100644 --- a/ocrd_cis/ocropy/resegment.py +++ b/ocrd_cis/ocropy/resegment.py @@ -105,7 +105,6 @@ def process(self): Produce a new output file by serialising the resulting hierarchy. """ - LOG = getLogger('processor.OcropyResegment') # This makes best sense for bad/coarse line segmentation, like current GT # or as postprocessing for bbox-only steps like Tesseract. 
# Most notably, it can convert rectangles to polygons (polygonalization), @@ -120,7 +119,7 @@ def process(self): assert_file_grp_cardinality(self.output_file_grp, 1) for n, input_file in enumerate(self.input_files): - LOG.info("INPUT FILE %i / %s", n, input_file.pageId or input_file.ID) + self.logger.info("INPUT FILE %i / %s", n, input_file.pageId or input_file.ID) file_id = make_file_id(input_file, self.output_file_grp) pcgts = page_from_file(self.workspace.download_file(input_file)) @@ -136,7 +135,7 @@ def process(self): dpi = page_image_info.resolution if page_image_info.resolutionUnit == 'cm': dpi *= 2.54 - LOG.info('Page "%s" uses %f DPI', page_id, dpi) + self.logger.info('Page "%s" uses %f DPI', page_id, dpi) zoom = 300.0/dpi else: zoom = 1 @@ -156,14 +155,14 @@ def process(self): page.get_CustomRegion()) regions = page.get_AllRegions(classes=['Text']) if not regions: - LOG.warning('Page "%s" contains no text regions', page_id) + self.logger.warning('Page "%s" contains no text regions', page_id) elif level == 'page': lines = [line for region in regions for line in region.get_TextLine()] if lines: self._process_segment(page, page_image, page_coords, page_id, zoom, lines, ignore) else: - LOG.warning('Page "%s" contains no text regions with lines', page_id) + self.logger.warning('Page "%s" contains no text regions with lines', page_id) else: for region in regions: lines = region.get_TextLine() @@ -172,7 +171,7 @@ def process(self): region, page_image, page_coords, feature_selector='binarized') self._process_segment(region, region_image, region_coords, page_id, zoom, lines, ignore) else: - LOG.warning('Page "%s" region "%s" contains no text lines', page_id, region.id) + self.logger.warning('Page "%s" region "%s" contains no text lines', page_id, region.id) # update METS (add the PAGE file): file_path = os.path.join(self.output_file_grp, file_id + '.xml') @@ -184,11 +183,10 @@ def process(self): local_filename=file_path, mimetype=MIMETYPE_PAGE, content=to_xml(pcgts)) - LOG.info('created file ID: %s, file_grp: %s, path: %s', + self.logger.info('created file ID: %s, file_grp: %s, path: %s', file_id, self.output_file_grp, out.local_filename) def _process_segment(self, parent, parent_image, parent_coords, page_id, zoom, lines, ignore): - LOG = getLogger('processor.OcropyResegment') threshold = self.parameter['min_fraction'] method = self.parameter['method'] maxdist = self.parameter['spread']/zoom*300/72 # in pt @@ -206,7 +204,7 @@ def _process_segment(self, parent, parent_image, parent_coords, page_id, zoom, l fullpage = False report = check_region(parent_bin, zoom) if report: - LOG.warning('Invalid %s "%s": %s', tag, + self.logger.warning('Invalid %s "%s": %s', tag, page_id if fullpage else parent.id, report) return # get existing line labels: @@ -234,7 +232,7 @@ def _process_segment(self, parent, parent_image, parent_coords, page_id, zoom, l line_labels[i, line_y, line_x] = True # only text region(s) may contain new text lines for i, region in enumerate(set(line.parent_object_ for line in lines)): - LOG.debug('unmasking area of text region "%s" for "%s"', + self.logger.debug('unmasking area of text region "%s" for "%s"', region.id, page_id if fullpage else parent.id) region_polygon = coordinates_of_segment(region, parent_image, parent_coords) region_polygon = make_valid(Polygon(region_polygon)) @@ -244,14 +242,14 @@ def _process_segment(self, parent, parent_image, parent_coords, page_id, zoom, l parent_bin.shape)] = False # mask/ignore overlapping neighbours for i, segment in 
enumerate(ignore): - LOG.debug('masking area of %s "%s" for "%s"', type(segment).__name__[:-4], + self.logger.debug('masking area of %s "%s" for "%s"', type(segment).__name__[:-4], segment.id, page_id if fullpage else parent.id) segment_polygon = coordinates_of_segment(segment, parent_image, parent_coords) ignore_bin[draw.polygon(segment_polygon[:, 1], segment_polygon[:, 0], parent_bin.shape)] = True if method != 'lineest': - LOG.debug('calculating connected component and distance transforms for "%s"', parent.id) + self.logger.debug('calculating connected component and distance transforms for "%s"', parent.id) bin = parent_bin & ~ ignore_bin components, _ = morph.label(bin) # estimate glyph scale (roughly) @@ -260,7 +258,7 @@ def _process_segment(self, parent, parent_image, parent_coords, page_id, zoom, l counts = np.sqrt(3 * counts) scale = int(np.median(counts[(5/zoom < counts) & (counts < 100/zoom)])) components *= (counts > 15/zoom)[components] - LOG.debug("estimated scale: %d", scale) + self.logger.debug("estimated scale: %d", scale) else: scale = 43 if method == 'ccomps': @@ -278,7 +276,7 @@ def _process_segment(self, parent, parent_image, parent_coords, page_id, zoom, l new_labels = np.zeros_like(parent_bin, np.uint8) for i, line in enumerate(lines): if line.Baseline is None: - LOG.warning("Skipping '%s' without baseline", line.id) + self.logger.warning("Skipping '%s' without baseline", line.id) new_labels[line_labels[i]] = i + 1 continue line_baseline = baseline_of_segment(line, parent_coords) @@ -289,22 +287,23 @@ def _process_segment(self, parent, parent_image, parent_coords, page_id, zoom, l parent_bin.shape) new_labels[line_y, line_x] = i + 1 spread_dist(lines, line_labels, new_labels, parent_bin, components, parent_coords, - maxdist=maxdist or scale/2, loc=parent.id, threshold=threshold) + maxdist=maxdist or scale/2, loc=parent.id, threshold=threshold, logger=self.logger) return try: + # TODO: 'scale' passed as a param may not be always defined (mehmedGIT) new_line_labels, new_baselines, _, _, _, scale = compute_segmentation( parent_bin, seps=ignore_bin, zoom=zoom, spread_dist=maxdist or scale/2, fullpage=fullpage, maxseps=0, maxcolseps=len(ignore), maximages=0) except Exception as err: - LOG.error('Cannot line-segment %s "%s": %s', + self.logger.error('Cannot line-segment %s "%s": %s', tag, page_id if fullpage else parent.id, err) return - LOG.info("Found %d new line labels for %d existing lines on %s '%s'", + self.logger.info("Found %d new line labels for %d existing lines on %s '%s'", new_line_labels.max(), len(lines), tag, parent.id) # polygonalize and prepare comparison new_line_polygons, new_line_labels = masks2polygons( new_line_labels, new_baselines, parent_bin, '%s "%s"' % (tag, parent.id), - min_area=640/zoom/zoom) + min_area=640/zoom/zoom, logger=self.logger) DSAVE('line_labels', [np.argmax(np.insert(line_labels, 0, 0, axis=0), axis=0), parent_bin]) DSAVE('new_line_labels', [new_line_labels, parent_bin]) new_line_polygons, new_baselines = list(zip(*[(Polygon(poly), LineString(base)) @@ -387,41 +386,41 @@ def _process_segment(self, parent, parent_image, parent_coords, page_id, zoom, l for j, line in enumerate(lines): new_lines = np.nonzero(assignments == j)[0] if not np.prod(new_lines.shape): - LOG.debug("no lines for '%s' match or fit", line.id) + self.logger.debug("no lines for '%s' match or fit", line.id) continue covers = np.sum(covers_bg[new_lines,j]) if covers < threshold / 3: - LOG.debug("new lines for '%s' only cover %.1f%% bg", + self.logger.debug("new lines 
for '%s' only cover %.1f%% bg", line.id, covers * 100) continue covers = np.sum(covers_fg[new_lines,j]) if covers < threshold: - LOG.debug("new lines for '%s' only cover %.1f%% fg", + self.logger.debug("new lines for '%s' only cover %.1f%% fg", line.id, covers * 100) continue looses = (assignments < 0) & (covers_bg[:,j] > 0.1) if looses.any(): covers = np.sum(covers_bg[np.nonzero(looses)[0],j]) - LOG.debug("new lines for '%s' would loose %d non-matching segments totalling %.1f%% bg", + self.logger.debug("new lines for '%s' would loose %d non-matching segments totalling %.1f%% bg", line.id, np.count_nonzero(looses), covers * 100) continue line_count = np.count_nonzero(line_labels[j] & parent_bin) new_count = covers * line_count - LOG.debug('Black pixels before/after resegment of line "%s": %d/%d', + self.logger.debug('Black pixels before/after resegment of line "%s": %d/%d', line.id, line_count, new_count) # combine all assigned new lines to single outline polygon if len(new_lines) > 1: - LOG.debug("joining %d new line polygons for '%s'", len(new_lines), line.id) + self.logger.debug("joining %d new line polygons for '%s'", len(new_lines), line.id) new_polygon = join_polygons([new_line_polygons[i] #intersections[(i, j)] for i in new_lines], loc=line.id, scale=scale) new_baseline = join_baselines([new_polygon.intersection(new_baselines[i]) - for i in new_lines], loc=line.id) + for i in new_lines], loc=line.id, logger=self.logger) # convert back to absolute (page) coordinates: line_polygon = coordinates_for_segment(new_polygon.exterior.coords[:-1], parent_image, parent_coords) line_polygon = polygon_for_parent(line_polygon, line.parent_object_) if line_polygon is None: - LOG.warning("Ignoring extant new polygon for line '%s'", line.id) + self.logger.warning("Ignoring extant new polygon for line '%s'", line.id) return # annotate result: line.get_Coords().set_points(points_from_polygon(line_polygon)) @@ -436,7 +435,7 @@ def _process_segment(self, parent, parent_image, parent_coords, page_id, zoom, l if j == otherj: continue otherline = lines[otherj] - LOG.debug("subtracting new '%s' from overlapping '%s'", line.id, otherline.id) + self.logger.debug("subtracting new '%s' from overlapping '%s'", line.id, otherline.id) other_polygon = diff_polygons(line_polygons[otherj].context, new_polygon) if other_polygon.is_empty: continue @@ -445,14 +444,15 @@ def _process_segment(self, parent, parent_image, parent_coords, page_id, zoom, l parent_image, parent_coords) other_polygon = polygon_for_parent(other_polygon, otherline.parent_object_) if other_polygon is None: - LOG.warning("Ignoring extant new polygon for line '%s'", otherline.id) + self.logger.warning("Ignoring extant new polygon for line '%s'", otherline.id) continue otherline.get_Coords().set_points(points_from_polygon(other_polygon)) def spread_dist(lines, old_labels, new_labels, binarized, components, coords, - maxdist=43, loc='', threshold=0.9): + maxdist=43, loc='', threshold=0.9, logger = None): """redefine line coordinates by contourizing spread of connected components propagated from new labels""" - LOG = getLogger('processor.OcropyResegment') + if not logger: + raise ValueError(f"Logger has not been passed by the caller") DSAVE('seeds', [new_labels, (components>0)]) # allocate to connected components consistently # (ignoring smallest components like punctuation) @@ -477,29 +477,29 @@ def spread_dist(lines, old_labels, new_labels, binarized, components, coords, continue count = np.count_nonzero(old_label) if not count: - 
LOG.warning("skipping zero-area line '%s'", line.id) + logger.warning("skipping zero-area line '%s'", line.id) continue covers = np.count_nonzero(new_label) / count if covers < threshold / 3: - LOG.debug("new line for '%s' only covers %.1f%% bg", + logger.debug("new line for '%s' only covers %.1f%% bg", line.id, covers * 100) continue count = np.count_nonzero(old_label * binarized) if not count: - LOG.warning("skipping binary-empty line '%s'", line.id) + logger.warning("skipping binary-empty line '%s'", line.id) continue covers = np.count_nonzero(new_label * binarized) / count if covers < threshold: - LOG.debug("new line for '%s' only covers %.1f%% fg", + logger.debug("new line for '%s' only covers %.1f%% fg", line.id, covers * 100) continue - LOG.debug('Black pixels before/after resegment of line "%s": %d/%d', + logger.debug('Black pixels before/after resegment of line "%s": %d/%d', line.id, count, covers * count) contours = [contour[:,::-1] # get x,y order again for contour, area in morph.find_contours(new_label)] #LOG.debug("joining %d subsegments for %s", len(contours), line.id) if len(contours) == 0: - LOG.warning("no contours for %s - keeping", line.id) + logger.warning("no contours for %s - keeping", line.id) continue else: # get alpha shape @@ -511,7 +511,7 @@ def spread_dist(lines, old_labels, new_labels, binarized, components, coords, polygon = coordinates_for_segment(poly, None, coords) polygon = polygon_for_parent(polygon, line.parent_object_) if polygon is None: - LOG.warning("Ignoring extant line for %s", line.id) + logger.warning("Ignoring extant line for %s", line.id) continue line.get_Coords().set_points(points_from_polygon(polygon)) diff --git a/ocrd_cis/ocropy/segment.py b/ocrd_cis/ocropy/segment.py index 7488eefe..35f309b6 100644 --- a/ocrd_cis/ocropy/segment.py +++ b/ocrd_cis/ocropy/segment.py @@ -58,7 +58,7 @@ lines2regions ) -def masks2polygons(bg_labels, baselines, fg_bin, name, min_area=None, simplify=None, open_holes=False, reorder=True): +def masks2polygons(bg_labels, baselines, fg_bin, name, min_area=None, simplify=None, open_holes=False, reorder=True, logger=None): """Convert label masks into polygon coordinates. Given a Numpy array of background labels ``bg_labels``, @@ -75,7 +75,8 @@ def masks2polygons(bg_labels, baselines, fg_bin, name, min_area=None, simplify=N - these polygons as a list of label, polygon, baseline tuples, and - a Numpy array of new background labels for that list. 
""" - LOG = getLogger('processor.OcropySegment') + if not logger: + raise ValueError(f"Logger has not been passed by the caller") # find sharp baseline if baselines is not None: def getx(xy): @@ -92,7 +93,7 @@ def getx(xy): bg_mask = np.array(bg_labels == label, bool) if not np.count_nonzero(bg_mask * fg_bin): # ignore if missing foreground - LOG.debug('skipping label %d in %s due to empty fg', + logger.debug('skipping label %d in %s due to empty fg', label, name) continue # simplify to convex hull @@ -101,7 +102,7 @@ def getx(xy): conflicts = np.setdiff1d(hull * simplify, bg_mask * simplify) if conflicts.any(): - LOG.debug('Cannot simplify %d: convex hull would create additional intersections %s', + logger.debug('Cannot simplify %d: convex hull would create additional intersections %s', label, str(conflicts)) else: bg_mask = hull @@ -130,7 +131,7 @@ def getx(xy): if len(hole) < 3: idx_hole = hier[0, idx_hole, 0] continue - LOG.debug("label %d contour %d [%d pts] has hole %d [%d pts]", + logger.debug("label %d contour %d [%d pts] has hole %d [%d pts]", label, idx, len(contour), idx_hole, len(hole)) #plot_poly(hole, 'blue') # cut child from outside... @@ -172,7 +173,7 @@ def getx(xy): diff2 = (interpol[interpol_idx+1:interpol_idx+2] - cispoint2) // 5 cispoint1 = cispoint1 + diff1 cispoint2 = cispoint2 + diff2 - LOG.debug("stitching at interpolation pos %d hole pos %d", interpol_idx, hole_idx) + logger.debug("stitching at interpolation pos %d hole pos %d", interpol_idx, hole_idx) # now stitch together outer (up to cision), inner (re-arranged around cision), outer (rest) # (this works, because inner contours have inverse direction) contour = np.concatenate([contour[:contour_idx], cispoint1, @@ -181,7 +182,7 @@ def getx(xy): #plot_poly(contour, 'green') idx_hole = hier[0, idx_hole, 0] #plot_poly(contour, 'red') - LOG.debug("adding label %d contour %d [%d pts]", label, idx, len(contour)) + logger.debug("adding label %d contour %d [%d pts]", label, idx, len(contour)) contours.append(contour) idx = hier[0, idx, 0] else: @@ -207,7 +208,7 @@ def getx(xy): contour = contours[i] area = areas[i] if min_area and area < min_area and area / total_area < 0.1: - LOG.warning('Label %d contour %d is too small (%d/%d) in %s', + logger.warning('Label %d contour %d is too small (%d/%d) in %s', label, i, area, total_area, name) continue # simplify shape: @@ -217,22 +218,22 @@ def getx(xy): # simplify and validate: polygon = Polygon(polygon) if not polygon.is_valid: - #LOG.debug(polygon.wkt) - LOG.debug(explain_validity(polygon)) + #logger.debug(polygon.wkt) + logger.debug(explain_validity(polygon)) polygon = make_valid(polygon) if not polygon.is_valid: #LOG.debug(polygon.wkt) - LOG.warning(explain_validity(polygon)) + logger.warning(explain_validity(polygon)) poly = polygon.exterior.coords[:-1] # keep open if len(poly) < 4: - LOG.warning('Label %d contour %d for %s has less than 4 points', label, i, name) + logger.warning('Label %d contour %d for %s has less than 4 points', label, i, name) continue # get baseline segments intersecting with this line mask # and concatenate them from left to right if baselines is not None: base = join_baselines([baseline.intersection(polygon) for baseline in baselines - if baseline.intersects(polygon)], name) + if baseline.intersects(polygon)], name, logger) if base is not None: base = base.coords else: @@ -324,7 +325,6 @@ def process(self): Produce a new output file by serialising the resulting hierarchy. 
""" - LOG = getLogger('processor.OcropySegment') # FIXME: allow passing a-priori info on reading order / textline order # (and then pass on as ``bt`` and ``rl``; however, there may be a mixture # of different scripts; also, vertical writing needs internal rotation @@ -339,7 +339,7 @@ def process(self): assert_file_grp_cardinality(self.output_file_grp, 1) for (n, input_file) in enumerate(self.input_files): - LOG.info("INPUT FILE %i / %s", n, input_file.pageId or input_file.ID) + self.logger.info("INPUT FILE %i / %s", n, input_file.pageId or input_file.ID) file_id = make_file_id(input_file, self.output_file_grp) pcgts = page_from_file(self.workspace.download_file(input_file)) @@ -356,7 +356,7 @@ def process(self): dpi = page_image_info.resolution if page_image_info.resolutionUnit == 'cm': dpi *= 2.54 - LOG.info('Page "%s" uses %f DPI', page_id, dpi) + self.logger.info('Page "%s" uses %f DPI', page_id, dpi) zoom = 300.0/dpi else: zoom = 1 @@ -393,7 +393,7 @@ def process(self): if regions: # page is already region-segmented if overwrite_regions: - LOG.info('removing existing TextRegions in page "%s"', page_id) + self.logger.info('removing existing TextRegions in page "%s"', page_id) # we could remove all other region types as well, # but this is more flexible (for workflows with # specialized separator/image/table detectors): @@ -401,7 +401,7 @@ def process(self): page.set_ReadingOrder(None) ro = None else: - LOG.warning('keeping existing TextRegions in page "%s"', page_id) + self.logger.warning('keeping existing TextRegions in page "%s"', page_id) ignore.extend(regions) # create reading order if necessary if not ro or overwrite_order: @@ -425,20 +425,20 @@ def process(self): ignore.extend(page.get_TextRegion()) regions = list(page.get_TableRegion()) if not regions: - LOG.warning('Page "%s" contains no table regions', page_id) + self.logger.warning('Page "%s" contains no table regions', page_id) for region in regions: subregions = region.get_TextRegion() if subregions: # table is already cell-segmented if overwrite_regions: - LOG.info('removing existing TextRegions in table "%s"', region.id) + self.logger.info('removing existing TextRegions in table "%s"', region.id) region.set_TextRegion([]) roelem = reading_order.get(region.id) # replace by empty group with same index and ref # (which can then take the cells as subregions) - reading_order[region.id] = page_subgroup_in_reading_order(roelem) + reading_order[region.id] = page_subgroup_in_reading_order(roelem, self.logger) else: - LOG.warning('skipping table "%s" with existing TextRegions', region.id) + self.logger.warning('skipping table "%s" with existing TextRegions', region.id) continue # TODO: also allow grayscale_normalized (try/except?) 
region_image, region_coords = self.workspace.image_from_segment( @@ -449,24 +449,24 @@ def process(self): # create reading order group if necessary roelem = reading_order.get(region.id) if not roelem: - LOG.warning("Page '%s' table region '%s' is not referenced in reading order (%s)", + self.logger.warning("Page '%s' table region '%s' is not referenced in reading order (%s)", page_id, region.id, "no target to add cells to") elif overwrite_order: # replace by empty ordered group with same (index and) ref # (which can then take the cells as subregions) - roelem = page_subgroup_in_reading_order(roelem) + roelem = page_subgroup_in_reading_order(roelem, self.logger) reading_order[region.id] = roelem elif isinstance(roelem, (OrderedGroupType, OrderedGroupIndexedType)): - LOG.warning("Page '%s' table region '%s' already has an ordered group (%s)", + self.logger.warning("Page '%s' table region '%s' already has an ordered group (%s)", page_id, region.id, "cells will be appended") elif isinstance(roelem, (UnorderedGroupType, UnorderedGroupIndexedType)): - LOG.warning("Page '%s' table region '%s' already has an unordered group (%s)", + self.logger.warning("Page '%s' table region '%s' already has an unordered group (%s)", page_id, region.id, "cells will not be appended") roelem = None else: # replace regionRef(Indexed) by group with same index and ref # (which can then take the cells as subregions) - roelem = page_subgroup_in_reading_order(roelem) + roelem = page_subgroup_in_reading_order(roelem, self.logger) reading_order[region.id] = roelem # go get TextRegions with TextLines (and SeparatorRegions) self._process_element(region, subignore, region_image, region_coords, @@ -488,14 +488,14 @@ def process(self): region.add_TextRegion(subregion) regions.append(subregion) if not regions: - LOG.warning('Page "%s" contains no text regions', page_id) + self.logger.warning('Page "%s" contains no text regions', page_id) for region in regions: if region.get_TextLine(): if overwrite_lines: - LOG.info('removing existing TextLines in page "%s" region "%s"', page_id, region.id) + self.logger.info('removing existing TextLines in page "%s" region "%s"', page_id, region.id) region.set_TextLine([]) else: - LOG.warning('keeping existing TextLines in page "%s" region "%s"', page_id, region.id) + self.logger.warning('keeping existing TextLines in page "%s" region "%s"', page_id, region.id) ignore.extend(region.get_TextLine()) # TODO: also allow grayscale_normalized (try/except?) region_image, region_coords = self.workspace.image_from_segment( @@ -519,7 +519,7 @@ def process(self): local_filename=file_path, mimetype=MIMETYPE_PAGE, content=to_xml(pcgts)) - LOG.info('created file ID: %s, file_grp: %s, path: %s', + self.logger.info('created file ID: %s, file_grp: %s, path: %s', file_id, self.output_file_grp, out.local_filename) def _process_element(self, element, ignore, image, coords, element_id, file_id, page_id, zoom=1.0, rogroup=None): @@ -540,16 +540,15 @@ def _process_element(self, element, ignore, image, coords, element_id, file_id, in full page/table mode, then combine all separators among them with the newly detected separators to guide region segmentation. 
""" - LOG = getLogger('processor.OcropySegment') if not image.width or not image.height: - LOG.warning("Skipping '%s' with zero size", element_id) + self.logger.warning("Skipping '%s' with zero size", element_id) return element_array = pil2array(image) element_bin = np.array(element_array <= midrange(element_array), bool) sep_bin = np.zeros_like(element_bin, bool) ignore_labels = np.zeros_like(element_bin, int) for i, segment in enumerate(ignore): - LOG.debug('masking foreground of %s "%s" for "%s"', + self.logger.debug('masking foreground of %s "%s" for "%s"', type(segment).__name__[:-4], segment.id, element_id) # mark these segments (e.g. separator regions, tables, images) # for workflows where they have been detected already; @@ -583,7 +582,7 @@ def _process_element(self, element, ignore, image, coords, element_id, file_id, element_name = 'region' fullpage = False report = check_region(element_bin, zoom) - LOG.info('computing line segmentation for %s "%s"', element_name, element_id) + self.logger.info('computing line segmentation for %s "%s"', element_name, element_id) # TODO: we should downscale if DPI is large enough to save time try: if report: @@ -601,14 +600,14 @@ def _process_element(self, element, ignore, image, coords, element_id, file_id, csminheight=self.parameter['csminheight']) except Exception as err: if isinstance(element, TextRegionType): - LOG.error('Cannot line-segment region "%s": %s', element_id, err) + self.logger.error('Cannot line-segment region "%s": %s', element_id, err) # as a fallback, add a single text line comprising the whole region: element.add_TextLine(TextLineType(id=element_id + "_line", Coords=element.get_Coords())) else: - LOG.error('Cannot line-segment %s "%s": %s', element_name, element_id, err) + self.logger.error('Cannot line-segment %s "%s": %s', element_name, element_id, err) return - LOG.info('Found %d text lines for %s "%s"', + self.logger.info('Found %d text lines for %s "%s"', len(np.unique(line_labels)) - 1, element_name, element_id) # post-process line labels @@ -631,11 +630,11 @@ def _process_element(self, element, ignore, image, coords, element_id, file_id, gap_height=self.parameter['gap_height'], gap_width=self.parameter['gap_width'], scale=scale, zoom=zoom) - LOG.info('Found %d text regions for %s "%s"', + self.logger.info('Found %d text regions for %s "%s"', len(np.unique(region_labels)) - 1, element_name, element_id) except Exception as err: - LOG.error('Cannot region-segment %s "%s": %s', + self.logger.error('Cannot region-segment %s "%s": %s', element_name, element_id, err) region_labels = np.where(line_labels > len(ignore), 1 + len(ignore), line_labels) @@ -669,7 +668,7 @@ def _process_element(self, element, ignore, image, coords, element_id, file_id, region = ignore[region_line_labels0[0] - 1] if rogroup and region.parent_object_ is element and not isinstance(region, SeparatorRegionType): index = page_add_to_reading_order(rogroup, region.id, index) - LOG.debug('Region label %d is for ignored region "%s"', + self.logger.debug('Region label %d is for ignored region "%s"', region_label, region.id) continue # normal case: new lines inside new regions @@ -685,11 +684,13 @@ def _process_element(self, element, ignore, image, coords, element_id, file_id, regions, _ = masks2polygons(region_mask * region_label, None, element_bin, '%s "%s"' % (element_name, element_id), min_area=6000/zoom/zoom, - simplify=ignore_labels * ~(sep_bin)) + simplify=ignore_labels * ~(sep_bin), + logger=self.logger) # find contours for lines (can be non-contiguous) 
lines, _ = masks2polygons(region_line_labels, baselines, element_bin, 'region "%s"' % element_id, - min_area=640/zoom/zoom) + min_area=640/zoom/zoom, + logger=self.logger) # create new lines in new regions (allocating by intersection) line_polys = [Polygon(polygon) for _, polygon, _ in lines] for _, region_polygon, _ in regions: @@ -698,12 +699,12 @@ def _process_element(self, element, ignore, image, coords, element_id, file_id, region_polygon = coordinates_for_segment(region_polygon, image, coords) region_polygon = polygon_for_parent(region_polygon, element) if region_polygon is None: - LOG.warning('Ignoring extant region contour for region label %d', region_label) + self.logger.warning('Ignoring extant region contour for region label %d', region_label) continue # annotate result: region_no += 1 region_id = element_id + "_region%04d" % region_no - LOG.debug('Region label %d becomes ID "%s"', region_label, region_id) + self.logger.debug('Region label %d becomes ID "%s"', region_label, region_id) region = TextRegionType( id=region_id, Coords=CoordsType( points=points_from_polygon(region_polygon))) @@ -717,13 +718,13 @@ def _process_element(self, element, ignore, image, coords, element_id, file_id, line_polygon = coordinates_for_segment(line_polygon, image, coords) line_polygon = polygon_for_parent(line_polygon, region) if line_polygon is None: - LOG.warning('Ignoring extant line contour for region label %d line label %d', + self.logger.warning('Ignoring extant line contour for region label %d line label %d', region_label, line_label) continue # annotate result: line_no += 1 line_id = region_id + "_line%04d" % line_no - LOG.debug('Line label %d becomes ID "%s"', line_label, line_id) + self.logger.debug('Line label %d becomes ID "%s"', line_label, line_id) line = TextLineType(id=line_id, Coords=CoordsType(points=points_from_polygon(line_polygon))) if line_baseline: @@ -733,22 +734,22 @@ def _process_element(self, element, ignore, image, coords, element_id, file_id, # if the region has received text lines, keep it if region.get_TextLine(): element.add_TextRegion(region) - LOG.info('Added region "%s" with %d lines for %s "%s"', + self.logger.info('Added region "%s" with %d lines for %s "%s"', region_id, line_no, element_name, element_id) if rogroup: index = page_add_to_reading_order(rogroup, region.id, index) # add additional image/non-text regions from compute_segmentation # (e.g. drop-capitals or images) ... 
- LOG.info('Found %d large image regions for %s "%s"', images.max(), element_name, element_id) + self.logger.info('Found %d large image regions for %s "%s"', images.max(), element_name, element_id) # find contours around region labels (can be non-contiguous): image_polygons, _ = masks2polygons(images, None, element_bin, - '%s "%s"' % (element_name, element_id)) + '%s "%s"' % (element_name, element_id), self.logger) for image_label, polygon, _ in image_polygons: # convert back to absolute (page) coordinates: region_polygon = coordinates_for_segment(polygon, image, coords) region_polygon = polygon_for_parent(region_polygon, element) if region_polygon is None: - LOG.warning('Ignoring extant region contour for image label %d', image_label) + self.logger.warning('Ignoring extant region contour for image label %d', image_label) continue region_no += 1 # annotate result: @@ -757,17 +758,17 @@ def _process_element(self, element, ignore, image, coords, element_id, file_id, id=region_id, Coords=CoordsType( points=points_from_polygon(region_polygon)))) # split detected separator labels into separator regions: - LOG.info('Found %d separators for %s "%s"', seplines.max(), element_name, element_id) + self.logger.info('Found %d separators for %s "%s"', seplines.max(), element_name, element_id) # find contours around region labels (can be non-contiguous): sep_polygons, _ = masks2polygons(seplines, None, element_bin, '%s "%s"' % (element_name, element_id), - open_holes=True, reorder=False) + open_holes=True, reorder=False, logger=self.logger) for sep_label, polygon, _ in sep_polygons: # convert back to absolute (page) coordinates: region_polygon = coordinates_for_segment(polygon, image, coords) region_polygon = polygon_for_parent(region_polygon, element) if region_polygon is None: - LOG.warning('Ignoring extant region contour for separator %d', sep_label) + self.logger.warning('Ignoring extant region contour for separator %d', sep_label) continue # annotate result: region_no += 1 @@ -795,14 +796,14 @@ def _process_element(self, element, ignore, image, coords, element_id, file_id, # find contours around labels (can be non-contiguous): line_polygons, _ = masks2polygons(line_labels, baselines, element_bin, 'region "%s"' % element_id, - min_area=640/zoom/zoom) + min_area=640/zoom/zoom, logger=self.logger) line_no = 0 for line_label, polygon, baseline in line_polygons: # convert back to absolute (page) coordinates: line_polygon = coordinates_for_segment(polygon, image, coords) line_polygon = polygon_for_parent(line_polygon, element) if line_polygon is None: - LOG.warning('Ignoring extant line contour for line label %d', + self.logger.warning('Ignoring extant line contour for line label %d', line_label) continue # annotate result: @@ -937,8 +938,9 @@ def join_polygons(polygons, loc='', scale=20): jointp = make_valid(jointp) return jointp -def join_baselines(baselines, loc=''): - LOG = getLogger('processor.OcropyResegment') +def join_baselines(baselines, loc='', logger = None): + if not logger: + raise ValueError(f"Logger has not been passed by the caller") lines = [] for baseline in baselines: if (baseline.is_empty or @@ -955,9 +957,9 @@ def join_baselines(baselines, loc=''): elif geom.geom_type == 'MultiLineString': lines.extend(geom) else: - LOG.warning("ignoring baseline subtype %s in %s", geom.geom_type, loc) + logger.warning("ignoring baseline subtype %s in %s", geom.geom_type, loc) else: - LOG.warning("ignoring baseline type %s in %s", baseline.geom_type, loc) + logger.warning("ignoring baseline type %s in 
%s", baseline.geom_type, loc) nlines = len(lines) if nlines == 0: return None @@ -1019,7 +1021,7 @@ def join_baselines(baselines, loc=''): else: chains.append([prevl, nextl]) if len(chains) > 1: - LOG.warning("baseline merge impossible (no spanning tree) in %s", loc) + logger.warning("baseline merge impossible (no spanning tree) in %s", loc) return None assert len(chains) == 1, chains assert len(chains[0]) == nlines, chains[0] @@ -1031,7 +1033,7 @@ def join_baselines(baselines, loc=''): coords.extend(line.normalize().coords) result = LineString(coords) if result.is_empty: - LOG.warning("baseline merge is empty in %s", loc) + logger.warning("baseline merge is empty in %s", loc) return None assert result.geom_type == 'LineString', result.wkt result = set_precision(result, 1.0) @@ -1080,7 +1082,7 @@ def page_add_to_reading_order(rogroup, region_id, index=None): index += 1 return index -def page_subgroup_in_reading_order(roelem): +def page_subgroup_in_reading_order(roelem, logger = None): """Replace given RO element by an equivalent OrderedGroup. Given a ReadingOrder element ``roelem`` (of any type), @@ -1094,12 +1096,14 @@ def page_subgroup_in_reading_order(roelem): Return the new group object. """ - LOG = getLogger('processor.OcropySegment') + if not logger: + raise ValueError(f"Logger has not been passed by the caller") + if not roelem: - LOG.error('Cannot subgroup from empty ReadingOrder element') + logger.error('Cannot subgroup from empty ReadingOrder element') return roelem if not roelem.parent_object_: - LOG.error('Cannot subgroup from orphan ReadingOrder element') + logger.error('Cannot subgroup from orphan ReadingOrder element') return roelem if isinstance(roelem, (OrderedGroupType,OrderedGroupIndexedType)) and not ( roelem.get_OrderedGroupIndexed() or From dbccae58d9213d5df4e072502a7eae8484902ef6 Mon Sep 17 00:00:00 2001 From: kba Date: Tue, 13 Aug 2024 14:57:16 +0200 Subject: [PATCH 04/97] require core >= 3.0.0a1 --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 6df9445c..38f09abd 100644 --- a/setup.py +++ b/setup.py @@ -38,7 +38,7 @@ packages=find_packages(), include_package_data=True, install_requires=[ - 'ocrd>=2.47', + 'ocrd>=3.0.0a1', 'click', 'scipy', 'numpy>=1.17.0', From 8557a26dc75cf858f9e6819296389f71ab972cf3 Mon Sep 17 00:00:00 2001 From: kba Date: Tue, 13 Aug 2024 15:26:32 +0200 Subject: [PATCH 05/97] port part of binarize to core v3 --- ocrd_cis/ocropy/binarize.py | 157 ++++++++++++++++-------------------- 1 file changed, 70 insertions(+), 87 deletions(-) diff --git a/ocrd_cis/ocropy/binarize.py b/ocrd_cis/ocropy/binarize.py index f42ff2bd..c3b4cded 100644 --- a/ocrd_cis/ocropy/binarize.py +++ b/ocrd_cis/ocropy/binarize.py @@ -1,9 +1,13 @@ from __future__ import absolute_import +import logging import os.path +import PIL import cv2 import numpy as np from PIL import Image +from os.path import join +from ocrd_models import OcrdExif #import kraken.binarization @@ -15,11 +19,10 @@ ) from ocrd_modelfactory import page_from_file from ocrd_models.ocrd_page import ( - to_xml, AlternativeImageType + OcrdPage, to_xml, AlternativeImageType ) from ocrd import Processor -from .. import get_ocrd_tool from . 
import common from .common import ( pil2array, array2pil, @@ -64,18 +67,20 @@ def binarize(pil_image, method='ocropy', maxskew=2, threshold=0.5, nrm=False, zo raise Exception('unknown binarization method %s' % method) return Image.fromarray(th), 0 +def determine_zoom(dpi: float, page_image_info: OcrdExif) -> float: + if dpi > 0: + zoom = 300.0/dpi + elif page_image_info.resolution != 1: + dpi = page_image_info.resolution + if page_image_info.resolutionUnit == 'cm': + dpi *= 2.54 + zoom = 300.0/dpi + else: + zoom = 1 + return zoom class OcropyBinarize(Processor): - - def __init__(self, *args, **kwargs): - self.logger = getLogger('processor.OcropyBinarize') - self.ocrd_tool = get_ocrd_tool() - kwargs['ocrd_tool'] = self.ocrd_tool['tools'][self.executable] - kwargs['version'] = self.ocrd_tool['version'] - super(OcropyBinarize, self).__init__(*args, **kwargs) - if hasattr(self, 'output_file_grp'): - # processing context - self.setup() + logger : logging.Logger @property def executable(self): @@ -84,16 +89,16 @@ def executable(self): def setup(self): assert_file_grp_cardinality(self.input_file_grp, 1) assert_file_grp_cardinality(self.output_file_grp, 1) + self.logger = getLogger('processor.OcropyBinarize') method = self.parameter['method'] if self.parameter['grayscale'] and method != 'ocropy': self.logger.critical(f'Requested method {method} does not support grayscale normalized output') raise Exception('only method=ocropy allows grayscale=true') - def process(self): + def process_page_pcgts(self, *input_pcgts, output_file_id: str = None, page_id: str = None) -> OcrdPage: """Binarize (and optionally deskew/despeckle) the pages/regions/lines of the workspace. - Open and deserialise PAGE input files and their respective images, - then iterate over the element hierarchy down to the requested + THEN Iterate over the PAGE-XML element hierarchy down to the requested ``level-of-operation``. Next, for each file, crop each segment image according to the layout @@ -109,80 +114,61 @@ def process(self): Reference each new image in the AlternativeImage of the element. - Produce a new output file by serialising the resulting hierarchy. + Return a PAGE-XML with AlternativeImage and the arguments for ``workspace.save_image_file``. 
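        A rough sketch of the intended return shape (assuming the tuple layout
        produced by ``process_page`` below; the names are taken from that
        method and are illustrative only):

            return [pcgts,
                    (bin_image, file_id, bin_image_path)]

        i.e. the updated PcGts first, followed by one entry per derived image,
        each carrying the arguments for ``workspace.save_image_file``.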
""" level = self.parameter['level-of-operation'] - assert_file_grp_cardinality(self.input_file_grp, 1) - assert_file_grp_cardinality(self.output_file_grp, 1) + assert self.workspace + self.logger.debug(f'Level of operation: "{level}"') - for (n, input_file) in enumerate(self.input_files): - self.logger.info("INPUT FILE %i / %s", n, input_file.pageId or input_file.ID) - file_id = make_file_id(input_file, self.output_file_grp) + pcgts = input_pcgts[0] + page = pcgts.get_Page() + assert page - pcgts = page_from_file(self.workspace.download_file(input_file)) - self.add_metadata(pcgts) - page_id = pcgts.pcGtsId or input_file.pageId or input_file.ID # (PageType has no id) - page = pcgts.get_Page() - - page_image, page_xywh, page_image_info = self.workspace.image_from_page( - page, page_id, feature_filter='binarized') - if self.parameter['dpi'] > 0: - zoom = 300.0/self.parameter['dpi'] - elif page_image_info.resolution != 1: - dpi = page_image_info.resolution - if page_image_info.resolutionUnit == 'cm': - dpi *= 2.54 - self.logger.info('Page "%s" uses %f DPI', page_id, dpi) - zoom = 300.0/dpi - else: - zoom = 1 - - if level == 'page': - self.process_page(page, page_image, page_xywh, zoom, - input_file.pageId, file_id) - else: - if level == 'table': - regions = page.get_TableRegion() - else: # region - regions = page.get_AllRegions(classes=['Text'], order='reading-order') - if not regions: - self.logger.warning('Page "%s" contains no text regions', page_id) - for region in regions: - region_image, region_xywh = self.workspace.image_from_segment( - region, page_image, page_xywh, feature_filter='binarized') - if level == 'region': - self.process_region(region, region_image, region_xywh, zoom, - input_file.pageId, file_id + '_' + region.id) - continue - lines = region.get_TextLine() - if not lines: - self.logger.warning('Page "%s" region "%s" contains no text lines', - page_id, region.id) - for line in lines: - line_image, line_xywh = self.workspace.image_from_segment( - line, region_image, region_xywh, feature_filter='binarized') - self.process_line(line, line_image, line_xywh, zoom, - input_file.pageId, region.id, - file_id + '_' + region.id + '_' + line.id) + page_image, page_xywh, page_image_info = self.workspace.image_from_page(page, page_id, feature_filter='binarized') + zoom = determine_zoom(self.parameter['dpi'], page_image_info) + self.logger.info('Page "%s" uses %f DPI', page_id, self.parameter['dpi']) + + ret = [pcgts] + if level == 'page': + try: + ret.append(self.process_page(page, page_image, page_xywh, zoom, page_id, output_file_id)) + except ValueError as e: + self.logger.exception(e) + else: + # TODO + raise NotImplementedError + if level == 'table': + regions = page.get_TableRegion() + else: # region + regions = page.get_AllRegions(classes=['Text'], order='reading-order') + if not regions: + self.logger.warning('Page "%s" contains no text regions', page_id) + for region in regions: + region_image, region_xywh = self.workspace.image_from_segment( + region, page_image, page_xywh, feature_filter='binarized') + if level == 'region': + self.process_region(region, region_image, region_xywh, zoom, + input_file.pageId, file_id + '_' + region.id) + continue + lines = region.get_TextLine() + if not lines: + self.logger.warning('Page "%s" region "%s" contains no text lines', + page_id, region.id) + for line in lines: + line_image, line_xywh = self.workspace.image_from_segment( + line, region_image, region_xywh, feature_filter='binarized') + self.process_line(line, line_image, line_xywh, zoom, 
+ input_file.pageId, region.id, + file_id + '_' + region.id + '_' + line.id) - # update METS (add the PAGE file): - file_path = os.path.join(self.output_file_grp, file_id + '.xml') - pcgts.set_pcGtsId(file_id) - out = self.workspace.add_file( - ID=file_id, - file_grp=self.output_file_grp, - pageId=input_file.pageId, - local_filename=file_path, - mimetype=MIMETYPE_PAGE, - content=to_xml(pcgts)) - self.logger.info('created file ID: %s, file_grp: %s, path: %s', - file_id, self.output_file_grp, out.local_filename) + return ret - def process_page(self, page, page_image, page_xywh, zoom, page_id, file_id): + def process_page(self, page, page_image, page_xywh, zoom, page_id, file_id) -> tuple[Image.Image, str, str]: if not page_image.width or not page_image.height: - self.logger.warning("Skipping page '%s' with zero size", page_id) - return + raise ValueError("Skipping page '%s' with zero size", page_id) self.logger.info("About to binarize page '%s'", page_id) + assert self.output_file_grp + features = page_xywh['features'] if 'angle' in page_xywh and page_xywh['angle']: # orientation has already been annotated (by previous deskewing), @@ -216,13 +202,10 @@ def process_page(self, page, page_image, page_xywh, zoom, page_id, file_id): else: file_id += '.IMG-BIN' features += ',binarized' - file_path = self.workspace.save_image_file( - bin_image, file_id, self.output_file_grp, - page_id=page_id) + bin_image_path = join(self.output_file_grp, f'{file_id}.png') # update PAGE (reference the image file): - page.add_AlternativeImage(AlternativeImageType( - filename=file_path, - comments=features)) + page.add_AlternativeImage(AlternativeImageType(filename=bin_image_path, comments=features)) + return (bin_image, file_id, bin_image_path) def process_region(self, region, region_image, region_xywh, zoom, page_id, file_id): if not region_image.width or not region_image.height: From 278b706246e24ec0fc0b5030aff6d16673bad817 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Tue, 13 Aug 2024 16:08:10 +0200 Subject: [PATCH 06/97] move: determine_zoom to common.py --- ocrd_cis/ocropy/binarize.py | 18 ++---------------- ocrd_cis/ocropy/clip.py | 15 +++------------ ocrd_cis/ocropy/common.py | 14 +++++++++++++- ocrd_cis/ocropy/denoise.py | 15 ++++----------- ocrd_cis/ocropy/deskew.py | 6 +----- ocrd_cis/ocropy/dewarp.py | 18 ++++-------------- ocrd_cis/ocropy/resegment.py | 14 ++++---------- ocrd_cis/ocropy/segment.py | 13 +++---------- 8 files changed, 34 insertions(+), 79 deletions(-) diff --git a/ocrd_cis/ocropy/binarize.py b/ocrd_cis/ocropy/binarize.py index c3b4cded..b5e2bc7e 100644 --- a/ocrd_cis/ocropy/binarize.py +++ b/ocrd_cis/ocropy/binarize.py @@ -7,7 +7,6 @@ import numpy as np from PIL import Image from os.path import join -from ocrd_models import OcrdExif #import kraken.binarization @@ -25,9 +24,8 @@ from . 
import common from .common import ( - pil2array, array2pil, # binarize, - remove_noise) + array2pil, determine_zoom, pil2array, remove_noise) #sys.path.append(os.path.dirname(os.path.abspath(__file__))) @@ -67,18 +65,6 @@ def binarize(pil_image, method='ocropy', maxskew=2, threshold=0.5, nrm=False, zo raise Exception('unknown binarization method %s' % method) return Image.fromarray(th), 0 -def determine_zoom(dpi: float, page_image_info: OcrdExif) -> float: - if dpi > 0: - zoom = 300.0/dpi - elif page_image_info.resolution != 1: - dpi = page_image_info.resolution - if page_image_info.resolutionUnit == 'cm': - dpi *= 2.54 - zoom = 300.0/dpi - else: - zoom = 1 - return zoom - class OcropyBinarize(Processor): logger : logging.Logger @@ -126,7 +112,7 @@ def process_page_pcgts(self, *input_pcgts, output_file_id: str = None, page_id: page_image, page_xywh, page_image_info = self.workspace.image_from_page(page, page_id, feature_filter='binarized') zoom = determine_zoom(self.parameter['dpi'], page_image_info) - self.logger.info('Page "%s" uses %f DPI', page_id, self.parameter['dpi']) + self.logger.info(f"Page '{page_id}' uses {self.parameter['dpi']} DPI") ret = [pcgts] if level == 'page': diff --git a/ocrd_cis/ocropy/clip.py b/ocrd_cis/ocropy/clip.py index 4c0eebea..3b854897 100644 --- a/ocrd_cis/ocropy/clip.py +++ b/ocrd_cis/ocropy/clip.py @@ -28,8 +28,7 @@ from .ocrolib import midrange, morph from .common import ( # binarize, - pil2array, array2pil -) + array2pil, determine_zoom, pil2array) class OcropyClip(Processor): @@ -98,16 +97,8 @@ def process(self): page_image, page_coords, page_image_info = self.workspace.image_from_page( page, page_id, feature_selector='binarized') - if self.parameter['dpi'] > 0: - zoom = 300.0/self.parameter['dpi'] - elif page_image_info.resolution != 1: - dpi = page_image_info.resolution - if page_image_info.resolutionUnit == 'cm': - dpi *= 2.54 - self.logger.info('Page "%s" uses %f DPI', page_id, dpi) - zoom = 300.0/dpi - else: - zoom = 1 + zoom = determine_zoom(self.parameter['dpi'], page_image_info) + self.logger.info(f"Page '{page_id}' uses {self.parameter['dpi']} DPI") # FIXME: what about text regions inside table regions? regions = list(page.get_TextRegion()) diff --git a/ocrd_cis/ocropy/common.py b/ocrd_cis/ocropy/common.py index 3cb9e4c4..1804c29d 100644 --- a/ocrd_cis/ocropy/common.py +++ b/ocrd_cis/ocropy/common.py @@ -10,7 +10,7 @@ from skimage.morphology import medial_axis import networkx as nx from PIL import Image - +from ocrd_models import OcrdExif from . import ocrolib from .ocrolib import morph, psegutils, sl # for decorators (type-checks etc): @@ -2102,3 +2102,15 @@ def find_topological(): # rlabels[region_hull] = region # DSAVE('rlabels_closed', rlabels) return rlabels + +def determine_zoom(dpi: float, page_image_info: OcrdExif) -> float: + if dpi > 0: + zoom = 300.0/dpi + elif page_image_info.resolution != 1: + dpi = page_image_info.resolution + if page_image_info.resolutionUnit == 'cm': + dpi *= 2.54 + zoom = 300.0/dpi + else: + zoom = 1 + return zoom diff --git a/ocrd_cis/ocropy/denoise.py b/ocrd_cis/ocropy/denoise.py index d6a4f7ff..d8554a3e 100644 --- a/ocrd_cis/ocropy/denoise.py +++ b/ocrd_cis/ocropy/denoise.py @@ -17,7 +17,7 @@ from .. 
import get_ocrd_tool from .common import ( # binarize, - remove_noise) + determine_zoom, remove_noise) class OcropyDenoise(Processor): @@ -73,16 +73,9 @@ def process(self): page_image, page_xywh, page_image_info = self.workspace.image_from_page( page, page_id, feature_selector='binarized' if level == 'page' else '') - if self.parameter['dpi'] > 0: - zoom = 300.0/self.parameter['dpi'] - elif page_image_info.resolution != 1: - dpi = page_image_info.resolution - if page_image_info.resolutionUnit == 'cm': - dpi *= 2.54 - self.logger.info('Page "%s" uses %f DPI', page_id, dpi) - zoom = 300.0/dpi - else: - zoom = 1 + + zoom = determine_zoom(self.parameter['dpi'], page_image_info) + self.logger.info(f"Page '{page_id}' uses {self.parameter['dpi']} DPI") if level == 'page': self.process_segment(page, page_image, page_xywh, zoom, diff --git a/ocrd_cis/ocropy/deskew.py b/ocrd_cis/ocropy/deskew.py index 63bb6b97..055ab27d 100644 --- a/ocrd_cis/ocropy/deskew.py +++ b/ocrd_cis/ocropy/deskew.py @@ -17,14 +17,10 @@ from .. import get_ocrd_tool from . import common -from .common import ( - pil2array -) +from .common import pil2array #sys.path.append(os.path.dirname(os.path.abspath(__file__))) -TOOL = 'ocrd-cis-ocropy-deskew' - def deskew(pil_image, maxskew=2): array = pil2array(pil_image) _, angle = common.binarize(array, maxskew=maxskew) diff --git a/ocrd_cis/ocropy/dewarp.py b/ocrd_cis/ocropy/dewarp.py index 89a62e11..4c9a1bdb 100644 --- a/ocrd_cis/ocropy/dewarp.py +++ b/ocrd_cis/ocropy/dewarp.py @@ -17,10 +17,7 @@ from .. import get_ocrd_tool from .ocrolib import lineest -from .common import ( - pil2array, array2pil, - check_line, -) +from .common import array2pil, check_line, determine_zoom, pil2array #sys.path.append(os.path.dirname(os.path.abspath(__file__))) @@ -128,16 +125,9 @@ def process(self): page_image, page_xywh, page_image_info = self.workspace.image_from_page( page, page_id) - if self.parameter['dpi'] > 0: - zoom = 300.0/self.parameter['dpi'] - elif page_image_info.resolution != 1: - dpi = page_image_info.resolution - if page_image_info.resolutionUnit == 'cm': - dpi *= 2.54 - self.logger.info('Page "%s" uses %f DPI', page_id, dpi) - zoom = 300.0/dpi - else: - zoom = 1 + + zoom = determine_zoom(self.parameter['dpi'], page_image_info) + self.logger.info(f"Page '{page_id}' uses {self.parameter['dpi']} DPI") regions = page.get_AllRegions(classes=['Text'], order='reading-order') if not regions: diff --git a/ocrd_cis/ocropy/resegment.py b/ocrd_cis/ocropy/resegment.py index 2261cf3e..e4681b23 100644 --- a/ocrd_cis/ocropy/resegment.py +++ b/ocrd_cis/ocropy/resegment.py @@ -30,6 +30,7 @@ pil2array, odd, DSAVE, + determine_zoom, # binarize, check_page, check_region, @@ -129,16 +130,9 @@ def process(self): page_image, page_coords, page_image_info = self.workspace.image_from_page( page, page_id, feature_selector='binarized') - if self.parameter['dpi'] > 0: - zoom = 300.0/self.parameter['dpi'] - elif page_image_info.resolution != 1: - dpi = page_image_info.resolution - if page_image_info.resolutionUnit == 'cm': - dpi *= 2.54 - self.logger.info('Page "%s" uses %f DPI', page_id, dpi) - zoom = 300.0/dpi - else: - zoom = 1 + + zoom = determine_zoom(self.parameter['dpi'], page_image_info) + self.logger.info(f"Page '{page_id}' uses {self.parameter['dpi']} DPI") ignore = (page.get_ImageRegion() + page.get_LineDrawingRegion() + diff --git a/ocrd_cis/ocropy/segment.py b/ocrd_cis/ocropy/segment.py index 35f309b6..e13c3d71 100644 --- a/ocrd_cis/ocropy/segment.py +++ b/ocrd_cis/ocropy/segment.py @@ -53,6 +53,7 @@ 
pil2array, array2pil, check_page, check_region, + determine_zoom, hmerge_line_seeds, compute_segmentation, lines2regions @@ -350,16 +351,8 @@ def process(self): # TODO: also allow grayscale_normalized (try/except?) page_image, page_coords, page_image_info = self.workspace.image_from_page( page, page_id, feature_selector='binarized') - if self.parameter['dpi'] > 0: - zoom = 300.0/self.parameter['dpi'] - elif page_image_info.resolution != 1: - dpi = page_image_info.resolution - if page_image_info.resolutionUnit == 'cm': - dpi *= 2.54 - self.logger.info('Page "%s" uses %f DPI', page_id, dpi) - zoom = 300.0/dpi - else: - zoom = 1 + zoom = determine_zoom(self.parameter['dpi'], page_image_info) + self.logger.info(f"Page '{page_id}' uses {self.parameter['dpi']} DPI") # aggregate existing regions so their foreground can be ignored ignore = (page.get_ImageRegion() + From 6beec175ed89e321cae93917dbe02bd2809cd83b Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Tue, 13 Aug 2024 16:14:31 +0200 Subject: [PATCH 07/97] move: logger init to setup() --- ocrd_cis/ocropy/binarize.py | 6 +++--- ocrd_cis/ocropy/clip.py | 4 +++- ocrd_cis/ocropy/denoise.py | 5 +++-- ocrd_cis/ocropy/deskew.py | 5 +++-- ocrd_cis/ocropy/dewarp.py | 5 +++-- ocrd_cis/ocropy/recognize.py | 5 +++-- ocrd_cis/ocropy/resegment.py | 5 +++-- ocrd_cis/ocropy/segment.py | 6 ++++-- ocrd_cis/ocropy/train.py | 5 +++-- 9 files changed, 28 insertions(+), 18 deletions(-) diff --git a/ocrd_cis/ocropy/binarize.py b/ocrd_cis/ocropy/binarize.py index b5e2bc7e..cc34690e 100644 --- a/ocrd_cis/ocropy/binarize.py +++ b/ocrd_cis/ocropy/binarize.py @@ -1,5 +1,5 @@ from __future__ import absolute_import -import logging +from logging import Logger import os.path import PIL @@ -66,16 +66,16 @@ def binarize(pil_image, method='ocropy', maxskew=2, threshold=0.5, nrm=False, zo return Image.fromarray(th), 0 class OcropyBinarize(Processor): - logger : logging.Logger + logger: Logger @property def executable(self): return 'ocrd-cis-ocropy-binarize' def setup(self): + self.logger = getLogger('processor.OcropyBinarize') assert_file_grp_cardinality(self.input_file_grp, 1) assert_file_grp_cardinality(self.output_file_grp, 1) - self.logger = getLogger('processor.OcropyBinarize') method = self.parameter['method'] if self.parameter['grayscale'] and method != 'ocropy': self.logger.critical(f'Requested method {method} does not support grayscale normalized output') diff --git a/ocrd_cis/ocropy/clip.py b/ocrd_cis/ocropy/clip.py index 3b854897..1b7fb28b 100644 --- a/ocrd_cis/ocropy/clip.py +++ b/ocrd_cis/ocropy/clip.py @@ -1,4 +1,5 @@ from __future__ import absolute_import +from logging import Logger import os.path import numpy as np @@ -31,9 +32,9 @@ array2pil, determine_zoom, pil2array) class OcropyClip(Processor): + logger: Logger def __init__(self, *args, **kwargs): - self.logger = getLogger('processor.OcropyClip') self.ocrd_tool = get_ocrd_tool() kwargs['ocrd_tool'] = self.ocrd_tool['tools'][self.executable] kwargs['version'] = self.ocrd_tool['version'] @@ -44,6 +45,7 @@ def executable(self): return 'ocrd-cis-ocropy-clip' def setup(self): + self.logger = getLogger('processor.OcropyClip') assert_file_grp_cardinality(self.input_file_grp, 1) assert_file_grp_cardinality(self.output_file_grp, 1) diff --git a/ocrd_cis/ocropy/denoise.py b/ocrd_cis/ocropy/denoise.py index d8554a3e..34750a53 100644 --- a/ocrd_cis/ocropy/denoise.py +++ b/ocrd_cis/ocropy/denoise.py @@ -1,5 +1,5 @@ from __future__ import absolute_import - +from logging import Logger import os.path from ocrd_utils 
import ( @@ -20,9 +20,9 @@ determine_zoom, remove_noise) class OcropyDenoise(Processor): + logger: Logger def __init__(self, *args, **kwargs): - self.logger = getLogger('processor.OcropyDenoise') self.ocrd_tool = get_ocrd_tool() kwargs['ocrd_tool'] = self.ocrd_tool['tools'][self.executable] kwargs['version'] = self.ocrd_tool['version'] @@ -33,6 +33,7 @@ def executable(self): return 'ocrd-cis-ocropy-denoise' def setup(self): + self.logger = getLogger('processor.OcropyDenoise') assert_file_grp_cardinality(self.input_file_grp, 1) assert_file_grp_cardinality(self.output_file_grp, 1) diff --git a/ocrd_cis/ocropy/deskew.py b/ocrd_cis/ocropy/deskew.py index 055ab27d..2eb898ca 100644 --- a/ocrd_cis/ocropy/deskew.py +++ b/ocrd_cis/ocropy/deskew.py @@ -1,5 +1,5 @@ from __future__ import absolute_import - +from logging import Logger import os.path from ocrd_utils import ( @@ -27,9 +27,9 @@ def deskew(pil_image, maxskew=2): return angle class OcropyDeskew(Processor): + logger: Logger def __init__(self, *args, **kwargs): - self.logger = getLogger('processor.OcropyDeskew') ocrd_tool = get_ocrd_tool() kwargs['ocrd_tool'] = ocrd_tool['tools'][self.executable] kwargs['version'] = ocrd_tool['version'] @@ -40,6 +40,7 @@ def executable(self): return 'ocrd-cis-ocropy-deskew' def setup(self): + self.logger = getLogger('processor.OcropyDeskew') assert_file_grp_cardinality(self.input_file_grp, 1) assert_file_grp_cardinality(self.output_file_grp, 1) diff --git a/ocrd_cis/ocropy/dewarp.py b/ocrd_cis/ocropy/dewarp.py index 4c9a1bdb..cad280c6 100644 --- a/ocrd_cis/ocropy/dewarp.py +++ b/ocrd_cis/ocropy/dewarp.py @@ -1,5 +1,5 @@ from __future__ import absolute_import - +from logging import Logger import os.path import numpy as np @@ -64,9 +64,9 @@ def padvert(image, range_): return array2pil(line) class OcropyDewarp(Processor): + logger: Logger def __init__(self, *args, **kwargs): - self.logger = getLogger('processor.OcropyDewarp') self.ocrd_tool = get_ocrd_tool() kwargs['ocrd_tool'] = self.ocrd_tool['tools'][self.executable] kwargs['version'] = self.ocrd_tool['version'] @@ -80,6 +80,7 @@ def executable(self): return 'ocrd-cis-ocropy-dewarp' def setup(self): + self.logger = getLogger('processor.OcropyDewarp') assert_file_grp_cardinality(self.input_file_grp, 1) assert_file_grp_cardinality(self.output_file_grp, 1) # defaults from ocrolib.lineest: diff --git a/ocrd_cis/ocropy/recognize.py b/ocrd_cis/ocropy/recognize.py index fdeaed27..8e147fea 100644 --- a/ocrd_cis/ocropy/recognize.py +++ b/ocrd_cis/ocropy/recognize.py @@ -1,5 +1,5 @@ from __future__ import absolute_import - +from logging import Logger import sys import os.path import numpy as np @@ -78,9 +78,9 @@ def recognize(image, pad, network, check=True): class OcropyRecognize(Processor): + logger: Logger def __init__(self, *args, **kwargs): - self.logger = getLogger('processor.OcropyRecognize') self.ocrd_tool = get_ocrd_tool() self.pad = 16 # ocropus-rpred default self.network = None # set in process @@ -96,6 +96,7 @@ def executable(self): return 'ocrd-cis-ocropy-recognize' def setup(self): + self.logger = getLogger('processor.OcropyRecognize') assert_file_grp_cardinality(self.input_file_grp, 1) assert_file_grp_cardinality(self.output_file_grp, 1) # from ocropus-rpred: diff --git a/ocrd_cis/ocropy/resegment.py b/ocrd_cis/ocropy/resegment.py index e4681b23..1e920b0f 100644 --- a/ocrd_cis/ocropy/resegment.py +++ b/ocrd_cis/ocropy/resegment.py @@ -1,5 +1,5 @@ from __future__ import absolute_import - +from logging import Logger import os.path import numpy as np from 
skimage import draw, segmentation @@ -48,9 +48,9 @@ ) class OcropyResegment(Processor): + logger: Logger def __init__(self, *args, **kwargs): - self.logger = getLogger('processor.OcropyResegment') self.ocrd_tool = get_ocrd_tool() kwargs['ocrd_tool'] = self.ocrd_tool['tools'][self.executable] kwargs['version'] = self.ocrd_tool['version'] @@ -61,6 +61,7 @@ def executable(self): return 'ocrd-cis-ocropy-resegment' def setup(self): + self.logger = getLogger('processor.OcropyResegment') assert_file_grp_cardinality(self.input_file_grp, 1) assert_file_grp_cardinality(self.output_file_grp, 1) diff --git a/ocrd_cis/ocropy/segment.py b/ocrd_cis/ocropy/segment.py index e13c3d71..3b89bda6 100644 --- a/ocrd_cis/ocropy/segment.py +++ b/ocrd_cis/ocropy/segment.py @@ -1,5 +1,5 @@ from __future__ import absolute_import - +from logging import Logger import os.path import itertools import numpy as np @@ -245,9 +245,10 @@ def getx(xy): class OcropySegment(Processor): + logger: Logger def __init__(self, *args, **kwargs): - self.logger = getLogger('processor.OcropySegment') + self.ocrd_tool = get_ocrd_tool() kwargs['ocrd_tool'] = self.ocrd_tool['tools'][self.executable] kwargs['version'] = self.ocrd_tool['version'] @@ -258,6 +259,7 @@ def executable(self): return 'ocrd-cis-ocropy-segment' def setup(self): + self.logger = getLogger('processor.OcropySegment') assert_file_grp_cardinality(self.input_file_grp, 1) assert_file_grp_cardinality(self.output_file_grp, 1) diff --git a/ocrd_cis/ocropy/train.py b/ocrd_cis/ocropy/train.py index 25317c4d..61a918c7 100644 --- a/ocrd_cis/ocropy/train.py +++ b/ocrd_cis/ocropy/train.py @@ -1,5 +1,5 @@ from __future__ import absolute_import - +from logging import Logger import sys import os import tempfile @@ -28,9 +28,9 @@ def resize_keep_ratio(image, baseheight=48): class OcropyTrain(Processor): + log: Logger def __init__(self, *args, **kwargs): - self.log = getLogger('processor.OcropyTrain') self.oldcwd = os.getcwd() ocrd_tool = get_ocrd_tool() kwargs['ocrd_tool'] = ocrd_tool['tools'][self.executable] @@ -45,6 +45,7 @@ def executable(self): return 'ocrd-cis-ocropy-train' def setup(self): + self.log = getLogger('processor.OcropyTrain') #print(self.parameter) if 'model' in self.parameter: model = self.parameter['model'] From 1b2fea3ed5b7c9d1a02f2dcabe0770aa3eb87da6 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Tue, 13 Aug 2024 16:16:55 +0200 Subject: [PATCH 08/97] refactor: log -> logger --- ocrd_cis/ocropy/train.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/ocrd_cis/ocropy/train.py b/ocrd_cis/ocropy/train.py index 61a918c7..9278da92 100644 --- a/ocrd_cis/ocropy/train.py +++ b/ocrd_cis/ocropy/train.py @@ -28,7 +28,7 @@ def resize_keep_ratio(image, baseheight=48): class OcropyTrain(Processor): - log: Logger + logger: Logger def __init__(self, *args, **kwargs): self.oldcwd = os.getcwd() @@ -45,7 +45,7 @@ def executable(self): return 'ocrd-cis-ocropy-train' def setup(self): - self.log = getLogger('processor.OcropyTrain') + self.logger = getLogger('processor.OcropyTrain') #print(self.parameter) if 'model' in self.parameter: model = self.parameter['model'] @@ -54,9 +54,9 @@ def setup(self): except SystemExit: ocropydir = os.path.dirname(os.path.abspath(__file__)) modelpath = os.path.join(ocropydir, 'models', model) - self.log.info("Failed to resolve model '%s' path, trying '%s'", model, modelpath) + self.logger.info("Failed to resolve model '%s' path, trying '%s'", model, modelpath) if not os.path.isfile(modelpath): - 
self.log.error("Could not find model '%s'. Try 'ocrd resmgr download ocrd-cis-ocropy-recognize %s'", + self.logger.error("Could not find model '%s'. Try 'ocrd resmgr download ocrd-cis-ocropy-recognize %s'", model, model) sys.exit(1) outputpath = os.path.join(self.oldcwd, 'output', model) @@ -78,18 +78,18 @@ def process(self): """ filelist = [] filepath = tempfile.mkdtemp(prefix='ocrd-cis-ocropy-train-') - #self.log.info("Using model %s in %s for recognition", model) + #self.logger.info("Using model %s in %s for recognition", model) for (n, input_file) in enumerate(self.input_files): - #self.log.info("INPUT FILE %i / %s", n, input_file) + #self.logger.info("INPUT FILE %i / %s", n, input_file) pcgts = page_from_file(self.workspace.download_file(input_file)) page_id = pcgts.pcGtsId or input_file.pageId or input_file.ID # (PageType has no id) page = pcgts.get_Page() page_image, page_coords, _ = self.workspace.image_from_page(page, page_id) - self.log.info("Extracting from page '%s'", page_id) + self.logger.info("Extracting from page '%s'", page_id) for region in page.get_AllRegions(classes=['Text']): textlines = region.get_TextLine() - self.log.info("Extracting %i lines from region '%s'", len(textlines), region.id) + self.logger.info("Extracting %i lines from region '%s'", len(textlines), region.id) for line in textlines: if self.parameter['textequiv_level'] == 'line': path = os.path.join(filepath, page_id + region.id + line.id) @@ -110,7 +110,7 @@ def process(self): if imgpath: filelist.append(imgpath) - self.log.info("Training %s from %s on %i file pairs", + self.logger.info("Training %s from %s on %i file pairs", self.outputpath, self.modelpath or 'scratch', len(filelist)) @@ -130,7 +130,7 @@ def extract_segment(self, path, segment, page_image, page_coords): with open(gtpath, "w", encoding='utf-8') as f: f.write(gt) - self.log.debug("Extracting %s '%s'", segment.__class__.__name__, segment.id) + self.logger.debug("Extracting %s '%s'", segment.__class__.__name__, segment.id) image, coords = self.workspace.image_from_segment(segment, page_image, page_coords) if 'binarized' not in coords['features'].split(','): From fe33494814e845cfd969a5f1a51234ceadb865a3 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Tue, 13 Aug 2024 16:32:17 +0200 Subject: [PATCH 09/97] remove: unused imports --- ocrd_cis/ocropy/binarize.py | 19 +++++++----------- ocrd_cis/ocropy/clip.py | 4 ++-- ocrd_cis/ocropy/denoise.py | 4 ++-- ocrd_cis/ocropy/deskew.py | 4 ++-- ocrd_cis/ocropy/dewarp.py | 4 ++-- ocrd_cis/ocropy/recognize.py | 20 +++++++++--------- ocrd_cis/ocropy/resegment.py | 9 +++------ ocrd_cis/ocropy/segment.py | 4 ++-- ocrd_cis/ocropy/train.py | 39 ++++++++++++++++++------------------ 9 files changed, 49 insertions(+), 58 deletions(-) diff --git a/ocrd_cis/ocropy/binarize.py b/ocrd_cis/ocropy/binarize.py index cc34690e..5d3fc7c3 100644 --- a/ocrd_cis/ocropy/binarize.py +++ b/ocrd_cis/ocropy/binarize.py @@ -1,12 +1,12 @@ from __future__ import absolute_import from logging import Logger -import os.path -import PIL import cv2 import numpy as np from PIL import Image -from os.path import join +from os.path import abspath, dirname, join + +from typing import Tuple #import kraken.binarization @@ -16,18 +16,13 @@ assert_file_grp_cardinality, MIMETYPE_PAGE ) -from ocrd_modelfactory import page_from_file -from ocrd_models.ocrd_page import ( - OcrdPage, to_xml, AlternativeImageType -) +from ocrd_models.ocrd_page import AlternativeImageType, OcrdPage, to_xml from ocrd import Processor from . 
import common -from .common import ( - # binarize, - array2pil, determine_zoom, pil2array, remove_noise) +from .common import array2pil, determine_zoom, pil2array, remove_noise -#sys.path.append(os.path.dirname(os.path.abspath(__file__))) +#sys.path.append(dirname(abspath(__file__))) def binarize(pil_image, method='ocropy', maxskew=2, threshold=0.5, nrm=False, zoom=1.0): LOG = getLogger('processor.OcropyBinarize') @@ -149,7 +144,7 @@ def process_page_pcgts(self, *input_pcgts, output_file_id: str = None, page_id: return ret - def process_page(self, page, page_image, page_xywh, zoom, page_id, file_id) -> tuple[Image.Image, str, str]: + def process_page(self, page, page_image, page_xywh, zoom, page_id, file_id) -> Tuple[Image.Image, str, str]: if not page_image.width or not page_image.height: raise ValueError("Skipping page '%s' with zero size", page_id) self.logger.info("About to binarize page '%s'", page_id) diff --git a/ocrd_cis/ocropy/clip.py b/ocrd_cis/ocropy/clip.py index 1b7fb28b..b70d1fb0 100644 --- a/ocrd_cis/ocropy/clip.py +++ b/ocrd_cis/ocropy/clip.py @@ -1,7 +1,7 @@ from __future__ import absolute_import from logging import Logger -import os.path +from os.path import join import numpy as np from PIL import Image, ImageStat, ImageOps from shapely.geometry import Polygon @@ -202,7 +202,7 @@ def process(self): input_file.pageId, file_id + '_' + region.id + '_' + line.id) # update METS (add the PAGE file): - file_path = os.path.join(self.output_file_grp, file_id + '.xml') + file_path = join(self.output_file_grp, file_id + '.xml') pcgts.set_pcGtsId(file_id) out = self.workspace.add_file( ID=file_id, diff --git a/ocrd_cis/ocropy/denoise.py b/ocrd_cis/ocropy/denoise.py index 34750a53..7cf74727 100644 --- a/ocrd_cis/ocropy/denoise.py +++ b/ocrd_cis/ocropy/denoise.py @@ -1,6 +1,6 @@ from __future__ import absolute_import from logging import Logger -import os.path +from os.path import join from ocrd_utils import ( getLogger, @@ -105,7 +105,7 @@ def process(self): file_id + '_' + region.id + '_' + line.id) # update METS (add the PAGE file): - file_path = os.path.join(self.output_file_grp, file_id + '.xml') + file_path = join(self.output_file_grp, file_id + '.xml') pcgts.set_pcGtsId(file_id) out = self.workspace.add_file( ID=file_id, diff --git a/ocrd_cis/ocropy/deskew.py b/ocrd_cis/ocropy/deskew.py index 2eb898ca..bcd3be01 100644 --- a/ocrd_cis/ocropy/deskew.py +++ b/ocrd_cis/ocropy/deskew.py @@ -1,6 +1,6 @@ from __future__ import absolute_import from logging import Logger -import os.path +from os.path import join from ocrd_utils import ( getLogger, @@ -105,7 +105,7 @@ def process(self): file_id + '_' + region.id) # update METS (add the PAGE file): - file_path = os.path.join(self.output_file_grp, file_id + '.xml') + file_path = join(self.output_file_grp, file_id + '.xml') pcgts.set_pcGtsId(file_id) out = self.workspace.add_file( ID=file_id, diff --git a/ocrd_cis/ocropy/dewarp.py b/ocrd_cis/ocropy/dewarp.py index cad280c6..6c27c5c6 100644 --- a/ocrd_cis/ocropy/dewarp.py +++ b/ocrd_cis/ocropy/dewarp.py @@ -1,6 +1,6 @@ from __future__ import absolute_import from logging import Logger -import os.path +from os.path import join import numpy as np from ocrd_utils import ( @@ -172,7 +172,7 @@ def process(self): comments=line_xywh['features'] + ',dewarped')) # update METS (add the PAGE file): - file_path = os.path.join(self.output_file_grp, file_id + '.xml') + file_path = join(self.output_file_grp, file_id + '.xml') pcgts.set_pcGtsId(file_id) out = self.workspace.add_file( ID=file_id, diff --git 
a/ocrd_cis/ocropy/recognize.py b/ocrd_cis/ocropy/recognize.py index 8e147fea..f3ecf199 100644 --- a/ocrd_cis/ocropy/recognize.py +++ b/ocrd_cis/ocropy/recognize.py @@ -1,7 +1,7 @@ from __future__ import absolute_import from logging import Logger -import sys -import os.path +from sys import exit +from os.path import abspath, dirname, isfile, join import numpy as np from PIL import Image @@ -24,11 +24,9 @@ from ocrd import Processor from .. import get_ocrd_tool +from .common import check_line, pil2array from .ocrolib import lstm, load_object, midrange -from .common import ( - pil2array, - check_line -) + def resize_keep_ratio(image, baseheight=48): scale = baseheight / image.height @@ -112,20 +110,20 @@ def get_model(self): be resolved with OcrdResourceManager to a valid readeable file and returns it. If not, it checks if the model can be found in the dirname(__file__)/models/ directory.""" - canread = lambda p: os.path.isfile(p) and os.access(p, os.R_OK) + canread = lambda p: isfile(p) and os.access(p, os.R_OK) try: model = self.resolve_resource(self.parameter['model']) if canread(model): return model except SystemExit: - ocropydir = os.path.dirname(os.path.abspath(__file__)) - path = os.path.join(ocropydir, 'models', self.parameter['model']) + ocropydir = dirname(abspath(__file__)) + path = join(ocropydir, 'models', self.parameter['model']) self.logger.info("Failed to resolve model with OCR-D/core mechanism, trying %s", path) if canread(path): return path self.logger.error("Could not find model %s. Try 'ocrd resmgr download ocrd-cis-ocropy-recognize %s", self.parameter['model'], self.parameter['model']) - sys.exit(1) + exit(1) def process(self): """Recognize lines / words / glyphs of the workspace. @@ -176,7 +174,7 @@ def process(self): # update METS (add the PAGE file): file_id = make_file_id(input_file, self.output_file_grp) - file_path = os.path.join(self.output_file_grp, file_id + '.xml') + file_path = join(self.output_file_grp, file_id + '.xml') pcgts.set_pcGtsId(file_id) out = self.workspace.add_file( ID=file_id, diff --git a/ocrd_cis/ocropy/resegment.py b/ocrd_cis/ocropy/resegment.py index 1e920b0f..329694d0 100644 --- a/ocrd_cis/ocropy/resegment.py +++ b/ocrd_cis/ocropy/resegment.py @@ -1,16 +1,13 @@ from __future__ import absolute_import from logging import Logger -import os.path +from os.path import join import numpy as np from skimage import draw, segmentation from shapely.geometry import Polygon, LineString from shapely.prepared import prep -from shapely.ops import unary_union from ocrd_modelfactory import page_from_file -from ocrd_models.ocrd_page import ( - to_xml, PageType, BaselineType -) +from ocrd_models.ocrd_page import BaselineType, PageType, to_xml from ocrd import Processor from ocrd_utils import ( getLogger, @@ -169,7 +166,7 @@ def process(self): self.logger.warning('Page "%s" region "%s" contains no text lines', page_id, region.id) # update METS (add the PAGE file): - file_path = os.path.join(self.output_file_grp, file_id + '.xml') + file_path = join(self.output_file_grp, file_id + '.xml') pcgts.set_pcGtsId(file_id) out = self.workspace.add_file( ID=file_id, diff --git a/ocrd_cis/ocropy/segment.py b/ocrd_cis/ocropy/segment.py index 3b89bda6..446fc628 100644 --- a/ocrd_cis/ocropy/segment.py +++ b/ocrd_cis/ocropy/segment.py @@ -1,6 +1,6 @@ from __future__ import absolute_import from logging import Logger -import os.path +from os.path import join import itertools import numpy as np from scipy.sparse.csgraph import minimum_spanning_tree @@ -505,7 +505,7 @@ def 
process(self): input_file.pageId, zoom) # update METS (add the PAGE file): - file_path = os.path.join(self.output_file_grp, file_id + '.xml') + file_path = join(self.output_file_grp, file_id + '.xml') pcgts.set_pcGtsId(file_id) out = self.workspace.add_file( ID=file_id, diff --git a/ocrd_cis/ocropy/train.py b/ocrd_cis/ocropy/train.py index 9278da92..ff460523 100644 --- a/ocrd_cis/ocropy/train.py +++ b/ocrd_cis/ocropy/train.py @@ -1,7 +1,8 @@ from __future__ import absolute_import from logging import Logger -import sys -import os +from sys import exit +from os import getcwd, makedirs, remove +from os.path import abspath, dirname, exists, join, isfile import tempfile from ocrd_modelfactory import page_from_file @@ -15,10 +16,10 @@ def deletefiles(filelist): for file in filelist: - if os.path.exists(file): - os.remove(file) - if os.path.exists(file[:-3]+'gt.txt'): - os.remove(file[:-3]+'gt.txt') + if exists(file): + remove(file) + if exists(file[:-3]+'gt.txt'): + remove(file[:-3]+'gt.txt') def resize_keep_ratio(image, baseheight=48): hpercent = (baseheight / float(image.size[1])) @@ -31,7 +32,7 @@ class OcropyTrain(Processor): logger: Logger def __init__(self, *args, **kwargs): - self.oldcwd = os.getcwd() + self.oldcwd = getcwd() ocrd_tool = get_ocrd_tool() kwargs['ocrd_tool'] = ocrd_tool['tools'][self.executable] kwargs['version'] = ocrd_tool['version'] @@ -52,22 +53,22 @@ def setup(self): try: modelpath = self.resolve_resource(model) except SystemExit: - ocropydir = os.path.dirname(os.path.abspath(__file__)) - modelpath = os.path.join(ocropydir, 'models', model) + ocropydir = dirname(abspath(__file__)) + modelpath = join(ocropydir, 'models', model) self.logger.info("Failed to resolve model '%s' path, trying '%s'", model, modelpath) - if not os.path.isfile(modelpath): + if not isfile(modelpath): self.logger.error("Could not find model '%s'. 
Try 'ocrd resmgr download ocrd-cis-ocropy-recognize %s'", model, model) - sys.exit(1) - outputpath = os.path.join(self.oldcwd, 'output', model) + exit(1) + outputpath = join(self.oldcwd, 'output', model) if 'outputpath' in self.parameter: - outputpath = os.path.join(self.parameter, model) + outputpath = join(self.parameter, model) else: modelpath = None - outputpath = os.path.join(self.oldcwd, 'output', 'lstm') + outputpath = join(self.oldcwd, 'output', 'lstm') if 'outputpath' in self.parameter: - outputpath = os.path.join(self.parameter, 'lstm') - os.makedirs(os.path.dirname(outputpath)) + outputpath = join(self.parameter, 'lstm') + makedirs(dirname(outputpath)) self.modelpath = modelpath self.outputpath = outputpath @@ -92,20 +93,20 @@ def process(self): self.logger.info("Extracting %i lines from region '%s'", len(textlines), region.id) for line in textlines: if self.parameter['textequiv_level'] == 'line': - path = os.path.join(filepath, page_id + region.id + line.id) + path = join(filepath, page_id + region.id + line.id) imgpath = self.extract_segment(path, line, page_image, page_coords) if imgpath: filelist.append(imgpath) continue for word in line.get_Word(): if self.parameter['textequiv_level'] == 'word': - path = os.path.join(filepath, page_id + region.id + line.id + word.id) + path = join(filepath, page_id + region.id + line.id + word.id) imgpath = self.extract_segment(path, word, page_image, page_coords) if imgpath: filelist.append(imgpath) continue for glyph in word.get_Glyph(): - path = os.path.join(filepath, page_id + region.id + line.id + glyph.id) + path = join(filepath, page_id + region.id + line.id + glyph.id) imgpath = self.extract_segment(path, glyph, page_image, page_coords) if imgpath: filelist.append(imgpath) From 3368a53e8341ab265ac5fa115a740cfc02bcc5ef Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Tue, 13 Aug 2024 16:34:21 +0200 Subject: [PATCH 10/97] remove: file grp cardinality checks inside process() --- ocrd_cis/ocropy/clip.py | 2 -- ocrd_cis/ocropy/denoise.py | 2 -- ocrd_cis/ocropy/deskew.py | 2 -- ocrd_cis/ocropy/dewarp.py | 2 -- ocrd_cis/ocropy/recognize.py | 2 -- ocrd_cis/ocropy/resegment.py | 2 -- ocrd_cis/ocropy/segment.py | 3 --- 7 files changed, 15 deletions(-) diff --git a/ocrd_cis/ocropy/clip.py b/ocrd_cis/ocropy/clip.py index b70d1fb0..777b3d3d 100644 --- a/ocrd_cis/ocropy/clip.py +++ b/ocrd_cis/ocropy/clip.py @@ -85,8 +85,6 @@ def process(self): # deskewing, because that would make segments incomensurable with their # neighbours. level = self.parameter['level-of-operation'] - assert_file_grp_cardinality(self.input_file_grp, 1) - assert_file_grp_cardinality(self.output_file_grp, 1) for (n, input_file) in enumerate(self.input_files): self.logger.info("INPUT FILE %i / %s", n, input_file.pageId or input_file.ID) diff --git a/ocrd_cis/ocropy/denoise.py b/ocrd_cis/ocropy/denoise.py index 7cf74727..5d3b9d44 100644 --- a/ocrd_cis/ocropy/denoise.py +++ b/ocrd_cis/ocropy/denoise.py @@ -59,8 +59,6 @@ def process(self): Produce a new output file by serialising the resulting hierarchy. 
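        The group-cardinality asserts removed just below are assumed to be
        covered by the one-time checks already made in ``setup()``, roughly:

            def setup(self):
                self.logger = getLogger('processor.OcropyDenoise')
                assert_file_grp_cardinality(self.input_file_grp, 1)
                assert_file_grp_cardinality(self.output_file_grp, 1)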
""" level = self.parameter['level-of-operation'] - assert_file_grp_cardinality(self.input_file_grp, 1) - assert_file_grp_cardinality(self.output_file_grp, 1) for (n, input_file) in enumerate(self.input_files): self.logger.info("INPUT FILE %i / %s", n, input_file.pageId or input_file.ID) diff --git a/ocrd_cis/ocropy/deskew.py b/ocrd_cis/ocropy/deskew.py index bcd3be01..16b4bc81 100644 --- a/ocrd_cis/ocropy/deskew.py +++ b/ocrd_cis/ocropy/deskew.py @@ -63,8 +63,6 @@ def process(self): Produce a new output file by serialising the resulting hierarchy. """ level = self.parameter['level-of-operation'] - assert_file_grp_cardinality(self.input_file_grp, 1) - assert_file_grp_cardinality(self.output_file_grp, 1) for (n, input_file) in enumerate(self.input_files): self.logger.info("INPUT FILE %i / %s", n, input_file.pageId or input_file.ID) diff --git a/ocrd_cis/ocropy/dewarp.py b/ocrd_cis/ocropy/dewarp.py index 6c27c5c6..dbe512f2 100644 --- a/ocrd_cis/ocropy/dewarp.py +++ b/ocrd_cis/ocropy/dewarp.py @@ -112,8 +112,6 @@ def process(self): Produce a new output file by serialising the resulting hierarchy. """ - assert_file_grp_cardinality(self.input_file_grp, 1) - assert_file_grp_cardinality(self.output_file_grp, 1) for (n, input_file) in enumerate(self.input_files): self.logger.info("INPUT FILE %i / %s", n, input_file.pageId or input_file.ID) diff --git a/ocrd_cis/ocropy/recognize.py b/ocrd_cis/ocropy/recognize.py index f3ecf199..4b5da4b1 100644 --- a/ocrd_cis/ocropy/recognize.py +++ b/ocrd_cis/ocropy/recognize.py @@ -150,8 +150,6 @@ def process(self): Produce a new output file by serialising the resulting hierarchy. """ - assert_file_grp_cardinality(self.input_file_grp, 1) - assert_file_grp_cardinality(self.output_file_grp, 1) maxlevel = self.parameter['textequiv_level'] # self.logger.info("Using model %s in %s for recognition", model) diff --git a/ocrd_cis/ocropy/resegment.py b/ocrd_cis/ocropy/resegment.py index 329694d0..378c2fd3 100644 --- a/ocrd_cis/ocropy/resegment.py +++ b/ocrd_cis/ocropy/resegment.py @@ -114,8 +114,6 @@ def process(self): # accuracy crucially depends on a good estimate of the images' # pixel density (at least if source input is not 300 DPI). 
level = self.parameter['level-of-operation'] - assert_file_grp_cardinality(self.input_file_grp, 1) - assert_file_grp_cardinality(self.output_file_grp, 1) for n, input_file in enumerate(self.input_files): self.logger.info("INPUT FILE %i / %s", n, input_file.pageId or input_file.ID) diff --git a/ocrd_cis/ocropy/segment.py b/ocrd_cis/ocropy/segment.py index 446fc628..6feb6e29 100644 --- a/ocrd_cis/ocropy/segment.py +++ b/ocrd_cis/ocropy/segment.py @@ -338,9 +338,6 @@ def process(self): overwrite_order = self.parameter['overwrite_order'] oplevel = self.parameter['level-of-operation'] - assert_file_grp_cardinality(self.input_file_grp, 1) - assert_file_grp_cardinality(self.output_file_grp, 1) - for (n, input_file) in enumerate(self.input_files): self.logger.info("INPUT FILE %i / %s", n, input_file.pageId or input_file.ID) file_id = make_file_id(input_file, self.output_file_grp) From ae97768ea73a900092f656c6ad42a64670525a11 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Tue, 13 Aug 2024 16:41:13 +0200 Subject: [PATCH 11/97] remove: constructors, adapt setup() --- ocrd_cis/ocropy/clip.py | 7 ------- ocrd_cis/ocropy/denoise.py | 7 ------- ocrd_cis/ocropy/deskew.py | 7 ------- ocrd_cis/ocropy/dewarp.py | 10 ---------- ocrd_cis/ocropy/recognize.py | 19 ++++++------------- ocrd_cis/ocropy/resegment.py | 7 ------- ocrd_cis/ocropy/segment.py | 8 -------- ocrd_cis/ocropy/train.py | 17 ++++------------- 8 files changed, 10 insertions(+), 72 deletions(-) diff --git a/ocrd_cis/ocropy/clip.py b/ocrd_cis/ocropy/clip.py index 777b3d3d..62f68fcf 100644 --- a/ocrd_cis/ocropy/clip.py +++ b/ocrd_cis/ocropy/clip.py @@ -25,7 +25,6 @@ MIMETYPE_PAGE ) -from .. import get_ocrd_tool from .ocrolib import midrange, morph from .common import ( # binarize, @@ -34,12 +33,6 @@ class OcropyClip(Processor): logger: Logger - def __init__(self, *args, **kwargs): - self.ocrd_tool = get_ocrd_tool() - kwargs['ocrd_tool'] = self.ocrd_tool['tools'][self.executable] - kwargs['version'] = self.ocrd_tool['version'] - super(OcropyClip, self).__init__(*args, **kwargs) - @property def executable(self): return 'ocrd-cis-ocropy-clip' diff --git a/ocrd_cis/ocropy/denoise.py b/ocrd_cis/ocropy/denoise.py index 5d3b9d44..a68e2e3c 100644 --- a/ocrd_cis/ocropy/denoise.py +++ b/ocrd_cis/ocropy/denoise.py @@ -14,7 +14,6 @@ ) from ocrd import Processor -from .. import get_ocrd_tool from .common import ( # binarize, determine_zoom, remove_noise) @@ -22,12 +21,6 @@ class OcropyDenoise(Processor): logger: Logger - def __init__(self, *args, **kwargs): - self.ocrd_tool = get_ocrd_tool() - kwargs['ocrd_tool'] = self.ocrd_tool['tools'][self.executable] - kwargs['version'] = self.ocrd_tool['version'] - super(OcropyDenoise, self).__init__(*args, **kwargs) - @property def executable(self): return 'ocrd-cis-ocropy-denoise' diff --git a/ocrd_cis/ocropy/deskew.py b/ocrd_cis/ocropy/deskew.py index 16b4bc81..e41a557d 100644 --- a/ocrd_cis/ocropy/deskew.py +++ b/ocrd_cis/ocropy/deskew.py @@ -15,7 +15,6 @@ ) from ocrd import Processor -from .. import get_ocrd_tool from . 
import common from .common import pil2array @@ -29,12 +28,6 @@ def deskew(pil_image, maxskew=2): class OcropyDeskew(Processor): logger: Logger - def __init__(self, *args, **kwargs): - ocrd_tool = get_ocrd_tool() - kwargs['ocrd_tool'] = ocrd_tool['tools'][self.executable] - kwargs['version'] = ocrd_tool['version'] - super(OcropyDeskew, self).__init__(*args, **kwargs) - @property def executable(self): return 'ocrd-cis-ocropy-deskew' diff --git a/ocrd_cis/ocropy/dewarp.py b/ocrd_cis/ocropy/dewarp.py index dbe512f2..bb9e4098 100644 --- a/ocrd_cis/ocropy/dewarp.py +++ b/ocrd_cis/ocropy/dewarp.py @@ -15,7 +15,6 @@ from ocrd import Processor from ocrd_utils import MIMETYPE_PAGE -from .. import get_ocrd_tool from .ocrolib import lineest from .common import array2pil, check_line, determine_zoom, pil2array @@ -66,15 +65,6 @@ def padvert(image, range_): class OcropyDewarp(Processor): logger: Logger - def __init__(self, *args, **kwargs): - self.ocrd_tool = get_ocrd_tool() - kwargs['ocrd_tool'] = self.ocrd_tool['tools'][self.executable] - kwargs['version'] = self.ocrd_tool['version'] - super(OcropyDewarp, self).__init__(*args, **kwargs) - if hasattr(self, 'output_file_grp'): - # processing context - self.setup() - @property def executable(self): return 'ocrd-cis-ocropy-dewarp' diff --git a/ocrd_cis/ocropy/recognize.py b/ocrd_cis/ocropy/recognize.py index 4b5da4b1..5880675c 100644 --- a/ocrd_cis/ocropy/recognize.py +++ b/ocrd_cis/ocropy/recognize.py @@ -1,6 +1,8 @@ from __future__ import absolute_import from logging import Logger from sys import exit +from typing import Any +from os import access, R_OK from os.path import abspath, dirname, isfile, join import numpy as np from PIL import Image @@ -23,7 +25,6 @@ ) from ocrd import Processor -from .. import get_ocrd_tool from .common import check_line, pil2array from .ocrolib import lstm, load_object, midrange @@ -77,17 +78,8 @@ def recognize(image, pad, network, check=True): class OcropyRecognize(Processor): logger: Logger - - def __init__(self, *args, **kwargs): - self.ocrd_tool = get_ocrd_tool() - self.pad = 16 # ocropus-rpred default - self.network = None # set in process - kwargs['ocrd_tool'] = self.ocrd_tool['tools'][self.executable] - kwargs['version'] = self.ocrd_tool['version'] - super(OcropyRecognize, self).__init__(*args, **kwargs) - if hasattr(self, 'output_file_grp'): - # processing context - self.setup() + network: Any + pad: int @property def executable(self): @@ -95,6 +87,7 @@ def executable(self): def setup(self): self.logger = getLogger('processor.OcropyRecognize') + self.pad = 16 assert_file_grp_cardinality(self.input_file_grp, 1) assert_file_grp_cardinality(self.output_file_grp, 1) # from ocropus-rpred: @@ -110,7 +103,7 @@ def get_model(self): be resolved with OcrdResourceManager to a valid readeable file and returns it. If not, it checks if the model can be found in the dirname(__file__)/models/ directory.""" - canread = lambda p: isfile(p) and os.access(p, os.R_OK) + canread = lambda p: isfile(p) and access(p, R_OK) try: model = self.resolve_resource(self.parameter['model']) if canread(model): diff --git a/ocrd_cis/ocropy/resegment.py b/ocrd_cis/ocropy/resegment.py index 378c2fd3..17b90f65 100644 --- a/ocrd_cis/ocropy/resegment.py +++ b/ocrd_cis/ocropy/resegment.py @@ -21,7 +21,6 @@ MIMETYPE_PAGE ) -from .. 
import get_ocrd_tool from .ocrolib import midrange, morph from .common import ( pil2array, @@ -47,12 +46,6 @@ class OcropyResegment(Processor): logger: Logger - def __init__(self, *args, **kwargs): - self.ocrd_tool = get_ocrd_tool() - kwargs['ocrd_tool'] = self.ocrd_tool['tools'][self.executable] - kwargs['version'] = self.ocrd_tool['version'] - super().__init__(*args, **kwargs) - @property def executable(self): return 'ocrd-cis-ocropy-resegment' diff --git a/ocrd_cis/ocropy/segment.py b/ocrd_cis/ocropy/segment.py index 6feb6e29..f886e1d1 100644 --- a/ocrd_cis/ocropy/segment.py +++ b/ocrd_cis/ocropy/segment.py @@ -46,7 +46,6 @@ MIMETYPE_PAGE ) -from .. import get_ocrd_tool from .ocrolib import midrange from .ocrolib import morph from .common import ( @@ -247,13 +246,6 @@ def getx(xy): class OcropySegment(Processor): logger: Logger - def __init__(self, *args, **kwargs): - - self.ocrd_tool = get_ocrd_tool() - kwargs['ocrd_tool'] = self.ocrd_tool['tools'][self.executable] - kwargs['version'] = self.ocrd_tool['version'] - super(OcropySegment, self).__init__(*args, **kwargs) - @property def executable(self): return 'ocrd-cis-ocropy-segment' diff --git a/ocrd_cis/ocropy/train.py b/ocrd_cis/ocropy/train.py index ff460523..08b68693 100644 --- a/ocrd_cis/ocropy/train.py +++ b/ocrd_cis/ocropy/train.py @@ -8,7 +8,6 @@ from ocrd_modelfactory import page_from_file from ocrd import Processor from ocrd_utils import getLogger -from ocrd_cis import get_ocrd_tool from .ocropus_rtrain import * from .binarize import binarize @@ -30,16 +29,7 @@ def resize_keep_ratio(image, baseheight=48): class OcropyTrain(Processor): logger: Logger - - def __init__(self, *args, **kwargs): - self.oldcwd = getcwd() - ocrd_tool = get_ocrd_tool() - kwargs['ocrd_tool'] = ocrd_tool['tools'][self.executable] - kwargs['version'] = ocrd_tool['version'] - super(OcropyTrain, self).__init__(*args, **kwargs) - if hasattr(self, 'input_file_grp'): - # processing context - self.setup() + old_cwd: str @property def executable(self): @@ -47,6 +37,7 @@ def executable(self): def setup(self): self.logger = getLogger('processor.OcropyTrain') + self.old_cwd = getcwd() #print(self.parameter) if 'model' in self.parameter: model = self.parameter['model'] @@ -60,12 +51,12 @@ def setup(self): self.logger.error("Could not find model '%s'. 
Try 'ocrd resmgr download ocrd-cis-ocropy-recognize %s'", model, model) exit(1) - outputpath = join(self.oldcwd, 'output', model) + outputpath = join(self.old_cwd, 'output', model) if 'outputpath' in self.parameter: outputpath = join(self.parameter, model) else: modelpath = None - outputpath = join(self.oldcwd, 'output', 'lstm') + outputpath = join(self.old_cwd, 'output', 'lstm') if 'outputpath' in self.parameter: outputpath = join(self.parameter, 'lstm') makedirs(dirname(outputpath)) From 60d02d28040f5b1bc2b4f5497f5353d4f53d5c45 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Wed, 14 Aug 2024 00:39:18 +0200 Subject: [PATCH 12/97] completed: OcropyBinarize --- ocrd_cis/ocropy/binarize.py | 138 +++++++++++++++++------------------- 1 file changed, 65 insertions(+), 73 deletions(-) diff --git a/ocrd_cis/ocropy/binarize.py b/ocrd_cis/ocropy/binarize.py index 5d3fc7c3..0728f852 100644 --- a/ocrd_cis/ocropy/binarize.py +++ b/ocrd_cis/ocropy/binarize.py @@ -116,38 +116,36 @@ def process_page_pcgts(self, *input_pcgts, output_file_id: str = None, page_id: except ValueError as e: self.logger.exception(e) else: - # TODO - raise NotImplementedError if level == 'table': regions = page.get_TableRegion() else: # region regions = page.get_AllRegions(classes=['Text'], order='reading-order') if not regions: - self.logger.warning('Page "%s" contains no text regions', page_id) + self.logger.warning(f"Page '{page_id}' contains no text regions") for region in regions: region_image, region_xywh = self.workspace.image_from_segment( region, page_image, page_xywh, feature_filter='binarized') if level == 'region': - self.process_region(region, region_image, region_xywh, zoom, - input_file.pageId, file_id + '_' + region.id) - continue + try: + ret.append(self.process_region(region, region_image, region_xywh, zoom, page_id, file_id)) + except ValueError as e: + self.logger.exception(e) lines = region.get_TextLine() if not lines: - self.logger.warning('Page "%s" region "%s" contains no text lines', - page_id, region.id) + self.logger.warning(f"Page '{page_id}' region '{region.id}' contains no text lines") for line in lines: line_image, line_xywh = self.workspace.image_from_segment( line, region_image, region_xywh, feature_filter='binarized') - self.process_line(line, line_image, line_xywh, zoom, - input_file.pageId, region.id, - file_id + '_' + region.id + '_' + line.id) - + try: + ret.append(self.process_line(line, line_image, line_xywh, zoom, page_id, region.id, file_id)) + except ValueError as e: + self.logger.exception(e) return ret def process_page(self, page, page_image, page_xywh, zoom, page_id, file_id) -> Tuple[Image.Image, str, str]: if not page_image.width or not page_image.height: - raise ValueError("Skipping page '%s' with zero size", page_id) - self.logger.info("About to binarize page '%s'", page_id) + raise ValueError(f"Skipping page '{page_id}' with zero size") + self.logger.info(f"About to binarize page '{page_id}'") assert self.output_file_grp features = page_xywh['features'] @@ -157,18 +155,18 @@ def process_page(self, page, page_image, page_xywh, zoom, page_id, file_id) -> T maxskew = 0 else: maxskew = self.parameter['maxskew'] - bin_image, angle = binarize(page_image, - method=self.parameter['method'], - maxskew=maxskew, - threshold=self.parameter['threshold'], - nrm=self.parameter['grayscale'], - zoom=zoom) + bin_image, angle = binarize( + page_image, + method=self.parameter['method'], + maxskew=maxskew, + threshold=self.parameter['threshold'], + nrm=self.parameter['grayscale'], + zoom=zoom) 
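        # Note: binarize() returns the (optionally grayscale-normalized) image
        # together with the estimated skew angle; methods other than 'ocropy'
        # perform no skew estimation and always report an angle of 0.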
if angle: features += ',deskewed' page_xywh['angle'] = angle if self.parameter['noise_maxsize']: - bin_image = remove_noise( - bin_image, maxsize=self.parameter['noise_maxsize']) + bin_image = remove_noise(bin_image, maxsize=self.parameter['noise_maxsize']) features += ',despeckled' # annotate angle in PAGE (to allow consumers of the AlternativeImage # to do consistent coordinate transforms, and non-consumers @@ -176,43 +174,43 @@ def process_page(self, page, page_image, page_xywh, zoom, page_id, file_id) -> T orientation = -page_xywh['angle'] orientation = 180 - (180 - orientation) % 360 # map to [-179.999,180] page.set_orientation(orientation) - # update METS (add the image file): if self.parameter['grayscale']: file_id += '.IMG-NRM' features += ',grayscale_normalized' else: file_id += '.IMG-BIN' features += ',binarized' - bin_image_path = join(self.output_file_grp, f'{file_id}.png') + bin_image_id = f'{file_id}.IMG-BIN' + bin_image_path = join(self.output_file_grp, f'{bin_image_id}.png') # update PAGE (reference the image file): page.add_AlternativeImage(AlternativeImageType(filename=bin_image_path, comments=features)) - return (bin_image, file_id, bin_image_path) + return bin_image, bin_image_id, bin_image_path - def process_region(self, region, region_image, region_xywh, zoom, page_id, file_id): + def process_region(self, region, region_image, region_xywh, zoom, page_id, file_id) -> Tuple[Image.Image, str, str]: if not region_image.width or not region_image.height: - self.logger.warning("Skipping region '%s' with zero size", region.id) - return - self.logger.info("About to binarize page '%s' region '%s'", page_id, region.id) + raise ValueError(f"Skipping region '{region.id}' with zero size") + self.logger.info(f"About to binarize page '{page_id}' region '{region.id}'") features = region_xywh['features'] if 'angle' in region_xywh and region_xywh['angle']: # orientation has already been annotated (by previous deskewing), # so skip deskewing here: - bin_image, _ = binarize(region_image, - method=self.parameter['method'], - maxskew=0, - nrm=self.parameter['grayscale'], - zoom=zoom) + bin_image, _ = binarize( + region_image, + method=self.parameter['method'], + maxskew=0, + nrm=self.parameter['grayscale'], + zoom=zoom) else: - bin_image, angle = binarize(region_image, - method=self.parameter['method'], - maxskew=self.parameter['maxskew'], - nrm=self.parameter['grayscale'], - zoom=zoom) + bin_image, angle = binarize( + region_image, + method=self.parameter['method'], + maxskew=self.parameter['maxskew'], + nrm=self.parameter['grayscale'], + zoom=zoom) if angle: features += ',deskewed' region_xywh['angle'] = angle - bin_image = remove_noise(bin_image, - maxsize=self.parameter['noise_maxsize']) + bin_image = remove_noise(bin_image, maxsize=self.parameter['noise_maxsize']) if self.parameter['noise_maxsize']: features += ',despeckled' # annotate angle in PAGE (to allow consumers of the AlternativeImage @@ -221,33 +219,31 @@ def process_region(self, region, region_image, region_xywh, zoom, page_id, file_ orientation = -region_xywh['angle'] orientation = 180 - (180 - orientation) % 360 # map to [-179.999,180] region.set_orientation(orientation) - # update METS (add the image file): + bin_image_id = f'{file_id}_{region.id}' if self.parameter['grayscale']: - file_id += '.IMG-NRM' + bin_image_id += '.IMG-NRM' features += ',grayscale_normalized' else: - file_id += '.IMG-BIN' + bin_image_id += '.IMG-BIN' features += ',binarized' - file_path = self.workspace.save_image_file( - bin_image, file_id, 
self.output_file_grp, - page_id=page_id) + bin_image_path = join(self.output_file_grp, f'{bin_image_id}.png') # update PAGE (reference the image file): - region.add_AlternativeImage(AlternativeImageType( - filename=file_path, - comments=features)) + region.add_AlternativeImage(AlternativeImageType(filename=bin_image_path, comments=features)) + return bin_image, bin_image_id, bin_image_path - def process_line(self, line, line_image, line_xywh, zoom, page_id, region_id, file_id): + def process_line( + self, line, line_image, line_xywh, zoom, page_id, region_id, file_id + ) -> Tuple[Image.Image, str, str]: if not line_image.width or not line_image.height: - self.logger.warning("Skipping line '%s' with zero size", line.id) - return - self.logger.info("About to binarize page '%s' region '%s' line '%s'", - page_id, region_id, line.id) + raise ValueError(f"Skipping line '{line.id}' with zero size") + self.logger.info(f"About to binarize page '{page_id}' region '{region_id}' line '{line.id}'") features = line_xywh['features'] - bin_image, angle = binarize(line_image, - method=self.parameter['method'], - maxskew=self.parameter['maxskew'], - nrm=self.parameter['grayscale'], - zoom=zoom) + bin_image, angle = binarize( + line_image, + method=self.parameter['method'], + maxskew=self.parameter['maxskew'], + nrm=self.parameter['grayscale'], + zoom=zoom) if angle: features += ',deskewed' # annotate angle in PAGE (to allow consumers of the AlternativeImage @@ -256,23 +252,19 @@ def process_line(self, line, line_image, line_xywh, zoom, page_id, region_id, fi #orientation = -angle #orientation = 180 - (180 - orientation) % 360 # map to [-179.999,180] #line.set_orientation(orientation) # does not exist on line level! - self.logger.warning("cannot add orientation %.2f to page '%s' region '%s' line '%s'", - -angle, page_id, region_id, line.id) - bin_image = remove_noise(bin_image, - maxsize=self.parameter['noise_maxsize']) + self.logger.warning(f"cannot add orientation %.2f to page '{page_id}' region '{region_id}' line '{line.id}'", + -angle) + bin_image = remove_noise(bin_image, maxsize=self.parameter['noise_maxsize']) if self.parameter['noise_maxsize']: features += ',despeckled' - # update METS (add the image file): + bin_image_id = f'{file_id}_{region_id}_{line.id}' if self.parameter['grayscale']: - file_id += '.IMG-NRM' + bin_image_id += '.IMG-NRM' features += ',grayscale_normalized' else: - file_id += '.IMG-BIN' + bin_image_id += '.IMG-BIN' features += ',binarized' - file_path = self.workspace.save_image_file( - bin_image, file_id, self.output_file_grp, - page_id=page_id) + bin_image_path = join(self.output_file_grp, f'{bin_image_id}.png') # update PAGE (reference the image file): - line.add_AlternativeImage(AlternativeImageType( - filename=file_path, - comments=features)) + line.add_AlternativeImage(AlternativeImageType(filename=bin_image_path, comments=features)) + return bin_image, bin_image_id, bin_image_path From dcaccd4b5bb357c4f73356aaed04fd8a4483caa8 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Wed, 14 Aug 2024 00:46:34 +0200 Subject: [PATCH 13/97] remove file grp cardinality asserts --- ocrd_cis/ocropy/binarize.py | 3 --- ocrd_cis/ocropy/clip.py | 3 --- ocrd_cis/ocropy/denoise.py | 3 --- ocrd_cis/ocropy/deskew.py | 3 --- ocrd_cis/ocropy/dewarp.py | 3 --- ocrd_cis/ocropy/recognize.py | 3 --- ocrd_cis/ocropy/resegment.py | 3 --- ocrd_cis/ocropy/segment.py | 3 --- 8 files changed, 24 deletions(-) diff --git a/ocrd_cis/ocropy/binarize.py b/ocrd_cis/ocropy/binarize.py index 0728f852..746aba5e 
100644 --- a/ocrd_cis/ocropy/binarize.py +++ b/ocrd_cis/ocropy/binarize.py @@ -13,7 +13,6 @@ from ocrd_utils import ( getLogger, make_file_id, - assert_file_grp_cardinality, MIMETYPE_PAGE ) from ocrd_models.ocrd_page import AlternativeImageType, OcrdPage, to_xml @@ -69,8 +68,6 @@ def executable(self): def setup(self): self.logger = getLogger('processor.OcropyBinarize') - assert_file_grp_cardinality(self.input_file_grp, 1) - assert_file_grp_cardinality(self.output_file_grp, 1) method = self.parameter['method'] if self.parameter['grayscale'] and method != 'ocropy': self.logger.critical(f'Requested method {method} does not support grayscale normalized output') diff --git a/ocrd_cis/ocropy/clip.py b/ocrd_cis/ocropy/clip.py index 62f68fcf..3e76157b 100644 --- a/ocrd_cis/ocropy/clip.py +++ b/ocrd_cis/ocropy/clip.py @@ -15,7 +15,6 @@ from ocrd_utils import ( getLogger, make_file_id, - assert_file_grp_cardinality, coordinates_of_segment, polygon_from_points, bbox_from_polygon, @@ -39,8 +38,6 @@ def executable(self): def setup(self): self.logger = getLogger('processor.OcropyClip') - assert_file_grp_cardinality(self.input_file_grp, 1) - assert_file_grp_cardinality(self.output_file_grp, 1) def process(self): """Clip text regions / lines of the workspace at intersections with neighbours. diff --git a/ocrd_cis/ocropy/denoise.py b/ocrd_cis/ocropy/denoise.py index a68e2e3c..24852f24 100644 --- a/ocrd_cis/ocropy/denoise.py +++ b/ocrd_cis/ocropy/denoise.py @@ -5,7 +5,6 @@ from ocrd_utils import ( getLogger, make_file_id, - assert_file_grp_cardinality, MIMETYPE_PAGE ) from ocrd_modelfactory import page_from_file @@ -27,8 +26,6 @@ def executable(self): def setup(self): self.logger = getLogger('processor.OcropyDenoise') - assert_file_grp_cardinality(self.input_file_grp, 1) - assert_file_grp_cardinality(self.output_file_grp, 1) def process(self): """Despeckle the pages / regions / lines of the workspace. diff --git a/ocrd_cis/ocropy/deskew.py b/ocrd_cis/ocropy/deskew.py index e41a557d..616864e1 100644 --- a/ocrd_cis/ocropy/deskew.py +++ b/ocrd_cis/ocropy/deskew.py @@ -5,7 +5,6 @@ from ocrd_utils import ( getLogger, make_file_id, - assert_file_grp_cardinality, MIMETYPE_PAGE ) from ocrd_modelfactory import page_from_file @@ -34,8 +33,6 @@ def executable(self): def setup(self): self.logger = getLogger('processor.OcropyDeskew') - assert_file_grp_cardinality(self.input_file_grp, 1) - assert_file_grp_cardinality(self.output_file_grp, 1) def process(self): """Deskew the pages or regions of the workspace. 
diff --git a/ocrd_cis/ocropy/dewarp.py b/ocrd_cis/ocropy/dewarp.py index bb9e4098..17b69bc5 100644 --- a/ocrd_cis/ocropy/dewarp.py +++ b/ocrd_cis/ocropy/dewarp.py @@ -6,7 +6,6 @@ from ocrd_utils import ( getLogger, make_file_id, - assert_file_grp_cardinality, ) from ocrd_modelfactory import page_from_file from ocrd_models.ocrd_page import ( @@ -71,8 +70,6 @@ def executable(self): def setup(self): self.logger = getLogger('processor.OcropyDewarp') - assert_file_grp_cardinality(self.input_file_grp, 1) - assert_file_grp_cardinality(self.output_file_grp, 1) # defaults from ocrolib.lineest: self.lnorm = lineest.CenterNormalizer( params=(self.parameter['range'], diff --git a/ocrd_cis/ocropy/recognize.py b/ocrd_cis/ocropy/recognize.py index 5880675c..40de2817 100644 --- a/ocrd_cis/ocropy/recognize.py +++ b/ocrd_cis/ocropy/recognize.py @@ -12,7 +12,6 @@ from ocrd_utils import ( getLogger, make_file_id, - assert_file_grp_cardinality, coordinates_for_segment, polygon_from_bbox, points_from_polygon, @@ -88,8 +87,6 @@ def executable(self): def setup(self): self.logger = getLogger('processor.OcropyRecognize') self.pad = 16 - assert_file_grp_cardinality(self.input_file_grp, 1) - assert_file_grp_cardinality(self.output_file_grp, 1) # from ocropus-rpred: self.network = load_object(self.get_model(), verbose=1) for x in self.network.walk(): diff --git a/ocrd_cis/ocropy/resegment.py b/ocrd_cis/ocropy/resegment.py index 17b90f65..2483411d 100644 --- a/ocrd_cis/ocropy/resegment.py +++ b/ocrd_cis/ocropy/resegment.py @@ -12,7 +12,6 @@ from ocrd_utils import ( getLogger, make_file_id, - assert_file_grp_cardinality, coordinates_of_segment, coordinates_for_segment, points_from_polygon, @@ -52,8 +51,6 @@ def executable(self): def setup(self): self.logger = getLogger('processor.OcropyResegment') - assert_file_grp_cardinality(self.input_file_grp, 1) - assert_file_grp_cardinality(self.output_file_grp, 1) def process(self): """Resegment lines of the workspace. diff --git a/ocrd_cis/ocropy/segment.py b/ocrd_cis/ocropy/segment.py index f886e1d1..9a1b8e11 100644 --- a/ocrd_cis/ocropy/segment.py +++ b/ocrd_cis/ocropy/segment.py @@ -38,7 +38,6 @@ from ocrd_utils import ( getLogger, make_file_id, - assert_file_grp_cardinality, coordinates_of_segment, coordinates_for_segment, points_from_polygon, @@ -252,8 +251,6 @@ def executable(self): def setup(self): self.logger = getLogger('processor.OcropySegment') - assert_file_grp_cardinality(self.input_file_grp, 1) - assert_file_grp_cardinality(self.output_file_grp, 1) def process(self): """Segment pages into regions+lines, tables into cells+lines, or regions into lines. 
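For context on the lines deleted in this commit: assert_file_grp_cardinality() from ocrd_utils only verified that the comma-separated input/output fileGrp strings name exactly one group each. Dropping the calls assumes that the ocrd core framework now enforces the declared cardinality itself, so the per-processor assertions become redundant. A minimal sketch of the guard that goes away (a hypothetical helper for illustration, not a function from ocrd_cis or ocrd_utils):

    # illustrative stand-in for the removed assert_file_grp_cardinality() calls
    def check_file_grp_cardinality(file_grp: str, expected: int = 1) -> None:
        # fileGrp arguments are comma-separated lists of METS file group names
        groups = file_grp.split(',') if file_grp else []
        if len(groups) != expected:
            raise ValueError(
                f"expected {expected} file group(s), got {len(groups)}: {file_grp!r}")

    check_file_grp_cardinality("OCR-D-GT-SEG-LINE")      # passes: exactly one group
    # check_file_grp_cardinality("OCR-D-IMG,OCR-D-BIN")  # would raise ValueError
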
From b178227763b834802b1e775623402b7bb5cdf84c Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Wed, 14 Aug 2024 10:51:52 +0200 Subject: [PATCH 14/97] Update ocrd_cis/ocropy/binarize.py Co-authored-by: Konstantin Baierer --- ocrd_cis/ocropy/binarize.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ocrd_cis/ocropy/binarize.py b/ocrd_cis/ocropy/binarize.py index 746aba5e..27a3667c 100644 --- a/ocrd_cis/ocropy/binarize.py +++ b/ocrd_cis/ocropy/binarize.py @@ -118,7 +118,7 @@ def process_page_pcgts(self, *input_pcgts, output_file_id: str = None, page_id: else: # region regions = page.get_AllRegions(classes=['Text'], order='reading-order') if not regions: - self.logger.warning(f"Page '{page_id}' contains no text regions") + self.logger.warning(f"Page '{page_id}' contains no regions") for region in regions: region_image, region_xywh = self.workspace.image_from_segment( region, page_image, page_xywh, feature_filter='binarized') From 67b6107e19c604063e9dae37473fcc48e04b4558 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Wed, 14 Aug 2024 10:52:25 +0200 Subject: [PATCH 15/97] Update ocrd_cis/ocropy/binarize.py Co-authored-by: Konstantin Baierer --- ocrd_cis/ocropy/binarize.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ocrd_cis/ocropy/binarize.py b/ocrd_cis/ocropy/binarize.py index 27a3667c..fea064af 100644 --- a/ocrd_cis/ocropy/binarize.py +++ b/ocrd_cis/ocropy/binarize.py @@ -76,7 +76,7 @@ def setup(self): def process_page_pcgts(self, *input_pcgts, output_file_id: str = None, page_id: str = None) -> OcrdPage: """Binarize (and optionally deskew/despeckle) the pages/regions/lines of the workspace. - THEN Iterate over the PAGE-XML element hierarchy down to the requested + Iterate over the PAGE-XML element hierarchy down to the requested ``level-of-operation``. Next, for each file, crop each segment image according to the layout From 06a98b1f601d80511e73b0c366a60f574e2a8e27 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Wed, 14 Aug 2024 10:55:29 +0200 Subject: [PATCH 16/97] Update ocrd_cis/ocropy/binarize.py Co-authored-by: Konstantin Baierer --- ocrd_cis/ocropy/binarize.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ocrd_cis/ocropy/binarize.py b/ocrd_cis/ocropy/binarize.py index fea064af..7e355d73 100644 --- a/ocrd_cis/ocropy/binarize.py +++ b/ocrd_cis/ocropy/binarize.py @@ -71,7 +71,7 @@ def setup(self): method = self.parameter['method'] if self.parameter['grayscale'] and method != 'ocropy': self.logger.critical(f'Requested method {method} does not support grayscale normalized output') - raise Exception('only method=ocropy allows grayscale=true') + raise ValueError('only method=ocropy allows grayscale=true') def process_page_pcgts(self, *input_pcgts, output_file_id: str = None, page_id: str = None) -> OcrdPage: """Binarize (and optionally deskew/despeckle) the pages/regions/lines of the workspace. From 1e6cd7bd53547de5c41f2100cdad8adc1a2091ca Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Wed, 14 Aug 2024 10:55:45 +0200 Subject: [PATCH 17/97] Update ocrd_cis/ocropy/binarize.py Co-authored-by: Konstantin Baierer --- ocrd_cis/ocropy/binarize.py | 1 - 1 file changed, 1 deletion(-) diff --git a/ocrd_cis/ocropy/binarize.py b/ocrd_cis/ocropy/binarize.py index 7e355d73..af60e613 100644 --- a/ocrd_cis/ocropy/binarize.py +++ b/ocrd_cis/ocropy/binarize.py @@ -21,7 +21,6 @@ from . 
import common from .common import array2pil, determine_zoom, pil2array, remove_noise -#sys.path.append(dirname(abspath(__file__))) def binarize(pil_image, method='ocropy', maxskew=2, threshold=0.5, nrm=False, zoom=1.0): LOG = getLogger('processor.OcropyBinarize') From 71bb26d9c4f0b45498625b90c9e4cd136d8e667e Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Wed, 14 Aug 2024 11:04:12 +0200 Subject: [PATCH 18/97] fix: potentially wrong dpi in logs --- ocrd_cis/ocropy/binarize.py | 4 ++-- ocrd_cis/ocropy/clip.py | 4 ++-- ocrd_cis/ocropy/common.py | 4 ++-- ocrd_cis/ocropy/denoise.py | 4 ++-- ocrd_cis/ocropy/dewarp.py | 4 ++-- ocrd_cis/ocropy/resegment.py | 4 ++-- ocrd_cis/ocropy/segment.py | 4 ++-- 7 files changed, 14 insertions(+), 14 deletions(-) diff --git a/ocrd_cis/ocropy/binarize.py b/ocrd_cis/ocropy/binarize.py index af60e613..61e959ca 100644 --- a/ocrd_cis/ocropy/binarize.py +++ b/ocrd_cis/ocropy/binarize.py @@ -102,8 +102,8 @@ def process_page_pcgts(self, *input_pcgts, output_file_id: str = None, page_id: assert page page_image, page_xywh, page_image_info = self.workspace.image_from_page(page, page_id, feature_filter='binarized') - zoom = determine_zoom(self.parameter['dpi'], page_image_info) - self.logger.info(f"Page '{page_id}' uses {self.parameter['dpi']} DPI") + zoom, dpi = determine_zoom(self.parameter['dpi'], page_image_info) + self.logger.info(f"Page '{page_id}' uses {dpi} DPI. Determined zoom={zoom}") ret = [pcgts] if level == 'page': diff --git a/ocrd_cis/ocropy/clip.py b/ocrd_cis/ocropy/clip.py index 3e76157b..3607399b 100644 --- a/ocrd_cis/ocropy/clip.py +++ b/ocrd_cis/ocropy/clip.py @@ -87,8 +87,8 @@ def process(self): page_image, page_coords, page_image_info = self.workspace.image_from_page( page, page_id, feature_selector='binarized') - zoom = determine_zoom(self.parameter['dpi'], page_image_info) - self.logger.info(f"Page '{page_id}' uses {self.parameter['dpi']} DPI") + zoom, dpi = determine_zoom(self.parameter['dpi'], page_image_info) + self.logger.info(f"Page '{page_id}' uses {dpi} DPI. Determined zoom={zoom}") # FIXME: what about text regions inside table regions? regions = list(page.get_TextRegion()) diff --git a/ocrd_cis/ocropy/common.py b/ocrd_cis/ocropy/common.py index 1804c29d..49e8f248 100644 --- a/ocrd_cis/ocropy/common.py +++ b/ocrd_cis/ocropy/common.py @@ -2103,7 +2103,7 @@ def find_topological(): # DSAVE('rlabels_closed', rlabels) return rlabels -def determine_zoom(dpi: float, page_image_info: OcrdExif) -> float: +def determine_zoom(dpi: float, page_image_info: OcrdExif) -> (float, float): if dpi > 0: zoom = 300.0/dpi elif page_image_info.resolution != 1: @@ -2113,4 +2113,4 @@ def determine_zoom(dpi: float, page_image_info: OcrdExif) -> float: zoom = 300.0/dpi else: zoom = 1 - return zoom + return zoom, dpi diff --git a/ocrd_cis/ocropy/denoise.py b/ocrd_cis/ocropy/denoise.py index 24852f24..713af889 100644 --- a/ocrd_cis/ocropy/denoise.py +++ b/ocrd_cis/ocropy/denoise.py @@ -63,8 +63,8 @@ def process(self): page, page_id, feature_selector='binarized' if level == 'page' else '') - zoom = determine_zoom(self.parameter['dpi'], page_image_info) - self.logger.info(f"Page '{page_id}' uses {self.parameter['dpi']} DPI") + zoom, dpi = determine_zoom(self.parameter['dpi'], page_image_info) + self.logger.info(f"Page '{page_id}' uses {dpi} DPI. 
Determined zoom={zoom}") if level == 'page': self.process_segment(page, page_image, page_xywh, zoom, diff --git a/ocrd_cis/ocropy/dewarp.py b/ocrd_cis/ocropy/dewarp.py index 17b69bc5..412724db 100644 --- a/ocrd_cis/ocropy/dewarp.py +++ b/ocrd_cis/ocropy/dewarp.py @@ -112,8 +112,8 @@ def process(self): page_image, page_xywh, page_image_info = self.workspace.image_from_page( page, page_id) - zoom = determine_zoom(self.parameter['dpi'], page_image_info) - self.logger.info(f"Page '{page_id}' uses {self.parameter['dpi']} DPI") + zoom, dpi = determine_zoom(self.parameter['dpi'], page_image_info) + self.logger.info(f"Page '{page_id}' uses {dpi} DPI. Determined zoom={zoom}") regions = page.get_AllRegions(classes=['Text'], order='reading-order') if not regions: diff --git a/ocrd_cis/ocropy/resegment.py b/ocrd_cis/ocropy/resegment.py index 2483411d..5bc9d008 100644 --- a/ocrd_cis/ocropy/resegment.py +++ b/ocrd_cis/ocropy/resegment.py @@ -117,8 +117,8 @@ def process(self): page_image, page_coords, page_image_info = self.workspace.image_from_page( page, page_id, feature_selector='binarized') - zoom = determine_zoom(self.parameter['dpi'], page_image_info) - self.logger.info(f"Page '{page_id}' uses {self.parameter['dpi']} DPI") + zoom, dpi = determine_zoom(self.parameter['dpi'], page_image_info) + self.logger.info(f"Page '{page_id}' uses {dpi} DPI. Determined zoom={zoom}") ignore = (page.get_ImageRegion() + page.get_LineDrawingRegion() + diff --git a/ocrd_cis/ocropy/segment.py b/ocrd_cis/ocropy/segment.py index 9a1b8e11..d171b6ed 100644 --- a/ocrd_cis/ocropy/segment.py +++ b/ocrd_cis/ocropy/segment.py @@ -339,8 +339,8 @@ def process(self): # TODO: also allow grayscale_normalized (try/except?) page_image, page_coords, page_image_info = self.workspace.image_from_page( page, page_id, feature_selector='binarized') - zoom = determine_zoom(self.parameter['dpi'], page_image_info) - self.logger.info(f"Page '{page_id}' uses {self.parameter['dpi']} DPI") + zoom, dpi = determine_zoom(self.parameter['dpi'], page_image_info) + self.logger.info(f"Page '{page_id}' uses {dpi} DPI. 
Determined zoom={zoom}") # aggregate existing regions so their foreground can be ignored ignore = (page.get_ImageRegion() + From 64f02a32f938a00e01d6d390993246a617cbab5e Mon Sep 17 00:00:00 2001 From: kba Date: Wed, 14 Aug 2024 11:14:31 +0200 Subject: [PATCH 19/97] binarize: don't conflate region/lines seg, pass output_file_id --- ocrd_cis/ocropy/binarize.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/ocrd_cis/ocropy/binarize.py b/ocrd_cis/ocropy/binarize.py index 61e959ca..817d4a8a 100644 --- a/ocrd_cis/ocropy/binarize.py +++ b/ocrd_cis/ocropy/binarize.py @@ -123,7 +123,8 @@ def process_page_pcgts(self, *input_pcgts, output_file_id: str = None, page_id: region, page_image, page_xywh, feature_filter='binarized') if level == 'region': try: - ret.append(self.process_region(region, region_image, region_xywh, zoom, page_id, file_id)) + ret.append(self.process_region(region, region_image, region_xywh, zoom, region.id, output_file_id)) + continue except ValueError as e: self.logger.exception(e) lines = region.get_TextLine() @@ -133,8 +134,8 @@ def process_page_pcgts(self, *input_pcgts, output_file_id: str = None, page_id: line_image, line_xywh = self.workspace.image_from_segment( line, region_image, region_xywh, feature_filter='binarized') try: - ret.append(self.process_line(line, line_image, line_xywh, zoom, page_id, region.id, file_id)) - except ValueError as e: + ret.append(self.process_line(line, line_image, line_xywh, zoom, page_id, region.id, output_file_id)) + except alueError as e: self.logger.exception(e) return ret From d7c15c7738cdad474eb1999718c41371192e0e14 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Wed, 14 Aug 2024 11:29:21 +0200 Subject: [PATCH 20/97] Update binarize.py --- ocrd_cis/ocropy/binarize.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ocrd_cis/ocropy/binarize.py b/ocrd_cis/ocropy/binarize.py index 817d4a8a..064a733e 100644 --- a/ocrd_cis/ocropy/binarize.py +++ b/ocrd_cis/ocropy/binarize.py @@ -135,7 +135,7 @@ def process_page_pcgts(self, *input_pcgts, output_file_id: str = None, page_id: line, region_image, region_xywh, feature_filter='binarized') try: ret.append(self.process_line(line, line_image, line_xywh, zoom, page_id, region.id, output_file_id)) - except alueError as e: + except ValueError as e: self.logger.exception(e) return ret From 19566c0567b5b23bdc4596384d3867601045ca57 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Wed, 14 Aug 2024 13:53:35 +0200 Subject: [PATCH 21/97] try to migrate recognize --- ocrd_cis/ocropy/recognize.py | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/ocrd_cis/ocropy/recognize.py b/ocrd_cis/ocropy/recognize.py index 40de2817..140a3c83 100644 --- a/ocrd_cis/ocropy/recognize.py +++ b/ocrd_cis/ocropy/recognize.py @@ -115,6 +115,30 @@ def get_model(self): self.parameter['model'], self.parameter['model']) exit(1) + def process_page_pcgts(self, *input_pcgts, output_file_id: str = None, page_id: str = None) -> OcrdPage: + maxlevel = self.parameter['textequiv_level'] + assert self.workspace + self.logger.debug(f'Max level: "{maxlevel}"') + + pcgts = input_pcgts[0] + page = pcgts.get_Page() + assert page + + page_image, page_coords, _ = self.workspace.image_from_page(page, page_id) + ret = [pcgts] + + self.logger.info(f"Recognizing text in page '{page_id}'") + # region, line, word, or glyph level: + regions = page.get_AllRegions(classes=['Text']) + if not regions: + self.logger.warning(f"Page '{page_id}' contains no text regions") + 
self.process_regions(regions, maxlevel, page_image, page_coords) + + file_path = join(self.output_file_grp, output_file_id + '.xml') + ret.append((output_file_id, file_path)) + return ret + + # TODO: remove when `process_page_pcgts` is validated to be correct def process(self): """Recognize lines / words / glyphs of the workspace. From 5f60976452011656fd05c1375055dd5ebd5f89d9 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Wed, 14 Aug 2024 13:59:33 +0200 Subject: [PATCH 22/97] fix: migrate recognize --- ocrd_cis/ocropy/recognize.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/ocrd_cis/ocropy/recognize.py b/ocrd_cis/ocropy/recognize.py index 140a3c83..9729b480 100644 --- a/ocrd_cis/ocropy/recognize.py +++ b/ocrd_cis/ocropy/recognize.py @@ -125,18 +125,13 @@ def process_page_pcgts(self, *input_pcgts, output_file_id: str = None, page_id: assert page page_image, page_coords, _ = self.workspace.image_from_page(page, page_id) - ret = [pcgts] - self.logger.info(f"Recognizing text in page '{page_id}'") # region, line, word, or glyph level: regions = page.get_AllRegions(classes=['Text']) if not regions: self.logger.warning(f"Page '{page_id}' contains no text regions") self.process_regions(regions, maxlevel, page_image, page_coords) - - file_path = join(self.output_file_grp, output_file_id + '.xml') - ret.append((output_file_id, file_path)) - return ret + return [pcgts] # TODO: remove when `process_page_pcgts` is validated to be correct def process(self): From e8b26035f0d4bd84e689ce92f8da805cb0adaf13 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Wed, 14 Aug 2024 14:35:53 +0200 Subject: [PATCH 23/97] fix: detect_zoom logging --- ocrd_cis/ocropy/binarize.py | 5 ++--- ocrd_cis/ocropy/clip.py | 4 ++-- ocrd_cis/ocropy/common.py | 5 +++-- ocrd_cis/ocropy/denoise.py | 3 +-- ocrd_cis/ocropy/dewarp.py | 3 +-- ocrd_cis/ocropy/resegment.py | 3 +-- ocrd_cis/ocropy/segment.py | 3 +-- 7 files changed, 11 insertions(+), 15 deletions(-) diff --git a/ocrd_cis/ocropy/binarize.py b/ocrd_cis/ocropy/binarize.py index 064a733e..387c51dc 100644 --- a/ocrd_cis/ocropy/binarize.py +++ b/ocrd_cis/ocropy/binarize.py @@ -102,9 +102,8 @@ def process_page_pcgts(self, *input_pcgts, output_file_id: str = None, page_id: assert page page_image, page_xywh, page_image_info = self.workspace.image_from_page(page, page_id, feature_filter='binarized') - zoom, dpi = determine_zoom(self.parameter['dpi'], page_image_info) - self.logger.info(f"Page '{page_id}' uses {dpi} DPI. Determined zoom={zoom}") - + zoom = determine_zoom(self.logger, self.parameter['dpi'], page_image_info) + ret = [pcgts] if level == 'page': try: diff --git a/ocrd_cis/ocropy/clip.py b/ocrd_cis/ocropy/clip.py index 3607399b..dd0de012 100644 --- a/ocrd_cis/ocropy/clip.py +++ b/ocrd_cis/ocropy/clip.py @@ -87,8 +87,8 @@ def process(self): page_image, page_coords, page_image_info = self.workspace.image_from_page( page, page_id, feature_selector='binarized') - zoom, dpi = determine_zoom(self.parameter['dpi'], page_image_info) - self.logger.info(f"Page '{page_id}' uses {dpi} DPI. Determined zoom={zoom}") + # TODO: zoom is not used anywhere, is it still useful to have this call here? + zoom = determine_zoom(self.logger, self.parameter['dpi'], page_image_info) # FIXME: what about text regions inside table regions? 
regions = list(page.get_TextRegion()) diff --git a/ocrd_cis/ocropy/common.py b/ocrd_cis/ocropy/common.py index 49e8f248..095de5eb 100644 --- a/ocrd_cis/ocropy/common.py +++ b/ocrd_cis/ocropy/common.py @@ -2103,14 +2103,15 @@ def find_topological(): # DSAVE('rlabels_closed', rlabels) return rlabels -def determine_zoom(dpi: float, page_image_info: OcrdExif) -> (float, float): +def determine_zoom(logger: logging.Logger, dpi: float, page_image_info: OcrdExif) -> float: if dpi > 0: zoom = 300.0/dpi elif page_image_info.resolution != 1: dpi = page_image_info.resolution if page_image_info.resolutionUnit == 'cm': dpi *= 2.54 + logger.info(f"Page '{page_id}' uses {dpi} DPI.") zoom = 300.0/dpi else: zoom = 1 - return zoom, dpi + return zoom diff --git a/ocrd_cis/ocropy/denoise.py b/ocrd_cis/ocropy/denoise.py index 713af889..78d11c28 100644 --- a/ocrd_cis/ocropy/denoise.py +++ b/ocrd_cis/ocropy/denoise.py @@ -63,8 +63,7 @@ def process(self): page, page_id, feature_selector='binarized' if level == 'page' else '') - zoom, dpi = determine_zoom(self.parameter['dpi'], page_image_info) - self.logger.info(f"Page '{page_id}' uses {dpi} DPI. Determined zoom={zoom}") + zoom = determine_zoom(self.logger, self.parameter['dpi'], page_image_info) if level == 'page': self.process_segment(page, page_image, page_xywh, zoom, diff --git a/ocrd_cis/ocropy/dewarp.py b/ocrd_cis/ocropy/dewarp.py index 412724db..9dddae44 100644 --- a/ocrd_cis/ocropy/dewarp.py +++ b/ocrd_cis/ocropy/dewarp.py @@ -112,8 +112,7 @@ def process(self): page_image, page_xywh, page_image_info = self.workspace.image_from_page( page, page_id) - zoom, dpi = determine_zoom(self.parameter['dpi'], page_image_info) - self.logger.info(f"Page '{page_id}' uses {dpi} DPI. Determined zoom={zoom}") + zoom = determine_zoom(self.logger, self.parameter['dpi'], page_image_info) regions = page.get_AllRegions(classes=['Text'], order='reading-order') if not regions: diff --git a/ocrd_cis/ocropy/resegment.py b/ocrd_cis/ocropy/resegment.py index 5bc9d008..e8c52a69 100644 --- a/ocrd_cis/ocropy/resegment.py +++ b/ocrd_cis/ocropy/resegment.py @@ -117,8 +117,7 @@ def process(self): page_image, page_coords, page_image_info = self.workspace.image_from_page( page, page_id, feature_selector='binarized') - zoom, dpi = determine_zoom(self.parameter['dpi'], page_image_info) - self.logger.info(f"Page '{page_id}' uses {dpi} DPI. Determined zoom={zoom}") + zoom = determine_zoom(self.logger, self.parameter['dpi'], page_image_info) ignore = (page.get_ImageRegion() + page.get_LineDrawingRegion() + diff --git a/ocrd_cis/ocropy/segment.py b/ocrd_cis/ocropy/segment.py index d171b6ed..c092718f 100644 --- a/ocrd_cis/ocropy/segment.py +++ b/ocrd_cis/ocropy/segment.py @@ -339,8 +339,7 @@ def process(self): # TODO: also allow grayscale_normalized (try/except?) page_image, page_coords, page_image_info = self.workspace.image_from_page( page, page_id, feature_selector='binarized') - zoom, dpi = determine_zoom(self.parameter['dpi'], page_image_info) - self.logger.info(f"Page '{page_id}' uses {dpi} DPI. 
Determined zoom={zoom}") + zoom = determine_zoom(self.logger, self.parameter['dpi'], page_image_info) # aggregate existing regions so their foreground can be ignored ignore = (page.get_ImageRegion() + From 7dfd4964be3f4e4db9bfe6ff548eda477ed36ae6 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Wed, 14 Aug 2024 14:38:05 +0200 Subject: [PATCH 24/97] update: test_lib base url --- tests/test_lib.bash | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_lib.bash b/tests/test_lib.bash index f28acb1e..c018d253 100644 --- a/tests/test_lib.bash +++ b/tests/test_lib.bash @@ -5,7 +5,7 @@ trap "trap 'echo exiting without removing $tmpdir' EXIT" ERR trap "rm -rf $tmpdir" EXIT OCRD_CIS_FILEGRP="OCR-D-GT-SEG-LINE" -data_url="https://github.com/OCR-D/gt_structure_text/releases/download/v1.2.4/" +data_url="https://github.com/OCR-D/gt_structure_text/releases/tag/v1.5.0/" function ocrd_cis_download_bagit() { local url="$data_url/$1" mkdir -p "$PWD/download" From 033c38ac3e3a6fdd9e74ab502d792878aad77439 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Wed, 14 Aug 2024 15:07:22 +0200 Subject: [PATCH 25/97] logging exception -> error --- ocrd_cis/ocropy/binarize.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/ocrd_cis/ocropy/binarize.py b/ocrd_cis/ocropy/binarize.py index 387c51dc..0ea170e4 100644 --- a/ocrd_cis/ocropy/binarize.py +++ b/ocrd_cis/ocropy/binarize.py @@ -109,7 +109,7 @@ def process_page_pcgts(self, *input_pcgts, output_file_id: str = None, page_id: try: ret.append(self.process_page(page, page_image, page_xywh, zoom, page_id, output_file_id)) except ValueError as e: - self.logger.exception(e) + self.logger.error(e) else: if level == 'table': regions = page.get_TableRegion() @@ -125,7 +125,7 @@ def process_page_pcgts(self, *input_pcgts, output_file_id: str = None, page_id: ret.append(self.process_region(region, region_image, region_xywh, zoom, region.id, output_file_id)) continue except ValueError as e: - self.logger.exception(e) + self.logger.error(e) lines = region.get_TextLine() if not lines: self.logger.warning(f"Page '{page_id}' region '{region.id}' contains no text lines") @@ -135,7 +135,7 @@ def process_page_pcgts(self, *input_pcgts, output_file_id: str = None, page_id: try: ret.append(self.process_line(line, line_image, line_xywh, zoom, page_id, region.id, output_file_id)) except ValueError as e: - self.logger.exception(e) + self.logger.error(e) return ret def process_page(self, page, page_image, page_xywh, zoom, page_id, file_id) -> Tuple[Image.Image, str, str]: From 46d84d58b7474adc3cb9f9b756b215efebd495e3 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Wed, 14 Aug 2024 15:50:10 +0200 Subject: [PATCH 26/97] refactor: logger as a first positional argument --- ocrd_cis/ocropy/binarize.py | 9 +++++--- ocrd_cis/ocropy/resegment.py | 18 +++++++-------- ocrd_cis/ocropy/segment.py | 43 +++++++++++++++--------------------- 3 files changed, 32 insertions(+), 38 deletions(-) diff --git a/ocrd_cis/ocropy/binarize.py b/ocrd_cis/ocropy/binarize.py index 0ea170e4..8f7d8d3a 100644 --- a/ocrd_cis/ocropy/binarize.py +++ b/ocrd_cis/ocropy/binarize.py @@ -22,9 +22,8 @@ from .common import array2pil, determine_zoom, pil2array, remove_noise -def binarize(pil_image, method='ocropy', maxskew=2, threshold=0.5, nrm=False, zoom=1.0): - LOG = getLogger('processor.OcropyBinarize') - LOG.debug('binarizing %dx%d image with method=%s', pil_image.width, pil_image.height, method) +def binarize(logger: Logger, pil_image, method='ocropy', maxskew=2, 
threshold=0.5, nrm=False, zoom=1.0): + logger.debug('binarizing %dx%d image with method=%s', pil_image.width, pil_image.height, method) if method == 'none': # useful if the images are already binary, # but lack image attribute `binarized` @@ -152,6 +151,7 @@ def process_page(self, page, page_image, page_xywh, zoom, page_id, file_id) -> T else: maxskew = self.parameter['maxskew'] bin_image, angle = binarize( + self.logger, page_image, method=self.parameter['method'], maxskew=maxskew, @@ -191,6 +191,7 @@ def process_region(self, region, region_image, region_xywh, zoom, page_id, file_ # orientation has already been annotated (by previous deskewing), # so skip deskewing here: bin_image, _ = binarize( + self.logger, region_image, method=self.parameter['method'], maxskew=0, @@ -198,6 +199,7 @@ def process_region(self, region, region_image, region_xywh, zoom, page_id, file_ zoom=zoom) else: bin_image, angle = binarize( + self.logger, region_image, method=self.parameter['method'], maxskew=self.parameter['maxskew'], @@ -235,6 +237,7 @@ def process_line( self.logger.info(f"About to binarize page '{page_id}' region '{region_id}' line '{line.id}'") features = line_xywh['features'] bin_image, angle = binarize( + self.logger, line_image, method=self.parameter['method'], maxskew=self.parameter['maxskew'], diff --git a/ocrd_cis/ocropy/resegment.py b/ocrd_cis/ocropy/resegment.py index e8c52a69..b18c0b5e 100644 --- a/ocrd_cis/ocropy/resegment.py +++ b/ocrd_cis/ocropy/resegment.py @@ -265,8 +265,8 @@ def _process_segment(self, parent, parent_image, parent_coords, page_id, zoom, l line_polygon[:, 0], parent_bin.shape) new_labels[line_y, line_x] = i + 1 - spread_dist(lines, line_labels, new_labels, parent_bin, components, parent_coords, - maxdist=maxdist or scale/2, loc=parent.id, threshold=threshold, logger=self.logger) + spread_dist(self.logger, lines, line_labels, new_labels, parent_bin, components, parent_coords, + maxdist=maxdist or scale/2, loc=parent.id, threshold=threshold) return try: # TODO: 'scale' passed as a param may not be always defined (mehmedGIT) @@ -280,9 +280,9 @@ def _process_segment(self, parent, parent_image, parent_coords, page_id, zoom, l self.logger.info("Found %d new line labels for %d existing lines on %s '%s'", new_line_labels.max(), len(lines), tag, parent.id) # polygonalize and prepare comparison - new_line_polygons, new_line_labels = masks2polygons( + new_line_polygons, new_line_labels = masks2polygons(self.logger, new_line_labels, new_baselines, parent_bin, '%s "%s"' % (tag, parent.id), - min_area=640/zoom/zoom, logger=self.logger) + min_area=640/zoom/zoom) DSAVE('line_labels', [np.argmax(np.insert(line_labels, 0, 0, axis=0), axis=0), parent_bin]) DSAVE('new_line_labels', [new_line_labels, parent_bin]) new_line_polygons, new_baselines = list(zip(*[(Polygon(poly), LineString(base)) @@ -392,8 +392,8 @@ def _process_segment(self, parent, parent_image, parent_coords, page_id, zoom, l self.logger.debug("joining %d new line polygons for '%s'", len(new_lines), line.id) new_polygon = join_polygons([new_line_polygons[i] #intersections[(i, j)] for i in new_lines], loc=line.id, scale=scale) - new_baseline = join_baselines([new_polygon.intersection(new_baselines[i]) - for i in new_lines], loc=line.id, logger=self.logger) + new_baseline = join_baselines(self.logger, [new_polygon.intersection(new_baselines[i]) + for i in new_lines], loc=line.id) # convert back to absolute (page) coordinates: line_polygon = coordinates_for_segment(new_polygon.exterior.coords[:-1], parent_image, parent_coords) 
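The signature changes in this commit all follow one pattern: helpers that previously took an optional trailing logger=None keyword (and had to raise if it was missing) now take the processor's logger as a mandatory first positional argument. A minimal before/after sketch with simplified stubs (not the actual ocrd_cis functions, whose bodies do real geometry work):

    from logging import Logger, getLogger

    # before: each helper had to guard against a missing logger itself
    def join_baselines_old(baselines, loc='', logger=None):
        if not logger:
            raise ValueError("Logger has not been passed by the caller")
        logger.debug("joining %d baselines for %s", len(baselines), loc)

    # after: the caller always supplies its processor logger up front
    def join_baselines_new(logger: Logger, baselines, loc=''):
        logger.debug("joining %d baselines for %s", len(baselines), loc)

    join_baselines_new(getLogger('processor.OcropyResegment'), [], loc='line1')
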
@@ -427,11 +427,9 @@ def _process_segment(self, parent, parent_image, parent_coords, page_id, zoom, l continue otherline.get_Coords().set_points(points_from_polygon(other_polygon)) -def spread_dist(lines, old_labels, new_labels, binarized, components, coords, - maxdist=43, loc='', threshold=0.9, logger = None): +def spread_dist(logger: Logger, lines, old_labels, new_labels, binarized, components, coords, + maxdist=43, loc='', threshold=0.9): """redefine line coordinates by contourizing spread of connected components propagated from new labels""" - if not logger: - raise ValueError(f"Logger has not been passed by the caller") DSAVE('seeds', [new_labels, (components>0)]) # allocate to connected components consistently # (ignoring smallest components like punctuation) diff --git a/ocrd_cis/ocropy/segment.py b/ocrd_cis/ocropy/segment.py index c092718f..782425cc 100644 --- a/ocrd_cis/ocropy/segment.py +++ b/ocrd_cis/ocropy/segment.py @@ -57,7 +57,7 @@ lines2regions ) -def masks2polygons(bg_labels, baselines, fg_bin, name, min_area=None, simplify=None, open_holes=False, reorder=True, logger=None): +def masks2polygons(logger: Logger, bg_labels, baselines, fg_bin, name, min_area=None, simplify=None, open_holes=False, reorder=True): """Convert label masks into polygon coordinates. Given a Numpy array of background labels ``bg_labels``, @@ -230,9 +230,9 @@ def getx(xy): # get baseline segments intersecting with this line mask # and concatenate them from left to right if baselines is not None: - base = join_baselines([baseline.intersection(polygon) + base = join_baselines(logger, [baseline.intersection(polygon) for baseline in baselines - if baseline.intersects(polygon)], name, logger) + if baseline.intersects(polygon)], name) if base is not None: base = base.coords else: @@ -416,7 +416,7 @@ def process(self): roelem = reading_order.get(region.id) # replace by empty group with same index and ref # (which can then take the cells as subregions) - reading_order[region.id] = page_subgroup_in_reading_order(roelem, self.logger) + reading_order[region.id] = page_subgroup_in_reading_order(self.logger, roelem) else: self.logger.warning('skipping table "%s" with existing TextRegions', region.id) continue @@ -434,7 +434,7 @@ def process(self): elif overwrite_order: # replace by empty ordered group with same (index and) ref # (which can then take the cells as subregions) - roelem = page_subgroup_in_reading_order(roelem, self.logger) + roelem = page_subgroup_in_reading_order(self.logger, roelem) reading_order[region.id] = roelem elif isinstance(roelem, (OrderedGroupType, OrderedGroupIndexedType)): self.logger.warning("Page '%s' table region '%s' already has an ordered group (%s)", @@ -446,7 +446,7 @@ def process(self): else: # replace regionRef(Indexed) by group with same index and ref # (which can then take the cells as subregions) - roelem = page_subgroup_in_reading_order(roelem, self.logger) + roelem = page_subgroup_in_reading_order(self.logger, roelem) reading_order[region.id] = roelem # go get TextRegions with TextLines (and SeparatorRegions) self._process_element(region, subignore, region_image, region_coords, @@ -661,16 +661,14 @@ def _process_element(self, element, ignore, image, coords, element_id, file_id, seps=np.maximum(sepmask, colseps)) region_mask |= region_line_labels > 0 # find contours for region (can be non-contiguous) - regions, _ = masks2polygons(region_mask * region_label, None, element_bin, + regions, _ = masks2polygons(self.logger, region_mask * region_label, None, element_bin, '%s "%s"' % 
(element_name, element_id), min_area=6000/zoom/zoom, - simplify=ignore_labels * ~(sep_bin), - logger=self.logger) + simplify=ignore_labels * ~(sep_bin)) # find contours for lines (can be non-contiguous) - lines, _ = masks2polygons(region_line_labels, baselines, element_bin, + lines, _ = masks2polygons(self.logger, region_line_labels, baselines, element_bin, 'region "%s"' % element_id, - min_area=640/zoom/zoom, - logger=self.logger) + min_area=640/zoom/zoom) # create new lines in new regions (allocating by intersection) line_polys = [Polygon(polygon) for _, polygon, _ in lines] for _, region_polygon, _ in regions: @@ -722,8 +720,8 @@ def _process_element(self, element, ignore, image, coords, element_id, file_id, # (e.g. drop-capitals or images) ... self.logger.info('Found %d large image regions for %s "%s"', images.max(), element_name, element_id) # find contours around region labels (can be non-contiguous): - image_polygons, _ = masks2polygons(images, None, element_bin, - '%s "%s"' % (element_name, element_id), self.logger) + image_polygons, _ = masks2polygons(self.logger, images, None, element_bin, + '%s "%s"' % (element_name, element_id)) for image_label, polygon, _ in image_polygons: # convert back to absolute (page) coordinates: region_polygon = coordinates_for_segment(polygon, image, coords) @@ -740,9 +738,9 @@ def _process_element(self, element, ignore, image, coords, element_id, file_id, # split detected separator labels into separator regions: self.logger.info('Found %d separators for %s "%s"', seplines.max(), element_name, element_id) # find contours around region labels (can be non-contiguous): - sep_polygons, _ = masks2polygons(seplines, None, element_bin, + sep_polygons, _ = masks2polygons(self.logger, seplines, None, element_bin, '%s "%s"' % (element_name, element_id), - open_holes=True, reorder=False, logger=self.logger) + open_holes=True, reorder=False) for sep_label, polygon, _ in sep_polygons: # convert back to absolute (page) coordinates: region_polygon = coordinates_for_segment(polygon, image, coords) @@ -774,9 +772,9 @@ def _process_element(self, element, ignore, image, coords, element_id, file_id, # ensure the new line labels do not extrude from the region: line_labels = line_labels * region_mask # find contours around labels (can be non-contiguous): - line_polygons, _ = masks2polygons(line_labels, baselines, element_bin, + line_polygons, _ = masks2polygons(self.logger, line_labels, baselines, element_bin, 'region "%s"' % element_id, - min_area=640/zoom/zoom, logger=self.logger) + min_area=640/zoom/zoom) line_no = 0 for line_label, polygon, baseline in line_polygons: # convert back to absolute (page) coordinates: @@ -918,9 +916,7 @@ def join_polygons(polygons, loc='', scale=20): jointp = make_valid(jointp) return jointp -def join_baselines(baselines, loc='', logger = None): - if not logger: - raise ValueError(f"Logger has not been passed by the caller") +def join_baselines(logger: Logger, baselines, loc=''): lines = [] for baseline in baselines: if (baseline.is_empty or @@ -1062,7 +1058,7 @@ def page_add_to_reading_order(rogroup, region_id, index=None): index += 1 return index -def page_subgroup_in_reading_order(roelem, logger = None): +def page_subgroup_in_reading_order(logger: Logger, roelem): """Replace given RO element by an equivalent OrderedGroup. Given a ReadingOrder element ``roelem`` (of any type), @@ -1076,9 +1072,6 @@ def page_subgroup_in_reading_order(roelem, logger = None): Return the new group object. 
""" - if not logger: - raise ValueError(f"Logger has not been passed by the caller") - if not roelem: logger.error('Cannot subgroup from empty ReadingOrder element') return roelem From f6fe4cf4caaf056ded182b498b44a610349627fc Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Wed, 14 Aug 2024 15:54:25 +0200 Subject: [PATCH 27/97] fix: test_lib.bash data url --- tests/test_lib.bash | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_lib.bash b/tests/test_lib.bash index c018d253..801be01a 100644 --- a/tests/test_lib.bash +++ b/tests/test_lib.bash @@ -5,7 +5,7 @@ trap "trap 'echo exiting without removing $tmpdir' EXIT" ERR trap "rm -rf $tmpdir" EXIT OCRD_CIS_FILEGRP="OCR-D-GT-SEG-LINE" -data_url="https://github.com/OCR-D/gt_structure_text/releases/tag/v1.5.0/" +data_url="https://github.com/OCR-D/gt_structure_text/releases/download/v1.5.0/" function ocrd_cis_download_bagit() { local url="$data_url/$1" mkdir -p "$PWD/download" From aed0f95ccdc0dfe4cc26982258ef1c8acd613e1e Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Wed, 14 Aug 2024 16:33:31 +0200 Subject: [PATCH 28/97] fix: recognize OcrdPage import --- ocrd_cis/ocropy/recognize.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ocrd_cis/ocropy/recognize.py b/ocrd_cis/ocropy/recognize.py index 9729b480..ccb019eb 100644 --- a/ocrd_cis/ocropy/recognize.py +++ b/ocrd_cis/ocropy/recognize.py @@ -19,7 +19,7 @@ ) from ocrd_modelfactory import page_from_file from ocrd_models.ocrd_page import ( - to_xml, TextEquivType, + to_xml, TextEquivType, OcrdPage, CoordsType, GlyphType, WordType ) from ocrd import Processor From 804f031221eb4e64649e167c2f554d26555d5637 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Wed, 14 Aug 2024 18:10:00 +0200 Subject: [PATCH 29/97] try to migrate clip --- ocrd_cis/ocropy/clip.py | 178 +++++++++++++++++++++++++++++++--------- 1 file changed, 138 insertions(+), 40 deletions(-) diff --git a/ocrd_cis/ocropy/clip.py b/ocrd_cis/ocropy/clip.py index dd0de012..0675257b 100644 --- a/ocrd_cis/ocropy/clip.py +++ b/ocrd_cis/ocropy/clip.py @@ -8,9 +8,7 @@ from shapely.prepared import prep from ocrd_modelfactory import page_from_file -from ocrd_models.ocrd_page import ( - to_xml, AlternativeImageType -) +from ocrd_models.ocrd_page import AlternativeImageType, OcrdPage, to_xml from ocrd import Processor from ocrd_utils import ( getLogger, @@ -39,6 +37,113 @@ def executable(self): def setup(self): self.logger = getLogger('processor.OcropyClip') + def process_page_pcgts(self, *input_pcgts, output_file_id: str = None, page_id: str = None) -> OcrdPage: + level = self.parameter['level-of-operation'] + assert self.workspace + self.logger.debug(f'Level of operation: "{level}"') + + pcgts = input_pcgts[0] + page = pcgts.get_Page() + assert page + + page_image, page_coords, page_image_info = self.workspace.image_from_page( + page, page_id, feature_selector='binarized') + # TODO: zoom is not used anywhere, is it still useful to have this call here? + zoom = determine_zoom(self.logger, self.parameter['dpi'], page_image_info) + ret = [pcgts] + + # FIXME: what about text regions inside table regions? 
+ regions = list(page.get_TextRegion()) + num_texts = len(regions) + regions += ( + page.get_AdvertRegion() + + page.get_ChartRegion() + + page.get_ChemRegion() + + page.get_GraphicRegion() + + page.get_ImageRegion() + + page.get_LineDrawingRegion() + + page.get_MathsRegion() + + page.get_MusicRegion() + + page.get_NoiseRegion() + + page.get_SeparatorRegion() + + page.get_TableRegion() + + page.get_UnknownRegion()) + if not num_texts: + self.logger.warning('Page "%s" contains no text regions', page_id) + background = ImageStat.Stat(page_image) + # workaround for Pillow#4925 + if len(background.bands) > 1: + background = tuple(background.median) + else: + background = background.median[0] + if level == 'region': + background_image = Image.new(page_image.mode, page_image.size, background) + page_array = pil2array(page_image) + page_bin = np.array(page_array <= midrange(page_array), np.uint8) + # in absolute coordinates merely for comparison/intersection + shapes = [Polygon(polygon_from_points(region.get_Coords().points)) for region in regions] + # in relative coordinates for mask/cropping + polygons = [coordinates_of_segment(region, page_image, page_coords) for region in regions] + for i, polygon in enumerate(polygons[num_texts:], num_texts): + # for non-text regions, extend mask by 3 pixels in each direction + # to ensure they do not leak components accidentally + # (accounts for bad cropping of such regions in GT): + polygon = Polygon(polygon).buffer(3).exterior.coords[:-1] # keep open + polygons[i] = polygon + masks = [pil2array(polygon_mask(page_image, polygon)).astype(np.uint8) for polygon in polygons] + for i, region in enumerate(regions): + if i >= num_texts: + break # keep non-text regions unchanged + if level == 'region': + if region.get_AlternativeImage(): + # FIXME: This should probably be an exception (bad workflow configuration). + self.logger.warning( + f'Page "{page_id}" region "{region.id}" already contains image data: skipping') + continue + shape = prep(shapes[i]) + neighbours = [(regionj, maskj) for shapej, regionj, maskj + in zip(shapes[:i] + shapes[i+1:], regions[:i] + regions[i+1:], masks[:i] + masks[i+1:]) + if shape.intersects(shapej)] + if neighbours: + segment_region_file_id = f"{output_file_id}_{region.id}" + ret.append(self.process_segment( + region, masks[i], polygons[i], neighbours, background_image, + page_image, page_coords, page_bin, page_id, segment_region_file_id)) + continue + # level == 'line': + lines = region.get_TextLine() + if not lines: + self.logger.warning(f'Page "{page_id}" region "{region.id}" contains no text lines') + continue + region_image, region_coords = self.workspace.image_from_segment( + region, page_image, page_coords, feature_selector='binarized') + background_image = Image.new(region_image.mode, region_image.size, background) + region_array = pil2array(region_image) + region_bin = np.array(region_array <= midrange(region_array), np.uint8) + # in absolute coordinates merely for comparison/intersection + shapes = [Polygon(polygon_from_points(line.get_Coords().points)) for line in lines] + # in relative coordinates for mask/cropping + polygons = [coordinates_of_segment(line, region_image, region_coords) for line in lines] + masks = [pil2array(polygon_mask(region_image, polygon)).astype(np.uint8) for polygon in polygons] + for j, line in enumerate(lines): + if line.get_AlternativeImage(): + # FIXME: This should probably be an exception (bad workflow configuration). 
+ self.logger.warning( + f'Page "{page_id}" region "{region.id}" line "{line.id}" already contains image ' + f'data: skipping') + continue + shape = prep(shapes[j]) + neighbours = [(linej, maskj) for shapej, linej, maskj + in zip(shapes[:j] + shapes[j+1:], lines[:j] + lines[j+1:], masks[:j] + masks[j+1:]) + if shape.intersects(shapej)] + if neighbours: + segment_line_file_id = f"{output_file_id}_{region.id}_{line.id}" + ret.append(self.process_segment( + line, masks[j], polygons[j], neighbours, background_image, + region_image, region_coords, region_bin, page_id, segment_line_file_id)) + return ret + + # TODO: remove when `process_page_pcgts` is validated to be correct def process(self): """Clip text regions / lines of the workspace at intersections with neighbours. @@ -119,27 +224,24 @@ def process(self): page_array = pil2array(page_image) page_bin = np.array(page_array <= midrange(page_array), np.uint8) # in absolute coordinates merely for comparison/intersection - shapes = [Polygon(polygon_from_points(region.get_Coords().points)) - for region in regions] + shapes = [Polygon(polygon_from_points(region.get_Coords().points)) for region in regions] # in relative coordinates for mask/cropping - polygons = [coordinates_of_segment(region, page_image, page_coords) - for region in regions] + polygons = [coordinates_of_segment(region, page_image, page_coords) for region in regions] for i, polygon in enumerate(polygons[num_texts:], num_texts): # for non-text regions, extend mask by 3 pixels in each direction # to ensure they do not leak components accidentally # (accounts for bad cropping of such regions in GT): polygon = Polygon(polygon).buffer(3).exterior.coords[:-1] # keep open polygons[i] = polygon - masks = [pil2array(polygon_mask(page_image, polygon)).astype(np.uint8) - for polygon in polygons] + masks = [pil2array(polygon_mask(page_image, polygon)).astype(np.uint8) for polygon in polygons] for i, region in enumerate(regions): if i >= num_texts: break # keep non-text regions unchanged if level == 'region': if region.get_AlternativeImage(): # FIXME: This should probably be an exception (bad workflow configuration). 
- self.logger.warning('Page "%s" region "%s" already contains image data: skipping', - page_id, region.id) + self.logger.warning( + f'Page "{page_id}" region "{region.id}" already contains image data: skipping') continue shape = prep(shapes[i]) neighbours = [(regionj, maskj) for shapej, regionj, maskj @@ -148,15 +250,15 @@ def process(self): masks[:i] + masks[i+1:]) if shape.intersects(shapej)] if neighbours: - self.process_segment(region, masks[i], polygons[i], - neighbours, background_image, - page_image, page_coords, page_bin, - input_file.pageId, file_id + '_' + region.id) + segment_region_file_id = f"{file_id}_{region.id}" + self.process_segment( + region, masks[i], polygons[i], neighbours, background_image, + page_image, page_coords, page_bin, input_file.pageId, segment_region_file_id) continue # level == 'line': lines = region.get_TextLine() if not lines: - self.logger.warning('Page "%s" region "%s" contains no text lines', page_id, region.id) + self.logger.warning(f'Page "{page_id}" region "{region.id}" contains no text lines') continue region_image, region_coords = self.workspace.image_from_segment( region, page_image, page_coords, feature_selector='binarized') @@ -164,18 +266,16 @@ def process(self): region_array = pil2array(region_image) region_bin = np.array(region_array <= midrange(region_array), np.uint8) # in absolute coordinates merely for comparison/intersection - shapes = [Polygon(polygon_from_points(line.get_Coords().points)) - for line in lines] + shapes = [Polygon(polygon_from_points(line.get_Coords().points)) for line in lines] # in relative coordinates for mask/cropping - polygons = [coordinates_of_segment(line, region_image, region_coords) - for line in lines] - masks = [pil2array(polygon_mask(region_image, polygon)).astype(np.uint8) - for polygon in polygons] + polygons = [coordinates_of_segment(line, region_image, region_coords) for line in lines] + masks = [pil2array(polygon_mask(region_image, polygon)).astype(np.uint8) for polygon in polygons] for j, line in enumerate(lines): if line.get_AlternativeImage(): # FIXME: This should probably be an exception (bad workflow configuration). 
- self.logger.warning('Page "%s" region "%s" line "%s" already contains image data: skipping', - page_id, region.id, line.id) + self.logger.warning( + f'Page "{page_id}" region "{region.id}" line "{line.id}" already contains image ' + f'data: skipping') continue shape = prep(shapes[j]) neighbours = [(linej, maskj) for shapej, linej, maskj @@ -184,10 +284,10 @@ def process(self): masks[:j] + masks[j+1:]) if shape.intersects(shapej)] if neighbours: - self.process_segment(line, masks[j], polygons[j], - neighbours, background_image, - region_image, region_coords, region_bin, - input_file.pageId, file_id + '_' + region.id + '_' + line.id) + segment_line_file_id = f"{file_id}_{region.id}_{line.id}" + self.process_segment( + line, masks[j], polygons[j], neighbours, background_image, + region_image, region_coords, region_bin, input_file.pageId, segment_line_file_id) # update METS (add the PAGE file): file_path = join(self.output_file_grp, file_id + '.xml') @@ -204,7 +304,7 @@ def process(self): def process_segment(self, segment, segment_mask, segment_polygon, neighbours, background_image, parent_image, parent_coords, parent_bin, - page_id, file_id): + page_id, file_id) -> Tuple[Image.Image, str, str]: # initialize AlternativeImage@comments classes from parent, except # for those operations that can apply on multiple hierarchy levels: features = ','.join( @@ -216,8 +316,8 @@ def process_segment(self, segment, segment_mask, segment_polygon, neighbours, segment_bbox = bbox_from_polygon(segment_polygon) for neighbour, neighbour_mask in neighbours: if not np.any(segment_mask > neighbour_mask): - self.logger.info('Ignoring enclosing neighbour "%s" of segment "%s" on page "%s"', - neighbour.id, segment.id, page_id) + self.logger.info( + f'Ignoring enclosing neighbour "{neighbour.id}" of segment "{segment.id}" on page "{page_id}"') continue # find connected components that (only) belong to the neighbour: intruders = segment_mask * morph.keep_marked(parent_bin, neighbour_mask > 0) # overlaps neighbour @@ -226,8 +326,9 @@ def process_segment(self, segment, segment_mask, segment_polygon, neighbours, num_foreground = np.count_nonzero(segment_mask * parent_bin) if not num_intruders: continue - self.logger.debug('segment "%s" vs neighbour "%s": suppressing %d of %d pixels on page "%s"', - segment.id, neighbour.id, num_intruders, num_foreground, page_id) + self.logger.debug( + f'segment "{segment.id}" vs neighbour "{neighbour.id}": suppressing {num_intruders} of ' + f'{num_foreground} pixels on page "{page_id}"') # suppress in segment_mask so these intruders can stay in the neighbours # (are not removed from both sides) segment_mask -= intruders @@ -241,11 +342,8 @@ def process_segment(self, segment, segment_mask, segment_polygon, neighbours, # recrop segment into rectangle, just as image_from_segment would do # (and also clipping with background colour): segment_image = crop_image(segment_image,box=segment_bbox) - # update METS (add the image file): - file_path = self.workspace.save_image_file( - segment_image, file_id + '.IMG-CLIP', self.output_file_grp, - page_id=page_id) + segment_image_id = file_id + '.IMG-CLIP' + segment_image_path = join(self.output_file_grp, f'{segment_image_id}.png') # update PAGE (reference the image file): - segment.add_AlternativeImage(AlternativeImageType( - filename=file_path, - comments=features)) + segment.add_AlternativeImage(AlternativeImageType(filename=segment_image_path, comments=features)) + return segment_image, segment_image_id, segment_image_path From 
7bdff31747ad2c9cdb834569b8b1adf8b90303d2 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Thu, 15 Aug 2024 11:51:42 +0200 Subject: [PATCH 30/97] remove: process() methods --- ocrd_cis/ocropy/clip.py | 194 +++++++---------------------------- ocrd_cis/ocropy/recognize.py | 65 +++--------- 2 files changed, 50 insertions(+), 209 deletions(-) diff --git a/ocrd_cis/ocropy/clip.py b/ocrd_cis/ocropy/clip.py index 0675257b..9e6d8d19 100644 --- a/ocrd_cis/ocropy/clip.py +++ b/ocrd_cis/ocropy/clip.py @@ -37,7 +37,42 @@ def executable(self): def setup(self): self.logger = getLogger('processor.OcropyClip') + # TODO: Adapt the docstring comment to process_page_pcgts def process_page_pcgts(self, *input_pcgts, output_file_id: str = None, page_id: str = None) -> OcrdPage: + """Clip text regions / lines of the workspace at intersections with neighbours. + + Open and deserialise PAGE input files and their respective images, + then iterate over the element hierarchy down to the requested + ``level-of-operation``. + + Next, get each segment image according to the layout annotation (by cropping + via coordinates into the higher-level image), as well as all its neighbours', + binarize them (without deskewing), and make a connected component analysis. + (Segments must not already have AlternativeImage annotated, otherwise they + will be skipped.) + + Then, for each section of overlap with a neighbour, re-assign components + which are only contained in the neighbour by clipping them to white (background), + and export the (final) result as image file. + + Add the new image file to the workspace along with the output fileGrp, + and using a file ID with suffix ``.IMG-CLIP`` along with further + identification of the input element. + + Reference each new image in the AlternativeImage of the element. + + Produce a new output file by serialising the resulting hierarchy. + """ + # This makes best sense for overlapping segmentation, like current GT + # or Tesseract layout analysis. Most notably, it can suppress graphics + # and separators within or across a region or line. It _should_ ideally + # be run after binarization (on page level for region-level clipping, + # and on the region level for line-level clipping), because the + # connected component analysis after implicit binarization could be + # suboptimal, and the explicit binarization after clipping could be, + # too. However, region-level clipping _must_ be run before region-level + # deskewing, because that would make segments incomensurable with their + # neighbours. level = self.parameter['level-of-operation'] assert self.workspace self.logger.debug(f'Level of operation: "{level}"') @@ -143,165 +178,6 @@ def process_page_pcgts(self, *input_pcgts, output_file_id: str = None, page_id: region_image, region_coords, region_bin, page_id, segment_line_file_id)) return ret - # TODO: remove when `process_page_pcgts` is validated to be correct - def process(self): - """Clip text regions / lines of the workspace at intersections with neighbours. - - Open and deserialise PAGE input files and their respective images, - then iterate over the element hierarchy down to the requested - ``level-of-operation``. - - Next, get each segment image according to the layout annotation (by cropping - via coordinates into the higher-level image), as well as all its neighbours', - binarize them (without deskewing), and make a connected component analysis. - (Segments must not already have AlternativeImage annotated, otherwise they - will be skipped.) 
- - Then, for each section of overlap with a neighbour, re-assign components - which are only contained in the neighbour by clipping them to white (background), - and export the (final) result as image file. - - Add the new image file to the workspace along with the output fileGrp, - and using a file ID with suffix ``.IMG-CLIP`` along with further - identification of the input element. - - Reference each new image in the AlternativeImage of the element. - - Produce a new output file by serialising the resulting hierarchy. - """ - # This makes best sense for overlapping segmentation, like current GT - # or Tesseract layout analysis. Most notably, it can suppress graphics - # and separators within or across a region or line. It _should_ ideally - # be run after binarization (on page level for region-level clipping, - # and on the region level for line-level clipping), because the - # connected component analysis after implicit binarization could be - # suboptimal, and the explicit binarization after clipping could be, - # too. However, region-level clipping _must_ be run before region-level - # deskewing, because that would make segments incomensurable with their - # neighbours. - level = self.parameter['level-of-operation'] - - for (n, input_file) in enumerate(self.input_files): - self.logger.info("INPUT FILE %i / %s", n, input_file.pageId or input_file.ID) - file_id = make_file_id(input_file, self.output_file_grp) - - pcgts = page_from_file(self.workspace.download_file(input_file)) - self.add_metadata(pcgts) - page_id = pcgts.pcGtsId or input_file.pageId or input_file.ID # (PageType has no id) - page = pcgts.get_Page() - - page_image, page_coords, page_image_info = self.workspace.image_from_page( - page, page_id, feature_selector='binarized') - # TODO: zoom is not used anywhere, is it still useful to have this call here? - zoom = determine_zoom(self.logger, self.parameter['dpi'], page_image_info) - - # FIXME: what about text regions inside table regions? 
- regions = list(page.get_TextRegion()) - num_texts = len(regions) - regions += ( - page.get_AdvertRegion() + - page.get_ChartRegion() + - page.get_ChemRegion() + - page.get_GraphicRegion() + - page.get_ImageRegion() + - page.get_LineDrawingRegion() + - page.get_MathsRegion() + - page.get_MusicRegion() + - page.get_NoiseRegion() + - page.get_SeparatorRegion() + - page.get_TableRegion() + - page.get_UnknownRegion()) - if not num_texts: - self.logger.warning('Page "%s" contains no text regions', page_id) - background = ImageStat.Stat(page_image) - # workaround for Pillow#4925 - if len(background.bands) > 1: - background = tuple(background.median) - else: - background = background.median[0] - if level == 'region': - background_image = Image.new(page_image.mode, page_image.size, background) - page_array = pil2array(page_image) - page_bin = np.array(page_array <= midrange(page_array), np.uint8) - # in absolute coordinates merely for comparison/intersection - shapes = [Polygon(polygon_from_points(region.get_Coords().points)) for region in regions] - # in relative coordinates for mask/cropping - polygons = [coordinates_of_segment(region, page_image, page_coords) for region in regions] - for i, polygon in enumerate(polygons[num_texts:], num_texts): - # for non-text regions, extend mask by 3 pixels in each direction - # to ensure they do not leak components accidentally - # (accounts for bad cropping of such regions in GT): - polygon = Polygon(polygon).buffer(3).exterior.coords[:-1] # keep open - polygons[i] = polygon - masks = [pil2array(polygon_mask(page_image, polygon)).astype(np.uint8) for polygon in polygons] - for i, region in enumerate(regions): - if i >= num_texts: - break # keep non-text regions unchanged - if level == 'region': - if region.get_AlternativeImage(): - # FIXME: This should probably be an exception (bad workflow configuration). - self.logger.warning( - f'Page "{page_id}" region "{region.id}" already contains image data: skipping') - continue - shape = prep(shapes[i]) - neighbours = [(regionj, maskj) for shapej, regionj, maskj - in zip(shapes[:i] + shapes[i+1:], - regions[:i] + regions[i+1:], - masks[:i] + masks[i+1:]) - if shape.intersects(shapej)] - if neighbours: - segment_region_file_id = f"{file_id}_{region.id}" - self.process_segment( - region, masks[i], polygons[i], neighbours, background_image, - page_image, page_coords, page_bin, input_file.pageId, segment_region_file_id) - continue - # level == 'line': - lines = region.get_TextLine() - if not lines: - self.logger.warning(f'Page "{page_id}" region "{region.id}" contains no text lines') - continue - region_image, region_coords = self.workspace.image_from_segment( - region, page_image, page_coords, feature_selector='binarized') - background_image = Image.new(region_image.mode, region_image.size, background) - region_array = pil2array(region_image) - region_bin = np.array(region_array <= midrange(region_array), np.uint8) - # in absolute coordinates merely for comparison/intersection - shapes = [Polygon(polygon_from_points(line.get_Coords().points)) for line in lines] - # in relative coordinates for mask/cropping - polygons = [coordinates_of_segment(line, region_image, region_coords) for line in lines] - masks = [pil2array(polygon_mask(region_image, polygon)).astype(np.uint8) for polygon in polygons] - for j, line in enumerate(lines): - if line.get_AlternativeImage(): - # FIXME: This should probably be an exception (bad workflow configuration). 
- self.logger.warning( - f'Page "{page_id}" region "{region.id}" line "{line.id}" already contains image ' - f'data: skipping') - continue - shape = prep(shapes[j]) - neighbours = [(linej, maskj) for shapej, linej, maskj - in zip(shapes[:j] + shapes[j+1:], - lines[:j] + lines[j+1:], - masks[:j] + masks[j+1:]) - if shape.intersects(shapej)] - if neighbours: - segment_line_file_id = f"{file_id}_{region.id}_{line.id}" - self.process_segment( - line, masks[j], polygons[j], neighbours, background_image, - region_image, region_coords, region_bin, input_file.pageId, segment_line_file_id) - - # update METS (add the PAGE file): - file_path = join(self.output_file_grp, file_id + '.xml') - pcgts.set_pcGtsId(file_id) - out = self.workspace.add_file( - ID=file_id, - file_grp=self.output_file_grp, - pageId=input_file.pageId, - local_filename=file_path, - mimetype=MIMETYPE_PAGE, - content=to_xml(pcgts)) - self.logger.info('created file ID: %s, file_grp: %s, path: %s', - file_id, self.output_file_grp, out.local_filename) - def process_segment(self, segment, segment_mask, segment_polygon, neighbours, background_image, parent_image, parent_coords, parent_bin, page_id, file_id) -> Tuple[Image.Image, str, str]: diff --git a/ocrd_cis/ocropy/recognize.py b/ocrd_cis/ocropy/recognize.py index ccb019eb..389cf8db 100644 --- a/ocrd_cis/ocropy/recognize.py +++ b/ocrd_cis/ocropy/recognize.py @@ -115,26 +115,8 @@ def get_model(self): self.parameter['model'], self.parameter['model']) exit(1) + # TODO: Adapt the docstring comment to process_page_pcgts def process_page_pcgts(self, *input_pcgts, output_file_id: str = None, page_id: str = None) -> OcrdPage: - maxlevel = self.parameter['textequiv_level'] - assert self.workspace - self.logger.debug(f'Max level: "{maxlevel}"') - - pcgts = input_pcgts[0] - page = pcgts.get_Page() - assert page - - page_image, page_coords, _ = self.workspace.image_from_page(page, page_id) - self.logger.info(f"Recognizing text in page '{page_id}'") - # region, line, word, or glyph level: - regions = page.get_AllRegions(classes=['Text']) - if not regions: - self.logger.warning(f"Page '{page_id}' contains no text regions") - self.process_regions(regions, maxlevel, page_image, page_coords) - return [pcgts] - - # TODO: remove when `process_page_pcgts` is validated to be correct - def process(self): """Recognize lines / words / glyphs of the workspace. Open and deserialise each PAGE input file and its respective image, @@ -160,38 +142,21 @@ def process(self): Produce a new output file by serialising the resulting hierarchy. 
""" maxlevel = self.parameter['textequiv_level'] + assert self.workspace + self.logger.debug(f'Max level: "{maxlevel}"') + + pcgts = input_pcgts[0] + page = pcgts.get_Page() + assert page - # self.logger.info("Using model %s in %s for recognition", model) - for (n, input_file) in enumerate(self.input_files): - self.logger.info("INPUT FILE %i / %s", n, input_file.pageId or input_file.ID) - pcgts = page_from_file(self.workspace.download_file(input_file)) - self.add_metadata(pcgts) - page_id = pcgts.pcGtsId or input_file.pageId or input_file.ID # (PageType has no id) - page = pcgts.get_Page() - - page_image, page_coords, _ = self.workspace.image_from_page( - page, page_id) - - self.logger.info("Recognizing text in page '%s'", page_id) - # region, line, word, or glyph level: - regions = page.get_AllRegions(classes=['Text']) - if not regions: - self.logger.warning("Page '%s' contains no text regions", page_id) - self.process_regions(regions, maxlevel, page_image, page_coords) - - # update METS (add the PAGE file): - file_id = make_file_id(input_file, self.output_file_grp) - file_path = join(self.output_file_grp, file_id + '.xml') - pcgts.set_pcGtsId(file_id) - out = self.workspace.add_file( - ID=file_id, - file_grp=self.output_file_grp, - pageId=input_file.pageId, - local_filename=file_path, - mimetype=MIMETYPE_PAGE, - content=to_xml(pcgts)) - self.logger.info('created file ID: %s, file_grp: %s, path: %s', - file_id, self.output_file_grp, out.local_filename) + page_image, page_coords, _ = self.workspace.image_from_page(page, page_id) + self.logger.info(f"Recognizing text in page '{page_id}'") + # region, line, word, or glyph level: + regions = page.get_AllRegions(classes=['Text']) + if not regions: + self.logger.warning(f"Page '{page_id}' contains no text regions") + self.process_regions(regions, maxlevel, page_image, page_coords) + return [pcgts] def process_regions(self, regions, maxlevel, page_image, page_coords): edits = 0 From 03c2f158fa02ddeae40baa93cee686be1fd0ca09 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Thu, 15 Aug 2024 11:57:36 +0200 Subject: [PATCH 31/97] adapt: docstring of process_page_pcgts --- ocrd_cis/ocropy/clip.py | 8 ++++---- ocrd_cis/ocropy/recognize.py | 17 ++++++++--------- 2 files changed, 12 insertions(+), 13 deletions(-) diff --git a/ocrd_cis/ocropy/clip.py b/ocrd_cis/ocropy/clip.py index 9e6d8d19..a5f4f705 100644 --- a/ocrd_cis/ocropy/clip.py +++ b/ocrd_cis/ocropy/clip.py @@ -39,9 +39,9 @@ def setup(self): # TODO: Adapt the docstring comment to process_page_pcgts def process_page_pcgts(self, *input_pcgts, output_file_id: str = None, page_id: str = None) -> OcrdPage: - """Clip text regions / lines of the workspace at intersections with neighbours. + """Clip text regions / lines of a page at intersections with neighbours. - Open and deserialise PAGE input files and their respective images, + Open and deserialize PAGE input file and its respective image, then iterate over the element hierarchy down to the requested ``level-of-operation``. @@ -61,7 +61,7 @@ def process_page_pcgts(self, *input_pcgts, output_file_id: str = None, page_id: Reference each new image in the AlternativeImage of the element. - Produce a new output file by serialising the resulting hierarchy. + Return the resulting OcrdPage. """ # This makes best sense for overlapping segmentation, like current GT # or Tesseract layout analysis. 
Most notably, it can suppress graphics @@ -71,7 +71,7 @@ def process_page_pcgts(self, *input_pcgts, output_file_id: str = None, page_id: # connected component analysis after implicit binarization could be # suboptimal, and the explicit binarization after clipping could be, # too. However, region-level clipping _must_ be run before region-level - # deskewing, because that would make segments incomensurable with their + # deskewing, because that would make segments incommensurable with their # neighbours. level = self.parameter['level-of-operation'] assert self.workspace diff --git a/ocrd_cis/ocropy/recognize.py b/ocrd_cis/ocropy/recognize.py index 389cf8db..69b374ec 100644 --- a/ocrd_cis/ocropy/recognize.py +++ b/ocrd_cis/ocropy/recognize.py @@ -115,18 +115,17 @@ def get_model(self): self.parameter['model'], self.parameter['model']) exit(1) - # TODO: Adapt the docstring comment to process_page_pcgts def process_page_pcgts(self, *input_pcgts, output_file_id: str = None, page_id: str = None) -> OcrdPage: - """Recognize lines / words / glyphs of the workspace. + """Recognize lines / words / glyphs of a page. - Open and deserialise each PAGE input file and its respective image, + Open and deserialize the PAGE input file and its respective image, then iterate over the element hierarchy down to the requested ``textequiv_level``. If any layout annotation below the line level already exists, then remove it (regardless of ``textequiv_level``). - Set up Ocropy to recognise each text line (via coordinates into + Set up Ocropy to recognize each text line (via coordinates into the higher-level image, or from the alternative image; the image - must have been binarised/grayscale-normalised, deskewed and dewarped + must have been binarized/grayscale-normalised, deskewed and dewarped already). Rescale and pad the image, then recognize. Create new elements below the line level, if necessary. @@ -139,11 +138,11 @@ def process_page_pcgts(self, *input_pcgts, output_file_id: str = None, page_id: Levenshtein distance. Aggregate these scores for each file and print the line-wise and the total character error rates (CER). - Produce a new output file by serialising the resulting hierarchy. + Return the resulting OcrdPage. 
""" - maxlevel = self.parameter['textequiv_level'] + max_level = self.parameter['textequiv_level'] assert self.workspace - self.logger.debug(f'Max level: "{maxlevel}"') + self.logger.debug(f'Max level: "{max_level}"') pcgts = input_pcgts[0] page = pcgts.get_Page() @@ -155,7 +154,7 @@ def process_page_pcgts(self, *input_pcgts, output_file_id: str = None, page_id: regions = page.get_AllRegions(classes=['Text']) if not regions: self.logger.warning(f"Page '{page_id}' contains no text regions") - self.process_regions(regions, maxlevel, page_image, page_coords) + self.process_regions(regions, max_level, page_image, page_coords) return [pcgts] def process_regions(self, regions, maxlevel, page_image, page_coords): From 90ac28e1f9c9b6c95492aac765aaf5183a045be2 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Thu, 15 Aug 2024 12:11:30 +0200 Subject: [PATCH 32/97] refactor: other small things --- ocrd_cis/ocropy/clip.py | 16 +++++------ ocrd_cis/ocropy/recognize.py | 52 +++++++++++++++--------------------- 2 files changed, 29 insertions(+), 39 deletions(-) diff --git a/ocrd_cis/ocropy/clip.py b/ocrd_cis/ocropy/clip.py index a5f4f705..75b4123f 100644 --- a/ocrd_cis/ocropy/clip.py +++ b/ocrd_cis/ocropy/clip.py @@ -37,7 +37,6 @@ def executable(self): def setup(self): self.logger = getLogger('processor.OcropyClip') - # TODO: Adapt the docstring comment to process_page_pcgts def process_page_pcgts(self, *input_pcgts, output_file_id: str = None, page_id: str = None) -> OcrdPage: """Clip text regions / lines of a page at intersections with neighbours. @@ -81,9 +80,9 @@ def process_page_pcgts(self, *input_pcgts, output_file_id: str = None, page_id: page = pcgts.get_Page() assert page - page_image, page_coords, page_image_info = self.workspace.image_from_page( + page_image, page_xywh, page_image_info = self.workspace.image_from_page( page, page_id, feature_selector='binarized') - # TODO: zoom is not used anywhere, is it still useful to have this call here? 
+ # The zoom is not used anywhere zoom = determine_zoom(self.logger, self.parameter['dpi'], page_image_info) ret = [pcgts] @@ -104,7 +103,7 @@ def process_page_pcgts(self, *input_pcgts, output_file_id: str = None, page_id: page.get_TableRegion() + page.get_UnknownRegion()) if not num_texts: - self.logger.warning('Page "%s" contains no text regions', page_id) + self.logger.warning(f'Page "{page_id}" contains no text regions') background = ImageStat.Stat(page_image) # workaround for Pillow#4925 if len(background.bands) > 1: @@ -118,7 +117,7 @@ def process_page_pcgts(self, *input_pcgts, output_file_id: str = None, page_id: # in absolute coordinates merely for comparison/intersection shapes = [Polygon(polygon_from_points(region.get_Coords().points)) for region in regions] # in relative coordinates for mask/cropping - polygons = [coordinates_of_segment(region, page_image, page_coords) for region in regions] + polygons = [coordinates_of_segment(region, page_image, page_xywh) for region in regions] for i, polygon in enumerate(polygons[num_texts:], num_texts): # for non-text regions, extend mask by 3 pixels in each direction # to ensure they do not leak components accidentally @@ -143,7 +142,7 @@ def process_page_pcgts(self, *input_pcgts, output_file_id: str = None, page_id: segment_region_file_id = f"{output_file_id}_{region.id}" ret.append(self.process_segment( region, masks[i], polygons[i], neighbours, background_image, - page_image, page_coords, page_bin, page_id, segment_region_file_id)) + page_image, page_xywh, page_bin, page_id, segment_region_file_id)) continue # level == 'line': lines = region.get_TextLine() @@ -151,7 +150,7 @@ def process_page_pcgts(self, *input_pcgts, output_file_id: str = None, page_id: self.logger.warning(f'Page "{page_id}" region "{region.id}" contains no text lines') continue region_image, region_coords = self.workspace.image_from_segment( - region, page_image, page_coords, feature_selector='binarized') + region, page_image, page_xywh, feature_selector='binarized') background_image = Image.new(region_image.mode, region_image.size, background) region_array = pil2array(region_image) region_bin = np.array(region_array <= midrange(region_array), np.uint8) @@ -164,8 +163,7 @@ def process_page_pcgts(self, *input_pcgts, output_file_id: str = None, page_id: if line.get_AlternativeImage(): # FIXME: This should probably be an exception (bad workflow configuration). self.logger.warning( - f'Page "{page_id}" region "{region.id}" line "{line.id}" already contains image ' - f'data: skipping') + f'Page "{page_id}" region "{region.id}" line "{line.id}" already contains image data: skipping') continue shape = prep(shapes[j]) neighbours = [(linej, maskj) for shapej, linej, maskj diff --git a/ocrd_cis/ocropy/recognize.py b/ocrd_cis/ocropy/recognize.py index 69b374ec..b9fc453f 100644 --- a/ocrd_cis/ocropy/recognize.py +++ b/ocrd_cis/ocropy/recognize.py @@ -101,18 +101,19 @@ def get_model(self): returns it. 
If not, it checks if the model can be found in the dirname(__file__)/models/ directory.""" canread = lambda p: isfile(p) and access(p, R_OK) + p_model = self.parameter['model'] try: - model = self.resolve_resource(self.parameter['model']) + model = self.resolve_resource(p_model) if canread(model): return model except SystemExit: ocropydir = dirname(abspath(__file__)) - path = join(ocropydir, 'models', self.parameter['model']) - self.logger.info("Failed to resolve model with OCR-D/core mechanism, trying %s", path) + path = join(ocropydir, 'models', p_model) + self.logger.info(f"Failed to resolve model with OCR-D/core mechanism, trying {path}") if canread(path): return path - self.logger.error("Could not find model %s. Try 'ocrd resmgr download ocrd-cis-ocropy-recognize %s", - self.parameter['model'], self.parameter['model']) + self.logger.error( + f"Could not find model {p_model}. Try 'ocrd resmgr download ocrd-cis-ocropy-recognize {p_model}") exit(1) def process_page_pcgts(self, *input_pcgts, output_file_id: str = None, page_id: str = None) -> OcrdPage: @@ -148,7 +149,7 @@ def process_page_pcgts(self, *input_pcgts, output_file_id: str = None, page_id: page = pcgts.get_Page() assert page - page_image, page_coords, _ = self.workspace.image_from_page(page, page_id) + page_image, page_xywh, _ = self.workspace.image_from_page(page, page_id) self.logger.info(f"Recognizing text in page '{page_id}'") # region, line, word, or glyph level: regions = page.get_AllRegions(classes=['Text']) @@ -157,37 +158,32 @@ def process_page_pcgts(self, *input_pcgts, output_file_id: str = None, page_id: self.process_regions(regions, max_level, page_image, page_coords) return [pcgts] - def process_regions(self, regions, maxlevel, page_image, page_coords): + def process_regions(self, regions, maxlevel, page_image, page_xywh): edits = 0 lengs = 0 for region in regions: - region_image, region_coords = self.workspace.image_from_segment( - region, page_image, page_coords) - - self.logger.info("Recognizing text in region '%s'", region.id) + region_image, region_xywh = self.workspace.image_from_segment(region, page_image, page_xywh) + self.logger.info(f"Recognizing text in region '{region.id}'") textlines = region.get_TextLine() if not textlines: - self.logger.warning("Region '%s' contains no text lines", region.id) + self.logger.warning(f"Region '{region.id}' contains no text lines") else: - edits_, lengs_ = self.process_lines(textlines, maxlevel, region_image, region_coords) + edits_, lengs_ = self.process_lines(textlines, maxlevel, region_image, region_xywh) edits += edits_ lengs += lengs_ # update region text by concatenation for consistency - region_unicode = u'\n'.join(line.get_TextEquiv()[0].Unicode - if line.get_TextEquiv() - else u'' for line in textlines) + region_unicode = u'\n'.join( + line.get_TextEquiv()[0].Unicode if line.get_TextEquiv() else u'' for line in textlines) region.set_TextEquiv([TextEquivType(Unicode=region_unicode)]) if lengs > 0: self.logger.info('CER: %.1f%%', 100.0 * edits / lengs) - def process_lines(self, textlines, maxlevel, region_image, region_coords): + def process_lines(self, textlines, maxlevel, region_image, region_xywh): edits = 0 lengs = 0 for line in textlines: - line_image, line_coords = self.workspace.image_from_segment( - line, region_image, region_coords) - - self.logger.info("Recognizing text in line '%s'", line.id) + line_image, line_coords = self.workspace.image_from_segment(line, region_image, region_xywh) + self.logger.info(f"Recognizing text in line '{line.id}'") if 
line.get_TextEquiv(): linegt = line.TextEquiv[0].Unicode else: @@ -198,19 +194,18 @@ def process_lines(self, textlines, maxlevel, region_image, region_coords): line.set_Word([]) if line_image.size[1] < 16: - self.logger.debug("ERROR: bounding box is too narrow at line %s", line.id) + self.logger.debug(f"ERROR: bounding box is too narrow at line {line.id}") continue # resize image to 48 pixel height final_img, scale = resize_keep_ratio(line_image) # process ocropy: try: - linepred, clist, rlist, confidlist = recognize( - final_img, self.pad, self.network, check=True) + linepred, clist, rlist, confidlist = recognize(final_img, self.pad, self.network, check=True) except Exception as err: - self.logger.debug('error processing line "%s": %s', line.id, err) + self.logger.debug(f'error processing line "{line.id}": {err}') continue - self.logger.debug("OCR '%s': '%s'", line.id, linepred) + self.logger.debug(f"OCR '{line.id}': '{linepred}'") edits += Levenshtein.distance(linepred, linegt) lengs += len(linegt) @@ -226,11 +221,9 @@ def process_lines(self, textlines, maxlevel, region_image, region_coords): found_char = True word_conf_list[w_no].append(confidlist[i]) word_r_list[w_no].append(rlist[i]) - if c == ' ' and found_char: if i == 0: word_r_list[0][0] = rlist[i] - elif i+1 <= len(clist)-1 and clist[i+1] != ' ': word_conf_list.append([]) word_r_list.append([rlist[i]]) @@ -244,8 +237,7 @@ def process_lines(self, textlines, maxlevel, region_image, region_coords): # conf for the line line_conf = (min(wordsconf) + max(wordsconf))/2 # line text - line.add_TextEquiv(TextEquivType( - Unicode=linepred, conf=line_conf)) + line.add_TextEquiv(TextEquivType(Unicode=linepred, conf=line_conf)) if maxlevel in ['word', 'glyph']: for word_no, word_str in enumerate(words): From f24f86b9e963e28f206662e464f8843c99deddf0 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Thu, 15 Aug 2024 12:33:04 +0200 Subject: [PATCH 33/97] fix: determine_zoom --- ocrd_cis/ocropy/binarize.py | 2 +- ocrd_cis/ocropy/clip.py | 3 ++- ocrd_cis/ocropy/common.py | 2 +- ocrd_cis/ocropy/denoise.py | 2 +- ocrd_cis/ocropy/dewarp.py | 2 +- ocrd_cis/ocropy/recognize.py | 2 +- ocrd_cis/ocropy/resegment.py | 2 +- ocrd_cis/ocropy/segment.py | 2 +- 8 files changed, 9 insertions(+), 8 deletions(-) diff --git a/ocrd_cis/ocropy/binarize.py b/ocrd_cis/ocropy/binarize.py index 8f7d8d3a..7478edb5 100644 --- a/ocrd_cis/ocropy/binarize.py +++ b/ocrd_cis/ocropy/binarize.py @@ -101,7 +101,7 @@ def process_page_pcgts(self, *input_pcgts, output_file_id: str = None, page_id: assert page page_image, page_xywh, page_image_info = self.workspace.image_from_page(page, page_id, feature_filter='binarized') - zoom = determine_zoom(self.logger, self.parameter['dpi'], page_image_info) + zoom = determine_zoom(self.logger, page_id, self.parameter['dpi'], page_image_info) ret = [pcgts] if level == 'page': diff --git a/ocrd_cis/ocropy/clip.py b/ocrd_cis/ocropy/clip.py index 75b4123f..400e9b54 100644 --- a/ocrd_cis/ocropy/clip.py +++ b/ocrd_cis/ocropy/clip.py @@ -1,5 +1,6 @@ from __future__ import absolute_import from logging import Logger +from typing import Tuple from os.path import join import numpy as np @@ -83,7 +84,7 @@ def process_page_pcgts(self, *input_pcgts, output_file_id: str = None, page_id: page_image, page_xywh, page_image_info = self.workspace.image_from_page( page, page_id, feature_selector='binarized') # The zoom is not used anywhere - zoom = determine_zoom(self.logger, self.parameter['dpi'], page_image_info) + zoom = determine_zoom(self.logger, page_id, 
self.parameter['dpi'], page_image_info) ret = [pcgts] # FIXME: what about text regions inside table regions? diff --git a/ocrd_cis/ocropy/common.py b/ocrd_cis/ocropy/common.py index 095de5eb..c6b7c49d 100644 --- a/ocrd_cis/ocropy/common.py +++ b/ocrd_cis/ocropy/common.py @@ -2103,7 +2103,7 @@ def find_topological(): # DSAVE('rlabels_closed', rlabels) return rlabels -def determine_zoom(logger: logging.Logger, dpi: float, page_image_info: OcrdExif) -> float: +def determine_zoom(logger: logging.Logger, page_id: str, dpi: float, page_image_info: OcrdExif) -> float: if dpi > 0: zoom = 300.0/dpi elif page_image_info.resolution != 1: diff --git a/ocrd_cis/ocropy/denoise.py b/ocrd_cis/ocropy/denoise.py index 78d11c28..cc622c24 100644 --- a/ocrd_cis/ocropy/denoise.py +++ b/ocrd_cis/ocropy/denoise.py @@ -63,7 +63,7 @@ def process(self): page, page_id, feature_selector='binarized' if level == 'page' else '') - zoom = determine_zoom(self.logger, self.parameter['dpi'], page_image_info) + zoom = determine_zoom(self.logger, page_id, self.parameter['dpi'], page_image_info) if level == 'page': self.process_segment(page, page_image, page_xywh, zoom, diff --git a/ocrd_cis/ocropy/dewarp.py b/ocrd_cis/ocropy/dewarp.py index 9dddae44..72efca45 100644 --- a/ocrd_cis/ocropy/dewarp.py +++ b/ocrd_cis/ocropy/dewarp.py @@ -112,7 +112,7 @@ def process(self): page_image, page_xywh, page_image_info = self.workspace.image_from_page( page, page_id) - zoom = determine_zoom(self.logger, self.parameter['dpi'], page_image_info) + zoom = determine_zoom(self.logger, page_id, self.parameter['dpi'], page_image_info) regions = page.get_AllRegions(classes=['Text'], order='reading-order') if not regions: diff --git a/ocrd_cis/ocropy/recognize.py b/ocrd_cis/ocropy/recognize.py index b9fc453f..bbb8e415 100644 --- a/ocrd_cis/ocropy/recognize.py +++ b/ocrd_cis/ocropy/recognize.py @@ -155,7 +155,7 @@ def process_page_pcgts(self, *input_pcgts, output_file_id: str = None, page_id: regions = page.get_AllRegions(classes=['Text']) if not regions: self.logger.warning(f"Page '{page_id}' contains no text regions") - self.process_regions(regions, max_level, page_image, page_coords) + self.process_regions(regions, max_level, page_image, page_xywh) return [pcgts] def process_regions(self, regions, maxlevel, page_image, page_xywh): diff --git a/ocrd_cis/ocropy/resegment.py b/ocrd_cis/ocropy/resegment.py index b18c0b5e..1e9f8c7f 100644 --- a/ocrd_cis/ocropy/resegment.py +++ b/ocrd_cis/ocropy/resegment.py @@ -117,7 +117,7 @@ def process(self): page_image, page_coords, page_image_info = self.workspace.image_from_page( page, page_id, feature_selector='binarized') - zoom = determine_zoom(self.logger, self.parameter['dpi'], page_image_info) + zoom = determine_zoom(self.logger, page_id, self.parameter['dpi'], page_image_info) ignore = (page.get_ImageRegion() + page.get_LineDrawingRegion() + diff --git a/ocrd_cis/ocropy/segment.py b/ocrd_cis/ocropy/segment.py index 782425cc..57368fe8 100644 --- a/ocrd_cis/ocropy/segment.py +++ b/ocrd_cis/ocropy/segment.py @@ -339,7 +339,7 @@ def process(self): # TODO: also allow grayscale_normalized (try/except?) 
page_image, page_coords, page_image_info = self.workspace.image_from_page( page, page_id, feature_selector='binarized') - zoom = determine_zoom(self.logger, self.parameter['dpi'], page_image_info) + zoom = determine_zoom(self.logger, page_id, self.parameter['dpi'], page_image_info) # aggregate existing regions so their foreground can be ignored ignore = (page.get_ImageRegion() + From 5f8e1dfb337d78cd757f4a6b5aff968829c2d4a1 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Thu, 15 Aug 2024 13:19:08 +0200 Subject: [PATCH 34/97] add missing Levenshtein req in setup --- setup.py | 1 + 1 file changed, 1 insertion(+) diff --git a/setup.py b/setup.py index 38f09abd..e3ee8213 100644 --- a/setup.py +++ b/setup.py @@ -41,6 +41,7 @@ 'ocrd>=3.0.0a1', 'click', 'scipy', + 'python-Levenshtein>=0.25.1', 'numpy>=1.17.0', 'pillow>=7.1.2', 'shapely>=1.7.1', From 9a14e1dddf44515630dadbcc23b62e6951eccc5d Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Thu, 15 Aug 2024 13:53:33 +0200 Subject: [PATCH 35/97] fix: remove version req for Levenshtein --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index e3ee8213..6b75d3a3 100644 --- a/setup.py +++ b/setup.py @@ -41,7 +41,7 @@ 'ocrd>=3.0.0a1', 'click', 'scipy', - 'python-Levenshtein>=0.25.1', + 'python-Levenshtein', 'numpy>=1.17.0', 'pillow>=7.1.2', 'shapely>=1.7.1', From 4ca4d1417030e40818327a7cc3571b22ad4ccda9 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Thu, 15 Aug 2024 13:59:33 +0200 Subject: [PATCH 36/97] fix: Levenshtein import --- ocrd_cis/align/cli.py | 2 +- setup.py | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/ocrd_cis/align/cli.py b/ocrd_cis/align/cli.py index ffe53fd8..7747622e 100644 --- a/ocrd_cis/align/cli.py +++ b/ocrd_cis/align/cli.py @@ -2,7 +2,7 @@ import click import json import os -import Levenshtein +from rapidfuzz.distance import Levenshtein from ocrd import Processor from ocrd.decorators import ocrd_cli_options from ocrd.decorators import ocrd_cli_wrap_processor diff --git a/setup.py b/setup.py index 6b75d3a3..38f09abd 100644 --- a/setup.py +++ b/setup.py @@ -41,7 +41,6 @@ 'ocrd>=3.0.0a1', 'click', 'scipy', - 'python-Levenshtein', 'numpy>=1.17.0', 'pillow>=7.1.2', 'shapely>=1.7.1', From fbaafcb4e3f982496aafdf561a4cd4713d859f5c Mon Sep 17 00:00:00 2001 From: kba Date: Thu, 15 Aug 2024 16:23:00 +0200 Subject: [PATCH 37/97] update ocrd-cis-binarize to be compatible with bertsky/core#8 --- ocrd_cis/ocropy/binarize.py | 70 ++++++++++++++++--------------------- ocrd_cis/ocropy/common.py | 3 +- 2 files changed, 33 insertions(+), 40 deletions(-) diff --git a/ocrd_cis/ocropy/binarize.py b/ocrd_cis/ocropy/binarize.py index 7478edb5..3c9583f9 100644 --- a/ocrd_cis/ocropy/binarize.py +++ b/ocrd_cis/ocropy/binarize.py @@ -1,21 +1,15 @@ from __future__ import absolute_import from logging import Logger +from typing import Optional import cv2 import numpy as np from PIL import Image -from os.path import abspath, dirname, join -from typing import Tuple +from ocrd.processor.ocrd_page_result import OcrdPageResult, OcrdPageResultImage -#import kraken.binarization - -from ocrd_utils import ( - getLogger, - make_file_id, - MIMETYPE_PAGE -) -from ocrd_models.ocrd_page import AlternativeImageType, OcrdPage, to_xml +from ocrd_utils import getLogger +from ocrd_models.ocrd_page import AlternativeImageType, OcrdPage from ocrd import Processor from . 
import common @@ -71,7 +65,7 @@ def setup(self): self.logger.critical(f'Requested method {method} does not support grayscale normalized output') raise ValueError('only method=ocropy allows grayscale=true') - def process_page_pcgts(self, *input_pcgts, output_file_id: str = None, page_id: str = None) -> OcrdPage: + def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional[str] = None) -> OcrdPageResult: """Binarize (and optionally deskew/despeckle) the pages/regions/lines of the workspace. Iterate over the PAGE-XML element hierarchy down to the requested @@ -97,16 +91,17 @@ def process_page_pcgts(self, *input_pcgts, output_file_id: str = None, page_id: self.logger.debug(f'Level of operation: "{level}"') pcgts = input_pcgts[0] + assert pcgts page = pcgts.get_Page() assert page page_image, page_xywh, page_image_info = self.workspace.image_from_page(page, page_id, feature_filter='binarized') zoom = determine_zoom(self.logger, page_id, self.parameter['dpi'], page_image_info) - ret = [pcgts] + result = OcrdPageResult(pcgts) if level == 'page': try: - ret.append(self.process_page(page, page_image, page_xywh, zoom, page_id, output_file_id)) + result.images.append(self.process_page(page, page_image, page_xywh, zoom, page_id)) except ValueError as e: self.logger.error(e) else: @@ -121,7 +116,7 @@ def process_page_pcgts(self, *input_pcgts, output_file_id: str = None, page_id: region, page_image, page_xywh, feature_filter='binarized') if level == 'region': try: - ret.append(self.process_region(region, region_image, region_xywh, zoom, region.id, output_file_id)) + result.images.append(self.process_region(region, region_image, region_xywh, zoom, region.id)) continue except ValueError as e: self.logger.error(e) @@ -132,12 +127,12 @@ def process_page_pcgts(self, *input_pcgts, output_file_id: str = None, page_id: line_image, line_xywh = self.workspace.image_from_segment( line, region_image, region_xywh, feature_filter='binarized') try: - ret.append(self.process_line(line, line_image, line_xywh, zoom, page_id, region.id, output_file_id)) + result.images.append(self.process_line(line, line_image, line_xywh, zoom, page_id, region.id)) except ValueError as e: self.logger.error(e) - return ret + return result - def process_page(self, page, page_image, page_xywh, zoom, page_id, file_id) -> Tuple[Image.Image, str, str]: + def process_page(self, page, page_image, page_xywh, zoom, page_id) -> OcrdPageResultImage: if not page_image.width or not page_image.height: raise ValueError(f"Skipping page '{page_id}' with zero size") self.logger.info(f"About to binarize page '{page_id}'") @@ -171,18 +166,17 @@ def process_page(self, page, page_image, page_xywh, zoom, page_id, file_id) -> T orientation = 180 - (180 - orientation) % 360 # map to [-179.999,180] page.set_orientation(orientation) if self.parameter['grayscale']: - file_id += '.IMG-NRM' + id_suffix = '.IMG-NRM' features += ',grayscale_normalized' else: - file_id += '.IMG-BIN' + id_suffix = '.IMG-BIN' features += ',binarized' - bin_image_id = f'{file_id}.IMG-BIN' - bin_image_path = join(self.output_file_grp, f'{bin_image_id}.png') # update PAGE (reference the image file): - page.add_AlternativeImage(AlternativeImageType(filename=bin_image_path, comments=features)) - return bin_image, bin_image_id, bin_image_path + alternative_image = AlternativeImageType(comments=features) + page.add_AlternativeImage(alternative_image) + return OcrdPageResultImage(bin_image, id_suffix, alternative_image) - def process_region(self, region, region_image, 
region_xywh, zoom, page_id, file_id) -> Tuple[Image.Image, str, str]: + def process_region(self, region, region_image, region_xywh, zoom, page_id) -> OcrdPageResultImage: if not region_image.width or not region_image.height: raise ValueError(f"Skipping region '{region.id}' with zero size") self.logger.info(f"About to binarize page '{page_id}' region '{region.id}'") @@ -217,21 +211,19 @@ def process_region(self, region, region_image, region_xywh, zoom, page_id, file_ orientation = -region_xywh['angle'] orientation = 180 - (180 - orientation) % 360 # map to [-179.999,180] region.set_orientation(orientation) - bin_image_id = f'{file_id}_{region.id}' + id_suffix = f'{region.id}' if self.parameter['grayscale']: - bin_image_id += '.IMG-NRM' + id_suffix += '.IMG-NRM' features += ',grayscale_normalized' else: - bin_image_id += '.IMG-BIN' + id_suffix += '.IMG-BIN' features += ',binarized' - bin_image_path = join(self.output_file_grp, f'{bin_image_id}.png') # update PAGE (reference the image file): - region.add_AlternativeImage(AlternativeImageType(filename=bin_image_path, comments=features)) - return bin_image, bin_image_id, bin_image_path + alternative_image = AlternativeImageType(comments=features) + region.add_AlternativeImage(alternative_image) + return OcrdPageResultImage(bin_image, id_suffix, alternative_image) - def process_line( - self, line, line_image, line_xywh, zoom, page_id, region_id, file_id - ) -> Tuple[Image.Image, str, str]: + def process_line(self, line, line_image, line_xywh, zoom, page_id, region_id) -> OcrdPageResultImage: if not line_image.width or not line_image.height: raise ValueError(f"Skipping line '{line.id}' with zero size") self.logger.info(f"About to binarize page '{page_id}' region '{region_id}' line '{line.id}'") @@ -256,14 +248,14 @@ def process_line( bin_image = remove_noise(bin_image, maxsize=self.parameter['noise_maxsize']) if self.parameter['noise_maxsize']: features += ',despeckled' - bin_image_id = f'{file_id}_{region_id}_{line.id}' + id_suffix = f'{region_id}_{line.id}' if self.parameter['grayscale']: - bin_image_id += '.IMG-NRM' + id_suffix += '.IMG-NRM' features += ',grayscale_normalized' else: - bin_image_id += '.IMG-BIN' + id_suffix += '.IMG-BIN' features += ',binarized' - bin_image_path = join(self.output_file_grp, f'{bin_image_id}.png') # update PAGE (reference the image file): - line.add_AlternativeImage(AlternativeImageType(filename=bin_image_path, comments=features)) - return bin_image, bin_image_id, bin_image_path + alternative_image = AlternativeImageType(comments=features) + line.add_AlternativeImage(alternative_image) + return OcrdPageResultImage(bin_image, id_suffix, alternative_image) diff --git a/ocrd_cis/ocropy/common.py b/ocrd_cis/ocropy/common.py index c6b7c49d..c5b56ed0 100644 --- a/ocrd_cis/ocropy/common.py +++ b/ocrd_cis/ocropy/common.py @@ -1,4 +1,5 @@ from __future__ import absolute_import +from typing import Optional import warnings import logging @@ -2103,7 +2104,7 @@ def find_topological(): # DSAVE('rlabels_closed', rlabels) return rlabels -def determine_zoom(logger: logging.Logger, page_id: str, dpi: float, page_image_info: OcrdExif) -> float: +def determine_zoom(logger: logging.Logger, page_id: Optional[str], dpi: float, page_image_info: OcrdExif) -> float: if dpi > 0: zoom = 300.0/dpi elif page_image_info.resolution != 1: From 516ce4ba4bd4f65dae975472b5632d8d3b6027c2 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> Date: Fri, 16 Aug 2024 00:58:16 +0200 Subject: [PATCH 38/97] binarize: 
use final v3 API --- ocrd_cis/ocropy/binarize.py | 69 +++++++++++++++++-------------------- 1 file changed, 32 insertions(+), 37 deletions(-) diff --git a/ocrd_cis/ocropy/binarize.py b/ocrd_cis/ocropy/binarize.py index 7478edb5..fa47e139 100644 --- a/ocrd_cis/ocropy/binarize.py +++ b/ocrd_cis/ocropy/binarize.py @@ -6,17 +6,15 @@ from PIL import Image from os.path import abspath, dirname, join -from typing import Tuple +from typing import Union, Optional #import kraken.binarization -from ocrd_utils import ( - getLogger, - make_file_id, - MIMETYPE_PAGE -) -from ocrd_models.ocrd_page import AlternativeImageType, OcrdPage, to_xml +from ocrd_utils import getLogger +from ocrd_models.ocrd_page import AlternativeImageType +from ocrd_models import OcrdFile, ClientSideOcrdFile, OcrdPage from ocrd import Processor +from ocrd.processor import OcrdPageResult, OcrdPageResultImage from . import common from .common import array2pil, determine_zoom, pil2array, remove_noise @@ -71,7 +69,7 @@ def setup(self): self.logger.critical(f'Requested method {method} does not support grayscale normalized output') raise ValueError('only method=ocropy allows grayscale=true') - def process_page_pcgts(self, *input_pcgts, output_file_id: str = None, page_id: str = None) -> OcrdPage: + def process_page_pcgts(self, *input_pcgts: Optional[Union[OcrdFile, ClientSideOcrdFile]], page_id: str = None) -> OcrdPageResult: """Binarize (and optionally deskew/despeckle) the pages/regions/lines of the workspace. Iterate over the PAGE-XML element hierarchy down to the requested @@ -90,7 +88,8 @@ def process_page_pcgts(self, *input_pcgts, output_file_id: str = None, page_id: Reference each new image in the AlternativeImage of the element. - Return a PAGE-XML with AlternativeImage and the arguments for ``workspace.save_image_file``. + Return a PAGE-XML with new AlternativeImage(s) and the arguments + for ``workspace.save_image_file``. 
""" level = self.parameter['level-of-operation'] assert self.workspace @@ -103,10 +102,10 @@ def process_page_pcgts(self, *input_pcgts, output_file_id: str = None, page_id: page_image, page_xywh, page_image_info = self.workspace.image_from_page(page, page_id, feature_filter='binarized') zoom = determine_zoom(self.logger, page_id, self.parameter['dpi'], page_image_info) - ret = [pcgts] + ret = OcrdPageResult(pcgts) if level == 'page': try: - ret.append(self.process_page(page, page_image, page_xywh, zoom, page_id, output_file_id)) + ret.images.append(self.process_page(page, page_image, page_xywh, zoom, page_id)) except ValueError as e: self.logger.error(e) else: @@ -121,7 +120,7 @@ def process_page_pcgts(self, *input_pcgts, output_file_id: str = None, page_id: region, page_image, page_xywh, feature_filter='binarized') if level == 'region': try: - ret.append(self.process_region(region, region_image, region_xywh, zoom, region.id, output_file_id)) + ret.images.append(self.process_region(region, region_image, region_xywh, zoom, region.id)) continue except ValueError as e: self.logger.error(e) @@ -132,16 +131,15 @@ def process_page_pcgts(self, *input_pcgts, output_file_id: str = None, page_id: line_image, line_xywh = self.workspace.image_from_segment( line, region_image, region_xywh, feature_filter='binarized') try: - ret.append(self.process_line(line, line_image, line_xywh, zoom, page_id, region.id, output_file_id)) + ret.images.append(self.process_line(line, line_image, line_xywh, zoom, page_id, region.id)) except ValueError as e: self.logger.error(e) return ret - def process_page(self, page, page_image, page_xywh, zoom, page_id, file_id) -> Tuple[Image.Image, str, str]: + def process_page(self, page, page_image, page_xywh, zoom, page_id) -> OcrdPageResultImage: if not page_image.width or not page_image.height: raise ValueError(f"Skipping page '{page_id}' with zero size") self.logger.info(f"About to binarize page '{page_id}'") - assert self.output_file_grp features = page_xywh['features'] if 'angle' in page_xywh and page_xywh['angle']: @@ -171,18 +169,17 @@ def process_page(self, page, page_image, page_xywh, zoom, page_id, file_id) -> T orientation = 180 - (180 - orientation) % 360 # map to [-179.999,180] page.set_orientation(orientation) if self.parameter['grayscale']: - file_id += '.IMG-NRM' + suffix = '.IMG-NRM' features += ',grayscale_normalized' else: - file_id += '.IMG-BIN' + suffix = '.IMG-BIN' features += ',binarized' - bin_image_id = f'{file_id}.IMG-BIN' - bin_image_path = join(self.output_file_grp, f'{bin_image_id}.png') # update PAGE (reference the image file): - page.add_AlternativeImage(AlternativeImageType(filename=bin_image_path, comments=features)) - return bin_image, bin_image_id, bin_image_path + alt_image = AlternativeImageType(comments=features) + page.add_AlternativeImage(alt_image) + return OcrdPageResultImage(bin_image, suffix, alt_image) - def process_region(self, region, region_image, region_xywh, zoom, page_id, file_id) -> Tuple[Image.Image, str, str]: + def process_region(self, region, region_image, region_xywh, zoom, page_id) -> OcrdPageResultImage: if not region_image.width or not region_image.height: raise ValueError(f"Skipping region '{region.id}' with zero size") self.logger.info(f"About to binarize page '{page_id}' region '{region.id}'") @@ -217,21 +214,19 @@ def process_region(self, region, region_image, region_xywh, zoom, page_id, file_ orientation = -region_xywh['angle'] orientation = 180 - (180 - orientation) % 360 # map to [-179.999,180] 
region.set_orientation(orientation) - bin_image_id = f'{file_id}_{region.id}' + suffix = region.id if self.parameter['grayscale']: - bin_image_id += '.IMG-NRM' + suffix += '.IMG-NRM' features += ',grayscale_normalized' else: - bin_image_id += '.IMG-BIN' + suffix += '.IMG-BIN' features += ',binarized' - bin_image_path = join(self.output_file_grp, f'{bin_image_id}.png') # update PAGE (reference the image file): - region.add_AlternativeImage(AlternativeImageType(filename=bin_image_path, comments=features)) - return bin_image, bin_image_id, bin_image_path + alt_image = AlternativeImageType(comments=features) + region.add_AlternativeImage(alt_image) + return OcrdPageResultImage(bin_image, suffix, alt_image) - def process_line( - self, line, line_image, line_xywh, zoom, page_id, region_id, file_id - ) -> Tuple[Image.Image, str, str]: + def process_line(self, line, line_image, line_xywh, zoom, page_id, region_id) -> OcrdPageResultImage: if not line_image.width or not line_image.height: raise ValueError(f"Skipping line '{line.id}' with zero size") self.logger.info(f"About to binarize page '{page_id}' region '{region_id}' line '{line.id}'") @@ -256,14 +251,14 @@ def process_line( bin_image = remove_noise(bin_image, maxsize=self.parameter['noise_maxsize']) if self.parameter['noise_maxsize']: features += ',despeckled' - bin_image_id = f'{file_id}_{region_id}_{line.id}' + suffix = f'{region_id}_{line.id}' if self.parameter['grayscale']: - bin_image_id += '.IMG-NRM' + suffix += '.IMG-NRM' features += ',grayscale_normalized' else: - bin_image_id += '.IMG-BIN' + suffix += '.IMG-BIN' features += ',binarized' - bin_image_path = join(self.output_file_grp, f'{bin_image_id}.png') # update PAGE (reference the image file): - line.add_AlternativeImage(AlternativeImageType(filename=bin_image_path, comments=features)) - return bin_image, bin_image_id, bin_image_path + alt_image = AlternativeImageType(comments=features) + line.add_AlternativeImage(alt_image) + return OcrdPageResultImage(bin_image, suffix, alt_image) From 2e4f26f04ec5b2070a0396015d4339493e365fa1 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> Date: Fri, 16 Aug 2024 01:05:17 +0200 Subject: [PATCH 39/97] binarize: use correct types --- ocrd_cis/ocropy/binarize.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/ocrd_cis/ocropy/binarize.py b/ocrd_cis/ocropy/binarize.py index fa47e139..ac499336 100644 --- a/ocrd_cis/ocropy/binarize.py +++ b/ocrd_cis/ocropy/binarize.py @@ -11,8 +11,7 @@ #import kraken.binarization from ocrd_utils import getLogger -from ocrd_models.ocrd_page import AlternativeImageType -from ocrd_models import OcrdFile, ClientSideOcrdFile, OcrdPage +from ocrd_models.ocrd_page import AlternativeImageType, OcrdPage from ocrd import Processor from ocrd.processor import OcrdPageResult, OcrdPageResultImage @@ -69,7 +68,7 @@ def setup(self): self.logger.critical(f'Requested method {method} does not support grayscale normalized output') raise ValueError('only method=ocropy allows grayscale=true') - def process_page_pcgts(self, *input_pcgts: Optional[Union[OcrdFile, ClientSideOcrdFile]], page_id: str = None) -> OcrdPageResult: + def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: str = None) -> OcrdPageResult: """Binarize (and optionally deskew/despeckle) the pages/regions/lines of the workspace. 
Iterate over the PAGE-XML element hierarchy down to the requested From 21be94106ac55d001cb5729f21138fb9c7715bcb Mon Sep 17 00:00:00 2001 From: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> Date: Fri, 16 Aug 2024 01:12:04 +0200 Subject: [PATCH 40/97] clip: use final v3 API --- ocrd_cis/ocropy/clip.py | 27 ++++++++++++--------------- 1 file changed, 12 insertions(+), 15 deletions(-) diff --git a/ocrd_cis/ocropy/clip.py b/ocrd_cis/ocropy/clip.py index 400e9b54..d0119544 100644 --- a/ocrd_cis/ocropy/clip.py +++ b/ocrd_cis/ocropy/clip.py @@ -8,19 +8,17 @@ from shapely.geometry import Polygon from shapely.prepared import prep -from ocrd_modelfactory import page_from_file -from ocrd_models.ocrd_page import AlternativeImageType, OcrdPage, to_xml +from ocrd_models.ocrd_page import AlternativeImageType, OcrdPage from ocrd import Processor +from ocrd.processor import OcrdPageResult, OcrdPageResultImage from ocrd_utils import ( getLogger, - make_file_id, coordinates_of_segment, polygon_from_points, bbox_from_polygon, image_from_polygon, polygon_mask, crop_image, - MIMETYPE_PAGE ) from .ocrolib import midrange, morph @@ -38,7 +36,7 @@ def executable(self): def setup(self): self.logger = getLogger('processor.OcropyClip') - def process_page_pcgts(self, *input_pcgts, output_file_id: str = None, page_id: str = None) -> OcrdPage: + def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: str = None) -> OcrdPageResult: """Clip text regions / lines of a page at intersections with neighbours. Open and deserialize PAGE input file and its respective image, @@ -85,7 +83,7 @@ def process_page_pcgts(self, *input_pcgts, output_file_id: str = None, page_id: page, page_id, feature_selector='binarized') # The zoom is not used anywhere zoom = determine_zoom(self.logger, page_id, self.parameter['dpi'], page_image_info) - ret = [pcgts] + ret = OcrdPageResult(pcgts) # FIXME: what about text regions inside table regions? 
regions = list(page.get_TextRegion()) @@ -141,9 +139,9 @@ def process_page_pcgts(self, *input_pcgts, output_file_id: str = None, page_id: if shape.intersects(shapej)] if neighbours: segment_region_file_id = f"{output_file_id}_{region.id}" - ret.append(self.process_segment( + ret.images.append(self.process_segment( region, masks[i], polygons[i], neighbours, background_image, - page_image, page_xywh, page_bin, page_id, segment_region_file_id)) + page_image, page_xywh, page_bin, page_id)) continue # level == 'line': lines = region.get_TextLine() @@ -172,14 +170,14 @@ def process_page_pcgts(self, *input_pcgts, output_file_id: str = None, page_id: if shape.intersects(shapej)] if neighbours: segment_line_file_id = f"{output_file_id}_{region.id}_{line.id}" - ret.append(self.process_segment( + ret.images.append(self.process_segment( line, masks[j], polygons[j], neighbours, background_image, - region_image, region_coords, region_bin, page_id, segment_line_file_id)) + region_image, region_coords, region_bin, page_id)) return ret def process_segment(self, segment, segment_mask, segment_polygon, neighbours, background_image, parent_image, parent_coords, parent_bin, - page_id, file_id) -> Tuple[Image.Image, str, str]: + page_id) -> OcrdPageResultImage: # initialize AlternativeImage@comments classes from parent, except # for those operations that can apply on multiple hierarchy levels: features = ','.join( @@ -217,8 +215,7 @@ def process_segment(self, segment, segment_mask, segment_polygon, neighbours, # recrop segment into rectangle, just as image_from_segment would do # (and also clipping with background colour): segment_image = crop_image(segment_image,box=segment_bbox) - segment_image_id = file_id + '.IMG-CLIP' - segment_image_path = join(self.output_file_grp, f'{segment_image_id}.png') # update PAGE (reference the image file): - segment.add_AlternativeImage(AlternativeImageType(filename=segment_image_path, comments=features)) - return segment_image, segment_image_id, segment_image_path + alternative_image = AlternativeImageType(comments=features) + segment.add_AlternativeImage(alternative_image) + return OcrdPageResultImage(segment_image, '.IMG-CLIP', alternative_image) From 9539ac9620776e335bbe107e57e92742027f02b3 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> Date: Fri, 16 Aug 2024 01:12:51 +0200 Subject: [PATCH 41/97] clip: use correct types --- ocrd_cis/ocropy/clip.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ocrd_cis/ocropy/clip.py b/ocrd_cis/ocropy/clip.py index d0119544..3ddd6a70 100644 --- a/ocrd_cis/ocropy/clip.py +++ b/ocrd_cis/ocropy/clip.py @@ -1,6 +1,6 @@ from __future__ import absolute_import from logging import Logger -from typing import Tuple +from typing import Optional from os.path import join import numpy as np From 734b5eb4ef9bfee2e24d8053966b17eaf6e9e1f9 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> Date: Fri, 16 Aug 2024 01:14:56 +0200 Subject: [PATCH 42/97] recognize: use final v3 API --- ocrd_cis/ocropy/recognize.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/ocrd_cis/ocropy/recognize.py b/ocrd_cis/ocropy/recognize.py index bbb8e415..7e4f2957 100644 --- a/ocrd_cis/ocropy/recognize.py +++ b/ocrd_cis/ocropy/recognize.py @@ -11,18 +11,16 @@ from ocrd_utils import ( getLogger, - make_file_id, coordinates_for_segment, polygon_from_bbox, points_from_polygon, - MIMETYPE_PAGE ) -from ocrd_modelfactory import page_from_file from 
ocrd_models.ocrd_page import ( - to_xml, TextEquivType, OcrdPage, + TextEquivType, OcrdPage, CoordsType, GlyphType, WordType ) from ocrd import Processor +from ocrd.processor import OcrdPageResult from .common import check_line, pil2array from .ocrolib import lstm, load_object, midrange @@ -116,7 +114,7 @@ def get_model(self): f"Could not find model {p_model}. Try 'ocrd resmgr download ocrd-cis-ocropy-recognize {p_model}") exit(1) - def process_page_pcgts(self, *input_pcgts, output_file_id: str = None, page_id: str = None) -> OcrdPage: + def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: str = None) -> OcrdPageResult: """Recognize lines / words / glyphs of a page. Open and deserialize the PAGE input file and its respective image, @@ -156,7 +154,7 @@ def process_page_pcgts(self, *input_pcgts, output_file_id: str = None, page_id: if not regions: self.logger.warning(f"Page '{page_id}' contains no text regions") self.process_regions(regions, max_level, page_image, page_xywh) - return [pcgts] + return OcrdPageResult(pcgts) def process_regions(self, regions, maxlevel, page_image, page_xywh): edits = 0 From 28ad585c94f9895b3f5011a72aabf36b73d71a8e Mon Sep 17 00:00:00 2001 From: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> Date: Fri, 16 Aug 2024 02:20:58 +0200 Subject: [PATCH 43/97] recognize: fix typing import --- ocrd_cis/ocropy/recognize.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ocrd_cis/ocropy/recognize.py b/ocrd_cis/ocropy/recognize.py index 7e4f2957..97fcc64d 100644 --- a/ocrd_cis/ocropy/recognize.py +++ b/ocrd_cis/ocropy/recognize.py @@ -1,7 +1,8 @@ from __future__ import absolute_import + from logging import Logger from sys import exit -from typing import Any +from typing import Any, Optional from os import access, R_OK from os.path import abspath, dirname, isfile, join import numpy as np From 9a7c10ab71f7df3783f44848536aa99dd9c8e483 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> Date: Fri, 16 Aug 2024 02:31:27 +0200 Subject: [PATCH 44/97] denoise: adapt to final v3 API --- ocrd_cis/ocropy/denoise.py | 122 +++++++++++++++---------------------- 1 file changed, 49 insertions(+), 73 deletions(-) diff --git a/ocrd_cis/ocropy/denoise.py b/ocrd_cis/ocropy/denoise.py index cc622c24..0f368fd5 100644 --- a/ocrd_cis/ocropy/denoise.py +++ b/ocrd_cis/ocropy/denoise.py @@ -1,17 +1,15 @@ from __future__ import absolute_import + +from typing import Optional from logging import Logger from os.path import join -from ocrd_utils import ( - getLogger, - make_file_id, - MIMETYPE_PAGE -) -from ocrd_modelfactory import page_from_file +from ocrd_utils import getLogger from ocrd_models.ocrd_page import ( - to_xml, AlternativeImageType + AlternativeImageType, OcrdPage ) from ocrd import Processor +from ocrd.processor import OcrdPageResult, OcrdPageResultImage from .common import ( # binarize, @@ -27,10 +25,10 @@ def executable(self): def setup(self): self.logger = getLogger('processor.OcropyDenoise') - def process(self): + def process_page_pcgts(self, *input_pcgts : Optional[OcrdPage], page_id : Optional[str] = None) -> OcrdPageResult: """Despeckle the pages / regions / lines of the workspace. - Open and deserialise PAGE input files and their respective images, + Open and deserialise PAGE input file and its respective images, then iterate over the element hierarchy down to the requested ``level-of-operation``. 
@@ -49,73 +47,51 @@ def process(self): Produce a new output file by serialising the resulting hierarchy. """ level = self.parameter['level-of-operation'] - - for (n, input_file) in enumerate(self.input_files): - self.logger.info("INPUT FILE %i / %s", n, input_file.pageId or input_file.ID) - file_id = make_file_id(input_file, self.output_file_grp) - - pcgts = page_from_file(self.workspace.download_file(input_file)) - self.add_metadata(pcgts) - page_id = pcgts.pcGtsId or input_file.pageId or input_file.ID # (PageType has no id) - page = pcgts.get_Page() - - page_image, page_xywh, page_image_info = self.workspace.image_from_page( - page, page_id, - feature_selector='binarized' if level == 'page' else '') - - zoom = determine_zoom(self.logger, page_id, self.parameter['dpi'], page_image_info) - - if level == 'page': - self.process_segment(page, page_image, page_xywh, zoom, - input_file.pageId, file_id) - else: - regions = page.get_AllRegions(classes=['Text'], order='reading-order') - if not regions: - self.logger.warning('Page "%s" contains no text regions', page_id) - for region in regions: - region_image, region_xywh = self.workspace.image_from_segment( - region, page_image, page_xywh, - feature_selector='binarized' if level == 'region' else '') - if level == 'region': - self.process_segment(region, region_image, region_xywh, zoom, - input_file.pageId, file_id + '_' + region.id) - continue - lines = region.get_TextLine() - if not lines: - self.logger.warning('Page "%s" region "%s" contains no text lines', page_id, region.id) - for line in lines: - line_image, line_xywh = self.workspace.image_from_segment( - line, region_image, region_xywh, - feature_selector='binarized') - self.process_segment(line, line_image, line_xywh, zoom, - input_file.pageId, - file_id + '_' + region.id + '_' + line.id) - - # update METS (add the PAGE file): - file_path = join(self.output_file_grp, file_id + '.xml') - pcgts.set_pcGtsId(file_id) - out = self.workspace.add_file( - ID=file_id, - file_grp=self.output_file_grp, - pageId=input_file.pageId, - local_filename=file_path, - mimetype=MIMETYPE_PAGE, - content=to_xml(pcgts)) - self.logger.info('created file ID: %s, file_grp: %s, path: %s', - file_id, self.output_file_grp, out.local_filename) - - def process_segment(self, segment, segment_image, segment_xywh, zoom, page_id, file_id): + pcgts = input_pcgts[0] + result = OcrdPageResult(pcgts) + page = pcgts.get_Page() + + page_image, page_xywh, page_image_info = self.workspace.image_from_page( + page, page_id, + feature_selector='binarized' if level == 'page' else '') + zoom = determine_zoom(self.logger, page_id, self.parameter['dpi'], page_image_info) + + if level == 'page': + image = self.process_segment(page, page_image, page_xywh, zoom) + if image: + result.images.append(image) + else: + regions = page.get_AllRegions(classes=['Text'], order='reading-order') + if not regions: + self.logger.warning('Page "%s" contains no text regions', page_id) + for region in regions: + region_image, region_xywh = self.workspace.image_from_segment( + region, page_image, page_xywh, + feature_selector='binarized' if level == 'region' else '') + if level == 'region': + image = self.process_segment(region, region_image, region_xywh, zoom) + if image: + result.images.append(image) + continue + lines = region.get_TextLine() + if not lines: + self.logger.warning('Page "%s" region "%s" contains no text lines', page_id, region.id) + for line in lines: + line_image, line_xywh = self.workspace.image_from_segment( + line, region_image, 
region_xywh, + feature_selector='binarized') + image = self.process_segment(line, line_image, line_xywh, zoom) + if image: + result.images.append(image) + + def process_segment(self, segment, segment_image, segment_xywh, zoom) -> Optional[OcrdPageResultImage]: if not segment_image.width or not segment_image.height: self.logger.warning("Skipping '%s' with zero size", file_id) - return + return None self.logger.info("About to despeckle '%s'", file_id) bin_image = remove_noise(segment_image, maxsize=self.parameter['noise_maxsize']/zoom*300/72) # in pt - # update METS (add the image file): - file_path = self.workspace.save_image_file( - bin_image, file_id + '.IMG-DESPECK', self.output_file_grp, - page_id=page_id) # update PAGE (reference the image file): - segment.add_AlternativeImage(AlternativeImageType( - filename=file_path, - comments=segment_xywh['features'] + ',despeckled')) + alt_image = AlternativeImageType(comments=segment_xywh['features'] + ',despeckled') + segment.add_AlternativeImage(alt_image) + return OcrdPageResultImage(bin_image, segment.id + '.IMG-DESPECK', alt_image) From 7c9f39fa4516401fe17e24d3ca67799c5b85d308 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> Date: Fri, 16 Aug 2024 02:40:41 +0200 Subject: [PATCH 45/97] deskew: adapt to final v3 API --- ocrd_cis/ocropy/deskew.py | 116 +++++++++++++++----------------------- 1 file changed, 47 insertions(+), 69 deletions(-) diff --git a/ocrd_cis/ocropy/deskew.py b/ocrd_cis/ocropy/deskew.py index 616864e1..fae0c90c 100644 --- a/ocrd_cis/ocropy/deskew.py +++ b/ocrd_cis/ocropy/deskew.py @@ -1,24 +1,21 @@ from __future__ import absolute_import + +from typing import Optional from logging import Logger from os.path import join -from ocrd_utils import ( - getLogger, - make_file_id, - MIMETYPE_PAGE -) -from ocrd_modelfactory import page_from_file +from ocrd_utils import getLogger from ocrd_models.ocrd_page import ( PageType, - to_xml, AlternativeImageType + AlternativeImageType, + OcrdPage ) from ocrd import Processor +from ocrd.processor import OcrdPageResult, OcrdPageResultImage from . import common from .common import pil2array -#sys.path.append(os.path.dirname(os.path.abspath(__file__))) - def deskew(pil_image, maxskew=2): array = pil2array(pil_image) _, angle = common.binarize(array, maxskew=maxskew) @@ -34,10 +31,10 @@ def executable(self): def setup(self): self.logger = getLogger('processor.OcropyDeskew') - def process(self): + def process_page_pcgts(self, *input_pcgts : Optional[OcrdPage], page_id : Optional[str] = None) -> OcrdPageResult: """Deskew the pages or regions of the workspace. - Open and deserialise PAGE input files and their respective images, + Open and deserialise PAGE input file and its respective images, then iterate over the element hierarchy down to the TextRegion level. Next, for each file, crop each region image according to the layout @@ -53,62 +50,45 @@ def process(self): Produce a new output file by serialising the resulting hierarchy. 
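
Note on the unit conversion in the despeckling call of patch 44 above: noise_maxsize is given in pt, while remove_noise expects px. Assuming zoom works out to 300/DPI (determine_zoom itself is not shown in this series), the expression noise_maxsize/zoom*300/72 is simply pt converted to px at the image's resolution:

    # e.g. noise_maxsize = 3.0 pt on a 300 DPI scan:
    noise_maxsize = 3.0
    dpi = 300
    zoom = 300 / dpi                                  # assumed definition; 1.0 here
    maxsize_px = noise_maxsize / zoom * 300 / 72      # == noise_maxsize * dpi / 72
    print(maxsize_px)                                 # 12.5 px
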
""" level = self.parameter['level-of-operation'] - - for (n, input_file) in enumerate(self.input_files): - self.logger.info("INPUT FILE %i / %s", n, input_file.pageId or input_file.ID) - file_id = make_file_id(input_file, self.output_file_grp) - - pcgts = page_from_file(self.workspace.download_file(input_file)) - self.add_metadata(pcgts) - page_id = pcgts.pcGtsId or input_file.pageId or input_file.ID # (PageType has no id) - page = pcgts.get_Page() - - page_image, page_coords, _ = self.workspace.image_from_page( - page, page_id, + pcgts = input_pcgts[0] + result = OcrdPageResult(pcgts) + page = pcgts.get_Page() + + page_image, page_coords, _ = self.workspace.image_from_page( + page, page_id, + # image must not have been rotated already, + # (we will overwrite @orientation anyway,) + # abort if no such image can be produced: + feature_filter='deskewed' if level == 'page' else '') + if level == 'page': + image = self._process_segment(page, page_image, page_coords, "page '%s'" % page_id, page_id) + if image: + result.images.append(image) + return result + if level == 'table': + regions = page.get_TableRegion() + else: # region + regions = page.get_AllRegions(classes=['Text'], order='reading-order') + if not regions: + self.logger.warning('Page "%s" contains no text regions', page_id) + for region in regions: + # process region: + region_image, region_coords = self.workspace.image_from_segment( + region, page_image, page_coords, # image must not have been rotated already, # (we will overwrite @orientation anyway,) # abort if no such image can be produced: - feature_filter='deskewed' if level == 'page' else '') - if level == 'page': - self._process_segment(page, page_image, page_coords, - "page '%s'" % page_id, input_file.pageId, - file_id) - else: - if level == 'table': - regions = page.get_TableRegion() - else: # region - regions = page.get_AllRegions(classes=['Text'], order='reading-order') - if not regions: - self.logger.warning('Page "%s" contains no text regions', page_id) - for region in regions: - # process region: - region_image, region_coords = self.workspace.image_from_segment( - region, page_image, page_coords, - # image must not have been rotated already, - # (we will overwrite @orientation anyway,) - # abort if no such image can be produced: - feature_filter='deskewed') - self._process_segment(region, region_image, region_coords, - "region '%s'" % region.id, input_file.pageId, - file_id + '_' + region.id) - - # update METS (add the PAGE file): - file_path = join(self.output_file_grp, file_id + '.xml') - pcgts.set_pcGtsId(file_id) - out = self.workspace.add_file( - ID=file_id, - file_grp=self.output_file_grp, - pageId=input_file.pageId, - local_filename=file_path, - mimetype=MIMETYPE_PAGE, - content=to_xml(pcgts)) - self.logger.info('created file ID: %s, file_grp: %s, path: %s', - file_id, self.output_file_grp, out.local_filename) - - def _process_segment(self, segment, segment_image, segment_coords, segment_id, page_id, file_id): + feature_filter='deskewed') + image = self._process_segment(region, region_image, region_coords, + "region '%s'" % region.id, page_id) + if image: + result.images.append(image) + return result + + def _process_segment(self, segment, segment_image, segment_coords, segment_id, page_id) -> Optional[OcrdPageResultImage]: if not segment_image.width or not segment_image.height: self.logger.warning("Skipping %s with zero size", segment_id) - return + return None angle0 = segment_coords['angle'] # deskewing (w.r.t. 
top image) already applied to segment_image self.logger.info("About to deskew %s", segment_id) angle = deskew(segment_image, maxskew=self.parameter['maxskew']) # additional angle to be applied @@ -123,20 +103,18 @@ def _process_segment(self, segment, segment_image, segment_coords, segment_id, p segment_image, segment_coords, _ = self.workspace.image_from_page( segment, page_id, fill='background', transparency=True) + suffix = '.IMG-DESKEW' else: segment_image, segment_coords = self.workspace.image_from_segment( segment, segment_image, segment_coords, fill='background', transparency=True) + suffix = segment.id + '.IMG-DESKEW' if not angle: # zero rotation does not change coordinates, # but assures consuming processors that the # workflow had deskewing segment_coords['features'] += ',deskewed' - # update METS (add the image file): - file_path = self.workspace.save_image_file( - segment_image, file_id + '.IMG-DESKEW', self.output_file_grp, - page_id=page_id) # update PAGE (reference the image file): - segment.add_AlternativeImage(AlternativeImageType( - filename=file_path, - comments=segment_coords['features'])) + alternative = AlternativeImageType(comments=segment_coords['features']) + segment.add_AlternativeImage(alternative) + return OcrdPageResultImage(segment_image, suffix, alternative) From 669866857395544ed10c0fbda5ea03abd1b31f14 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> Date: Fri, 16 Aug 2024 02:52:55 +0200 Subject: [PATCH 46/97] dewarp: adapt to final v3 API --- ocrd_cis/ocropy/dewarp.py | 129 +++++++++++++++----------------------- 1 file changed, 50 insertions(+), 79 deletions(-) diff --git a/ocrd_cis/ocropy/dewarp.py b/ocrd_cis/ocropy/dewarp.py index 72efca45..a063a05e 100644 --- a/ocrd_cis/ocropy/dewarp.py +++ b/ocrd_cis/ocropy/dewarp.py @@ -1,24 +1,22 @@ from __future__ import absolute_import + +from typing import Optional from logging import Logger from os.path import join + import numpy as np -from ocrd_utils import ( - getLogger, - make_file_id, -) -from ocrd_modelfactory import page_from_file +from ocrd_utils import getLogger from ocrd_models.ocrd_page import ( - to_xml, AlternativeImageType + AlternativeImageType, + OcrdPage ) from ocrd import Processor -from ocrd_utils import MIMETYPE_PAGE +from ocrd.processor import OcrdPageResult, OcrdPageResultImage from .ocrolib import lineest from .common import array2pil, check_line, determine_zoom, pil2array -#sys.path.append(os.path.dirname(os.path.abspath(__file__))) - class InvalidLine(Exception): """Line image does not allow dewarping and should be ignored.""" @@ -80,10 +78,10 @@ def setup(self): # and extra params) 0.3)) - def process(self): + def process_page_pcgts(self, *input_pcgts : Optional[OcrdPage], page_id : Optional[str] = None) -> OcrdPageResult: """Dewarp the lines of the workspace. - Open and deserialise PAGE input files and their respective images, + Open and deserialise PAGE input file and its respective images, then iterate over the element hierarchy down to the TextLine level. Next, get each line image according to the layout annotation (from @@ -99,71 +97,44 @@ def process(self): Produce a new output file by serialising the resulting hierarchy. 
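
Note: the zero-rotation branch above still appends ',deskewed' to the features because consuming processors select or reject images purely by these feature strings. A sketch of both directions, using the same workspace calls that appear throughout this series (page and page_id as in the hunks above):

    # require an already-deskewed variant:
    page_image, page_coords, _ = self.workspace.image_from_page(
        page, page_id, feature_selector='deskewed')
    # or refuse one, e.g. for a step that insists on doing its own rotation:
    page_image, page_coords, _ = self.workspace.image_from_page(
        page, page_id, feature_filter='deskewed')
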
""" - - for (n, input_file) in enumerate(self.input_files): - self.logger.info("INPUT FILE %i / %s", n, input_file.pageId or input_file.ID) - file_id = make_file_id(input_file, self.output_file_grp) - - pcgts = page_from_file(self.workspace.download_file(input_file)) - self.add_metadata(pcgts) - page_id = pcgts.pcGtsId or input_file.pageId or input_file.ID # (PageType has no id) - page = pcgts.get_Page() - - page_image, page_xywh, page_image_info = self.workspace.image_from_page( - page, page_id) - - zoom = determine_zoom(self.logger, page_id, self.parameter['dpi'], page_image_info) - - regions = page.get_AllRegions(classes=['Text'], order='reading-order') - if not regions: - self.logger.warning('Page "%s" contains no text regions', page_id) - for region in regions: - region_image, region_xywh = self.workspace.image_from_segment( - region, page_image, page_xywh) - - lines = region.get_TextLine() - if not lines: - self.logger.warning('Region %s contains no text lines', region.id) - for line in lines: - line_image, line_xywh = self.workspace.image_from_segment( - line, region_image, region_xywh) - - self.logger.info("About to dewarp page '%s' region '%s' line '%s'", - page_id, region.id, line.id) - try: - dew_image = dewarp(line_image, self.lnorm, check=True, - max_neighbour=self.parameter['max_neighbour'], - zoom=zoom) - except InvalidLine as err: - self.logger.error('cannot dewarp line "%s": %s', line.id, err) - continue - except InadequateLine as err: - self.logger.warning('cannot dewarp line "%s": %s', line.id, err) - # as a fallback, simply pad the image vertically - # (just as dewarping would do on average, so at least - # this line has similar margins as the others): - dew_image = padvert(line_image, self.parameter['range']) - # update METS (add the image file): - file_path = self.workspace.save_image_file( - dew_image, - file_id + '_' + region.id + '_' + line.id + '.IMG-DEWARP', - self.output_file_grp, - page_id=input_file.pageId) - # update PAGE (reference the image file): - alternative_image = line.get_AlternativeImage() - line.add_AlternativeImage(AlternativeImageType( - filename=file_path, - comments=line_xywh['features'] + ',dewarped')) - - # update METS (add the PAGE file): - file_path = join(self.output_file_grp, file_id + '.xml') - pcgts.set_pcGtsId(file_id) - out = self.workspace.add_file( - ID=file_id, - file_grp=self.output_file_grp, - pageId=input_file.pageId, - local_filename=file_path, - mimetype=MIMETYPE_PAGE, - content=to_xml(pcgts)) - self.logger.info('created file ID: %s, file_grp: %s, path: %s', - file_id, self.output_file_grp, out.local_filename) + pcgts = input_pcgts[0] + result = OcrdPageResult(pcgts) + page = pcgts.get_Page() + + page_image, page_xywh, page_image_info = self.workspace.image_from_page( + page, page_id) + zoom = determine_zoom(self.logger, page_id, self.parameter['dpi'], page_image_info) + + regions = page.get_AllRegions(classes=['Text'], order='reading-order') + if not regions: + self.logger.warning('Page "%s" contains no text regions', page_id) + for region in regions: + region_image, region_xywh = self.workspace.image_from_segment( + region, page_image, page_xywh) + + lines = region.get_TextLine() + if not lines: + self.logger.warning('Region %s contains no text lines', region.id) + for line in lines: + line_image, line_xywh = self.workspace.image_from_segment( + line, region_image, region_xywh) + + self.logger.info("About to dewarp page '%s' region '%s' line '%s'", + page_id, region.id, line.id) + try: + dew_image = dewarp(line_image, 
self.lnorm, check=True, + max_neighbour=self.parameter['max_neighbour'], + zoom=zoom) + except InvalidLine as err: + self.logger.error('cannot dewarp line "%s": %s', line.id, err) + continue + except InadequateLine as err: + self.logger.warning('cannot dewarp line "%s": %s', line.id, err) + # as a fallback, simply pad the image vertically + # (just as dewarping would do on average, so at least + # this line has similar margins as the others): + dew_image = padvert(line_image, self.parameter['range']) + # update PAGE (reference the image file): + alt_image = AlternativeImageType(comments=line_xywh['features'] + ',dewarped') + line.add_AlternativeImage(alternative_image) + return OcrdPageResultImage(dew_image, region.id + '_' + line.id + '.IMG-DEWARP', alt_image) From 48a3146a4e510b14899aafc80c7f9f05da05fc48 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> Date: Fri, 16 Aug 2024 03:07:40 +0200 Subject: [PATCH 47/97] resegment: adapt to final v3 API --- ocrd_cis/ocropy/resegment.py | 109 +++++++++++++++-------------------- 1 file changed, 45 insertions(+), 64 deletions(-) diff --git a/ocrd_cis/ocropy/resegment.py b/ocrd_cis/ocropy/resegment.py index 1e9f8c7f..05f17d4f 100644 --- a/ocrd_cis/ocropy/resegment.py +++ b/ocrd_cis/ocropy/resegment.py @@ -1,24 +1,25 @@ from __future__ import absolute_import + +from typing import Optional from logging import Logger from os.path import join + import numpy as np from skimage import draw, segmentation from shapely.geometry import Polygon, LineString from shapely.prepared import prep -from ocrd_modelfactory import page_from_file -from ocrd_models.ocrd_page import BaselineType, PageType, to_xml -from ocrd import Processor from ocrd_utils import ( getLogger, - make_file_id, coordinates_of_segment, coordinates_for_segment, points_from_polygon, polygon_from_points, transform_coordinates, - MIMETYPE_PAGE ) +from ocrd_models.ocrd_page import BaselineType, PageType, OcrdPage +from ocrd import Processor +from ocrd.processor import OcrdPageResult from .ocrolib import midrange, morph from .common import ( @@ -52,10 +53,10 @@ def executable(self): def setup(self): self.logger = getLogger('processor.OcropyResegment') - def process(self): + def process_page_pcgts(self, *input_pcgts : Optional[OcrdPage], page_id : Optional[str] = None) -> OcrdPageResult: """Resegment lines of the workspace. - Open and deserialise PAGE input files and their respective images, + Open and deserialise PAGE input file and its respective images, then iterate over the element hierarchy down to the line level. Next, get the page image according to the layout annotation (from @@ -104,67 +105,47 @@ def process(self): # accuracy crucially depends on a good estimate of the images' # pixel density (at least if source input is not 300 DPI). 
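
Note on the dewarp port of patch 46 above: one derived image arises per text line, so the result object collects many entries, in the same style as the denoise and deskew hunks. A compact sketch of that accumulation inside process_page_pcgts, with names taken from the hunk but otherwise illustrative only:

    result = OcrdPageResult(pcgts)
    for region in regions:
        # ... get region_image / region_xywh as above ...
        for line in region.get_TextLine():
            # ... dewarp (or pad) the line image into dew_image as above ...
            alt_image = AlternativeImageType(comments=line_xywh['features'] + ',dewarped')
            line.add_AlternativeImage(alt_image)
            result.images.append(OcrdPageResultImage(
                dew_image, region.id + '_' + line.id + '.IMG-DEWARP', alt_image))
    return result
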
level = self.parameter['level-of-operation'] + pcgts = input_pcgts[0] + page = pcgts.get_Page() - for n, input_file in enumerate(self.input_files): - self.logger.info("INPUT FILE %i / %s", n, input_file.pageId or input_file.ID) - file_id = make_file_id(input_file, self.output_file_grp) - - pcgts = page_from_file(self.workspace.download_file(input_file)) - self.add_metadata(pcgts) - page_id = pcgts.pcGtsId or input_file.pageId or input_file.ID - page = pcgts.get_Page() - - page_image, page_coords, page_image_info = self.workspace.image_from_page( - page, page_id, feature_selector='binarized') + page_image, page_coords, page_image_info = self.workspace.image_from_page( + page, page_id, feature_selector='binarized') + zoom = determine_zoom(self.logger, page_id, self.parameter['dpi'], page_image_info) - zoom = determine_zoom(self.logger, page_id, self.parameter['dpi'], page_image_info) - - ignore = (page.get_ImageRegion() + - page.get_LineDrawingRegion() + - page.get_GraphicRegion() + - page.get_ChartRegion() + - page.get_MapRegion() + - page.get_MathsRegion() + - page.get_ChemRegion() + - page.get_MusicRegion() + - page.get_AdvertRegion() + - page.get_NoiseRegion() + - page.get_SeparatorRegion() + - page.get_UnknownRegion() + - page.get_CustomRegion()) - regions = page.get_AllRegions(classes=['Text']) - if not regions: - self.logger.warning('Page "%s" contains no text regions', page_id) - elif level == 'page': - lines = [line for region in regions - for line in region.get_TextLine()] + ignore = (page.get_ImageRegion() + + page.get_LineDrawingRegion() + + page.get_GraphicRegion() + + page.get_ChartRegion() + + page.get_MapRegion() + + page.get_MathsRegion() + + page.get_ChemRegion() + + page.get_MusicRegion() + + page.get_AdvertRegion() + + page.get_NoiseRegion() + + page.get_SeparatorRegion() + + page.get_UnknownRegion() + + page.get_CustomRegion()) + regions = page.get_AllRegions(classes=['Text']) + if not regions: + self.logger.warning('Page "%s" contains no text regions', page_id) + elif level == 'page': + lines = [line for region in regions + for line in region.get_TextLine()] + if lines: + self._process_segment(page, page_image, page_coords, page_id, zoom, lines, ignore) + else: + self.logger.warning('Page "%s" contains no text regions with lines', page_id) + else: + for region in regions: + lines = region.get_TextLine() if lines: - self._process_segment(page, page_image, page_coords, page_id, zoom, lines, ignore) + region_image, region_coords = self.workspace.image_from_segment( + region, page_image, page_coords, feature_selector='binarized') + self._process_segment(region, region_image, region_coords, page_id, zoom, lines, ignore) else: - self.logger.warning('Page "%s" contains no text regions with lines', page_id) - else: - for region in regions: - lines = region.get_TextLine() - if lines: - region_image, region_coords = self.workspace.image_from_segment( - region, page_image, page_coords, feature_selector='binarized') - self._process_segment(region, region_image, region_coords, page_id, zoom, lines, ignore) - else: - self.logger.warning('Page "%s" region "%s" contains no text lines', page_id, region.id) - - # update METS (add the PAGE file): - file_path = join(self.output_file_grp, file_id + '.xml') - pcgts.set_pcGtsId(file_id) - out = self.workspace.add_file( - ID=file_id, - file_grp=self.output_file_grp, - pageId=input_file.pageId, - local_filename=file_path, - mimetype=MIMETYPE_PAGE, - content=to_xml(pcgts)) - self.logger.info('created file ID: %s, file_grp: %s, path: %s', - 
file_id, self.output_file_grp, out.local_filename) - + self.logger.warning('Page "%s" region "%s" contains no text lines', page_id, region.id) + return OcrdPageResult(pcgts) + def _process_segment(self, parent, parent_image, parent_coords, page_id, zoom, lines, ignore): threshold = self.parameter['min_fraction'] method = self.parameter['method'] From 0dd6fbac1a63965d241203cdc1dda85ca1fa4728 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Fri, 16 Aug 2024 11:04:23 +0200 Subject: [PATCH 48/97] ocropy_segment: implement process_page_pcgts --- ocrd_cis/ocropy/segment.py | 314 +++++++++++++++++++++++++++---------- 1 file changed, 229 insertions(+), 85 deletions(-) diff --git a/ocrd_cis/ocropy/segment.py b/ocrd_cis/ocropy/segment.py index 57368fe8..d2a7a727 100644 --- a/ocrd_cis/ocropy/segment.py +++ b/ocrd_cis/ocropy/segment.py @@ -1,6 +1,7 @@ from __future__ import absolute_import from logging import Logger from os.path import join +from typing import Optional import itertools import numpy as np from scipy.sparse.csgraph import minimum_spanning_tree @@ -16,6 +17,7 @@ from ocrd_modelfactory import page_from_file from ocrd_models.ocrd_page import ( to_xml, CoordsType, + OcrdPage, TextLineType, TextRegionType, SeparatorRegionType, @@ -35,6 +37,7 @@ ReadingOrderType ) from ocrd import Processor +from ocrd.processor import OcrdPageResult from ocrd_utils import ( getLogger, make_file_id, @@ -252,6 +255,168 @@ def executable(self): def setup(self): self.logger = getLogger('processor.OcropySegment') + def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional[str] = None) -> OcrdPageResult: + overwrite_lines = self.parameter['overwrite_lines'] + overwrite_regions = self.parameter['overwrite_regions'] + overwrite_separators = self.parameter['overwrite_separators'] + overwrite_order = self.parameter['overwrite_order'] + oplevel = self.parameter['level-of-operation'] + + pcgts = input_pcgts[0] + page = pcgts.get_Page() + + # TODO: also allow grayscale_normalized (try/except?) 
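
Note: one possible reading of the grayscale_normalized TODO above, purely as a sketch of what such a fallback could look like (not what the patch implements, and assuming image_from_page raises when the feature_selector cannot be satisfied, in line with the 'abort if no such image can be produced' comments elsewhere in this series):

    try:
        page_image, page_coords, page_image_info = self.workspace.image_from_page(
            page, page_id, feature_selector='binarized')
    except Exception:
        # hypothetical fallback suggested by the TODO:
        page_image, page_coords, page_image_info = self.workspace.image_from_page(
            page, page_id, feature_selector='grayscale_normalized')
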
+ page_image, page_coords, page_image_info = self.workspace.image_from_page( + page, page_id, feature_selector='binarized') + zoom = determine_zoom(self.logger, page_id, self.parameter['dpi'], page_image_info) + + # aggregate existing regions so their foreground can be ignored + ignore = (page.get_ImageRegion() + + page.get_LineDrawingRegion() + + page.get_GraphicRegion() + + page.get_ChartRegion() + + page.get_MapRegion() + + page.get_MathsRegion() + + page.get_ChemRegion() + + page.get_MusicRegion() + + page.get_AdvertRegion() + + page.get_NoiseRegion() + + page.get_UnknownRegion() + + page.get_CustomRegion()) + if oplevel == 'page' and overwrite_separators: + page.set_SeparatorRegion([]) + else: + ignore.extend(page.get_SeparatorRegion()) + # prepare reading order + reading_order = dict() + ro = page.get_ReadingOrder() + if ro: + rogroup = ro.get_OrderedGroup() or ro.get_UnorderedGroup() + if rogroup: + page_get_reading_order(reading_order, rogroup) + # get segments to process / overwrite + if oplevel == 'page': + ignore.extend(page.get_TableRegion()) + regions = list(page.get_TextRegion()) + if regions: + # page is already region-segmented + if overwrite_regions: + self.logger.info(f'Removing existing TextRegions in page "{page_id}"') + # we could remove all other region types as well, + # but this is more flexible (for workflows with + # specialized separator/image/table detectors): + page.set_TextRegion([]) + page.set_ReadingOrder(None) + ro = None + else: + self.logger.warning(f'Keeping existing TextRegions in page "{page_id}"') + ignore.extend(regions) + # create reading order if necessary + if not ro or overwrite_order: + ro = ReadingOrderType() + page.set_ReadingOrder(ro) + rogroup = ro.get_OrderedGroup() or ro.get_UnorderedGroup() + if not rogroup: + # new top-level group + rogroup = OrderedGroupType(id="reading-order") + ro.set_OrderedGroup(rogroup) + # go get TextRegions with TextLines (and SeparatorRegions): + self._process_element( + page, ignore, page_image, page_coords, page_id, file_id, page_id, zoom, rogroup=rogroup) + if (not rogroup.get_RegionRefIndexed() and + not rogroup.get_OrderedGroupIndexed() and + not rogroup.get_UnorderedGroupIndexed()): + # schema forbids empty OrderedGroup + ro.set_OrderedGroup(None) + elif oplevel == 'table': + ignore.extend(page.get_TextRegion()) + regions = list(page.get_TableRegion()) + if not regions: + self.logger.warning(f'Page "{page_id}" contains no table regions') + for region in regions: + subregions = region.get_TextRegion() + if subregions: + # table is already cell-segmented + if overwrite_regions: + self.logger.info(f'Removing existing TextRegions in table "{region.id}"') + region.set_TextRegion([]) + roelem = reading_order.get(region.id) + # replace by empty group with same index and ref + # (which can then take the cells as subregions) + reading_order[region.id] = page_subgroup_in_reading_order(self.logger, roelem) + else: + self.logger.warning(f'Skipping table "{region.id}" with existing TextRegions') + continue + # TODO: also allow grayscale_normalized (try/except?) 
+ region_image, region_coords = self.workspace.image_from_segment( + region, page_image, page_coords, feature_selector='binarized') + # ignore everything but the current table region + subignore = regions + ignore + subignore.remove(region) + # create reading order group if necessary + roelem = reading_order.get(region.id) + if not roelem: + self.logger.warning(f"Page '{page_id}' table region '{region.id}' is not referenced in reading " + f"order (no target to add cells to)") + elif overwrite_order: + # replace by empty ordered group with same (index and) ref + # (which can then take the cells as subregions) + roelem = page_subgroup_in_reading_order(self.logger, roelem) + reading_order[region.id] = roelem + elif isinstance(roelem, (OrderedGroupType, OrderedGroupIndexedType)): + self.logger.warning(f"Page '{page_id}' table region '{region.id}' already has an ordered " + f"group (cells will be appended)") + elif isinstance(roelem, (UnorderedGroupType, UnorderedGroupIndexedType)): + self.logger.warning(f"Page '{page_id}' table region '{region.id}' already has an unordered " + f"group (cells will not be appended)") + roelem = None + else: + # replace regionRef(Indexed) by group with same index and ref + # (which can then take the cells as subregions) + roelem = page_subgroup_in_reading_order(self.logger, roelem) + reading_order[region.id] = roelem + # go get TextRegions with TextLines (and SeparatorRegions) + self._process_element( + region, subignore, region_image, region_coords, region.id, file_id + '_' + region.id, + page_id, zoom, rogroup=roelem) + else: # 'region' + regions = list(page.get_TextRegion()) + # besides top-level text regions, line-segment any table cells, + # and for tables without any cells, add a pseudo-cell + for region in page.get_TableRegion(): + subregions = region.get_TextRegion() + if subregions: + regions.extend(subregions) + else: + subregion = TextRegionType( + id=region.id + '_text', + Coords=region.get_Coords(), + # as if generated from parser: + parent_object_=region) + region.add_TextRegion(subregion) + regions.append(subregion) + if not regions: + self.logger.warning('Page "%s" contains no text regions', page_id) + for region in regions: + if region.get_TextLine(): + if overwrite_lines: + self.logger.info(f'Removing existing TextLines in page "{page_id}" region "{region.id}"') + region.set_TextLine([]) + else: + self.logger.warning(f'Keeping existing TextLines in page "{page_id}" region "{region.id}"') + ignore.extend(region.get_TextLine()) + # TODO: also allow grayscale_normalized (try/except?) + region_image, region_coords = self.workspace.image_from_segment( + region, page_image, page_coords, feature_selector='binarized') + # if the region images have already been clipped against their neighbours specifically, + # then we don't need to suppress all neighbours' foreground generally here + if 'clipped' in region_coords['features'].split(','): + ignore = [] + # go get TextLines + self._process_element( + region, ignore, region_image, region_coords, region.id, file_id + '_' + region.id, page_id, zoom) + return OcrdPageResult(pcgts) + def process(self): """Segment pages into regions+lines, tables into cells+lines, or regions into lines. @@ -335,7 +500,7 @@ def process(self): self.add_metadata(pcgts) page_id = pcgts.pcGtsId or input_file.pageId or input_file.ID # (PageType has no id) page = pcgts.get_Page() - + # TODO: also allow grayscale_normalized (try/except?) 
page_image, page_coords, page_image_info = self.workspace.image_from_page( page, page_id, feature_selector='binarized') @@ -521,15 +686,14 @@ def _process_element(self, element, ignore, image, coords, element_id, file_id, newly detected separators to guide region segmentation. """ if not image.width or not image.height: - self.logger.warning("Skipping '%s' with zero size", element_id) + self.logger.warning(f"Skipping '{element_id}' with zero size") return element_array = pil2array(image) element_bin = np.array(element_array <= midrange(element_array), bool) sep_bin = np.zeros_like(element_bin, bool) ignore_labels = np.zeros_like(element_bin, int) for i, segment in enumerate(ignore): - self.logger.debug('masking foreground of %s "%s" for "%s"', - type(segment).__name__[:-4], segment.id, element_id) + self.logger.debug(f'Masking foreground of {type(segment).__name__[:-4]} "{segment.id}" for "{element_id}"') # mark these segments (e.g. separator regions, tables, images) # for workflows where they have been detected already; # these will be: @@ -540,13 +704,11 @@ def _process_element(self, element, ignore, image, coords, element_id, file_id, # negative/above-max indices), either fully or partially, # then this will silently ignore them. The caller does # not need to concern herself with this. + sp_row = segment_polygon[:, 1] + sp_column = segment_polygon[:, 0] if isinstance(segment, SeparatorRegionType): - sep_bin[draw.polygon(segment_polygon[:, 1], - segment_polygon[:, 0], - sep_bin.shape)] = True - ignore_labels[draw.polygon(segment_polygon[:, 1], - segment_polygon[:, 0], - ignore_labels.shape)] = i+1 # mapped back for RO + sep_bin[draw.polygon(sp_row, sp_column, sep_bin.shape)] = True + ignore_labels[draw.polygon(sp_row, sp_column, ignore_labels.shape)] = i + 1 # mapped back for RO if isinstance(element, PageType): element_name = 'page' fullpage = True @@ -562,7 +724,7 @@ def _process_element(self, element, ignore, image, coords, element_id, file_id, element_name = 'region' fullpage = False report = check_region(element_bin, zoom) - self.logger.info('computing line segmentation for %s "%s"', element_name, element_id) + self.logger.info(f'Computing line segmentation for {element_name} "{element_id}"') # TODO: we should downscale if DPI is large enough to save time try: if report: @@ -570,9 +732,9 @@ def _process_element(self, element, ignore, image, coords, element_id, file_id, line_labels, baselines, seplines, images, colseps, scale = compute_segmentation( # suppress separators and ignored regions for textline estimation # but keep them for h/v-line detection (in fullpage mode): - element_bin, seps=(sep_bin+ignore_labels)>0, + element_bin, seps=(sep_bin + ignore_labels) > 0, zoom=zoom, fullpage=fullpage, - spread_dist=round(self.parameter['spread']/zoom*300/72), # in pt + spread_dist=round(self.parameter['spread'] / zoom * 300 / 72), # in pt # these are ignored when not in fullpage mode: maxcolseps=self.parameter['maxcolseps'], maxseps=self.parameter['maxseps'], @@ -580,16 +742,13 @@ def _process_element(self, element, ignore, image, coords, element_id, file_id, csminheight=self.parameter['csminheight']) except Exception as err: if isinstance(element, TextRegionType): - self.logger.error('Cannot line-segment region "%s": %s', element_id, err) + self.logger.error(f'Cannot line-segment region "{element_id}": {err}') # as a fallback, add a single text line comprising the whole region: element.add_TextLine(TextLineType(id=element_id + "_line", Coords=element.get_Coords())) else: - 
self.logger.error('Cannot line-segment %s "%s": %s', element_name, element_id, err) + self.logger.error(f'Cannot line-segment {element_name} "{element_id}": {err}') return - - self.logger.info('Found %d text lines for %s "%s"', - len(np.unique(line_labels)) - 1, - element_name, element_id) + self.logger.info(f'Found {len(np.unique(line_labels)) - 1} text lines for {element_name} "{element_id}"') # post-process line labels if isinstance(element, (PageType, TableRegionType)): # aggregate text lines to text regions @@ -598,31 +757,28 @@ def _process_element(self, element, ignore, image, coords, element_id, file_id, # i.e. identical line and region labels # to detect their reading order among the others # (these cannot be split or grouped together with other regions) - line_labels = np.where(line_labels, line_labels+len(ignore), ignore_labels) + line_labels = np.where(line_labels, line_labels + len(ignore), ignore_labels) # suppress separators/images in fg and try to use for partitioning slices sepmask = np.maximum(sep_bin, np.maximum(seplines > 0, images > 0)) region_labels = lines2regions( element_bin, line_labels, rlabels=ignore_labels, - sepmask=np.maximum(sepmask, colseps), # add bg + sepmask=np.maximum(sepmask, colseps), # add bg # decide horizontal vs vertical cut when gaps of similar size prefer_vertical=not isinstance(element, TableRegionType), gap_height=self.parameter['gap_height'], gap_width=self.parameter['gap_width'], scale=scale, zoom=zoom) - self.logger.info('Found %d text regions for %s "%s"', - len(np.unique(region_labels)) - 1, - element_name, element_id) + self.logger.info( + f'Found {len(np.unique(region_labels)) - 1} text regions for {element_name} "{element_id}"') except Exception as err: - self.logger.error('Cannot region-segment %s "%s": %s', - element_name, element_id, err) + self.logger.error(f'Cannot region-segment {element_name} "{element_id}": {err}') region_labels = np.where(line_labels > len(ignore), 1 + len(ignore), line_labels) - # prepare reading order group index if rogroup: if isinstance(rogroup, (OrderedGroupType, OrderedGroupIndexedType)): index = 0 - # start counting from largest existing index + # start counting from the largest existing index for elem in (rogroup.get_RegionRefIndexed() + rogroup.get_OrderedGroupIndexed() + rogroup.get_UnorderedGroupIndexed()): @@ -634,7 +790,7 @@ def _process_element(self, element, ignore, image, coords, element_id, file_id, region_no = 0 for region_label in np.unique(region_labels): if not region_label: - continue # no bg + continue # no bg region_mask = region_labels == region_label region_line_labels = line_labels * region_mask region_line_labels0 = np.setdiff1d(region_line_labels, [0]) @@ -643,13 +799,12 @@ def _process_element(self, element, ignore, image, coords, element_id, file_id, # (no new region, no actual text lines) region_line_labels0 = np.intersect1d(region_line_labels0, ignore_labels) assert len(region_line_labels0) == 1, \ - "region label %d has both existing regions and new lines (%s)" % ( - region_label, str(region_line_labels0)) + (f"Region label {region_label} has both existing regions and new lines " + f"({str(region_line_labels0)})") region = ignore[region_line_labels0[0] - 1] if rogroup and region.parent_object_ is element and not isinstance(region, SeparatorRegionType): index = page_add_to_reading_order(rogroup, region.id, index) - self.logger.debug('Region label %d is for ignored region "%s"', - region_label, region.id) + self.logger.debug(f'Region label {region_label} is for ignored region 
"{region.id}"') continue # normal case: new lines inside new regions # remove binary-empty labels, and re-order locally @@ -657,18 +812,18 @@ def _process_element(self, element, ignore, image, coords, element_id, file_id, order[np.setdiff1d(region_line_labels0, element_bin * region_line_labels)] = 0 region_line_labels = order[region_line_labels] # avoid horizontal gaps - region_line_labels = hmerge_line_seeds(element_bin, region_line_labels, scale, - seps=np.maximum(sepmask, colseps)) + region_line_labels = hmerge_line_seeds( + element_bin, region_line_labels, scale, seps=np.maximum(sepmask, colseps)) region_mask |= region_line_labels > 0 # find contours for region (can be non-contiguous) - regions, _ = masks2polygons(self.logger, region_mask * region_label, None, element_bin, - '%s "%s"' % (element_name, element_id), - min_area=6000/zoom/zoom, - simplify=ignore_labels * ~(sep_bin)) + regions, _ = masks2polygons( + self.logger, region_mask * region_label, None, element_bin, + name=f'{element_name} "{element_id}"', min_area=6000 / zoom / zoom, + simplify=ignore_labels * ~(sep_bin)) # find contours for lines (can be non-contiguous) - lines, _ = masks2polygons(self.logger, region_line_labels, baselines, element_bin, - 'region "%s"' % element_id, - min_area=640/zoom/zoom) + lines, _ = masks2polygons( + self.logger, region_line_labels, baselines, element_bin, + name=f'region "{element_id}"', min_area=640 / zoom / zoom) # create new lines in new regions (allocating by intersection) line_polys = [Polygon(polygon) for _, polygon, _ in lines] for _, region_polygon, _ in regions: @@ -677,34 +832,31 @@ def _process_element(self, element, ignore, image, coords, element_id, file_id, region_polygon = coordinates_for_segment(region_polygon, image, coords) region_polygon = polygon_for_parent(region_polygon, element) if region_polygon is None: - self.logger.warning('Ignoring extant region contour for region label %d', region_label) + self.logger.warning(f'Ignoring extant region contour for region label {region_label}') continue # annotate result: region_no += 1 region_id = element_id + "_region%04d" % region_no - self.logger.debug('Region label %d becomes ID "%s"', region_label, region_id) - region = TextRegionType( - id=region_id, Coords=CoordsType( - points=points_from_polygon(region_polygon))) + self.logger.debug(f'Region label {region_label} becomes ID "{region_id}"') + region = TextRegionType(id=region_id, Coords=CoordsType(points=points_from_polygon(region_polygon))) # find out which line (contours) belong to which region (contours) line_no = 0 for i, line_poly in enumerate(line_polys): - if not region_poly.intersects(line_poly): # .contains + if not region_poly.intersects(line_poly): # .contains continue line_label, line_polygon, line_baseline = lines[i] # convert back to absolute (page) coordinates: line_polygon = coordinates_for_segment(line_polygon, image, coords) line_polygon = polygon_for_parent(line_polygon, region) if line_polygon is None: - self.logger.warning('Ignoring extant line contour for region label %d line label %d', - region_label, line_label) + self.logger.warning( + f'Ignoring extant line contour for region label {region_label} line label {line_label}') continue # annotate result: line_no += 1 line_id = region_id + "_line%04d" % line_no - self.logger.debug('Line label %d becomes ID "%s"', line_label, line_id) - line = TextLineType(id=line_id, - Coords=CoordsType(points=points_from_polygon(line_polygon))) + self.logger.debug(f'Line label {line_label} becomes ID "{line_id}"') + line 
= TextLineType(id=line_id, Coords=CoordsType(points=points_from_polygon(line_polygon))) if line_baseline: line_baseline = coordinates_for_segment(line_baseline, image, coords) line.set_Baseline(BaselineType(points=points_from_polygon(line_baseline))) @@ -712,95 +864,87 @@ def _process_element(self, element, ignore, image, coords, element_id, file_id, # if the region has received text lines, keep it if region.get_TextLine(): element.add_TextRegion(region) - self.logger.info('Added region "%s" with %d lines for %s "%s"', - region_id, line_no, element_name, element_id) + self.logger.info( + f'Added region "{region_id}" with {line_no} lines for {element_name} "{element_id}"') if rogroup: index = page_add_to_reading_order(rogroup, region.id, index) # add additional image/non-text regions from compute_segmentation # (e.g. drop-capitals or images) ... - self.logger.info('Found %d large image regions for %s "%s"', images.max(), element_name, element_id) + self.logger.info(f'Found {images.max()} large image regions for {element_name} "{element_id}"') # find contours around region labels (can be non-contiguous): - image_polygons, _ = masks2polygons(self.logger, images, None, element_bin, - '%s "%s"' % (element_name, element_id)) + image_polygons, _ = masks2polygons( + self.logger, images, None, element_bin, f'{element_name} "{element_id}"') for image_label, polygon, _ in image_polygons: # convert back to absolute (page) coordinates: region_polygon = coordinates_for_segment(polygon, image, coords) region_polygon = polygon_for_parent(region_polygon, element) if region_polygon is None: - self.logger.warning('Ignoring extant region contour for image label %d', image_label) + self.logger.warning(f'Ignoring extant region contour for image label {image_label}') continue region_no += 1 # annotate result: region_id = element_id + "_image%04d" % region_no element.add_ImageRegion(ImageRegionType( - id=region_id, Coords=CoordsType( - points=points_from_polygon(region_polygon)))) + id=region_id, Coords=CoordsType(points=points_from_polygon(region_polygon)))) # split detected separator labels into separator regions: - self.logger.info('Found %d separators for %s "%s"', seplines.max(), element_name, element_id) + self.logger.info(f'Found {seplines.max()} separators for {element_name} "{element_id}"') # find contours around region labels (can be non-contiguous): - sep_polygons, _ = masks2polygons(self.logger, seplines, None, element_bin, - '%s "%s"' % (element_name, element_id), - open_holes=True, reorder=False) + sep_polygons, _ = masks2polygons( + self.logger, seplines, None, element_bin, + name=f'{element_name} "{element_id}"', open_holes=True, reorder=False) for sep_label, polygon, _ in sep_polygons: # convert back to absolute (page) coordinates: region_polygon = coordinates_for_segment(polygon, image, coords) region_polygon = polygon_for_parent(region_polygon, element) if region_polygon is None: - self.logger.warning('Ignoring extant region contour for separator %d', sep_label) + self.logger.warning(f'Ignoring extant region contour for separator {sep_label}') continue # annotate result: region_no += 1 region_id = element_id + "_sep%04d" % region_no element.add_SeparatorRegion(SeparatorRegionType( - id=region_id, Coords=CoordsType( - points=points_from_polygon(region_polygon)))) + id=region_id, Coords=CoordsType(points=points_from_polygon(region_polygon)))) # annotate a text/image-separated image element_array[sepmask] = np.amax(element_array) # clip to white/bg image_clipped = array2pil(element_array) 
file_path = self.workspace.save_image_file( - image_clipped, file_id + '.IMG-CLIP', self.output_file_grp, - page_id=page_id) + image_clipped, file_id + '.IMG-CLIP', self.output_file_grp, page_id=page_id) element.add_AlternativeImage(AlternativeImageType( filename=file_path, comments=coords['features'] + ',clipped')) else: - # get mask from region polygon: + # get mask from region polygon: region_polygon = coordinates_of_segment(element, image, coords) region_mask = np.zeros_like(element_bin, bool) - region_mask[draw.polygon(region_polygon[:, 1], - region_polygon[:, 0], - region_mask.shape)] = True + region_mask[draw.polygon(region_polygon[:, 1], region_polygon[:, 0], region_mask.shape)] = True # ensure the new line labels do not extrude from the region: line_labels = line_labels * region_mask # find contours around labels (can be non-contiguous): - line_polygons, _ = masks2polygons(self.logger, line_labels, baselines, element_bin, - 'region "%s"' % element_id, - min_area=640/zoom/zoom) + line_polygons, _ = masks2polygons( + self.logger, line_labels, baselines, element_bin, + name=f'region "{element_id}"', min_area=640 / zoom / zoom) line_no = 0 for line_label, polygon, baseline in line_polygons: # convert back to absolute (page) coordinates: line_polygon = coordinates_for_segment(polygon, image, coords) line_polygon = polygon_for_parent(line_polygon, element) if line_polygon is None: - self.logger.warning('Ignoring extant line contour for line label %d', - line_label) + self.logger.warning(f'Ignoring extant line contour for line label {line_label}') continue # annotate result: line_no += 1 line_id = element_id + "_line%04d" % line_no - line = TextLineType(id=line_id, - Coords=CoordsType(points=points_from_polygon(line_polygon))) + line = TextLineType(id=line_id, Coords=CoordsType(points=points_from_polygon(line_polygon))) if baseline: line_baseline = coordinates_for_segment(baseline, image, coords) line.set_Baseline(BaselineType(points=points_from_polygon(line_baseline))) element.add_TextLine(line) if not sep_bin.any(): - return # no derived image + return # no derived image # annotate a text/image-separated image - element_array[sep_bin] = np.amax(element_array) # clip to white/bg + element_array[sep_bin] = np.amax(element_array) # clip to white/bg image_clipped = array2pil(element_array) file_path = self.workspace.save_image_file( - image_clipped, file_id + '.IMG-CLIP', self.output_file_grp, - page_id=page_id) + image_clipped, file_id + '.IMG-CLIP', self.output_file_grp, page_id=page_id) # update PAGE (reference the image file): element.add_AlternativeImage(AlternativeImageType( filename=file_path, comments=coords['features'] + ',clipped')) From ad5ac7c4ab7f2b52bf313563456feca0094761ce Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Fri, 16 Aug 2024 11:06:01 +0200 Subject: [PATCH 49/97] ocropy_segment: remove process --- ocrd_cis/ocropy/segment.py | 317 ++++++++----------------------------- 1 file changed, 67 insertions(+), 250 deletions(-) diff --git a/ocrd_cis/ocropy/segment.py b/ocrd_cis/ocropy/segment.py index d2a7a727..94b6ab1f 100644 --- a/ocrd_cis/ocropy/segment.py +++ b/ocrd_cis/ocropy/segment.py @@ -256,6 +256,73 @@ def setup(self): self.logger = getLogger('processor.OcropySegment') def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional[str] = None) -> OcrdPageResult: + """Segment pages into regions+lines, tables into cells+lines, or regions into lines. 
+ Open and deserialise PAGE input files and their respective images, + then iterate over the element hierarchy down to the requested level. + + Depending on ``level-of-operation``, consider existing segments: + - If ``overwrite_separators=True`` on ``page`` level, then + delete any SeparatorRegions. + - If ``overwrite_regions=True`` on ``page`` level, then + delete any top-level TextRegions (along with ReadingOrder). + - If ``overwrite_regions=True`` on ``table`` level, then + delete any TextRegions in TableRegions (along with their OrderGroup). + - If ``overwrite_lines=True`` on ``region`` level, then + delete any TextLines in TextRegions. + - If ``overwrite_order=True`` on ``page`` or ``table`` level, then + delete the reading order OrderedGroup entry corresponding + to the (page/table) segment. + + Next, get each element image according to the layout annotation (from + the alternative image of the page/region, or by cropping via coordinates + into the higher-level image) in binarized form, and represent it as an array + with non-text regions and (remaining) text neighbours suppressed. + + Then compute a text line segmentation for that array (as a label mask). + When ``level-of-operation`` is ``page`` or ``table``, this also entails + detecting + - up to ``maximages`` large foreground images, + - up to ``maxseps`` foreground line separators and + - up to ``maxcolseps`` background column separators + before text line segmentation itself, as well as aggregating text lines + to text regions afterwards. + + Text regions are detected via a hybrid variant recursive X-Y cut algorithm + (RXYC): RXYC partitions the binarized image in top-down manner by detecting + horizontal or vertical gaps. This implementation uses the bottom-up text line + segmentation to guide the search, and also uses both pre-existing and newly + detected separators to alternatively partition the respective boxes into + non-rectangular parts. + + During line segmentation, suppress the foreground of all previously annotated + regions (of any kind) and lines, except if just removed due to ``overwrite``. + During region aggregation however, combine the existing separators with the + new-found separators to guide the column search. + + All detected segments (both text line and text region) are sorted according + to their reading order (assuming a top-to-bottom, left-to-right ordering). + When ``level-of-operation`` is ``page``, prefer vertical (column-first) + succession of regions. When it is ``table``, prefer horizontal (row-first) + succession of cells. + + Then for each resulting segment label, convert its background mask into + polygon outlines by finding the outer contours consistent with the element's + polygon outline. Annotate the result by adding it as a new TextLine/TextRegion: + - If ``level-of-operation`` is ``region``, then append the new lines to the + parent region. + - If it is ``table``, then append the new lines to their respective regions, + and append the new regions to the parent table. + (Also, create an OrderedGroup for it as the parent's RegionRef.) + - If it is ``page``, then append the new lines to their respective regions, + and append the new regions to the page. + (Also, create an OrderedGroup for it in the ReadingOrder.) + + Produce a new output file by serialising the resulting hierarchy. 
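
Note: the docstring summarises the recursive X-Y cut idea only briefly. As a toy illustration of plain RXYC (the hybrid variant in lines2regions is guided by line seeds and separators and is considerably more involved), here is a self-contained sketch that recursively splits a binary page array at its widest empty gap; all names are illustrative, none of this is ocrd_cis code:

    import numpy as np

    def rxyc(binary, y0, y1, x0, x1, min_gap=10, boxes=None):
        """Recursively split binary[y0:y1, x0:x1] at the widest empty gap."""
        if boxes is None:
            boxes = []
        block = binary[y0:y1, x0:x1]
        for axis in (0, 1):                      # 0: horizontal cut, 1: vertical cut
            profile = block.sum(axis=1 - axis)   # foreground per row / per column
            empty = np.flatnonzero(profile == 0)
            if empty.size:
                # group consecutive empty rows/columns into runs, take the widest:
                runs = np.split(empty, np.flatnonzero(np.diff(empty) > 1) + 1)
                run = max(runs, key=len)
                if len(run) >= min_gap and run[0] > 0 and run[-1] < len(profile) - 1:
                    cut0, cut1 = int(run[0]), int(run[-1]) + 1
                    if axis == 0:
                        rxyc(binary, y0, y0 + cut0, x0, x1, min_gap, boxes)
                        rxyc(binary, y0 + cut1, y1, x0, x1, min_gap, boxes)
                    else:
                        rxyc(binary, y0, y1, x0, x0 + cut0, min_gap, boxes)
                        rxyc(binary, y0, y1, x0 + cut1, x1, min_gap, boxes)
                    return boxes
        # no admissible gap in either direction: this block is a leaf region
        boxes.append((y0, y1, x0, x1))
        return boxes

    # usage on a binary numpy array (foreground == 1):
    # boxes = rxyc(page_bin, 0, page_bin.shape[0], 0, page_bin.shape[1])
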
+ """ + # FIXME: allow passing a-priori info on reading order / textline order + # (and then pass on as ``bt`` and ``rl``; however, there may be a mixture + # of different scripts; also, vertical writing needs internal rotation + # because our line segmentation only works for horizontal writing) overwrite_lines = self.parameter['overwrite_lines'] overwrite_regions = self.parameter['overwrite_regions'] overwrite_separators = self.parameter['overwrite_separators'] @@ -417,256 +484,6 @@ def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional region, ignore, region_image, region_coords, region.id, file_id + '_' + region.id, page_id, zoom) return OcrdPageResult(pcgts) - def process(self): - """Segment pages into regions+lines, tables into cells+lines, or regions into lines. - - Open and deserialise PAGE input files and their respective images, - then iterate over the element hierarchy down to the requested level. - - Depending on ``level-of-operation``, consider existing segments: - - If ``overwrite_separators=True`` on ``page`` level, then - delete any SeparatorRegions. - - If ``overwrite_regions=True`` on ``page`` level, then - delete any top-level TextRegions (along with ReadingOrder). - - If ``overwrite_regions=True`` on ``table`` level, then - delete any TextRegions in TableRegions (along with their OrderGroup). - - If ``overwrite_lines=True`` on ``region`` level, then - delete any TextLines in TextRegions. - - If ``overwrite_order=True`` on ``page`` or ``table`` level, then - delete the reading order OrderedGroup entry corresponding - to the (page/table) segment. - - Next, get each element image according to the layout annotation (from - the alternative image of the page/region, or by cropping via coordinates - into the higher-level image) in binarized form, and represent it as an array - with non-text regions and (remaining) text neighbours suppressed. - - Then compute a text line segmentation for that array (as a label mask). - When ``level-of-operation`` is ``page`` or ``table``, this also entails - detecting - - up to ``maximages`` large foreground images, - - up to ``maxseps`` foreground line separators and - - up to ``maxcolseps`` background column separators - before text line segmentation itself, as well as aggregating text lines - to text regions afterwards. - - Text regions are detected via a hybrid variant recursive X-Y cut algorithm - (RXYC): RXYC partitions the binarized image in top-down manner by detecting - horizontal or vertical gaps. This implementation uses the bottom-up text line - segmentation to guide the search, and also uses both pre-existing and newly - detected separators to alternatively partition the respective boxes into - non-rectangular parts. - - During line segmentation, suppress the foreground of all previously annotated - regions (of any kind) and lines, except if just removed due to ``overwrite``. - During region aggregation however, combine the existing separators with the - new-found separators to guide the column search. - - All detected segments (both text line and text region) are sorted according - to their reading order (assuming a top-to-bottom, left-to-right ordering). - When ``level-of-operation`` is ``page``, prefer vertical (column-first) - succession of regions. When it is ``table``, prefer horizontal (row-first) - succession of cells. - - Then for each resulting segment label, convert its background mask into - polygon outlines by finding the outer contours consistent with the element's - polygon outline. 
Annotate the result by adding it as a new TextLine/TextRegion: - - If ``level-of-operation`` is ``region``, then append the new lines to the - parent region. - - If it is ``table``, then append the new lines to their respective regions, - and append the new regions to the parent table. - (Also, create an OrderedGroup for it as the parent's RegionRef.) - - If it is ``page``, then append the new lines to their respective regions, - and append the new regions to the page. - (Also, create an OrderedGroup for it in the ReadingOrder.) - - Produce a new output file by serialising the resulting hierarchy. - """ - # FIXME: allow passing a-priori info on reading order / textline order - # (and then pass on as ``bt`` and ``rl``; however, there may be a mixture - # of different scripts; also, vertical writing needs internal rotation - # because our line segmentation only works for horizontal writing) - overwrite_lines = self.parameter['overwrite_lines'] - overwrite_regions = self.parameter['overwrite_regions'] - overwrite_separators = self.parameter['overwrite_separators'] - overwrite_order = self.parameter['overwrite_order'] - oplevel = self.parameter['level-of-operation'] - - for (n, input_file) in enumerate(self.input_files): - self.logger.info("INPUT FILE %i / %s", n, input_file.pageId or input_file.ID) - file_id = make_file_id(input_file, self.output_file_grp) - - pcgts = page_from_file(self.workspace.download_file(input_file)) - self.add_metadata(pcgts) - page_id = pcgts.pcGtsId or input_file.pageId or input_file.ID # (PageType has no id) - page = pcgts.get_Page() - - # TODO: also allow grayscale_normalized (try/except?) - page_image, page_coords, page_image_info = self.workspace.image_from_page( - page, page_id, feature_selector='binarized') - zoom = determine_zoom(self.logger, page_id, self.parameter['dpi'], page_image_info) - - # aggregate existing regions so their foreground can be ignored - ignore = (page.get_ImageRegion() + - page.get_LineDrawingRegion() + - page.get_GraphicRegion() + - page.get_ChartRegion() + - page.get_MapRegion() + - page.get_MathsRegion() + - page.get_ChemRegion() + - page.get_MusicRegion() + - page.get_AdvertRegion() + - page.get_NoiseRegion() + - page.get_UnknownRegion() + - page.get_CustomRegion()) - if oplevel == 'page' and overwrite_separators: - page.set_SeparatorRegion([]) - else: - ignore.extend(page.get_SeparatorRegion()) - # prepare reading order - reading_order = dict() - ro = page.get_ReadingOrder() - if ro: - rogroup = ro.get_OrderedGroup() or ro.get_UnorderedGroup() - if rogroup: - page_get_reading_order(reading_order, rogroup) - - # get segments to process / overwrite - if oplevel == 'page': - ignore.extend(page.get_TableRegion()) - regions = list(page.get_TextRegion()) - if regions: - # page is already region-segmented - if overwrite_regions: - self.logger.info('removing existing TextRegions in page "%s"', page_id) - # we could remove all other region types as well, - # but this is more flexible (for workflows with - # specialized separator/image/table detectors): - page.set_TextRegion([]) - page.set_ReadingOrder(None) - ro = None - else: - self.logger.warning('keeping existing TextRegions in page "%s"', page_id) - ignore.extend(regions) - # create reading order if necessary - if not ro or overwrite_order: - ro = ReadingOrderType() - page.set_ReadingOrder(ro) - rogroup = ro.get_OrderedGroup() or ro.get_UnorderedGroup() - if not rogroup: - # new top-level group - rogroup = OrderedGroupType(id="reading-order") - ro.set_OrderedGroup(rogroup) - # go get 
TextRegions with TextLines (and SeparatorRegions): - self._process_element(page, ignore, page_image, page_coords, - page_id, file_id, - input_file.pageId, zoom, rogroup=rogroup) - if (not rogroup.get_RegionRefIndexed() and - not rogroup.get_OrderedGroupIndexed() and - not rogroup.get_UnorderedGroupIndexed()): - # schema forbids empty OrderedGroup - ro.set_OrderedGroup(None) - elif oplevel == 'table': - ignore.extend(page.get_TextRegion()) - regions = list(page.get_TableRegion()) - if not regions: - self.logger.warning('Page "%s" contains no table regions', page_id) - for region in regions: - subregions = region.get_TextRegion() - if subregions: - # table is already cell-segmented - if overwrite_regions: - self.logger.info('removing existing TextRegions in table "%s"', region.id) - region.set_TextRegion([]) - roelem = reading_order.get(region.id) - # replace by empty group with same index and ref - # (which can then take the cells as subregions) - reading_order[region.id] = page_subgroup_in_reading_order(self.logger, roelem) - else: - self.logger.warning('skipping table "%s" with existing TextRegions', region.id) - continue - # TODO: also allow grayscale_normalized (try/except?) - region_image, region_coords = self.workspace.image_from_segment( - region, page_image, page_coords, feature_selector='binarized') - # ignore everything but the current table region - subignore = regions + ignore - subignore.remove(region) - # create reading order group if necessary - roelem = reading_order.get(region.id) - if not roelem: - self.logger.warning("Page '%s' table region '%s' is not referenced in reading order (%s)", - page_id, region.id, "no target to add cells to") - elif overwrite_order: - # replace by empty ordered group with same (index and) ref - # (which can then take the cells as subregions) - roelem = page_subgroup_in_reading_order(self.logger, roelem) - reading_order[region.id] = roelem - elif isinstance(roelem, (OrderedGroupType, OrderedGroupIndexedType)): - self.logger.warning("Page '%s' table region '%s' already has an ordered group (%s)", - page_id, region.id, "cells will be appended") - elif isinstance(roelem, (UnorderedGroupType, UnorderedGroupIndexedType)): - self.logger.warning("Page '%s' table region '%s' already has an unordered group (%s)", - page_id, region.id, "cells will not be appended") - roelem = None - else: - # replace regionRef(Indexed) by group with same index and ref - # (which can then take the cells as subregions) - roelem = page_subgroup_in_reading_order(self.logger, roelem) - reading_order[region.id] = roelem - # go get TextRegions with TextLines (and SeparatorRegions) - self._process_element(region, subignore, region_image, region_coords, - region.id, file_id + '_' + region.id, - input_file.pageId, zoom, rogroup=roelem) - else: # 'region' - regions = list(page.get_TextRegion()) - # besides top-level text regions, line-segment any table cells, - # and for tables without any cells, add a pseudo-cell - for region in page.get_TableRegion(): - subregions = region.get_TextRegion() - if subregions: - regions.extend(subregions) - else: - subregion = TextRegionType(id=region.id + '_text', - Coords=region.get_Coords(), - # as if generated from parser: - parent_object_=region) - region.add_TextRegion(subregion) - regions.append(subregion) - if not regions: - self.logger.warning('Page "%s" contains no text regions', page_id) - for region in regions: - if region.get_TextLine(): - if overwrite_lines: - self.logger.info('removing existing TextLines in page "%s" region "%s"', 
page_id, region.id) - region.set_TextLine([]) - else: - self.logger.warning('keeping existing TextLines in page "%s" region "%s"', page_id, region.id) - ignore.extend(region.get_TextLine()) - # TODO: also allow grayscale_normalized (try/except?) - region_image, region_coords = self.workspace.image_from_segment( - region, page_image, page_coords, feature_selector='binarized') - # if the region images have already been clipped against their neighbours specifically, - # then we don't need to suppress all neighbours' foreground generally here - if 'clipped' in region_coords['features'].split(','): - ignore = [] - # go get TextLines - self._process_element(region, ignore, region_image, region_coords, - region.id, file_id + '_' + region.id, - input_file.pageId, zoom) - - # update METS (add the PAGE file): - file_path = join(self.output_file_grp, file_id + '.xml') - pcgts.set_pcGtsId(file_id) - out = self.workspace.add_file( - ID=file_id, - file_grp=self.output_file_grp, - pageId=input_file.pageId, - local_filename=file_path, - mimetype=MIMETYPE_PAGE, - content=to_xml(pcgts)) - self.logger.info('created file ID: %s, file_grp: %s, path: %s', - file_id, self.output_file_grp, out.local_filename) - def _process_element(self, element, ignore, image, coords, element_id, file_id, page_id, zoom=1.0, rogroup=None): """Add PAGE layout elements by segmenting an image. From 5d4007be9ec0e352520995302bd8b11e92e51aae Mon Sep 17 00:00:00 2001 From: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> Date: Fri, 16 Aug 2024 11:41:01 +0200 Subject: [PATCH 50/97] segment: adapt to final v3 API --- ocrd_cis/ocropy/segment.py | 252 +++++++++++++++++++------------------ 1 file changed, 133 insertions(+), 119 deletions(-) diff --git a/ocrd_cis/ocropy/segment.py b/ocrd_cis/ocropy/segment.py index 94b6ab1f..bdeb40dd 100644 --- a/ocrd_cis/ocropy/segment.py +++ b/ocrd_cis/ocropy/segment.py @@ -1,8 +1,10 @@ from __future__ import absolute_import + +from typing import Optional from logging import Logger from os.path import join -from typing import Optional import itertools + import numpy as np from scipy.sparse.csgraph import minimum_spanning_tree from skimage import draw @@ -14,15 +16,21 @@ from shapely.validation import explain_validity from shapely import set_precision -from ocrd_modelfactory import page_from_file +from ocrd_utils import ( + getLogger, + coordinates_of_segment, + coordinates_for_segment, + points_from_polygon, + polygon_from_points, +) from ocrd_models.ocrd_page import ( - to_xml, CoordsType, - OcrdPage, + CoordsType, TextLineType, TextRegionType, SeparatorRegionType, PageType, - AlternativeImageType + AlternativeImageType, + OcrdPage ) from ocrd_models.ocrd_page_generateds import ( BaselineType, @@ -37,16 +45,7 @@ ReadingOrderType ) from ocrd import Processor -from ocrd.processor import OcrdPageResult -from ocrd_utils import ( - getLogger, - make_file_id, - coordinates_of_segment, - coordinates_for_segment, - points_from_polygon, - polygon_from_points, - MIMETYPE_PAGE -) +from ocrd.processor import OcrdPageResult, OcrdPageResultImage from .ocrolib import midrange from .ocrolib import morph @@ -255,11 +254,12 @@ def executable(self): def setup(self): self.logger = getLogger('processor.OcropySegment') - def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional[str] = None) -> OcrdPageResult: + def process_page_pcgts(self, *input_pcgts : Optional[OcrdPage], page_id : Optional[str] = None) -> OcrdPageResult: """Segment pages into regions+lines, tables into cells+lines, or 
regions into lines. - Open and deserialise PAGE input files and their respective images, + + Open and deserialise PAGE input file and its respective images, then iterate over the element hierarchy down to the requested level. - + Depending on ``level-of-operation``, consider existing segments: - If ``overwrite_separators=True`` on ``page`` level, then delete any SeparatorRegions. @@ -272,12 +272,12 @@ def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional - If ``overwrite_order=True`` on ``page`` or ``table`` level, then delete the reading order OrderedGroup entry corresponding to the (page/table) segment. - + Next, get each element image according to the layout annotation (from the alternative image of the page/region, or by cropping via coordinates into the higher-level image) in binarized form, and represent it as an array with non-text regions and (remaining) text neighbours suppressed. - + Then compute a text line segmentation for that array (as a label mask). When ``level-of-operation`` is ``page`` or ``table``, this also entails detecting @@ -286,25 +286,25 @@ def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional - up to ``maxcolseps`` background column separators before text line segmentation itself, as well as aggregating text lines to text regions afterwards. - + Text regions are detected via a hybrid variant recursive X-Y cut algorithm (RXYC): RXYC partitions the binarized image in top-down manner by detecting horizontal or vertical gaps. This implementation uses the bottom-up text line segmentation to guide the search, and also uses both pre-existing and newly detected separators to alternatively partition the respective boxes into non-rectangular parts. - + During line segmentation, suppress the foreground of all previously annotated regions (of any kind) and lines, except if just removed due to ``overwrite``. During region aggregation however, combine the existing separators with the new-found separators to guide the column search. - + All detected segments (both text line and text region) are sorted according to their reading order (assuming a top-to-bottom, left-to-right ordering). When ``level-of-operation`` is ``page``, prefer vertical (column-first) succession of regions. When it is ``table``, prefer horizontal (row-first) succession of cells. - + Then for each resulting segment label, convert its background mask into polygon outlines by finding the outer contours consistent with the element's polygon outline. Annotate the result by adding it as a new TextLine/TextRegion: @@ -316,7 +316,7 @@ def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional - If it is ``page``, then append the new lines to their respective regions, and append the new regions to the page. (Also, create an OrderedGroup for it in the ReadingOrder.) - + Produce a new output file by serialising the resulting hierarchy. """ # FIXME: allow passing a-priori info on reading order / textline order @@ -330,6 +330,7 @@ def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional oplevel = self.parameter['level-of-operation'] pcgts = input_pcgts[0] + result = OcrdPageResult(pcgts) page = pcgts.get_Page() # TODO: also allow grayscale_normalized (try/except?) 
@@ -361,14 +362,15 @@ def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional rogroup = ro.get_OrderedGroup() or ro.get_UnorderedGroup() if rogroup: page_get_reading_order(reading_order, rogroup) - # get segments to process / overwrite + + # get segments to process / overwrite if oplevel == 'page': ignore.extend(page.get_TableRegion()) regions = list(page.get_TextRegion()) if regions: # page is already region-segmented if overwrite_regions: - self.logger.info(f'Removing existing TextRegions in page "{page_id}"') + self.logger.info('removing existing TextRegions in page "%s"', page_id) # we could remove all other region types as well, # but this is more flexible (for workflows with # specialized separator/image/table detectors): @@ -376,7 +378,7 @@ def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional page.set_ReadingOrder(None) ro = None else: - self.logger.warning(f'Keeping existing TextRegions in page "{page_id}"') + self.logger.warning('keeping existing TextRegions in page "%s"', page_id) ignore.extend(regions) # create reading order if necessary if not ro or overwrite_order: @@ -387,32 +389,36 @@ def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional # new top-level group rogroup = OrderedGroupType(id="reading-order") ro.set_OrderedGroup(rogroup) - # go get TextRegions with TextLines (and SeparatorRegions): - self._process_element( - page, ignore, page_image, page_coords, page_id, file_id, page_id, zoom, rogroup=rogroup) if (not rogroup.get_RegionRefIndexed() and - not rogroup.get_OrderedGroupIndexed() and - not rogroup.get_UnorderedGroupIndexed()): - # schema forbids empty OrderedGroup + not rogroup.get_OrderedGroupIndexed() and + not rogroup.get_UnorderedGroupIndexed()): + # schema forbids empty OrderedGroup ro.set_OrderedGroup(None) - elif oplevel == 'table': + # go get TextRegions with TextLines (and SeparatorRegions): + image = self._process_element(page, ignore, page_image, page_coords, + zoom=zoom, rogroup=rogroup) + if image: + result.images.append(image) + return result + + if oplevel == 'table': ignore.extend(page.get_TextRegion()) regions = list(page.get_TableRegion()) if not regions: - self.logger.warning(f'Page "{page_id}" contains no table regions') + self.logger.warning('Page "%s" contains no table regions', page_id) for region in regions: subregions = region.get_TextRegion() if subregions: # table is already cell-segmented if overwrite_regions: - self.logger.info(f'Removing existing TextRegions in table "{region.id}"') + self.logger.info('removing existing TextRegions in table "%s"', region.id) region.set_TextRegion([]) roelem = reading_order.get(region.id) # replace by empty group with same index and ref # (which can then take the cells as subregions) reading_order[region.id] = page_subgroup_in_reading_order(self.logger, roelem) else: - self.logger.warning(f'Skipping table "{region.id}" with existing TextRegions') + self.logger.warning('skipping table "%s" with existing TextRegions', region.id) continue # TODO: also allow grayscale_normalized (try/except?) 
region_image, region_coords = self.workspace.image_from_segment( @@ -423,19 +429,19 @@ def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional # create reading order group if necessary roelem = reading_order.get(region.id) if not roelem: - self.logger.warning(f"Page '{page_id}' table region '{region.id}' is not referenced in reading " - f"order (no target to add cells to)") + self.logger.warning("Page '%s' table region '%s' is not referenced in reading order (%s)", + page_id, region.id, "no target to add cells to") elif overwrite_order: # replace by empty ordered group with same (index and) ref # (which can then take the cells as subregions) roelem = page_subgroup_in_reading_order(self.logger, roelem) reading_order[region.id] = roelem elif isinstance(roelem, (OrderedGroupType, OrderedGroupIndexedType)): - self.logger.warning(f"Page '{page_id}' table region '{region.id}' already has an ordered " - f"group (cells will be appended)") + self.logger.warning("Page '%s' table region '%s' already has an ordered group (%s)", + page_id, region.id, "cells will be appended") elif isinstance(roelem, (UnorderedGroupType, UnorderedGroupIndexedType)): - self.logger.warning(f"Page '{page_id}' table region '{region.id}' already has an unordered " - f"group (cells will not be appended)") + self.logger.warning("Page '%s' table region '%s' already has an unordered group (%s)", + page_id, region.id, "cells will not be appended") roelem = None else: # replace regionRef(Indexed) by group with same index and ref @@ -443,10 +449,11 @@ def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional roelem = page_subgroup_in_reading_order(self.logger, roelem) reading_order[region.id] = roelem # go get TextRegions with TextLines (and SeparatorRegions) - self._process_element( - region, subignore, region_image, region_coords, region.id, file_id + '_' + region.id, - page_id, zoom, rogroup=roelem) - else: # 'region' + image = self._process_element(region, subignore, region_image, region_coords, + zoom=zoom, rogroup=roelem) + if image: + result.images.append(image) + else: # 'region' regions = list(page.get_TextRegion()) # besides top-level text regions, line-segment any table cells, # and for tables without any cells, add a pseudo-cell @@ -455,11 +462,10 @@ def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional if subregions: regions.extend(subregions) else: - subregion = TextRegionType( - id=region.id + '_text', - Coords=region.get_Coords(), - # as if generated from parser: - parent_object_=region) + subregion = TextRegionType(id=region.id + '_text', + Coords=region.get_Coords(), + # as if generated from parser: + parent_object_=region) region.add_TextRegion(subregion) regions.append(subregion) if not regions: @@ -467,10 +473,10 @@ def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional for region in regions: if region.get_TextLine(): if overwrite_lines: - self.logger.info(f'Removing existing TextLines in page "{page_id}" region "{region.id}"') + self.logger.info('removing existing TextLines in page "%s" region "%s"', page_id, region.id) region.set_TextLine([]) else: - self.logger.warning(f'Keeping existing TextLines in page "{page_id}" region "{region.id}"') + self.logger.warning('keeping existing TextLines in page "%s" region "%s"', page_id, region.id) ignore.extend(region.get_TextLine()) # TODO: also allow grayscale_normalized (try/except?) 
region_image, region_coords = self.workspace.image_from_segment( @@ -480,11 +486,13 @@ def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional if 'clipped' in region_coords['features'].split(','): ignore = [] # go get TextLines - self._process_element( - region, ignore, region_image, region_coords, region.id, file_id + '_' + region.id, page_id, zoom) - return OcrdPageResult(pcgts) + image = self._process_element(region, ignore, region_image, region_coords, zoom=zoom) + if image: + result.images.append(image) - def _process_element(self, element, ignore, image, coords, element_id, file_id, page_id, zoom=1.0, rogroup=None): + return result + + def _process_element(self, element, ignore, image, coords, zoom=1.0, rogroup=None) -> Optional[OcrdPageResultImage]: """Add PAGE layout elements by segmenting an image. Given a PageType, TableRegionType or TextRegionType ``element``, and @@ -503,14 +511,15 @@ def _process_element(self, element, ignore, image, coords, element_id, file_id, newly detected separators to guide region segmentation. """ if not image.width or not image.height: - self.logger.warning(f"Skipping '{element_id}' with zero size") - return + self.logger.warning(f"Skipping '{element.id}' with zero size") + return None element_array = pil2array(image) element_bin = np.array(element_array <= midrange(element_array), bool) sep_bin = np.zeros_like(element_bin, bool) ignore_labels = np.zeros_like(element_bin, int) for i, segment in enumerate(ignore): - self.logger.debug(f'Masking foreground of {type(segment).__name__[:-4]} "{segment.id}" for "{element_id}"') + self.logger.debug(f'masking foreground of {type(segment).__name__[:-4]} ' + f'"{segment.id}" for "{element.id}"') # mark these segments (e.g. separator regions, tables, images) # for workflows where they have been detected already; # these will be: @@ -522,14 +531,15 @@ def _process_element(self, element, ignore, image, coords, element_id, file_id, # then this will silently ignore them. The caller does # not need to concern herself with this. sp_row = segment_polygon[:, 1] - sp_column = segment_polygon[:, 0] + sp_col = segment_polygon[:, 0] if isinstance(segment, SeparatorRegionType): - sep_bin[draw.polygon(sp_row, sp_column, sep_bin.shape)] = True - ignore_labels[draw.polygon(sp_row, sp_column, ignore_labels.shape)] = i + 1 # mapped back for RO + sep_bin[draw.polygon(sp_row, sp_col, sep_bin.shape)] = True + ignore_labels[draw.polygon(sp_row, sp_col, ignore_labels.shape)] = i+1 # mapped back for RO if isinstance(element, PageType): element_name = 'page' fullpage = True report = check_page(element_bin, zoom) + suffix = '.IMG-CLIP' elif isinstance(element, TableRegionType) or ( # sole/congruent text region of a table region? 
element.id.endswith('_text') and @@ -537,11 +547,13 @@ def _process_element(self, element, ignore, image, coords, element_id, file_id, element_name = 'table' fullpage = True report = check_region(element_bin, zoom) + suffix = element.id + '.IMG-CLIP' else: element_name = 'region' fullpage = False report = check_region(element_bin, zoom) - self.logger.info(f'Computing line segmentation for {element_name} "{element_id}"') + suffix = element.id + '.IMG-CLIP' + self.logger.info(f'computing line segmentation for {element_name} "{element.id}"') # TODO: we should downscale if DPI is large enough to save time try: if report: @@ -551,7 +563,7 @@ def _process_element(self, element, ignore, image, coords, element_id, file_id, # but keep them for h/v-line detection (in fullpage mode): element_bin, seps=(sep_bin + ignore_labels) > 0, zoom=zoom, fullpage=fullpage, - spread_dist=round(self.parameter['spread'] / zoom * 300 / 72), # in pt + spread_dist=round(self.parameter['spread'] / zoom * 300 / 72), # in pt # these are ignored when not in fullpage mode: maxcolseps=self.parameter['maxcolseps'], maxseps=self.parameter['maxseps'], @@ -559,13 +571,15 @@ def _process_element(self, element, ignore, image, coords, element_id, file_id, csminheight=self.parameter['csminheight']) except Exception as err: if isinstance(element, TextRegionType): - self.logger.error(f'Cannot line-segment region "{element_id}": {err}') + self.logger.error(f'Cannot line-segment region "{element.id}": {err}') # as a fallback, add a single text line comprising the whole region: - element.add_TextLine(TextLineType(id=element_id + "_line", Coords=element.get_Coords())) + element.add_TextLine(TextLineType(id=element.id + "_line", Coords=element.get_Coords())) else: - self.logger.error(f'Cannot line-segment {element_name} "{element_id}": {err}') - return - self.logger.info(f'Found {len(np.unique(line_labels)) - 1} text lines for {element_name} "{element_id}"') + self.logger.error(f'Cannot line-segment {element_name} "{element.id}": {err}') + return None + + self.logger.info(f'Found {len(np.unique(line_labels)) - 1} text lines ' + f'for {element_name} "{element.id}"') # post-process line labels if isinstance(element, (PageType, TableRegionType)): # aggregate text lines to text regions @@ -580,17 +594,18 @@ def _process_element(self, element, ignore, image, coords, element_id, file_id, region_labels = lines2regions( element_bin, line_labels, rlabels=ignore_labels, - sepmask=np.maximum(sepmask, colseps), # add bg + sepmask=np.maximum(sepmask, colseps), # add bg # decide horizontal vs vertical cut when gaps of similar size prefer_vertical=not isinstance(element, TableRegionType), gap_height=self.parameter['gap_height'], gap_width=self.parameter['gap_width'], scale=scale, zoom=zoom) - self.logger.info( - f'Found {len(np.unique(region_labels)) - 1} text regions for {element_name} "{element_id}"') + self.logger.info(f'Found {len(np.unique(region_labels)) - 1} text regions ' + f'for {element_name} "{element.id}"') except Exception as err: - self.logger.error(f'Cannot region-segment {element_name} "{element_id}": {err}') + self.logger.error(f'Cannot region-segment {element_name} "{element.id}": {err}') region_labels = np.where(line_labels > len(ignore), 1 + len(ignore), line_labels) + # prepare reading order group index if rogroup: if isinstance(rogroup, (OrderedGroupType, OrderedGroupIndexedType)): @@ -607,7 +622,7 @@ def _process_element(self, element, ignore, image, coords, element_id, file_id, region_no = 0 for region_label in 
np.unique(region_labels): if not region_label: - continue # no bg + continue # no bg region_mask = region_labels == region_label region_line_labels = line_labels * region_mask region_line_labels0 = np.setdiff1d(region_line_labels, [0]) @@ -616,12 +631,12 @@ def _process_element(self, element, ignore, image, coords, element_id, file_id, # (no new region, no actual text lines) region_line_labels0 = np.intersect1d(region_line_labels0, ignore_labels) assert len(region_line_labels0) == 1, \ - (f"Region label {region_label} has both existing regions and new lines " - f"({str(region_line_labels0)})") + (f'region label "{region_label}" has both existing regions and new lines ' + f'({str(region_line_labels0)})') region = ignore[region_line_labels0[0] - 1] if rogroup and region.parent_object_ is element and not isinstance(region, SeparatorRegionType): index = page_add_to_reading_order(rogroup, region.id, index) - self.logger.debug(f'Region label {region_label} is for ignored region "{region.id}"') + self.logger.debug(f'Region label "{region_label}" is for ignored region "{region.id}"') continue # normal case: new lines inside new regions # remove binary-empty labels, and re-order locally @@ -629,18 +644,18 @@ def _process_element(self, element, ignore, image, coords, element_id, file_id, order[np.setdiff1d(region_line_labels0, element_bin * region_line_labels)] = 0 region_line_labels = order[region_line_labels] # avoid horizontal gaps - region_line_labels = hmerge_line_seeds( - element_bin, region_line_labels, scale, seps=np.maximum(sepmask, colseps)) + region_line_labels = hmerge_line_seeds(element_bin, region_line_labels, scale, + seps=np.maximum(sepmask, colseps)) region_mask |= region_line_labels > 0 # find contours for region (can be non-contiguous) - regions, _ = masks2polygons( - self.logger, region_mask * region_label, None, element_bin, - name=f'{element_name} "{element_id}"', min_area=6000 / zoom / zoom, - simplify=ignore_labels * ~(sep_bin)) + regions, _ = masks2polygons(self.logger, region_mask * region_label, None, element_bin, + name=f'{element_name} "{element.id}"', + min_area=6000 / zoom / zoom, + simplify=ignore_labels * ~(sep_bin)) # find contours for lines (can be non-contiguous) - lines, _ = masks2polygons( - self.logger, region_line_labels, baselines, element_bin, - name=f'region "{element_id}"', min_area=640 / zoom / zoom) + lines, _ = masks2polygons(self.logger, region_line_labels, baselines, element_bin, + name=f'region "{element.id}"', + min_area=640 / zoom / zoom) # create new lines in new regions (allocating by intersection) line_polys = [Polygon(polygon) for _, polygon, _ in lines] for _, region_polygon, _ in regions: @@ -653,13 +668,13 @@ def _process_element(self, element, ignore, image, coords, element_id, file_id, continue # annotate result: region_no += 1 - region_id = element_id + "_region%04d" % region_no + region_id = element.id + "_region%04d" % region_no self.logger.debug(f'Region label {region_label} becomes ID "{region_id}"') region = TextRegionType(id=region_id, Coords=CoordsType(points=points_from_polygon(region_polygon))) # find out which line (contours) belong to which region (contours) line_no = 0 for i, line_poly in enumerate(line_polys): - if not region_poly.intersects(line_poly): # .contains + if not region_poly.intersects(line_poly): # .contains continue line_label, line_polygon, line_baseline = lines[i] # convert back to absolute (page) coordinates: @@ -681,16 +696,16 @@ def _process_element(self, element, ignore, image, coords, element_id, file_id, 
# if the region has received text lines, keep it if region.get_TextLine(): element.add_TextRegion(region) - self.logger.info( - f'Added region "{region_id}" with {line_no} lines for {element_name} "{element_id}"') + self.logger.info(f'Added region "{region_id}" with {line_no} lines ' + f'for {element_name} "{element.id}"') if rogroup: index = page_add_to_reading_order(rogroup, region.id, index) # add additional image/non-text regions from compute_segmentation # (e.g. drop-capitals or images) ... - self.logger.info(f'Found {images.max()} large image regions for {element_name} "{element_id}"') + self.logger.info(f'Found {images.max()} large image regions for {element_name} "{element.id}"') # find contours around region labels (can be non-contiguous): - image_polygons, _ = masks2polygons( - self.logger, images, None, element_bin, f'{element_name} "{element_id}"') + image_polygons, _ = masks2polygons(self.logger, images, None, element_bin, + name=f'{element_name} "{element.id}"') for image_label, polygon, _ in image_polygons: # convert back to absolute (page) coordinates: region_polygon = coordinates_for_segment(polygon, image, coords) @@ -700,15 +715,15 @@ def _process_element(self, element, ignore, image, coords, element_id, file_id, continue region_no += 1 # annotate result: - region_id = element_id + "_image%04d" % region_no + region_id = element.id + "_image%04d" % region_no element.add_ImageRegion(ImageRegionType( id=region_id, Coords=CoordsType(points=points_from_polygon(region_polygon)))) # split detected separator labels into separator regions: - self.logger.info(f'Found {seplines.max()} separators for {element_name} "{element_id}"') + self.logger.info(f'Found {seplines.max()} separators for {element_name} "{element.id}"') # find contours around region labels (can be non-contiguous): - sep_polygons, _ = masks2polygons( - self.logger, seplines, None, element_bin, - name=f'{element_name} "{element_id}"', open_holes=True, reorder=False) + sep_polygons, _ = masks2polygons(self.logger, seplines, None, element_bin, + name=f'{element_name} "{element.id}"', + open_holes=True, reorder=False) for sep_label, polygon, _ in sep_polygons: # convert back to absolute (page) coordinates: region_polygon = coordinates_for_segment(polygon, image, coords) @@ -718,27 +733,28 @@ def _process_element(self, element, ignore, image, coords, element_id, file_id, continue # annotate result: region_no += 1 - region_id = element_id + "_sep%04d" % region_no + region_id = element.id + "_sep%04d" % region_no element.add_SeparatorRegion(SeparatorRegionType( id=region_id, Coords=CoordsType(points=points_from_polygon(region_polygon)))) # annotate a text/image-separated image element_array[sepmask] = np.amax(element_array) # clip to white/bg image_clipped = array2pil(element_array) - file_path = self.workspace.save_image_file( - image_clipped, file_id + '.IMG-CLIP', self.output_file_grp, page_id=page_id) - element.add_AlternativeImage(AlternativeImageType( - filename=file_path, comments=coords['features'] + ',clipped')) + image_ref = AlternativeImageType(comments=coords['features'] + ',clipped') + element.add_AlternativeImage(image_ref) + return OcrdPageResultImage(image_clipped, suffix, image_ref) else: - # get mask from region polygon: + # get mask from region polygon: region_polygon = coordinates_of_segment(element, image, coords) region_mask = np.zeros_like(element_bin, bool) - region_mask[draw.polygon(region_polygon[:, 1], region_polygon[:, 0], region_mask.shape)] = True + region_mask[draw.polygon(region_polygon[:, 
1], + region_polygon[:, 0], + region_mask.shape)] = True # ensure the new line labels do not extrude from the region: line_labels = line_labels * region_mask # find contours around labels (can be non-contiguous): - line_polygons, _ = masks2polygons( - self.logger, line_labels, baselines, element_bin, - name=f'region "{element_id}"', min_area=640 / zoom / zoom) + line_polygons, _ = masks2polygons(self.logger, line_labels, baselines, element_bin, + name=f'region "{element.id}"', + min_area=640 / zoom / zoom) line_no = 0 for line_label, polygon, baseline in line_polygons: # convert back to absolute (page) coordinates: @@ -749,22 +765,20 @@ def _process_element(self, element, ignore, image, coords, element_id, file_id, continue # annotate result: line_no += 1 - line_id = element_id + "_line%04d" % line_no + line_id = element.id + "_line%04d" % line_no line = TextLineType(id=line_id, Coords=CoordsType(points=points_from_polygon(line_polygon))) if baseline: line_baseline = coordinates_for_segment(baseline, image, coords) line.set_Baseline(BaselineType(points=points_from_polygon(line_baseline))) element.add_TextLine(line) if not sep_bin.any(): - return # no derived image + return None # no derived image # annotate a text/image-separated image - element_array[sep_bin] = np.amax(element_array) # clip to white/bg + element_array[sep_bin] = np.amax(element_array) # clip to white/bg image_clipped = array2pil(element_array) - file_path = self.workspace.save_image_file( - image_clipped, file_id + '.IMG-CLIP', self.output_file_grp, page_id=page_id) - # update PAGE (reference the image file): - element.add_AlternativeImage(AlternativeImageType( - filename=file_path, comments=coords['features'] + ',clipped')) + image_ref = AlternativeImageType(comments=coords['features'] + ',clipped') + element.add_AlternativeImage(image_ref) + return OcrdPageResultImage(image_clipped, suffix, image_ref) def polygon_for_parent(polygon, parent): """Clip polygon to parent polygon range. 
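For orientation between these patches: the "final v3 API" that segment.py here -- and train.py and binarize.py further below -- are being adapted to replaces the old per-file process() loop with a per-page process_page_pcgts() hook. The processor no longer downloads input files, saves derived images or adds METS entries itself; it returns an OcrdPageResult whose images list carries OcrdPageResultImage entries, and the OCR-D core then serialises the PAGE output and writes the image files. A minimal sketch of that calling convention, using only names visible in the hunks above plus a made-up processor and tool name for illustration:

from typing import Optional

from ocrd import Processor
from ocrd.processor import OcrdPageResult, OcrdPageResultImage
from ocrd_models.ocrd_page import AlternativeImageType, OcrdPage


class ExampleProcessor(Processor):

    @property
    def executable(self):
        # hypothetical tool name, for illustration only
        return 'ocrd-example-processor'

    def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage],
                           page_id: Optional[str] = None) -> OcrdPageResult:
        # one parsed PAGE document per input fileGrp; here a single input
        pcgts = input_pcgts[0]
        result = OcrdPageResult(pcgts)
        page = pcgts.get_Page()
        # page image plus coordinate metadata, resolved by the core
        page_image, page_coords, _ = self.workspace.image_from_page(page, page_id)
        # ... compute some new PIL image `new_image` from page_image here ...
        new_image = page_image
        # reference the derived image in PAGE and hand it back to the core
        # together with a file-name suffix, instead of calling
        # workspace.save_image_file() directly as the old process() did
        image_ref = AlternativeImageType(comments=page_coords['features'] + ',clipped')
        page.add_AlternativeImage(image_ref)
        result.images.append(OcrdPageResultImage(new_image, '.IMG-CLIP', image_ref))
        return result

That the core derives the output file name from the page file ID plus the given suffix and fills in the filename of the AlternativeImageType reference is an assumption here; the hunks above only show that the suffix and the reference are returned in the OcrdPageResult rather than being written out by the processor itself.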
From df1c35cbe1325a8da5dabd2c9227a7246439fd15 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> Date: Fri, 16 Aug 2024 14:42:57 +0200 Subject: [PATCH 51/97] train: adapt to final v3 API --- ocrd_cis/ocropy/train.py | 129 +++++++++++++++++++-------------------- 1 file changed, 64 insertions(+), 65 deletions(-) diff --git a/ocrd_cis/ocropy/train.py b/ocrd_cis/ocropy/train.py index 08b68693..5c57b2cf 100644 --- a/ocrd_cis/ocropy/train.py +++ b/ocrd_cis/ocropy/train.py @@ -1,12 +1,15 @@ from __future__ import absolute_import + +from typing import Optional from logging import Logger from sys import exit from os import getcwd, makedirs, remove from os.path import abspath, dirname, exists, join, isfile import tempfile -from ocrd_modelfactory import page_from_file -from ocrd import Processor +from ocrd_models import OcrdPage +from ocrd import Processor, Workspace +from ocrd.processor import OcrdPageResult from ocrd_utils import getLogger from .ocropus_rtrain import * @@ -37,80 +40,79 @@ def executable(self): def setup(self): self.logger = getLogger('processor.OcropyTrain') - self.old_cwd = getcwd() - #print(self.parameter) if 'model' in self.parameter: model = self.parameter['model'] try: - modelpath = self.resolve_resource(model) + self.modelpath = self.resolve_resource(model) except SystemExit: ocropydir = dirname(abspath(__file__)) - modelpath = join(ocropydir, 'models', model) - self.logger.info("Failed to resolve model '%s' path, trying '%s'", model, modelpath) + self.modelpath = join(ocropydir, 'models', model) + self.logger.error(f"Failed to resolve model '{model}' path, trying '{modelpath}'") if not isfile(modelpath): - self.logger.error("Could not find model '%s'. Try 'ocrd resmgr download ocrd-cis-ocropy-recognize %s'", - model, model) + self.logger.critical(f"Could not find model '{model}'.\n" + f"Try 'ocrd resmgr download ocrd-cis-ocropy-recognize {model}'") exit(1) - outputpath = join(self.old_cwd, 'output', model) - if 'outputpath' in self.parameter: - outputpath = join(self.parameter, model) + self.outputpath = join(self.parameter.get('outputpath', 'output'), model) else: - modelpath = None - outputpath = join(self.old_cwd, 'output', 'lstm') - if 'outputpath' in self.parameter: - outputpath = join(self.parameter, 'lstm') - makedirs(dirname(outputpath)) - self.modelpath = modelpath - self.outputpath = outputpath - - def process(self): + self.modelpath = None + self.outputpath = join(self.parameter.get('outputpath', 'output'), 'lstm') + makedirs(dirname(self.outputpath)) + self.filelist = None + + def process_workspace(self, workspace: Workspace) -> None: """ Trains a new model on the text lines from the input fileGrp, - extracted as temporary image-text file pairs. + extracted as image-text file pairs into the output fileGrp. + (If the output fileGrp already exists and these files should + be re-used, pass the `--overwrite` option when processing.) + + The model is written into `outputpath` (or just `output`) under + the same name as `model` (i.e. the start model, or just `lstm`). 
+ """ + self.filelist = [] + super().process_workspace(workspace) + self.logger.info(f"Training {self.outputpath} from {self.modelpath or 'scratch'} " + f"on {len(self.filelist)} file pairs") + rtrain(self.filelist, self.modelpath, self.outputpath, self.parameter['ntrain']) + # deletefiles(self.filelist) + + def process_page_pcgts(self, *input_pcgts : Optional[OcrdPage], page_id : Optional[str] = None) -> OcrdPageResult: + """ + Extracts pairs of plaintext and cropped image files for each text line + in the PAGE file (to be used during training). """ - filelist = [] - filepath = tempfile.mkdtemp(prefix='ocrd-cis-ocropy-train-') + pcgts = input_pcgts[0] #self.logger.info("Using model %s in %s for recognition", model) - for (n, input_file) in enumerate(self.input_files): - #self.logger.info("INPUT FILE %i / %s", n, input_file) - pcgts = page_from_file(self.workspace.download_file(input_file)) - page_id = pcgts.pcGtsId or input_file.pageId or input_file.ID # (PageType has no id) - page = pcgts.get_Page() - page_image, page_coords, _ = self.workspace.image_from_page(page, page_id) - - self.logger.info("Extracting from page '%s'", page_id) - for region in page.get_AllRegions(classes=['Text']): - textlines = region.get_TextLine() - self.logger.info("Extracting %i lines from region '%s'", len(textlines), region.id) - for line in textlines: - if self.parameter['textequiv_level'] == 'line': - path = join(filepath, page_id + region.id + line.id) - imgpath = self.extract_segment(path, line, page_image, page_coords) - if imgpath: - filelist.append(imgpath) + page = pcgts.get_Page() + page_image, page_coords, _ = self.workspace.image_from_page(page, page_id) + + self.logger.debug(f"Extracting from page '{page_id}'") + for region in page.get_AllRegions(classes=['Text']): + textlines = region.get_TextLine() + self.logger.debug(f"Extracting {len(textlines)} lines from region '{region.id}'") + for line in textlines: + if self.parameter['textequiv_level'] == 'line': + path = join(self.output_file_grp, f"{page_id}_{region.id}_{line.id}") + self.filelist.append(self.extract_segment(path, line, page_image, page_coords)) + continue + for word in line.get_Word(): + if self.parameter['textequiv_level'] == 'word': + path = join(self.output_file_grp, f"{page_id}_{region.id}_{line.id}_{word.id}") + self.filelist.append(self.extract_segment(path, word, page_image, page_coords)) continue - for word in line.get_Word(): - if self.parameter['textequiv_level'] == 'word': - path = join(filepath, page_id + region.id + line.id + word.id) - imgpath = self.extract_segment(path, word, page_image, page_coords) - if imgpath: - filelist.append(imgpath) - continue - for glyph in word.get_Glyph(): - path = join(filepath, page_id + region.id + line.id + glyph.id) - imgpath = self.extract_segment(path, glyph, page_image, page_coords) - if imgpath: - filelist.append(imgpath) - - self.logger.info("Training %s from %s on %i file pairs", - self.outputpath, - self.modelpath or 'scratch', - len(filelist)) - rtrain(filelist, self.modelpath, self.outputpath, self.parameter['ntrain']) - deletefiles(filelist) + for glyph in word.get_Glyph(): + path = join(self.output_file_grp, f"{page_id}_{region.id}_{line.id}_{word.id}_{glyph.id}") + self.filelist.append(self.extract_segment(path, glyph, page_image, page_coords)) + # FIXME: PAGE-XML not really needed, find a way around this (raising special exception?) 
+ return OcrdPageResult(pcgts) def extract_segment(self, path, segment, page_image, page_coords): - #ground truth + gtpath = path + '.gt.txt' + imgpath = path + '.png' + if exists(gtpath) and exists(imgpath): + self.logger.debug(f"Reusing {segment.__class__.__name__} '{segment.id}' file pair") + return imgpath + gt = segment.TextEquiv if not gt: return None @@ -118,11 +120,10 @@ def extract_segment(self, path, segment, page_image, page_coords): if not gt or not gt.strip(): return None gt = gt.strip() - gtpath = path + '.gt.txt' with open(gtpath, "w", encoding='utf-8') as f: f.write(gt) - self.logger.debug("Extracting %s '%s'", segment.__class__.__name__, segment.id) + self.logger.debug(f"Extracting {segment.__class__.__name__} '{segment.id}' file pair") image, coords = self.workspace.image_from_segment(segment, page_image, page_coords) if 'binarized' not in coords['features'].split(','): @@ -132,8 +133,6 @@ def extract_segment(self, path, segment, page_image, page_coords): # resize image to 48 pixel height image = resize_keep_ratio(image) - #save temp image - imgpath = path + '.png' image.save(imgpath) return imgpath From c08b623f9b0ad9daf4f8dc858b5b416b1212e018 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> Date: Fri, 16 Aug 2024 14:51:54 +0200 Subject: [PATCH 52/97] ocrd-tool.json: add v3 cardinalities --- ocrd_cis/ocrd-tool.json | 120 +++++++++++----------------------------- 1 file changed, 31 insertions(+), 89 deletions(-) diff --git a/ocrd_cis/ocrd-tool.json b/ocrd_cis/ocrd-tool.json index a93917da..c2e20268 100644 --- a/ocrd_cis/ocrd-tool.json +++ b/ocrd_cis/ocrd-tool.json @@ -12,17 +12,9 @@ "preprocessing/optimization/grayscale_normalization", "preprocessing/optimization/deskewing" ], - "input_file_grp": [ - "OCR-D-IMG", - "OCR-D-SEG-BLOCK", - "OCR-D-SEG-LINE" - ], - "output_file_grp": [ - "OCR-D-IMG-BIN", - "OCR-D-SEG-BLOCK", - "OCR-D-SEG-LINE" - ], - "description": "Binarize (and optionally deskew/despeckle) pages / regions / lines with ocropy", + "input_file_grp_cardinality": 1, + "output_file_grp_cardinality": 1, + "description": "Binarize (and optionally deskew/despeckle) pages / regions / lines with Ocropy v1", "parameters": { "method": { "type": "string", @@ -75,15 +67,9 @@ "steps": [ "preprocessing/optimization/deskewing" ], - "input_file_grp": [ - "OCR-D-SEG-BLOCK", - "OCR-D-SEG-LINE" - ], - "output_file_grp": [ - "OCR-D-SEG-BLOCK", - "OCR-D-SEG-LINE" - ], - "description": "Deskew regions with ocropy (by annotating orientation angle and adding AlternativeImage)", + "input_file_grp_cardinality": 1, + "output_file_grp_cardinality": 1, + "description": "Deskew regions with Ocropy v1 (by annotating orientation angle and adding AlternativeImage)", "parameters": { "maxskew": { "type": "number", @@ -106,17 +92,9 @@ "steps": [ "preprocessing/optimization/despeckling" ], - "input_file_grp": [ - "OCR-D-IMG", - "OCR-D-SEG-BLOCK", - "OCR-D-SEG-LINE" - ], - "output_file_grp": [ - "OCR-D-IMG-DESPECK", - "OCR-D-SEG-BLOCK", - "OCR-D-SEG-LINE" - ], - "description": "Despeckle pages / regions / lines with ocropy", + "input_file_grp_cardinality": 1, + "output_file_grp_cardinality": 1, + "description": "Despeckle pages / regions / lines with Ocropy v1", "parameters": { "noise_maxsize": { "type": "number", @@ -147,14 +125,8 @@ "layout/segmentation/region", "layout/segmentation/line" ], - "input_file_grp": [ - "OCR-D-SEG-BLOCK", - "OCR-D-SEG-LINE" - ], - "output_file_grp": [ - "OCR-D-SEG-BLOCK", - "OCR-D-SEG-LINE" - ], + "input_file_grp_cardinality": 
1, + "output_file_grp_cardinality": 1, "description": "Clip text regions / lines at intersections with neighbours", "parameters": { "level-of-operation": { @@ -185,12 +157,8 @@ "steps": [ "layout/segmentation/line" ], - "input_file_grp": [ - "OCR-D-SEG-LINE" - ], - "output_file_grp": [ - "OCR-D-SEG-LINE" - ], + "input_file_grp_cardinality": 1, + "output_file_grp_cardinality": 1, "description": "Improve coordinates of text lines", "parameters": { "level-of-operation": { @@ -245,12 +213,8 @@ "preprocessing/optimization/dewarping" ], "description": "Dewarp line images with ocropy", - "input_file_grp": [ - "OCR-D-SEG-LINE" - ], - "output_file_grp": [ - "OCR-D-SEG-LINE" - ], + "input_file_grp_cardinality": 1, + "output_file_grp_cardinality": 1, "parameters": { "dpi": { "type": "number", @@ -286,15 +250,9 @@ "steps": [ "recognition/text-recognition" ], - "description": "Recognize text in (binarized+deskewed+dewarped) lines with ocropy", - "input_file_grp": [ - "OCR-D-SEG-LINE", - "OCR-D-SEG-WORD", - "OCR-D-SEG-GLYPH" - ], - "output_file_grp": [ - "OCR-D-OCR-OCRO" - ], + "description": "Recognize text in (binarized+deskewed+dewarped) lines with Ocropy v1", + "input_file_grp_cardinality": 1, + "output_file_grp_cardinality": 1, "parameters": { "textequiv_level": { "type": "string", @@ -345,14 +303,9 @@ "layout/segmentation/region", "layout/segmentation/line" ], - "input_file_grp": [ - "OCR-D-GT-SEG-BLOCK", - "OCR-D-SEG-BLOCK" - ], - "output_file_grp": [ - "OCR-D-SEG-LINE" - ], - "description": "Segment pages into regions and lines, tables into cells and lines, or regions into lines with ocropy", + "input_file_grp_cardinality": 1, + "output_file_grp_cardinality": 1, + "description": "Segment pages into regions and lines, tables into cells and lines, or regions into lines with Ocropy v1", "parameters": { "dpi": { "type": "number", @@ -444,11 +397,9 @@ "steps": [ "recognition/text-recognition" ], - "input_file_grp": [ - "OCR-D-GT-SEG-BLOCK", - "OCR-D-SEG-BLOCK" - ], - "description": "train model with ground truth from mets data", + "input_file_grp_cardinality": 1, + "output_file_grp_cardinality": 1, + "description": "train Ocropy v1 text recognition model with PAGE ground truth from the input fileGrp extracted as file pairs into the output fileGrp", "parameters": { "textequiv_level": { "type": "string", @@ -470,7 +421,8 @@ }, "outputpath": { "type": "string", - "description": "(existing) path for the trained model" + "default": "output", + "description": "directory path for the trained model" } } }, @@ -482,15 +434,9 @@ "steps": [ "recognition/post-correction" ], - "input_file_grp": [ - "OCR-D-OCR-1", - "OCR-D-OCR-2", - "OCR-D-OCR-N" - ], - "output_file_grp": [ - "OCR-D-ALIGNED" - ], - "description": "Align multiple OCRs and/or GTs" + "input_file_grp_cardinality": [2, -1], + "output_file_grp_cardinality": 1, + "description": "Align multiple OCRs and/or GTs textually on line/word level" }, "ocrd-cis-postcorrect": { "executable": "ocrd-cis-postcorrect", @@ -501,12 +447,8 @@ "recognition/post-correction" ], "description": "Post correct OCR results", - "input_file_grp": [ - "OCR-D-LINE-ALIGNED" - ], - "output_file_grp": [ - "OCR-D-POST-CORRECTED" - ], + "input_file_grp_cardinality": 1, + "output_file_grp_cardinality": 1, "parameters": { "maxCandidates": { "description": "Maximum number of considered correction candidates per suspicious token", From a18307d4a8f50b0a4b081016c9d9db55cca63023 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Fri, 16 Aug 2024 15:27:09 +0200 Subject: [PATCH 53/97] fix: ocropy 
train errors --- ocrd_cis/ocropy/train.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/ocrd_cis/ocropy/train.py b/ocrd_cis/ocropy/train.py index 5c57b2cf..f5d70d6a 100644 --- a/ocrd_cis/ocropy/train.py +++ b/ocrd_cis/ocropy/train.py @@ -47,8 +47,8 @@ def setup(self): except SystemExit: ocropydir = dirname(abspath(__file__)) self.modelpath = join(ocropydir, 'models', model) - self.logger.error(f"Failed to resolve model '{model}' path, trying '{modelpath}'") - if not isfile(modelpath): + self.logger.error(f"Failed to resolve model '{model}' path, trying '{self.modelpath}'") + if not isfile(self.modelpath): self.logger.critical(f"Could not find model '{model}'.\n" f"Try 'ocrd resmgr download ocrd-cis-ocropy-recognize {model}'") exit(1) @@ -128,7 +128,7 @@ def extract_segment(self, path, segment, page_image, page_coords): if 'binarized' not in coords['features'].split(','): # binarize with nlbin - image, _ = binarize(image, maxskew=0) + image, _ = binarize(self.logger, image, maxskew=0) # resize image to 48 pixel height image = resize_keep_ratio(image) From 0ba6839c849688431fa2259da4cd934963724cfb Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Fri, 16 Aug 2024 15:39:09 +0200 Subject: [PATCH 54/97] remove: unused imports --- ocrd_cis/ocropy/binarize.py | 6 +----- ocrd_cis/ocropy/clip.py | 14 ++++++-------- ocrd_cis/ocropy/denoise.py | 10 ++-------- ocrd_cis/ocropy/deskew.py | 8 +------- ocrd_cis/ocropy/dewarp.py | 12 +++--------- ocrd_cis/ocropy/recognize.py | 12 ++---------- ocrd_cis/ocropy/resegment.py | 1 - ocrd_cis/ocropy/segment.py | 1 - ocrd_cis/ocropy/train.py | 9 +++++---- 9 files changed, 20 insertions(+), 53 deletions(-) diff --git a/ocrd_cis/ocropy/binarize.py b/ocrd_cis/ocropy/binarize.py index ac499336..271f01fa 100644 --- a/ocrd_cis/ocropy/binarize.py +++ b/ocrd_cis/ocropy/binarize.py @@ -1,14 +1,10 @@ from __future__ import absolute_import from logging import Logger +from typing import Optional import cv2 import numpy as np from PIL import Image -from os.path import abspath, dirname, join - -from typing import Union, Optional - -#import kraken.binarization from ocrd_utils import getLogger from ocrd_models.ocrd_page import AlternativeImageType, OcrdPage diff --git a/ocrd_cis/ocropy/clip.py b/ocrd_cis/ocropy/clip.py index 3ddd6a70..36ee4eb3 100644 --- a/ocrd_cis/ocropy/clip.py +++ b/ocrd_cis/ocropy/clip.py @@ -2,7 +2,6 @@ from logging import Logger from typing import Optional -from os.path import join import numpy as np from PIL import Image, ImageStat, ImageOps from shapely.geometry import Polygon @@ -12,19 +11,18 @@ from ocrd import Processor from ocrd.processor import OcrdPageResult, OcrdPageResultImage from ocrd_utils import ( - getLogger, - coordinates_of_segment, - polygon_from_points, bbox_from_polygon, + coordinates_of_segment, + crop_image, + getLogger, image_from_polygon, + polygon_from_points, polygon_mask, - crop_image, ) +from .common import array2pil, determine_zoom, pil2array from .ocrolib import midrange, morph -from .common import ( - # binarize, - array2pil, determine_zoom, pil2array) + class OcropyClip(Processor): logger: Logger diff --git a/ocrd_cis/ocropy/denoise.py b/ocrd_cis/ocropy/denoise.py index 0f368fd5..72757e0c 100644 --- a/ocrd_cis/ocropy/denoise.py +++ b/ocrd_cis/ocropy/denoise.py @@ -1,19 +1,13 @@ from __future__ import absolute_import - from typing import Optional from logging import Logger -from os.path import join from ocrd_utils import getLogger -from ocrd_models.ocrd_page import ( - AlternativeImageType, OcrdPage 
-) +from ocrd_models.ocrd_page import AlternativeImageType, OcrdPage from ocrd import Processor from ocrd.processor import OcrdPageResult, OcrdPageResultImage -from .common import ( - # binarize, - determine_zoom, remove_noise) +from .common import determine_zoom, remove_noise class OcropyDenoise(Processor): logger: Logger diff --git a/ocrd_cis/ocropy/deskew.py b/ocrd_cis/ocropy/deskew.py index fae0c90c..9f9f8b0a 100644 --- a/ocrd_cis/ocropy/deskew.py +++ b/ocrd_cis/ocropy/deskew.py @@ -1,15 +1,9 @@ from __future__ import absolute_import - from typing import Optional from logging import Logger -from os.path import join from ocrd_utils import getLogger -from ocrd_models.ocrd_page import ( - PageType, - AlternativeImageType, - OcrdPage -) +from ocrd_models.ocrd_page import AlternativeImageType, OcrdPage, PageType from ocrd import Processor from ocrd.processor import OcrdPageResult, OcrdPageResultImage diff --git a/ocrd_cis/ocropy/dewarp.py b/ocrd_cis/ocropy/dewarp.py index a063a05e..9902af95 100644 --- a/ocrd_cis/ocropy/dewarp.py +++ b/ocrd_cis/ocropy/dewarp.py @@ -1,18 +1,12 @@ from __future__ import absolute_import - -from typing import Optional from logging import Logger -from os.path import join - +from typing import Optional import numpy as np -from ocrd_utils import getLogger -from ocrd_models.ocrd_page import ( - AlternativeImageType, - OcrdPage -) from ocrd import Processor from ocrd.processor import OcrdPageResult, OcrdPageResultImage +from ocrd_utils import getLogger +from ocrd_models.ocrd_page import AlternativeImageType, OcrdPage from .ocrolib import lineest from .common import array2pil, check_line, determine_zoom, pil2array diff --git a/ocrd_cis/ocropy/recognize.py b/ocrd_cis/ocropy/recognize.py index 97fcc64d..41576e43 100644 --- a/ocrd_cis/ocropy/recognize.py +++ b/ocrd_cis/ocropy/recognize.py @@ -10,16 +10,8 @@ from rapidfuzz.distance import Levenshtein -from ocrd_utils import ( - getLogger, - coordinates_for_segment, - polygon_from_bbox, - points_from_polygon, -) -from ocrd_models.ocrd_page import ( - TextEquivType, OcrdPage, - CoordsType, GlyphType, WordType -) +from ocrd_utils import coordinates_for_segment, getLogger, points_from_polygon, polygon_from_bbox +from ocrd_models.ocrd_page import CoordsType, GlyphType, OcrdPage, TextEquivType, WordType from ocrd import Processor from ocrd.processor import OcrdPageResult diff --git a/ocrd_cis/ocropy/resegment.py b/ocrd_cis/ocropy/resegment.py index 05f17d4f..0ef64687 100644 --- a/ocrd_cis/ocropy/resegment.py +++ b/ocrd_cis/ocropy/resegment.py @@ -2,7 +2,6 @@ from typing import Optional from logging import Logger -from os.path import join import numpy as np from skimage import draw, segmentation diff --git a/ocrd_cis/ocropy/segment.py b/ocrd_cis/ocropy/segment.py index bdeb40dd..edb5751a 100644 --- a/ocrd_cis/ocropy/segment.py +++ b/ocrd_cis/ocropy/segment.py @@ -2,7 +2,6 @@ from typing import Optional from logging import Logger -from os.path import join import itertools import numpy as np diff --git a/ocrd_cis/ocropy/train.py b/ocrd_cis/ocropy/train.py index f5d70d6a..8f224b86 100644 --- a/ocrd_cis/ocropy/train.py +++ b/ocrd_cis/ocropy/train.py @@ -3,9 +3,8 @@ from typing import Optional from logging import Logger from sys import exit -from os import getcwd, makedirs, remove +from os import makedirs, remove from os.path import abspath, dirname, exists, join, isfile -import tempfile from ocrd_models import OcrdPage from ocrd import Processor, Workspace @@ -32,7 +31,9 @@ def resize_keep_ratio(image, baseheight=48): class 
OcropyTrain(Processor): logger: Logger + modelpath: str old_cwd: str + outputpath: str @property def executable(self): @@ -75,8 +76,8 @@ def process_workspace(self, workspace: Workspace) -> None: f"on {len(self.filelist)} file pairs") rtrain(self.filelist, self.modelpath, self.outputpath, self.parameter['ntrain']) # deletefiles(self.filelist) - - def process_page_pcgts(self, *input_pcgts : Optional[OcrdPage], page_id : Optional[str] = None) -> OcrdPageResult: + + def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional[str] = None) -> OcrdPageResult: """ Extracts pairs of plaintext and cropped image files for each text line in the PAGE file (to be used during training). From 6b06e8856addd3b4963961df6d6cb1fb29e126cf Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Fri, 16 Aug 2024 15:48:42 +0200 Subject: [PATCH 55/97] Update binarize.py --- ocrd_cis/ocropy/binarize.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ocrd_cis/ocropy/binarize.py b/ocrd_cis/ocropy/binarize.py index 3e87cf8a..e82dbc16 100644 --- a/ocrd_cis/ocropy/binarize.py +++ b/ocrd_cis/ocropy/binarize.py @@ -98,6 +98,7 @@ def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional page_image, page_xywh, page_image_info = self.workspace.image_from_page(page, page_id, feature_filter='binarized') zoom = determine_zoom(self.logger, page_id, self.parameter['dpi'], page_image_info) + result = OcrdPageResult(pcgts) if level == 'page': try: result.images.append(self.process_page(page, page_image, page_xywh, zoom, page_id)) @@ -256,4 +257,4 @@ def process_line(self, line, line_image, line_xywh, zoom, page_id, region_id) -> # update PAGE (reference the image file): alt_image = AlternativeImageType(comments=features) line.add_AlternativeImage(alt_image) - return OcrdPageResultImage(bin_image, suffix, alt_image) \ No newline at end of file + return OcrdPageResultImage(bin_image, suffix, alt_image) From d1a14b704c0d2559685b8f33ddd23d60c65563a7 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Fri, 16 Aug 2024 16:22:42 +0200 Subject: [PATCH 56/97] refactor: python strings v3 --- ocrd_cis/ocropy/binarize.py | 6 +-- ocrd_cis/ocropy/clip.py | 5 +-- ocrd_cis/ocropy/denoise.py | 8 ++-- ocrd_cis/ocropy/deskew.py | 7 ++-- ocrd_cis/ocropy/dewarp.py | 11 +++--- ocrd_cis/ocropy/recognize.py | 6 +-- ocrd_cis/ocropy/resegment.py | 72 +++++++++++++++------------------- ocrd_cis/ocropy/segment.py | 76 ++++++++++++++++++------------------ 8 files changed, 88 insertions(+), 103 deletions(-) diff --git a/ocrd_cis/ocropy/binarize.py b/ocrd_cis/ocropy/binarize.py index e82dbc16..782dd578 100644 --- a/ocrd_cis/ocropy/binarize.py +++ b/ocrd_cis/ocropy/binarize.py @@ -16,7 +16,7 @@ def binarize(logger: Logger, pil_image, method='ocropy', maxskew=2, threshold=0.5, nrm=False, zoom=1.0): - logger.debug('binarizing %dx%d image with method=%s', pil_image.width, pil_image.height, method) + logger.debug(f'Binarizing {pil_image.width}x{pil_image.height} image with method={method}') if method == 'none': # useful if the images are already binary, # but lack image attribute `binarized` @@ -242,8 +242,8 @@ def process_line(self, line, line_image, line_xywh, zoom, page_id, region_id) -> #orientation = -angle #orientation = 180 - (180 - orientation) % 360 # map to [-179.999,180] #line.set_orientation(orientation) # does not exist on line level! 
- self.logger.warning(f"cannot add orientation %.2f to page '{page_id}' region '{region_id}' line '{line.id}'", - -angle) + self.logger.warning( + f"Cannot add orientation %.2f to page '{page_id}' region '{region_id}' line '{line.id}'", -angle) bin_image = remove_noise(bin_image, maxsize=self.parameter['noise_maxsize']) if self.parameter['noise_maxsize']: features += ',despeckled' diff --git a/ocrd_cis/ocropy/clip.py b/ocrd_cis/ocropy/clip.py index 36ee4eb3..7f40a214 100644 --- a/ocrd_cis/ocropy/clip.py +++ b/ocrd_cis/ocropy/clip.py @@ -128,15 +128,13 @@ def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: str = No if level == 'region': if region.get_AlternativeImage(): # FIXME: This should probably be an exception (bad workflow configuration). - self.logger.warning( - f'Page "{page_id}" region "{region.id}" already contains image data: skipping') + self.logger.warning(f'Page "{page_id}" region "{region.id}" already contains image data: skipping') continue shape = prep(shapes[i]) neighbours = [(regionj, maskj) for shapej, regionj, maskj in zip(shapes[:i] + shapes[i+1:], regions[:i] + regions[i+1:], masks[:i] + masks[i+1:]) if shape.intersects(shapej)] if neighbours: - segment_region_file_id = f"{output_file_id}_{region.id}" ret.images.append(self.process_segment( region, masks[i], polygons[i], neighbours, background_image, page_image, page_xywh, page_bin, page_id)) @@ -167,7 +165,6 @@ def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: str = No in zip(shapes[:j] + shapes[j+1:], lines[:j] + lines[j+1:], masks[:j] + masks[j+1:]) if shape.intersects(shapej)] if neighbours: - segment_line_file_id = f"{output_file_id}_{region.id}_{line.id}" ret.images.append(self.process_segment( line, masks[j], polygons[j], neighbours, background_image, region_image, region_coords, region_bin, page_id)) diff --git a/ocrd_cis/ocropy/denoise.py b/ocrd_cis/ocropy/denoise.py index 72757e0c..b3c219fb 100644 --- a/ocrd_cis/ocropy/denoise.py +++ b/ocrd_cis/ocropy/denoise.py @@ -57,7 +57,7 @@ def process_page_pcgts(self, *input_pcgts : Optional[OcrdPage], page_id : Option else: regions = page.get_AllRegions(classes=['Text'], order='reading-order') if not regions: - self.logger.warning('Page "%s" contains no text regions', page_id) + self.logger.warning(f'Page "{page_id}" contains no text regions') for region in regions: region_image, region_xywh = self.workspace.image_from_segment( region, page_image, page_xywh, @@ -69,7 +69,7 @@ def process_page_pcgts(self, *input_pcgts : Optional[OcrdPage], page_id : Option continue lines = region.get_TextLine() if not lines: - self.logger.warning('Page "%s" region "%s" contains no text lines', page_id, region.id) + self.logger.warning(f'Page "{page_id}" region "{region.id}" contains no text lines') for line in lines: line_image, line_xywh = self.workspace.image_from_segment( line, region_image, region_xywh, @@ -80,9 +80,9 @@ def process_page_pcgts(self, *input_pcgts : Optional[OcrdPage], page_id : Option def process_segment(self, segment, segment_image, segment_xywh, zoom) -> Optional[OcrdPageResultImage]: if not segment_image.width or not segment_image.height: - self.logger.warning("Skipping '%s' with zero size", file_id) + self.logger.warning(f"Skipping '{segment.id}' with zero size") return None - self.logger.info("About to despeckle '%s'", file_id) + self.logger.info(f"About to despeckle '{segment.id}'") bin_image = remove_noise(segment_image, maxsize=self.parameter['noise_maxsize']/zoom*300/72) # in pt # update PAGE (reference the image 
file): diff --git a/ocrd_cis/ocropy/deskew.py b/ocrd_cis/ocropy/deskew.py index 9f9f8b0a..84475d81 100644 --- a/ocrd_cis/ocropy/deskew.py +++ b/ocrd_cis/ocropy/deskew.py @@ -73,8 +73,7 @@ def process_page_pcgts(self, *input_pcgts : Optional[OcrdPage], page_id : Option # (we will overwrite @orientation anyway,) # abort if no such image can be produced: feature_filter='deskewed') - image = self._process_segment(region, region_image, region_coords, - "region '%s'" % region.id, page_id) + image = self._process_segment(region, region_image, region_coords, f"region '{region.id}'", page_id) if image: result.images.append(image) return result @@ -84,14 +83,14 @@ def _process_segment(self, segment, segment_image, segment_coords, segment_id, p self.logger.warning("Skipping %s with zero size", segment_id) return None angle0 = segment_coords['angle'] # deskewing (w.r.t. top image) already applied to segment_image - self.logger.info("About to deskew %s", segment_id) + self.logger.info(f"About to deskew {segment_id}") angle = deskew(segment_image, maxskew=self.parameter['maxskew']) # additional angle to be applied # segment angle: PAGE orientation is defined clockwise, # whereas PIL/ndimage rotation is in mathematical direction: orientation = -(angle + angle0) orientation = 180 - (180 - orientation) % 360 # map to [-179.999,180] segment.set_orientation(orientation) # also removes all deskewed AlternativeImages - self.logger.info("Found angle for %s: %.1f", segment_id, angle) + self.logger.info(f"Found angle for {segment_id}: %.1f", angle) # delegate reflection, rotation and re-cropping to core: if isinstance(segment, PageType): segment_image, segment_coords, _ = self.workspace.image_from_page( diff --git a/ocrd_cis/ocropy/dewarp.py b/ocrd_cis/ocropy/dewarp.py index 9902af95..302cf2e0 100644 --- a/ocrd_cis/ocropy/dewarp.py +++ b/ocrd_cis/ocropy/dewarp.py @@ -101,29 +101,28 @@ def process_page_pcgts(self, *input_pcgts : Optional[OcrdPage], page_id : Option regions = page.get_AllRegions(classes=['Text'], order='reading-order') if not regions: - self.logger.warning('Page "%s" contains no text regions', page_id) + self.logger.warning(f'Page "{page_id}" contains no text regions') for region in regions: region_image, region_xywh = self.workspace.image_from_segment( region, page_image, page_xywh) lines = region.get_TextLine() if not lines: - self.logger.warning('Region %s contains no text lines', region.id) + self.logger.warning(f'Region {region.id} contains no text lines') for line in lines: line_image, line_xywh = self.workspace.image_from_segment( line, region_image, region_xywh) - self.logger.info("About to dewarp page '%s' region '%s' line '%s'", - page_id, region.id, line.id) + self.logger.info(f"About to dewarp page '{page_id}' region '{region.id}' line '{line.id}'") try: dew_image = dewarp(line_image, self.lnorm, check=True, max_neighbour=self.parameter['max_neighbour'], zoom=zoom) except InvalidLine as err: - self.logger.error('cannot dewarp line "%s": %s', line.id, err) + self.logger.error(f'Cannot dewarp line "{line.id}": {err}') continue except InadequateLine as err: - self.logger.warning('cannot dewarp line "%s": %s', line.id, err) + self.logger.warning(f'cannot dewarp line "{line.id}": {err}') # as a fallback, simply pad the image vertically # (just as dewarping would do on average, so at least # this line has similar margins as the others): diff --git a/ocrd_cis/ocropy/recognize.py b/ocrd_cis/ocropy/recognize.py index 41576e43..f0c4b520 100644 --- a/ocrd_cis/ocropy/recognize.py +++ 
b/ocrd_cis/ocropy/recognize.py @@ -179,13 +179,13 @@ def process_lines(self, textlines, maxlevel, region_image, region_xywh): linegt = line.TextEquiv[0].Unicode else: linegt = '' - self.logger.debug("GT '%s': '%s'", line.id, linegt) + self.logger.debug(f"GT '{line.id}': '{linegt}'") # remove existing annotation below line level: line.set_TextEquiv([]) line.set_Word([]) if line_image.size[1] < 16: - self.logger.debug(f"ERROR: bounding box is too narrow at line {line.id}") + self.logger.debug(f"Error: bounding box is too narrow at line {line.id}") continue # resize image to 48 pixel height final_img, scale = resize_keep_ratio(line_image) @@ -194,7 +194,7 @@ def process_lines(self, textlines, maxlevel, region_image, region_xywh): try: linepred, clist, rlist, confidlist = recognize(final_img, self.pad, self.network, check=True) except Exception as err: - self.logger.debug(f'error processing line "{line.id}": {err}') + self.logger.debug(f'Error processing line "{line.id}": {err}') continue self.logger.debug(f"OCR '{line.id}': '{linepred}'") edits += Levenshtein.distance(linepred, linegt) diff --git a/ocrd_cis/ocropy/resegment.py b/ocrd_cis/ocropy/resegment.py index 0ef64687..d429c1de 100644 --- a/ocrd_cis/ocropy/resegment.py +++ b/ocrd_cis/ocropy/resegment.py @@ -126,14 +126,14 @@ def process_page_pcgts(self, *input_pcgts : Optional[OcrdPage], page_id : Option page.get_CustomRegion()) regions = page.get_AllRegions(classes=['Text']) if not regions: - self.logger.warning('Page "%s" contains no text regions', page_id) + self.logger.warning(f'Page "{page_id}" contains no text regions') elif level == 'page': lines = [line for region in regions for line in region.get_TextLine()] if lines: self._process_segment(page, page_image, page_coords, page_id, zoom, lines, ignore) else: - self.logger.warning('Page "%s" contains no text regions with lines', page_id) + self.logger.warning(f'Page "{page_id}" contains no text regions with lines', ) else: for region in regions: lines = region.get_TextLine() @@ -142,7 +142,7 @@ def process_page_pcgts(self, *input_pcgts : Optional[OcrdPage], page_id : Option region, page_image, page_coords, feature_selector='binarized') self._process_segment(region, region_image, region_coords, page_id, zoom, lines, ignore) else: - self.logger.warning('Page "%s" region "%s" contains no text lines', page_id, region.id) + self.logger.warning(f'Page "{page_id}" region "{region.id}" contains no text lines') return OcrdPageResult(pcgts) def _process_segment(self, parent, parent_image, parent_coords, page_id, zoom, lines, ignore): @@ -163,8 +163,7 @@ def _process_segment(self, parent, parent_image, parent_coords, page_id, zoom, l fullpage = False report = check_region(parent_bin, zoom) if report: - self.logger.warning('Invalid %s "%s": %s', tag, - page_id if fullpage else parent.id, report) + self.logger.warning(f'Invalid {tag} "{page_id if fullpage else parent.id}": {report}') return # get existing line labels: line_labels = np.zeros_like(parent_bin, bool) @@ -191,8 +190,7 @@ def _process_segment(self, parent, parent_image, parent_coords, page_id, zoom, l line_labels[i, line_y, line_x] = True # only text region(s) may contain new text lines for i, region in enumerate(set(line.parent_object_ for line in lines)): - self.logger.debug('unmasking area of text region "%s" for "%s"', - region.id, page_id if fullpage else parent.id) + self.logger.debug(f'Unmasking area of text region "{region.id}" for "{page_id if fullpage else parent.id}"') region_polygon = coordinates_of_segment(region, 
parent_image, parent_coords) region_polygon = make_valid(Polygon(region_polygon)) region_polygon = np.array(region_polygon.exterior.coords, int)[:-1] @@ -201,14 +199,14 @@ def _process_segment(self, parent, parent_image, parent_coords, page_id, zoom, l parent_bin.shape)] = False # mask/ignore overlapping neighbours for i, segment in enumerate(ignore): - self.logger.debug('masking area of %s "%s" for "%s"', type(segment).__name__[:-4], - segment.id, page_id if fullpage else parent.id) + self.logger.debug(f'Masking area of {type(segment).__name__[:-4]} "{segment.id}" for ' + f'"{page_id if fullpage else parent.id}"') segment_polygon = coordinates_of_segment(segment, parent_image, parent_coords) ignore_bin[draw.polygon(segment_polygon[:, 1], segment_polygon[:, 0], parent_bin.shape)] = True if method != 'lineest': - self.logger.debug('calculating connected component and distance transforms for "%s"', parent.id) + self.logger.debug(f'Calculating connected component and distance transforms for "{parent.id}"') bin = parent_bin & ~ ignore_bin components, _ = morph.label(bin) # estimate glyph scale (roughly) @@ -217,7 +215,7 @@ def _process_segment(self, parent, parent_image, parent_coords, page_id, zoom, l counts = np.sqrt(3 * counts) scale = int(np.median(counts[(5/zoom < counts) & (counts < 100/zoom)])) components *= (counts > 15/zoom)[components] - self.logger.debug("estimated scale: %d", scale) + self.logger.debug(f"Estimated scale: {scale}") else: scale = 43 if method == 'ccomps': @@ -235,7 +233,7 @@ def _process_segment(self, parent, parent_image, parent_coords, page_id, zoom, l new_labels = np.zeros_like(parent_bin, np.uint8) for i, line in enumerate(lines): if line.Baseline is None: - self.logger.warning("Skipping '%s' without baseline", line.id) + self.logger.warning(f"Skipping '{line.id}' without baseline") new_labels[line_labels[i]] = i + 1 continue line_baseline = baseline_of_segment(line, parent_coords) @@ -254,14 +252,13 @@ def _process_segment(self, parent, parent_image, parent_coords, page_id, zoom, l parent_bin, seps=ignore_bin, zoom=zoom, spread_dist=maxdist or scale/2, fullpage=fullpage, maxseps=0, maxcolseps=len(ignore), maximages=0) except Exception as err: - self.logger.error('Cannot line-segment %s "%s": %s', - tag, page_id if fullpage else parent.id, err) + self.logger.error(f'Cannot line-segment {tag} "{page_id if fullpage else parent.id}": {err}') return - self.logger.info("Found %d new line labels for %d existing lines on %s '%s'", - new_line_labels.max(), len(lines), tag, parent.id) + self.logger.info( + f"Found {new_line_labels.max()} new line labels for {len(lines)} existing lines on {tag} '{parent.id}'") # polygonalize and prepare comparison new_line_polygons, new_line_labels = masks2polygons(self.logger, - new_line_labels, new_baselines, parent_bin, '%s "%s"' % (tag, parent.id), + new_line_labels, new_baselines, parent_bin, name=f'{tag} "{parent.id}"', min_area=640/zoom/zoom) DSAVE('line_labels', [np.argmax(np.insert(line_labels, 0, 0, axis=0), axis=0), parent_bin]) DSAVE('new_line_labels', [new_line_labels, parent_bin]) @@ -345,31 +342,29 @@ def _process_segment(self, parent, parent_image, parent_coords, page_id, zoom, l for j, line in enumerate(lines): new_lines = np.nonzero(assignments == j)[0] if not np.prod(new_lines.shape): - self.logger.debug("no lines for '%s' match or fit", line.id) + self.logger.debug(f"no lines for '{line.id}' match or fit", ) continue covers = np.sum(covers_bg[new_lines,j]) if covers < threshold / 3: - self.logger.debug("new lines for 
'%s' only cover %.1f%% bg", - line.id, covers * 100) + self.logger.debug(f"new lines for '{line.id}' only cover %.1f%% bg", covers * 100) continue covers = np.sum(covers_fg[new_lines,j]) if covers < threshold: - self.logger.debug("new lines for '%s' only cover %.1f%% fg", - line.id, covers * 100) + self.logger.debug(f"new lines for '{line.id}' only cover %.1f%% fg", covers * 100) continue looses = (assignments < 0) & (covers_bg[:,j] > 0.1) if looses.any(): covers = np.sum(covers_bg[np.nonzero(looses)[0],j]) - self.logger.debug("new lines for '%s' would loose %d non-matching segments totalling %.1f%% bg", - line.id, np.count_nonzero(looses), covers * 100) + self.logger.debug( + f"new lines for '{line.id}' would loose {np.count_nonzero(looses)} non-matching segments " + f"totalling %.1f%% bg", covers * 100) continue line_count = np.count_nonzero(line_labels[j] & parent_bin) new_count = covers * line_count - self.logger.debug('Black pixels before/after resegment of line "%s": %d/%d', - line.id, line_count, new_count) + self.logger.debug(f'Black pixels before/after resegment of line "{line.id}": {line_count}/{new_count}') # combine all assigned new lines to single outline polygon if len(new_lines) > 1: - self.logger.debug("joining %d new line polygons for '%s'", len(new_lines), line.id) + self.logger.debug(f"joining {len(new_lines)} new line polygons for '{line.id}'") new_polygon = join_polygons([new_line_polygons[i] #intersections[(i, j)] for i in new_lines], loc=line.id, scale=scale) new_baseline = join_baselines(self.logger, [new_polygon.intersection(new_baselines[i]) @@ -379,7 +374,7 @@ def _process_segment(self, parent, parent_image, parent_coords, page_id, zoom, l parent_image, parent_coords) line_polygon = polygon_for_parent(line_polygon, line.parent_object_) if line_polygon is None: - self.logger.warning("Ignoring extant new polygon for line '%s'", line.id) + self.logger.warning(f"Ignoring extant new polygon for line '{line.id}'") return # annotate result: line.get_Coords().set_points(points_from_polygon(line_polygon)) @@ -394,7 +389,7 @@ def _process_segment(self, parent, parent_image, parent_coords, page_id, zoom, l if j == otherj: continue otherline = lines[otherj] - self.logger.debug("subtracting new '%s' from overlapping '%s'", line.id, otherline.id) + self.logger.debug(f"subtracting new '{line.id}' from overlapping '{otherline.id}'") other_polygon = diff_polygons(line_polygons[otherj].context, new_polygon) if other_polygon.is_empty: continue @@ -403,7 +398,7 @@ def _process_segment(self, parent, parent_image, parent_coords, page_id, zoom, l parent_image, parent_coords) other_polygon = polygon_for_parent(other_polygon, otherline.parent_object_) if other_polygon is None: - self.logger.warning("Ignoring extant new polygon for line '%s'", otherline.id) + self.logger.warning(f"Ignoring extant new polygon for line '{otherline.id}'") continue otherline.get_Coords().set_points(points_from_polygon(other_polygon)) @@ -434,29 +429,26 @@ def spread_dist(logger: Logger, lines, old_labels, new_labels, binarized, compon continue count = np.count_nonzero(old_label) if not count: - logger.warning("skipping zero-area line '%s'", line.id) + logger.warning(f"skipping zero-area line '{line.id}'") continue covers = np.count_nonzero(new_label) / count if covers < threshold / 3: - logger.debug("new line for '%s' only covers %.1f%% bg", - line.id, covers * 100) + logger.debug(f"new line for '%s' only covers %.1f%% bg", covers * 100) continue count = np.count_nonzero(old_label * binarized) if not count: - 
logger.warning("skipping binary-empty line '%s'", line.id) + logger.warning(f"skipping binary-empty line '{line.id}'") continue covers = np.count_nonzero(new_label * binarized) / count if covers < threshold: - logger.debug("new line for '%s' only covers %.1f%% fg", - line.id, covers * 100) + logger.debug(f"new line for '{line.id}' only covers %.1f%% fg", covers * 100) continue - logger.debug('Black pixels before/after resegment of line "%s": %d/%d', - line.id, count, covers * count) + logger.debug(f'Black pixels before/after resegment of line "{line.id}": {count}/{covers * count}') contours = [contour[:,::-1] # get x,y order again for contour, area in morph.find_contours(new_label)] #LOG.debug("joining %d subsegments for %s", len(contours), line.id) if len(contours) == 0: - logger.warning("no contours for %s - keeping", line.id) + logger.warning(f"no contours for {line.id} - keeping") continue else: # get alpha shape @@ -468,7 +460,7 @@ def spread_dist(logger: Logger, lines, old_labels, new_labels, binarized, compon polygon = coordinates_for_segment(poly, None, coords) polygon = polygon_for_parent(polygon, line.parent_object_) if polygon is None: - logger.warning("Ignoring extant line for %s", line.id) + logger.warning(f"Ignoring extant line for {line.id}") continue line.get_Coords().set_points(points_from_polygon(polygon)) diff --git a/ocrd_cis/ocropy/segment.py b/ocrd_cis/ocropy/segment.py index edb5751a..e8c4a1ed 100644 --- a/ocrd_cis/ocropy/segment.py +++ b/ocrd_cis/ocropy/segment.py @@ -75,8 +75,6 @@ def masks2polygons(logger: Logger, bg_labels, baselines, fg_bin, name, min_area= - these polygons as a list of label, polygon, baseline tuples, and - a Numpy array of new background labels for that list. """ - if not logger: - raise ValueError(f"Logger has not been passed by the caller") # find sharp baseline if baselines is not None: def getx(xy): @@ -93,8 +91,7 @@ def getx(xy): bg_mask = np.array(bg_labels == label, bool) if not np.count_nonzero(bg_mask * fg_bin): # ignore if missing foreground - logger.debug('skipping label %d in %s due to empty fg', - label, name) + logger.debug(f'Skipping label {label} in {name} due to empty fg') continue # simplify to convex hull if simplify is not None: @@ -102,8 +99,8 @@ def getx(xy): conflicts = np.setdiff1d(hull * simplify, bg_mask * simplify) if conflicts.any(): - logger.debug('Cannot simplify %d: convex hull would create additional intersections %s', - label, str(conflicts)) + logger.debug( + f'Cannot simplify {label}: convex hull would create additional intersections {str(conflicts)}') else: bg_mask = hull if open_holes: @@ -131,8 +128,8 @@ def getx(xy): if len(hole) < 3: idx_hole = hier[0, idx_hole, 0] continue - logger.debug("label %d contour %d [%d pts] has hole %d [%d pts]", - label, idx, len(contour), idx_hole, len(hole)) + logger.debug( + f"Label {label} contour {idx} [{len(contour)} pts] has hole {idx_hole} [{len(hole)} pts]") #plot_poly(hole, 'blue') # cut child from outside... 
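An aside on the covers_fg / covers_bg checks in the resegment.py hunks above (and the similar test in spread_dist): the value compared against min_fraction is roughly the fraction of a line's foreground pixels that the newly computed label still covers. A toy sketch with made-up masks, purely to illustrate that arithmetic:

    import numpy as np

    threshold = 0.9                       # e.g. the min_fraction parameter
    old_label = np.zeros((6, 10), bool)
    old_label[1:5, 1:9] = True            # hypothetical existing line mask
    new_label = np.zeros((6, 10), bool)
    new_label[2:5, 1:9] = True            # hypothetical re-segmented line mask
    binarized = np.ones((6, 10), bool)    # hypothetical page foreground

    count = np.count_nonzero(old_label & binarized)            # fg pixels of the old line
    covers = np.count_nonzero(new_label & binarized) / count   # fraction still covered
    print(f"{covers:.1%} covered, keep new outline: {covers >= threshold}")

Below the threshold, the existing line polygon is kept unchanged, as in the hunks above.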
# first get nearest point on child @@ -173,7 +170,7 @@ def getx(xy): diff2 = (interpol[interpol_idx+1:interpol_idx+2] - cispoint2) // 5 cispoint1 = cispoint1 + diff1 cispoint2 = cispoint2 + diff2 - logger.debug("stitching at interpolation pos %d hole pos %d", interpol_idx, hole_idx) + logger.debug(f"Stitching at interpolation pos {interpol_idx} hole pos {hole_idx}") # now stitch together outer (up to cision), inner (re-arranged around cision), outer (rest) # (this works, because inner contours have inverse direction) contour = np.concatenate([contour[:contour_idx], cispoint1, @@ -182,7 +179,7 @@ def getx(xy): #plot_poly(contour, 'green') idx_hole = hier[0, idx_hole, 0] #plot_poly(contour, 'red') - logger.debug("adding label %d contour %d [%d pts]", label, idx, len(contour)) + logger.debug(f"Adding label {label} contour {idx} [{len(contour)} pts]") contours.append(contour) idx = hier[0, idx, 0] else: @@ -208,8 +205,7 @@ def getx(xy): contour = contours[i] area = areas[i] if min_area and area < min_area and area / total_area < 0.1: - logger.warning('Label %d contour %d is too small (%d/%d) in %s', - label, i, area, total_area, name) + logger.warning(f'Label {label} contour {i} is too small ({area}/{total_area}) in {name}') continue # simplify shape: # can produce invalid (self-intersecting) polygons: @@ -226,7 +222,7 @@ def getx(xy): logger.warning(explain_validity(polygon)) poly = polygon.exterior.coords[:-1] # keep open if len(poly) < 4: - logger.warning('Label %d contour %d for %s has less than 4 points', label, i, name) + logger.warning(f'Label {label} contour {i} for {name} has less than 4 points') continue # get baseline segments intersecting with this line mask # and concatenate them from left to right @@ -369,7 +365,7 @@ def process_page_pcgts(self, *input_pcgts : Optional[OcrdPage], page_id : Option if regions: # page is already region-segmented if overwrite_regions: - self.logger.info('removing existing TextRegions in page "%s"', page_id) + self.logger.info(f'Removing existing TextRegions in page "{page_id}"', ) # we could remove all other region types as well, # but this is more flexible (for workflows with # specialized separator/image/table detectors): @@ -377,7 +373,7 @@ def process_page_pcgts(self, *input_pcgts : Optional[OcrdPage], page_id : Option page.set_ReadingOrder(None) ro = None else: - self.logger.warning('keeping existing TextRegions in page "%s"', page_id) + self.logger.warning(f'Keeping existing TextRegions in page "{page_id}"', ) ignore.extend(regions) # create reading order if necessary if not ro or overwrite_order: @@ -404,20 +400,20 @@ def process_page_pcgts(self, *input_pcgts : Optional[OcrdPage], page_id : Option ignore.extend(page.get_TextRegion()) regions = list(page.get_TableRegion()) if not regions: - self.logger.warning('Page "%s" contains no table regions', page_id) + self.logger.warning(f'Page "{page_id}" contains no table regions') for region in regions: subregions = region.get_TextRegion() if subregions: # table is already cell-segmented if overwrite_regions: - self.logger.info('removing existing TextRegions in table "%s"', region.id) + self.logger.info(f'Removing existing TextRegions in table "{region.id}"') region.set_TextRegion([]) roelem = reading_order.get(region.id) # replace by empty group with same index and ref # (which can then take the cells as subregions) reading_order[region.id] = page_subgroup_in_reading_order(self.logger, roelem) else: - self.logger.warning('skipping table "%s" with existing TextRegions', region.id) + 
self.logger.warning(f'Skipping table "{region.id}" with existing TextRegions') continue # TODO: also allow grayscale_normalized (try/except?) region_image, region_coords = self.workspace.image_from_segment( @@ -428,19 +424,22 @@ def process_page_pcgts(self, *input_pcgts : Optional[OcrdPage], page_id : Option # create reading order group if necessary roelem = reading_order.get(region.id) if not roelem: - self.logger.warning("Page '%s' table region '%s' is not referenced in reading order (%s)", - page_id, region.id, "no target to add cells to") + self.logger.warning( + f"Page '{page_id}' table region '{region.id}' is not referenced in reading order " + f"(no target to add cells to)") elif overwrite_order: # replace by empty ordered group with same (index and) ref # (which can then take the cells as subregions) roelem = page_subgroup_in_reading_order(self.logger, roelem) reading_order[region.id] = roelem elif isinstance(roelem, (OrderedGroupType, OrderedGroupIndexedType)): - self.logger.warning("Page '%s' table region '%s' already has an ordered group (%s)", - page_id, region.id, "cells will be appended") + self.logger.warning( + f"Page '{page_id}' table region '{region.id}' already has an ordered group " + f"(cells will be appended)") elif isinstance(roelem, (UnorderedGroupType, UnorderedGroupIndexedType)): - self.logger.warning("Page '%s' table region '%s' already has an unordered group (%s)", - page_id, region.id, "cells will not be appended") + self.logger.warning( + f"Page '{page_id}' table region '{region.id}' already has an unordered group " + f"(cells will not be appended)") roelem = None else: # replace regionRef(Indexed) by group with same index and ref @@ -468,14 +467,14 @@ def process_page_pcgts(self, *input_pcgts : Optional[OcrdPage], page_id : Option region.add_TextRegion(subregion) regions.append(subregion) if not regions: - self.logger.warning('Page "%s" contains no text regions', page_id) + self.logger.warning(f'Page "{page_id}" contains no text regions') for region in regions: if region.get_TextLine(): if overwrite_lines: - self.logger.info('removing existing TextLines in page "%s" region "%s"', page_id, region.id) + self.logger.info(f'Removing existing TextLines in page "{page_id}" region "{region.id}"') region.set_TextLine([]) else: - self.logger.warning('keeping existing TextLines in page "%s" region "%s"', page_id, region.id) + self.logger.warning(f'Keeping existing TextLines in page "{page_id}" region "{region.id}"') ignore.extend(region.get_TextLine()) # TODO: also allow grayscale_normalized (try/except?) region_image, region_coords = self.workspace.image_from_segment( @@ -517,8 +516,8 @@ def _process_element(self, element, ignore, image, coords, zoom=1.0, rogroup=Non sep_bin = np.zeros_like(element_bin, bool) ignore_labels = np.zeros_like(element_bin, int) for i, segment in enumerate(ignore): - self.logger.debug(f'masking foreground of {type(segment).__name__[:-4]} ' - f'"{segment.id}" for "{element.id}"') + self.logger.debug( + f'Masking foreground of {type(segment).__name__[:-4]} "{segment.id}" for "{element.id}"') # mark these segments (e.g. 
separator regions, tables, images) # for workflows where they have been detected already; # these will be: @@ -552,7 +551,7 @@ def _process_element(self, element, ignore, image, coords, zoom=1.0, rogroup=Non fullpage = False report = check_region(element_bin, zoom) suffix = element.id + '.IMG-CLIP' - self.logger.info(f'computing line segmentation for {element_name} "{element.id}"') + self.logger.info(f'Computing line segmentation for {element_name} "{element.id}"') # TODO: we should downscale if DPI is large enough to save time try: if report: @@ -577,8 +576,7 @@ def _process_element(self, element, ignore, image, coords, zoom=1.0, rogroup=Non self.logger.error(f'Cannot line-segment {element_name} "{element.id}": {err}') return None - self.logger.info(f'Found {len(np.unique(line_labels)) - 1} text lines ' - f'for {element_name} "{element.id}"') + self.logger.info(f'Found {len(np.unique(line_labels)) - 1} text lines for {element_name} "{element.id}"') # post-process line labels if isinstance(element, (PageType, TableRegionType)): # aggregate text lines to text regions @@ -599,8 +597,8 @@ def _process_element(self, element, ignore, image, coords, zoom=1.0, rogroup=Non gap_height=self.parameter['gap_height'], gap_width=self.parameter['gap_width'], scale=scale, zoom=zoom) - self.logger.info(f'Found {len(np.unique(region_labels)) - 1} text regions ' - f'for {element_name} "{element.id}"') + self.logger.info( + f'Found {len(np.unique(region_labels)) - 1} text regions for {element_name} "{element.id}"') except Exception as err: self.logger.error(f'Cannot region-segment {element_name} "{element.id}": {err}') region_labels = np.where(line_labels > len(ignore), 1 + len(ignore), line_labels) @@ -630,7 +628,7 @@ def _process_element(self, element, ignore, image, coords, zoom=1.0, rogroup=Non # (no new region, no actual text lines) region_line_labels0 = np.intersect1d(region_line_labels0, ignore_labels) assert len(region_line_labels0) == 1, \ - (f'region label "{region_label}" has both existing regions and new lines ' + (f'Region label "{region_label}" has both existing regions and new lines ' f'({str(region_line_labels0)})') region = ignore[region_line_labels0[0] - 1] if rogroup and region.parent_object_ is element and not isinstance(region, SeparatorRegionType): @@ -907,9 +905,9 @@ def join_baselines(logger: Logger, baselines, loc=''): elif geom.geom_type == 'MultiLineString': lines.extend(geom) else: - logger.warning("ignoring baseline subtype %s in %s", geom.geom_type, loc) + logger.warning(f"Ignoring baseline subtype {geom.geom_type} in {loc}") else: - logger.warning("ignoring baseline type %s in %s", baseline.geom_type, loc) + logger.warning(f"Ignoring baseline type {baseline.geom_type} in {loc}") nlines = len(lines) if nlines == 0: return None @@ -971,7 +969,7 @@ def join_baselines(logger: Logger, baselines, loc=''): else: chains.append([prevl, nextl]) if len(chains) > 1: - logger.warning("baseline merge impossible (no spanning tree) in %s", loc) + logger.warning(f"Baseline merge impossible (no spanning tree) in {loc}") return None assert len(chains) == 1, chains assert len(chains[0]) == nlines, chains[0] @@ -983,7 +981,7 @@ def join_baselines(logger: Logger, baselines, loc=''): coords.extend(line.normalize().coords) result = LineString(coords) if result.is_empty: - logger.warning("baseline merge is empty in %s", loc) + logger.warning(f"Baseline merge is empty in {loc}") return None assert result.geom_type == 'LineString', result.wkt result = set_precision(result, 1.0) From 
d8542c20d5e39c1bf8670205a75c039f25198bf8 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Fri, 16 Aug 2024 16:39:43 +0200 Subject: [PATCH 57/97] spacing: train --- ocrd_cis/ocropy/train.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ocrd_cis/ocropy/train.py b/ocrd_cis/ocropy/train.py index 8f224b86..6c627231 100644 --- a/ocrd_cis/ocropy/train.py +++ b/ocrd_cis/ocropy/train.py @@ -19,8 +19,8 @@ def deletefiles(filelist): for file in filelist: if exists(file): remove(file) - if exists(file[:-3]+'gt.txt'): - remove(file[:-3]+'gt.txt') + if exists(file[:-3] + 'gt.txt'): + remove(file[:-3] + 'gt.txt') def resize_keep_ratio(image, baseheight=48): hpercent = (baseheight / float(image.size[1])) From d7859714ec6622a0b9294d9dc54d9f3e35f4606c Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Fri, 16 Aug 2024 16:39:54 +0200 Subject: [PATCH 58/97] spacing: segment --- ocrd_cis/ocropy/segment.py | 41 ++++++++++++++++++++------------------ 1 file changed, 22 insertions(+), 19 deletions(-) diff --git a/ocrd_cis/ocropy/segment.py b/ocrd_cis/ocropy/segment.py index e8c4a1ed..75be2a11 100644 --- a/ocrd_cis/ocropy/segment.py +++ b/ocrd_cis/ocropy/segment.py @@ -58,7 +58,9 @@ lines2regions ) -def masks2polygons(logger: Logger, bg_labels, baselines, fg_bin, name, min_area=None, simplify=None, open_holes=False, reorder=True): + +def masks2polygons(logger: Logger, bg_labels, baselines, fg_bin, name, min_area=None, simplify=None, open_holes=False, + reorder=True): """Convert label masks into polygon coordinates. Given a Numpy array of background labels ``bg_labels``, @@ -79,6 +81,7 @@ def masks2polygons(logger: Logger, bg_labels, baselines, fg_bin, name, min_area= if baselines is not None: def getx(xy): return xy[0] + baselines = [LineString(sorted([p[::-1] for p in line], key=getx)).simplify(5) for line in baselines if len(line) >= 2] @@ -96,8 +99,7 @@ def getx(xy): # simplify to convex hull if simplify is not None: hull = convex_hull_image(bg_mask.astype(np.uint8)).astype(bool) - conflicts = np.setdiff1d(hull * simplify, - bg_mask * simplify) + conflicts = np.setdiff1d(hull * simplify, bg_mask * simplify) if conflicts.any(): logger.debug( f'Cannot simplify {label}: convex hull would create additional intersections {str(conflicts)}') @@ -143,10 +145,10 @@ def getx(xy): contourtics = np.maximum(1, np.linalg.norm(contour2, axis=2).astype(int)[:,0] // 10) interpol = [] for i, ntics in enumerate(contourtics): - interpol.extend(np.array(contour[i:i+1] + - contour2[i:i+1] * - np.linspace(0, 1, ntics)[:,np.newaxis,np.newaxis], - int)) + interpol.extend(np.array( + contour[i:i + 1] + + contour2[i:i + 1] * + np.linspace(0, 1, ntics)[:, np.newaxis, np.newaxis], int)) interpol.append(contour[-1]) interpol = np.array(interpol) contourtics = np.insert(np.cumsum(contourtics), 0, 0) @@ -159,23 +161,24 @@ def getx(xy): contour_idx2 = contour_idx if contour_idx2 >= len(contour): contour_idx2 = 0 - cispoint1 = cispoint2 = interpol[interpol_idx:interpol_idx+1] + cispoint1 = cispoint2 = interpol[interpol_idx:interpol_idx + 1] if interpol_idx == 0: diff1 = (interpol[-1:] - cispoint1) // 5 else: - diff1 = (interpol[interpol_idx-1:interpol_idx] - cispoint1) // 5 + diff1 = (interpol[interpol_idx - 1: interpol_idx] - cispoint1) // 5 if interpol_idx + 1 >= len(interpol): diff2 = (interpol[0:1] - cispoint2) // 5 else: - diff2 = (interpol[interpol_idx+1:interpol_idx+2] - cispoint2) // 5 + diff2 = (interpol[interpol_idx + 1: interpol_idx + 2] - cispoint2) // 5 cispoint1 = cispoint1 + diff1 cispoint2 = cispoint2 
+ diff2 logger.debug(f"Stitching at interpolation pos {interpol_idx} hole pos {hole_idx}") # now stitch together outer (up to cision), inner (re-arranged around cision), outer (rest) # (this works, because inner contours have inverse direction) - contour = np.concatenate([contour[:contour_idx], cispoint1, - hole[hole_idx:], hole[:hole_idx], - cispoint2, contour[contour_idx:]]) + contour = np.concatenate( + [contour[:contour_idx], cispoint1, + hole[hole_idx:], hole[:hole_idx], + cispoint2, contour[contour_idx:]]) #plot_poly(contour, 'green') idx_hole = hier[0, idx_hole, 0] #plot_poly(contour, 'red') @@ -210,7 +213,7 @@ def getx(xy): # simplify shape: # can produce invalid (self-intersecting) polygons: #polygon = cv2.approxPolyDP(contour, 2, False)[:, 0, ::] # already ordered x,y - polygon = contour[:, 0, ::] # already ordered x,y + polygon = contour[:, 0, ::] # already ordered x,y # simplify and validate: polygon = Polygon(polygon) if not polygon.is_valid: @@ -220,22 +223,22 @@ def getx(xy): if not polygon.is_valid: #LOG.debug(polygon.wkt) logger.warning(explain_validity(polygon)) - poly = polygon.exterior.coords[:-1] # keep open + poly = polygon.exterior.coords[:-1] # keep open if len(poly) < 4: logger.warning(f'Label {label} contour {i} for {name} has less than 4 points') continue # get baseline segments intersecting with this line mask # and concatenate them from left to right if baselines is not None: - base = join_baselines(logger, [baseline.intersection(polygon) - for baseline in baselines - if baseline.intersects(polygon)], name) + base = join_baselines( + logger, + [baseline.intersection(polygon) for baseline in baselines if baseline.intersects(polygon)], name) if base is not None: base = base.coords else: base = None results.append((label, poly, base)) - result_labels[contour_labels == i+1] = len(results) + result_labels[contour_labels == i + 1] = len(results) return results, result_labels From 7ca78a97db34559ebf1a8dd819ea08e5415ec8d9 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Fri, 16 Aug 2024 16:40:08 +0200 Subject: [PATCH 59/97] spacing: resegment --- ocrd_cis/ocropy/resegment.py | 94 +++++++++++++++++------------------- 1 file changed, 43 insertions(+), 51 deletions(-) diff --git a/ocrd_cis/ocropy/resegment.py b/ocrd_cis/ocropy/resegment.py index d429c1de..48bb0d40 100644 --- a/ocrd_cis/ocropy/resegment.py +++ b/ocrd_cis/ocropy/resegment.py @@ -144,11 +144,11 @@ def process_page_pcgts(self, *input_pcgts : Optional[OcrdPage], page_id : Option else: self.logger.warning(f'Page "{page_id}" region "{region.id}" contains no text lines') return OcrdPageResult(pcgts) - + def _process_segment(self, parent, parent_image, parent_coords, page_id, zoom, lines, ignore): threshold = self.parameter['min_fraction'] method = self.parameter['method'] - maxdist = self.parameter['spread']/zoom*300/72 # in pt + maxdist = self.parameter['spread'] / zoom * 300 / 72 # in pt # prepare line segmentation parent_array = pil2array(parent_image) #parent_array, _ = common.binarize(parent_array, maxskew=0) # just in case still raw @@ -172,7 +172,7 @@ def _process_segment(self, parent, parent_image, parent_coords, page_id, zoom, l for i, line in enumerate(lines): if self.parameter['baseline_only'] and line.Baseline: line_base = baseline_of_segment(line, parent_coords) - line_poly = polygon_from_baseline(line_base, 30/zoom) + line_poly = polygon_from_baseline(line_base, 30 / zoom) else: line_poly = coordinates_of_segment(line, parent_image, parent_coords) line_poly = make_valid(Polygon(line_poly)) @@ 
-184,9 +184,7 @@ def _process_segment(self, parent, parent_image, parent_coords, page_id, zoom, l # (causing negative/above-max indices), either fully or partially, # then this will silently ignore them. The caller does not need # to concern herself with this. - line_y, line_x = draw.polygon(polygon[:, 1], - polygon[:, 0], - parent_bin.shape) + line_y, line_x = draw.polygon(polygon[:, 1], polygon[:, 0], parent_bin.shape) line_labels[i, line_y, line_x] = True # only text region(s) may contain new text lines for i, region in enumerate(set(line.parent_object_ for line in lines)): @@ -194,17 +192,13 @@ def _process_segment(self, parent, parent_image, parent_coords, page_id, zoom, l region_polygon = coordinates_of_segment(region, parent_image, parent_coords) region_polygon = make_valid(Polygon(region_polygon)) region_polygon = np.array(region_polygon.exterior.coords, int)[:-1] - ignore_bin[draw.polygon(region_polygon[:, 1], - region_polygon[:, 0], - parent_bin.shape)] = False + ignore_bin[draw.polygon(region_polygon[:, 1], region_polygon[:, 0], parent_bin.shape)] = False # mask/ignore overlapping neighbours for i, segment in enumerate(ignore): self.logger.debug(f'Masking area of {type(segment).__name__[:-4]} "{segment.id}" for ' f'"{page_id if fullpage else parent.id}"') segment_polygon = coordinates_of_segment(segment, parent_image, parent_coords) - ignore_bin[draw.polygon(segment_polygon[:, 1], - segment_polygon[:, 0], - parent_bin.shape)] = True + ignore_bin[draw.polygon(segment_polygon[:, 1], segment_polygon[:, 0], parent_bin.shape)] = True if method != 'lineest': self.logger.debug(f'Calculating connected component and distance transforms for "{parent.id}"') bin = parent_bin & ~ ignore_bin @@ -213,8 +207,8 @@ def _process_segment(self, parent, parent_image, parent_coords, page_id, zoom, l _, counts = np.unique(components, return_counts=True) if counts.shape[0] > 1: counts = np.sqrt(3 * counts) - scale = int(np.median(counts[(5/zoom < counts) & (counts < 100/zoom)])) - components *= (counts > 15/zoom)[components] + scale = int(np.median(counts[(5 / zoom < counts) & (counts < 100 / zoom)])) + components *= (counts > 15 / zoom)[components] self.logger.debug(f"Estimated scale: {scale}") else: scale = 43 @@ -244,12 +238,12 @@ def _process_segment(self, parent, parent_image, parent_coords, page_id, zoom, l parent_bin.shape) new_labels[line_y, line_x] = i + 1 spread_dist(self.logger, lines, line_labels, new_labels, parent_bin, components, parent_coords, - maxdist=maxdist or scale/2, loc=parent.id, threshold=threshold) + maxdist=maxdist or scale / 2, loc=parent.id, threshold=threshold) return try: # TODO: 'scale' passed as a param may not be always defined (mehmedGIT) new_line_labels, new_baselines, _, _, _, scale = compute_segmentation( - parent_bin, seps=ignore_bin, zoom=zoom, spread_dist=maxdist or scale/2, + parent_bin, seps=ignore_bin, zoom=zoom, spread_dist=maxdist or scale / 2, fullpage=fullpage, maxseps=0, maxcolseps=len(ignore), maximages=0) except Exception as err: self.logger.error(f'Cannot line-segment {tag} "{page_id if fullpage else parent.id}": {err}') @@ -257,13 +251,13 @@ def _process_segment(self, parent, parent_image, parent_coords, page_id, zoom, l self.logger.info( f"Found {new_line_labels.max()} new line labels for {len(lines)} existing lines on {tag} '{parent.id}'") # polygonalize and prepare comparison - new_line_polygons, new_line_labels = masks2polygons(self.logger, - new_line_labels, new_baselines, parent_bin, name=f'{tag} "{parent.id}"', - min_area=640/zoom/zoom) + 
new_line_polygons, new_line_labels = masks2polygons( + self.logger, new_line_labels, new_baselines, parent_bin, name=f'{tag} "{parent.id}"', + min_area=640 / zoom / zoom) DSAVE('line_labels', [np.argmax(np.insert(line_labels, 0, 0, axis=0), axis=0), parent_bin]) DSAVE('new_line_labels', [new_line_labels, parent_bin]) - new_line_polygons, new_baselines = list(zip(*[(Polygon(poly), LineString(base)) - for _, poly, base in new_line_polygons])) or ([], []) + new_line_polygons, new_baselines = list(zip( + *[(Polygon(poly), LineString(base)) for _, poly, base in new_line_polygons])) or ([], []) # polygons for intersecting pairs intersections = dict() # ratio of overlap between intersection and new line @@ -281,12 +275,12 @@ def _process_segment(self, parent, parent_image, parent_coords, page_id, zoom, l inter = make_intersection(line_poly.context, new_line_poly) if not inter: continue - new_line_mask = (new_line_labels == i+1) & parent_bin + new_line_mask = (new_line_labels == i + 1) & parent_bin line_mask = line_labels[j] & parent_bin inter_mask = new_line_mask & line_mask if (not np.count_nonzero(inter_mask) or - not np.count_nonzero(new_line_mask) or - not np.count_nonzero(line_mask)): + not np.count_nonzero(new_line_mask) or + not np.count_nonzero(line_mask)): continue intersections[(i, j)] = inter fits_bg[i, j] = inter.area / new_line_poly.area @@ -344,17 +338,17 @@ def _process_segment(self, parent, parent_image, parent_coords, page_id, zoom, l if not np.prod(new_lines.shape): self.logger.debug(f"no lines for '{line.id}' match or fit", ) continue - covers = np.sum(covers_bg[new_lines,j]) + covers = np.sum(covers_bg[new_lines, j]) if covers < threshold / 3: self.logger.debug(f"new lines for '{line.id}' only cover %.1f%% bg", covers * 100) continue - covers = np.sum(covers_fg[new_lines,j]) + covers = np.sum(covers_fg[new_lines, j]) if covers < threshold: self.logger.debug(f"new lines for '{line.id}' only cover %.1f%% fg", covers * 100) continue - looses = (assignments < 0) & (covers_bg[:,j] > 0.1) + looses = (assignments < 0) & (covers_bg[:, j] > 0.1) if looses.any(): - covers = np.sum(covers_bg[np.nonzero(looses)[0],j]) + covers = np.sum(covers_bg[np.nonzero(looses)[0], j]) self.logger.debug( f"new lines for '{line.id}' would loose {np.count_nonzero(looses)} non-matching segments " f"totalling %.1f%% bg", covers * 100) @@ -365,13 +359,12 @@ def _process_segment(self, parent, parent_image, parent_coords, page_id, zoom, l # combine all assigned new lines to single outline polygon if len(new_lines) > 1: self.logger.debug(f"joining {len(new_lines)} new line polygons for '{line.id}'") - new_polygon = join_polygons([new_line_polygons[i] #intersections[(i, j)] - for i in new_lines], loc=line.id, scale=scale) - new_baseline = join_baselines(self.logger, [new_polygon.intersection(new_baselines[i]) - for i in new_lines], loc=line.id) + # intersections[(i, j)] + new_polygon = join_polygons([new_line_polygons[i] for i in new_lines], loc=line.id, scale=scale) + new_baseline = join_baselines( + self.logger, [new_polygon.intersection(new_baselines[i]) for i in new_lines], loc=line.id) # convert back to absolute (page) coordinates: - line_polygon = coordinates_for_segment(new_polygon.exterior.coords[:-1], - parent_image, parent_coords) + line_polygon = coordinates_for_segment(new_polygon.exterior.coords[:-1], parent_image, parent_coords) line_polygon = polygon_for_parent(line_polygon, line.parent_object_) if line_polygon is None: self.logger.warning(f"Ignoring extant new polygon for line '{line.id}'") @@ 
-379,8 +372,7 @@ def _process_segment(self, parent, parent_image, parent_coords, page_id, zoom, l # annotate result: line.get_Coords().set_points(points_from_polygon(line_polygon)) if new_baseline is not None: - new_baseline = coordinates_for_segment(new_baseline.coords, - parent_image, parent_coords) + new_baseline = coordinates_for_segment(new_baseline.coords, parent_image, parent_coords) line.set_Baseline(BaselineType(points=points_from_polygon(new_baseline))) line_polygons[j] = prep(new_polygon) # now also ensure the assigned lines do not overlap other existing lines @@ -394,20 +386,22 @@ def _process_segment(self, parent, parent_image, parent_coords, page_id, zoom, l if other_polygon.is_empty: continue # convert back to absolute (page) coordinates: - other_polygon = coordinates_for_segment(other_polygon.exterior.coords[:-1], - parent_image, parent_coords) + other_polygon = coordinates_for_segment( + other_polygon.exterior.coords[:-1], parent_image, parent_coords) other_polygon = polygon_for_parent(other_polygon, otherline.parent_object_) if other_polygon is None: self.logger.warning(f"Ignoring extant new polygon for line '{otherline.id}'") continue otherline.get_Coords().set_points(points_from_polygon(other_polygon)) -def spread_dist(logger: Logger, lines, old_labels, new_labels, binarized, components, coords, - maxdist=43, loc='', threshold=0.9): + +def spread_dist( + logger: Logger, lines, old_labels, new_labels, binarized, components, coords, maxdist=43, loc='', + threshold=0.9): """redefine line coordinates by contourizing spread of connected components propagated from new labels""" DSAVE('seeds', [new_labels, (components>0)]) # allocate to connected components consistently - # (ignoring smallest components like punctuation) + # (ignoring the smallest components like punctuation) # but when there are conflicts, meet in the middle via watershed new_labels2 = morph.propagate_labels(components > 0, new_labels, conflict=0) new_labels2 = segmentation.watershed(new_labels2, markers=new_labels, mask=(components > 0)) @@ -415,7 +409,7 @@ def spread_dist(logger: Logger, lines, old_labels, new_labels, binarized, compon # dilate/grow labels from connected components against each other and bg new_labels = morph.spread_labels(new_labels2, maxdist=maxdist) DSAVE('spread', new_labels) - # now propagate again to catch smallest components like punctuation + # now propagate again to catch the smallest components like punctuation new_labels2 = morph.propagate_labels(binarized, new_labels, conflict=0) new_labels2 = segmentation.watershed(new_labels2, markers=new_labels, mask=binarized) DSAVE('propagated-again', [new_labels2, binarized & (new_labels2==0)]) @@ -444,7 +438,7 @@ def spread_dist(logger: Logger, lines, old_labels, new_labels, binarized, compon logger.debug(f"new line for '{line.id}' only covers %.1f%% fg", covers * 100) continue logger.debug(f'Black pixels before/after resegment of line "{line.id}": {count}/{covers * count}') - contours = [contour[:,::-1] # get x,y order again + contours = [contour[:, :: -1] # get x,y order again for contour, area in morph.find_contours(new_label)] #LOG.debug("joining %d subsegments for %s", len(contours), line.id) if len(contours) == 0: @@ -452,10 +446,9 @@ def spread_dist(logger: Logger, lines, old_labels, new_labels, binarized, compon continue else: # get alpha shape - poly = join_polygons([make_valid(Polygon(contour)) - for contour in contours - if len(contour) >= 4], - loc=line.id, scale=maxdist) + poly = join_polygons( + [make_valid(Polygon(contour)) 
for contour in contours if len(contour) >= 4], + loc=line.id, scale=maxdist) poly = poly.exterior.coords[:-1] polygon = coordinates_for_segment(poly, None, coords) polygon = polygon_for_parent(polygon, line.parent_object_) @@ -472,9 +465,8 @@ def baseline_of_segment(segment, coords): # zzz should go into core ocrd_utils def polygon_from_baseline(baseline, scale): - ltr = baseline[0,0] < baseline[-1,0] + ltr = baseline[0, 0] < baseline[-1, 0] # left-hand side if left-to-right, and vice versa - polygon = make_valid(join_polygons([LineString(baseline).buffer(scale * (-1) ** ltr, - single_sided=True)], - scale=scale)) + polygon = make_valid(join_polygons( + [LineString(baseline).buffer(scale * (-1) ** ltr, single_sided=True)], scale=scale)) return polygon From 1004b431e451be4288aa98054dff843bce3e306b Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Fri, 16 Aug 2024 16:52:51 +0200 Subject: [PATCH 60/97] spacing: rest --- ocrd_cis/ocropy/binarize.py | 11 ++++++----- ocrd_cis/ocropy/clip.py | 34 ++++++++++++++++++---------------- ocrd_cis/ocropy/denoise.py | 9 ++++----- ocrd_cis/ocropy/deskew.py | 22 +++++++++++----------- ocrd_cis/ocropy/dewarp.py | 21 ++++++++++----------- ocrd_cis/ocropy/recognize.py | 35 +++++++++++++++-------------------- ocrd_cis/ocropy/resegment.py | 2 +- ocrd_cis/ocropy/segment.py | 2 +- 8 files changed, 66 insertions(+), 70 deletions(-) diff --git a/ocrd_cis/ocropy/binarize.py b/ocrd_cis/ocropy/binarize.py index 782dd578..35b28c5a 100644 --- a/ocrd_cis/ocropy/binarize.py +++ b/ocrd_cis/ocropy/binarize.py @@ -38,14 +38,14 @@ def binarize(logger: Logger, pil_image, method='ocropy', maxskew=2, threshold=0. if method == 'global': # global thresholding - _, th = cv2.threshold(img,threshold*255,255,cv2.THRESH_BINARY) + _, th = cv2.threshold(img, threshold * 255, 255, cv2.THRESH_BINARY) elif method == 'otsu': # Otsu's thresholding - _, th = cv2.threshold(img,0,255,cv2.THRESH_BINARY+cv2.THRESH_OTSU) + _, th = cv2.threshold(img, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU) elif method == 'gauss-otsu': # Otsu's thresholding after Gaussian filtering blur = cv2.GaussianBlur(img, (5, 5), 0) - _, th = cv2.threshold(blur, 0, 255, cv2.THRESH_BINARY+cv2.THRESH_OTSU) + _, th = cv2.threshold(blur, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU) else: raise Exception('unknown binarization method %s' % method) return Image.fromarray(th), 0 @@ -95,7 +95,8 @@ def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional page = pcgts.get_Page() assert page - page_image, page_xywh, page_image_info = self.workspace.image_from_page(page, page_id, feature_filter='binarized') + page_image, page_xywh, page_image_info = self.workspace.image_from_page( + page, page_id, feature_filter='binarized') zoom = determine_zoom(self.logger, page_id, self.parameter['dpi'], page_image_info) result = OcrdPageResult(pcgts) @@ -162,7 +163,7 @@ def process_page(self, page, page_image, page_xywh, zoom, page_id) -> OcrdPageRe # to do consistent coordinate transforms, and non-consumers # to redo the rotation themselves): orientation = -page_xywh['angle'] - orientation = 180 - (180 - orientation) % 360 # map to [-179.999,180] + orientation = 180 - (180 - orientation) % 360 # map to [-179.999,180] page.set_orientation(orientation) if self.parameter['grayscale']: suffix = '.IMG-NRM' diff --git a/ocrd_cis/ocropy/clip.py b/ocrd_cis/ocropy/clip.py index 7f40a214..f5390dde 100644 --- a/ocrd_cis/ocropy/clip.py +++ b/ocrd_cis/ocropy/clip.py @@ -124,16 +124,17 @@ def process_page_pcgts(self, *input_pcgts: 
Optional[OcrdPage], page_id: str = No masks = [pil2array(polygon_mask(page_image, polygon)).astype(np.uint8) for polygon in polygons] for i, region in enumerate(regions): if i >= num_texts: - break # keep non-text regions unchanged + break # keep non-text regions unchanged if level == 'region': if region.get_AlternativeImage(): # FIXME: This should probably be an exception (bad workflow configuration). self.logger.warning(f'Page "{page_id}" region "{region.id}" already contains image data: skipping') continue shape = prep(shapes[i]) - neighbours = [(regionj, maskj) for shapej, regionj, maskj - in zip(shapes[:i] + shapes[i+1:], regions[:i] + regions[i+1:], masks[:i] + masks[i+1:]) - if shape.intersects(shapej)] + neighbours = [ + (regionj, maskj) for shapej, regionj, maskj in + zip(shapes[:i] + shapes[i + 1:], regions[:i] + regions[i + 1:], masks[:i] + masks[i + 1:]) + if shape.intersects(shapej)] if neighbours: ret.images.append(self.process_segment( region, masks[i], polygons[i], neighbours, background_image, @@ -161,24 +162,25 @@ def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: str = No f'Page "{page_id}" region "{region.id}" line "{line.id}" already contains image data: skipping') continue shape = prep(shapes[j]) - neighbours = [(linej, maskj) for shapej, linej, maskj - in zip(shapes[:j] + shapes[j+1:], lines[:j] + lines[j+1:], masks[:j] + masks[j+1:]) - if shape.intersects(shapej)] + neighbours = [ + (linej, maskj) for shapej, linej, maskj in + zip(shapes[:j] + shapes[j + 1:], lines[:j] + lines[j + 1:], masks[:j] + masks[j + 1:]) + if shape.intersects(shapej)] if neighbours: ret.images.append(self.process_segment( line, masks[j], polygons[j], neighbours, background_image, region_image, region_coords, region_bin, page_id)) return ret - def process_segment(self, segment, segment_mask, segment_polygon, neighbours, - background_image, parent_image, parent_coords, parent_bin, - page_id) -> OcrdPageResultImage: + def process_segment( + self, segment, segment_mask, segment_polygon, neighbours, background_image, parent_image, parent_coords, + parent_bin, page_id + ) -> OcrdPageResultImage: # initialize AlternativeImage@comments classes from parent, except # for those operations that can apply on multiple hierarchy levels: features = ','.join( [feature for feature in parent_coords['features'].split(',') - if feature in ['binarized', 'grayscale_normalized', - 'despeckled', 'dewarped']]) + ',clipped' + if feature in ['binarized', 'grayscale_normalized', 'despeckled', 'dewarped']]) + ',clipped' # mask segment within parent image: segment_image = image_from_polygon(parent_image, segment_polygon) segment_bbox = bbox_from_polygon(segment_polygon) @@ -188,8 +190,8 @@ def process_segment(self, segment, segment_mask, segment_polygon, neighbours, f'Ignoring enclosing neighbour "{neighbour.id}" of segment "{segment.id}" on page "{page_id}"') continue # find connected components that (only) belong to the neighbour: - intruders = segment_mask * morph.keep_marked(parent_bin, neighbour_mask > 0) # overlaps neighbour - intruders = morph.remove_marked(intruders, segment_mask > neighbour_mask) # but exclusively + intruders = segment_mask * morph.keep_marked(parent_bin, neighbour_mask > 0) # overlaps neighbour + intruders = morph.remove_marked(intruders, segment_mask > neighbour_mask) # but exclusively num_intruders = np.count_nonzero(intruders) num_foreground = np.count_nonzero(segment_mask * parent_bin) if not num_intruders: @@ -202,14 +204,14 @@ def process_segment(self, segment, 
segment_mask, segment_polygon, neighbours, segment_mask -= intruders # suppress in derived image result to be annotated clip_mask = array2pil(intruders) - segment_image.paste(background_image, mask=clip_mask) # suppress in raw image + segment_image.paste(background_image, mask=clip_mask) # suppress in raw image if segment_image.mode in ['RGB', 'L', 'RGBA', 'LA']: # for consumers that do not have to rely on our # guessed background color, but can cope with transparency: segment_image.putalpha(ImageOps.invert(clip_mask)) # recrop segment into rectangle, just as image_from_segment would do # (and also clipping with background colour): - segment_image = crop_image(segment_image,box=segment_bbox) + segment_image = crop_image(segment_image, box=segment_bbox) # update PAGE (reference the image file): alternative_image = AlternativeImageType(comments=features) segment.add_AlternativeImage(alternative_image) diff --git a/ocrd_cis/ocropy/denoise.py b/ocrd_cis/ocropy/denoise.py index b3c219fb..0dd14ef8 100644 --- a/ocrd_cis/ocropy/denoise.py +++ b/ocrd_cis/ocropy/denoise.py @@ -19,7 +19,7 @@ def executable(self): def setup(self): self.logger = getLogger('processor.OcropyDenoise') - def process_page_pcgts(self, *input_pcgts : Optional[OcrdPage], page_id : Optional[str] = None) -> OcrdPageResult: + def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional[str] = None) -> OcrdPageResult: """Despeckle the pages / regions / lines of the workspace. Open and deserialise PAGE input file and its respective images, @@ -72,8 +72,7 @@ def process_page_pcgts(self, *input_pcgts : Optional[OcrdPage], page_id : Option self.logger.warning(f'Page "{page_id}" region "{region.id}" contains no text lines') for line in lines: line_image, line_xywh = self.workspace.image_from_segment( - line, region_image, region_xywh, - feature_selector='binarized') + line, region_image, region_xywh, feature_selector='binarized') image = self.process_segment(line, line_image, line_xywh, zoom) if image: result.images.append(image) @@ -83,8 +82,8 @@ def process_segment(self, segment, segment_image, segment_xywh, zoom) -> Optiona self.logger.warning(f"Skipping '{segment.id}' with zero size") return None self.logger.info(f"About to despeckle '{segment.id}'") - bin_image = remove_noise(segment_image, - maxsize=self.parameter['noise_maxsize']/zoom*300/72) # in pt + bin_image = remove_noise( + segment_image, maxsize=self.parameter['noise_maxsize'] / zoom * 300 / 72) # in pt # update PAGE (reference the image file): alt_image = AlternativeImageType(comments=segment_xywh['features'] + ',despeckled') segment.add_AlternativeImage(alt_image) diff --git a/ocrd_cis/ocropy/deskew.py b/ocrd_cis/ocropy/deskew.py index 84475d81..7bdbba2d 100644 --- a/ocrd_cis/ocropy/deskew.py +++ b/ocrd_cis/ocropy/deskew.py @@ -25,7 +25,7 @@ def executable(self): def setup(self): self.logger = getLogger('processor.OcropyDeskew') - def process_page_pcgts(self, *input_pcgts : Optional[OcrdPage], page_id : Optional[str] = None) -> OcrdPageResult: + def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional[str] = None) -> OcrdPageResult: """Deskew the pages or regions of the workspace. 
Open and deserialise PAGE input file and its respective images, @@ -61,7 +61,7 @@ def process_page_pcgts(self, *input_pcgts : Optional[OcrdPage], page_id : Option return result if level == 'table': regions = page.get_TableRegion() - else: # region + else: # region regions = page.get_AllRegions(classes=['Text'], order='reading-order') if not regions: self.logger.warning('Page "%s" contains no text regions', page_id) @@ -78,29 +78,29 @@ def process_page_pcgts(self, *input_pcgts : Optional[OcrdPage], page_id : Option result.images.append(image) return result - def _process_segment(self, segment, segment_image, segment_coords, segment_id, page_id) -> Optional[OcrdPageResultImage]: + def _process_segment( + self, segment, segment_image, segment_coords, segment_id, page_id + ) -> Optional[OcrdPageResultImage]: if not segment_image.width or not segment_image.height: self.logger.warning("Skipping %s with zero size", segment_id) return None - angle0 = segment_coords['angle'] # deskewing (w.r.t. top image) already applied to segment_image + angle0 = segment_coords['angle'] # deskewing (w.r.t. top image) already applied to segment_image self.logger.info(f"About to deskew {segment_id}") - angle = deskew(segment_image, maxskew=self.parameter['maxskew']) # additional angle to be applied + angle = deskew(segment_image, maxskew=self.parameter['maxskew']) # additional angle to be applied # segment angle: PAGE orientation is defined clockwise, # whereas PIL/ndimage rotation is in mathematical direction: orientation = -(angle + angle0) - orientation = 180 - (180 - orientation) % 360 # map to [-179.999,180] - segment.set_orientation(orientation) # also removes all deskewed AlternativeImages + orientation = 180 - (180 - orientation) % 360 # map to [-179.999,180] + segment.set_orientation(orientation) # also removes all deskewed AlternativeImages self.logger.info(f"Found angle for {segment_id}: %.1f", angle) # delegate reflection, rotation and re-cropping to core: if isinstance(segment, PageType): segment_image, segment_coords, _ = self.workspace.image_from_page( - segment, page_id, - fill='background', transparency=True) + segment, page_id, fill='background', transparency=True) suffix = '.IMG-DESKEW' else: segment_image, segment_coords = self.workspace.image_from_segment( - segment, segment_image, segment_coords, - fill='background', transparency=True) + segment, segment_image, segment_coords, fill='background', transparency=True) suffix = segment.id + '.IMG-DESKEW' if not angle: # zero rotation does not change coordinates, diff --git a/ocrd_cis/ocropy/dewarp.py b/ocrd_cis/ocropy/dewarp.py index 302cf2e0..e06718c8 100644 --- a/ocrd_cis/ocropy/dewarp.py +++ b/ocrd_cis/ocropy/dewarp.py @@ -22,27 +22,27 @@ def dewarp(image, lnorm, check=True, max_neighbour=0.02, zoom=1.0): if not image.width or not image.height: raise InvalidLine('image size is zero') line = pil2array(image) - + if np.prod(line.shape) == 0: raise InvalidLine('image dimensions are zero') if np.amax(line) == np.amin(line): raise InvalidLine('image is blank') - - temp = np.amax(line)-line # inverse, zero-closed + + temp = np.amax(line) - line # inverse, zero-closed if check: report = check_line(temp, zoom=zoom) if report: raise InadequateLine(report) - - temp = temp * 1.0 / np.amax(temp) # normalized + + temp = temp * 1.0 / np.amax(temp) # normalized if check: report = lnorm.check(temp, max_ignore=max_neighbour) if report: raise InvalidLine(report) - lnorm.measure(temp) # find centerline + lnorm.measure(temp) # find centerline line = 
lnorm.dewarp(line, cval=np.amax(line)) - + return array2pil(line) # pad with white above and below (as a fallback for dewarp) @@ -72,7 +72,7 @@ def setup(self): # and extra params) 0.3)) - def process_page_pcgts(self, *input_pcgts : Optional[OcrdPage], page_id : Optional[str] = None) -> OcrdPageResult: + def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional[str] = None) -> OcrdPageResult: """Dewarp the lines of the workspace. Open and deserialise PAGE input file and its respective images, @@ -115,9 +115,8 @@ def process_page_pcgts(self, *input_pcgts : Optional[OcrdPage], page_id : Option self.logger.info(f"About to dewarp page '{page_id}' region '{region.id}' line '{line.id}'") try: - dew_image = dewarp(line_image, self.lnorm, check=True, - max_neighbour=self.parameter['max_neighbour'], - zoom=zoom) + dew_image = dewarp( + line_image, self.lnorm, check=True, max_neighbour=self.parameter['max_neighbour'], zoom=zoom) except InvalidLine as err: self.logger.error(f'Cannot dewarp line "{line.id}": {err}') continue diff --git a/ocrd_cis/ocropy/recognize.py b/ocrd_cis/ocropy/recognize.py index f0c4b520..02d29e7c 100644 --- a/ocrd_cis/ocropy/recognize.py +++ b/ocrd_cis/ocropy/recognize.py @@ -47,7 +47,7 @@ def recognize(image, pad, network, check=True): # getting confidence result = lstm.translate_back(network.outputs, pos=1) - scale = len(raw_line.T)*1.0/(len(network.outputs)-2*pad) + scale = len(raw_line.T) * 1.0 / (len(network.outputs) - 2 * pad) clist = [] rlist = [] @@ -57,7 +57,7 @@ def recognize(image, pad, network, check=True): if c != 0: confid = network.outputs[r, c] c = network.l2s([c]) - r = (r-pad)*scale + r = (r - pad) * scale confidlist.append(confid) clist.append(c) @@ -88,7 +88,7 @@ def setup(self): def get_model(self): """Search for the model file. First checks if parameter['model'] can - be resolved with OcrdResourceManager to a valid readeable file and + be resolved with OcrdResourceManager to a valid readable file and returns it. 
If not, it checks if the model can be found in the dirname(__file__)/models/ directory.""" canread = lambda p: isfile(p) and access(p, R_OK) @@ -202,8 +202,8 @@ def process_lines(self, textlines, maxlevel, region_image, region_xywh): words = [x.strip() for x in linepred.split(' ') if x.strip()] - word_r_list = [[0]] # r-positions of every glyph in every word - word_conf_list = [[]] # confidences of every glyph in every word + word_r_list = [[0]] # r-positions of every glyph in every word + word_conf_list = [[]] # confidences of every glyph in every word if words != []: w_no = 0 found_char = False @@ -215,7 +215,7 @@ def process_lines(self, textlines, maxlevel, region_image, region_xywh): if c == ' ' and found_char: if i == 0: word_r_list[0][0] = rlist[i] - elif i+1 <= len(clist)-1 and clist[i+1] != ' ': + elif i + 1 <= len(clist) - 1 and clist[i + 1] != ' ': word_conf_list.append([]) word_r_list.append([rlist[i]]) w_no += 1 @@ -224,9 +224,9 @@ def process_lines(self, textlines, maxlevel, region_image, region_xywh): word_r_list = [[0, line_image.width]] # conf for each word - wordsconf = [(min(x)+max(x))/2 for x in word_conf_list] + wordsconf = [(min(x) + max(x)) / 2 for x in word_conf_list] # conf for the line - line_conf = (min(wordsconf) + max(wordsconf))/2 + line_conf = (min(wordsconf) + max(wordsconf)) / 2 # line text line.add_TextEquiv(TextEquivType(Unicode=linepred, conf=line_conf)) @@ -235,32 +235,27 @@ def process_lines(self, textlines, maxlevel, region_image, region_xywh): word_points = points_from_polygon( coordinates_for_segment( np.array(polygon_from_bbox( - word_r_list[word_no][0] / scale, - 0, - word_r_list[word_no][-1] / scale, - 0 + line_image.height)), + word_r_list[word_no][0] / scale,0, + word_r_list[word_no][-1] / scale, 0 + line_image.height)), line_image, line_coords)) word_id = '%s_word%04d' % (line.id, word_no) word = WordType(id=word_id, Coords=CoordsType(word_points)) line.add_Word(word) - word.add_TextEquiv(TextEquivType( - Unicode=word_str, conf=wordsconf[word_no])) + word.add_TextEquiv(TextEquivType(Unicode=word_str, conf=wordsconf[word_no])) if maxlevel == 'glyph': for glyph_no, glyph_str in enumerate(word_str): glyph_points = points_from_polygon( coordinates_for_segment( np.array(polygon_from_bbox( - word_r_list[word_no][glyph_no] / scale, - 0, - word_r_list[word_no][glyph_no+1] / scale, - 0 + line_image.height)), + word_r_list[word_no][glyph_no] / scale, 0, + word_r_list[word_no][glyph_no + 1] / scale, 0 + line_image.height)), line_image, line_coords)) glyph_id = '%s_glyph%04d' % (word.id, glyph_no) glyph = GlyphType(id=glyph_id, Coords=CoordsType(glyph_points)) word.add_Glyph(glyph) - glyph.add_TextEquiv(TextEquivType( - Unicode=glyph_str, conf=word_conf_list[word_no][glyph_no])) + glyph.add_TextEquiv( + TextEquivType(Unicode=glyph_str, conf=word_conf_list[word_no][glyph_no])) return edits, lengs diff --git a/ocrd_cis/ocropy/resegment.py b/ocrd_cis/ocropy/resegment.py index 48bb0d40..5a8c7e96 100644 --- a/ocrd_cis/ocropy/resegment.py +++ b/ocrd_cis/ocropy/resegment.py @@ -52,7 +52,7 @@ def executable(self): def setup(self): self.logger = getLogger('processor.OcropyResegment') - def process_page_pcgts(self, *input_pcgts : Optional[OcrdPage], page_id : Optional[str] = None) -> OcrdPageResult: + def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional[str] = None) -> OcrdPageResult: """Resegment lines of the workspace. 
Open and deserialise PAGE input file and its respective images, diff --git a/ocrd_cis/ocropy/segment.py b/ocrd_cis/ocropy/segment.py index 75be2a11..6dc75056 100644 --- a/ocrd_cis/ocropy/segment.py +++ b/ocrd_cis/ocropy/segment.py @@ -252,7 +252,7 @@ def executable(self): def setup(self): self.logger = getLogger('processor.OcropySegment') - def process_page_pcgts(self, *input_pcgts : Optional[OcrdPage], page_id : Optional[str] = None) -> OcrdPageResult: + def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional[str] = None) -> OcrdPageResult: """Segment pages into regions+lines, tables into cells+lines, or regions into lines. Open and deserialise PAGE input file and its respective images, From c5498a0e8d8bc9a8e3fe3bf0848df9b135bae69c Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Fri, 16 Aug 2024 16:55:44 +0200 Subject: [PATCH 61/97] spacing: dewarp --- ocrd_cis/ocropy/dewarp.py | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/ocrd_cis/ocropy/dewarp.py b/ocrd_cis/ocropy/dewarp.py index e06718c8..89901efd 100644 --- a/ocrd_cis/ocropy/dewarp.py +++ b/ocrd_cis/ocropy/dewarp.py @@ -95,24 +95,19 @@ def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional result = OcrdPageResult(pcgts) page = pcgts.get_Page() - page_image, page_xywh, page_image_info = self.workspace.image_from_page( - page, page_id) + page_image, page_xywh, page_image_info = self.workspace.image_from_page(page, page_id) zoom = determine_zoom(self.logger, page_id, self.parameter['dpi'], page_image_info) regions = page.get_AllRegions(classes=['Text'], order='reading-order') if not regions: self.logger.warning(f'Page "{page_id}" contains no text regions') for region in regions: - region_image, region_xywh = self.workspace.image_from_segment( - region, page_image, page_xywh) - + region_image, region_xywh = self.workspace.image_from_segment(region, page_image, page_xywh) lines = region.get_TextLine() if not lines: self.logger.warning(f'Region {region.id} contains no text lines') for line in lines: - line_image, line_xywh = self.workspace.image_from_segment( - line, region_image, region_xywh) - + line_image, line_xywh = self.workspace.image_from_segment(line, region_image, region_xywh) self.logger.info(f"About to dewarp page '{page_id}' region '{region.id}' line '{line.id}'") try: dew_image = dewarp( From 31e124577faad71f2bb039a6b094900b6cdf9df1 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Fri, 16 Aug 2024 16:58:52 +0200 Subject: [PATCH 62/97] fix: dewarp return --- ocrd_cis/ocropy/dewarp.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/ocrd_cis/ocropy/dewarp.py b/ocrd_cis/ocropy/dewarp.py index 89901efd..17d0b4ce 100644 --- a/ocrd_cis/ocropy/dewarp.py +++ b/ocrd_cis/ocropy/dewarp.py @@ -123,5 +123,7 @@ def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional dew_image = padvert(line_image, self.parameter['range']) # update PAGE (reference the image file): alt_image = AlternativeImageType(comments=line_xywh['features'] + ',dewarped') - line.add_AlternativeImage(alternative_image) - return OcrdPageResultImage(dew_image, region.id + '_' + line.id + '.IMG-DEWARP', alt_image) + line.add_AlternativeImage(alt_image) + suffix = f"{region.id}_{line.id}.IMG-DEWARP" + result.images.append(OcrdPageResultImage(dew_image, suffix, alt_image)) + return result From f86c99391e987d4918b6d626dbf1b2f990d7712b Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Fri, 16 Aug 2024 17:21:14 +0200 Subject: [PATCH 63/97] 
improve str speed: precompute element_name_id --- ocrd_cis/ocropy/segment.py | 92 +++++++++++++++++--------------------- 1 file changed, 42 insertions(+), 50 deletions(-) diff --git a/ocrd_cis/ocropy/segment.py b/ocrd_cis/ocropy/segment.py index 6dc75056..9daf59de 100644 --- a/ocrd_cis/ocropy/segment.py +++ b/ocrd_cis/ocropy/segment.py @@ -388,13 +388,12 @@ def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional rogroup = OrderedGroupType(id="reading-order") ro.set_OrderedGroup(rogroup) if (not rogroup.get_RegionRefIndexed() and - not rogroup.get_OrderedGroupIndexed() and - not rogroup.get_UnorderedGroupIndexed()): - # schema forbids empty OrderedGroup + not rogroup.get_OrderedGroupIndexed() and + not rogroup.get_UnorderedGroupIndexed()): + # schema forbids empty OrderedGroup ro.set_OrderedGroup(None) # go get TextRegions with TextLines (and SeparatorRegions): - image = self._process_element(page, ignore, page_image, page_coords, - zoom=zoom, rogroup=rogroup) + image = self._process_element(page, ignore, page_image, page_coords, zoom=zoom, rogroup=rogroup) if image: result.images.append(image) return result @@ -450,11 +449,11 @@ def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional roelem = page_subgroup_in_reading_order(self.logger, roelem) reading_order[region.id] = roelem # go get TextRegions with TextLines (and SeparatorRegions) - image = self._process_element(region, subignore, region_image, region_coords, - zoom=zoom, rogroup=roelem) + image = self._process_element( + region, subignore, region_image, region_coords, zoom=zoom, rogroup=roelem) if image: result.images.append(image) - else: # 'region' + else: # 'region' regions = list(page.get_TextRegion()) # besides top-level text regions, line-segment any table cells, # and for tables without any cells, add a pseudo-cell @@ -463,10 +462,8 @@ def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional if subregions: regions.extend(subregions) else: - subregion = TextRegionType(id=region.id + '_text', - Coords=region.get_Coords(), - # as if generated from parser: - parent_object_=region) + subregion = TextRegionType( + id=f'{region.id}_text', Coords=region.get_Coords(), parent_object_=region) region.add_TextRegion(subregion) regions.append(subregion) if not regions: @@ -490,7 +487,6 @@ def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional image = self._process_element(region, ignore, region_image, region_coords, zoom=zoom) if image: result.images.append(image) - return result def _process_element(self, element, ignore, image, coords, zoom=1.0, rogroup=None) -> Optional[OcrdPageResultImage]: @@ -535,7 +531,7 @@ def _process_element(self, element, ignore, image, coords, zoom=1.0, rogroup=Non sp_col = segment_polygon[:, 0] if isinstance(segment, SeparatorRegionType): sep_bin[draw.polygon(sp_row, sp_col, sep_bin.shape)] = True - ignore_labels[draw.polygon(sp_row, sp_col, ignore_labels.shape)] = i+1 # mapped back for RO + ignore_labels[draw.polygon(sp_row, sp_col, ignore_labels.shape)] = i + 1 # mapped back for RO if isinstance(element, PageType): element_name = 'page' fullpage = True @@ -555,6 +551,7 @@ def _process_element(self, element, ignore, image, coords, zoom=1.0, rogroup=Non report = check_region(element_bin, zoom) suffix = element.id + '.IMG-CLIP' self.logger.info(f'Computing line segmentation for {element_name} "{element.id}"') + element_name_id = f'{element_name} "{element.id}"' # TODO: we should downscale if DPI is large enough to 
save time try: if report: @@ -564,7 +561,7 @@ def _process_element(self, element, ignore, image, coords, zoom=1.0, rogroup=Non # but keep them for h/v-line detection (in fullpage mode): element_bin, seps=(sep_bin + ignore_labels) > 0, zoom=zoom, fullpage=fullpage, - spread_dist=round(self.parameter['spread'] / zoom * 300 / 72), # in pt + spread_dist=round(self.parameter['spread'] / zoom * 300 / 72), # in pt # these are ignored when not in fullpage mode: maxcolseps=self.parameter['maxcolseps'], maxseps=self.parameter['maxseps'], @@ -576,10 +573,10 @@ def _process_element(self, element, ignore, image, coords, zoom=1.0, rogroup=Non # as a fallback, add a single text line comprising the whole region: element.add_TextLine(TextLineType(id=element.id + "_line", Coords=element.get_Coords())) else: - self.logger.error(f'Cannot line-segment {element_name} "{element.id}": {err}') + self.logger.error(f'Cannot line-segment {element_name_id}: {err}') return None - self.logger.info(f'Found {len(np.unique(line_labels)) - 1} text lines for {element_name} "{element.id}"') + self.logger.info(f'Found {len(np.unique(line_labels)) - 1} text lines for {element_name_id}') # post-process line labels if isinstance(element, (PageType, TableRegionType)): # aggregate text lines to text regions @@ -594,18 +591,18 @@ def _process_element(self, element, ignore, image, coords, zoom=1.0, rogroup=Non region_labels = lines2regions( element_bin, line_labels, rlabels=ignore_labels, - sepmask=np.maximum(sepmask, colseps), # add bg + sepmask=np.maximum(sepmask, colseps), # add bg # decide horizontal vs vertical cut when gaps of similar size prefer_vertical=not isinstance(element, TableRegionType), gap_height=self.parameter['gap_height'], gap_width=self.parameter['gap_width'], scale=scale, zoom=zoom) self.logger.info( - f'Found {len(np.unique(region_labels)) - 1} text regions for {element_name} "{element.id}"') + f'Found {len(np.unique(region_labels)) - 1} text regions for {element_name_id}') except Exception as err: - self.logger.error(f'Cannot region-segment {element_name} "{element.id}": {err}') + self.logger.error(f'Cannot region-segment {element_name_id}: {err}') region_labels = np.where(line_labels > len(ignore), 1 + len(ignore), line_labels) - + # prepare reading order group index if rogroup: if isinstance(rogroup, (OrderedGroupType, OrderedGroupIndexedType)): @@ -622,7 +619,7 @@ def _process_element(self, element, ignore, image, coords, zoom=1.0, rogroup=Non region_no = 0 for region_label in np.unique(region_labels): if not region_label: - continue # no bg + continue # no bg region_mask = region_labels == region_label region_line_labels = line_labels * region_mask region_line_labels0 = np.setdiff1d(region_line_labels, [0]) @@ -644,18 +641,17 @@ def _process_element(self, element, ignore, image, coords, zoom=1.0, rogroup=Non order[np.setdiff1d(region_line_labels0, element_bin * region_line_labels)] = 0 region_line_labels = order[region_line_labels] # avoid horizontal gaps - region_line_labels = hmerge_line_seeds(element_bin, region_line_labels, scale, - seps=np.maximum(sepmask, colseps)) + region_line_labels = hmerge_line_seeds( + element_bin, region_line_labels, scale, seps=np.maximum(sepmask, colseps)) region_mask |= region_line_labels > 0 # find contours for region (can be non-contiguous) - regions, _ = masks2polygons(self.logger, region_mask * region_label, None, element_bin, - name=f'{element_name} "{element.id}"', - min_area=6000 / zoom / zoom, - simplify=ignore_labels * ~(sep_bin)) + regions, _ = masks2polygons( + 
self.logger, region_mask * region_label, None, element_bin, name=element_name_id, + min_area=6000 / zoom / zoom, simplify=ignore_labels * ~(sep_bin)) # find contours for lines (can be non-contiguous) - lines, _ = masks2polygons(self.logger, region_line_labels, baselines, element_bin, - name=f'region "{element.id}"', - min_area=640 / zoom / zoom) + lines, _ = masks2polygons( + self.logger, region_line_labels, baselines, element_bin, name=f'region "{element.id}"', + min_area=640 / zoom / zoom) # create new lines in new regions (allocating by intersection) line_polys = [Polygon(polygon) for _, polygon, _ in lines] for _, region_polygon, _ in regions: @@ -674,7 +670,7 @@ def _process_element(self, element, ignore, image, coords, zoom=1.0, rogroup=Non # find out which line (contours) belong to which region (contours) line_no = 0 for i, line_poly in enumerate(line_polys): - if not region_poly.intersects(line_poly): # .contains + if not region_poly.intersects(line_poly): # .contains continue line_label, line_polygon, line_baseline = lines[i] # convert back to absolute (page) coordinates: @@ -696,16 +692,14 @@ def _process_element(self, element, ignore, image, coords, zoom=1.0, rogroup=Non # if the region has received text lines, keep it if region.get_TextLine(): element.add_TextRegion(region) - self.logger.info(f'Added region "{region_id}" with {line_no} lines ' - f'for {element_name} "{element.id}"') + self.logger.info(f'Added region "{region_id}" with {line_no} lines for {element_name_id}') if rogroup: index = page_add_to_reading_order(rogroup, region.id, index) # add additional image/non-text regions from compute_segmentation # (e.g. drop-capitals or images) ... - self.logger.info(f'Found {images.max()} large image regions for {element_name} "{element.id}"') + self.logger.info(f'Found {images.max()} large image regions for {element_name_id}') # find contours around region labels (can be non-contiguous): - image_polygons, _ = masks2polygons(self.logger, images, None, element_bin, - name=f'{element_name} "{element.id}"') + image_polygons, _ = masks2polygons(self.logger, images, None, element_bin, name=element_name_id) for image_label, polygon, _ in image_polygons: # convert back to absolute (page) coordinates: region_polygon = coordinates_for_segment(polygon, image, coords) @@ -719,11 +713,10 @@ def _process_element(self, element, ignore, image, coords, zoom=1.0, rogroup=Non element.add_ImageRegion(ImageRegionType( id=region_id, Coords=CoordsType(points=points_from_polygon(region_polygon)))) # split detected separator labels into separator regions: - self.logger.info(f'Found {seplines.max()} separators for {element_name} "{element.id}"') + self.logger.info(f'Found {seplines.max()} separators for {element_name_id}') # find contours around region labels (can be non-contiguous): - sep_polygons, _ = masks2polygons(self.logger, seplines, None, element_bin, - name=f'{element_name} "{element.id}"', - open_holes=True, reorder=False) + sep_polygons, _ = masks2polygons( + self.logger, seplines, None, element_bin, name=element_name_id, open_holes=True, reorder=False) for sep_label, polygon, _ in sep_polygons: # convert back to absolute (page) coordinates: region_polygon = coordinates_for_segment(polygon, image, coords) @@ -737,7 +730,7 @@ def _process_element(self, element, ignore, image, coords, zoom=1.0, rogroup=Non element.add_SeparatorRegion(SeparatorRegionType( id=region_id, Coords=CoordsType(points=points_from_polygon(region_polygon)))) # annotate a text/image-separated image - element_array[sepmask] 
= np.amax(element_array) # clip to white/bg image_clipped = array2pil(element_array) image_ref = AlternativeImageType(comments=coords['features'] + ',clipped') element.add_AlternativeImage(image_ref) @@ -746,15 +739,14 @@ def _process_element(self, element, ignore, image, coords, zoom=1.0, rogroup=Non # get mask from region polygon: region_polygon = coordinates_of_segment(element, image, coords) region_mask = np.zeros_like(element_bin, bool) - region_mask[draw.polygon(region_polygon[:, 1], - region_polygon[:, 0], - region_mask.shape)] = True + region_mask[draw.polygon( + region_polygon[:, 1], region_polygon[:, 0], region_mask.shape)] = True # ensure the new line labels do not extrude from the region: line_labels = line_labels * region_mask # find contours around labels (can be non-contiguous): - line_polygons, _ = masks2polygons(self.logger, line_labels, baselines, element_bin, - name=f'region "{element.id}"', - min_area=640 / zoom / zoom) + line_polygons, _ = masks2polygons( + self.logger, line_labels, baselines, element_bin, + name=f'region "{element.id}"', min_area=640 / zoom / zoom) line_no = 0 for line_label, polygon, baseline in line_polygons: # convert back to absolute (page) coordinates: @@ -772,9 +764,9 @@ def _process_element(self, element, ignore, image, coords, zoom=1.0, rogroup=Non line.set_Baseline(BaselineType(points=points_from_polygon(line_baseline))) element.add_TextLine(line) if not sep_bin.any(): - return None # no derived image + return None # no derived image # annotate a text/image-separated image - element_array[sep_bin] = np.amax(element_array) # clip to white/bg + element_array[sep_bin] = np.amax(element_array) # clip to white/bg image_clipped = array2pil(element_array) image_ref = AlternativeImageType(comments=coords['features'] + ',clipped') element.add_AlternativeImage(image_ref) From b8e3ad6207a832fad65bccf5ea4756c004bb1f96 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Fri, 16 Aug 2024 22:26:33 +0200 Subject: [PATCH 64/97] fix: clip suffix --- ocrd_cis/ocropy/clip.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ocrd_cis/ocropy/clip.py b/ocrd_cis/ocropy/clip.py index f5390dde..b81c731c 100644 --- a/ocrd_cis/ocropy/clip.py +++ b/ocrd_cis/ocropy/clip.py @@ -213,6 +213,7 @@ def process_segment( # (and also clipping with background colour): segment_image = crop_image(segment_image, box=segment_bbox) # update PAGE (reference the image file): + suffix = f'{segment.id}.IMG-CLIP' alternative_image = AlternativeImageType(comments=features) segment.add_AlternativeImage(alternative_image) - return OcrdPageResultImage(segment_image, '.IMG-CLIP', alternative_image) + return OcrdPageResultImage(segment_image, suffix, alternative_image) From 02724f2db8c1d29f739282a42330c1a9b14e27d2 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Fri, 16 Aug 2024 22:30:11 +0200 Subject: [PATCH 65/97] fix: denoise return --- ocrd_cis/ocropy/denoise.py | 1 + 1 file changed, 1 insertion(+) diff --git a/ocrd_cis/ocropy/denoise.py b/ocrd_cis/ocropy/denoise.py index 0dd14ef8..4ae883fd 100644 --- a/ocrd_cis/ocropy/denoise.py +++ b/ocrd_cis/ocropy/denoise.py @@ -76,6 +76,7 @@ def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional image = self.process_segment(line, line_image, line_xywh, zoom) if image: result.images.append(image) + return result def process_segment(self, segment, segment_image, segment_xywh, zoom) -> Optional[OcrdPageResultImage]: if not segment_image.width
or not segment_image.height: From aac6fe0989ccb483626af6b238e98162b780aac5 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Sat, 17 Aug 2024 00:50:08 +0200 Subject: [PATCH 66/97] try to fix: ocropy denoise --- ocrd_cis/ocropy/denoise.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/ocrd_cis/ocropy/denoise.py b/ocrd_cis/ocropy/denoise.py index 4ae883fd..fd9812f8 100644 --- a/ocrd_cis/ocropy/denoise.py +++ b/ocrd_cis/ocropy/denoise.py @@ -51,7 +51,7 @@ def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional zoom = determine_zoom(self.logger, page_id, self.parameter['dpi'], page_image_info) if level == 'page': - image = self.process_segment(page, page_image, page_xywh, zoom) + image = self.process_segment(page, page_image, page_xywh, zoom, page_id) if image: result.images.append(image) else: @@ -63,7 +63,7 @@ def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional region, page_image, page_xywh, feature_selector='binarized' if level == 'region' else '') if level == 'region': - image = self.process_segment(region, region_image, region_xywh, zoom) + image = self.process_segment(region, region_image, region_xywh, zoom, page_id) if image: result.images.append(image) continue @@ -73,12 +73,12 @@ def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional for line in lines: line_image, line_xywh = self.workspace.image_from_segment( line, region_image, region_xywh, feature_selector='binarized') - image = self.process_segment(line, line_image, line_xywh, zoom) + image = self.process_segment(line, line_image, line_xywh, zoom, page_id) if image: result.images.append(image) return result - def process_segment(self, segment, segment_image, segment_xywh, zoom) -> Optional[OcrdPageResultImage]: + def process_segment(self, segment, segment_image, segment_xywh, zoom, page_id) -> Optional[OcrdPageResultImage]: if not segment_image.width or not segment_image.height: self.logger.warning(f"Skipping '{segment.id}' with zero size") return None @@ -87,5 +87,6 @@ def process_segment(self, segment, segment_image, segment_xywh, zoom) -> Optiona segment_image, maxsize=self.parameter['noise_maxsize'] / zoom * 300 / 72) # in pt # update PAGE (reference the image file): alt_image = AlternativeImageType(comments=segment_xywh['features'] + ',despeckled') + suffix = f"{page_id}_{segment.id}.IMG-DESPECK" segment.add_AlternativeImage(alt_image) - return OcrdPageResultImage(bin_image, segment.id + '.IMG-DESPECK', alt_image) + return OcrdPageResultImage(bin_image, suffix, alt_image) From 5548d0e6043e32d7409fef9817775670b2d1b96f Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Sat, 17 Aug 2024 00:58:12 +0200 Subject: [PATCH 67/97] fix: ocropy denoise --- ocrd_cis/ocropy/denoise.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/ocrd_cis/ocropy/denoise.py b/ocrd_cis/ocropy/denoise.py index fd9812f8..eb3e7d23 100644 --- a/ocrd_cis/ocropy/denoise.py +++ b/ocrd_cis/ocropy/denoise.py @@ -63,7 +63,8 @@ def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional region, page_image, page_xywh, feature_selector='binarized' if level == 'region' else '') if level == 'region': - image = self.process_segment(region, region_image, region_xywh, zoom, page_id) + file_id = f"{page_id}_{region.id}" + image = self.process_segment(region, region_image, region_xywh, zoom, file_id) if image: result.images.append(image) continue @@ -73,12 +74,13 @@ def process_page_pcgts(self, *input_pcgts: 
Optional[OcrdPage], page_id: Optional for line in lines: line_image, line_xywh = self.workspace.image_from_segment( line, region_image, region_xywh, feature_selector='binarized') - image = self.process_segment(line, line_image, line_xywh, zoom, page_id) + file_id = f"{page_id}_{region.id}_{line.id}" + image = self.process_segment(line, line_image, line_xywh, zoom, file_id) if image: result.images.append(image) return result - def process_segment(self, segment, segment_image, segment_xywh, zoom, page_id) -> Optional[OcrdPageResultImage]: + def process_segment(self, segment, segment_image, segment_xywh, zoom, file_id) -> Optional[OcrdPageResultImage]: if not segment_image.width or not segment_image.height: self.logger.warning(f"Skipping '{segment.id}' with zero size") return None @@ -87,6 +89,6 @@ def process_segment(self, segment, segment_image, segment_xywh, zoom, page_id) - segment_image, maxsize=self.parameter['noise_maxsize'] / zoom * 300 / 72) # in pt # update PAGE (reference the image file): alt_image = AlternativeImageType(comments=segment_xywh['features'] + ',despeckled') - suffix = f"{page_id}_{segment.id}.IMG-DESPECK" + suffix = f"{file_id}.IMG-DESPECK" segment.add_AlternativeImage(alt_image) return OcrdPageResultImage(bin_image, suffix, alt_image) From c9f0f56787f2d34d718bc504ee3d07f7501dff75 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Sat, 17 Aug 2024 01:26:54 +0200 Subject: [PATCH 68/97] fix: resegment --- ocrd_cis/ocropy/resegment.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ocrd_cis/ocropy/resegment.py b/ocrd_cis/ocropy/resegment.py index 5a8c7e96..c1809569 100644 --- a/ocrd_cis/ocropy/resegment.py +++ b/ocrd_cis/ocropy/resegment.py @@ -427,7 +427,7 @@ def spread_dist( continue covers = np.count_nonzero(new_label) / count if covers < threshold / 3: - logger.debug(f"new line for '%s' only covers %.1f%% bg", covers * 100) + logger.debug(f"new line for '{line.id}' only covers %.1f%% bg", covers * 100) continue count = np.count_nonzero(old_label * binarized) if not count: From fff909746f1347fc9336f8413fd311ac4e3ce206 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Sat, 17 Aug 2024 01:27:05 +0200 Subject: [PATCH 69/97] optimize segment --- ocrd_cis/ocropy/segment.py | 48 ++++++++++++++++---------------------- 1 file changed, 20 insertions(+), 28 deletions(-) diff --git a/ocrd_cis/ocropy/segment.py b/ocrd_cis/ocropy/segment.py index 9daf59de..b363cbd2 100644 --- a/ocrd_cis/ocropy/segment.py +++ b/ocrd_cis/ocropy/segment.py @@ -544,14 +544,14 @@ def _process_element(self, element, ignore, image, coords, zoom=1.0, rogroup=Non element_name = 'table' fullpage = True report = check_region(element_bin, zoom) - suffix = element.id + '.IMG-CLIP' + suffix = f"{element.id}.IMG-CLIP" else: element_name = 'region' fullpage = False report = check_region(element_bin, zoom) - suffix = element.id + '.IMG-CLIP' - self.logger.info(f'Computing line segmentation for {element_name} "{element.id}"') + suffix = f"{element.id}.IMG-CLIP" element_name_id = f'{element_name} "{element.id}"' + self.logger.info(f'Computing line segmentation for {element_name_id}') # TODO: we should downscale if DPI is large enough to save time try: if report: @@ -571,7 +571,7 @@ def _process_element(self, element, ignore, image, coords, zoom=1.0, rogroup=Non if isinstance(element, TextRegionType): self.logger.error(f'Cannot line-segment region "{element.id}": {err}') # as a fallback, add a single text line comprising the whole region: - element.add_TextLine(TextLineType(id=element.id + 
"_line", Coords=element.get_Coords())) + element.add_TextLine(TextLineType(id=f"{element.id}_line", Coords=element.get_Coords())) else: self.logger.error(f'Cannot line-segment {element_name_id}: {err}') return None @@ -664,7 +664,7 @@ def _process_element(self, element, ignore, image, coords, zoom=1.0, rogroup=Non continue # annotate result: region_no += 1 - region_id = element.id + "_region%04d" % region_no + region_id = f"{element.id}_region%04d" % region_no self.logger.debug(f'Region label {region_label} becomes ID "{region_id}"') region = TextRegionType(id=region_id, Coords=CoordsType(points=points_from_polygon(region_polygon))) # find out which line (contours) belong to which region (contours) @@ -682,7 +682,7 @@ def _process_element(self, element, ignore, image, coords, zoom=1.0, rogroup=Non continue # annotate result: line_no += 1 - line_id = region_id + "_line%04d" % line_no + line_id = f"{region_id}_line%04d" % line_no self.logger.debug(f'Line label {line_label} becomes ID "{line_id}"') line = TextLineType(id=line_id, Coords=CoordsType(points=points_from_polygon(line_polygon))) if line_baseline: @@ -709,7 +709,7 @@ def _process_element(self, element, ignore, image, coords, zoom=1.0, rogroup=Non continue region_no += 1 # annotate result: - region_id = element.id + "_image%04d" % region_no + region_id = f"{element.id}_image%04d" % region_no element.add_ImageRegion(ImageRegionType( id=region_id, Coords=CoordsType(points=points_from_polygon(region_polygon)))) # split detected separator labels into separator regions: @@ -726,7 +726,7 @@ def _process_element(self, element, ignore, image, coords, zoom=1.0, rogroup=Non continue # annotate result: region_no += 1 - region_id = element.id + "_sep%04d" % region_no + region_id = f"{element.id}_sep%04d" % region_no element.add_SeparatorRegion(SeparatorRegionType( id=region_id, Coords=CoordsType(points=points_from_polygon(region_polygon)))) # annotate a text/image-separated image @@ -739,8 +739,7 @@ def _process_element(self, element, ignore, image, coords, zoom=1.0, rogroup=Non # get mask from region polygon: region_polygon = coordinates_of_segment(element, image, coords) region_mask = np.zeros_like(element_bin, bool) - region_mask[draw.polygon( - region_polygon[:, 1], region_polygon[:, 0], region_mask.shape)] = True + region_mask[draw.polygon(region_polygon[:, 1], region_polygon[:, 0], region_mask.shape)] = True # ensure the new line labels do not extrude from the region: line_labels = line_labels * region_mask # find contours around labels (can be non-contiguous): @@ -757,7 +756,7 @@ def _process_element(self, element, ignore, image, coords, zoom=1.0, rogroup=Non continue # annotate result: line_no += 1 - line_id = element.id + "_line%04d" % line_no + line_id = f"{element.id}_line%04d" % line_no line = TextLineType(id=line_id, Coords=CoordsType(points=points_from_polygon(line_polygon))) if baseline: line_baseline = coordinates_for_segment(baseline, image, coords) @@ -868,11 +867,12 @@ def join_polygons(polygons, loc='', scale=20): dists[j, i] = dist dists = minimum_spanning_tree(dists, overwrite=True) # add bridge polygons (where necessary) + max_dist = max(1.0, scale / 5) for prevp, nextp in zip(*dists.nonzero()): prevp = polygons[prevp] nextp = polygons[nextp] nearest = nearest_points(prevp, nextp) - bridgep = LineString(nearest).buffer(max(1, scale/5), resolution=1) + bridgep = LineString(nearest).buffer(max_dist, resolution=1) polygons.append(bridgep) jointp = unary_union(polygons) assert jointp.geom_type == 'Polygon', jointp.wkt @@ 
-1017,11 +1017,9 @@ def page_add_to_reading_order(rogroup, region_id, index=None): """ if rogroup: if index is None: - rogroup.add_RegionRef(RegionRefType( - regionRef=region_id)) + rogroup.add_RegionRef(RegionRefType(regionRef=region_id)) else: - rogroup.add_RegionRefIndexed(RegionRefIndexedType( - regionRef=region_id, index=index)) + rogroup.add_RegionRefIndexed(RegionRefIndexedType(regionRef=region_id, index=index)) index += 1 return index @@ -1045,36 +1043,30 @@ def page_subgroup_in_reading_order(logger: Logger, roelem): if not roelem.parent_object_: logger.error('Cannot subgroup from orphan ReadingOrder element') return roelem - if isinstance(roelem, (OrderedGroupType,OrderedGroupIndexedType)) and not ( + if isinstance(roelem, (OrderedGroupType, OrderedGroupIndexedType)) and not ( roelem.get_OrderedGroupIndexed() or roelem.get_UnorderedGroupIndexed() or roelem.get_RegionRefIndexed()): # is already a group and still empty return roelem - if isinstance(roelem, (OrderedGroupType, - UnorderedGroupType, - RegionRefType)): + if isinstance(roelem, (OrderedGroupType, UnorderedGroupType, RegionRefType)): getattr(roelem.parent_object_, { OrderedGroupType: 'get_OrderedGroup', UnorderedGroupType: 'get_UnorderedGroup', RegionRefType: 'get_RegionRef', }.get(roelem.__class__))().remove(roelem) - roelem2 = OrderedGroupType(id=roelem.regionRef + '_group', - regionRef=roelem.regionRef) + roelem2 = OrderedGroupType(id=f"{roelem.regionRef}_group", regionRef=roelem.regionRef) roelem.parent_object_.add_OrderedGroup(roelem2) roelem2.parent_object_ = roelem.parent_object_ return roelem2 - if isinstance(roelem, (OrderedGroupIndexedType, - UnorderedGroupIndexedType, - RegionRefIndexedType)): + if isinstance(roelem, (OrderedGroupIndexedType, UnorderedGroupIndexedType, RegionRefIndexedType)): getattr(roelem.parent_object_, { OrderedGroupIndexedType: 'get_OrderedGroupIndexed', UnorderedGroupIndexedType: 'get_UnorderedGroupIndexed', RegionRefIndexedType: 'get_RegionRefIndexed' }.get(roelem.__class__))().remove(roelem) - roelem2 = OrderedGroupIndexedType(id=roelem.regionRef + '_group', - index=roelem.index, - regionRef=roelem.regionRef) + roelem2 = OrderedGroupIndexedType( + id=f"{roelem.regionRef}_group", index=roelem.index, regionRef=roelem.regionRef) roelem.parent_object_.add_OrderedGroupIndexed(roelem2) roelem2.parent_object_ = roelem.parent_object_ return roelem2 From 8b9283232a57b7c49a78420b32c915b32992ee9a Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Sat, 17 Aug 2024 02:02:28 +0200 Subject: [PATCH 70/97] optimize ocropy common --- ocrd_cis/ocropy/common.py | 186 +++++++++++++++++++------------------- 1 file changed, 93 insertions(+), 93 deletions(-) diff --git a/ocrd_cis/ocropy/common.py b/ocrd_cis/ocropy/common.py index c5b56ed0..a5806517 100644 --- a/ocrd_cis/ocropy/common.py +++ b/ocrd_cis/ocropy/common.py @@ -184,16 +184,19 @@ def estimate_thresholds(flat, bignore=0.1, escale=1.0, lo=5, hi=90): d0, d1 = flat.shape o0, o1 = int(bignore * d0), int(bignore * d1) est = flat[o0:d0 - o0, o1:d1 - o1] + if escale > 0: # by default, we use only regions that contain # significant variance; this makes the percentile # based low and high estimates more reliable e = escale - v = est - filters.gaussian_filter(est, e * 20.0) - v = filters.gaussian_filter(v ** 2, e * 20.0) ** 0.5 + e_20_0 = e * 20.0 + e_50 = int(e * 50) + v = est - filters.gaussian_filter(est, e_20_0) + v = filters.gaussian_filter(v ** 2, e_20_0) ** 0.5 v = (v > 0.3 * np.amax(v)) - v = morphology.binary_dilation(v, structure=np.ones((int(e * 50), 
1))) - v = morphology.binary_dilation(v, structure=np.ones((1, int(e * 50)))) + v = morphology.binary_dilation(v, structure=np.ones((e_50, 1))) + v = morphology.binary_dilation(v, structure=np.ones((1, e_50))) est = est[v] lo = stats.scoreatpercentile(est.ravel(), lo) hi = stats.scoreatpercentile(est.ravel(), hi) @@ -310,24 +313,24 @@ def check_line(binary, zoom=1.0): Returns an error report, or None if valid. """ - if np.prod(binary.shape)==0: return "image dimensions are zero" - if len(binary.shape)==3: return "image is not monochrome %s"%(binary.shape,) + if np.prod(binary.shape) == 0: return "image dimensions are zero" + if len(binary.shape) == 3: return f"image is not monochrome {binary.shape}" if np.amax(binary)==np.amin(binary): return "image is blank" if np.mean(binary)<np.median(binary): return "image may be inverted" h,w = binary.shape - if h<20/zoom: return "image not tall enough for a text line %s"%(binary.shape,) - if h>200/zoom: return "image too tall for a text line %s"%(binary.shape,) + if h<20/zoom: return f"image not tall enough for a text line {binary.shape}" + if h>200/zoom: return f"image too tall for a text line {binary.shape}" ##if w<1.5*h: return "line too short %s"%(binary.shape,) - if w<1.5*h and w<32/zoom: return "image too short for a line image %s"%(binary.shape,) - if w>4000/zoom: return "image too long for a line image %s"%(binary.shape,) + if w<1.5*h and w<32/zoom: return f"image too short for a line image {binary.shape}" + if w>4000/zoom: return f"image too long for a line image {binary.shape}" return None ratio = w*1.0/h _, ncomps = measurements.label(binary) lo = int(0.5*ratio+0.5) hi = int(4*ratio)+1 - if ncomps<lo*ratio: return "too few connected components (got %d, wanted >=%d)"%(ncomps,lo) - ##if ncomps>hi*ratio: return "too many connected components (got %d, wanted <=%d)"%(ncomps,hi) + if ncomps<lo*ratio: return f"too few connected components (got {ncomps}, wanted >={lo})" + ##if ncomps>hi*ratio: return f"too many connected components (got {ncomps}, wanted <={hi})" - if ncomps>hi*ratio and ncomps>10: return "too many connected components (got %d, wanted <=%d)"%(ncomps,hi) + if ncomps>hi*ratio and ncomps>10: return f"too many connected components (got {ncomps}, wanted <={hi})" return None # inspired by ocropus-gpageseg check_page @@ -341,21 +344,21 @@ def check_region(binary, zoom=1.0): Returns an error report, or None if valid. 
""" - if np.prod(binary.shape)==0: return "image dimensions are zero" - if len(binary.shape)==3: return "image is not monochrome %s"%(binary.shape,) + if np.prod(binary.shape) == 0: return "image dimensions are zero" + if len(binary.shape) == 3: return f"image is not monochrome {binary.shape}" if np.amax(binary)==np.amin(binary): return "image is blank" if np.mean(binary)<np.median(binary): return "image may be inverted" h,w = binary.shape - if h<45/zoom: return "image not tall enough for a region image %s"%(binary.shape,) - if h>5000/zoom: return "image too tall for a region image %s"%(binary.shape,) - if w<100/zoom: return "image too narrow for a region image %s"%(binary.shape,) - if w>5000/zoom: return "image too wide for a region image %s"%(binary.shape,) + if h<45/zoom: return f"image not tall enough for a region image {binary.shape}" + if h>5000/zoom: return f"image too tall for a region image {binary.shape}" + if w<100/zoom: return f"image too narrow for a region image {binary.shape}" + if w>5000/zoom: return f"image too wide for a region image {binary.shape}" return None # zoom factor (DPI relative) and 4 (against fragmentation from binarization) slots = int(w*h*1.0/(30*30)*zoom*zoom) * 4 _,ncomps = measurements.label(binary) - if ncomps<5: return "too few connected components for a region image (got %d)"%(ncomps,) - if ncomps>slots and ncomps>10: return "too many connected components for a region image (%d > %d)"%(ncomps,slots) + if ncomps<5: return f"too few connected components for a region image (got {ncomps})" + if ncomps>slots and ncomps>10: return f"too many connected components for a region image ({ncomps} > {slots})" return None # from ocropus-gpageseg, but with zoom parameter @@ -369,21 +372,21 @@ def check_page(binary, zoom=1.0): Returns an error report, or None if valid. """ - if np.prod(binary.shape)==0: return "image dimensions are zero" - if len(binary.shape)==3: return "image not monochrome %s"%(binary.shape,) + if np.prod(binary.shape) == 0: return "image dimensions are zero" + if len(binary.shape) == 3: return f"image not monochrome {binary.shape}" if np.amax(binary)==np.amin(binary): return "image is blank" if np.mean(binary)<np.median(binary): return "image may be inverted" h,w = binary.shape - if h<600/zoom: return "image not tall enough for a page image %s"%(binary.shape,) - if h>20000/zoom: return "image too tall for a page image %s"%(binary.shape,) - if w<600/zoom: return "image too narrow for a page image %s"%(binary.shape,) - if w>20000/zoom: return "image too wide for a page image %s"%(binary.shape,) + if h<600/zoom: return f"image not tall enough for a page image {binary.shape}" + if h>20000/zoom: return f"image too tall for a page image {binary.shape}" + if w<600/zoom: return f"image too narrow for a page image {binary.shape}" + if w>20000/zoom: return f"image too wide for a page image {binary.shape}" return None # zoom factor (DPI relative) and 4 (against fragmentation from binarization) slots = int(w*h*1.0/(30*30)*zoom*zoom) * 4 _,ncomps = measurements.label(binary) - if ncomps<10: return "too few connected components for a page image (got %d)"%(ncomps,) - if ncomps>slots and ncomps>10: return "too many connected components for a page image (%d > %d)"%(ncomps,slots) + if ncomps<10: return f"too few connected components for a page image (got {ncomps})" + if ncomps>slots and ncomps>10: return f"too many connected components for a page image ({ncomps} > {slots})" return None def odd(num): @@ -476,8 +479,13 @@ def compute_images(binary, scale, maximages=5): #images = morph.rb_closing(images, (d0,d1)) #DSAVE('images1_closed', images+0.6*binary) # 1- filter largest connected components - images = morph.select_regions(images,sl.area,min=(4*scale)**2,nbest=2*maximages) - DSAVE('images1_large', images+0.6*binary) + binary_0_6 = 0.6 * binary + odd_scale = odd(scale) +
odd_half_scale = odd(scale / 2) + odd_doubled_scale = odd(2 * scale) + region_min = (4 * scale) ** 2 + images = morph.select_regions(images, sl.area, min=region_min, nbest=2 * maximages) + DSAVE('images1_large', images + binary_0_6) if not images.any(): return np.zeros_like(binary, int) # 2- open horizontally and vertically to suppress @@ -486,31 +494,31 @@ def compute_images(binary, scale, maximages=5): # single frame, because then the hull polygon # can cover/overlap large text/table parts which # we cannot discern from the actual image anymore - h_opened = morph.rb_opening(images, (1, odd(scale/2))) - DSAVE('images2_h-opened', h_opened+0.6*binary) - v_opened = morph.rb_opening(images, (odd(scale/2), 1)) - DSAVE('images2_v-opened', v_opened+0.6*binary) + h_opened = morph.rb_opening(images, (1, odd_half_scale)) + DSAVE('images2_h-opened', h_opened + binary_0_6) + v_opened = morph.rb_opening(images, (odd_half_scale, 1)) + DSAVE('images2_v-opened', v_opened + binary_0_6) # 3- close whatever remains - closed = morph.rb_closing(h_opened&v_opened, (odd(2*scale),odd(2*scale))) - DSAVE('images3_closed', closed+0.6*binary) + closed = morph.rb_closing(h_opened&v_opened, (odd_doubled_scale, odd_doubled_scale)) + DSAVE('images3_closed', closed + binary_0_6) # 4- reconstruct the losses up to a certain distance # to avoid creeping into pure h/v-lines again but still # cover most of the large object #images = np.where(images, closed, 2) #images = morph.spread_labels(images, maxdist=scale) % 2 | closed images = morph.rb_reconstruction(closed, images, step=2, maxsteps=scale) - DSAVE('images4_reconstructed', images+0.6*binary) + DSAVE('images4_reconstructed', images + binary_0_6) # 5- select nbest - images = morph.select_regions(images,sl.area,min=(4*scale)**2,nbest=maximages) - DSAVE('images5_selected', images+0.6*binary) + images = morph.select_regions(images, sl.area, min=region_min, nbest=maximages) + DSAVE('images5_selected', images + binary_0_6) if not images.any(): return np.zeros_like(binary, int) # 6- dilate a little to get a smooth contour without gaps - dilated = morph.r_dilation(images, (odd(scale),odd(scale))) + dilated = morph.r_dilation(images, (odd_scale, odd_scale)) images = morph.propagate_labels_majority(binary, dilated+1) images = morph.spread_labels(images, maxdist=scale)==2 images, _ = morph.label(images) - DSAVE('images6_dilated', images+0.6*binary) + DSAVE('images6_dilated', images + binary_0_6) # we could repeat reconstruct-dilate here... 
return images @@ -548,6 +556,7 @@ def compute_seplines(binary, scale, maxseps=0): sepsizes = [0] sepslices = [None] sepdists = [0] + doubled_scale = 2 * scale for label in range(1, nlabels + 1): labelslice = slices[label] labelmask = labels == label @@ -599,8 +608,8 @@ def compute_seplines(binary, scale, maxseps=0): binmask = sublabels == bin + 1 binlabels, nbinlabels = morph.label(binmask) _, binlabelcounts = np.unique(binlabels, return_counts=True) - largemask = (binlabelcounts > 2 * scale)[binlabels] - smallmask = (binlabelcounts <= 2 * scale)[binlabels] + largemask = (binlabelcounts > doubled_scale)[binlabels] + smallmask = (binlabelcounts <= doubled_scale)[binlabels] sublabels2[binmask & smallmask] = 1 if not np.any(binmask & largemask): continue @@ -1843,11 +1852,13 @@ def find_topological(): else: llab[box] = lbinary[box] # show projection at the sides - for i in range(int(scale/2)): - llab[box[0],box[1].start+i] = -10*np.log(y+1e-9) - llab[box[0],box[1].stop-1-i] = -10*np.log(y+1e-9) - llab[box[0].start+i,box[1]] = -10*np.log(x+1e-9) - llab[box[0].stop-1-i,box[1]] = -10*np.log(x+1e-9) + log_y = -10 * np.log(y + 1e-9) + log_x = -10 * np.log(x + 1e-9) + for i in range(int(scale / 2)): + llab[box[0], box[1].start + i] = log_y + llab[box[0], box[1].stop - 1 - i] = log_y + llab[box[0].start + i, box[1]] = log_x + llab[box[0].stop - 1 - i, box[1]] = log_x DSAVE('recursive_x_y_cut_' + (partition_type or 'sliced'), llab) gap_weights = list() for is_horizontal, profile in enumerate([y, x]): @@ -1877,19 +1888,19 @@ def find_topological(): weights = weights * (1 + 0.5 * props['peak_heights']/gap_height) gap_weights.append((gaps, weights)) if debug: - LOG.debug(' {} gaps {} {} weights {}'.format( - 'horizontal' if is_horizontal else 'vertical', - gaps, props, weights)) + orientation = 'horizontal' if is_horizontal else 'vertical' + LOG.debug(f' {orientation} gaps {gaps} {props} weights {weights}') if not gaps.shape[0]: continue + half_scale = int(scale / 2) for start, stop, height in sorted(zip( props['left_ips'].astype(int), props['right_ips'].astype(int), props['peak_heights']), key=lambda x: x[2]): if is_horizontal: - llab[box[0].start+int(scale/2):box[0].stop-int(scale/2),box[1].start+start:box[1].start+stop] = -10*np.log(-height+1e-9) + llab[box[0].start+half_scale:box[0].stop-half_scale,box[1].start+start:box[1].start+stop] = -10*np.log(-height+1e-9) else: - llab[box[0].start+start:box[0].start+stop,box[1].start+int(scale/2):box[1].stop-int(scale/2)] = -10*np.log(-height+1e-9) + llab[box[0].start+start:box[0].start+stop,box[1].start+half_scale:box[1].stop-half_scale] = -10*np.log(-height+1e-9) DSAVE('recursive_x_y_cut_gaps_' + ('h' if is_horizontal else 'v'), llab) # heuristic (not strict) decision on x or y cut, # factors to consider: @@ -1916,32 +1927,27 @@ def find_topological(): # are not allowed y_gaps, y_weights = gap_weights[0][0], gap_weights[0][1] x_gaps, x_weights = gap_weights[1][0], gap_weights[1][1] - if debug: LOG.debug(' all y_gaps {} x_gaps {}'.format(y_gaps, x_gaps)) + if debug: LOG.debug(f' all y_gaps {y_gaps} x_gaps {x_gaps}') # suppress cuts that significantly split any line labels + min_line_scale = min_line * scale y_allowed = [not(np.any(np.intersect1d( # significant line labels above - np.nonzero(np.bincount(lbin[:gap,:].flatten(), - minlength=len(objects))[1:] > min_line * scale)[0], + np.nonzero(np.bincount(lbin[:gap,:].flatten(), minlength=len(objects))[1:] > min_line_scale)[0], # significant line labels below - np.nonzero(np.bincount(lbin[gap:,:].flatten(), - 
minlength=len(objects))[1:] > min_line * scale)[0], - assume_unique=True))) - for gap in y_gaps] + np.nonzero(np.bincount(lbin[gap:,:].flatten(), minlength=len(objects))[1:] > min_line_scale)[0], + assume_unique=True))) for gap in y_gaps] x_allowed = [not(np.any(np.intersect1d( # significant line labels left - np.nonzero(np.bincount(lbin[:,:gap].flatten(), - minlength=len(objects))[1:] > min_line * scale)[0], + np.nonzero(np.bincount(lbin[:,:gap].flatten(), minlength=len(objects))[1:] > min_line_scale)[0], # significant line labels right - np.nonzero(np.bincount(lbin[:,gap:].flatten(), - minlength=len(objects))[1:] > min_line * scale)[0], - assume_unique=True))) - for gap in x_gaps] + np.nonzero(np.bincount(lbin[:,gap:].flatten(), minlength=len(objects))[1:] > min_line_scale)[0], + assume_unique=True))) for gap in x_gaps] y_gaps, y_weights = y_gaps[y_allowed], y_weights[y_allowed] x_gaps, x_weights = x_gaps[x_allowed], x_weights[x_allowed] - if debug: LOG.debug(' allowed y_gaps {} x_gaps {}'.format(y_gaps, x_gaps)) + if debug: LOG.debug(f' allowed y_gaps {y_gaps} x_gaps {x_gaps}') y_prominence = np.amax(y_weights, initial=0) x_prominence = np.amax(x_weights, initial=0) - if debug: LOG.debug(' y_prominence {} x_prominence {}'.format(y_prominence, x_prominence)) + if debug: LOG.debug(f' y_prominence {y_prominence} x_prominence {x_prominence}') # suppress less prominent peaks (another heuristic...) # they must compete with the other direction next time # (when already new cuts or partitions will become visible) @@ -1949,33 +1955,30 @@ def find_topological(): x_allowed = x_weights > 0.8 * x_prominence y_gaps, y_weights = y_gaps[y_allowed], y_weights[y_allowed] x_gaps, x_weights = x_gaps[x_allowed], x_weights[x_allowed] - if debug: LOG.debug(' prominent y_gaps {} x_gaps {}'.format(y_gaps, x_gaps)) + if debug: LOG.debug(f' prominent y_gaps {y_gaps} x_gaps {x_gaps}') if npartitions > 0: # TODO this can be avoided when backtracking below # suppress peaks creating fewer partitions than others -- # how large in our preferred direction will the new partitions # of sepmask in both slices created by each cut candidate # add up? 
- y_partitionscores = [sum(map(sl.height if prefer_vertical else sl.width, - morph.find_objects(morph.label( - partitions[:gap,:]>0)[0]) + - morph.find_objects(morph.label( - partitions[gap:,:]>0)[0]))) - for gap in y_gaps] - x_partitionscores = [sum(map(sl.height if prefer_vertical else sl.width, - morph.find_objects(morph.label( - partitions[:,:gap]>0)[0]) + - morph.find_objects(morph.label( - partitions[:,gap:]>0)[0]))) - for gap in x_gaps] - if debug: LOG.debug(' y_partitionscores {} x_partitionscores {}'.format( - y_partitionscores, x_partitionscores)) + y_partitionscores = [sum(map( + sl.height if prefer_vertical else sl.width, + morph.find_objects(morph.label(partitions[:gap, :] > 0)[0]) + + morph.find_objects(morph.label(partitions[gap:, :] > 0)[0]))) + for gap in y_gaps] + x_partitionscores = [sum(map( + sl.height if prefer_vertical else sl.width, + morph.find_objects(morph.label(partitions[:, : gap] > 0)[0]) + + morph.find_objects(morph.label(partitions[:, gap :] > 0)[0]))) + for gap in x_gaps] + if debug: LOG.debug(f' y_partitionscores {y_partitionscores} x_partitionscores {x_partitionscores}') # Now identify those gaps with the largest overall score y_allowed = y_partitionscores == np.max(y_partitionscores, initial=0) x_allowed = x_partitionscores == np.max(x_partitionscores, initial=0) y_gaps, y_weights = y_gaps[y_allowed], y_weights[y_allowed] x_gaps, x_weights = x_gaps[x_allowed], x_weights[x_allowed] - if debug: LOG.debug(' most partitioning y_gaps {} x_gaps {}'.format(y_gaps, x_gaps)) + if debug: LOG.debug(f' most partitioning y_gaps {y_gaps} x_gaps {x_gaps}') else: y_partitionscores = None x_partitionscores = None @@ -1986,7 +1989,7 @@ def find_topological(): x_allowed = x_weights > 0.9 * x_prominence y_gaps, y_weights = y_gaps[y_allowed], y_weights[y_allowed] x_gaps, x_weights = x_gaps[x_allowed], x_weights[x_allowed] - if debug: LOG.debug(' prominent y_gaps {} x_gaps {}'.format(y_gaps, x_gaps)) + if debug: LOG.debug(f' prominent y_gaps {y_gaps} x_gaps {x_gaps}') # decide which direction, x or y # TODO: this most likely needs a backtracking mechanism @@ -2052,7 +2055,7 @@ def find_topological(): llab2[box] = partitions DSAVE('recursive_x_y_cut_partitions', llab2) for label in range(1, npartitions+1): - LOG.debug('next partition %d on %s', label, box) + LOG.debug(f'next partition %d on %s', label, box) recursive_x_y_cut(box, mask=partitions==label, partition_type=new_partition_type) return @@ -2060,10 +2063,9 @@ def find_topological(): # no gaps left finalize() return + orientation = 'vertical' if choose_vertical else 'horizontal' # otherwise: cut on gaps - LOG.debug('cutting %s on %s into %s', 'vertically' - if choose_vertical else 'horizontally', - box, gaps) + LOG.debug(f'cutting {orientation}ly on {box} into {gaps}') cuts = list(zip(np.insert(gaps, 0, 0), np.append(gaps, lim))) if choose_vertical: if rl: @@ -2078,9 +2080,7 @@ def find_topological(): sub = sl.box(0, len(y), start, stop) else: # "cut in horizontal direction" sub = sl.box(start, stop, 0, len(x)) - LOG.debug('next %s block on %s is %s', 'horizontal' - if choose_vertical else 'vertical', - box, sub) + LOG.debug(f'next {orientation} block on {box} is {sub}') recursive_x_y_cut(sl.compose(box,sub), mask=sl.cut(mask,sub) if isinstance(mask, np.ndarray) else None) From fceaffe4e928bff7ea70aece7baa3d3717c03cff Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Sat, 17 Aug 2024 02:03:47 +0200 Subject: [PATCH 71/97] optimize ocrolib --- ocrd_cis/ocropy/ocrolib/morph.py | 18 ++++++++++-------- 
ocrd_cis/ocropy/ocrolib/toplevel.py | 26 ++++++++------------------
 2 files changed, 18 insertions(+), 26 deletions(-)

diff --git a/ocrd_cis/ocropy/ocrolib/morph.py b/ocrd_cis/ocropy/ocrolib/morph.py
index 7d6ffc85..b9619cca 100644
--- a/ocrd_cis/ocropy/ocrolib/morph.py
+++ b/ocrd_cis/ocropy/ocrolib/morph.py
@@ -292,8 +292,9 @@ def propagate_labels_majority(image,labels):
     with the largest overlap."""
     rlabels,_ = label(image)
    cors = correspondences(rlabels,labels)
-    outputs = zeros(amax(rlabels)+1,'i')
-    counts = zeros(amax(rlabels)+1,'i')
+    amax_rlabels = amax(rlabels) + 1
+    outputs = zeros(amax_rlabels,'i')
+    counts = zeros(amax_rlabels,'i')
     for rlabel, label_, count in cors.T:
         if not rlabel or not label_:
             # ignore background correspondences
@@ -347,12 +348,13 @@ def all_neighbors(image, dist=1, bg=NaN):
     """Given an image with labels, find all pairs of labels
     that are directly (up to ``dist``) neighboring each other, ignoring the label ``bg``."""
     q = 100000
-    assert amax(image)<q
-    assert amin(image)>=0
-    u = unique(q*image+shift(image,(dist,0),order=0,cval=bg))
-    d = unique(q*image+shift(image,(-dist,0),order=0,cval=bg))
-    l = unique(q*image+shift(image,(0,dist),order=0,cval=bg))
-    r = unique(q*image+shift(image,(0,-dist),order=0,cval=bg))
+    assert amax(image) < q
+    assert amin(image) >= 0
+    q_image = q * image
+    u = unique(q_image + shift(image, (dist, 0), order=0, cval=bg))
+    d = unique(q_image + shift(image, (-dist, 0), order=0, cval=bg))
+    l = unique(q_image + shift(image, (0, dist), order=0, cval=bg))
+    r = unique(q_image + shift(image, (0, -dist), order=0, cval=bg))
     all = unique(r_[u,d,l,r])
     all = all[all!=bg]
     all = c_[all//q,all%q]
diff --git a/ocrd_cis/ocropy/ocrolib/toplevel.py b/ocrd_cis/ocropy/ocrolib/toplevel.py
index 87ed18c5..72e397af 100644
--- a/ocrd_cis/ocropy/ocrolib/toplevel.py
+++ b/ocrd_cis/ocropy/ocrolib/toplevel.py
@@ -125,14 +125,10 @@ def __init__(self,*args,**kw):
         self.fun = kw.get("fun","?")
         self.var = kw.get("var","?")
         self.description = " ".join([strc(x) for x in args])
+
     def __str__(self):
-        result = "\nCheckError for argument "
-        result += str(self.var)
-        result += " of function "
-        result += str(self.fun)
-        result += "\n"
-        result += self.description
-        return result
+        return f"\nCheckError for argument {str(self.var)} of function {str(self.fun)}\n{self.description}"
+
 
 class CheckWarning(CheckError):
     def __init__(self,*args,**kw):
@@ -142,14 +138,8 @@ def __init__(self,*args,**kw):
         CheckError.__init__(self, *args, **kw)
 
     def __str__(self):
-        result = "\nCheckWarning for argument "
-        result += str(self.var)
-        result += " of function "
-        result += str(self.fun)
-        result += "\n"
-        result += self.description
-        result += "(This can happen occasionally during normal operations and isn't necessarily a bug or problem.)\n"
-        return result
+        return (f"\nCheckWarning for argument {str(self.var)} of function {str(self.fun)}\n{self.description} "
+                f"(This can happen occasionally during normal operations and isn't necessarily a bug or problem.)\n")
 
 def checktype(value,type_):
     """Check value against the type spec. 
If everything @@ -211,7 +201,7 @@ def argument_checks(*args,**kw): e.var = var raise e except: - LOG.critical("unknown exception while checking function: '%s'", name) + LOG.critical(f"unknown exception while checking function: '{name}'") raise result = f(*args,**kw) checktype(result,kw.get("_",True)) @@ -225,9 +215,9 @@ def decorator(f): def wrapper(arg): if not f(arg): if warning: - raise CheckWarning(strc(arg)+" of type "+str(type(arg))+": "+str(message)) + raise CheckWarning(f"{strc(arg)} of type {str(type(arg))}: {str(message)}") else: - raise CheckError(strc(arg)+" of type "+str(type(arg))+": "+str(message)) + raise CheckError(f"{strc(arg)} of type {str(type(arg))}: {str(message)}") return wrapper return decorator From 3de2585787ea2b59126a4a1c39d9df3e42d18362 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Sat, 17 Aug 2024 02:03:58 +0200 Subject: [PATCH 72/97] optimize align cli --- ocrd_cis/align/cli.py | 85 ++++++++++++++++++------------------------- 1 file changed, 35 insertions(+), 50 deletions(-) diff --git a/ocrd_cis/align/cli.py b/ocrd_cis/align/cli.py index 7747622e..7d6599c2 100644 --- a/ocrd_cis/align/cli.py +++ b/ocrd_cis/align/cli.py @@ -57,16 +57,16 @@ def process(self): def align(self, alignments, ift): """align the alignment objects with the according input file tuples""" for t in ift: - self.log.debug("tuple %s", os.path.basename(t.input_file.url)) + self.log.debug(f"tuple {os.path.basename(t.input_file.url)}") pcgtst = self.open_input_file_tuples(ift) i = 0 for mi, mr in enumerate(pcgtst[0].get_Page().get_TextRegion()): for mj, _ in enumerate(mr.get_TextLine()): for iiii, u in enumerate(mr.get_TextLine()[mj].get_TextEquiv()): - self.log.debug("[%d] %s", iiii, u.Unicode) + self.log.debug(f"[{iiii}] {u.Unicode}") for xx in mr.get_TextLine()[mj].get_Word(): for iiii, u in enumerate(xx.get_TextEquiv()): - self.log.debug("[%d] %s", iiii, u.Unicode) + self.log.debug(f"[{iiii}] {u.Unicode}") lines = [] for ii, t in enumerate(ift): @@ -88,23 +88,21 @@ def align_lines(self, lines): for i, line in enumerate(lines): if lines[0].region.get_TextEquiv() is None: lines[0].region.TextEquiv = [] - self.log.debug('line alignment: %s [%s - %s]', - get_textequiv_unicode(line.region), - line.region.get_id(), - line.input_file.input_file_group) - ddt = line.input_file.input_file_group + "/" + line.region.get_id() + self.log.debug(f'line alignment: {get_textequiv_unicode(line.region)} ' + f'[{line.region.get_id()} - {line.input_file.input_file_group}]') + ddt = f"{line.input_file.input_file_group}/{line.region.get_id()}" if i != 0: te = TextEquivType( Unicode=get_textequiv_unicode(line.region), conf=get_textequiv_conf(line.region), dataType="other", - dataTypeDetails="ocrd-cis-line-alignment:" + ddt) + dataTypeDetails=f"ocrd-cis-line-alignment:{ddt}") lines[0].region.add_TextEquiv(te) else: self.log.debug("len: %i, i: %i", len(lines[0].region.get_TextEquiv()), i) lines[0].region.get_TextEquiv()[i].set_dataType("other") lines[0].region.get_TextEquiv()[i].set_dataTypeDetails( - "ocrd-cis-line-alignment-master-ocr:" + ddt) + f"ocrd-cis-line-alignment-master-ocr:{ddt}") lines[0].region.get_TextEquiv()[i].set_index(i+1) self.align_words(lines) @@ -113,18 +111,18 @@ def align_words(self, lines): mregion = lines[0].region.get_Word() oregion = [lines[i].region.get_Word() for i in range(1, len(lines))] for word in lines[0].alignment['wordAlignments']: - self.log.debug("aligning word %s", word['master']) + self.log.debug(f"aligning word {word['master']}", ) master, rest = 
self.find_word([word['master']], mregion, "master") mregion = rest if master is None or len(master) != 1: - self.log.warn("cannot find {}; giving up".format(word['master'])) - # raise Exception("cannot find {}; giving up".format(word['master'])) + self.log.warn(f"cannot find {word['master']}; giving up") + # raise Exception(f"cannot find {word['master']}; giving up") return others = list() for i, other in enumerate(word['alignments']): match, rest = self.find_word(other, oregion[i]) if match is None: - self.log.warn("cannot find {}; giving up".format(other)) + self.log.warn(f"cannot find {other}; giving up") return others.append(match) oregion[i] = rest @@ -132,10 +130,7 @@ def align_words(self, lines): words.append( Alignment(lines[0].input_file, master, lines[0].alignment)) for i, other in enumerate(others): - words.append(Alignment( - lines[i+1].input_file, - other, - lines[i+1].alignment)) + words.append(Alignment(lines[i+1].input_file, other, lines[i+1].alignment)) self.align_word_regions(words) def align_word_regions(self, words): @@ -144,10 +139,8 @@ def te0(x): for i, word in enumerate(words): if not word.region: ifg = word.input_file.input_file_group - self.log.debug("(empty) word alignment: [%s]", ifg) - te = TextEquivType( - dataType="other", - dataTypeDetails="ocrd-cis-empty-word-alignment:" + ifg) + self.log.debug(f"(empty) word alignment: [{ifg}]") + te = TextEquivType(dataType="other", dataTypeDetails=f"ocrd-cis-empty-word-alignment:{ifg}") words[0].region[0].add_TextEquiv(te) words[0].region[0].get_TextEquiv()[i].set_index(i+1) continue @@ -157,46 +150,38 @@ def te0(x): ddt = word.input_file.input_file_group + "/" + _id # if conf is none it is most likely ground truth data conf = min([float(te0(x).get_conf() or "1.0") for x in word.region]) - self.log.debug("word alignment: %s [%s - %s]", _str, _id, ifg) + self.log.debug(f"word alignment: {_str} [{_id} - {ifg}]") if i != 0: te = TextEquivType( - Unicode=_str, - conf=conf, - dataType="other", - dataTypeDetails="ocrd-cis-word-alignment:" + ddt) + Unicode=_str, conf=conf, dataType="other", dataTypeDetails=f"ocrd-cis-word-alignment:{ddt}") words[0].region[0].add_TextEquiv(te) else: words[0].region[0].get_TextEquiv()[i].set_dataType("other") - words[0].region[0].get_TextEquiv()[i].set_dataTypeDetails( - "ocrd-cis-word-alignment-master-ocr:" + ddt) + words[0].region[0].get_TextEquiv()[i].set_dataTypeDetails(f"ocrd-cis-word-alignment-master-ocr:{ddt}") words[0].region[0].get_TextEquiv()[i].set_index(i+1) def find_word(self, tokens, regions, t="other"): - self.log.debug("tokens = %s [%s]", tokens, t) + tokens_str = f"tokens = {tokens} [{t}]" + self.log.debug(tokens_str) for i, _ in enumerate(regions): n = self.match_tokens(tokens, regions, i) if n == 0: continue return tuple([regions[i:n], regions[i:]]) # not found try again with levenshtein - self.log.warn( - "could not find tokens = %s [%s]; trying again", - tokens, t) + self.log.warn(f"could not find {tokens_str}; trying again") for i, _ in enumerate(regions): n = self.match_tokens_lev(tokens, regions, i) if n == 0: continue return tuple([regions[i:n], regions[i:]]) # not found try again to match token within another one - self.log.warn( - "could not find tokens = %s [%s]; trying again", - tokens, t) + self.log.warn(f"could not find {tokens_str}; trying again") for i, _ in enumerate(regions): n = self.match_tokens_within(tokens, regions, i) if n == 0: continue return tuple([regions[i:n], regions[i:]]) - # nothing could be found return tuple([None, regions]) @@ -212,7 +197,7 @@ 
def match_tokens_lev(self, tokens, regions, i): def f(a, b): k = 3 # int(len(a)/3) d = Levenshtein.distance(a, b) - self.log.debug("lev %s <=> %s: %d (%d)", a, b, d, d) + self.log.debug(f"lev {a} <=> {b}: {d} ({d})") return d <= 1 or d <= k return self.match_tokens_lambda(tokens, regions, i, f) @@ -227,14 +212,15 @@ def match_tokens_lambda(self, tokens, regions, i, f): Returns 0 if nothing could be matched. """ for j, token in enumerate(tokens): - if j + i >= len(regions): + sum_i_j = j + i + if sum_i_j >= len(regions): return 0 - if not regions[i+j].get_TextEquiv()[0].Unicode: - self.log.warn("cannot find %s", token) + unicode = regions[sum_i_j].get_TextEquiv()[0].Unicode + if not unicode: + self.log.warn(f"cannot find {token}") return 0 - self.log.debug('checking %s with %s', token, - regions[i+j].get_TextEquiv()[0].Unicode) - if f(token, regions[i+j].get_TextEquiv()[0].Unicode): + self.log.debug(f'checking {token} with {unicode}') + if f(token, unicode): continue if j == 0: return 0 @@ -259,19 +245,18 @@ def zip_input_files(self, ifgs): """Zip files of the given input file groups""" files = list() for ifg in ifgs: - self.log.info("input file group: %s", ifg) + self.log.info(f"input file group: {ifg}") ifiles = sorted( self.workspace.mets.find_files(fileGrp=ifg), key=lambda ifile: ifile.url) for i in ifiles: - self.log.debug("sorted file: %s %s", - os.path.basename(i.url), i.ID) + self.log.debug(f"sorted file: {os.path.basename(i.url)} {i.ID}") ifiles = [FileAlignment(self.workspace, x, ifg) for x in ifiles] files.append(ifiles) return zip(*files) def read_lines_from_input_file(self, ifile): - self.log.info("reading input file: %s", ifile) + self.log.info(f"reading input file: {ifile}") lines = list() pcgts = ifile.open() for region in pcgts.get_Page().get_TextRegion(): @@ -286,7 +271,7 @@ def run_java_aligner(self, ifs): lines = zip(*lines) _input = [x.strip() for t in lines for x in t] for i in _input: - self.log.debug("input line: %s", i) + self.log.debug(f"input line: {i}") n = len(ifs) self.log.debug("starting java client") p = JavaAligner(n, getLevelName(self.log.getEffectiveLevel())) @@ -300,7 +285,7 @@ def __init__(self, workspace, ifile, ifg): self.log = getLogger('cis.FileAlignment') def open(self): - self.log.info("opening: %s", os.path.basename(self.input_file.url)) + self.log.info(f"opening: {os.path.basename(self.input_file.url)}") return page_from_file(self.workspace.download_file(self.input_file)) From 0949277dbe049c1cd6776b3c701980c48cf2ebc8 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 22 Aug 2024 02:21:34 +0200 Subject: [PATCH 73/97] align: use final v3 API --- ocrd_cis/align/cli.py | 229 ++++++++++++++++-------------------------- 1 file changed, 85 insertions(+), 144 deletions(-) diff --git a/ocrd_cis/align/cli.py b/ocrd_cis/align/cli.py index 7d6599c2..f85b7348 100644 --- a/ocrd_cis/align/cli.py +++ b/ocrd_cis/align/cli.py @@ -1,97 +1,71 @@ from __future__ import absolute_import +from __future__ import annotations + import click import json import os +from typing import Optional, List, Dict, Type + from rapidfuzz.distance import Levenshtein -from ocrd import Processor + +from ocrd import Processor, OcrdPage, OcrdPageResult from ocrd.decorators import ocrd_cli_options from ocrd.decorators import ocrd_cli_wrap_processor -from ocrd_utils import MIMETYPE_PAGE from ocrd_utils import getLogger from ocrd_utils import getLevelName -from ocrd_utils import make_file_id -from ocrd_modelfactory import page_from_file -from ocrd_models.ocrd_page import to_xml -from 
ocrd_models.ocrd_page_generateds import TextEquivType +from ocrd_models.ocrd_page import TextRegionType, TextEquivType from ocrd_cis import JavaAligner -from ocrd_cis import get_ocrd_tool @click.command() @ocrd_cli_options def ocrd_cis_align(*args, **kwargs): - return ocrd_cli_wrap_processor(Aligner, *args, **kwargs) + return ocrd_cli_wrap_processor(CISAligner, *args, **kwargs) -class Aligner(Processor): - def __init__(self, *args, **kwargs): - ocrd_tool = get_ocrd_tool() - kwargs['ocrd_tool'] = ocrd_tool['tools']['ocrd-cis-align'] - kwargs['version'] = ocrd_tool['version'] - super(Aligner, self).__init__(*args, **kwargs) +class CISAligner(Processor): + @property + def executable(self): + return 'ocrd-cis-align' - if hasattr(self, 'workspace'): - self.log = getLogger('cis.Processor.Aligner') - - def process(self): - ifgs = self.input_file_grp.split(",") # input file groups - if len(ifgs) < 2: - raise Exception("need at least two input file groups to align") - ifts = self.zip_input_files(ifgs) # input file tuples - for _id, ift in enumerate(ifts): - alignments = json.loads(self.run_java_aligner(ift)) - pcgts = self.align(alignments, ift) - # keep the right part after OCR-D-...-filename - # and prepend output_file_grp - input_file = ift[0].input_file - file_id = make_file_id(input_file, self.output_file_grp) - pcgts.set_pcGtsId(file_id) - out = self.workspace.add_file( - ID=file_id, - file_grp=self.output_file_grp, - pageId=input_file.pageId, - local_filename=os.path.join(self.output_file_grp, file_id + '.xml'), - mimetype=MIMETYPE_PAGE, - content=to_xml(pcgts), - ) - self.log.info('created file %s', out) + def process_page_pcgts(self, *input_pcgts : Optional[OcrdPage], page_id : Optional[str] = None) -> OcrdPageResult: + assert len(input_pcgts) >= 2 + alignments = json.loads(self.run_java_aligner(input_pcgts)) + pcgts = self.align(alignments, input_pcgts) + return OcrdPageResult(pcgts) - def align(self, alignments, ift): + def align(self, alignments: List[Dict], pcgts: List[OcrdPage]) -> OcrdPage: """align the alignment objects with the according input file tuples""" - for t in ift: - self.log.debug(f"tuple {os.path.basename(t.input_file.url)}") - pcgtst = self.open_input_file_tuples(ift) i = 0 - for mi, mr in enumerate(pcgtst[0].get_Page().get_TextRegion()): + file_groups = self.input_file_grp.split(',') + for mi, mr in enumerate(pcgts[0].get_Page().get_AllRegions(classes=['Text'])): for mj, _ in enumerate(mr.get_TextLine()): - for iiii, u in enumerate(mr.get_TextLine()[mj].get_TextEquiv()): - self.log.debug(f"[{iiii}] {u.Unicode}") - for xx in mr.get_TextLine()[mj].get_Word(): - for iiii, u in enumerate(xx.get_TextEquiv()): - self.log.debug(f"[{iiii}] {u.Unicode}") - lines = [] - for ii, t in enumerate(ift): + for ii, page in enumerate(pcgts): if i >= len(alignments): break - tr = pcgtst[ii].get_Page().get_TextRegion() + tr = page.get_Page().get_AllRegions(classes=['Text']) region = tr[mi].get_TextLine()[mj] - lines.append(Alignment(t, region, alignments[i])) + lines.append(Alignment(file_groups[ii], page, region, alignments[i])) self.align_lines(lines) i += 1 - return pcgtst[0] + return pcgts[0] - def align_lines(self, lines): + def align_lines(self, lines: List[Alignment]) -> None: """align the given line alignment with the lines""" if not lines: return - if len(lines[0].region.get_TextEquiv()) > 1: - del lines[0].region.get_TextEquiv()[1:] + if len(lines[0].region.TextEquiv) > 1: + del lines[0].region.TextEquiv[1:] for i, line in enumerate(lines): if lines[0].region.get_TextEquiv() is 
None: lines[0].region.TextEquiv = [] - self.log.debug(f'line alignment: {get_textequiv_unicode(line.region)} ' - f'[{line.region.get_id()} - {line.input_file.input_file_group}]') - ddt = f"{line.input_file.input_file_group}/{line.region.get_id()}" - if i != 0: + self.logger.debug( + 'line alignment: %s [%s - %s]', + get_textequiv_unicode(line.region), + line.region.get_id(), + line.file_grp + ) + ddt = line.file_grp + "/" + line.region.get_id() + if i > 0: te = TextEquivType( Unicode=get_textequiv_unicode(line.region), conf=get_textequiv_conf(line.region), @@ -99,58 +73,64 @@ def align_lines(self, lines): dataTypeDetails=f"ocrd-cis-line-alignment:{ddt}") lines[0].region.add_TextEquiv(te) else: - self.log.debug("len: %i, i: %i", len(lines[0].region.get_TextEquiv()), i) - lines[0].region.get_TextEquiv()[i].set_dataType("other") - lines[0].region.get_TextEquiv()[i].set_dataTypeDetails( - f"ocrd-cis-line-alignment-master-ocr:{ddt}") - lines[0].region.get_TextEquiv()[i].set_index(i+1) + self.logger.debug("len: %i, i: %i", len(lines[0].region.TextEquiv), i) + lines[0].region.TextEquiv[i].set_dataType("other") + lines[0].region.TextEquiv[i].set_dataTypeDetails( + "ocrd-cis-line-alignment-master-ocr:" + ddt) + lines[0].region.TextEquiv[i].set_index(i+1) self.align_words(lines) - def align_words(self, lines): - # self.log.info(json.dumps(lines[0].alignment)) + def align_words(self, lines: List[Alignment]) -> None: + # self.logger.info(json.dumps(lines[0].alignment)) mregion = lines[0].region.get_Word() oregion = [lines[i].region.get_Word() for i in range(1, len(lines))] for word in lines[0].alignment['wordAlignments']: - self.log.debug(f"aligning word {word['master']}", ) + self.logger.debug("aligning word %s", word['master']) master, rest = self.find_word([word['master']], mregion, "master") mregion = rest if master is None or len(master) != 1: - self.log.warn(f"cannot find {word['master']}; giving up") - # raise Exception(f"cannot find {word['master']}; giving up") + self.logger.warn("cannot find {}; giving up".format(word['master'])) + # raise Exception("cannot find {}; giving up".format(word['master'])) return others = list() for i, other in enumerate(word['alignments']): match, rest = self.find_word(other, oregion[i]) if match is None: - self.log.warn(f"cannot find {other}; giving up") + self.logger.warn(f"cannot find {other}; giving up") return others.append(match) oregion[i] = rest words = list() words.append( - Alignment(lines[0].input_file, master, lines[0].alignment)) + Alignment(lines[0].file_grp, lines[0].pcgts, master, lines[0].alignment)) for i, other in enumerate(others): - words.append(Alignment(lines[i+1].input_file, other, lines[i+1].alignment)) + words.append(Alignment( + lines[i+1].file_grp, + lines[i+1].pcgts, + other, + lines[i+1].alignment)) self.align_word_regions(words) - def align_word_regions(self, words): + def align_word_regions(self, words: List[Alignment]) -> None: def te0(x): - return x.get_TextEquiv()[0] + return x.TextEquiv[0] for i, word in enumerate(words): if not word.region: - ifg = word.input_file.input_file_group - self.log.debug(f"(empty) word alignment: [{ifg}]") - te = TextEquivType(dataType="other", dataTypeDetails=f"ocrd-cis-empty-word-alignment:{ifg}") + ifg = word.file_grp + self.logger.debug("(empty) word alignment: [%s]", ifg) + te = TextEquivType( + dataType="other", + dataTypeDetails="ocrd-cis-empty-word-alignment:" + ifg) words[0].region[0].add_TextEquiv(te) words[0].region[0].get_TextEquiv()[i].set_index(i+1) continue _str = " 
".join([te0(x).Unicode for x in word.region]) _id = ",".join([x.get_id() for x in word.region]) - ifg = word.input_file.input_file_group - ddt = word.input_file.input_file_group + "/" + _id + ifg = word.file_grp + ddt = word.file_grp + "/" + _id # if conf is none it is most likely ground truth data conf = min([float(te0(x).get_conf() or "1.0") for x in word.region]) - self.log.debug(f"word alignment: {_str} [{_id} - {ifg}]") + self.logger.debug(f"word alignment: {_str} [{_id} - {ifg}]") if i != 0: te = TextEquivType( Unicode=_str, conf=conf, dataType="other", dataTypeDetails=f"ocrd-cis-word-alignment:{ddt}") @@ -162,21 +142,21 @@ def te0(x): def find_word(self, tokens, regions, t="other"): tokens_str = f"tokens = {tokens} [{t}]" - self.log.debug(tokens_str) + self.logger.debug(tokens_str) for i, _ in enumerate(regions): n = self.match_tokens(tokens, regions, i) if n == 0: continue return tuple([regions[i:n], regions[i:]]) # not found try again with levenshtein - self.log.warn(f"could not find {tokens_str}; trying again") + self.logger.warn(f"could not find {tokens_str}; trying again") for i, _ in enumerate(regions): n = self.match_tokens_lev(tokens, regions, i) if n == 0: continue return tuple([regions[i:n], regions[i:]]) # not found try again to match token within another one - self.log.warn(f"could not find {tokens_str}; trying again") + self.logger.warn(f"could not find {tokens_str}; trying again") for i, _ in enumerate(regions): n = self.match_tokens_within(tokens, regions, i) if n == 0: @@ -197,7 +177,7 @@ def match_tokens_lev(self, tokens, regions, i): def f(a, b): k = 3 # int(len(a)/3) d = Levenshtein.distance(a, b) - self.log.debug(f"lev {a} <=> {b}: {d} ({d})") + self.logger.debug(f"lev {a} <=> {b}: {d} ({d})") return d <= 1 or d <= k return self.match_tokens_lambda(tokens, regions, i, f) @@ -215,11 +195,11 @@ def match_tokens_lambda(self, tokens, regions, i, f): sum_i_j = j + i if sum_i_j >= len(regions): return 0 - unicode = regions[sum_i_j].get_TextEquiv()[0].Unicode + unicode = regions[sum_i_j].TextEquiv[0].Unicode if not unicode: - self.log.warn(f"cannot find {token}") + self.logger.warn(f"cannot find {token}") return 0 - self.log.debug(f'checking {token} with {unicode}') + self.logger.debug(f'checking {token} with {unicode}') if f(token, unicode): continue if j == 0: @@ -230,68 +210,29 @@ def match_tokens_lambda(self, tokens, regions, i, f): i += 1 return i + len(tokens) - def open_input_file_tuples(self, ift): - """ - opens all xml files of the given input file tuple - and returns them as tuples - """ - res = list() - for ifile in ift: - pcgts = ifile.open() - res.append(pcgts) - return tuple(res) - - def zip_input_files(self, ifgs): - """Zip files of the given input file groups""" - files = list() - for ifg in ifgs: - self.log.info(f"input file group: {ifg}") - ifiles = sorted( - self.workspace.mets.find_files(fileGrp=ifg), - key=lambda ifile: ifile.url) - for i in ifiles: - self.log.debug(f"sorted file: {os.path.basename(i.url)} {i.ID}") - ifiles = [FileAlignment(self.workspace, x, ifg) for x in ifiles] - files.append(ifiles) - return zip(*files) - - def read_lines_from_input_file(self, ifile): - self.log.info(f"reading input file: {ifile}") + def run_java_aligner(self, input_pcgts: List[OcrdPage]) -> str: lines = list() - pcgts = ifile.open() - for region in pcgts.get_Page().get_TextRegion(): - for line in region.get_TextLine(): - lines.append(get_textequiv_unicode(line)) - return lines - - def run_java_aligner(self, ifs): - lines = list() - for ifile in ifs: - 
lines.append(self.read_lines_from_input_file(ifile)) + for pcgts in input_pcgts: + lines.append([get_textequiv_unicode(line) + for line in pcgts.get_Page().get_AllTextLines()]) + # JavaAligner expects a strange input format lines = zip(*lines) _input = [x.strip() for t in lines for x in t] for i in _input: - self.log.debug(f"input line: {i}") - n = len(ifs) - self.log.debug("starting java client") - p = JavaAligner(n, getLevelName(self.log.getEffectiveLevel())) + self.logger.debug("input line: %s", i) + n = len(input_pcgts) + self.logger.debug("starting java client") + p = JavaAligner(n, getLevelName(self.logger.getEffectiveLevel())) return p.run("\n".join(_input)) -class FileAlignment: - def __init__(self, workspace, ifile, ifg): - self.workspace = workspace - self.input_file = ifile - self.input_file_group = ifg - self.log = getLogger('cis.FileAlignment') - - def open(self): - self.log.info(f"opening: {os.path.basename(self.input_file.url)}") - return page_from_file(self.workspace.download_file(self.input_file)) - - class Alignment: - def __init__(self, ifile, region, alignment): - self.input_file = ifile + file_grp: str + pcgts: OcrdPage + region: TextRegionType + alignment: Alignment + def __init__(self, file_grp: str, pcgts: OcrdPage, region: TextRegionType, alignment: Alignment): + self.file_grp = file_grp + self.pcgts = pcgts self.region = region self.alignment = alignment From d4f8483ffdefac50161e4376637b9f8e813c384f Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 22 Aug 2024 02:21:58 +0200 Subject: [PATCH 74/97] use ocrd_utils instead of pkg_resources --- ocrd_cis/data/__main__.py | 10 +++++----- ocrd_cis/javaprocess.py | 5 ++--- 2 files changed, 7 insertions(+), 8 deletions(-) diff --git a/ocrd_cis/data/__main__.py b/ocrd_cis/data/__main__.py index 3d8ef735..8fdcddd6 100644 --- a/ocrd_cis/data/__main__.py +++ b/ocrd_cis/data/__main__.py @@ -1,18 +1,18 @@ -import pkg_resources import sys +from ocrd_utils import resource_filename def main(): usage = 'usage: ' + sys.argv[0] + ' -jar|-3gs|-model|-config' if '-h' in sys.argv: print(usage) elif '-jar' in sys.argv: - print(pkg_resources.resource_filename('ocrd_cis', 'data/ocrd-cis.jar')) + print(resource_filename('ocrd_cis', 'data/ocrd-cis.jar')) elif '-3gs' in sys.argv: - print(pkg_resources.resource_filename('ocrd_cis', 'data/3gs.csv.gz')) + print(resource_filename('ocrd_cis', 'data/3gs.csv.gz')) elif '-model' in sys.argv: - print(pkg_resources.resource_filename('ocrd_cis', 'data/model.zip')) + print(resource_filename('ocrd_cis', 'data/model.zip')) elif '-config' in sys.argv: - print(pkg_resources.resource_filename('ocrd_cis', 'data/config.json')) + print(resource_filename('ocrd_cis', 'data/config.json')) else: raise ValueError(usage) diff --git a/ocrd_cis/javaprocess.py b/ocrd_cis/javaprocess.py index ce2f6bfd..72915d68 100644 --- a/ocrd_cis/javaprocess.py +++ b/ocrd_cis/javaprocess.py @@ -1,12 +1,11 @@ import subprocess import json -import pkg_resources -from ocrd_utils import getLogger +from ocrd_utils import getLogger, resource_filename from pathlib import Path MAIN = "de.lmu.cis.ocrd.cli.Main" -JAR = pkg_resources.resource_filename('ocrd_cis', 'data/ocrd-cis.jar') +JAR = str(resource_filename('ocrd_cis', 'data/ocrd-cis.jar')) def JavaAligner(n, loglvl): """Create a java process that calls -c align -D '{"n":n}'""" From ecc44c0358354c0c3c3ba6000e7de7413dc9cef1 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 22 Aug 2024 13:31:09 +0200 Subject: [PATCH 75/97] postcorrect: use final v3 API --- 
ocrd_cis/align/cli.py | 1 + ocrd_cis/postcorrect/cli.py | 22 ++++++++++------------ 2 files changed, 11 insertions(+), 12 deletions(-) diff --git a/ocrd_cis/align/cli.py b/ocrd_cis/align/cli.py index f85b7348..f5e47785 100644 --- a/ocrd_cis/align/cli.py +++ b/ocrd_cis/align/cli.py @@ -16,6 +16,7 @@ from ocrd_models.ocrd_page import TextRegionType, TextEquivType from ocrd_cis import JavaAligner + @click.command() @ocrd_cli_options def ocrd_cis_align(*args, **kwargs): diff --git a/ocrd_cis/postcorrect/cli.py b/ocrd_cis/postcorrect/cli.py index dc3ee48e..71fbaad1 100644 --- a/ocrd_cis/postcorrect/cli.py +++ b/ocrd_cis/postcorrect/cli.py @@ -1,14 +1,15 @@ from __future__ import absolute_import +import os + import click import json -import os + from ocrd import Processor -from ocrd.decorators import ocrd_cli_options -from ocrd.decorators import ocrd_cli_wrap_processor +from ocrd.decorators import ocrd_cli_options, ocrd_cli_wrap_processor from ocrd_utils import getLogger, getLevelName from ocrd_models.ocrd_mets import OcrdMets from ocrd_cis import JavaPostCorrector -from ocrd_cis import get_ocrd_tool + @click.command() @ocrd_cli_options @@ -16,26 +17,23 @@ def ocrd_cis_postcorrect(*args, **kwargs): return ocrd_cli_wrap_processor(PostCorrector, *args, **kwargs) class PostCorrector(Processor): - def __init__(self, *args, **kwargs): - ocrd_tool = get_ocrd_tool() - kwargs['ocrd_tool'] = ocrd_tool['tools']['ocrd-cis-postcorrect'] - kwargs['version'] = ocrd_tool['version'] - super(PostCorrector, self).__init__(*args, **kwargs) + @property + def executable(self): + return 'ocrd-cis-postcorrect' def process(self): - self.log = getLogger('processor.CISPostCorrector') profiler = {} profiler["path"] = self.parameter["profilerPath"] profiler["config"] = self.parameter["profilerConfig"] profiler["noCache"] = True self.parameter["profiler"] = profiler self.parameter["runDM"] = True - self.log.debug(json.dumps(self.parameter, indent=4)) + self.logger.debug(json.dumps(self.parameter, indent=4)) p = JavaPostCorrector(self.workspace.mets_target, self.input_file_grp, self.output_file_grp, self.parameter, - getLevelName(self.log.getEffectiveLevel())) + getLevelName(self.logger.getEffectiveLevel())) p.exe() # reload the mets file to prevent run_processor's save_mets # from overriding the results from the Java process From 2b310b4690b1a83be75cd93432ea38be7250ee35 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Fri, 23 Aug 2024 13:51:07 +0200 Subject: [PATCH 76/97] revert: ocropy.ocrolib changes --- ocrd_cis/ocropy/ocrolib/morph.py | 18 ++++++++---------- ocrd_cis/ocropy/ocrolib/toplevel.py | 26 ++++++++++++++++++-------- 2 files changed, 26 insertions(+), 18 deletions(-) diff --git a/ocrd_cis/ocropy/ocrolib/morph.py b/ocrd_cis/ocropy/ocrolib/morph.py index b9619cca..f7ccdc31 100644 --- a/ocrd_cis/ocropy/ocrolib/morph.py +++ b/ocrd_cis/ocropy/ocrolib/morph.py @@ -292,9 +292,8 @@ def propagate_labels_majority(image,labels): with the largest overlap.""" rlabels,_ = label(image) cors = correspondences(rlabels,labels) - amax_rlabels = amax(rlabels) + 1 - outputs = zeros(amax_rlabels,'i') - counts = zeros(amax_rlabels,'i') + outputs = zeros(amax(rlabels)+1,'i') + counts = zeros(amax(rlabels)+1,'i') for rlabel, label_, count in cors.T: if not rlabel or not label_: # ignore background correspondences @@ -348,13 +347,12 @@ def all_neighbors(image, dist=1, bg=NaN): """Given an image with labels, find all pairs of labels that are directly (up to ``dist``) neighboring each other, ignoring the label ``bg``.""" q = 100000 - 
assert amax(image) < q
-    assert amin(image) >= 0
-    q_image = q * image
-    u = unique(q_image + shift(image, (dist, 0), order=0, cval=bg))
-    d = unique(q_image + shift(image, (-dist, 0), order=0, cval=bg))
-    l = unique(q_image + shift(image, (0, dist), order=0, cval=bg))
-    r = unique(q_image + shift(image, (0, -dist), order=0, cval=bg))
+    assert amax(image)<q
+    assert amin(image)>=0
+    u = unique(q*image+shift(image, (dist, 0), order=0, cval=bg))
+    d = unique(q*image+shift(image, (-dist, 0), order=0, cval=bg))
+    l = unique(q*image+shift(image, (0, dist), order=0, cval=bg))
+    r = unique(q*image+shift(image, (0, -dist), order=0, cval=bg))
     all = unique(r_[u,d,l,r])
     all = all[all!=bg]
     all = c_[all//q,all%q]
diff --git a/ocrd_cis/ocropy/ocrolib/toplevel.py b/ocrd_cis/ocropy/ocrolib/toplevel.py
index 72e397af..87ed18c5 100644
--- a/ocrd_cis/ocropy/ocrolib/toplevel.py
+++ b/ocrd_cis/ocropy/ocrolib/toplevel.py
@@ -125,10 +125,14 @@ def __init__(self,*args,**kw):
         self.fun = kw.get("fun","?")
         self.var = kw.get("var","?")
         self.description = " ".join([strc(x) for x in args])
-
     def __str__(self):
-        return f"\nCheckError for argument {str(self.var)} of function {str(self.fun)}\n{self.description}"
-
+        result = "\nCheckError for argument "
+        result += str(self.var)
+        result += " of function "
+        result += str(self.fun)
+        result += "\n"
+        result += self.description
+        return result
 
 class CheckWarning(CheckError):
     def __init__(self,*args,**kw):
@@ -138,8 +142,14 @@ def __init__(self,*args,**kw):
         CheckError.__init__(self, *args, **kw)
 
     def __str__(self):
-        return (f"\nCheckWarning for argument {str(self.var)} of function {str(self.fun)}\n{self.description} "
-                f"(This can happen occasionally during normal operations and isn't necessarily a bug or problem.)\n")
+        result = "\nCheckWarning for argument "
+        result += str(self.var)
+        result += " of function "
+        result += str(self.fun)
+        result += "\n"
+        result += self.description
+        result += "(This can happen occasionally during normal operations and isn't necessarily a bug or problem.)\n"
+        return result
 
 def checktype(value,type_):
     """Check value against the type spec. 
If everything @@ -201,7 +211,7 @@ def argument_checks(*args,**kw): e.var = var raise e except: - LOG.critical(f"unknown exception while checking function: '{name}'") + LOG.critical("unknown exception while checking function: '%s'", name) raise result = f(*args,**kw) checktype(result,kw.get("_",True)) @@ -215,9 +225,9 @@ def decorator(f): def wrapper(arg): if not f(arg): if warning: - raise CheckWarning(f"{strc(arg)} of type {str(type(arg))}: {str(message)}") + raise CheckWarning(strc(arg)+" of type "+str(type(arg))+": "+str(message)) else: - raise CheckError(f"{strc(arg)} of type {str(type(arg))}: {str(message)}") + raise CheckError(strc(arg)+" of type "+str(type(arg))+": "+str(message)) return wrapper return decorator From 4420c6fa246c81f1fc7c14e7a1cb6dc1d2460e5f Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Fri, 23 Aug 2024 15:06:41 +0200 Subject: [PATCH 77/97] revert: ocropy.common changes --- ocrd_cis/ocropy/common.py | 186 +++++++++++++++++++------------------- 1 file changed, 93 insertions(+), 93 deletions(-) diff --git a/ocrd_cis/ocropy/common.py b/ocrd_cis/ocropy/common.py index a5806517..c23e89b9 100644 --- a/ocrd_cis/ocropy/common.py +++ b/ocrd_cis/ocropy/common.py @@ -184,19 +184,16 @@ def estimate_thresholds(flat, bignore=0.1, escale=1.0, lo=5, hi=90): d0, d1 = flat.shape o0, o1 = int(bignore * d0), int(bignore * d1) est = flat[o0:d0 - o0, o1:d1 - o1] - if escale > 0: # by default, we use only regions that contain # significant variance; this makes the percentile # based low and high estimates more reliable e = escale - e_20_0 = e * 20.0 - e_50 = int(e * 50) - v = est - filters.gaussian_filter(est, e_20_0) - v = filters.gaussian_filter(v ** 2, e_20_0) ** 0.5 + v = est - filters.gaussian_filter(est, e*20.0) + v = filters.gaussian_filter(v ** 2, e*20.0) ** 0.5 v = (v > 0.3 * np.amax(v)) - v = morphology.binary_dilation(v, structure=np.ones((e_50, 1))) - v = morphology.binary_dilation(v, structure=np.ones((1, e_50))) + v = morphology.binary_dilation(v, structure=np.ones((int(e * 50), 1))) + v = morphology.binary_dilation(v, structure=np.ones((1, int(e * 50)))) est = est[v] lo = stats.scoreatpercentile(est.ravel(), lo) hi = stats.scoreatpercentile(est.ravel(), hi) @@ -313,24 +310,24 @@ def check_line(binary, zoom=1.0): Returns an error report, or None if valid. 
""" - if np.prod(binary.shape) == 0: return "image dimensions are zero" - if len(binary.shape) == 3: return f"image is not monochrome {binary.shape}" + if np.prod(binary.shape)==0: return "image dimensions are zero" + if len(binary.shape)==3: return "image is not monochrome %s"%(binary.shape,) if np.amax(binary)==np.amin(binary): return "image is blank" if np.mean(binary)200/zoom: return f"image too tall for a text line {binary.shape}" + if h<20/zoom: return "image not tall enough for a text line %s"%(binary.shape,) + if h>200/zoom: return "image too tall for a text line %s"%(binary.shape,) ##if w<1.5*h: return "line too short %s"%(binary.shape,) - if w<1.5*h and w<32/zoom: return f"image too short for a line image {binary.shape}" - if w>4000/zoom: return f"image too long for a line image {binary.shape}" + if w<1.5*h and w<32/zoom: return "image too short for a line image %s"%(binary.shape,) + if w>4000/zoom: return "image too long for a line image %s"%(binary.shape,) return None ratio = w*1.0/h _, ncomps = measurements.label(binary) lo = int(0.5*ratio+0.5) hi = int(4*ratio)+1 - if ncomps={lo})" - ##if ncomps>hi*ratio: return f"too many connected components (got {ncomps}, wanted <={hi})" - if ncomps>hi*ratio and ncomps>10: return f"too many connected components (got {ncomps}, wanted <={hi})" + if ncomps=%d)"%(ncomps,lo) + ##if ncomps>hi*ratio: return "too many connected components (got %d, wanted <=%d)"%(ncomps,hi) + if ncomps>hi*ratio and ncomps>10: return "too many connected components (got %d, wanted <=%d)"%(ncomps,hi) return None # inspired by ocropus-gpageseg check_page @@ -344,21 +341,21 @@ def check_region(binary, zoom=1.0): Returns an error report, or None if valid. """ - if np.prod(binary.shape) == 0: return "image dimensions are zero" - if len(binary.shape) == 3: return f"image is not monochrome {binary.shape}" + if np.prod(binary.shape)==0: return "image dimensions are zero" + if len(binary.shape)==3: return "image is not monochrome %s"%(binary.shape,) if np.amax(binary)==np.amin(binary): return "image is blank" if np.mean(binary)5000/zoom: return f"image too tall for a region image {binary.shape}" - if w<100/zoom: return f"image too narrow for a region image {binary.shape}" - if w>5000/zoom: return f"image too wide for a region image {binary.shape}" + if h<45/zoom: return "image not tall enough for a region image %s"%(binary.shape,) + if h>5000/zoom: return "image too tall for a region image %s"%(binary.shape,) + if w<100/zoom: return "image too narrow for a region image %s"%(binary.shape,) + if w>5000/zoom: return "image too wide for a region image %s"%(binary.shape,) return None # zoom factor (DPI relative) and 4 (against fragmentation from binarization) slots = int(w*h*1.0/(30*30)*zoom*zoom) * 4 _,ncomps = measurements.label(binary) - if ncomps<5: return f"too few connected components for a region image (got {ncomps})" - if ncomps>slots and ncomps>10: return f"too many connected components for a region image ({ncomps} > {slots})" + if ncomps<5: return "too few connected components for a region image (got %d)"%(ncomps,) + if ncomps>slots and ncomps>10: return "too many connected components for a region image (%d > %d)"%(ncomps,slots) return None # from ocropus-gpageseg, but with zoom parameter @@ -372,21 +369,21 @@ def check_page(binary, zoom=1.0): Returns an error report, or None if valid. 
""" - if np.prod(binary.shape) == 0: return "image dimensions are zero" - if len(binary.shape) == 3: return f"image not monochrome {binary.shape}" + if np.prod(binary.shape)==0: return "image dimensions are zero" + if len(binary.shape)==3: return "image not monochrome %s"%(binary.shape,) if np.amax(binary)==np.amin(binary): return "image is blank" if np.mean(binary)20000/zoom: return f"image too tall for a page image {binary.shape}" - if w<600/zoom: return f"image too narrow for a page image {binary.shape}" - if w>20000/zoom: return f"image too wide for a page image {binary.shape}" + if h<600/zoom: return "image not tall enough for a page image %s"%(binary.shape,) + if h>20000/zoom: return "image too tall for a page image %s"%(binary.shape,) + if w<600/zoom: return "image too narrow for a page image %s"%(binary.shape,) + if w>20000/zoom: return "image too wide for a page image %s"%(binary.shape,) return None # zoom factor (DPI relative) and 4 (against fragmentation from binarization) slots = int(w*h*1.0/(30*30)*zoom*zoom) * 4 _,ncomps = measurements.label(binary) - if ncomps<10: return f"too few connected components for a page image (got {ncomps})" - if ncomps>slots and ncomps>10: return f"too many connected components for a page image ({ncomps} > {slots})" + if ncomps<10: return "too few connected components for a page image (got %d)"%(ncomps,) + if ncomps>slots and ncomps>10: return "too many connected components for a page image (%d > %d)"%(ncomps,slots) return None def odd(num): @@ -479,13 +476,8 @@ def compute_images(binary, scale, maximages=5): #images = morph.rb_closing(images, (d0,d1)) #DSAVE('images1_closed', images+0.6*binary) # 1- filter largest connected components - binary_0_6 = 0.6 * binary - odd_scale = odd(scale) - odd_half_scale = odd(scale / 2) - odd_doubled_scale = odd(2 * scale) - region_min = (4 * scale) ** 2 - images = morph.select_regions(images, sl.area, min=region_min, nbest=2 * maximages) - DSAVE('images1_large', images + binary_0_6) + images = morph.select_regions(images,sl.area,min=(4*scale)**2,nbest=2*maximages) + DSAVE('images1_large', images+0.6*binary) if not images.any(): return np.zeros_like(binary, int) # 2- open horizontally and vertically to suppress @@ -494,31 +486,31 @@ def compute_images(binary, scale, maximages=5): # single frame, because then the hull polygon # can cover/overlap large text/table parts which # we cannot discern from the actual image anymore - h_opened = morph.rb_opening(images, (1, odd_half_scale)) - DSAVE('images2_h-opened', h_opened + binary_0_6) - v_opened = morph.rb_opening(images, (odd_half_scale, 1)) - DSAVE('images2_v-opened', v_opened + binary_0_6) + h_opened = morph.rb_opening(images, (1, odd(scale/2))) + DSAVE('images2_h-opened', h_opened+0.6*binary) + v_opened = morph.rb_opening(images, (odd(scale/2), 1)) + DSAVE('images2_v-opened', v_opened+0.6*binary) # 3- close whatever remains - closed = morph.rb_closing(h_opened&v_opened, (odd_doubled_scale, odd_doubled_scale)) - DSAVE('images3_closed', closed + binary_0_6) + closed = morph.rb_closing(h_opened&v_opened, (odd(2*scale), odd(2*scale))) + DSAVE('images3_closed', closed + 0.6*binary) # 4- reconstruct the losses up to a certain distance # to avoid creeping into pure h/v-lines again but still # cover most of the large object #images = np.where(images, closed, 2) #images = morph.spread_labels(images, maxdist=scale) % 2 | closed images = morph.rb_reconstruction(closed, images, step=2, maxsteps=scale) - DSAVE('images4_reconstructed', images + binary_0_6) + 
DSAVE('images4_reconstructed', images+0.6*binary) # 5- select nbest - images = morph.select_regions(images, sl.area, min=region_min, nbest=maximages) - DSAVE('images5_selected', images + binary_0_6) + images = morph.select_regions(images, sl.area, min=(4*scale)**2, nbest=maximages) + DSAVE('images5_selected', images+0.6*binary) if not images.any(): return np.zeros_like(binary, int) # 6- dilate a little to get a smooth contour without gaps - dilated = morph.r_dilation(images, (odd_scale, odd_scale)) + dilated = morph.r_dilation(images, (odd(scale), odd(scale))) images = morph.propagate_labels_majority(binary, dilated+1) images = morph.spread_labels(images, maxdist=scale)==2 images, _ = morph.label(images) - DSAVE('images6_dilated', images + binary_0_6) + DSAVE('images6_dilated', images+0.6*binary) # we could repeat reconstruct-dilate here... return images @@ -556,7 +548,6 @@ def compute_seplines(binary, scale, maxseps=0): sepsizes = [0] sepslices = [None] sepdists = [0] - doubled_scale = 2 * scale for label in range(1, nlabels + 1): labelslice = slices[label] labelmask = labels == label @@ -608,8 +599,8 @@ def compute_seplines(binary, scale, maxseps=0): binmask = sublabels == bin + 1 binlabels, nbinlabels = morph.label(binmask) _, binlabelcounts = np.unique(binlabels, return_counts=True) - largemask = (binlabelcounts > doubled_scale)[binlabels] - smallmask = (binlabelcounts <= doubled_scale)[binlabels] + largemask = (binlabelcounts > 2 * scale)[binlabels] + smallmask = (binlabelcounts <= 2 * scale)[binlabels] sublabels2[binmask & smallmask] = 1 if not np.any(binmask & largemask): continue @@ -1852,13 +1843,11 @@ def find_topological(): else: llab[box] = lbinary[box] # show projection at the sides - log_y = -10 * np.log(y + 1e-9) - log_x = -10 * np.log(x + 1e-9) - for i in range(int(scale / 2)): - llab[box[0], box[1].start + i] = log_y - llab[box[0], box[1].stop - 1 - i] = log_y - llab[box[0].start + i, box[1]] = log_x - llab[box[0].stop - 1 - i, box[1]] = log_x + for i in range(int(scale/2)): + llab[box[0],box[1].start+i] = -10*np.log(y+1e-9) + llab[box[0],box[1].stop-1-i] = -10*np.log(y+1e-9) + llab[box[0].start+i,box[1]] = -10*np.log(x+1e-9) + llab[box[0].stop-1-i,box[1]] = -10*np.log(x+1e-9) DSAVE('recursive_x_y_cut_' + (partition_type or 'sliced'), llab) gap_weights = list() for is_horizontal, profile in enumerate([y, x]): @@ -1888,19 +1877,19 @@ def find_topological(): weights = weights * (1 + 0.5 * props['peak_heights']/gap_height) gap_weights.append((gaps, weights)) if debug: - orientation = 'horizontal' if is_horizontal else 'vertical' - LOG.debug(f' {orientation} gaps {gaps} {props} weights {weights}') + LOG.debug(' {} gaps {} {} weights {}'.format( + 'horizontal' if is_horizontal else 'vertical', + gaps, props, weights)) if not gaps.shape[0]: continue - half_scale = int(scale / 2) for start, stop, height in sorted(zip( props['left_ips'].astype(int), props['right_ips'].astype(int), props['peak_heights']), key=lambda x: x[2]): if is_horizontal: - llab[box[0].start+half_scale:box[0].stop-half_scale,box[1].start+start:box[1].start+stop] = -10*np.log(-height+1e-9) + llab[box[0].start+int(scale/2):box[0].stop-int(scale/2),box[1].start+start:box[1].start+stop] = -10*np.log(-height+1e-9) else: - llab[box[0].start+start:box[0].start+stop,box[1].start+half_scale:box[1].stop-half_scale] = -10*np.log(-height+1e-9) + llab[box[0].start+start:box[0].start+stop,box[1].start+int(scale/2):box[1].stop-int(scale/2)] = -10*np.log(-height+1e-9) DSAVE('recursive_x_y_cut_gaps_' + ('h' if is_horizontal 
else 'v'), llab) # heuristic (not strict) decision on x or y cut, # factors to consider: @@ -1927,27 +1916,32 @@ def find_topological(): # are not allowed y_gaps, y_weights = gap_weights[0][0], gap_weights[0][1] x_gaps, x_weights = gap_weights[1][0], gap_weights[1][1] - if debug: LOG.debug(f' all y_gaps {y_gaps} x_gaps {x_gaps}') + if debug: LOG.debug(' all y_gaps {} x_gaps {}'.format(y_gaps, x_gaps)) # suppress cuts that significantly split any line labels - min_line_scale = min_line * scale y_allowed = [not(np.any(np.intersect1d( # significant line labels above - np.nonzero(np.bincount(lbin[:gap,:].flatten(), minlength=len(objects))[1:] > min_line_scale)[0], + np.nonzero(np.bincount(lbin[:gap,:].flatten(), + minlength=len(objects))[1:] > min_line * scale)[0], # significant line labels below - np.nonzero(np.bincount(lbin[gap:,:].flatten(), minlength=len(objects))[1:] > min_line_scale)[0], - assume_unique=True))) for gap in y_gaps] + np.nonzero(np.bincount(lbin[gap:,:].flatten(), + minlength=len(objects))[1:] > min_line * scale)[0], + assume_unique=True))) + for gap in y_gaps] x_allowed = [not(np.any(np.intersect1d( # significant line labels left - np.nonzero(np.bincount(lbin[:,:gap].flatten(), minlength=len(objects))[1:] > min_line_scale)[0], + np.nonzero(np.bincount(lbin[:,:gap].flatten(), + minlength=len(objects))[1:] > min_line * scale)[0], # significant line labels right - np.nonzero(np.bincount(lbin[:,gap:].flatten(), minlength=len(objects))[1:] > min_line_scale)[0], - assume_unique=True))) for gap in x_gaps] + np.nonzero(np.bincount(lbin[:,gap:].flatten(), + minlength=len(objects))[1:] > min_line * scale)[0], + assume_unique=True))) + for gap in x_gaps] y_gaps, y_weights = y_gaps[y_allowed], y_weights[y_allowed] x_gaps, x_weights = x_gaps[x_allowed], x_weights[x_allowed] - if debug: LOG.debug(f' allowed y_gaps {y_gaps} x_gaps {x_gaps}') + if debug: LOG.debug(' allowed y_gaps {} x_gaps {}'.format(y_gaps, x_gaps)) y_prominence = np.amax(y_weights, initial=0) x_prominence = np.amax(x_weights, initial=0) - if debug: LOG.debug(f' y_prominence {y_prominence} x_prominence {x_prominence}') + if debug: LOG.debug(' y_prominence {} x_prominence {}'.format(y_prominence, x_prominence)) # suppress less prominent peaks (another heuristic...) # they must compete with the other direction next time # (when already new cuts or partitions will become visible) @@ -1955,30 +1949,33 @@ def find_topological(): x_allowed = x_weights > 0.8 * x_prominence y_gaps, y_weights = y_gaps[y_allowed], y_weights[y_allowed] x_gaps, x_weights = x_gaps[x_allowed], x_weights[x_allowed] - if debug: LOG.debug(f' prominent y_gaps {y_gaps} x_gaps {x_gaps}') + if debug: LOG.debug(' prominent y_gaps {} x_gaps {}'.format(y_gaps, x_gaps)) if npartitions > 0: # TODO this can be avoided when backtracking below # suppress peaks creating fewer partitions than others -- # how large in our preferred direction will the new partitions # of sepmask in both slices created by each cut candidate # add up? 
- y_partitionscores = [sum(map( - sl.height if prefer_vertical else sl.width, - morph.find_objects(morph.label(partitions[:gap, :] > 0)[0]) + - morph.find_objects(morph.label(partitions[gap:, :] > 0)[0]))) - for gap in y_gaps] - x_partitionscores = [sum(map( - sl.height if prefer_vertical else sl.width, - morph.find_objects(morph.label(partitions[:, : gap] > 0)[0]) + - morph.find_objects(morph.label(partitions[:, gap :] > 0)[0]))) - for gap in x_gaps] - if debug: LOG.debug(f' y_partitionscores {y_partitionscores} x_partitionscores {x_partitionscores}') + y_partitionscores = [sum(map(sl.height if prefer_vertical else sl.width, + morph.find_objects(morph.label( + partitions[:gap,:]>0)[0]) + + morph.find_objects(morph.label( + partitions[gap:,:]>0)[0]))) + for gap in y_gaps] + x_partitionscores = [sum(map(sl.height if prefer_vertical else sl.width, + morph.find_objects(morph.label( + partitions[:,:gap]>0)[0]) + + morph.find_objects(morph.label( + partitions[:,gap:]>0)[0]))) + for gap in x_gaps] + if debug: LOG.debug(' y_partitionscores {} x_partitionscores {}'.format( + y_partitionscores, x_partitionscores)) # Now identify those gaps with the largest overall score y_allowed = y_partitionscores == np.max(y_partitionscores, initial=0) x_allowed = x_partitionscores == np.max(x_partitionscores, initial=0) y_gaps, y_weights = y_gaps[y_allowed], y_weights[y_allowed] x_gaps, x_weights = x_gaps[x_allowed], x_weights[x_allowed] - if debug: LOG.debug(f' most partitioning y_gaps {y_gaps} x_gaps {x_gaps}') + if debug: LOG.debug(' most partitioning y_gaps {} x_gaps {}'.format(y_gaps, x_gaps)) else: y_partitionscores = None x_partitionscores = None @@ -1989,7 +1986,7 @@ def find_topological(): x_allowed = x_weights > 0.9 * x_prominence y_gaps, y_weights = y_gaps[y_allowed], y_weights[y_allowed] x_gaps, x_weights = x_gaps[x_allowed], x_weights[x_allowed] - if debug: LOG.debug(f' prominent y_gaps {y_gaps} x_gaps {x_gaps}') + if debug: LOG.debug(' prominent y_gaps {} x_gaps {}'.format(y_gaps, x_gaps)) # decide which direction, x or y # TODO: this most likely needs a backtracking mechanism @@ -2055,7 +2052,7 @@ def find_topological(): llab2[box] = partitions DSAVE('recursive_x_y_cut_partitions', llab2) for label in range(1, npartitions+1): - LOG.debug(f'next partition %d on %s', label, box) + LOG.debug('next partition %d on %s', label, box) recursive_x_y_cut(box, mask=partitions==label, partition_type=new_partition_type) return @@ -2063,9 +2060,10 @@ def find_topological(): # no gaps left finalize() return - orientation = 'vertical' if choose_vertical else 'horizontal' # otherwise: cut on gaps - LOG.debug(f'cutting {orientation}ly on {box} into {gaps}') + LOG.debug('cutting %s on %s into %s', 'vertically' + if choose_vertical else 'horizontally', + box, gaps) cuts = list(zip(np.insert(gaps, 0, 0), np.append(gaps, lim))) if choose_vertical: if rl: @@ -2080,7 +2078,9 @@ def find_topological(): sub = sl.box(0, len(y), start, stop) else: # "cut in horizontal direction" sub = sl.box(start, stop, 0, len(x)) - LOG.debug(f'next {orientation} block on {box} is {sub}') + LOG.debug('next %s block on %s is %s', 'horizontal' + if choose_vertical else 'vertical', + box, sub) recursive_x_y_cut(sl.compose(box,sub), mask=sl.cut(mask,sub) if isinstance(mask, np.ndarray) else None) From 2d8650ed51f5e9cc627d95ae5aea217b9f7bacb6 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Fri, 23 Aug 2024 15:15:50 +0200 Subject: [PATCH 78/97] remove whitespaces in ocropy.common and ocropy.ocrolib --- ocrd_cis/ocropy/common.py | 18 
+++++++++--------- ocrd_cis/ocropy/ocrolib/morph.py | 8 ++++---- 2 files changed, 13 insertions(+), 13 deletions(-) diff --git a/ocrd_cis/ocropy/common.py b/ocrd_cis/ocropy/common.py index c23e89b9..c5b56ed0 100644 --- a/ocrd_cis/ocropy/common.py +++ b/ocrd_cis/ocropy/common.py @@ -189,8 +189,8 @@ def estimate_thresholds(flat, bignore=0.1, escale=1.0, lo=5, hi=90): # significant variance; this makes the percentile # based low and high estimates more reliable e = escale - v = est - filters.gaussian_filter(est, e*20.0) - v = filters.gaussian_filter(v ** 2, e*20.0) ** 0.5 + v = est - filters.gaussian_filter(est, e * 20.0) + v = filters.gaussian_filter(v ** 2, e * 20.0) ** 0.5 v = (v > 0.3 * np.amax(v)) v = morphology.binary_dilation(v, structure=np.ones((int(e * 50), 1))) v = morphology.binary_dilation(v, structure=np.ones((1, int(e * 50)))) @@ -491,8 +491,8 @@ def compute_images(binary, scale, maximages=5): v_opened = morph.rb_opening(images, (odd(scale/2), 1)) DSAVE('images2_v-opened', v_opened+0.6*binary) # 3- close whatever remains - closed = morph.rb_closing(h_opened&v_opened, (odd(2*scale), odd(2*scale))) - DSAVE('images3_closed', closed + 0.6*binary) + closed = morph.rb_closing(h_opened&v_opened, (odd(2*scale),odd(2*scale))) + DSAVE('images3_closed', closed+0.6*binary) # 4- reconstruct the losses up to a certain distance # to avoid creeping into pure h/v-lines again but still # cover most of the large object @@ -501,12 +501,12 @@ def compute_images(binary, scale, maximages=5): images = morph.rb_reconstruction(closed, images, step=2, maxsteps=scale) DSAVE('images4_reconstructed', images+0.6*binary) # 5- select nbest - images = morph.select_regions(images, sl.area, min=(4*scale)**2, nbest=maximages) + images = morph.select_regions(images,sl.area,min=(4*scale)**2,nbest=maximages) DSAVE('images5_selected', images+0.6*binary) if not images.any(): return np.zeros_like(binary, int) # 6- dilate a little to get a smooth contour without gaps - dilated = morph.r_dilation(images, (odd(scale), odd(scale))) + dilated = morph.r_dilation(images, (odd(scale),odd(scale))) images = morph.propagate_labels_majority(binary, dilated+1) images = morph.spread_labels(images, maxdist=scale)==2 images, _ = morph.label(images) @@ -1969,7 +1969,7 @@ def find_topological(): partitions[:,gap:]>0)[0]))) for gap in x_gaps] if debug: LOG.debug(' y_partitionscores {} x_partitionscores {}'.format( - y_partitionscores, x_partitionscores)) + y_partitionscores, x_partitionscores)) # Now identify those gaps with the largest overall score y_allowed = y_partitionscores == np.max(y_partitionscores, initial=0) x_allowed = x_partitionscores == np.max(x_partitionscores, initial=0) @@ -2062,7 +2062,7 @@ def find_topological(): return # otherwise: cut on gaps LOG.debug('cutting %s on %s into %s', 'vertically' - if choose_vertical else 'horizontally', + if choose_vertical else 'horizontally', box, gaps) cuts = list(zip(np.insert(gaps, 0, 0), np.append(gaps, lim))) if choose_vertical: @@ -2079,7 +2079,7 @@ def find_topological(): else: # "cut in horizontal direction" sub = sl.box(start, stop, 0, len(x)) LOG.debug('next %s block on %s is %s', 'horizontal' - if choose_vertical else 'vertical', + if choose_vertical else 'vertical', box, sub) recursive_x_y_cut(sl.compose(box,sub), mask=sl.cut(mask,sub) if isinstance(mask, np.ndarray) diff --git a/ocrd_cis/ocropy/ocrolib/morph.py b/ocrd_cis/ocropy/ocrolib/morph.py index f7ccdc31..7d6ffc85 100644 --- a/ocrd_cis/ocropy/ocrolib/morph.py +++ b/ocrd_cis/ocropy/ocrolib/morph.py @@ -349,10 +349,10 
@@ def all_neighbors(image, dist=1, bg=NaN):
     q = 100000
     assert amax(image)<q
     assert amin(image)>=0
-    u = unique(q*image+shift(image, (dist, 0), order=0, cval=bg))
-    d = unique(q*image+shift(image, (-dist, 0), order=0, cval=bg))
-    l = unique(q*image+shift(image, (0, dist), order=0, cval=bg))
-    r = unique(q*image+shift(image, (0, -dist), order=0, cval=bg))
+    u = unique(q*image+shift(image,(dist,0),order=0,cval=bg))
+    d = unique(q*image+shift(image,(-dist,0),order=0,cval=bg))
+    l = unique(q*image+shift(image,(0,dist),order=0,cval=bg))
+    r = unique(q*image+shift(image,(0,-dist),order=0,cval=bg))
     all = unique(r_[u,d,l,r])
     all = all[all!=bg]
     all = c_[all//q,all%q]

From 9a153b079a3684bf875b306ba8eaad9e1637eeed Mon Sep 17 00:00:00 2001
From: Robert Sachunsky 
Date: Sun, 25 Aug 2024 02:01:36 +0200
Subject: [PATCH 79/97] postcorrect: adapt to frozendict Processor.parameter in v3

---
 ocrd_cis/__init__.py         |  1 -
 ocrd_cis/align/cli.py        |  1 -
 ocrd_cis/ocrd_tool.py        |  6 ----
 ocrd_cis/ocropy/binarize.py  |  6 +---
 ocrd_cis/ocropy/clip.py      |  9 +-----
 ocrd_cis/ocropy/denoise.py   |  8 +-----
 ocrd_cis/ocropy/deskew.py    |  8 +-----
 ocrd_cis/ocropy/dewarp.py    |  4 ---
 ocrd_cis/ocropy/recognize.py |  7 ++---
 ocrd_cis/ocropy/resegment.py |  9 +-----
 ocrd_cis/ocropy/segment.py   | 47 +++++++++++++++---------------
 ocrd_cis/ocropy/train.py     |  7 +----
 ocrd_cis/postcorrect/cli.py  | 55 +++++++++++++++++++++---------------
 13 files changed, 63 insertions(+), 105 deletions(-)
 delete mode 100644 ocrd_cis/ocrd_tool.py

diff --git a/ocrd_cis/__init__.py b/ocrd_cis/__init__.py
index 6f37f4f7..9d22fe3e 100644
--- a/ocrd_cis/__init__.py
+++ b/ocrd_cis/__init__.py
@@ -1,3 +1,2 @@
 from .javaprocess import JavaAligner
 from .javaprocess import JavaPostCorrector
-from .ocrd_tool import get_ocrd_tool
diff --git a/ocrd_cis/align/cli.py b/ocrd_cis/align/cli.py
index f5e47785..5706461e 100644
--- a/ocrd_cis/align/cli.py
+++ b/ocrd_cis/align/cli.py
@@ -11,7 +11,6 @@
return Image.fromarray(th), 0 class OcropyBinarize(Processor): - logger: Logger - @property def executable(self): return 'ocrd-cis-ocropy-binarize' def setup(self): - self.logger = getLogger('processor.OcropyBinarize') method = self.parameter['method'] if self.parameter['grayscale'] and method != 'ocropy': self.logger.critical(f'Requested method {method} does not support grayscale normalized output') diff --git a/ocrd_cis/ocropy/clip.py b/ocrd_cis/ocropy/clip.py index b81c731c..18a0c115 100644 --- a/ocrd_cis/ocropy/clip.py +++ b/ocrd_cis/ocropy/clip.py @@ -8,13 +8,11 @@ from shapely.prepared import prep from ocrd_models.ocrd_page import AlternativeImageType, OcrdPage -from ocrd import Processor -from ocrd.processor import OcrdPageResult, OcrdPageResultImage +from ocrd import Processor, OcrdPageResult, OcrdPageResultImage from ocrd_utils import ( bbox_from_polygon, coordinates_of_segment, crop_image, - getLogger, image_from_polygon, polygon_from_points, polygon_mask, @@ -25,15 +23,10 @@ class OcropyClip(Processor): - logger: Logger - @property def executable(self): return 'ocrd-cis-ocropy-clip' - def setup(self): - self.logger = getLogger('processor.OcropyClip') - def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: str = None) -> OcrdPageResult: """Clip text regions / lines of a page at intersections with neighbours. diff --git a/ocrd_cis/ocropy/denoise.py b/ocrd_cis/ocropy/denoise.py index eb3e7d23..eaed74df 100644 --- a/ocrd_cis/ocropy/denoise.py +++ b/ocrd_cis/ocropy/denoise.py @@ -4,21 +4,15 @@ from ocrd_utils import getLogger from ocrd_models.ocrd_page import AlternativeImageType, OcrdPage -from ocrd import Processor -from ocrd.processor import OcrdPageResult, OcrdPageResultImage +from ocrd import Processor, OcrdPageResult, OcrdPageResultImage from .common import determine_zoom, remove_noise class OcropyDenoise(Processor): - logger: Logger - @property def executable(self): return 'ocrd-cis-ocropy-denoise' - def setup(self): - self.logger = getLogger('processor.OcropyDenoise') - def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional[str] = None) -> OcrdPageResult: """Despeckle the pages / regions / lines of the workspace. diff --git a/ocrd_cis/ocropy/deskew.py b/ocrd_cis/ocropy/deskew.py index 7bdbba2d..b02c69d5 100644 --- a/ocrd_cis/ocropy/deskew.py +++ b/ocrd_cis/ocropy/deskew.py @@ -4,8 +4,7 @@ from ocrd_utils import getLogger from ocrd_models.ocrd_page import AlternativeImageType, OcrdPage, PageType -from ocrd import Processor -from ocrd.processor import OcrdPageResult, OcrdPageResultImage +from ocrd import Processor, OcrdPageResult, OcrdPageResultImage from . import common from .common import pil2array @@ -16,15 +15,10 @@ def deskew(pil_image, maxskew=2): return angle class OcropyDeskew(Processor): - logger: Logger - @property def executable(self): return 'ocrd-cis-ocropy-deskew' - def setup(self): - self.logger = getLogger('processor.OcropyDeskew') - def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional[str] = None) -> OcrdPageResult: """Deskew the pages or regions of the workspace. 
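The conversions above all follow the same ocrd v3 processor pattern: the executable property replaces the old module-level TOOL constant for the ocrd-tool.json lookup, self.logger comes from the Processor base class (hence the removed getLogger calls and logger: Logger annotations), and per-page work lives in process_page_pcgts. A minimal sketch of that pattern, assuming the v3 base-class behaviour implied by these patches; the class name and entry-point name below are illustrative only:

from typing import Optional
from ocrd import Processor, OcrdPageResult
from ocrd_models.ocrd_page import OcrdPage

class OcropyExample(Processor):
    """Hypothetical ocrd_cis-style processor, sketched for illustration."""

    @property
    def executable(self):
        # CLI entry-point name; the base class uses it to pick the matching
        # section of ocrd-tool.json (this replaces the old TOOL constant)
        return 'ocrd-cis-ocropy-example'

    def setup(self):
        # one-time initialisation; self.parameter is already validated here
        # and self.logger is provided by the base class
        self.logger.info("setup with parameters %s", dict(self.parameter))

    def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage],
                           page_id: Optional[str] = None) -> OcrdPageResult:
        # one PcGts per input file group; modify and return the first one
        pcgts = input_pcgts[0]
        self.logger.debug("processing page %s", page_id)
        return OcrdPageResult(pcgts)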
diff --git a/ocrd_cis/ocropy/dewarp.py b/ocrd_cis/ocropy/dewarp.py index 17d0b4ce..e33ce024 100644 --- a/ocrd_cis/ocropy/dewarp.py +++ b/ocrd_cis/ocropy/dewarp.py @@ -5,7 +5,6 @@ from ocrd import Processor from ocrd.processor import OcrdPageResult, OcrdPageResultImage -from ocrd_utils import getLogger from ocrd_models.ocrd_page import AlternativeImageType, OcrdPage from .ocrolib import lineest @@ -54,14 +53,11 @@ def padvert(image, range_): return array2pil(line) class OcropyDewarp(Processor): - logger: Logger - @property def executable(self): return 'ocrd-cis-ocropy-dewarp' def setup(self): - self.logger = getLogger('processor.OcropyDewarp') # defaults from ocrolib.lineest: self.lnorm = lineest.CenterNormalizer( params=(self.parameter['range'], diff --git a/ocrd_cis/ocropy/recognize.py b/ocrd_cis/ocropy/recognize.py index 02d29e7c..85a76585 100644 --- a/ocrd_cis/ocropy/recognize.py +++ b/ocrd_cis/ocropy/recognize.py @@ -10,10 +10,9 @@ from rapidfuzz.distance import Levenshtein -from ocrd_utils import coordinates_for_segment, getLogger, points_from_polygon, polygon_from_bbox +from ocrd_utils import coordinates_for_segment, points_from_polygon, polygon_from_bbox from ocrd_models.ocrd_page import CoordsType, GlyphType, OcrdPage, TextEquivType, WordType -from ocrd import Processor -from ocrd.processor import OcrdPageResult +from ocrd import Processor, OcrdPageResult from .common import check_line, pil2array from .ocrolib import lstm, load_object, midrange @@ -67,7 +66,6 @@ def recognize(image, pad, network, check=True): class OcropyRecognize(Processor): - logger: Logger network: Any pad: int @@ -76,7 +74,6 @@ def executable(self): return 'ocrd-cis-ocropy-recognize' def setup(self): - self.logger = getLogger('processor.OcropyRecognize') self.pad = 16 # from ocropus-rpred: self.network = load_object(self.get_model(), verbose=1) diff --git a/ocrd_cis/ocropy/resegment.py b/ocrd_cis/ocropy/resegment.py index c1809569..0fb133c0 100644 --- a/ocrd_cis/ocropy/resegment.py +++ b/ocrd_cis/ocropy/resegment.py @@ -9,7 +9,6 @@ from shapely.prepared import prep from ocrd_utils import ( - getLogger, coordinates_of_segment, coordinates_for_segment, points_from_polygon, @@ -17,8 +16,7 @@ transform_coordinates, ) from ocrd_models.ocrd_page import BaselineType, PageType, OcrdPage -from ocrd import Processor -from ocrd.processor import OcrdPageResult +from ocrd import Processor, OcrdPageResult from .ocrolib import midrange, morph from .common import ( @@ -43,15 +41,10 @@ ) class OcropyResegment(Processor): - logger: Logger - @property def executable(self): return 'ocrd-cis-ocropy-resegment' - def setup(self): - self.logger = getLogger('processor.OcropyResegment') - def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional[str] = None) -> OcrdPageResult: """Resegment lines of the workspace. 
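The "frozendict" in this patch's subject refers to self.parameter becoming a read-only mapping once it has been validated against the tool's parameter schema; the post-correction processor further below therefore copies it into a plain dict before adding runtime-only keys. A condensed sketch of that workaround, under the same assumption:

from ocrd import Processor

class PostCorrectLike(Processor):  # hypothetical subclass, for illustration only
    @property
    def executable(self):
        return 'ocrd-cis-postcorrect'  # as in the real processor below

    def setup(self):
        # self.parameter is frozen (read-only) once validated against the schema,
        # so copy it into a plain dict before adding keys the schema does not know
        self.params = dict(self.parameter)
        self.params["runDM"] = True  # extra runtime key, as the patch below adds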
diff --git a/ocrd_cis/ocropy/segment.py b/ocrd_cis/ocropy/segment.py index b363cbd2..493deb30 100644 --- a/ocrd_cis/ocropy/segment.py +++ b/ocrd_cis/ocropy/segment.py @@ -16,7 +16,6 @@ from shapely import set_precision from ocrd_utils import ( - getLogger, coordinates_of_segment, coordinates_for_segment, points_from_polygon, @@ -243,21 +242,17 @@ def getx(xy): class OcropySegment(Processor): - logger: Logger - @property def executable(self): return 'ocrd-cis-ocropy-segment' - def setup(self): - self.logger = getLogger('processor.OcropySegment') - def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional[str] = None) -> OcrdPageResult: """Segment pages into regions+lines, tables into cells+lines, or regions into lines. - + Open and deserialise PAGE input file and its respective images, then iterate over the element hierarchy down to the requested level. - + + \b Depending on ``level-of-operation``, consider existing segments: - If ``overwrite_separators=True`` on ``page`` level, then delete any SeparatorRegions. @@ -270,12 +265,13 @@ def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional - If ``overwrite_order=True`` on ``page`` or ``table`` level, then delete the reading order OrderedGroup entry corresponding to the (page/table) segment. - + Next, get each element image according to the layout annotation (from the alternative image of the page/region, or by cropping via coordinates into the higher-level image) in binarized form, and represent it as an array with non-text regions and (remaining) text neighbours suppressed. - + + \b Then compute a text line segmentation for that array (as a label mask). When ``level-of-operation`` is ``page`` or ``table``, this also entails detecting @@ -284,25 +280,26 @@ def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional - up to ``maxcolseps`` background column separators before text line segmentation itself, as well as aggregating text lines to text regions afterwards. - + Text regions are detected via a hybrid variant recursive X-Y cut algorithm (RXYC): RXYC partitions the binarized image in top-down manner by detecting horizontal or vertical gaps. This implementation uses the bottom-up text line segmentation to guide the search, and also uses both pre-existing and newly detected separators to alternatively partition the respective boxes into non-rectangular parts. - + During line segmentation, suppress the foreground of all previously annotated regions (of any kind) and lines, except if just removed due to ``overwrite``. During region aggregation however, combine the existing separators with the new-found separators to guide the column search. - + All detected segments (both text line and text region) are sorted according to their reading order (assuming a top-to-bottom, left-to-right ordering). When ``level-of-operation`` is ``page``, prefer vertical (column-first) succession of regions. When it is ``table``, prefer horizontal (row-first) succession of cells. - + + \b Then for each resulting segment label, convert its background mask into polygon outlines by finding the outer contours consistent with the element's polygon outline. Annotate the result by adding it as a new TextLine/TextRegion: @@ -314,7 +311,7 @@ def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional - If it is ``page``, then append the new lines to their respective regions, and append the new regions to the page. (Also, create an OrderedGroup for it in the ReadingOrder.) 
- + Produce a new output file by serialising the resulting hierarchy. """ # FIXME: allow passing a-priori info on reading order / textline order @@ -495,13 +492,13 @@ def _process_element(self, element, ignore, image, coords, zoom=1.0, rogroup=Non Given a PageType, TableRegionType or TextRegionType ``element``, and a corresponding binarized PIL.Image object ``image`` with coordinate metadata ``coords``, run line segmentation with Ocropy. - + If operating on the full page (or table), then also detect horizontal and vertical separators, and aggregate the lines into text regions afterwards. - + Add the resulting sub-segments to the parent ``element``. - + If ``ignore`` is not empty, then first suppress all foreground components in any of those segments' coordinates during segmentation, and if also in full page/table mode, then combine all separators among them with the @@ -773,7 +770,7 @@ def _process_element(self, element, ignore, image, coords, zoom=1.0, rogroup=Non def polygon_for_parent(polygon, parent): """Clip polygon to parent polygon range. - + (Should be moved to ocrd_utils.coordinates_for_segment.) """ childp = Polygon(polygon) @@ -986,7 +983,7 @@ def join_baselines(logger: Logger, baselines, loc=''): def page_get_reading_order(ro, rogroup): """Add all elements from the given reading order group to the given dictionary. - + Given a dict ``ro`` from layout element IDs to ReadingOrder element objects, and an object ``rogroup`` with additional ReadingOrder element objects, add all references to the dict, traversing the group recursively. @@ -1006,10 +1003,10 @@ def page_get_reading_order(ro, rogroup): def page_add_to_reading_order(rogroup, region_id, index=None): """Add a region reference to an un/ordered RO group. - + Given a ReadingOrder group ``rogroup`` (of any type), append a reference to region ``region_id`` to it. - + If ``index`` is given, use that as position and return incremented by one. (This must be an integer if ``rogroup`` is an OrderedGroup(Indexed). @@ -1025,16 +1022,16 @@ def page_add_to_reading_order(rogroup, region_id, index=None): def page_subgroup_in_reading_order(logger: Logger, roelem): """Replace given RO element by an equivalent OrderedGroup. - + Given a ReadingOrder element ``roelem`` (of any type), first look up its parent group. Remove it from the respective member list (of its region refs or un/ordered groups), even if it already was an OrderedGroup(Indexed). - + Then instantiate an empty OrderedGroup(Indexed), referencing the same region as ``roelem`` (and using the same index, if any). Add that group to the parent instead. - + Return the new group object. 
""" if not roelem: diff --git a/ocrd_cis/ocropy/train.py b/ocrd_cis/ocropy/train.py index 6c627231..78302f12 100644 --- a/ocrd_cis/ocropy/train.py +++ b/ocrd_cis/ocropy/train.py @@ -7,9 +7,7 @@ from os.path import abspath, dirname, exists, join, isfile from ocrd_models import OcrdPage -from ocrd import Processor, Workspace -from ocrd.processor import OcrdPageResult -from ocrd_utils import getLogger +from ocrd import Processor, Workspace, OcrdPageResult from .ocropus_rtrain import * from .binarize import binarize @@ -30,9 +28,7 @@ def resize_keep_ratio(image, baseheight=48): class OcropyTrain(Processor): - logger: Logger modelpath: str - old_cwd: str outputpath: str @property @@ -40,7 +36,6 @@ def executable(self): return 'ocrd-cis-ocropy-train' def setup(self): - self.logger = getLogger('processor.OcropyTrain') if 'model' in self.parameter: model = self.parameter['model'] try: diff --git a/ocrd_cis/postcorrect/cli.py b/ocrd_cis/postcorrect/cli.py index 71fbaad1..6759b96a 100644 --- a/ocrd_cis/postcorrect/cli.py +++ b/ocrd_cis/postcorrect/cli.py @@ -4,10 +4,9 @@ import click import json -from ocrd import Processor +from ocrd import Processor, Workspace from ocrd.decorators import ocrd_cli_options, ocrd_cli_wrap_processor -from ocrd_utils import getLogger, getLevelName -from ocrd_models.ocrd_mets import OcrdMets +from ocrd_utils import getLevelName, pushd_popd from ocrd_cis import JavaPostCorrector @@ -21,26 +20,38 @@ class PostCorrector(Processor): def executable(self): return 'ocrd-cis-postcorrect' - def process(self): + def setup(self): + # since ocrd v3.0 we cannot overwrite self.parameter anymore + # because that gets validated against the schema + # (so these additions would fail) + self.params = dict(self.parameter) profiler = {} profiler["path"] = self.parameter["profilerPath"] profiler["config"] = self.parameter["profilerConfig"] profiler["noCache"] = True - self.parameter["profiler"] = profiler - self.parameter["runDM"] = True - self.logger.debug(json.dumps(self.parameter, indent=4)) - p = JavaPostCorrector(self.workspace.mets_target, - self.input_file_grp, - self.output_file_grp, - self.parameter, - getLevelName(self.logger.getEffectiveLevel())) - p.exe() - # reload the mets file to prevent run_processor's save_mets - # from overriding the results from the Java process - self.workspace.reload_mets() - # workaround for cisocrgroup/ocrd-postcorrection#13 (absolute paths in output): - for output_file in self.workspace.find_files(file_grp=self.output_file_grp): - flocat = output_file._el.find('{http://www.loc.gov/METS/}FLocat') - flocat.attrib['LOCTYPE'] = 'OTHER' - flocat.attrib['OTHERLOCTYPE'] = 'FILE' - output_file.local_filename = os.path.relpath(output_file.local_filename, self.workspace.directory) + self.params["profiler"] = profiler + self.params["runDM"] = True + self.logger.debug(json.dumps(self.params, indent=4)) + + def process_workspace(self, workspace: Workspace): + with pushd_popd(workspace.directory): + self.workspace = workspace + self.verify() + # this CLI call mimics the OCR-D processor CLI itself + # we have no control over its interior + # (we get no page-wise error handling and input downloading) + p = JavaPostCorrector(self.workspace.mets_target, + self.input_file_grp, + self.output_file_grp, + self.params, + getLevelName(self.logger.getEffectiveLevel())) + p.exe() + # reload the mets file to prevent run_processor's save_mets + # from overriding the results from the Java process + self.workspace.reload_mets() + # workaround for cisocrgroup/ocrd-postcorrection#13 
(absolute paths in output): + for output_file in self.workspace.find_files(file_grp=self.output_file_grp): + flocat = output_file._el.find('{http://www.loc.gov/METS/}FLocat') + flocat.attrib['LOCTYPE'] = 'OTHER' + flocat.attrib['OTHERLOCTYPE'] = 'FILE' + output_file.local_filename = os.path.relpath(output_file.local_filename, self.workspace.directory) From bd0613a20fd4d7d88a466cc75f3e94be656f08bf Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Mon, 26 Aug 2024 11:36:53 +0200 Subject: [PATCH 80/97] require ocrd>=3.0.0b1 --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 38f09abd..83cf28bb 100644 --- a/setup.py +++ b/setup.py @@ -38,7 +38,7 @@ packages=find_packages(), include_package_data=True, install_requires=[ - 'ocrd>=3.0.0a1', + 'ocrd>=3.0.0b1', 'click', 'scipy', 'numpy>=1.17.0', From f6e437fc8d5ef7bbb51fa7b4f5d590a11c6fc627 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Tue, 27 Aug 2024 14:46:41 +0200 Subject: [PATCH 81/97] add: simple github actions workflow --- .github/workflow/tests.yml | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) create mode 100644 .github/workflow/tests.yml diff --git a/.github/workflow/tests.yml b/.github/workflow/tests.yml new file mode 100644 index 00000000..424409df --- /dev/null +++ b/.github/workflow/tests.yml @@ -0,0 +1,27 @@ +name: Test ocrd_cis installation and run tests + +on: + push: + branches: [ "master" ] + pull_request: + branches: [ "master" ] + +jobs: + build: + runs-on: ${{ matrix.os }} + strategy: + fail-fast: false + matrix: + python-version: [ "3.8", "3.9", "3.10", "3.11", "3.12" ] + os: [ "ubuntu-22.04" ] + + steps: + - uses: actions/checkout@v4 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + - name: Install ocrd_cis + run: make install + - name: Test ocrd_cis + run: make test From 403781a3c27e5fdb0cddcf311401dad1a24f83f8 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Tue, 27 Aug 2024 15:30:14 +0200 Subject: [PATCH 82/97] Update .github/workflow/tests.yml Co-authored-by: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> --- .github/workflow/tests.yml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/.github/workflow/tests.yml b/.github/workflow/tests.yml index 424409df..24fa0bc7 100644 --- a/.github/workflow/tests.yml +++ b/.github/workflow/tests.yml @@ -2,9 +2,8 @@ name: Test ocrd_cis installation and run tests on: push: - branches: [ "master" ] pull_request: - branches: [ "master" ] + workflow_dispatch: jobs: build: From 97083bb71e724276385058bde9244cbdd21dce64 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Tue, 27 Aug 2024 15:30:25 +0200 Subject: [PATCH 83/97] Update .github/workflow/tests.yml Co-authored-by: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> --- .github/workflow/tests.yml | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/.github/workflow/tests.yml b/.github/workflow/tests.yml index 24fa0bc7..559297dd 100644 --- a/.github/workflow/tests.yml +++ b/.github/workflow/tests.yml @@ -20,7 +20,11 @@ jobs: uses: actions/setup-python@v5 with: python-version: ${{ matrix.python-version }} + - uses: actions/setup-java@v4 + with: + distribution: 'zulu' + java-version: '11' - name: Install ocrd_cis run: make install - name: Test ocrd_cis - run: make test + run: make test V="" From 2b20e0c44da924a5b15379d86eb557acdf42b1f3 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Tue, 27 Aug 2024 
15:49:11 +0200 Subject: [PATCH 84/97] fix: checkout ref --- .github/workflow/tests.yml | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/.github/workflow/tests.yml b/.github/workflow/tests.yml index 559297dd..f95a09a4 100644 --- a/.github/workflow/tests.yml +++ b/.github/workflow/tests.yml @@ -15,7 +15,10 @@ jobs: os: [ "ubuntu-22.04" ] steps: - - uses: actions/checkout@v4 + - name: Checkout repository + uses: actions/checkout@v4 + with: + ref: ${{ github.head_ref }} - name: Set up Python ${{ matrix.python-version }} uses: actions/setup-python@v5 with: From 86a08eb5cc471eef536bc2d050e80f768a728e43 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Tue, 27 Aug 2024 16:08:48 +0200 Subject: [PATCH 85/97] Create GH Actions workflow: test.yml --- .github/workflows/test.yml | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) create mode 100644 .github/workflows/test.yml diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml new file mode 100644 index 00000000..f95a09a4 --- /dev/null +++ b/.github/workflows/test.yml @@ -0,0 +1,33 @@ +name: Test ocrd_cis installation and run tests + +on: + push: + pull_request: + workflow_dispatch: + +jobs: + build: + runs-on: ${{ matrix.os }} + strategy: + fail-fast: false + matrix: + python-version: [ "3.8", "3.9", "3.10", "3.11", "3.12" ] + os: [ "ubuntu-22.04" ] + + steps: + - name: Checkout repository + uses: actions/checkout@v4 + with: + ref: ${{ github.head_ref }} + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + - uses: actions/setup-java@v4 + with: + distribution: 'zulu' + java-version: '11' + - name: Install ocrd_cis + run: make install + - name: Test ocrd_cis + run: make test V="" From 1d7e9a0d5f72e66c92c07e15508ba330e130f6bb Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Tue, 27 Aug 2024 16:18:40 +0200 Subject: [PATCH 86/97] delete: wrong path for workflows --- .github/workflow/tests.yml | 33 --------------------------------- 1 file changed, 33 deletions(-) delete mode 100644 .github/workflow/tests.yml diff --git a/.github/workflow/tests.yml b/.github/workflow/tests.yml deleted file mode 100644 index f95a09a4..00000000 --- a/.github/workflow/tests.yml +++ /dev/null @@ -1,33 +0,0 @@ -name: Test ocrd_cis installation and run tests - -on: - push: - pull_request: - workflow_dispatch: - -jobs: - build: - runs-on: ${{ matrix.os }} - strategy: - fail-fast: false - matrix: - python-version: [ "3.8", "3.9", "3.10", "3.11", "3.12" ] - os: [ "ubuntu-22.04" ] - - steps: - - name: Checkout repository - uses: actions/checkout@v4 - with: - ref: ${{ github.head_ref }} - - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v5 - with: - python-version: ${{ matrix.python-version }} - - uses: actions/setup-java@v4 - with: - distribution: 'zulu' - java-version: '11' - - name: Install ocrd_cis - run: make install - - name: Test ocrd_cis - run: make test V="" From 224e86f5467c7506882792fa03397cbe032f69c9 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Tue, 27 Aug 2024 16:27:55 +0200 Subject: [PATCH 87/97] fix: NaN error for python3.9+ --- ocrd_cis/ocropy/ocrolib/morph.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ocrd_cis/ocropy/ocrolib/morph.py b/ocrd_cis/ocropy/ocrolib/morph.py index 7d6ffc85..1ebfb204 100644 --- a/ocrd_cis/ocropy/ocrolib/morph.py +++ b/ocrd_cis/ocropy/ocrolib/morph.py @@ -343,7 +343,7 @@ def select_regions(binary,f,min=0,nbest=100000): return 
keep[labels] @checks(SEGMENTATION) -def all_neighbors(image, dist=1, bg=NaN): +def all_neighbors(image, dist=1, bg=float('nan')): """Given an image with labels, find all pairs of labels that are directly (up to ``dist``) neighboring each other, ignoring the label ``bg``.""" q = 100000 From a397531e549532675341c15b6c4a6fbef1f96818 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Tue, 27 Aug 2024 16:29:37 +0200 Subject: [PATCH 88/97] fix: NaN in reading_order in morph.py --- ocrd_cis/ocropy/ocrolib/morph.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ocrd_cis/ocropy/ocrolib/morph.py b/ocrd_cis/ocropy/ocrolib/morph.py index 1ebfb204..4b626e83 100644 --- a/ocrd_cis/ocropy/ocrolib/morph.py +++ b/ocrd_cis/ocropy/ocrolib/morph.py @@ -429,7 +429,7 @@ def reading_order(seg,rl=False,bt=False): segmap[1:] = 1 return segmap def pos(f,l): - return array([f(x) if x else nan for x in l]) + return array([f(x) if x else float('nan') for x in l]) ys = pos(sl.ycenter,objects) yorder = argsort(ys)[::-1 if bt else 1] groups = [[yorder[0]]] From 9cf83051b2f1875b0757eb1d81ff0a29b7f63047 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sun, 1 Sep 2024 11:18:36 +0200 Subject: [PATCH 89/97] fix type hints --- ocrd_cis/align/cli.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ocrd_cis/align/cli.py b/ocrd_cis/align/cli.py index 5706461e..395f7b07 100644 --- a/ocrd_cis/align/cli.py +++ b/ocrd_cis/align/cli.py @@ -229,8 +229,8 @@ class Alignment: file_grp: str pcgts: OcrdPage region: TextRegionType - alignment: Alignment - def __init__(self, file_grp: str, pcgts: OcrdPage, region: TextRegionType, alignment: Alignment): + alignment: dict + def __init__(self, file_grp: str, pcgts: OcrdPage, region: TextRegionType, alignment: dict): self.file_grp = file_grp self.pcgts = pcgts self.region = region From a0c734dd3e357606bde1c121cd4e25c972087df6 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sun, 1 Sep 2024 11:19:29 +0200 Subject: [PATCH 90/97] dewarp: make thread-safe --- ocrd_cis/ocropy/dewarp.py | 25 ++++++++++++------------- ocrd_cis/ocropy/ocrolib/lineest.py | 2 +- 2 files changed, 13 insertions(+), 14 deletions(-) diff --git a/ocrd_cis/ocropy/dewarp.py b/ocrd_cis/ocropy/dewarp.py index e33ce024..a0d0ea5c 100644 --- a/ocrd_cis/ocropy/dewarp.py +++ b/ocrd_cis/ocropy/dewarp.py @@ -57,17 +57,6 @@ class OcropyDewarp(Processor): def executable(self): return 'ocrd-cis-ocropy-dewarp' - def setup(self): - # defaults from ocrolib.lineest: - self.lnorm = lineest.CenterNormalizer( - params=(self.parameter['range'], - self.parameter['smoothness'], - # let's not expose this for now - # (otherwise we must explain mutual - # dependency between smoothness - # and extra params) - 0.3)) - def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional[str] = None) -> OcrdPageResult: """Dewarp the lines of the workspace. 
@@ -94,6 +83,16 @@ def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional page_image, page_xywh, page_image_info = self.workspace.image_from_page(page, page_id) zoom = determine_zoom(self.logger, page_id, self.parameter['dpi'], page_image_info) + # defaults from ocrolib.lineest: + lnorm = lineest.CenterNormalizer( + params=(self.parameter['range'], + self.parameter['smoothness'], + # let's not expose this for now + # (otherwise we must explain mutual + # dependency between smoothness + # and extra params) + 0.3)) + regions = page.get_AllRegions(classes=['Text'], order='reading-order') if not regions: self.logger.warning(f'Page "{page_id}" contains no text regions') @@ -107,8 +106,8 @@ def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional self.logger.info(f"About to dewarp page '{page_id}' region '{region.id}' line '{line.id}'") try: dew_image = dewarp( - line_image, self.lnorm, check=True, max_neighbour=self.parameter['max_neighbour'], zoom=zoom) - except InvalidLine as err: + line_image, lnorm, check=True, max_neighbour=self.parameter['max_neighbour'], zoom=zoom) + except (InvalidLine, AssertionError) as err: self.logger.error(f'Cannot dewarp line "{line.id}": {err}') continue except InadequateLine as err: diff --git a/ocrd_cis/ocropy/ocrolib/lineest.py b/ocrd_cis/ocropy/ocrolib/lineest.py index 42ef2237..392c7e4a 100644 --- a/ocrd_cis/ocropy/ocrolib/lineest.py +++ b/ocrd_cis/ocropy/ocrolib/lineest.py @@ -75,7 +75,7 @@ def measure(self,line): plt.plot(self.center) plt.ginput(1,1000) def dewarp(self,img,cval=0,dtype=np.dtype('f')): - assert img.shape==self.shape + assert img.shape==self.shape, f"input shape {img.shape} deviates from measured shape {self.shape}" h,w = img.shape # The actual image img is embedded into a larger image by # adding vertical space on top and at the bottom (padding) From 66baaf07f60532185a41ea606c31964ee046c8ba Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sun, 1 Sep 2024 11:21:19 +0200 Subject: [PATCH 91/97] recognize: disallow multithreading (impossible with current lstm implementation) --- ocrd_cis/ocropy/recognize.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/ocrd_cis/ocropy/recognize.py b/ocrd_cis/ocropy/recognize.py index 85a76585..97bec8a7 100644 --- a/ocrd_cis/ocropy/recognize.py +++ b/ocrd_cis/ocropy/recognize.py @@ -45,7 +45,7 @@ def recognize(image, pad, network, check=True): pred = network.predictString(line) # getting confidence - result = lstm.translate_back(network.outputs, pos=1) + result = lstm.translate_back(network.outputs, pos=1) # raw positions scale = len(raw_line.T) * 1.0 / (len(network.outputs) - 2 * pad) clist = [] @@ -68,6 +68,8 @@ def recognize(image, pad, network, check=True): class OcropyRecognize(Processor): network: Any pad: int + # lstm is not thread-safe (.outputs, .last_n as side effects etc) + max_workers = 1 @property def executable(self): @@ -191,7 +193,7 @@ def process_lines(self, textlines, maxlevel, region_image, region_xywh): try: linepred, clist, rlist, confidlist = recognize(final_img, self.pad, self.network, check=True) except Exception as err: - self.logger.debug(f'Error processing line "{line.id}": {err}') + self.logger.debug(f'Error processing line "{line.id}": {str(err) or err.__class__.__name__}') continue self.logger.debug(f"OCR '{line.id}': '{linepred}'") edits += Levenshtein.distance(linepred, linegt) From 32ce6560d9c1e10fdfd00055e567b0fe13187404 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sun, 1 Sep 2024 11:22:14 
+0200 Subject: [PATCH 92/97] postcorrect: make work under METS Server --- ocrd_cis/postcorrect/cli.py | 25 ++++++++++++++++++++----- 1 file changed, 20 insertions(+), 5 deletions(-) diff --git a/ocrd_cis/postcorrect/cli.py b/ocrd_cis/postcorrect/cli.py index 6759b96a..70918de7 100644 --- a/ocrd_cis/postcorrect/cli.py +++ b/ocrd_cis/postcorrect/cli.py @@ -1,12 +1,14 @@ from __future__ import absolute_import import os +import json import click -import json from ocrd import Processor, Workspace from ocrd.decorators import ocrd_cli_options, ocrd_cli_wrap_processor from ocrd_utils import getLevelName, pushd_popd +from ocrd_models import OcrdMets + from ocrd_cis import JavaPostCorrector @@ -37,6 +39,8 @@ def process_workspace(self, workspace: Workspace): with pushd_popd(workspace.directory): self.workspace = workspace self.verify() + # ensure that input files are referenced in on-disk METS + self.workspace.save_mets() # this CLI call mimics the OCR-D processor CLI itself # we have no control over its interior # (we get no page-wise error handling and input downloading) @@ -46,12 +50,23 @@ def process_workspace(self, workspace: Workspace): self.params, getLevelName(self.logger.getEffectiveLevel())) p.exe() - # reload the mets file to prevent run_processor's save_mets - # from overriding the results from the Java process - self.workspace.reload_mets() # workaround for cisocrgroup/ocrd-postcorrection#13 (absolute paths in output): - for output_file in self.workspace.find_files(file_grp=self.output_file_grp): + # We cannot do that with this method, because our self.workspace.mets might be + # a ClientSideOcrdMets, which does not allow modifying or removing files: + # for output_file in self.workspace.find_files(file_grp=self.output_file_grp): + # flocat = output_file._el.find('{http://www.loc.gov/METS/}FLocat') + # flocat.attrib['LOCTYPE'] = 'OTHER' + # flocat.attrib['OTHERLOCTYPE'] = 'FILE' + # output_file.local_filename = os.path.relpath(output_file.local_filename, self.workspace.directory) + # So instead, let's post-process the local METS file result directly: + mets = OcrdMets(filename=self.workspace.mets_target) + for output_file in mets.find_files(fileGrp=self.output_file_grp): flocat = output_file._el.find('{http://www.loc.gov/METS/}FLocat') flocat.attrib['LOCTYPE'] = 'OTHER' flocat.attrib['OTHERLOCTYPE'] = 'FILE' output_file.local_filename = os.path.relpath(output_file.local_filename, self.workspace.directory) + with open(self.workspace.mets_target, 'w') as f: + f.write(mets.to_xml(xmllint=True).decode('utf-8')) + # reload the mets file to prevent run_processor's save_mets + # from overriding the results from the Java process + self.workspace.reload_mets() From c4a5999d905d23a8e347eed2b257363c0c2545af Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sun, 1 Sep 2024 11:24:41 +0200 Subject: [PATCH 93/97] tests: use METS Server if OCRD_MAX_PARALLEL_PAGES>1 --- tests/run_add_zip_test.bash | 5 +-- tests/run_alignment_test.bash | 5 +-- tests/run_image_preprocessing_test.bash | 15 +++++---- tests/run_ocr_test.bash | 7 ++-- tests/run_postcorrection_test.bash | 19 +++++------ tests/run_training_test.bash | 7 ++-- tests/test_lib.bash | 43 ++++++++++++++++++++----- 7 files changed, 68 insertions(+), 33 deletions(-) diff --git a/tests/run_add_zip_test.bash b/tests/run_add_zip_test.bash index 02de2db2..e2d44983 100644 --- a/tests/run_add_zip_test.bash +++ b/tests/run_add_zip_test.bash @@ -6,7 +6,7 @@ ocrd_cis_init_ws blumenbach_anatomie_1805.ocrd.zip # test if there are 3 gt files pushd "$tmpws" 
found_files=0 -for file in $(ocrd workspace find -G OCR-D-GT-SEG-LINE); do +for file in $(ocrd ${OCRD_LOG_ARGS[*]} workspace ${OCRD_WS_ARGS[*]} find -G OCR-D-GT-SEG-LINE); do [[ -f "$file" ]] || fail "cannot find ground truth file: $file" found_files=$((found_files + 1)) done @@ -16,9 +16,10 @@ popd # test if there are 3 gt files pushd "$tmpws" found_files=0 -for file in $(ocrd workspace find -G OCR-D-IMG); do +for file in $(ocrd ${OCRD_LOG_ARGS[*]} workspace ${OCRD_WS_ARGS[*]} find -G OCR-D-IMG); do [[ -f "$file" ]] || fail "cannot find ground truth file: $file" found_files=$((found_files + 1)) done (( found_files == 3 )) || fail "invalid number of files: $found_files" popd + diff --git a/tests/run_alignment_test.bash b/tests/run_alignment_test.bash index e8a3c79a..7a82254b 100644 --- a/tests/run_alignment_test.bash +++ b/tests/run_alignment_test.bash @@ -6,7 +6,7 @@ ocrd_cis_init_ws blumenbach_anatomie_1805.ocrd.zip # test if there are 3 gt files pushd "$tmpws" found_files=0 -for file in $(ocrd workspace find -G $OCRD_CIS_FILEGRP); do +for file in $(ocrd ${OCRD_LOG_ARGS[*]} workspace ${OCRD_WS_ARGS[*]} find -G $OCRD_CIS_FILEGRP); do [[ -f "$file" ]] || fail "cannot find ground truth file: $file" found_files=$((found_files + 1)) done @@ -17,9 +17,10 @@ ocrd_cis_align pushd $tmpws found_files=0 -for file in $(ocrd workspace find -G OCR-D-CIS-ALIGN); do +for file in $(ocrd ${OCRD_LOG_ARGS[*]} workspace ${OCRD_WS_ARGS[*]} find -G OCR-D-CIS-ALIGN); do [[ -f "$file" ]] || fail "cannot find aligned file group workspace" found_files=$((found_files + 1)) done (( found_files == 3 )) || fail "invalid number of files: $found_files" popd + diff --git a/tests/run_image_preprocessing_test.bash b/tests/run_image_preprocessing_test.bash index f80fc636..7a66a57b 100644 --- a/tests/run_image_preprocessing_test.bash +++ b/tests/run_image_preprocessing_test.bash @@ -7,16 +7,17 @@ ocrd_cis_init_ws "blumenbach_anatomie_1805.ocrd.zip" # test if there are 3 gt files pushd "$tmpws" found_files=0 -for file in $(ocrd workspace find -G $OCRD_CIS_FILEGRP); do +for file in $(ocrd ${OCRD_LOG_ARGS[*]} workspace ${OCRD_WS_ARGS[*]} find -G $OCRD_CIS_FILEGRP); do [[ -f "$file" ]] || fail "cannot find ground truth file: $file" found_files=$((found_files + 1)) done (( found_files == 3 )) || fail "invalid number of files: $found_files" -ocrd-cis-ocropy-binarize -l DEBUG -I $OCRD_CIS_FILEGRP -O OCR-D-CIS-IMG-BIN -ocrd-cis-ocropy-clip -l DEBUG -I OCR-D-CIS-IMG-BIN -O OCR-D-CIS-IMG-CLIP -ocrd-cis-ocropy-denoise -l DEBUG -I OCR-D-CIS-IMG-CLIP -O OCR-D-CIS-IMG-DEN -ocrd-cis-ocropy-deskew -l DEBUG -I OCR-D-CIS-IMG-DEN -O OCR-D-CIS-IMG-DES -ocrd-cis-ocropy-dewarp -l DEBUG -I OCR-D-CIS-IMG-DES -O OCR-D-CIS-IMG-DEW -ocrd-cis-ocropy-segment -l DEBUG -I OCR-D-CIS-IMG-DEW -O OCR-D-CIS-IMG-SEG +ARGS=(${OCRD_LOG_ARGS[*]} ${OCRD_WS_ARGS[*]}) +ocrd-cis-ocropy-binarize ${ARGS[*]} -I $OCRD_CIS_FILEGRP -O OCR-D-CIS-IMG-BIN +ocrd-cis-ocropy-clip ${ARGS[*]} -I OCR-D-CIS-IMG-BIN -O OCR-D-CIS-IMG-CLIP +ocrd-cis-ocropy-denoise ${ARGS[*]} -I OCR-D-CIS-IMG-CLIP -O OCR-D-CIS-IMG-DEN +ocrd-cis-ocropy-deskew ${ARGS[*]} -I OCR-D-CIS-IMG-DEN -O OCR-D-CIS-IMG-DES +ocrd-cis-ocropy-dewarp ${ARGS[*]} -I OCR-D-CIS-IMG-DES -O OCR-D-CIS-IMG-DEW +ocrd-cis-ocropy-segment ${ARGS[*]} -I OCR-D-CIS-IMG-DEW -O OCR-D-CIS-IMG-SEG popd diff --git a/tests/run_ocr_test.bash b/tests/run_ocr_test.bash index b10f6f6d..f737ae43 100644 --- a/tests/run_ocr_test.bash +++ b/tests/run_ocr_test.bash @@ -6,7 +6,7 @@ ocrd_cis_init_ws blumenbach_anatomie_1805.ocrd.zip # test if there are 
3 gt files pushd "$tmpws" found_files=0 -for file in $(ocrd workspace find -G $OCRD_CIS_FILEGRP); do +for file in $(ocrd ${OCRD_LOG_ARGS[*]} workspace ${OCRD_WS_ARGS[*]} find -G $OCRD_CIS_FILEGRP); do [[ -f "$file" ]] || fail "cannot find ground truth file: $file" found_files=$((found_files + 1)) done @@ -16,8 +16,9 @@ done ocrd resmgr download ocrd-cis-ocropy-recognize fraktur.pyrnn.gz # run ocr -ocrd-cis-ocropy-binarize -l DEBUG -I $OCRD_CIS_FILEGRP -O OCR-D-CIS-IMG-BIN -ocrd-cis-ocropy-recognize -l DEBUG -I OCR-D-CIS-IMG-BIN -O OCR-D-CIS-OCR \ +ARGS=(${OCRD_LOG_ARGS[*]} ${OCRD_WS_ARGS[*]}) +ocrd-cis-ocropy-binarize ${ARGS[*]} -I $OCRD_CIS_FILEGRP -O OCR-D-CIS-IMG-BIN +ocrd-cis-ocropy-recognize ${ARGS[*]} -I OCR-D-CIS-IMG-BIN -O OCR-D-CIS-OCR \ -P textequiv_level word -P model fraktur.pyrnn.gz popd diff --git a/tests/run_postcorrection_test.bash b/tests/run_postcorrection_test.bash index d7f34ace..859c8407 100644 --- a/tests/run_postcorrection_test.bash +++ b/tests/run_postcorrection_test.bash @@ -6,7 +6,7 @@ ocrd_cis_init_ws blumenbach_anatomie_1805.ocrd.zip # test if there are 3 gt files pushd "$tmpws" found_files=0 -for file in $(ocrd workspace find -G $OCRD_CIS_FILEGRP); do +for file in $(ocrd ${OCRD_LOG_ARGS[*]} workspace ${OCRD_WS_ARGS[*]} find -G $OCRD_CIS_FILEGRP); do [[ -f "$file" ]] || fail "cannot find ground truth file: $file" found_files=$((found_files + 1)) done @@ -15,25 +15,26 @@ popd ocrd_cis_align -mkdir "$tmpdir/bin" -cat > "$tmpdir/bin/profiler.bash" < "bin/profiler.bash" < /dev/null echo '{}' EOF -chmod a+x "$tmpdir/bin/profiler.bash" -ocrd-cis-postcorrect -l DEBUG \ +chmod a+x "bin/profiler.bash" + +ARGS=(${OCRD_LOG_ARGS[*]} ${OCRD_WS_ARGS[*]}) +ocrd-cis-postcorrect ${ARGS[*]} \ -I OCR-D-CIS-ALIGN \ -O OCR-D-CIS-POSTCORRECT \ - -m $tmpws/mets.xml \ - -P profilerPath $tmpdir/bin/profiler.bash \ + -P profilerPath bin/profiler.bash \ -P profilerConfig ignored \ -P model "$(ocrd-cis-data -model)" \ -P nOCR 2 -pushd $tmpws found_files=0 -for file in $(ocrd workspace find -G OCR-D-CIS-POSTCORRECT); do +for file in $(ocrd ${OCRD_LOG_ARGS[*]} workspace ${OCRD_WS_ARGS[*]} find -G OCR-D-CIS-POSTCORRECT); do [[ -f "$file" ]] || fail "$file: not a file" found_files=$((found_files + 1)) done diff --git a/tests/run_training_test.bash b/tests/run_training_test.bash index ade1b68e..5b96dc3e 100644 --- a/tests/run_training_test.bash +++ b/tests/run_training_test.bash @@ -6,7 +6,7 @@ ocrd_cis_init_ws blumenbach_anatomie_1805.ocrd.zip # test if there are 3 gt files pushd "$tmpws" found_files=0 -for file in $(ocrd workspace find -G $OCRD_CIS_FILEGRP); do +for file in $(ocrd ${OCRD_LOG_ARGS[*]} workspace ${OCRD_WS_ARGS[*]} find -G $OCRD_CIS_FILEGRP); do [[ -f "$file" ]] || fail "cannot find ground truth file: $file" found_files=$((found_files + 1)) done @@ -15,9 +15,12 @@ popd ocrd_cis_align +stopserver +OCRD_MAX_PARALLEL_PAGES=1 + # fix ocr for some entries (otherwise the training will fail) pushd $tmpws -for f in $(ocrd workspace find -G OCR-D-CIS-ALIGN); do +for f in $(ocrd ${OCRD_LOG_ARGS[*]} workspace find -G OCR-D-CIS-ALIGN); do sed -i -e 's#e.#Säugethiere.#' $f sed -i -e 's#E#Säugethieren#' $f done diff --git a/tests/test_lib.bash b/tests/test_lib.bash index 801be01a..76111d25 100644 --- a/tests/test_lib.bash +++ b/tests/test_lib.bash @@ -1,10 +1,27 @@ #/bin/bash tmpdir=$(mktemp -d) -trap "trap 'echo exiting without removing $tmpdir' EXIT" ERR -trap "rm -rf $tmpdir" EXIT +function stopserver() { + : +} +function failexit() { + stopserver +} +function cleanexit() { + stopserver + 
rm -rf $tmpdir +} +trap "trap failexit EXIT" ERR +trap cleanexit EXIT + +OCRD_LOG_ARGS=() +if test -v OCRD_OVERRIDE_LOGLEVEL; then + OCRD_LOG_ARGS+=(-l $OCRD_OVERRIDE_LOGLEVEL) +fi +OCRD_WS_ARGS=() # -m mets.xml OCRD_CIS_FILEGRP="OCR-D-GT-SEG-LINE" + data_url="https://github.com/OCR-D/gt_structure_text/releases/download/v1.5.0/" function ocrd_cis_download_bagit() { local url="$data_url/$1" @@ -16,22 +33,32 @@ function ocrd_cis_init_ws() { ocrd_cis_download_bagit "$1" ocrd zip spill -d "$tmpdir" "$PWD/download/$1" tmpws="$tmpdir/${1%.ocrd.zip}" + if ((${OCRD_MAX_PARALLEL_PAGES:-0} > 1)); then + echo starting METS server at $tmpws + ocrd workspace -d "$tmpws" -U "$tmpws/mets.sock" server start & + OCRD_WS_ARGS+=(-U "$tmpws/mets.sock") + sleep 1 + function stopserver() { + echo stopping METS server at $tmpws + ocrd workspace -d "$tmpws" -U "$tmpws/mets.sock" server stop || true + } + fi } + function ocrd_cis_align() { # download ocr models ocrd resmgr download ocrd-cis-ocropy-recognize fraktur.pyrnn.gz ocrd resmgr download ocrd-cis-ocropy-recognize fraktur-jze.pyrnn.gz # run ocr pushd $tmpws - ocrd-cis-ocropy-binarize -l DEBUG -I $OCRD_CIS_FILEGRP -O OCR-D-CIS-IMG-BIN - ocrd-cis-ocropy-recognize -l DEBUG \ - -I OCR-D-CIS-IMG-BIN -O OCR-D-CIS-OCR-1 \ + ARGS=(${OCRD_LOG_ARGS[*]} ${OCRD_WS_ARGS[*]}) + ocrd-cis-ocropy-binarize ${ARGS[*]} -I $OCRD_CIS_FILEGRP -O OCR-D-CIS-IMG-BIN + ocrd-cis-ocropy-recognize ${ARGS[*]} -I OCR-D-CIS-IMG-BIN -O OCR-D-CIS-OCR-1 \ -P textequiv_level word -P model fraktur.pyrnn.gz - ocrd-cis-ocropy-recognize -l DEBUG \ - -I OCR-D-CIS-IMG-BIN -O OCR-D-CIS-OCR-2 \ + ocrd-cis-ocropy-recognize ${ARGS[*]} -I OCR-D-CIS-IMG-BIN -O OCR-D-CIS-OCR-2 \ -P textequiv_level word -P model fraktur-jze.pyrnn.gz - ocrd-cis-align -l DEBUG -I OCR-D-CIS-OCR-1,OCR-D-CIS-OCR-2,$OCRD_CIS_FILEGRP \ + ocrd-cis-align ${ARGS[*]} -I OCR-D-CIS-OCR-1,OCR-D-CIS-OCR-2,$OCRD_CIS_FILEGRP \ -O OCR-D-CIS-ALIGN popd } From ae7dc671ab50104c0cf3f4dec6bf28fc3c1990ed Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sun, 1 Sep 2024 11:25:35 +0200 Subject: [PATCH 94/97] make test: run serially and parallel, show times --- Makefile | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/Makefile b/Makefile index a040cf9d..d1991df0 100644 --- a/Makefile +++ b/Makefile @@ -22,7 +22,17 @@ docker-push: docker-build TEST_SCRIPTS=$(sort $(filter-out tests/run_training_test.bash, $(wildcard tests/run_*.bash))) .PHONY: $(TEST_SCRIPTS) $(TEST_SCRIPTS): - bash $@ $V + OCRD_MAX_PARALLEL_PAGES=1 /usr/bin/time -o test_serially.log -a -f "$@: %Uuser %Ssystem %Eelapsed %PCPU (%Mmax)k" bash $@ $V + OCRD_MAX_PARALLEL_PAGES=4 /usr/bin/time -o test_parallel.log -a -f "$@: %Uuser %Ssystem %Eelapsed %PCPU (%Mmax)k" bash $@ $V + +test: export OCRD_OVERRIDE_LOGLEVEL=DEBUG +test: export OCRD_MISSING_OUTPUT=ABORT +test: export OCRD_MAX_MISSING_OUTPUTS=-1 test: $(TEST_SCRIPTS) - @echo $^ + @echo =====single-threaded test results===== + @cat test_serially.log + @echo =====4-page-parallel test results===== + @cat test_parallel.log + @$(RM) test_serially.log test_parallel.log + .PHONY: install install-devel uninstall test docker-build docker-push From e540b108e0c7f14c1cfcf8579dd0722a41069ead Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Mon, 2 Sep 2024 11:48:43 +0200 Subject: [PATCH 95/97] require ocrd>=3.0.0b4 --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 83cf28bb..e8ea1cf3 100644 --- a/setup.py +++ b/setup.py @@ -38,7 +38,7 @@ 
packages=find_packages(), include_package_data=True, install_requires=[ - 'ocrd>=3.0.0b1', + 'ocrd>=3.0.0b4', 'click', 'scipy', 'numpy>=1.17.0', From 99b348915bcf0c1d3ea0028ca43ac2448a0ee922 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 26 Sep 2024 01:28:50 +0000 Subject: [PATCH 96/97] segment: adapt to numpy deprecation --- ocrd_cis/ocropy/common.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ocrd_cis/ocropy/common.py b/ocrd_cis/ocropy/common.py index c5b56ed0..bae4dac0 100644 --- a/ocrd_cis/ocropy/common.py +++ b/ocrd_cis/ocropy/common.py @@ -644,7 +644,7 @@ def compute_seplines(binary, scale, maxseps=0): sepdists.append(np.median(subdistances)) #LOG.debug("adding sublabel %d as sep %d (size %d [%s])", sublabel, numsep, sublabelsize, str(sublabelslice)) sepsizes = np.array(sepsizes) - sepslices = np.array(sepslices) + sepslices = np.array(sepslices, dtype=object) LOG.debug("detected %d separator candidates", numsep) DSAVE("seps-raw", sepmap[labels]) # now dilate+erode to link neighbouring candidates, From dee1abf5c1cfcf3b8e111f4b3f8614e0f6fea214 Mon Sep 17 00:00:00 2001 From: kba Date: Fri, 11 Oct 2024 11:12:20 +0200 Subject: [PATCH 97/97] eval/stats: Levenshtein -> rapidfuzz.distance.Levenshtein --- ocrd_cis/div/eval.py | 2 +- ocrd_cis/div/stats.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/ocrd_cis/div/eval.py b/ocrd_cis/div/eval.py index 6efe90c6..f47682ff 100644 --- a/ocrd_cis/div/eval.py +++ b/ocrd_cis/div/eval.py @@ -1,6 +1,6 @@ import os from PIL import Image -from Levenshtein import distance +from rapidfuzz.distance.Levenshtein import distance path = '/mnt/c/Users/chris/Documents/projects/OCR-D/daten/gt/lines/' diff --git a/ocrd_cis/div/stats.py b/ocrd_cis/div/stats.py index ea385d98..6f9c9816 100644 --- a/ocrd_cis/div/stats.py +++ b/ocrd_cis/div/stats.py @@ -4,7 +4,7 @@ from ocrd import Processor from ocrd_cis import get_ocrd_tool from ocrd_models.ocrd_page_generateds import parse -from Levenshtein import distance +from rapidfuzz.distance import Levenshtein class Stats(Processor): @@ -81,7 +81,7 @@ def process(self): # print(line.get_TextEquiv()[2].dataType) unicodeline = line.get_TextEquiv()[i].Unicode - d[i] += distance(gtline, unicodeline) + d[i] += Levenshtein.distance(gtline, unicodeline) # words = line.get_Word() # for word in words:
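The final swap from the python-Levenshtein package to rapidfuzz.distance.Levenshtein consolidates on a dependency that recognize.py already imports above, and for plain edit distance it is a drop-in replacement. A small usage sketch under that assumption, with sample strings borrowed from the training-test fixtures:

from rapidfuzz.distance import Levenshtein

# same semantics as the old `from Levenshtein import distance` call
gt_line = "Säugethiere"
ocr_line = "Saugethiere"
print(Levenshtein.distance(gt_line, ocr_line))             # 1 edit (ä vs a)
print(Levenshtein.normalized_distance(gt_line, ocr_line))  # distance normalized to [0, 1]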