From 2ed2c4f89ab4611d24e0a9328479124f88750ca1 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Tue, 13 Aug 2024 10:41:03 +0200 Subject: [PATCH 01/97] add executable property --- ocrd_cis/ocropy/binarize.py | 10 ++++++---- ocrd_cis/ocropy/clip.py | 8 +++++--- ocrd_cis/ocropy/denoise.py | 8 +++++--- ocrd_cis/ocropy/deskew.py | 6 +++++- ocrd_cis/ocropy/dewarp.py | 10 ++++++---- ocrd_cis/ocropy/recognize.py | 10 ++++++---- ocrd_cis/ocropy/resegment.py | 8 +++++--- ocrd_cis/ocropy/segment.py | 8 +++++--- ocrd_cis/ocropy/train.py | 6 +++++- 9 files changed, 48 insertions(+), 26 deletions(-) diff --git a/ocrd_cis/ocropy/binarize.py b/ocrd_cis/ocropy/binarize.py index 872185c3..7429d14a 100644 --- a/ocrd_cis/ocropy/binarize.py +++ b/ocrd_cis/ocropy/binarize.py @@ -28,8 +28,6 @@ #sys.path.append(os.path.dirname(os.path.abspath(__file__))) -TOOL = 'ocrd-cis-ocropy-binarize' - def binarize(pil_image, method='ocropy', maxskew=2, threshold=0.5, nrm=False, zoom=1.0): LOG = getLogger('processor.OcropyBinarize') LOG.debug('binarizing %dx%d image with method=%s', pil_image.width, pil_image.height, method) @@ -71,13 +69,17 @@ class OcropyBinarize(Processor): def __init__(self, *args, **kwargs): self.ocrd_tool = get_ocrd_tool() - kwargs['ocrd_tool'] = self.ocrd_tool['tools'][TOOL] + kwargs['ocrd_tool'] = self.ocrd_tool['tools'][self.executable] kwargs['version'] = self.ocrd_tool['version'] super(OcropyBinarize, self).__init__(*args, **kwargs) if hasattr(self, 'output_file_grp'): # processing context self.setup() - + + @property + def executable(self): + return 'ocrd-cis-ocropy-binarize' + def setup(self): self.logger = getLogger('processor.OcropyBinarize') if self.parameter['grayscale'] and self.parameter['method'] != 'ocropy': diff --git a/ocrd_cis/ocropy/clip.py b/ocrd_cis/ocropy/clip.py index a305f09e..919b26b0 100644 --- a/ocrd_cis/ocropy/clip.py +++ b/ocrd_cis/ocropy/clip.py @@ -31,16 +31,18 @@ pil2array, array2pil ) -TOOL = 'ocrd-cis-ocropy-clip' - class OcropyClip(Processor): def __init__(self, *args, **kwargs): self.ocrd_tool = get_ocrd_tool() - kwargs['ocrd_tool'] = self.ocrd_tool['tools'][TOOL] + kwargs['ocrd_tool'] = self.ocrd_tool['tools'][self.executable] kwargs['version'] = self.ocrd_tool['version'] super(OcropyClip, self).__init__(*args, **kwargs) + @property + def executable(self): + return 'ocrd-cis-ocropy-clip' + def process(self): """Clip text regions / lines of the workspace at intersections with neighbours. diff --git a/ocrd_cis/ocropy/denoise.py b/ocrd_cis/ocropy/denoise.py index cbbdf8cf..ac3c4dc5 100644 --- a/ocrd_cis/ocropy/denoise.py +++ b/ocrd_cis/ocropy/denoise.py @@ -19,16 +19,18 @@ # binarize, remove_noise) -TOOL = 'ocrd-cis-ocropy-denoise' - class OcropyDenoise(Processor): def __init__(self, *args, **kwargs): self.ocrd_tool = get_ocrd_tool() - kwargs['ocrd_tool'] = self.ocrd_tool['tools'][TOOL] + kwargs['ocrd_tool'] = self.ocrd_tool['tools'][self.executable] kwargs['version'] = self.ocrd_tool['version'] super(OcropyDenoise, self).__init__(*args, **kwargs) + @property + def executable(self): + return 'ocrd-cis-ocropy-denoise' + def process(self): """Despeckle the pages / regions / lines of the workspace. 
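This refactoring repeats across every file touched by this patch: the module-level TOOL constant is dropped and the tool name is exposed as an ``executable`` property, which the constructor then uses to look up its own ocrd-tool description. A minimal sketch of the resulting class shape — illustrative only, not part of the diff; the class name and tool name below are hypothetical:

    from ocrd import Processor
    from ocrd_cis import get_ocrd_tool

    class OcropyExample(Processor):

        def __init__(self, *args, **kwargs):
            self.ocrd_tool = get_ocrd_tool()
            # the property below replaces the former module-level TOOL constant
            kwargs['ocrd_tool'] = self.ocrd_tool['tools'][self.executable]
            kwargs['version'] = self.ocrd_tool['version']
            super().__init__(*args, **kwargs)

        @property
        def executable(self):
            return 'ocrd-cis-ocropy-example'
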
diff --git a/ocrd_cis/ocropy/deskew.py b/ocrd_cis/ocropy/deskew.py index 4ed04218..fe61fce3 100644 --- a/ocrd_cis/ocropy/deskew.py +++ b/ocrd_cis/ocropy/deskew.py @@ -34,10 +34,14 @@ class OcropyDeskew(Processor): def __init__(self, *args, **kwargs): ocrd_tool = get_ocrd_tool() - kwargs['ocrd_tool'] = ocrd_tool['tools'][TOOL] + kwargs['ocrd_tool'] = ocrd_tool['tools'][self.executable] kwargs['version'] = ocrd_tool['version'] super(OcropyDeskew, self).__init__(*args, **kwargs) + @property + def executable(self): + return 'ocrd-cis-ocropy-deskew' + def process(self): """Deskew the pages or regions of the workspace. diff --git a/ocrd_cis/ocropy/dewarp.py b/ocrd_cis/ocropy/dewarp.py index 7d3251bf..1bc4a805 100644 --- a/ocrd_cis/ocropy/dewarp.py +++ b/ocrd_cis/ocropy/dewarp.py @@ -24,8 +24,6 @@ #sys.path.append(os.path.dirname(os.path.abspath(__file__))) -TOOL = 'ocrd-cis-ocropy-dewarp' - class InvalidLine(Exception): """Line image does not allow dewarping and should be ignored.""" @@ -72,13 +70,17 @@ class OcropyDewarp(Processor): def __init__(self, *args, **kwargs): self.ocrd_tool = get_ocrd_tool() - kwargs['ocrd_tool'] = self.ocrd_tool['tools'][TOOL] + kwargs['ocrd_tool'] = self.ocrd_tool['tools'][self.executable] kwargs['version'] = self.ocrd_tool['version'] super(OcropyDewarp, self).__init__(*args, **kwargs) if hasattr(self, 'output_file_grp'): # processing context self.setup() - + + @property + def executable(self): + return 'ocrd-cis-ocropy-dewarp' + def setup(self): # defaults from ocrolib.lineest: self.lnorm = lineest.CenterNormalizer( diff --git a/ocrd_cis/ocropy/recognize.py b/ocrd_cis/ocropy/recognize.py index 74d858ab..5734aa92 100644 --- a/ocrd_cis/ocropy/recognize.py +++ b/ocrd_cis/ocropy/recognize.py @@ -30,8 +30,6 @@ check_line ) -TOOL = 'ocrd-cis-ocropy-recognize' - def resize_keep_ratio(image, baseheight=48): scale = baseheight / image.height wsize = round(image.width * scale) @@ -85,13 +83,17 @@ def __init__(self, *args, **kwargs): self.ocrd_tool = get_ocrd_tool() self.pad = 16 # ocropus-rpred default self.network = None # set in process - kwargs['ocrd_tool'] = self.ocrd_tool['tools'][TOOL] + kwargs['ocrd_tool'] = self.ocrd_tool['tools'][self.executable] kwargs['version'] = self.ocrd_tool['version'] super(OcropyRecognize, self).__init__(*args, **kwargs) if hasattr(self, 'output_file_grp'): # processing context self.setup() - + + @property + def executable(self): + return 'ocrd-cis-ocropy-recognize' + def setup(self): self.logger = getLogger('processor.OcropyRecognize') # from ocropus-rpred: diff --git a/ocrd_cis/ocropy/resegment.py b/ocrd_cis/ocropy/resegment.py index a337b5e0..2b1f73c3 100644 --- a/ocrd_cis/ocropy/resegment.py +++ b/ocrd_cis/ocropy/resegment.py @@ -46,16 +46,18 @@ diff_polygons ) -TOOL = 'ocrd-cis-ocropy-resegment' - class OcropyResegment(Processor): def __init__(self, *args, **kwargs): self.ocrd_tool = get_ocrd_tool() - kwargs['ocrd_tool'] = self.ocrd_tool['tools'][TOOL] + kwargs['ocrd_tool'] = self.ocrd_tool['tools'][self.executable] kwargs['version'] = self.ocrd_tool['version'] super().__init__(*args, **kwargs) + @property + def executable(self): + return 'ocrd-cis-ocropy-resegment' + def process(self): """Resegment lines of the workspace. 
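A note on one subtlety of this pattern: every constructor reads ``self.executable`` before ``super().__init__()`` has run. That is safe because a property is a class-level descriptor, so the lookup does not depend on any instance state. A small framework-free sketch (plain Python, illustrative only):

    class Base:
        def __init__(self, ocrd_tool=None):
            self.ocrd_tool = ocrd_tool

    class Tool(Base):
        def __init__(self):
            # the property resolves via the class before Base.__init__ runs
            super().__init__(ocrd_tool={'executable': self.executable})

        @property
        def executable(self):
            return 'ocrd-cis-ocropy-resegment'

    assert Tool().ocrd_tool['executable'] == 'ocrd-cis-ocropy-resegment'
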
diff --git a/ocrd_cis/ocropy/segment.py b/ocrd_cis/ocropy/segment.py index 49cb6776..1624597e 100644 --- a/ocrd_cis/ocropy/segment.py +++ b/ocrd_cis/ocropy/segment.py @@ -58,8 +58,6 @@ lines2regions ) -TOOL = 'ocrd-cis-ocropy-segment' - def masks2polygons(bg_labels, baselines, fg_bin, name, min_area=None, simplify=None, open_holes=False, reorder=True): """Convert label masks into polygon coordinates. @@ -248,10 +246,14 @@ class OcropySegment(Processor): def __init__(self, *args, **kwargs): self.ocrd_tool = get_ocrd_tool() - kwargs['ocrd_tool'] = self.ocrd_tool['tools'][TOOL] + kwargs['ocrd_tool'] = self.ocrd_tool['tools'][self.executable] kwargs['version'] = self.ocrd_tool['version'] super(OcropySegment, self).__init__(*args, **kwargs) + @property + def executable(self): + return 'ocrd-cis-ocropy-segment' + def process(self): """Segment pages into regions+lines, tables into cells+lines, or regions into lines. diff --git a/ocrd_cis/ocropy/train.py b/ocrd_cis/ocropy/train.py index d257a61f..46e9d258 100644 --- a/ocrd_cis/ocropy/train.py +++ b/ocrd_cis/ocropy/train.py @@ -32,13 +32,17 @@ class OcropyTrain(Processor): def __init__(self, *args, **kwargs): self.oldcwd = os.getcwd() ocrd_tool = get_ocrd_tool() - kwargs['ocrd_tool'] = ocrd_tool['tools']['ocrd-cis-ocropy-train'] + kwargs['ocrd_tool'] = ocrd_tool['tools'][self.executable] kwargs['version'] = ocrd_tool['version'] super(OcropyTrain, self).__init__(*args, **kwargs) if hasattr(self, 'input_file_grp'): # processing context self.setup() + @property + def executable(self): + return 'ocrd-cis-ocropy-train' + def setup(self): self.log = getLogger('processor.OcropyTrain') #print(self.parameter) From 61e6caf06ff479d4e6a8c59d85254d5a25fa79e4 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Tue, 13 Aug 2024 10:54:46 +0200 Subject: [PATCH 02/97] add setup method if missing --- ocrd_cis/ocropy/binarize.py | 10 ++++++---- ocrd_cis/ocropy/clip.py | 5 +++++ ocrd_cis/ocropy/denoise.py | 5 +++++ ocrd_cis/ocropy/deskew.py | 5 +++++ ocrd_cis/ocropy/dewarp.py | 4 +++- ocrd_cis/ocropy/recognize.py | 4 +++- ocrd_cis/ocropy/resegment.py | 5 +++++ ocrd_cis/ocropy/segment.py | 5 +++++ ocrd_cis/ocropy/train.py | 2 +- 9 files changed, 38 insertions(+), 7 deletions(-) diff --git a/ocrd_cis/ocropy/binarize.py b/ocrd_cis/ocropy/binarize.py index 7429d14a..f42ff2bd 100644 --- a/ocrd_cis/ocropy/binarize.py +++ b/ocrd_cis/ocropy/binarize.py @@ -68,6 +68,7 @@ def binarize(pil_image, method='ocropy', maxskew=2, threshold=0.5, nrm=False, zo class OcropyBinarize(Processor): def __init__(self, *args, **kwargs): + self.logger = getLogger('processor.OcropyBinarize') self.ocrd_tool = get_ocrd_tool() kwargs['ocrd_tool'] = self.ocrd_tool['tools'][self.executable] kwargs['version'] = self.ocrd_tool['version'] @@ -81,10 +82,11 @@ def executable(self): return 'ocrd-cis-ocropy-binarize' def setup(self): - self.logger = getLogger('processor.OcropyBinarize') - if self.parameter['grayscale'] and self.parameter['method'] != 'ocropy': - self.logger.critical('requested method %s does not support grayscale normalized output', - self.parameter['method']) + assert_file_grp_cardinality(self.input_file_grp, 1) + assert_file_grp_cardinality(self.output_file_grp, 1) + method = self.parameter['method'] + if self.parameter['grayscale'] and method != 'ocropy': + self.logger.critical(f'Requested method {method} does not support grayscale normalized output') raise Exception('only method=ocropy allows grayscale=true') def process(self): diff --git a/ocrd_cis/ocropy/clip.py 
b/ocrd_cis/ocropy/clip.py index 919b26b0..d11b8eae 100644 --- a/ocrd_cis/ocropy/clip.py +++ b/ocrd_cis/ocropy/clip.py @@ -34,6 +34,7 @@ class OcropyClip(Processor): def __init__(self, *args, **kwargs): + self.logger = getLogger('processor.OcropyClip') self.ocrd_tool = get_ocrd_tool() kwargs['ocrd_tool'] = self.ocrd_tool['tools'][self.executable] kwargs['version'] = self.ocrd_tool['version'] @@ -43,6 +44,10 @@ def __init__(self, *args, **kwargs): def executable(self): return 'ocrd-cis-ocropy-clip' + def setup(self): + assert_file_grp_cardinality(self.input_file_grp, 1) + assert_file_grp_cardinality(self.output_file_grp, 1) + def process(self): """Clip text regions / lines of the workspace at intersections with neighbours. diff --git a/ocrd_cis/ocropy/denoise.py b/ocrd_cis/ocropy/denoise.py index ac3c4dc5..fc1b582e 100644 --- a/ocrd_cis/ocropy/denoise.py +++ b/ocrd_cis/ocropy/denoise.py @@ -22,6 +22,7 @@ class OcropyDenoise(Processor): def __init__(self, *args, **kwargs): + self.logger = getLogger('processor.OcropyDenoise') self.ocrd_tool = get_ocrd_tool() kwargs['ocrd_tool'] = self.ocrd_tool['tools'][self.executable] kwargs['version'] = self.ocrd_tool['version'] @@ -31,6 +32,10 @@ def __init__(self, *args, **kwargs): def executable(self): return 'ocrd-cis-ocropy-denoise' + def setup(self): + assert_file_grp_cardinality(self.input_file_grp, 1) + assert_file_grp_cardinality(self.output_file_grp, 1) + def process(self): """Despeckle the pages / regions / lines of the workspace. diff --git a/ocrd_cis/ocropy/deskew.py b/ocrd_cis/ocropy/deskew.py index fe61fce3..1ffaec62 100644 --- a/ocrd_cis/ocropy/deskew.py +++ b/ocrd_cis/ocropy/deskew.py @@ -33,6 +33,7 @@ def deskew(pil_image, maxskew=2): class OcropyDeskew(Processor): def __init__(self, *args, **kwargs): + self.logger = getLogger('processor.OcropyDeskew') ocrd_tool = get_ocrd_tool() kwargs['ocrd_tool'] = ocrd_tool['tools'][self.executable] kwargs['version'] = ocrd_tool['version'] @@ -42,6 +43,10 @@ def __init__(self, *args, **kwargs): def executable(self): return 'ocrd-cis-ocropy-deskew' + def setup(self): + assert_file_grp_cardinality(self.input_file_grp, 1) + assert_file_grp_cardinality(self.output_file_grp, 1) + def process(self): """Deskew the pages or regions of the workspace. diff --git a/ocrd_cis/ocropy/dewarp.py b/ocrd_cis/ocropy/dewarp.py index 1bc4a805..89a62e11 100644 --- a/ocrd_cis/ocropy/dewarp.py +++ b/ocrd_cis/ocropy/dewarp.py @@ -69,6 +69,7 @@ def padvert(image, range_): class OcropyDewarp(Processor): def __init__(self, *args, **kwargs): + self.logger = getLogger('processor.OcropyDewarp') self.ocrd_tool = get_ocrd_tool() kwargs['ocrd_tool'] = self.ocrd_tool['tools'][self.executable] kwargs['version'] = self.ocrd_tool['version'] @@ -82,6 +83,8 @@ def executable(self): return 'ocrd-cis-ocropy-dewarp' def setup(self): + assert_file_grp_cardinality(self.input_file_grp, 1) + assert_file_grp_cardinality(self.output_file_grp, 1) # defaults from ocrolib.lineest: self.lnorm = lineest.CenterNormalizer( params=(self.parameter['range'], @@ -91,7 +94,6 @@ def setup(self): # dependency between smoothness # and extra params) 0.3)) - self.logger = getLogger('processor.OcropyDewarp') def process(self): """Dewarp the lines of the workspace. 
diff --git a/ocrd_cis/ocropy/recognize.py b/ocrd_cis/ocropy/recognize.py index 5734aa92..fdeaed27 100644 --- a/ocrd_cis/ocropy/recognize.py +++ b/ocrd_cis/ocropy/recognize.py @@ -80,6 +80,7 @@ def recognize(image, pad, network, check=True): class OcropyRecognize(Processor): def __init__(self, *args, **kwargs): + self.logger = getLogger('processor.OcropyRecognize') self.ocrd_tool = get_ocrd_tool() self.pad = 16 # ocropus-rpred default self.network = None # set in process @@ -95,7 +96,8 @@ def executable(self): return 'ocrd-cis-ocropy-recognize' def setup(self): - self.logger = getLogger('processor.OcropyRecognize') + assert_file_grp_cardinality(self.input_file_grp, 1) + assert_file_grp_cardinality(self.output_file_grp, 1) # from ocropus-rpred: self.network = load_object(self.get_model(), verbose=1) for x in self.network.walk(): diff --git a/ocrd_cis/ocropy/resegment.py b/ocrd_cis/ocropy/resegment.py index 2b1f73c3..d9a92390 100644 --- a/ocrd_cis/ocropy/resegment.py +++ b/ocrd_cis/ocropy/resegment.py @@ -49,6 +49,7 @@ class OcropyResegment(Processor): def __init__(self, *args, **kwargs): + self.logger = getLogger('processor.OcropyResegment') self.ocrd_tool = get_ocrd_tool() kwargs['ocrd_tool'] = self.ocrd_tool['tools'][self.executable] kwargs['version'] = self.ocrd_tool['version'] @@ -58,6 +59,10 @@ def __init__(self, *args, **kwargs): def executable(self): return 'ocrd-cis-ocropy-resegment' + def setup(self): + assert_file_grp_cardinality(self.input_file_grp, 1) + assert_file_grp_cardinality(self.output_file_grp, 1) + def process(self): """Resegment lines of the workspace. diff --git a/ocrd_cis/ocropy/segment.py b/ocrd_cis/ocropy/segment.py index 1624597e..7488eefe 100644 --- a/ocrd_cis/ocropy/segment.py +++ b/ocrd_cis/ocropy/segment.py @@ -245,6 +245,7 @@ def getx(xy): class OcropySegment(Processor): def __init__(self, *args, **kwargs): + self.logger = getLogger('processor.OcropySegment') self.ocrd_tool = get_ocrd_tool() kwargs['ocrd_tool'] = self.ocrd_tool['tools'][self.executable] kwargs['version'] = self.ocrd_tool['version'] @@ -254,6 +255,10 @@ def __init__(self, *args, **kwargs): def executable(self): return 'ocrd-cis-ocropy-segment' + def setup(self): + assert_file_grp_cardinality(self.input_file_grp, 1) + assert_file_grp_cardinality(self.output_file_grp, 1) + def process(self): """Segment pages into regions+lines, tables into cells+lines, or regions into lines. 
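A plausible reason for creating the logger in ``__init__`` rather than in ``setup()``: as the constructors above show, ``setup()`` is only called when the processor is instantiated with a processing context (the ``hasattr(self, 'output_file_grp')`` / ``hasattr(self, 'input_file_grp')`` guards), whereas the logger should be available unconditionally. A reduced sketch of that control flow — illustrative only, not the actual ocrd Processor base class:

    import logging

    class ExampleProcessor:
        def __init__(self, output_file_grp=None):
            # always available, even outside a processing context
            self.logger = logging.getLogger('processor.Example')
            if output_file_grp is not None:  # stands in for the hasattr() guard
                self.output_file_grp = output_file_grp
                self.setup()

        def setup(self):
            # only reached in the processing context
            self.logger.info('setting up for %s', self.output_file_grp)
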
diff --git a/ocrd_cis/ocropy/train.py b/ocrd_cis/ocropy/train.py index 46e9d258..25317c4d 100644 --- a/ocrd_cis/ocropy/train.py +++ b/ocrd_cis/ocropy/train.py @@ -30,6 +30,7 @@ def resize_keep_ratio(image, baseheight=48): class OcropyTrain(Processor): def __init__(self, *args, **kwargs): + self.log = getLogger('processor.OcropyTrain') self.oldcwd = os.getcwd() ocrd_tool = get_ocrd_tool() kwargs['ocrd_tool'] = ocrd_tool['tools'][self.executable] @@ -44,7 +45,6 @@ def executable(self): return 'ocrd-cis-ocropy-train' def setup(self): - self.log = getLogger('processor.OcropyTrain') #print(self.parameter) if 'model' in self.parameter: model = self.parameter['model'] From a0965c2aa7d6315f001606bc1c6043a020095ef9 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Tue, 13 Aug 2024 14:02:55 +0200 Subject: [PATCH 03/97] add self.logger wherever missing --- ocrd_cis/ocropy/clip.py | 20 +++--- ocrd_cis/ocropy/denoise.py | 16 ++--- ocrd_cis/ocropy/deskew.py | 14 ++-- ocrd_cis/ocropy/resegment.py | 74 +++++++++---------- ocrd_cis/ocropy/segment.py | 136 ++++++++++++++++++----------------- 5 files changed, 129 insertions(+), 131 deletions(-) diff --git a/ocrd_cis/ocropy/clip.py b/ocrd_cis/ocropy/clip.py index d11b8eae..4c0eebea 100644 --- a/ocrd_cis/ocropy/clip.py +++ b/ocrd_cis/ocropy/clip.py @@ -83,13 +83,12 @@ def process(self): # too. However, region-level clipping _must_ be run before region-level # deskewing, because that would make segments incomensurable with their # neighbours. - LOG = getLogger('processor.OcropyClip') level = self.parameter['level-of-operation'] assert_file_grp_cardinality(self.input_file_grp, 1) assert_file_grp_cardinality(self.output_file_grp, 1) for (n, input_file) in enumerate(self.input_files): - LOG.info("INPUT FILE %i / %s", n, input_file.pageId or input_file.ID) + self.logger.info("INPUT FILE %i / %s", n, input_file.pageId or input_file.ID) file_id = make_file_id(input_file, self.output_file_grp) pcgts = page_from_file(self.workspace.download_file(input_file)) @@ -105,7 +104,7 @@ def process(self): dpi = page_image_info.resolution if page_image_info.resolutionUnit == 'cm': dpi *= 2.54 - LOG.info('Page "%s" uses %f DPI', page_id, dpi) + self.logger.info('Page "%s" uses %f DPI', page_id, dpi) zoom = 300.0/dpi else: zoom = 1 @@ -127,7 +126,7 @@ def process(self): page.get_TableRegion() + page.get_UnknownRegion()) if not num_texts: - LOG.warning('Page "%s" contains no text regions', page_id) + self.logger.warning('Page "%s" contains no text regions', page_id) background = ImageStat.Stat(page_image) # workaround for Pillow#4925 if len(background.bands) > 1: @@ -158,7 +157,7 @@ def process(self): if level == 'region': if region.get_AlternativeImage(): # FIXME: This should probably be an exception (bad workflow configuration). 
- LOG.warning('Page "%s" region "%s" already contains image data: skipping', + self.logger.warning('Page "%s" region "%s" already contains image data: skipping', page_id, region.id) continue shape = prep(shapes[i]) @@ -176,7 +175,7 @@ def process(self): # level == 'line': lines = region.get_TextLine() if not lines: - LOG.warning('Page "%s" region "%s" contains no text lines', page_id, region.id) + self.logger.warning('Page "%s" region "%s" contains no text lines', page_id, region.id) continue region_image, region_coords = self.workspace.image_from_segment( region, page_image, page_coords, feature_selector='binarized') @@ -194,7 +193,7 @@ def process(self): for j, line in enumerate(lines): if line.get_AlternativeImage(): # FIXME: This should probably be an exception (bad workflow configuration). - LOG.warning('Page "%s" region "%s" line "%s" already contains image data: skipping', + self.logger.warning('Page "%s" region "%s" line "%s" already contains image data: skipping', page_id, region.id, line.id) continue shape = prep(shapes[j]) @@ -219,13 +218,12 @@ def process(self): local_filename=file_path, mimetype=MIMETYPE_PAGE, content=to_xml(pcgts)) - LOG.info('created file ID: %s, file_grp: %s, path: %s', + self.logger.info('created file ID: %s, file_grp: %s, path: %s', file_id, self.output_file_grp, out.local_filename) def process_segment(self, segment, segment_mask, segment_polygon, neighbours, background_image, parent_image, parent_coords, parent_bin, page_id, file_id): - LOG = getLogger('processor.OcropyClip') # initialize AlternativeImage@comments classes from parent, except # for those operations that can apply on multiple hierarchy levels: features = ','.join( @@ -237,7 +235,7 @@ def process_segment(self, segment, segment_mask, segment_polygon, neighbours, segment_bbox = bbox_from_polygon(segment_polygon) for neighbour, neighbour_mask in neighbours: if not np.any(segment_mask > neighbour_mask): - LOG.info('Ignoring enclosing neighbour "%s" of segment "%s" on page "%s"', + self.logger.info('Ignoring enclosing neighbour "%s" of segment "%s" on page "%s"', neighbour.id, segment.id, page_id) continue # find connected components that (only) belong to the neighbour: @@ -247,7 +245,7 @@ def process_segment(self, segment, segment_mask, segment_polygon, neighbours, num_foreground = np.count_nonzero(segment_mask * parent_bin) if not num_intruders: continue - LOG.debug('segment "%s" vs neighbour "%s": suppressing %d of %d pixels on page "%s"', + self.logger.debug('segment "%s" vs neighbour "%s": suppressing %d of %d pixels on page "%s"', segment.id, neighbour.id, num_intruders, num_foreground, page_id) # suppress in segment_mask so these intruders can stay in the neighbours # (are not removed from both sides) diff --git a/ocrd_cis/ocropy/denoise.py b/ocrd_cis/ocropy/denoise.py index fc1b582e..d6a4f7ff 100644 --- a/ocrd_cis/ocropy/denoise.py +++ b/ocrd_cis/ocropy/denoise.py @@ -57,13 +57,12 @@ def process(self): Produce a new output file by serialising the resulting hierarchy. 
""" - LOG = getLogger('processor.OcropyDenoise') level = self.parameter['level-of-operation'] assert_file_grp_cardinality(self.input_file_grp, 1) assert_file_grp_cardinality(self.output_file_grp, 1) for (n, input_file) in enumerate(self.input_files): - LOG.info("INPUT FILE %i / %s", n, input_file.pageId or input_file.ID) + self.logger.info("INPUT FILE %i / %s", n, input_file.pageId or input_file.ID) file_id = make_file_id(input_file, self.output_file_grp) pcgts = page_from_file(self.workspace.download_file(input_file)) @@ -80,7 +79,7 @@ def process(self): dpi = page_image_info.resolution if page_image_info.resolutionUnit == 'cm': dpi *= 2.54 - LOG.info('Page "%s" uses %f DPI', page_id, dpi) + self.logger.info('Page "%s" uses %f DPI', page_id, dpi) zoom = 300.0/dpi else: zoom = 1 @@ -91,7 +90,7 @@ def process(self): else: regions = page.get_AllRegions(classes=['Text'], order='reading-order') if not regions: - LOG.warning('Page "%s" contains no text regions', page_id) + self.logger.warning('Page "%s" contains no text regions', page_id) for region in regions: region_image, region_xywh = self.workspace.image_from_segment( region, page_image, page_xywh, @@ -102,7 +101,7 @@ def process(self): continue lines = region.get_TextLine() if not lines: - LOG.warning('Page "%s" region "%s" contains no text lines', page_id, region.id) + self.logger.warning('Page "%s" region "%s" contains no text lines', page_id, region.id) for line in lines: line_image, line_xywh = self.workspace.image_from_segment( line, region_image, region_xywh, @@ -121,15 +120,14 @@ def process(self): local_filename=file_path, mimetype=MIMETYPE_PAGE, content=to_xml(pcgts)) - LOG.info('created file ID: %s, file_grp: %s, path: %s', + self.logger.info('created file ID: %s, file_grp: %s, path: %s', file_id, self.output_file_grp, out.local_filename) def process_segment(self, segment, segment_image, segment_xywh, zoom, page_id, file_id): - LOG = getLogger('processor.OcropyDenoise') if not segment_image.width or not segment_image.height: - LOG.warning("Skipping '%s' with zero size", file_id) + self.logger.warning("Skipping '%s' with zero size", file_id) return - LOG.info("About to despeckle '%s'", file_id) + self.logger.info("About to despeckle '%s'", file_id) bin_image = remove_noise(segment_image, maxsize=self.parameter['noise_maxsize']/zoom*300/72) # in pt # update METS (add the image file): diff --git a/ocrd_cis/ocropy/deskew.py b/ocrd_cis/ocropy/deskew.py index 1ffaec62..63bb6b97 100644 --- a/ocrd_cis/ocropy/deskew.py +++ b/ocrd_cis/ocropy/deskew.py @@ -65,13 +65,12 @@ def process(self): Produce a new output file by serialising the resulting hierarchy. 
""" - LOG = getLogger('processor.OcropyDeskew') level = self.parameter['level-of-operation'] assert_file_grp_cardinality(self.input_file_grp, 1) assert_file_grp_cardinality(self.output_file_grp, 1) for (n, input_file) in enumerate(self.input_files): - LOG.info("INPUT FILE %i / %s", n, input_file.pageId or input_file.ID) + self.logger.info("INPUT FILE %i / %s", n, input_file.pageId or input_file.ID) file_id = make_file_id(input_file, self.output_file_grp) pcgts = page_from_file(self.workspace.download_file(input_file)) @@ -95,7 +94,7 @@ def process(self): else: # region regions = page.get_AllRegions(classes=['Text'], order='reading-order') if not regions: - LOG.warning('Page "%s" contains no text regions', page_id) + self.logger.warning('Page "%s" contains no text regions', page_id) for region in regions: # process region: region_image, region_coords = self.workspace.image_from_segment( @@ -118,23 +117,22 @@ def process(self): local_filename=file_path, mimetype=MIMETYPE_PAGE, content=to_xml(pcgts)) - LOG.info('created file ID: %s, file_grp: %s, path: %s', + self.logger.info('created file ID: %s, file_grp: %s, path: %s', file_id, self.output_file_grp, out.local_filename) def _process_segment(self, segment, segment_image, segment_coords, segment_id, page_id, file_id): - LOG = getLogger('processor.OcropyDeskew') if not segment_image.width or not segment_image.height: - LOG.warning("Skipping %s with zero size", segment_id) + self.logger.warning("Skipping %s with zero size", segment_id) return angle0 = segment_coords['angle'] # deskewing (w.r.t. top image) already applied to segment_image - LOG.info("About to deskew %s", segment_id) + self.logger.info("About to deskew %s", segment_id) angle = deskew(segment_image, maxskew=self.parameter['maxskew']) # additional angle to be applied # segment angle: PAGE orientation is defined clockwise, # whereas PIL/ndimage rotation is in mathematical direction: orientation = -(angle + angle0) orientation = 180 - (180 - orientation) % 360 # map to [-179.999,180] segment.set_orientation(orientation) # also removes all deskewed AlternativeImages - LOG.info("Found angle for %s: %.1f", segment_id, angle) + self.logger.info("Found angle for %s: %.1f", segment_id, angle) # delegate reflection, rotation and re-cropping to core: if isinstance(segment, PageType): segment_image, segment_coords, _ = self.workspace.image_from_page( diff --git a/ocrd_cis/ocropy/resegment.py b/ocrd_cis/ocropy/resegment.py index d9a92390..2261cf3e 100644 --- a/ocrd_cis/ocropy/resegment.py +++ b/ocrd_cis/ocropy/resegment.py @@ -105,7 +105,6 @@ def process(self): Produce a new output file by serialising the resulting hierarchy. """ - LOG = getLogger('processor.OcropyResegment') # This makes best sense for bad/coarse line segmentation, like current GT # or as postprocessing for bbox-only steps like Tesseract. 
# Most notably, it can convert rectangles to polygons (polygonalization), @@ -120,7 +119,7 @@ def process(self): assert_file_grp_cardinality(self.output_file_grp, 1) for n, input_file in enumerate(self.input_files): - LOG.info("INPUT FILE %i / %s", n, input_file.pageId or input_file.ID) + self.logger.info("INPUT FILE %i / %s", n, input_file.pageId or input_file.ID) file_id = make_file_id(input_file, self.output_file_grp) pcgts = page_from_file(self.workspace.download_file(input_file)) @@ -136,7 +135,7 @@ def process(self): dpi = page_image_info.resolution if page_image_info.resolutionUnit == 'cm': dpi *= 2.54 - LOG.info('Page "%s" uses %f DPI', page_id, dpi) + self.logger.info('Page "%s" uses %f DPI', page_id, dpi) zoom = 300.0/dpi else: zoom = 1 @@ -156,14 +155,14 @@ def process(self): page.get_CustomRegion()) regions = page.get_AllRegions(classes=['Text']) if not regions: - LOG.warning('Page "%s" contains no text regions', page_id) + self.logger.warning('Page "%s" contains no text regions', page_id) elif level == 'page': lines = [line for region in regions for line in region.get_TextLine()] if lines: self._process_segment(page, page_image, page_coords, page_id, zoom, lines, ignore) else: - LOG.warning('Page "%s" contains no text regions with lines', page_id) + self.logger.warning('Page "%s" contains no text regions with lines', page_id) else: for region in regions: lines = region.get_TextLine() @@ -172,7 +171,7 @@ def process(self): region, page_image, page_coords, feature_selector='binarized') self._process_segment(region, region_image, region_coords, page_id, zoom, lines, ignore) else: - LOG.warning('Page "%s" region "%s" contains no text lines', page_id, region.id) + self.logger.warning('Page "%s" region "%s" contains no text lines', page_id, region.id) # update METS (add the PAGE file): file_path = os.path.join(self.output_file_grp, file_id + '.xml') @@ -184,11 +183,10 @@ def process(self): local_filename=file_path, mimetype=MIMETYPE_PAGE, content=to_xml(pcgts)) - LOG.info('created file ID: %s, file_grp: %s, path: %s', + self.logger.info('created file ID: %s, file_grp: %s, path: %s', file_id, self.output_file_grp, out.local_filename) def _process_segment(self, parent, parent_image, parent_coords, page_id, zoom, lines, ignore): - LOG = getLogger('processor.OcropyResegment') threshold = self.parameter['min_fraction'] method = self.parameter['method'] maxdist = self.parameter['spread']/zoom*300/72 # in pt @@ -206,7 +204,7 @@ def _process_segment(self, parent, parent_image, parent_coords, page_id, zoom, l fullpage = False report = check_region(parent_bin, zoom) if report: - LOG.warning('Invalid %s "%s": %s', tag, + self.logger.warning('Invalid %s "%s": %s', tag, page_id if fullpage else parent.id, report) return # get existing line labels: @@ -234,7 +232,7 @@ def _process_segment(self, parent, parent_image, parent_coords, page_id, zoom, l line_labels[i, line_y, line_x] = True # only text region(s) may contain new text lines for i, region in enumerate(set(line.parent_object_ for line in lines)): - LOG.debug('unmasking area of text region "%s" for "%s"', + self.logger.debug('unmasking area of text region "%s" for "%s"', region.id, page_id if fullpage else parent.id) region_polygon = coordinates_of_segment(region, parent_image, parent_coords) region_polygon = make_valid(Polygon(region_polygon)) @@ -244,14 +242,14 @@ def _process_segment(self, parent, parent_image, parent_coords, page_id, zoom, l parent_bin.shape)] = False # mask/ignore overlapping neighbours for i, segment in 
enumerate(ignore): - LOG.debug('masking area of %s "%s" for "%s"', type(segment).__name__[:-4], + self.logger.debug('masking area of %s "%s" for "%s"', type(segment).__name__[:-4], segment.id, page_id if fullpage else parent.id) segment_polygon = coordinates_of_segment(segment, parent_image, parent_coords) ignore_bin[draw.polygon(segment_polygon[:, 1], segment_polygon[:, 0], parent_bin.shape)] = True if method != 'lineest': - LOG.debug('calculating connected component and distance transforms for "%s"', parent.id) + self.logger.debug('calculating connected component and distance transforms for "%s"', parent.id) bin = parent_bin & ~ ignore_bin components, _ = morph.label(bin) # estimate glyph scale (roughly) @@ -260,7 +258,7 @@ def _process_segment(self, parent, parent_image, parent_coords, page_id, zoom, l counts = np.sqrt(3 * counts) scale = int(np.median(counts[(5/zoom < counts) & (counts < 100/zoom)])) components *= (counts > 15/zoom)[components] - LOG.debug("estimated scale: %d", scale) + self.logger.debug("estimated scale: %d", scale) else: scale = 43 if method == 'ccomps': @@ -278,7 +276,7 @@ def _process_segment(self, parent, parent_image, parent_coords, page_id, zoom, l new_labels = np.zeros_like(parent_bin, np.uint8) for i, line in enumerate(lines): if line.Baseline is None: - LOG.warning("Skipping '%s' without baseline", line.id) + self.logger.warning("Skipping '%s' without baseline", line.id) new_labels[line_labels[i]] = i + 1 continue line_baseline = baseline_of_segment(line, parent_coords) @@ -289,22 +287,23 @@ def _process_segment(self, parent, parent_image, parent_coords, page_id, zoom, l parent_bin.shape) new_labels[line_y, line_x] = i + 1 spread_dist(lines, line_labels, new_labels, parent_bin, components, parent_coords, - maxdist=maxdist or scale/2, loc=parent.id, threshold=threshold) + maxdist=maxdist or scale/2, loc=parent.id, threshold=threshold, logger=self.logger) return try: + # TODO: 'scale' passed as a param may not be always defined (mehmedGIT) new_line_labels, new_baselines, _, _, _, scale = compute_segmentation( parent_bin, seps=ignore_bin, zoom=zoom, spread_dist=maxdist or scale/2, fullpage=fullpage, maxseps=0, maxcolseps=len(ignore), maximages=0) except Exception as err: - LOG.error('Cannot line-segment %s "%s": %s', + self.logger.error('Cannot line-segment %s "%s": %s', tag, page_id if fullpage else parent.id, err) return - LOG.info("Found %d new line labels for %d existing lines on %s '%s'", + self.logger.info("Found %d new line labels for %d existing lines on %s '%s'", new_line_labels.max(), len(lines), tag, parent.id) # polygonalize and prepare comparison new_line_polygons, new_line_labels = masks2polygons( new_line_labels, new_baselines, parent_bin, '%s "%s"' % (tag, parent.id), - min_area=640/zoom/zoom) + min_area=640/zoom/zoom, logger=self.logger) DSAVE('line_labels', [np.argmax(np.insert(line_labels, 0, 0, axis=0), axis=0), parent_bin]) DSAVE('new_line_labels', [new_line_labels, parent_bin]) new_line_polygons, new_baselines = list(zip(*[(Polygon(poly), LineString(base)) @@ -387,41 +386,41 @@ def _process_segment(self, parent, parent_image, parent_coords, page_id, zoom, l for j, line in enumerate(lines): new_lines = np.nonzero(assignments == j)[0] if not np.prod(new_lines.shape): - LOG.debug("no lines for '%s' match or fit", line.id) + self.logger.debug("no lines for '%s' match or fit", line.id) continue covers = np.sum(covers_bg[new_lines,j]) if covers < threshold / 3: - LOG.debug("new lines for '%s' only cover %.1f%% bg", + self.logger.debug("new lines 
for '%s' only cover %.1f%% bg", line.id, covers * 100) continue covers = np.sum(covers_fg[new_lines,j]) if covers < threshold: - LOG.debug("new lines for '%s' only cover %.1f%% fg", + self.logger.debug("new lines for '%s' only cover %.1f%% fg", line.id, covers * 100) continue looses = (assignments < 0) & (covers_bg[:,j] > 0.1) if looses.any(): covers = np.sum(covers_bg[np.nonzero(looses)[0],j]) - LOG.debug("new lines for '%s' would loose %d non-matching segments totalling %.1f%% bg", + self.logger.debug("new lines for '%s' would loose %d non-matching segments totalling %.1f%% bg", line.id, np.count_nonzero(looses), covers * 100) continue line_count = np.count_nonzero(line_labels[j] & parent_bin) new_count = covers * line_count - LOG.debug('Black pixels before/after resegment of line "%s": %d/%d', + self.logger.debug('Black pixels before/after resegment of line "%s": %d/%d', line.id, line_count, new_count) # combine all assigned new lines to single outline polygon if len(new_lines) > 1: - LOG.debug("joining %d new line polygons for '%s'", len(new_lines), line.id) + self.logger.debug("joining %d new line polygons for '%s'", len(new_lines), line.id) new_polygon = join_polygons([new_line_polygons[i] #intersections[(i, j)] for i in new_lines], loc=line.id, scale=scale) new_baseline = join_baselines([new_polygon.intersection(new_baselines[i]) - for i in new_lines], loc=line.id) + for i in new_lines], loc=line.id, logger=self.logger) # convert back to absolute (page) coordinates: line_polygon = coordinates_for_segment(new_polygon.exterior.coords[:-1], parent_image, parent_coords) line_polygon = polygon_for_parent(line_polygon, line.parent_object_) if line_polygon is None: - LOG.warning("Ignoring extant new polygon for line '%s'", line.id) + self.logger.warning("Ignoring extant new polygon for line '%s'", line.id) return # annotate result: line.get_Coords().set_points(points_from_polygon(line_polygon)) @@ -436,7 +435,7 @@ def _process_segment(self, parent, parent_image, parent_coords, page_id, zoom, l if j == otherj: continue otherline = lines[otherj] - LOG.debug("subtracting new '%s' from overlapping '%s'", line.id, otherline.id) + self.logger.debug("subtracting new '%s' from overlapping '%s'", line.id, otherline.id) other_polygon = diff_polygons(line_polygons[otherj].context, new_polygon) if other_polygon.is_empty: continue @@ -445,14 +444,15 @@ def _process_segment(self, parent, parent_image, parent_coords, page_id, zoom, l parent_image, parent_coords) other_polygon = polygon_for_parent(other_polygon, otherline.parent_object_) if other_polygon is None: - LOG.warning("Ignoring extant new polygon for line '%s'", otherline.id) + self.logger.warning("Ignoring extant new polygon for line '%s'", otherline.id) continue otherline.get_Coords().set_points(points_from_polygon(other_polygon)) def spread_dist(lines, old_labels, new_labels, binarized, components, coords, - maxdist=43, loc='', threshold=0.9): + maxdist=43, loc='', threshold=0.9, logger = None): """redefine line coordinates by contourizing spread of connected components propagated from new labels""" - LOG = getLogger('processor.OcropyResegment') + if not logger: + raise ValueError(f"Logger has not been passed by the caller") DSAVE('seeds', [new_labels, (components>0)]) # allocate to connected components consistently # (ignoring smallest components like punctuation) @@ -477,29 +477,29 @@ def spread_dist(lines, old_labels, new_labels, binarized, components, coords, continue count = np.count_nonzero(old_label) if not count: - 
LOG.warning("skipping zero-area line '%s'", line.id) + logger.warning("skipping zero-area line '%s'", line.id) continue covers = np.count_nonzero(new_label) / count if covers < threshold / 3: - LOG.debug("new line for '%s' only covers %.1f%% bg", + logger.debug("new line for '%s' only covers %.1f%% bg", line.id, covers * 100) continue count = np.count_nonzero(old_label * binarized) if not count: - LOG.warning("skipping binary-empty line '%s'", line.id) + logger.warning("skipping binary-empty line '%s'", line.id) continue covers = np.count_nonzero(new_label * binarized) / count if covers < threshold: - LOG.debug("new line for '%s' only covers %.1f%% fg", + logger.debug("new line for '%s' only covers %.1f%% fg", line.id, covers * 100) continue - LOG.debug('Black pixels before/after resegment of line "%s": %d/%d', + logger.debug('Black pixels before/after resegment of line "%s": %d/%d', line.id, count, covers * count) contours = [contour[:,::-1] # get x,y order again for contour, area in morph.find_contours(new_label)] #LOG.debug("joining %d subsegments for %s", len(contours), line.id) if len(contours) == 0: - LOG.warning("no contours for %s - keeping", line.id) + logger.warning("no contours for %s - keeping", line.id) continue else: # get alpha shape @@ -511,7 +511,7 @@ def spread_dist(lines, old_labels, new_labels, binarized, components, coords, polygon = coordinates_for_segment(poly, None, coords) polygon = polygon_for_parent(polygon, line.parent_object_) if polygon is None: - LOG.warning("Ignoring extant line for %s", line.id) + logger.warning("Ignoring extant line for %s", line.id) continue line.get_Coords().set_points(points_from_polygon(polygon)) diff --git a/ocrd_cis/ocropy/segment.py b/ocrd_cis/ocropy/segment.py index 7488eefe..35f309b6 100644 --- a/ocrd_cis/ocropy/segment.py +++ b/ocrd_cis/ocropy/segment.py @@ -58,7 +58,7 @@ lines2regions ) -def masks2polygons(bg_labels, baselines, fg_bin, name, min_area=None, simplify=None, open_holes=False, reorder=True): +def masks2polygons(bg_labels, baselines, fg_bin, name, min_area=None, simplify=None, open_holes=False, reorder=True, logger=None): """Convert label masks into polygon coordinates. Given a Numpy array of background labels ``bg_labels``, @@ -75,7 +75,8 @@ def masks2polygons(bg_labels, baselines, fg_bin, name, min_area=None, simplify=N - these polygons as a list of label, polygon, baseline tuples, and - a Numpy array of new background labels for that list. 
""" - LOG = getLogger('processor.OcropySegment') + if not logger: + raise ValueError(f"Logger has not been passed by the caller") # find sharp baseline if baselines is not None: def getx(xy): @@ -92,7 +93,7 @@ def getx(xy): bg_mask = np.array(bg_labels == label, bool) if not np.count_nonzero(bg_mask * fg_bin): # ignore if missing foreground - LOG.debug('skipping label %d in %s due to empty fg', + logger.debug('skipping label %d in %s due to empty fg', label, name) continue # simplify to convex hull @@ -101,7 +102,7 @@ def getx(xy): conflicts = np.setdiff1d(hull * simplify, bg_mask * simplify) if conflicts.any(): - LOG.debug('Cannot simplify %d: convex hull would create additional intersections %s', + logger.debug('Cannot simplify %d: convex hull would create additional intersections %s', label, str(conflicts)) else: bg_mask = hull @@ -130,7 +131,7 @@ def getx(xy): if len(hole) < 3: idx_hole = hier[0, idx_hole, 0] continue - LOG.debug("label %d contour %d [%d pts] has hole %d [%d pts]", + logger.debug("label %d contour %d [%d pts] has hole %d [%d pts]", label, idx, len(contour), idx_hole, len(hole)) #plot_poly(hole, 'blue') # cut child from outside... @@ -172,7 +173,7 @@ def getx(xy): diff2 = (interpol[interpol_idx+1:interpol_idx+2] - cispoint2) // 5 cispoint1 = cispoint1 + diff1 cispoint2 = cispoint2 + diff2 - LOG.debug("stitching at interpolation pos %d hole pos %d", interpol_idx, hole_idx) + logger.debug("stitching at interpolation pos %d hole pos %d", interpol_idx, hole_idx) # now stitch together outer (up to cision), inner (re-arranged around cision), outer (rest) # (this works, because inner contours have inverse direction) contour = np.concatenate([contour[:contour_idx], cispoint1, @@ -181,7 +182,7 @@ def getx(xy): #plot_poly(contour, 'green') idx_hole = hier[0, idx_hole, 0] #plot_poly(contour, 'red') - LOG.debug("adding label %d contour %d [%d pts]", label, idx, len(contour)) + logger.debug("adding label %d contour %d [%d pts]", label, idx, len(contour)) contours.append(contour) idx = hier[0, idx, 0] else: @@ -207,7 +208,7 @@ def getx(xy): contour = contours[i] area = areas[i] if min_area and area < min_area and area / total_area < 0.1: - LOG.warning('Label %d contour %d is too small (%d/%d) in %s', + logger.warning('Label %d contour %d is too small (%d/%d) in %s', label, i, area, total_area, name) continue # simplify shape: @@ -217,22 +218,22 @@ def getx(xy): # simplify and validate: polygon = Polygon(polygon) if not polygon.is_valid: - #LOG.debug(polygon.wkt) - LOG.debug(explain_validity(polygon)) + #logger.debug(polygon.wkt) + logger.debug(explain_validity(polygon)) polygon = make_valid(polygon) if not polygon.is_valid: #LOG.debug(polygon.wkt) - LOG.warning(explain_validity(polygon)) + logger.warning(explain_validity(polygon)) poly = polygon.exterior.coords[:-1] # keep open if len(poly) < 4: - LOG.warning('Label %d contour %d for %s has less than 4 points', label, i, name) + logger.warning('Label %d contour %d for %s has less than 4 points', label, i, name) continue # get baseline segments intersecting with this line mask # and concatenate them from left to right if baselines is not None: base = join_baselines([baseline.intersection(polygon) for baseline in baselines - if baseline.intersects(polygon)], name) + if baseline.intersects(polygon)], name, logger) if base is not None: base = base.coords else: @@ -324,7 +325,6 @@ def process(self): Produce a new output file by serialising the resulting hierarchy. 
""" - LOG = getLogger('processor.OcropySegment') # FIXME: allow passing a-priori info on reading order / textline order # (and then pass on as ``bt`` and ``rl``; however, there may be a mixture # of different scripts; also, vertical writing needs internal rotation @@ -339,7 +339,7 @@ def process(self): assert_file_grp_cardinality(self.output_file_grp, 1) for (n, input_file) in enumerate(self.input_files): - LOG.info("INPUT FILE %i / %s", n, input_file.pageId or input_file.ID) + self.logger.info("INPUT FILE %i / %s", n, input_file.pageId or input_file.ID) file_id = make_file_id(input_file, self.output_file_grp) pcgts = page_from_file(self.workspace.download_file(input_file)) @@ -356,7 +356,7 @@ def process(self): dpi = page_image_info.resolution if page_image_info.resolutionUnit == 'cm': dpi *= 2.54 - LOG.info('Page "%s" uses %f DPI', page_id, dpi) + self.logger.info('Page "%s" uses %f DPI', page_id, dpi) zoom = 300.0/dpi else: zoom = 1 @@ -393,7 +393,7 @@ def process(self): if regions: # page is already region-segmented if overwrite_regions: - LOG.info('removing existing TextRegions in page "%s"', page_id) + self.logger.info('removing existing TextRegions in page "%s"', page_id) # we could remove all other region types as well, # but this is more flexible (for workflows with # specialized separator/image/table detectors): @@ -401,7 +401,7 @@ def process(self): page.set_ReadingOrder(None) ro = None else: - LOG.warning('keeping existing TextRegions in page "%s"', page_id) + self.logger.warning('keeping existing TextRegions in page "%s"', page_id) ignore.extend(regions) # create reading order if necessary if not ro or overwrite_order: @@ -425,20 +425,20 @@ def process(self): ignore.extend(page.get_TextRegion()) regions = list(page.get_TableRegion()) if not regions: - LOG.warning('Page "%s" contains no table regions', page_id) + self.logger.warning('Page "%s" contains no table regions', page_id) for region in regions: subregions = region.get_TextRegion() if subregions: # table is already cell-segmented if overwrite_regions: - LOG.info('removing existing TextRegions in table "%s"', region.id) + self.logger.info('removing existing TextRegions in table "%s"', region.id) region.set_TextRegion([]) roelem = reading_order.get(region.id) # replace by empty group with same index and ref # (which can then take the cells as subregions) - reading_order[region.id] = page_subgroup_in_reading_order(roelem) + reading_order[region.id] = page_subgroup_in_reading_order(roelem, self.logger) else: - LOG.warning('skipping table "%s" with existing TextRegions', region.id) + self.logger.warning('skipping table "%s" with existing TextRegions', region.id) continue # TODO: also allow grayscale_normalized (try/except?) 
region_image, region_coords = self.workspace.image_from_segment( @@ -449,24 +449,24 @@ def process(self): # create reading order group if necessary roelem = reading_order.get(region.id) if not roelem: - LOG.warning("Page '%s' table region '%s' is not referenced in reading order (%s)", + self.logger.warning("Page '%s' table region '%s' is not referenced in reading order (%s)", page_id, region.id, "no target to add cells to") elif overwrite_order: # replace by empty ordered group with same (index and) ref # (which can then take the cells as subregions) - roelem = page_subgroup_in_reading_order(roelem) + roelem = page_subgroup_in_reading_order(roelem, self.logger) reading_order[region.id] = roelem elif isinstance(roelem, (OrderedGroupType, OrderedGroupIndexedType)): - LOG.warning("Page '%s' table region '%s' already has an ordered group (%s)", + self.logger.warning("Page '%s' table region '%s' already has an ordered group (%s)", page_id, region.id, "cells will be appended") elif isinstance(roelem, (UnorderedGroupType, UnorderedGroupIndexedType)): - LOG.warning("Page '%s' table region '%s' already has an unordered group (%s)", + self.logger.warning("Page '%s' table region '%s' already has an unordered group (%s)", page_id, region.id, "cells will not be appended") roelem = None else: # replace regionRef(Indexed) by group with same index and ref # (which can then take the cells as subregions) - roelem = page_subgroup_in_reading_order(roelem) + roelem = page_subgroup_in_reading_order(roelem, self.logger) reading_order[region.id] = roelem # go get TextRegions with TextLines (and SeparatorRegions) self._process_element(region, subignore, region_image, region_coords, @@ -488,14 +488,14 @@ def process(self): region.add_TextRegion(subregion) regions.append(subregion) if not regions: - LOG.warning('Page "%s" contains no text regions', page_id) + self.logger.warning('Page "%s" contains no text regions', page_id) for region in regions: if region.get_TextLine(): if overwrite_lines: - LOG.info('removing existing TextLines in page "%s" region "%s"', page_id, region.id) + self.logger.info('removing existing TextLines in page "%s" region "%s"', page_id, region.id) region.set_TextLine([]) else: - LOG.warning('keeping existing TextLines in page "%s" region "%s"', page_id, region.id) + self.logger.warning('keeping existing TextLines in page "%s" region "%s"', page_id, region.id) ignore.extend(region.get_TextLine()) # TODO: also allow grayscale_normalized (try/except?) region_image, region_coords = self.workspace.image_from_segment( @@ -519,7 +519,7 @@ def process(self): local_filename=file_path, mimetype=MIMETYPE_PAGE, content=to_xml(pcgts)) - LOG.info('created file ID: %s, file_grp: %s, path: %s', + self.logger.info('created file ID: %s, file_grp: %s, path: %s', file_id, self.output_file_grp, out.local_filename) def _process_element(self, element, ignore, image, coords, element_id, file_id, page_id, zoom=1.0, rogroup=None): @@ -540,16 +540,15 @@ def _process_element(self, element, ignore, image, coords, element_id, file_id, in full page/table mode, then combine all separators among them with the newly detected separators to guide region segmentation. 
""" - LOG = getLogger('processor.OcropySegment') if not image.width or not image.height: - LOG.warning("Skipping '%s' with zero size", element_id) + self.logger.warning("Skipping '%s' with zero size", element_id) return element_array = pil2array(image) element_bin = np.array(element_array <= midrange(element_array), bool) sep_bin = np.zeros_like(element_bin, bool) ignore_labels = np.zeros_like(element_bin, int) for i, segment in enumerate(ignore): - LOG.debug('masking foreground of %s "%s" for "%s"', + self.logger.debug('masking foreground of %s "%s" for "%s"', type(segment).__name__[:-4], segment.id, element_id) # mark these segments (e.g. separator regions, tables, images) # for workflows where they have been detected already; @@ -583,7 +582,7 @@ def _process_element(self, element, ignore, image, coords, element_id, file_id, element_name = 'region' fullpage = False report = check_region(element_bin, zoom) - LOG.info('computing line segmentation for %s "%s"', element_name, element_id) + self.logger.info('computing line segmentation for %s "%s"', element_name, element_id) # TODO: we should downscale if DPI is large enough to save time try: if report: @@ -601,14 +600,14 @@ def _process_element(self, element, ignore, image, coords, element_id, file_id, csminheight=self.parameter['csminheight']) except Exception as err: if isinstance(element, TextRegionType): - LOG.error('Cannot line-segment region "%s": %s', element_id, err) + self.logger.error('Cannot line-segment region "%s": %s', element_id, err) # as a fallback, add a single text line comprising the whole region: element.add_TextLine(TextLineType(id=element_id + "_line", Coords=element.get_Coords())) else: - LOG.error('Cannot line-segment %s "%s": %s', element_name, element_id, err) + self.logger.error('Cannot line-segment %s "%s": %s', element_name, element_id, err) return - LOG.info('Found %d text lines for %s "%s"', + self.logger.info('Found %d text lines for %s "%s"', len(np.unique(line_labels)) - 1, element_name, element_id) # post-process line labels @@ -631,11 +630,11 @@ def _process_element(self, element, ignore, image, coords, element_id, file_id, gap_height=self.parameter['gap_height'], gap_width=self.parameter['gap_width'], scale=scale, zoom=zoom) - LOG.info('Found %d text regions for %s "%s"', + self.logger.info('Found %d text regions for %s "%s"', len(np.unique(region_labels)) - 1, element_name, element_id) except Exception as err: - LOG.error('Cannot region-segment %s "%s": %s', + self.logger.error('Cannot region-segment %s "%s": %s', element_name, element_id, err) region_labels = np.where(line_labels > len(ignore), 1 + len(ignore), line_labels) @@ -669,7 +668,7 @@ def _process_element(self, element, ignore, image, coords, element_id, file_id, region = ignore[region_line_labels0[0] - 1] if rogroup and region.parent_object_ is element and not isinstance(region, SeparatorRegionType): index = page_add_to_reading_order(rogroup, region.id, index) - LOG.debug('Region label %d is for ignored region "%s"', + self.logger.debug('Region label %d is for ignored region "%s"', region_label, region.id) continue # normal case: new lines inside new regions @@ -685,11 +684,13 @@ def _process_element(self, element, ignore, image, coords, element_id, file_id, regions, _ = masks2polygons(region_mask * region_label, None, element_bin, '%s "%s"' % (element_name, element_id), min_area=6000/zoom/zoom, - simplify=ignore_labels * ~(sep_bin)) + simplify=ignore_labels * ~(sep_bin), + logger=self.logger) # find contours for lines (can be non-contiguous) 
lines, _ = masks2polygons(region_line_labels, baselines, element_bin, 'region "%s"' % element_id, - min_area=640/zoom/zoom) + min_area=640/zoom/zoom, + logger=self.logger) # create new lines in new regions (allocating by intersection) line_polys = [Polygon(polygon) for _, polygon, _ in lines] for _, region_polygon, _ in regions: @@ -698,12 +699,12 @@ def _process_element(self, element, ignore, image, coords, element_id, file_id, region_polygon = coordinates_for_segment(region_polygon, image, coords) region_polygon = polygon_for_parent(region_polygon, element) if region_polygon is None: - LOG.warning('Ignoring extant region contour for region label %d', region_label) + self.logger.warning('Ignoring extant region contour for region label %d', region_label) continue # annotate result: region_no += 1 region_id = element_id + "_region%04d" % region_no - LOG.debug('Region label %d becomes ID "%s"', region_label, region_id) + self.logger.debug('Region label %d becomes ID "%s"', region_label, region_id) region = TextRegionType( id=region_id, Coords=CoordsType( points=points_from_polygon(region_polygon))) @@ -717,13 +718,13 @@ def _process_element(self, element, ignore, image, coords, element_id, file_id, line_polygon = coordinates_for_segment(line_polygon, image, coords) line_polygon = polygon_for_parent(line_polygon, region) if line_polygon is None: - LOG.warning('Ignoring extant line contour for region label %d line label %d', + self.logger.warning('Ignoring extant line contour for region label %d line label %d', region_label, line_label) continue # annotate result: line_no += 1 line_id = region_id + "_line%04d" % line_no - LOG.debug('Line label %d becomes ID "%s"', line_label, line_id) + self.logger.debug('Line label %d becomes ID "%s"', line_label, line_id) line = TextLineType(id=line_id, Coords=CoordsType(points=points_from_polygon(line_polygon))) if line_baseline: @@ -733,22 +734,22 @@ def _process_element(self, element, ignore, image, coords, element_id, file_id, # if the region has received text lines, keep it if region.get_TextLine(): element.add_TextRegion(region) - LOG.info('Added region "%s" with %d lines for %s "%s"', + self.logger.info('Added region "%s" with %d lines for %s "%s"', region_id, line_no, element_name, element_id) if rogroup: index = page_add_to_reading_order(rogroup, region.id, index) # add additional image/non-text regions from compute_segmentation # (e.g. drop-capitals or images) ... 
- LOG.info('Found %d large image regions for %s "%s"', images.max(), element_name, element_id) + self.logger.info('Found %d large image regions for %s "%s"', images.max(), element_name, element_id) # find contours around region labels (can be non-contiguous): image_polygons, _ = masks2polygons(images, None, element_bin, - '%s "%s"' % (element_name, element_id)) + '%s "%s"' % (element_name, element_id), self.logger) for image_label, polygon, _ in image_polygons: # convert back to absolute (page) coordinates: region_polygon = coordinates_for_segment(polygon, image, coords) region_polygon = polygon_for_parent(region_polygon, element) if region_polygon is None: - LOG.warning('Ignoring extant region contour for image label %d', image_label) + self.logger.warning('Ignoring extant region contour for image label %d', image_label) continue region_no += 1 # annotate result: @@ -757,17 +758,17 @@ def _process_element(self, element, ignore, image, coords, element_id, file_id, id=region_id, Coords=CoordsType( points=points_from_polygon(region_polygon)))) # split detected separator labels into separator regions: - LOG.info('Found %d separators for %s "%s"', seplines.max(), element_name, element_id) + self.logger.info('Found %d separators for %s "%s"', seplines.max(), element_name, element_id) # find contours around region labels (can be non-contiguous): sep_polygons, _ = masks2polygons(seplines, None, element_bin, '%s "%s"' % (element_name, element_id), - open_holes=True, reorder=False) + open_holes=True, reorder=False, logger=self.logger) for sep_label, polygon, _ in sep_polygons: # convert back to absolute (page) coordinates: region_polygon = coordinates_for_segment(polygon, image, coords) region_polygon = polygon_for_parent(region_polygon, element) if region_polygon is None: - LOG.warning('Ignoring extant region contour for separator %d', sep_label) + self.logger.warning('Ignoring extant region contour for separator %d', sep_label) continue # annotate result: region_no += 1 @@ -795,14 +796,14 @@ def _process_element(self, element, ignore, image, coords, element_id, file_id, # find contours around labels (can be non-contiguous): line_polygons, _ = masks2polygons(line_labels, baselines, element_bin, 'region "%s"' % element_id, - min_area=640/zoom/zoom) + min_area=640/zoom/zoom, logger=self.logger) line_no = 0 for line_label, polygon, baseline in line_polygons: # convert back to absolute (page) coordinates: line_polygon = coordinates_for_segment(polygon, image, coords) line_polygon = polygon_for_parent(line_polygon, element) if line_polygon is None: - LOG.warning('Ignoring extant line contour for line label %d', + self.logger.warning('Ignoring extant line contour for line label %d', line_label) continue # annotate result: @@ -937,8 +938,9 @@ def join_polygons(polygons, loc='', scale=20): jointp = make_valid(jointp) return jointp -def join_baselines(baselines, loc=''): - LOG = getLogger('processor.OcropyResegment') +def join_baselines(baselines, loc='', logger = None): + if not logger: + raise ValueError(f"Logger has not been passed by the caller") lines = [] for baseline in baselines: if (baseline.is_empty or @@ -955,9 +957,9 @@ def join_baselines(baselines, loc=''): elif geom.geom_type == 'MultiLineString': lines.extend(geom) else: - LOG.warning("ignoring baseline subtype %s in %s", geom.geom_type, loc) + logger.warning("ignoring baseline subtype %s in %s", geom.geom_type, loc) else: - LOG.warning("ignoring baseline type %s in %s", baseline.geom_type, loc) + logger.warning("ignoring baseline type %s in 
%s", baseline.geom_type, loc) nlines = len(lines) if nlines == 0: return None @@ -1019,7 +1021,7 @@ def join_baselines(baselines, loc=''): else: chains.append([prevl, nextl]) if len(chains) > 1: - LOG.warning("baseline merge impossible (no spanning tree) in %s", loc) + logger.warning("baseline merge impossible (no spanning tree) in %s", loc) return None assert len(chains) == 1, chains assert len(chains[0]) == nlines, chains[0] @@ -1031,7 +1033,7 @@ def join_baselines(baselines, loc=''): coords.extend(line.normalize().coords) result = LineString(coords) if result.is_empty: - LOG.warning("baseline merge is empty in %s", loc) + logger.warning("baseline merge is empty in %s", loc) return None assert result.geom_type == 'LineString', result.wkt result = set_precision(result, 1.0) @@ -1080,7 +1082,7 @@ def page_add_to_reading_order(rogroup, region_id, index=None): index += 1 return index -def page_subgroup_in_reading_order(roelem): +def page_subgroup_in_reading_order(roelem, logger = None): """Replace given RO element by an equivalent OrderedGroup. Given a ReadingOrder element ``roelem`` (of any type), @@ -1094,12 +1096,14 @@ def page_subgroup_in_reading_order(roelem): Return the new group object. """ - LOG = getLogger('processor.OcropySegment') + if not logger: + raise ValueError(f"Logger has not been passed by the caller") + if not roelem: - LOG.error('Cannot subgroup from empty ReadingOrder element') + logger.error('Cannot subgroup from empty ReadingOrder element') return roelem if not roelem.parent_object_: - LOG.error('Cannot subgroup from orphan ReadingOrder element') + logger.error('Cannot subgroup from orphan ReadingOrder element') return roelem if isinstance(roelem, (OrderedGroupType,OrderedGroupIndexedType)) and not ( roelem.get_OrderedGroupIndexed() or From dbccae58d9213d5df4e072502a7eae8484902ef6 Mon Sep 17 00:00:00 2001 From: kba Date: Tue, 13 Aug 2024 14:57:16 +0200 Subject: [PATCH 04/97] require core >= 3.0.0a1 --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 6df9445c..38f09abd 100644 --- a/setup.py +++ b/setup.py @@ -38,7 +38,7 @@ packages=find_packages(), include_package_data=True, install_requires=[ - 'ocrd>=2.47', + 'ocrd>=3.0.0a1', 'click', 'scipy', 'numpy>=1.17.0', From 8557a26dc75cf858f9e6819296389f71ab972cf3 Mon Sep 17 00:00:00 2001 From: kba Date: Tue, 13 Aug 2024 15:26:32 +0200 Subject: [PATCH 05/97] port part of binarize to core v3 --- ocrd_cis/ocropy/binarize.py | 157 ++++++++++++++++-------------------- 1 file changed, 70 insertions(+), 87 deletions(-) diff --git a/ocrd_cis/ocropy/binarize.py b/ocrd_cis/ocropy/binarize.py index f42ff2bd..c3b4cded 100644 --- a/ocrd_cis/ocropy/binarize.py +++ b/ocrd_cis/ocropy/binarize.py @@ -1,9 +1,13 @@ from __future__ import absolute_import +import logging import os.path +import PIL import cv2 import numpy as np from PIL import Image +from os.path import join +from ocrd_models import OcrdExif #import kraken.binarization @@ -15,11 +19,10 @@ ) from ocrd_modelfactory import page_from_file from ocrd_models.ocrd_page import ( - to_xml, AlternativeImageType + OcrdPage, to_xml, AlternativeImageType ) from ocrd import Processor -from .. import get_ocrd_tool from . 
import common from .common import ( pil2array, array2pil, @@ -64,18 +67,20 @@ def binarize(pil_image, method='ocropy', maxskew=2, threshold=0.5, nrm=False, zo raise Exception('unknown binarization method %s' % method) return Image.fromarray(th), 0 +def determine_zoom(dpi: float, page_image_info: OcrdExif) -> float: + if dpi > 0: + zoom = 300.0/dpi + elif page_image_info.resolution != 1: + dpi = page_image_info.resolution + if page_image_info.resolutionUnit == 'cm': + dpi *= 2.54 + zoom = 300.0/dpi + else: + zoom = 1 + return zoom class OcropyBinarize(Processor): - - def __init__(self, *args, **kwargs): - self.logger = getLogger('processor.OcropyBinarize') - self.ocrd_tool = get_ocrd_tool() - kwargs['ocrd_tool'] = self.ocrd_tool['tools'][self.executable] - kwargs['version'] = self.ocrd_tool['version'] - super(OcropyBinarize, self).__init__(*args, **kwargs) - if hasattr(self, 'output_file_grp'): - # processing context - self.setup() + logger : logging.Logger @property def executable(self): @@ -84,16 +89,16 @@ def executable(self): def setup(self): assert_file_grp_cardinality(self.input_file_grp, 1) assert_file_grp_cardinality(self.output_file_grp, 1) + self.logger = getLogger('processor.OcropyBinarize') method = self.parameter['method'] if self.parameter['grayscale'] and method != 'ocropy': self.logger.critical(f'Requested method {method} does not support grayscale normalized output') raise Exception('only method=ocropy allows grayscale=true') - def process(self): + def process_page_pcgts(self, *input_pcgts, output_file_id: str = None, page_id: str = None) -> OcrdPage: """Binarize (and optionally deskew/despeckle) the pages/regions/lines of the workspace. - Open and deserialise PAGE input files and their respective images, - then iterate over the element hierarchy down to the requested + THEN Iterate over the PAGE-XML element hierarchy down to the requested ``level-of-operation``. Next, for each file, crop each segment image according to the layout @@ -109,80 +114,61 @@ def process(self): Reference each new image in the AlternativeImage of the element. - Produce a new output file by serialising the resulting hierarchy. + Return a PAGE-XML with AlternativeImage and the arguments for ``workspace.save_image_file``. 
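        A rough sketch of the intended return shape (assuming the tuple layout
        produced by ``process_page`` below; the names are taken from that
        method and are illustrative only):

            return [pcgts,
                    (bin_image, file_id, bin_image_path)]

        i.e. the updated PcGts first, followed by one entry per derived image,
        each carrying the arguments for ``workspace.save_image_file``.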
""" level = self.parameter['level-of-operation'] - assert_file_grp_cardinality(self.input_file_grp, 1) - assert_file_grp_cardinality(self.output_file_grp, 1) + assert self.workspace + self.logger.debug(f'Level of operation: "{level}"') - for (n, input_file) in enumerate(self.input_files): - self.logger.info("INPUT FILE %i / %s", n, input_file.pageId or input_file.ID) - file_id = make_file_id(input_file, self.output_file_grp) + pcgts = input_pcgts[0] + page = pcgts.get_Page() + assert page - pcgts = page_from_file(self.workspace.download_file(input_file)) - self.add_metadata(pcgts) - page_id = pcgts.pcGtsId or input_file.pageId or input_file.ID # (PageType has no id) - page = pcgts.get_Page() - - page_image, page_xywh, page_image_info = self.workspace.image_from_page( - page, page_id, feature_filter='binarized') - if self.parameter['dpi'] > 0: - zoom = 300.0/self.parameter['dpi'] - elif page_image_info.resolution != 1: - dpi = page_image_info.resolution - if page_image_info.resolutionUnit == 'cm': - dpi *= 2.54 - self.logger.info('Page "%s" uses %f DPI', page_id, dpi) - zoom = 300.0/dpi - else: - zoom = 1 - - if level == 'page': - self.process_page(page, page_image, page_xywh, zoom, - input_file.pageId, file_id) - else: - if level == 'table': - regions = page.get_TableRegion() - else: # region - regions = page.get_AllRegions(classes=['Text'], order='reading-order') - if not regions: - self.logger.warning('Page "%s" contains no text regions', page_id) - for region in regions: - region_image, region_xywh = self.workspace.image_from_segment( - region, page_image, page_xywh, feature_filter='binarized') - if level == 'region': - self.process_region(region, region_image, region_xywh, zoom, - input_file.pageId, file_id + '_' + region.id) - continue - lines = region.get_TextLine() - if not lines: - self.logger.warning('Page "%s" region "%s" contains no text lines', - page_id, region.id) - for line in lines: - line_image, line_xywh = self.workspace.image_from_segment( - line, region_image, region_xywh, feature_filter='binarized') - self.process_line(line, line_image, line_xywh, zoom, - input_file.pageId, region.id, - file_id + '_' + region.id + '_' + line.id) + page_image, page_xywh, page_image_info = self.workspace.image_from_page(page, page_id, feature_filter='binarized') + zoom = determine_zoom(self.parameter['dpi'], page_image_info) + self.logger.info('Page "%s" uses %f DPI', page_id, self.parameter['dpi']) + + ret = [pcgts] + if level == 'page': + try: + ret.append(self.process_page(page, page_image, page_xywh, zoom, page_id, output_file_id)) + except ValueError as e: + self.logger.exception(e) + else: + # TODO + raise NotImplementedError + if level == 'table': + regions = page.get_TableRegion() + else: # region + regions = page.get_AllRegions(classes=['Text'], order='reading-order') + if not regions: + self.logger.warning('Page "%s" contains no text regions', page_id) + for region in regions: + region_image, region_xywh = self.workspace.image_from_segment( + region, page_image, page_xywh, feature_filter='binarized') + if level == 'region': + self.process_region(region, region_image, region_xywh, zoom, + input_file.pageId, file_id + '_' + region.id) + continue + lines = region.get_TextLine() + if not lines: + self.logger.warning('Page "%s" region "%s" contains no text lines', + page_id, region.id) + for line in lines: + line_image, line_xywh = self.workspace.image_from_segment( + line, region_image, region_xywh, feature_filter='binarized') + self.process_line(line, line_image, line_xywh, zoom, 
+ input_file.pageId, region.id, + file_id + '_' + region.id + '_' + line.id) - # update METS (add the PAGE file): - file_path = os.path.join(self.output_file_grp, file_id + '.xml') - pcgts.set_pcGtsId(file_id) - out = self.workspace.add_file( - ID=file_id, - file_grp=self.output_file_grp, - pageId=input_file.pageId, - local_filename=file_path, - mimetype=MIMETYPE_PAGE, - content=to_xml(pcgts)) - self.logger.info('created file ID: %s, file_grp: %s, path: %s', - file_id, self.output_file_grp, out.local_filename) + return ret - def process_page(self, page, page_image, page_xywh, zoom, page_id, file_id): + def process_page(self, page, page_image, page_xywh, zoom, page_id, file_id) -> tuple[Image.Image, str, str]: if not page_image.width or not page_image.height: - self.logger.warning("Skipping page '%s' with zero size", page_id) - return + raise ValueError("Skipping page '%s' with zero size", page_id) self.logger.info("About to binarize page '%s'", page_id) + assert self.output_file_grp + features = page_xywh['features'] if 'angle' in page_xywh and page_xywh['angle']: # orientation has already been annotated (by previous deskewing), @@ -216,13 +202,10 @@ def process_page(self, page, page_image, page_xywh, zoom, page_id, file_id): else: file_id += '.IMG-BIN' features += ',binarized' - file_path = self.workspace.save_image_file( - bin_image, file_id, self.output_file_grp, - page_id=page_id) + bin_image_path = join(self.output_file_grp, f'{file_id}.png') # update PAGE (reference the image file): - page.add_AlternativeImage(AlternativeImageType( - filename=file_path, - comments=features)) + page.add_AlternativeImage(AlternativeImageType(filename=bin_image_path, comments=features)) + return (bin_image, file_id, bin_image_path) def process_region(self, region, region_image, region_xywh, zoom, page_id, file_id): if not region_image.width or not region_image.height: From 278b706246e24ec0fc0b5030aff6d16673bad817 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Tue, 13 Aug 2024 16:08:10 +0200 Subject: [PATCH 06/97] move: determine_zoom to common.py --- ocrd_cis/ocropy/binarize.py | 18 ++---------------- ocrd_cis/ocropy/clip.py | 15 +++------------ ocrd_cis/ocropy/common.py | 14 +++++++++++++- ocrd_cis/ocropy/denoise.py | 15 ++++----------- ocrd_cis/ocropy/deskew.py | 6 +----- ocrd_cis/ocropy/dewarp.py | 18 ++++-------------- ocrd_cis/ocropy/resegment.py | 14 ++++---------- ocrd_cis/ocropy/segment.py | 13 +++---------- 8 files changed, 34 insertions(+), 79 deletions(-) diff --git a/ocrd_cis/ocropy/binarize.py b/ocrd_cis/ocropy/binarize.py index c3b4cded..b5e2bc7e 100644 --- a/ocrd_cis/ocropy/binarize.py +++ b/ocrd_cis/ocropy/binarize.py @@ -7,7 +7,6 @@ import numpy as np from PIL import Image from os.path import join -from ocrd_models import OcrdExif #import kraken.binarization @@ -25,9 +24,8 @@ from . 
import common from .common import ( - pil2array, array2pil, # binarize, - remove_noise) + array2pil, determine_zoom, pil2array, remove_noise) #sys.path.append(os.path.dirname(os.path.abspath(__file__))) @@ -67,18 +65,6 @@ def binarize(pil_image, method='ocropy', maxskew=2, threshold=0.5, nrm=False, zo raise Exception('unknown binarization method %s' % method) return Image.fromarray(th), 0 -def determine_zoom(dpi: float, page_image_info: OcrdExif) -> float: - if dpi > 0: - zoom = 300.0/dpi - elif page_image_info.resolution != 1: - dpi = page_image_info.resolution - if page_image_info.resolutionUnit == 'cm': - dpi *= 2.54 - zoom = 300.0/dpi - else: - zoom = 1 - return zoom - class OcropyBinarize(Processor): logger : logging.Logger @@ -126,7 +112,7 @@ def process_page_pcgts(self, *input_pcgts, output_file_id: str = None, page_id: page_image, page_xywh, page_image_info = self.workspace.image_from_page(page, page_id, feature_filter='binarized') zoom = determine_zoom(self.parameter['dpi'], page_image_info) - self.logger.info('Page "%s" uses %f DPI', page_id, self.parameter['dpi']) + self.logger.info(f"Page '{page_id}' uses {self.parameter['dpi']} DPI") ret = [pcgts] if level == 'page': diff --git a/ocrd_cis/ocropy/clip.py b/ocrd_cis/ocropy/clip.py index 4c0eebea..3b854897 100644 --- a/ocrd_cis/ocropy/clip.py +++ b/ocrd_cis/ocropy/clip.py @@ -28,8 +28,7 @@ from .ocrolib import midrange, morph from .common import ( # binarize, - pil2array, array2pil -) + array2pil, determine_zoom, pil2array) class OcropyClip(Processor): @@ -98,16 +97,8 @@ def process(self): page_image, page_coords, page_image_info = self.workspace.image_from_page( page, page_id, feature_selector='binarized') - if self.parameter['dpi'] > 0: - zoom = 300.0/self.parameter['dpi'] - elif page_image_info.resolution != 1: - dpi = page_image_info.resolution - if page_image_info.resolutionUnit == 'cm': - dpi *= 2.54 - self.logger.info('Page "%s" uses %f DPI', page_id, dpi) - zoom = 300.0/dpi - else: - zoom = 1 + zoom = determine_zoom(self.parameter['dpi'], page_image_info) + self.logger.info(f"Page '{page_id}' uses {self.parameter['dpi']} DPI") # FIXME: what about text regions inside table regions? regions = list(page.get_TextRegion()) diff --git a/ocrd_cis/ocropy/common.py b/ocrd_cis/ocropy/common.py index 3cb9e4c4..1804c29d 100644 --- a/ocrd_cis/ocropy/common.py +++ b/ocrd_cis/ocropy/common.py @@ -10,7 +10,7 @@ from skimage.morphology import medial_axis import networkx as nx from PIL import Image - +from ocrd_models import OcrdExif from . import ocrolib from .ocrolib import morph, psegutils, sl # for decorators (type-checks etc): @@ -2102,3 +2102,15 @@ def find_topological(): # rlabels[region_hull] = region # DSAVE('rlabels_closed', rlabels) return rlabels + +def determine_zoom(dpi: float, page_image_info: OcrdExif) -> float: + if dpi > 0: + zoom = 300.0/dpi + elif page_image_info.resolution != 1: + dpi = page_image_info.resolution + if page_image_info.resolutionUnit == 'cm': + dpi *= 2.54 + zoom = 300.0/dpi + else: + zoom = 1 + return zoom diff --git a/ocrd_cis/ocropy/denoise.py b/ocrd_cis/ocropy/denoise.py index d6a4f7ff..d8554a3e 100644 --- a/ocrd_cis/ocropy/denoise.py +++ b/ocrd_cis/ocropy/denoise.py @@ -17,7 +17,7 @@ from .. 
import get_ocrd_tool from .common import ( # binarize, - remove_noise) + determine_zoom, remove_noise) class OcropyDenoise(Processor): @@ -73,16 +73,9 @@ def process(self): page_image, page_xywh, page_image_info = self.workspace.image_from_page( page, page_id, feature_selector='binarized' if level == 'page' else '') - if self.parameter['dpi'] > 0: - zoom = 300.0/self.parameter['dpi'] - elif page_image_info.resolution != 1: - dpi = page_image_info.resolution - if page_image_info.resolutionUnit == 'cm': - dpi *= 2.54 - self.logger.info('Page "%s" uses %f DPI', page_id, dpi) - zoom = 300.0/dpi - else: - zoom = 1 + + zoom = determine_zoom(self.parameter['dpi'], page_image_info) + self.logger.info(f"Page '{page_id}' uses {self.parameter['dpi']} DPI") if level == 'page': self.process_segment(page, page_image, page_xywh, zoom, diff --git a/ocrd_cis/ocropy/deskew.py b/ocrd_cis/ocropy/deskew.py index 63bb6b97..055ab27d 100644 --- a/ocrd_cis/ocropy/deskew.py +++ b/ocrd_cis/ocropy/deskew.py @@ -17,14 +17,10 @@ from .. import get_ocrd_tool from . import common -from .common import ( - pil2array -) +from .common import pil2array #sys.path.append(os.path.dirname(os.path.abspath(__file__))) -TOOL = 'ocrd-cis-ocropy-deskew' - def deskew(pil_image, maxskew=2): array = pil2array(pil_image) _, angle = common.binarize(array, maxskew=maxskew) diff --git a/ocrd_cis/ocropy/dewarp.py b/ocrd_cis/ocropy/dewarp.py index 89a62e11..4c9a1bdb 100644 --- a/ocrd_cis/ocropy/dewarp.py +++ b/ocrd_cis/ocropy/dewarp.py @@ -17,10 +17,7 @@ from .. import get_ocrd_tool from .ocrolib import lineest -from .common import ( - pil2array, array2pil, - check_line, -) +from .common import array2pil, check_line, determine_zoom, pil2array #sys.path.append(os.path.dirname(os.path.abspath(__file__))) @@ -128,16 +125,9 @@ def process(self): page_image, page_xywh, page_image_info = self.workspace.image_from_page( page, page_id) - if self.parameter['dpi'] > 0: - zoom = 300.0/self.parameter['dpi'] - elif page_image_info.resolution != 1: - dpi = page_image_info.resolution - if page_image_info.resolutionUnit == 'cm': - dpi *= 2.54 - self.logger.info('Page "%s" uses %f DPI', page_id, dpi) - zoom = 300.0/dpi - else: - zoom = 1 + + zoom = determine_zoom(self.parameter['dpi'], page_image_info) + self.logger.info(f"Page '{page_id}' uses {self.parameter['dpi']} DPI") regions = page.get_AllRegions(classes=['Text'], order='reading-order') if not regions: diff --git a/ocrd_cis/ocropy/resegment.py b/ocrd_cis/ocropy/resegment.py index 2261cf3e..e4681b23 100644 --- a/ocrd_cis/ocropy/resegment.py +++ b/ocrd_cis/ocropy/resegment.py @@ -30,6 +30,7 @@ pil2array, odd, DSAVE, + determine_zoom, # binarize, check_page, check_region, @@ -129,16 +130,9 @@ def process(self): page_image, page_coords, page_image_info = self.workspace.image_from_page( page, page_id, feature_selector='binarized') - if self.parameter['dpi'] > 0: - zoom = 300.0/self.parameter['dpi'] - elif page_image_info.resolution != 1: - dpi = page_image_info.resolution - if page_image_info.resolutionUnit == 'cm': - dpi *= 2.54 - self.logger.info('Page "%s" uses %f DPI', page_id, dpi) - zoom = 300.0/dpi - else: - zoom = 1 + + zoom = determine_zoom(self.parameter['dpi'], page_image_info) + self.logger.info(f"Page '{page_id}' uses {self.parameter['dpi']} DPI") ignore = (page.get_ImageRegion() + page.get_LineDrawingRegion() + diff --git a/ocrd_cis/ocropy/segment.py b/ocrd_cis/ocropy/segment.py index 35f309b6..e13c3d71 100644 --- a/ocrd_cis/ocropy/segment.py +++ b/ocrd_cis/ocropy/segment.py @@ -53,6 +53,7 @@ 
pil2array, array2pil, check_page, check_region, + determine_zoom, hmerge_line_seeds, compute_segmentation, lines2regions @@ -350,16 +351,8 @@ def process(self): # TODO: also allow grayscale_normalized (try/except?) page_image, page_coords, page_image_info = self.workspace.image_from_page( page, page_id, feature_selector='binarized') - if self.parameter['dpi'] > 0: - zoom = 300.0/self.parameter['dpi'] - elif page_image_info.resolution != 1: - dpi = page_image_info.resolution - if page_image_info.resolutionUnit == 'cm': - dpi *= 2.54 - self.logger.info('Page "%s" uses %f DPI', page_id, dpi) - zoom = 300.0/dpi - else: - zoom = 1 + zoom = determine_zoom(self.parameter['dpi'], page_image_info) + self.logger.info(f"Page '{page_id}' uses {self.parameter['dpi']} DPI") # aggregate existing regions so their foreground can be ignored ignore = (page.get_ImageRegion() + From 6beec175ed89e321cae93917dbe02bd2809cd83b Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Tue, 13 Aug 2024 16:14:31 +0200 Subject: [PATCH 07/97] move: logger init to setup() --- ocrd_cis/ocropy/binarize.py | 6 +++--- ocrd_cis/ocropy/clip.py | 4 +++- ocrd_cis/ocropy/denoise.py | 5 +++-- ocrd_cis/ocropy/deskew.py | 5 +++-- ocrd_cis/ocropy/dewarp.py | 5 +++-- ocrd_cis/ocropy/recognize.py | 5 +++-- ocrd_cis/ocropy/resegment.py | 5 +++-- ocrd_cis/ocropy/segment.py | 6 ++++-- ocrd_cis/ocropy/train.py | 5 +++-- 9 files changed, 28 insertions(+), 18 deletions(-) diff --git a/ocrd_cis/ocropy/binarize.py b/ocrd_cis/ocropy/binarize.py index b5e2bc7e..cc34690e 100644 --- a/ocrd_cis/ocropy/binarize.py +++ b/ocrd_cis/ocropy/binarize.py @@ -1,5 +1,5 @@ from __future__ import absolute_import -import logging +from logging import Logger import os.path import PIL @@ -66,16 +66,16 @@ def binarize(pil_image, method='ocropy', maxskew=2, threshold=0.5, nrm=False, zo return Image.fromarray(th), 0 class OcropyBinarize(Processor): - logger : logging.Logger + logger: Logger @property def executable(self): return 'ocrd-cis-ocropy-binarize' def setup(self): + self.logger = getLogger('processor.OcropyBinarize') assert_file_grp_cardinality(self.input_file_grp, 1) assert_file_grp_cardinality(self.output_file_grp, 1) - self.logger = getLogger('processor.OcropyBinarize') method = self.parameter['method'] if self.parameter['grayscale'] and method != 'ocropy': self.logger.critical(f'Requested method {method} does not support grayscale normalized output') diff --git a/ocrd_cis/ocropy/clip.py b/ocrd_cis/ocropy/clip.py index 3b854897..1b7fb28b 100644 --- a/ocrd_cis/ocropy/clip.py +++ b/ocrd_cis/ocropy/clip.py @@ -1,4 +1,5 @@ from __future__ import absolute_import +from logging import Logger import os.path import numpy as np @@ -31,9 +32,9 @@ array2pil, determine_zoom, pil2array) class OcropyClip(Processor): + logger: Logger def __init__(self, *args, **kwargs): - self.logger = getLogger('processor.OcropyClip') self.ocrd_tool = get_ocrd_tool() kwargs['ocrd_tool'] = self.ocrd_tool['tools'][self.executable] kwargs['version'] = self.ocrd_tool['version'] @@ -44,6 +45,7 @@ def executable(self): return 'ocrd-cis-ocropy-clip' def setup(self): + self.logger = getLogger('processor.OcropyClip') assert_file_grp_cardinality(self.input_file_grp, 1) assert_file_grp_cardinality(self.output_file_grp, 1) diff --git a/ocrd_cis/ocropy/denoise.py b/ocrd_cis/ocropy/denoise.py index d8554a3e..34750a53 100644 --- a/ocrd_cis/ocropy/denoise.py +++ b/ocrd_cis/ocropy/denoise.py @@ -1,5 +1,5 @@ from __future__ import absolute_import - +from logging import Logger import os.path from ocrd_utils 
import ( @@ -20,9 +20,9 @@ determine_zoom, remove_noise) class OcropyDenoise(Processor): + logger: Logger def __init__(self, *args, **kwargs): - self.logger = getLogger('processor.OcropyDenoise') self.ocrd_tool = get_ocrd_tool() kwargs['ocrd_tool'] = self.ocrd_tool['tools'][self.executable] kwargs['version'] = self.ocrd_tool['version'] @@ -33,6 +33,7 @@ def executable(self): return 'ocrd-cis-ocropy-denoise' def setup(self): + self.logger = getLogger('processor.OcropyDenoise') assert_file_grp_cardinality(self.input_file_grp, 1) assert_file_grp_cardinality(self.output_file_grp, 1) diff --git a/ocrd_cis/ocropy/deskew.py b/ocrd_cis/ocropy/deskew.py index 055ab27d..2eb898ca 100644 --- a/ocrd_cis/ocropy/deskew.py +++ b/ocrd_cis/ocropy/deskew.py @@ -1,5 +1,5 @@ from __future__ import absolute_import - +from logging import Logger import os.path from ocrd_utils import ( @@ -27,9 +27,9 @@ def deskew(pil_image, maxskew=2): return angle class OcropyDeskew(Processor): + logger: Logger def __init__(self, *args, **kwargs): - self.logger = getLogger('processor.OcropyDeskew') ocrd_tool = get_ocrd_tool() kwargs['ocrd_tool'] = ocrd_tool['tools'][self.executable] kwargs['version'] = ocrd_tool['version'] @@ -40,6 +40,7 @@ def executable(self): return 'ocrd-cis-ocropy-deskew' def setup(self): + self.logger = getLogger('processor.OcropyDeskew') assert_file_grp_cardinality(self.input_file_grp, 1) assert_file_grp_cardinality(self.output_file_grp, 1) diff --git a/ocrd_cis/ocropy/dewarp.py b/ocrd_cis/ocropy/dewarp.py index 4c9a1bdb..cad280c6 100644 --- a/ocrd_cis/ocropy/dewarp.py +++ b/ocrd_cis/ocropy/dewarp.py @@ -1,5 +1,5 @@ from __future__ import absolute_import - +from logging import Logger import os.path import numpy as np @@ -64,9 +64,9 @@ def padvert(image, range_): return array2pil(line) class OcropyDewarp(Processor): + logger: Logger def __init__(self, *args, **kwargs): - self.logger = getLogger('processor.OcropyDewarp') self.ocrd_tool = get_ocrd_tool() kwargs['ocrd_tool'] = self.ocrd_tool['tools'][self.executable] kwargs['version'] = self.ocrd_tool['version'] @@ -80,6 +80,7 @@ def executable(self): return 'ocrd-cis-ocropy-dewarp' def setup(self): + self.logger = getLogger('processor.OcropyDewarp') assert_file_grp_cardinality(self.input_file_grp, 1) assert_file_grp_cardinality(self.output_file_grp, 1) # defaults from ocrolib.lineest: diff --git a/ocrd_cis/ocropy/recognize.py b/ocrd_cis/ocropy/recognize.py index fdeaed27..8e147fea 100644 --- a/ocrd_cis/ocropy/recognize.py +++ b/ocrd_cis/ocropy/recognize.py @@ -1,5 +1,5 @@ from __future__ import absolute_import - +from logging import Logger import sys import os.path import numpy as np @@ -78,9 +78,9 @@ def recognize(image, pad, network, check=True): class OcropyRecognize(Processor): + logger: Logger def __init__(self, *args, **kwargs): - self.logger = getLogger('processor.OcropyRecognize') self.ocrd_tool = get_ocrd_tool() self.pad = 16 # ocropus-rpred default self.network = None # set in process @@ -96,6 +96,7 @@ def executable(self): return 'ocrd-cis-ocropy-recognize' def setup(self): + self.logger = getLogger('processor.OcropyRecognize') assert_file_grp_cardinality(self.input_file_grp, 1) assert_file_grp_cardinality(self.output_file_grp, 1) # from ocropus-rpred: diff --git a/ocrd_cis/ocropy/resegment.py b/ocrd_cis/ocropy/resegment.py index e4681b23..1e920b0f 100644 --- a/ocrd_cis/ocropy/resegment.py +++ b/ocrd_cis/ocropy/resegment.py @@ -1,5 +1,5 @@ from __future__ import absolute_import - +from logging import Logger import os.path import numpy as np from 
skimage import draw, segmentation @@ -48,9 +48,9 @@ ) class OcropyResegment(Processor): + logger: Logger def __init__(self, *args, **kwargs): - self.logger = getLogger('processor.OcropyResegment') self.ocrd_tool = get_ocrd_tool() kwargs['ocrd_tool'] = self.ocrd_tool['tools'][self.executable] kwargs['version'] = self.ocrd_tool['version'] @@ -61,6 +61,7 @@ def executable(self): return 'ocrd-cis-ocropy-resegment' def setup(self): + self.logger = getLogger('processor.OcropyResegment') assert_file_grp_cardinality(self.input_file_grp, 1) assert_file_grp_cardinality(self.output_file_grp, 1) diff --git a/ocrd_cis/ocropy/segment.py b/ocrd_cis/ocropy/segment.py index e13c3d71..3b89bda6 100644 --- a/ocrd_cis/ocropy/segment.py +++ b/ocrd_cis/ocropy/segment.py @@ -1,5 +1,5 @@ from __future__ import absolute_import - +from logging import Logger import os.path import itertools import numpy as np @@ -245,9 +245,10 @@ def getx(xy): class OcropySegment(Processor): + logger: Logger def __init__(self, *args, **kwargs): - self.logger = getLogger('processor.OcropySegment') + self.ocrd_tool = get_ocrd_tool() kwargs['ocrd_tool'] = self.ocrd_tool['tools'][self.executable] kwargs['version'] = self.ocrd_tool['version'] @@ -258,6 +259,7 @@ def executable(self): return 'ocrd-cis-ocropy-segment' def setup(self): + self.logger = getLogger('processor.OcropySegment') assert_file_grp_cardinality(self.input_file_grp, 1) assert_file_grp_cardinality(self.output_file_grp, 1) diff --git a/ocrd_cis/ocropy/train.py b/ocrd_cis/ocropy/train.py index 25317c4d..61a918c7 100644 --- a/ocrd_cis/ocropy/train.py +++ b/ocrd_cis/ocropy/train.py @@ -1,5 +1,5 @@ from __future__ import absolute_import - +from logging import Logger import sys import os import tempfile @@ -28,9 +28,9 @@ def resize_keep_ratio(image, baseheight=48): class OcropyTrain(Processor): + log: Logger def __init__(self, *args, **kwargs): - self.log = getLogger('processor.OcropyTrain') self.oldcwd = os.getcwd() ocrd_tool = get_ocrd_tool() kwargs['ocrd_tool'] = ocrd_tool['tools'][self.executable] @@ -45,6 +45,7 @@ def executable(self): return 'ocrd-cis-ocropy-train' def setup(self): + self.log = getLogger('processor.OcropyTrain') #print(self.parameter) if 'model' in self.parameter: model = self.parameter['model'] From 1b2fea3ed5b7c9d1a02f2dcabe0770aa3eb87da6 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Tue, 13 Aug 2024 16:16:55 +0200 Subject: [PATCH 08/97] refactor: log -> logger --- ocrd_cis/ocropy/train.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/ocrd_cis/ocropy/train.py b/ocrd_cis/ocropy/train.py index 61a918c7..9278da92 100644 --- a/ocrd_cis/ocropy/train.py +++ b/ocrd_cis/ocropy/train.py @@ -28,7 +28,7 @@ def resize_keep_ratio(image, baseheight=48): class OcropyTrain(Processor): - log: Logger + logger: Logger def __init__(self, *args, **kwargs): self.oldcwd = os.getcwd() @@ -45,7 +45,7 @@ def executable(self): return 'ocrd-cis-ocropy-train' def setup(self): - self.log = getLogger('processor.OcropyTrain') + self.logger = getLogger('processor.OcropyTrain') #print(self.parameter) if 'model' in self.parameter: model = self.parameter['model'] @@ -54,9 +54,9 @@ def setup(self): except SystemExit: ocropydir = os.path.dirname(os.path.abspath(__file__)) modelpath = os.path.join(ocropydir, 'models', model) - self.log.info("Failed to resolve model '%s' path, trying '%s'", model, modelpath) + self.logger.info("Failed to resolve model '%s' path, trying '%s'", model, modelpath) if not os.path.isfile(modelpath): - 
self.log.error("Could not find model '%s'. Try 'ocrd resmgr download ocrd-cis-ocropy-recognize %s'", + self.logger.error("Could not find model '%s'. Try 'ocrd resmgr download ocrd-cis-ocropy-recognize %s'", model, model) sys.exit(1) outputpath = os.path.join(self.oldcwd, 'output', model) @@ -78,18 +78,18 @@ def process(self): """ filelist = [] filepath = tempfile.mkdtemp(prefix='ocrd-cis-ocropy-train-') - #self.log.info("Using model %s in %s for recognition", model) + #self.logger.info("Using model %s in %s for recognition", model) for (n, input_file) in enumerate(self.input_files): - #self.log.info("INPUT FILE %i / %s", n, input_file) + #self.logger.info("INPUT FILE %i / %s", n, input_file) pcgts = page_from_file(self.workspace.download_file(input_file)) page_id = pcgts.pcGtsId or input_file.pageId or input_file.ID # (PageType has no id) page = pcgts.get_Page() page_image, page_coords, _ = self.workspace.image_from_page(page, page_id) - self.log.info("Extracting from page '%s'", page_id) + self.logger.info("Extracting from page '%s'", page_id) for region in page.get_AllRegions(classes=['Text']): textlines = region.get_TextLine() - self.log.info("Extracting %i lines from region '%s'", len(textlines), region.id) + self.logger.info("Extracting %i lines from region '%s'", len(textlines), region.id) for line in textlines: if self.parameter['textequiv_level'] == 'line': path = os.path.join(filepath, page_id + region.id + line.id) @@ -110,7 +110,7 @@ def process(self): if imgpath: filelist.append(imgpath) - self.log.info("Training %s from %s on %i file pairs", + self.logger.info("Training %s from %s on %i file pairs", self.outputpath, self.modelpath or 'scratch', len(filelist)) @@ -130,7 +130,7 @@ def extract_segment(self, path, segment, page_image, page_coords): with open(gtpath, "w", encoding='utf-8') as f: f.write(gt) - self.log.debug("Extracting %s '%s'", segment.__class__.__name__, segment.id) + self.logger.debug("Extracting %s '%s'", segment.__class__.__name__, segment.id) image, coords = self.workspace.image_from_segment(segment, page_image, page_coords) if 'binarized' not in coords['features'].split(','): From fe33494814e845cfd969a5f1a51234ceadb865a3 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Tue, 13 Aug 2024 16:32:17 +0200 Subject: [PATCH 09/97] remove: unused imports --- ocrd_cis/ocropy/binarize.py | 19 +++++++----------- ocrd_cis/ocropy/clip.py | 4 ++-- ocrd_cis/ocropy/denoise.py | 4 ++-- ocrd_cis/ocropy/deskew.py | 4 ++-- ocrd_cis/ocropy/dewarp.py | 4 ++-- ocrd_cis/ocropy/recognize.py | 20 +++++++++--------- ocrd_cis/ocropy/resegment.py | 9 +++------ ocrd_cis/ocropy/segment.py | 4 ++-- ocrd_cis/ocropy/train.py | 39 ++++++++++++++++++------------------ 9 files changed, 49 insertions(+), 58 deletions(-) diff --git a/ocrd_cis/ocropy/binarize.py b/ocrd_cis/ocropy/binarize.py index cc34690e..5d3fc7c3 100644 --- a/ocrd_cis/ocropy/binarize.py +++ b/ocrd_cis/ocropy/binarize.py @@ -1,12 +1,12 @@ from __future__ import absolute_import from logging import Logger -import os.path -import PIL import cv2 import numpy as np from PIL import Image -from os.path import join +from os.path import abspath, dirname, join + +from typing import Tuple #import kraken.binarization @@ -16,18 +16,13 @@ assert_file_grp_cardinality, MIMETYPE_PAGE ) -from ocrd_modelfactory import page_from_file -from ocrd_models.ocrd_page import ( - OcrdPage, to_xml, AlternativeImageType -) +from ocrd_models.ocrd_page import AlternativeImageType, OcrdPage, to_xml from ocrd import Processor from . 
import common -from .common import ( - # binarize, - array2pil, determine_zoom, pil2array, remove_noise) +from .common import array2pil, determine_zoom, pil2array, remove_noise -#sys.path.append(os.path.dirname(os.path.abspath(__file__))) +#sys.path.append(dirname(abspath(__file__))) def binarize(pil_image, method='ocropy', maxskew=2, threshold=0.5, nrm=False, zoom=1.0): LOG = getLogger('processor.OcropyBinarize') @@ -149,7 +144,7 @@ def process_page_pcgts(self, *input_pcgts, output_file_id: str = None, page_id: return ret - def process_page(self, page, page_image, page_xywh, zoom, page_id, file_id) -> tuple[Image.Image, str, str]: + def process_page(self, page, page_image, page_xywh, zoom, page_id, file_id) -> Tuple[Image.Image, str, str]: if not page_image.width or not page_image.height: raise ValueError("Skipping page '%s' with zero size", page_id) self.logger.info("About to binarize page '%s'", page_id) diff --git a/ocrd_cis/ocropy/clip.py b/ocrd_cis/ocropy/clip.py index 1b7fb28b..b70d1fb0 100644 --- a/ocrd_cis/ocropy/clip.py +++ b/ocrd_cis/ocropy/clip.py @@ -1,7 +1,7 @@ from __future__ import absolute_import from logging import Logger -import os.path +from os.path import join import numpy as np from PIL import Image, ImageStat, ImageOps from shapely.geometry import Polygon @@ -202,7 +202,7 @@ def process(self): input_file.pageId, file_id + '_' + region.id + '_' + line.id) # update METS (add the PAGE file): - file_path = os.path.join(self.output_file_grp, file_id + '.xml') + file_path = join(self.output_file_grp, file_id + '.xml') pcgts.set_pcGtsId(file_id) out = self.workspace.add_file( ID=file_id, diff --git a/ocrd_cis/ocropy/denoise.py b/ocrd_cis/ocropy/denoise.py index 34750a53..7cf74727 100644 --- a/ocrd_cis/ocropy/denoise.py +++ b/ocrd_cis/ocropy/denoise.py @@ -1,6 +1,6 @@ from __future__ import absolute_import from logging import Logger -import os.path +from os.path import join from ocrd_utils import ( getLogger, @@ -105,7 +105,7 @@ def process(self): file_id + '_' + region.id + '_' + line.id) # update METS (add the PAGE file): - file_path = os.path.join(self.output_file_grp, file_id + '.xml') + file_path = join(self.output_file_grp, file_id + '.xml') pcgts.set_pcGtsId(file_id) out = self.workspace.add_file( ID=file_id, diff --git a/ocrd_cis/ocropy/deskew.py b/ocrd_cis/ocropy/deskew.py index 2eb898ca..bcd3be01 100644 --- a/ocrd_cis/ocropy/deskew.py +++ b/ocrd_cis/ocropy/deskew.py @@ -1,6 +1,6 @@ from __future__ import absolute_import from logging import Logger -import os.path +from os.path import join from ocrd_utils import ( getLogger, @@ -105,7 +105,7 @@ def process(self): file_id + '_' + region.id) # update METS (add the PAGE file): - file_path = os.path.join(self.output_file_grp, file_id + '.xml') + file_path = join(self.output_file_grp, file_id + '.xml') pcgts.set_pcGtsId(file_id) out = self.workspace.add_file( ID=file_id, diff --git a/ocrd_cis/ocropy/dewarp.py b/ocrd_cis/ocropy/dewarp.py index cad280c6..6c27c5c6 100644 --- a/ocrd_cis/ocropy/dewarp.py +++ b/ocrd_cis/ocropy/dewarp.py @@ -1,6 +1,6 @@ from __future__ import absolute_import from logging import Logger -import os.path +from os.path import join import numpy as np from ocrd_utils import ( @@ -172,7 +172,7 @@ def process(self): comments=line_xywh['features'] + ',dewarped')) # update METS (add the PAGE file): - file_path = os.path.join(self.output_file_grp, file_id + '.xml') + file_path = join(self.output_file_grp, file_id + '.xml') pcgts.set_pcGtsId(file_id) out = self.workspace.add_file( ID=file_id, diff --git 
a/ocrd_cis/ocropy/recognize.py b/ocrd_cis/ocropy/recognize.py index 8e147fea..f3ecf199 100644 --- a/ocrd_cis/ocropy/recognize.py +++ b/ocrd_cis/ocropy/recognize.py @@ -1,7 +1,7 @@ from __future__ import absolute_import from logging import Logger -import sys -import os.path +from sys import exit +from os.path import abspath, dirname, isfile, join import numpy as np from PIL import Image @@ -24,11 +24,9 @@ from ocrd import Processor from .. import get_ocrd_tool +from .common import check_line, pil2array from .ocrolib import lstm, load_object, midrange -from .common import ( - pil2array, - check_line -) + def resize_keep_ratio(image, baseheight=48): scale = baseheight / image.height @@ -112,20 +110,20 @@ def get_model(self): be resolved with OcrdResourceManager to a valid readeable file and returns it. If not, it checks if the model can be found in the dirname(__file__)/models/ directory.""" - canread = lambda p: os.path.isfile(p) and os.access(p, os.R_OK) + canread = lambda p: isfile(p) and os.access(p, os.R_OK) try: model = self.resolve_resource(self.parameter['model']) if canread(model): return model except SystemExit: - ocropydir = os.path.dirname(os.path.abspath(__file__)) - path = os.path.join(ocropydir, 'models', self.parameter['model']) + ocropydir = dirname(abspath(__file__)) + path = join(ocropydir, 'models', self.parameter['model']) self.logger.info("Failed to resolve model with OCR-D/core mechanism, trying %s", path) if canread(path): return path self.logger.error("Could not find model %s. Try 'ocrd resmgr download ocrd-cis-ocropy-recognize %s", self.parameter['model'], self.parameter['model']) - sys.exit(1) + exit(1) def process(self): """Recognize lines / words / glyphs of the workspace. @@ -176,7 +174,7 @@ def process(self): # update METS (add the PAGE file): file_id = make_file_id(input_file, self.output_file_grp) - file_path = os.path.join(self.output_file_grp, file_id + '.xml') + file_path = join(self.output_file_grp, file_id + '.xml') pcgts.set_pcGtsId(file_id) out = self.workspace.add_file( ID=file_id, diff --git a/ocrd_cis/ocropy/resegment.py b/ocrd_cis/ocropy/resegment.py index 1e920b0f..329694d0 100644 --- a/ocrd_cis/ocropy/resegment.py +++ b/ocrd_cis/ocropy/resegment.py @@ -1,16 +1,13 @@ from __future__ import absolute_import from logging import Logger -import os.path +from os.path import join import numpy as np from skimage import draw, segmentation from shapely.geometry import Polygon, LineString from shapely.prepared import prep -from shapely.ops import unary_union from ocrd_modelfactory import page_from_file -from ocrd_models.ocrd_page import ( - to_xml, PageType, BaselineType -) +from ocrd_models.ocrd_page import BaselineType, PageType, to_xml from ocrd import Processor from ocrd_utils import ( getLogger, @@ -169,7 +166,7 @@ def process(self): self.logger.warning('Page "%s" region "%s" contains no text lines', page_id, region.id) # update METS (add the PAGE file): - file_path = os.path.join(self.output_file_grp, file_id + '.xml') + file_path = join(self.output_file_grp, file_id + '.xml') pcgts.set_pcGtsId(file_id) out = self.workspace.add_file( ID=file_id, diff --git a/ocrd_cis/ocropy/segment.py b/ocrd_cis/ocropy/segment.py index 3b89bda6..446fc628 100644 --- a/ocrd_cis/ocropy/segment.py +++ b/ocrd_cis/ocropy/segment.py @@ -1,6 +1,6 @@ from __future__ import absolute_import from logging import Logger -import os.path +from os.path import join import itertools import numpy as np from scipy.sparse.csgraph import minimum_spanning_tree @@ -505,7 +505,7 @@ def 
process(self): input_file.pageId, zoom) # update METS (add the PAGE file): - file_path = os.path.join(self.output_file_grp, file_id + '.xml') + file_path = join(self.output_file_grp, file_id + '.xml') pcgts.set_pcGtsId(file_id) out = self.workspace.add_file( ID=file_id, diff --git a/ocrd_cis/ocropy/train.py b/ocrd_cis/ocropy/train.py index 9278da92..ff460523 100644 --- a/ocrd_cis/ocropy/train.py +++ b/ocrd_cis/ocropy/train.py @@ -1,7 +1,8 @@ from __future__ import absolute_import from logging import Logger -import sys -import os +from sys import exit +from os import getcwd, makedirs, remove +from os.path import abspath, dirname, exists, join, isfile import tempfile from ocrd_modelfactory import page_from_file @@ -15,10 +16,10 @@ def deletefiles(filelist): for file in filelist: - if os.path.exists(file): - os.remove(file) - if os.path.exists(file[:-3]+'gt.txt'): - os.remove(file[:-3]+'gt.txt') + if exists(file): + remove(file) + if exists(file[:-3]+'gt.txt'): + remove(file[:-3]+'gt.txt') def resize_keep_ratio(image, baseheight=48): hpercent = (baseheight / float(image.size[1])) @@ -31,7 +32,7 @@ class OcropyTrain(Processor): logger: Logger def __init__(self, *args, **kwargs): - self.oldcwd = os.getcwd() + self.oldcwd = getcwd() ocrd_tool = get_ocrd_tool() kwargs['ocrd_tool'] = ocrd_tool['tools'][self.executable] kwargs['version'] = ocrd_tool['version'] @@ -52,22 +53,22 @@ def setup(self): try: modelpath = self.resolve_resource(model) except SystemExit: - ocropydir = os.path.dirname(os.path.abspath(__file__)) - modelpath = os.path.join(ocropydir, 'models', model) + ocropydir = dirname(abspath(__file__)) + modelpath = join(ocropydir, 'models', model) self.logger.info("Failed to resolve model '%s' path, trying '%s'", model, modelpath) - if not os.path.isfile(modelpath): + if not isfile(modelpath): self.logger.error("Could not find model '%s'. 
Try 'ocrd resmgr download ocrd-cis-ocropy-recognize %s'", model, model) - sys.exit(1) - outputpath = os.path.join(self.oldcwd, 'output', model) + exit(1) + outputpath = join(self.oldcwd, 'output', model) if 'outputpath' in self.parameter: - outputpath = os.path.join(self.parameter, model) + outputpath = join(self.parameter, model) else: modelpath = None - outputpath = os.path.join(self.oldcwd, 'output', 'lstm') + outputpath = join(self.oldcwd, 'output', 'lstm') if 'outputpath' in self.parameter: - outputpath = os.path.join(self.parameter, 'lstm') - os.makedirs(os.path.dirname(outputpath)) + outputpath = join(self.parameter, 'lstm') + makedirs(dirname(outputpath)) self.modelpath = modelpath self.outputpath = outputpath @@ -92,20 +93,20 @@ def process(self): self.logger.info("Extracting %i lines from region '%s'", len(textlines), region.id) for line in textlines: if self.parameter['textequiv_level'] == 'line': - path = os.path.join(filepath, page_id + region.id + line.id) + path = join(filepath, page_id + region.id + line.id) imgpath = self.extract_segment(path, line, page_image, page_coords) if imgpath: filelist.append(imgpath) continue for word in line.get_Word(): if self.parameter['textequiv_level'] == 'word': - path = os.path.join(filepath, page_id + region.id + line.id + word.id) + path = join(filepath, page_id + region.id + line.id + word.id) imgpath = self.extract_segment(path, word, page_image, page_coords) if imgpath: filelist.append(imgpath) continue for glyph in word.get_Glyph(): - path = os.path.join(filepath, page_id + region.id + line.id + glyph.id) + path = join(filepath, page_id + region.id + line.id + glyph.id) imgpath = self.extract_segment(path, glyph, page_image, page_coords) if imgpath: filelist.append(imgpath) From 3368a53e8341ab265ac5fa115a740cfc02bcc5ef Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Tue, 13 Aug 2024 16:34:21 +0200 Subject: [PATCH 10/97] remove: file grp cardinality checks inside process() --- ocrd_cis/ocropy/clip.py | 2 -- ocrd_cis/ocropy/denoise.py | 2 -- ocrd_cis/ocropy/deskew.py | 2 -- ocrd_cis/ocropy/dewarp.py | 2 -- ocrd_cis/ocropy/recognize.py | 2 -- ocrd_cis/ocropy/resegment.py | 2 -- ocrd_cis/ocropy/segment.py | 3 --- 7 files changed, 15 deletions(-) diff --git a/ocrd_cis/ocropy/clip.py b/ocrd_cis/ocropy/clip.py index b70d1fb0..777b3d3d 100644 --- a/ocrd_cis/ocropy/clip.py +++ b/ocrd_cis/ocropy/clip.py @@ -85,8 +85,6 @@ def process(self): # deskewing, because that would make segments incomensurable with their # neighbours. level = self.parameter['level-of-operation'] - assert_file_grp_cardinality(self.input_file_grp, 1) - assert_file_grp_cardinality(self.output_file_grp, 1) for (n, input_file) in enumerate(self.input_files): self.logger.info("INPUT FILE %i / %s", n, input_file.pageId or input_file.ID) diff --git a/ocrd_cis/ocropy/denoise.py b/ocrd_cis/ocropy/denoise.py index 7cf74727..5d3b9d44 100644 --- a/ocrd_cis/ocropy/denoise.py +++ b/ocrd_cis/ocropy/denoise.py @@ -59,8 +59,6 @@ def process(self): Produce a new output file by serialising the resulting hierarchy. 
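        The group-cardinality asserts removed just below are assumed to be
        covered by the one-time checks already made in ``setup()``, roughly:

            def setup(self):
                self.logger = getLogger('processor.OcropyDenoise')
                assert_file_grp_cardinality(self.input_file_grp, 1)
                assert_file_grp_cardinality(self.output_file_grp, 1)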
""" level = self.parameter['level-of-operation'] - assert_file_grp_cardinality(self.input_file_grp, 1) - assert_file_grp_cardinality(self.output_file_grp, 1) for (n, input_file) in enumerate(self.input_files): self.logger.info("INPUT FILE %i / %s", n, input_file.pageId or input_file.ID) diff --git a/ocrd_cis/ocropy/deskew.py b/ocrd_cis/ocropy/deskew.py index bcd3be01..16b4bc81 100644 --- a/ocrd_cis/ocropy/deskew.py +++ b/ocrd_cis/ocropy/deskew.py @@ -63,8 +63,6 @@ def process(self): Produce a new output file by serialising the resulting hierarchy. """ level = self.parameter['level-of-operation'] - assert_file_grp_cardinality(self.input_file_grp, 1) - assert_file_grp_cardinality(self.output_file_grp, 1) for (n, input_file) in enumerate(self.input_files): self.logger.info("INPUT FILE %i / %s", n, input_file.pageId or input_file.ID) diff --git a/ocrd_cis/ocropy/dewarp.py b/ocrd_cis/ocropy/dewarp.py index 6c27c5c6..dbe512f2 100644 --- a/ocrd_cis/ocropy/dewarp.py +++ b/ocrd_cis/ocropy/dewarp.py @@ -112,8 +112,6 @@ def process(self): Produce a new output file by serialising the resulting hierarchy. """ - assert_file_grp_cardinality(self.input_file_grp, 1) - assert_file_grp_cardinality(self.output_file_grp, 1) for (n, input_file) in enumerate(self.input_files): self.logger.info("INPUT FILE %i / %s", n, input_file.pageId or input_file.ID) diff --git a/ocrd_cis/ocropy/recognize.py b/ocrd_cis/ocropy/recognize.py index f3ecf199..4b5da4b1 100644 --- a/ocrd_cis/ocropy/recognize.py +++ b/ocrd_cis/ocropy/recognize.py @@ -150,8 +150,6 @@ def process(self): Produce a new output file by serialising the resulting hierarchy. """ - assert_file_grp_cardinality(self.input_file_grp, 1) - assert_file_grp_cardinality(self.output_file_grp, 1) maxlevel = self.parameter['textequiv_level'] # self.logger.info("Using model %s in %s for recognition", model) diff --git a/ocrd_cis/ocropy/resegment.py b/ocrd_cis/ocropy/resegment.py index 329694d0..378c2fd3 100644 --- a/ocrd_cis/ocropy/resegment.py +++ b/ocrd_cis/ocropy/resegment.py @@ -114,8 +114,6 @@ def process(self): # accuracy crucially depends on a good estimate of the images' # pixel density (at least if source input is not 300 DPI). 
level = self.parameter['level-of-operation'] - assert_file_grp_cardinality(self.input_file_grp, 1) - assert_file_grp_cardinality(self.output_file_grp, 1) for n, input_file in enumerate(self.input_files): self.logger.info("INPUT FILE %i / %s", n, input_file.pageId or input_file.ID) diff --git a/ocrd_cis/ocropy/segment.py b/ocrd_cis/ocropy/segment.py index 446fc628..6feb6e29 100644 --- a/ocrd_cis/ocropy/segment.py +++ b/ocrd_cis/ocropy/segment.py @@ -338,9 +338,6 @@ def process(self): overwrite_order = self.parameter['overwrite_order'] oplevel = self.parameter['level-of-operation'] - assert_file_grp_cardinality(self.input_file_grp, 1) - assert_file_grp_cardinality(self.output_file_grp, 1) - for (n, input_file) in enumerate(self.input_files): self.logger.info("INPUT FILE %i / %s", n, input_file.pageId or input_file.ID) file_id = make_file_id(input_file, self.output_file_grp) From ae97768ea73a900092f656c6ad42a64670525a11 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Tue, 13 Aug 2024 16:41:13 +0200 Subject: [PATCH 11/97] remove: constructors, adapt setup() --- ocrd_cis/ocropy/clip.py | 7 ------- ocrd_cis/ocropy/denoise.py | 7 ------- ocrd_cis/ocropy/deskew.py | 7 ------- ocrd_cis/ocropy/dewarp.py | 10 ---------- ocrd_cis/ocropy/recognize.py | 19 ++++++------------- ocrd_cis/ocropy/resegment.py | 7 ------- ocrd_cis/ocropy/segment.py | 8 -------- ocrd_cis/ocropy/train.py | 17 ++++------------- 8 files changed, 10 insertions(+), 72 deletions(-) diff --git a/ocrd_cis/ocropy/clip.py b/ocrd_cis/ocropy/clip.py index 777b3d3d..62f68fcf 100644 --- a/ocrd_cis/ocropy/clip.py +++ b/ocrd_cis/ocropy/clip.py @@ -25,7 +25,6 @@ MIMETYPE_PAGE ) -from .. import get_ocrd_tool from .ocrolib import midrange, morph from .common import ( # binarize, @@ -34,12 +33,6 @@ class OcropyClip(Processor): logger: Logger - def __init__(self, *args, **kwargs): - self.ocrd_tool = get_ocrd_tool() - kwargs['ocrd_tool'] = self.ocrd_tool['tools'][self.executable] - kwargs['version'] = self.ocrd_tool['version'] - super(OcropyClip, self).__init__(*args, **kwargs) - @property def executable(self): return 'ocrd-cis-ocropy-clip' diff --git a/ocrd_cis/ocropy/denoise.py b/ocrd_cis/ocropy/denoise.py index 5d3b9d44..a68e2e3c 100644 --- a/ocrd_cis/ocropy/denoise.py +++ b/ocrd_cis/ocropy/denoise.py @@ -14,7 +14,6 @@ ) from ocrd import Processor -from .. import get_ocrd_tool from .common import ( # binarize, determine_zoom, remove_noise) @@ -22,12 +21,6 @@ class OcropyDenoise(Processor): logger: Logger - def __init__(self, *args, **kwargs): - self.ocrd_tool = get_ocrd_tool() - kwargs['ocrd_tool'] = self.ocrd_tool['tools'][self.executable] - kwargs['version'] = self.ocrd_tool['version'] - super(OcropyDenoise, self).__init__(*args, **kwargs) - @property def executable(self): return 'ocrd-cis-ocropy-denoise' diff --git a/ocrd_cis/ocropy/deskew.py b/ocrd_cis/ocropy/deskew.py index 16b4bc81..e41a557d 100644 --- a/ocrd_cis/ocropy/deskew.py +++ b/ocrd_cis/ocropy/deskew.py @@ -15,7 +15,6 @@ ) from ocrd import Processor -from .. import get_ocrd_tool from . 
import common from .common import pil2array @@ -29,12 +28,6 @@ def deskew(pil_image, maxskew=2): class OcropyDeskew(Processor): logger: Logger - def __init__(self, *args, **kwargs): - ocrd_tool = get_ocrd_tool() - kwargs['ocrd_tool'] = ocrd_tool['tools'][self.executable] - kwargs['version'] = ocrd_tool['version'] - super(OcropyDeskew, self).__init__(*args, **kwargs) - @property def executable(self): return 'ocrd-cis-ocropy-deskew' diff --git a/ocrd_cis/ocropy/dewarp.py b/ocrd_cis/ocropy/dewarp.py index dbe512f2..bb9e4098 100644 --- a/ocrd_cis/ocropy/dewarp.py +++ b/ocrd_cis/ocropy/dewarp.py @@ -15,7 +15,6 @@ from ocrd import Processor from ocrd_utils import MIMETYPE_PAGE -from .. import get_ocrd_tool from .ocrolib import lineest from .common import array2pil, check_line, determine_zoom, pil2array @@ -66,15 +65,6 @@ def padvert(image, range_): class OcropyDewarp(Processor): logger: Logger - def __init__(self, *args, **kwargs): - self.ocrd_tool = get_ocrd_tool() - kwargs['ocrd_tool'] = self.ocrd_tool['tools'][self.executable] - kwargs['version'] = self.ocrd_tool['version'] - super(OcropyDewarp, self).__init__(*args, **kwargs) - if hasattr(self, 'output_file_grp'): - # processing context - self.setup() - @property def executable(self): return 'ocrd-cis-ocropy-dewarp' diff --git a/ocrd_cis/ocropy/recognize.py b/ocrd_cis/ocropy/recognize.py index 4b5da4b1..5880675c 100644 --- a/ocrd_cis/ocropy/recognize.py +++ b/ocrd_cis/ocropy/recognize.py @@ -1,6 +1,8 @@ from __future__ import absolute_import from logging import Logger from sys import exit +from typing import Any +from os import access, R_OK from os.path import abspath, dirname, isfile, join import numpy as np from PIL import Image @@ -23,7 +25,6 @@ ) from ocrd import Processor -from .. import get_ocrd_tool from .common import check_line, pil2array from .ocrolib import lstm, load_object, midrange @@ -77,17 +78,8 @@ def recognize(image, pad, network, check=True): class OcropyRecognize(Processor): logger: Logger - - def __init__(self, *args, **kwargs): - self.ocrd_tool = get_ocrd_tool() - self.pad = 16 # ocropus-rpred default - self.network = None # set in process - kwargs['ocrd_tool'] = self.ocrd_tool['tools'][self.executable] - kwargs['version'] = self.ocrd_tool['version'] - super(OcropyRecognize, self).__init__(*args, **kwargs) - if hasattr(self, 'output_file_grp'): - # processing context - self.setup() + network: Any + pad: int @property def executable(self): @@ -95,6 +87,7 @@ def executable(self): def setup(self): self.logger = getLogger('processor.OcropyRecognize') + self.pad = 16 assert_file_grp_cardinality(self.input_file_grp, 1) assert_file_grp_cardinality(self.output_file_grp, 1) # from ocropus-rpred: @@ -110,7 +103,7 @@ def get_model(self): be resolved with OcrdResourceManager to a valid readeable file and returns it. If not, it checks if the model can be found in the dirname(__file__)/models/ directory.""" - canread = lambda p: isfile(p) and os.access(p, os.R_OK) + canread = lambda p: isfile(p) and access(p, R_OK) try: model = self.resolve_resource(self.parameter['model']) if canread(model): diff --git a/ocrd_cis/ocropy/resegment.py b/ocrd_cis/ocropy/resegment.py index 378c2fd3..17b90f65 100644 --- a/ocrd_cis/ocropy/resegment.py +++ b/ocrd_cis/ocropy/resegment.py @@ -21,7 +21,6 @@ MIMETYPE_PAGE ) -from .. 
import get_ocrd_tool from .ocrolib import midrange, morph from .common import ( pil2array, @@ -47,12 +46,6 @@ class OcropyResegment(Processor): logger: Logger - def __init__(self, *args, **kwargs): - self.ocrd_tool = get_ocrd_tool() - kwargs['ocrd_tool'] = self.ocrd_tool['tools'][self.executable] - kwargs['version'] = self.ocrd_tool['version'] - super().__init__(*args, **kwargs) - @property def executable(self): return 'ocrd-cis-ocropy-resegment' diff --git a/ocrd_cis/ocropy/segment.py b/ocrd_cis/ocropy/segment.py index 6feb6e29..f886e1d1 100644 --- a/ocrd_cis/ocropy/segment.py +++ b/ocrd_cis/ocropy/segment.py @@ -46,7 +46,6 @@ MIMETYPE_PAGE ) -from .. import get_ocrd_tool from .ocrolib import midrange from .ocrolib import morph from .common import ( @@ -247,13 +246,6 @@ def getx(xy): class OcropySegment(Processor): logger: Logger - def __init__(self, *args, **kwargs): - - self.ocrd_tool = get_ocrd_tool() - kwargs['ocrd_tool'] = self.ocrd_tool['tools'][self.executable] - kwargs['version'] = self.ocrd_tool['version'] - super(OcropySegment, self).__init__(*args, **kwargs) - @property def executable(self): return 'ocrd-cis-ocropy-segment' diff --git a/ocrd_cis/ocropy/train.py b/ocrd_cis/ocropy/train.py index ff460523..08b68693 100644 --- a/ocrd_cis/ocropy/train.py +++ b/ocrd_cis/ocropy/train.py @@ -8,7 +8,6 @@ from ocrd_modelfactory import page_from_file from ocrd import Processor from ocrd_utils import getLogger -from ocrd_cis import get_ocrd_tool from .ocropus_rtrain import * from .binarize import binarize @@ -30,16 +29,7 @@ def resize_keep_ratio(image, baseheight=48): class OcropyTrain(Processor): logger: Logger - - def __init__(self, *args, **kwargs): - self.oldcwd = getcwd() - ocrd_tool = get_ocrd_tool() - kwargs['ocrd_tool'] = ocrd_tool['tools'][self.executable] - kwargs['version'] = ocrd_tool['version'] - super(OcropyTrain, self).__init__(*args, **kwargs) - if hasattr(self, 'input_file_grp'): - # processing context - self.setup() + old_cwd: str @property def executable(self): @@ -47,6 +37,7 @@ def executable(self): def setup(self): self.logger = getLogger('processor.OcropyTrain') + self.old_cwd = getcwd() #print(self.parameter) if 'model' in self.parameter: model = self.parameter['model'] @@ -60,12 +51,12 @@ def setup(self): self.logger.error("Could not find model '%s'. 
Try 'ocrd resmgr download ocrd-cis-ocropy-recognize %s'", model, model) exit(1) - outputpath = join(self.oldcwd, 'output', model) + outputpath = join(self.old_cwd, 'output', model) if 'outputpath' in self.parameter: outputpath = join(self.parameter, model) else: modelpath = None - outputpath = join(self.oldcwd, 'output', 'lstm') + outputpath = join(self.old_cwd, 'output', 'lstm') if 'outputpath' in self.parameter: outputpath = join(self.parameter, 'lstm') makedirs(dirname(outputpath)) From 60d02d28040f5b1bc2b4f5497f5353d4f53d5c45 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Wed, 14 Aug 2024 00:39:18 +0200 Subject: [PATCH 12/97] completed: OcropyBinarize --- ocrd_cis/ocropy/binarize.py | 138 +++++++++++++++++------------------- 1 file changed, 65 insertions(+), 73 deletions(-) diff --git a/ocrd_cis/ocropy/binarize.py b/ocrd_cis/ocropy/binarize.py index 5d3fc7c3..0728f852 100644 --- a/ocrd_cis/ocropy/binarize.py +++ b/ocrd_cis/ocropy/binarize.py @@ -116,38 +116,36 @@ def process_page_pcgts(self, *input_pcgts, output_file_id: str = None, page_id: except ValueError as e: self.logger.exception(e) else: - # TODO - raise NotImplementedError if level == 'table': regions = page.get_TableRegion() else: # region regions = page.get_AllRegions(classes=['Text'], order='reading-order') if not regions: - self.logger.warning('Page "%s" contains no text regions', page_id) + self.logger.warning(f"Page '{page_id}' contains no text regions") for region in regions: region_image, region_xywh = self.workspace.image_from_segment( region, page_image, page_xywh, feature_filter='binarized') if level == 'region': - self.process_region(region, region_image, region_xywh, zoom, - input_file.pageId, file_id + '_' + region.id) - continue + try: + ret.append(self.process_region(region, region_image, region_xywh, zoom, page_id, file_id)) + except ValueError as e: + self.logger.exception(e) lines = region.get_TextLine() if not lines: - self.logger.warning('Page "%s" region "%s" contains no text lines', - page_id, region.id) + self.logger.warning(f"Page '{page_id}' region '{region.id}' contains no text lines") for line in lines: line_image, line_xywh = self.workspace.image_from_segment( line, region_image, region_xywh, feature_filter='binarized') - self.process_line(line, line_image, line_xywh, zoom, - input_file.pageId, region.id, - file_id + '_' + region.id + '_' + line.id) - + try: + ret.append(self.process_line(line, line_image, line_xywh, zoom, page_id, region.id, file_id)) + except ValueError as e: + self.logger.exception(e) return ret def process_page(self, page, page_image, page_xywh, zoom, page_id, file_id) -> Tuple[Image.Image, str, str]: if not page_image.width or not page_image.height: - raise ValueError("Skipping page '%s' with zero size", page_id) - self.logger.info("About to binarize page '%s'", page_id) + raise ValueError(f"Skipping page '{page_id}' with zero size") + self.logger.info(f"About to binarize page '{page_id}'") assert self.output_file_grp features = page_xywh['features'] @@ -157,18 +155,18 @@ def process_page(self, page, page_image, page_xywh, zoom, page_id, file_id) -> T maxskew = 0 else: maxskew = self.parameter['maxskew'] - bin_image, angle = binarize(page_image, - method=self.parameter['method'], - maxskew=maxskew, - threshold=self.parameter['threshold'], - nrm=self.parameter['grayscale'], - zoom=zoom) + bin_image, angle = binarize( + page_image, + method=self.parameter['method'], + maxskew=maxskew, + threshold=self.parameter['threshold'], + nrm=self.parameter['grayscale'], + zoom=zoom) 
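        # Note: binarize() returns the (optionally grayscale-normalized) image
        # together with the estimated skew angle; methods other than 'ocropy'
        # perform no skew estimation and always report an angle of 0.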
if angle: features += ',deskewed' page_xywh['angle'] = angle if self.parameter['noise_maxsize']: - bin_image = remove_noise( - bin_image, maxsize=self.parameter['noise_maxsize']) + bin_image = remove_noise(bin_image, maxsize=self.parameter['noise_maxsize']) features += ',despeckled' # annotate angle in PAGE (to allow consumers of the AlternativeImage # to do consistent coordinate transforms, and non-consumers @@ -176,43 +174,43 @@ def process_page(self, page, page_image, page_xywh, zoom, page_id, file_id) -> T orientation = -page_xywh['angle'] orientation = 180 - (180 - orientation) % 360 # map to [-179.999,180] page.set_orientation(orientation) - # update METS (add the image file): if self.parameter['grayscale']: file_id += '.IMG-NRM' features += ',grayscale_normalized' else: file_id += '.IMG-BIN' features += ',binarized' - bin_image_path = join(self.output_file_grp, f'{file_id}.png') + bin_image_id = f'{file_id}.IMG-BIN' + bin_image_path = join(self.output_file_grp, f'{bin_image_id}.png') # update PAGE (reference the image file): page.add_AlternativeImage(AlternativeImageType(filename=bin_image_path, comments=features)) - return (bin_image, file_id, bin_image_path) + return bin_image, bin_image_id, bin_image_path - def process_region(self, region, region_image, region_xywh, zoom, page_id, file_id): + def process_region(self, region, region_image, region_xywh, zoom, page_id, file_id) -> Tuple[Image.Image, str, str]: if not region_image.width or not region_image.height: - self.logger.warning("Skipping region '%s' with zero size", region.id) - return - self.logger.info("About to binarize page '%s' region '%s'", page_id, region.id) + raise ValueError(f"Skipping region '{region.id}' with zero size") + self.logger.info(f"About to binarize page '{page_id}' region '{region.id}'") features = region_xywh['features'] if 'angle' in region_xywh and region_xywh['angle']: # orientation has already been annotated (by previous deskewing), # so skip deskewing here: - bin_image, _ = binarize(region_image, - method=self.parameter['method'], - maxskew=0, - nrm=self.parameter['grayscale'], - zoom=zoom) + bin_image, _ = binarize( + region_image, + method=self.parameter['method'], + maxskew=0, + nrm=self.parameter['grayscale'], + zoom=zoom) else: - bin_image, angle = binarize(region_image, - method=self.parameter['method'], - maxskew=self.parameter['maxskew'], - nrm=self.parameter['grayscale'], - zoom=zoom) + bin_image, angle = binarize( + region_image, + method=self.parameter['method'], + maxskew=self.parameter['maxskew'], + nrm=self.parameter['grayscale'], + zoom=zoom) if angle: features += ',deskewed' region_xywh['angle'] = angle - bin_image = remove_noise(bin_image, - maxsize=self.parameter['noise_maxsize']) + bin_image = remove_noise(bin_image, maxsize=self.parameter['noise_maxsize']) if self.parameter['noise_maxsize']: features += ',despeckled' # annotate angle in PAGE (to allow consumers of the AlternativeImage @@ -221,33 +219,31 @@ def process_region(self, region, region_image, region_xywh, zoom, page_id, file_ orientation = -region_xywh['angle'] orientation = 180 - (180 - orientation) % 360 # map to [-179.999,180] region.set_orientation(orientation) - # update METS (add the image file): + bin_image_id = f'{file_id}_{region.id}' if self.parameter['grayscale']: - file_id += '.IMG-NRM' + bin_image_id += '.IMG-NRM' features += ',grayscale_normalized' else: - file_id += '.IMG-BIN' + bin_image_id += '.IMG-BIN' features += ',binarized' - file_path = self.workspace.save_image_file( - bin_image, file_id, 
self.output_file_grp, - page_id=page_id) + bin_image_path = join(self.output_file_grp, f'{bin_image_id}.png') # update PAGE (reference the image file): - region.add_AlternativeImage(AlternativeImageType( - filename=file_path, - comments=features)) + region.add_AlternativeImage(AlternativeImageType(filename=bin_image_path, comments=features)) + return bin_image, bin_image_id, bin_image_path - def process_line(self, line, line_image, line_xywh, zoom, page_id, region_id, file_id): + def process_line( + self, line, line_image, line_xywh, zoom, page_id, region_id, file_id + ) -> Tuple[Image.Image, str, str]: if not line_image.width or not line_image.height: - self.logger.warning("Skipping line '%s' with zero size", line.id) - return - self.logger.info("About to binarize page '%s' region '%s' line '%s'", - page_id, region_id, line.id) + raise ValueError(f"Skipping line '{line.id}' with zero size") + self.logger.info(f"About to binarize page '{page_id}' region '{region_id}' line '{line.id}'") features = line_xywh['features'] - bin_image, angle = binarize(line_image, - method=self.parameter['method'], - maxskew=self.parameter['maxskew'], - nrm=self.parameter['grayscale'], - zoom=zoom) + bin_image, angle = binarize( + line_image, + method=self.parameter['method'], + maxskew=self.parameter['maxskew'], + nrm=self.parameter['grayscale'], + zoom=zoom) if angle: features += ',deskewed' # annotate angle in PAGE (to allow consumers of the AlternativeImage @@ -256,23 +252,19 @@ def process_line(self, line, line_image, line_xywh, zoom, page_id, region_id, fi #orientation = -angle #orientation = 180 - (180 - orientation) % 360 # map to [-179.999,180] #line.set_orientation(orientation) # does not exist on line level! - self.logger.warning("cannot add orientation %.2f to page '%s' region '%s' line '%s'", - -angle, page_id, region_id, line.id) - bin_image = remove_noise(bin_image, - maxsize=self.parameter['noise_maxsize']) + self.logger.warning(f"cannot add orientation %.2f to page '{page_id}' region '{region_id}' line '{line.id}'", + -angle) + bin_image = remove_noise(bin_image, maxsize=self.parameter['noise_maxsize']) if self.parameter['noise_maxsize']: features += ',despeckled' - # update METS (add the image file): + bin_image_id = f'{file_id}_{region_id}_{line.id}' if self.parameter['grayscale']: - file_id += '.IMG-NRM' + bin_image_id += '.IMG-NRM' features += ',grayscale_normalized' else: - file_id += '.IMG-BIN' + bin_image_id += '.IMG-BIN' features += ',binarized' - file_path = self.workspace.save_image_file( - bin_image, file_id, self.output_file_grp, - page_id=page_id) + bin_image_path = join(self.output_file_grp, f'{bin_image_id}.png') # update PAGE (reference the image file): - line.add_AlternativeImage(AlternativeImageType( - filename=file_path, - comments=features)) + line.add_AlternativeImage(AlternativeImageType(filename=bin_image_path, comments=features)) + return bin_image, bin_image_id, bin_image_path From dcaccd4b5bb357c4f73356aaed04fd8a4483caa8 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Wed, 14 Aug 2024 00:46:34 +0200 Subject: [PATCH 13/97] remove file grp cardinality asserts --- ocrd_cis/ocropy/binarize.py | 3 --- ocrd_cis/ocropy/clip.py | 3 --- ocrd_cis/ocropy/denoise.py | 3 --- ocrd_cis/ocropy/deskew.py | 3 --- ocrd_cis/ocropy/dewarp.py | 3 --- ocrd_cis/ocropy/recognize.py | 3 --- ocrd_cis/ocropy/resegment.py | 3 --- ocrd_cis/ocropy/segment.py | 3 --- 8 files changed, 24 deletions(-) diff --git a/ocrd_cis/ocropy/binarize.py b/ocrd_cis/ocropy/binarize.py index 0728f852..746aba5e 
100644 --- a/ocrd_cis/ocropy/binarize.py +++ b/ocrd_cis/ocropy/binarize.py @@ -13,7 +13,6 @@ from ocrd_utils import ( getLogger, make_file_id, - assert_file_grp_cardinality, MIMETYPE_PAGE ) from ocrd_models.ocrd_page import AlternativeImageType, OcrdPage, to_xml @@ -69,8 +68,6 @@ def executable(self): def setup(self): self.logger = getLogger('processor.OcropyBinarize') - assert_file_grp_cardinality(self.input_file_grp, 1) - assert_file_grp_cardinality(self.output_file_grp, 1) method = self.parameter['method'] if self.parameter['grayscale'] and method != 'ocropy': self.logger.critical(f'Requested method {method} does not support grayscale normalized output') diff --git a/ocrd_cis/ocropy/clip.py b/ocrd_cis/ocropy/clip.py index 62f68fcf..3e76157b 100644 --- a/ocrd_cis/ocropy/clip.py +++ b/ocrd_cis/ocropy/clip.py @@ -15,7 +15,6 @@ from ocrd_utils import ( getLogger, make_file_id, - assert_file_grp_cardinality, coordinates_of_segment, polygon_from_points, bbox_from_polygon, @@ -39,8 +38,6 @@ def executable(self): def setup(self): self.logger = getLogger('processor.OcropyClip') - assert_file_grp_cardinality(self.input_file_grp, 1) - assert_file_grp_cardinality(self.output_file_grp, 1) def process(self): """Clip text regions / lines of the workspace at intersections with neighbours. diff --git a/ocrd_cis/ocropy/denoise.py b/ocrd_cis/ocropy/denoise.py index a68e2e3c..24852f24 100644 --- a/ocrd_cis/ocropy/denoise.py +++ b/ocrd_cis/ocropy/denoise.py @@ -5,7 +5,6 @@ from ocrd_utils import ( getLogger, make_file_id, - assert_file_grp_cardinality, MIMETYPE_PAGE ) from ocrd_modelfactory import page_from_file @@ -27,8 +26,6 @@ def executable(self): def setup(self): self.logger = getLogger('processor.OcropyDenoise') - assert_file_grp_cardinality(self.input_file_grp, 1) - assert_file_grp_cardinality(self.output_file_grp, 1) def process(self): """Despeckle the pages / regions / lines of the workspace. diff --git a/ocrd_cis/ocropy/deskew.py b/ocrd_cis/ocropy/deskew.py index e41a557d..616864e1 100644 --- a/ocrd_cis/ocropy/deskew.py +++ b/ocrd_cis/ocropy/deskew.py @@ -5,7 +5,6 @@ from ocrd_utils import ( getLogger, make_file_id, - assert_file_grp_cardinality, MIMETYPE_PAGE ) from ocrd_modelfactory import page_from_file @@ -34,8 +33,6 @@ def executable(self): def setup(self): self.logger = getLogger('processor.OcropyDeskew') - assert_file_grp_cardinality(self.input_file_grp, 1) - assert_file_grp_cardinality(self.output_file_grp, 1) def process(self): """Deskew the pages or regions of the workspace. 
diff --git a/ocrd_cis/ocropy/dewarp.py b/ocrd_cis/ocropy/dewarp.py index bb9e4098..17b69bc5 100644 --- a/ocrd_cis/ocropy/dewarp.py +++ b/ocrd_cis/ocropy/dewarp.py @@ -6,7 +6,6 @@ from ocrd_utils import ( getLogger, make_file_id, - assert_file_grp_cardinality, ) from ocrd_modelfactory import page_from_file from ocrd_models.ocrd_page import ( @@ -71,8 +70,6 @@ def executable(self): def setup(self): self.logger = getLogger('processor.OcropyDewarp') - assert_file_grp_cardinality(self.input_file_grp, 1) - assert_file_grp_cardinality(self.output_file_grp, 1) # defaults from ocrolib.lineest: self.lnorm = lineest.CenterNormalizer( params=(self.parameter['range'], diff --git a/ocrd_cis/ocropy/recognize.py b/ocrd_cis/ocropy/recognize.py index 5880675c..40de2817 100644 --- a/ocrd_cis/ocropy/recognize.py +++ b/ocrd_cis/ocropy/recognize.py @@ -12,7 +12,6 @@ from ocrd_utils import ( getLogger, make_file_id, - assert_file_grp_cardinality, coordinates_for_segment, polygon_from_bbox, points_from_polygon, @@ -88,8 +87,6 @@ def executable(self): def setup(self): self.logger = getLogger('processor.OcropyRecognize') self.pad = 16 - assert_file_grp_cardinality(self.input_file_grp, 1) - assert_file_grp_cardinality(self.output_file_grp, 1) # from ocropus-rpred: self.network = load_object(self.get_model(), verbose=1) for x in self.network.walk(): diff --git a/ocrd_cis/ocropy/resegment.py b/ocrd_cis/ocropy/resegment.py index 17b90f65..2483411d 100644 --- a/ocrd_cis/ocropy/resegment.py +++ b/ocrd_cis/ocropy/resegment.py @@ -12,7 +12,6 @@ from ocrd_utils import ( getLogger, make_file_id, - assert_file_grp_cardinality, coordinates_of_segment, coordinates_for_segment, points_from_polygon, @@ -52,8 +51,6 @@ def executable(self): def setup(self): self.logger = getLogger('processor.OcropyResegment') - assert_file_grp_cardinality(self.input_file_grp, 1) - assert_file_grp_cardinality(self.output_file_grp, 1) def process(self): """Resegment lines of the workspace. diff --git a/ocrd_cis/ocropy/segment.py b/ocrd_cis/ocropy/segment.py index f886e1d1..9a1b8e11 100644 --- a/ocrd_cis/ocropy/segment.py +++ b/ocrd_cis/ocropy/segment.py @@ -38,7 +38,6 @@ from ocrd_utils import ( getLogger, make_file_id, - assert_file_grp_cardinality, coordinates_of_segment, coordinates_for_segment, points_from_polygon, @@ -252,8 +251,6 @@ def executable(self): def setup(self): self.logger = getLogger('processor.OcropySegment') - assert_file_grp_cardinality(self.input_file_grp, 1) - assert_file_grp_cardinality(self.output_file_grp, 1) def process(self): """Segment pages into regions+lines, tables into cells+lines, or regions into lines. 
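For context on the lines deleted in this commit: assert_file_grp_cardinality() from ocrd_utils only verified that the comma-separated input/output fileGrp strings name exactly one group each. Dropping the calls assumes that the ocrd core framework now enforces the declared cardinality itself, so the per-processor assertions become redundant. A minimal sketch of the guard that goes away (a hypothetical helper for illustration, not a function from ocrd_cis or ocrd_utils):

    # illustrative stand-in for the removed assert_file_grp_cardinality() calls
    def check_file_grp_cardinality(file_grp: str, expected: int = 1) -> None:
        # fileGrp arguments are comma-separated lists of METS file group names
        groups = file_grp.split(',') if file_grp else []
        if len(groups) != expected:
            raise ValueError(
                f"expected {expected} file group(s), got {len(groups)}: {file_grp!r}")

    check_file_grp_cardinality("OCR-D-GT-SEG-LINE")      # passes: exactly one group
    # check_file_grp_cardinality("OCR-D-IMG,OCR-D-BIN")  # would raise ValueError
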
From b178227763b834802b1e775623402b7bb5cdf84c Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Wed, 14 Aug 2024 10:51:52 +0200 Subject: [PATCH 14/97] Update ocrd_cis/ocropy/binarize.py Co-authored-by: Konstantin Baierer --- ocrd_cis/ocropy/binarize.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ocrd_cis/ocropy/binarize.py b/ocrd_cis/ocropy/binarize.py index 746aba5e..27a3667c 100644 --- a/ocrd_cis/ocropy/binarize.py +++ b/ocrd_cis/ocropy/binarize.py @@ -118,7 +118,7 @@ def process_page_pcgts(self, *input_pcgts, output_file_id: str = None, page_id: else: # region regions = page.get_AllRegions(classes=['Text'], order='reading-order') if not regions: - self.logger.warning(f"Page '{page_id}' contains no text regions") + self.logger.warning(f"Page '{page_id}' contains no regions") for region in regions: region_image, region_xywh = self.workspace.image_from_segment( region, page_image, page_xywh, feature_filter='binarized') From 67b6107e19c604063e9dae37473fcc48e04b4558 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Wed, 14 Aug 2024 10:52:25 +0200 Subject: [PATCH 15/97] Update ocrd_cis/ocropy/binarize.py Co-authored-by: Konstantin Baierer --- ocrd_cis/ocropy/binarize.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ocrd_cis/ocropy/binarize.py b/ocrd_cis/ocropy/binarize.py index 27a3667c..fea064af 100644 --- a/ocrd_cis/ocropy/binarize.py +++ b/ocrd_cis/ocropy/binarize.py @@ -76,7 +76,7 @@ def setup(self): def process_page_pcgts(self, *input_pcgts, output_file_id: str = None, page_id: str = None) -> OcrdPage: """Binarize (and optionally deskew/despeckle) the pages/regions/lines of the workspace. - THEN Iterate over the PAGE-XML element hierarchy down to the requested + Iterate over the PAGE-XML element hierarchy down to the requested ``level-of-operation``. Next, for each file, crop each segment image according to the layout From 06a98b1f601d80511e73b0c366a60f574e2a8e27 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Wed, 14 Aug 2024 10:55:29 +0200 Subject: [PATCH 16/97] Update ocrd_cis/ocropy/binarize.py Co-authored-by: Konstantin Baierer --- ocrd_cis/ocropy/binarize.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ocrd_cis/ocropy/binarize.py b/ocrd_cis/ocropy/binarize.py index fea064af..7e355d73 100644 --- a/ocrd_cis/ocropy/binarize.py +++ b/ocrd_cis/ocropy/binarize.py @@ -71,7 +71,7 @@ def setup(self): method = self.parameter['method'] if self.parameter['grayscale'] and method != 'ocropy': self.logger.critical(f'Requested method {method} does not support grayscale normalized output') - raise Exception('only method=ocropy allows grayscale=true') + raise ValueError('only method=ocropy allows grayscale=true') def process_page_pcgts(self, *input_pcgts, output_file_id: str = None, page_id: str = None) -> OcrdPage: """Binarize (and optionally deskew/despeckle) the pages/regions/lines of the workspace. From 1e6cd7bd53547de5c41f2100cdad8adc1a2091ca Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Wed, 14 Aug 2024 10:55:45 +0200 Subject: [PATCH 17/97] Update ocrd_cis/ocropy/binarize.py Co-authored-by: Konstantin Baierer --- ocrd_cis/ocropy/binarize.py | 1 - 1 file changed, 1 deletion(-) diff --git a/ocrd_cis/ocropy/binarize.py b/ocrd_cis/ocropy/binarize.py index 7e355d73..af60e613 100644 --- a/ocrd_cis/ocropy/binarize.py +++ b/ocrd_cis/ocropy/binarize.py @@ -21,7 +21,6 @@ from . 
import common from .common import array2pil, determine_zoom, pil2array, remove_noise -#sys.path.append(dirname(abspath(__file__))) def binarize(pil_image, method='ocropy', maxskew=2, threshold=0.5, nrm=False, zoom=1.0): LOG = getLogger('processor.OcropyBinarize') From 71bb26d9c4f0b45498625b90c9e4cd136d8e667e Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Wed, 14 Aug 2024 11:04:12 +0200 Subject: [PATCH 18/97] fix: potentially wrong dpi in logs --- ocrd_cis/ocropy/binarize.py | 4 ++-- ocrd_cis/ocropy/clip.py | 4 ++-- ocrd_cis/ocropy/common.py | 4 ++-- ocrd_cis/ocropy/denoise.py | 4 ++-- ocrd_cis/ocropy/dewarp.py | 4 ++-- ocrd_cis/ocropy/resegment.py | 4 ++-- ocrd_cis/ocropy/segment.py | 4 ++-- 7 files changed, 14 insertions(+), 14 deletions(-) diff --git a/ocrd_cis/ocropy/binarize.py b/ocrd_cis/ocropy/binarize.py index af60e613..61e959ca 100644 --- a/ocrd_cis/ocropy/binarize.py +++ b/ocrd_cis/ocropy/binarize.py @@ -102,8 +102,8 @@ def process_page_pcgts(self, *input_pcgts, output_file_id: str = None, page_id: assert page page_image, page_xywh, page_image_info = self.workspace.image_from_page(page, page_id, feature_filter='binarized') - zoom = determine_zoom(self.parameter['dpi'], page_image_info) - self.logger.info(f"Page '{page_id}' uses {self.parameter['dpi']} DPI") + zoom, dpi = determine_zoom(self.parameter['dpi'], page_image_info) + self.logger.info(f"Page '{page_id}' uses {dpi} DPI. Determined zoom={zoom}") ret = [pcgts] if level == 'page': diff --git a/ocrd_cis/ocropy/clip.py b/ocrd_cis/ocropy/clip.py index 3e76157b..3607399b 100644 --- a/ocrd_cis/ocropy/clip.py +++ b/ocrd_cis/ocropy/clip.py @@ -87,8 +87,8 @@ def process(self): page_image, page_coords, page_image_info = self.workspace.image_from_page( page, page_id, feature_selector='binarized') - zoom = determine_zoom(self.parameter['dpi'], page_image_info) - self.logger.info(f"Page '{page_id}' uses {self.parameter['dpi']} DPI") + zoom, dpi = determine_zoom(self.parameter['dpi'], page_image_info) + self.logger.info(f"Page '{page_id}' uses {dpi} DPI. Determined zoom={zoom}") # FIXME: what about text regions inside table regions? regions = list(page.get_TextRegion()) diff --git a/ocrd_cis/ocropy/common.py b/ocrd_cis/ocropy/common.py index 1804c29d..49e8f248 100644 --- a/ocrd_cis/ocropy/common.py +++ b/ocrd_cis/ocropy/common.py @@ -2103,7 +2103,7 @@ def find_topological(): # DSAVE('rlabels_closed', rlabels) return rlabels -def determine_zoom(dpi: float, page_image_info: OcrdExif) -> float: +def determine_zoom(dpi: float, page_image_info: OcrdExif) -> (float, float): if dpi > 0: zoom = 300.0/dpi elif page_image_info.resolution != 1: @@ -2113,4 +2113,4 @@ def determine_zoom(dpi: float, page_image_info: OcrdExif) -> float: zoom = 300.0/dpi else: zoom = 1 - return zoom + return zoom, dpi diff --git a/ocrd_cis/ocropy/denoise.py b/ocrd_cis/ocropy/denoise.py index 24852f24..713af889 100644 --- a/ocrd_cis/ocropy/denoise.py +++ b/ocrd_cis/ocropy/denoise.py @@ -63,8 +63,8 @@ def process(self): page, page_id, feature_selector='binarized' if level == 'page' else '') - zoom = determine_zoom(self.parameter['dpi'], page_image_info) - self.logger.info(f"Page '{page_id}' uses {self.parameter['dpi']} DPI") + zoom, dpi = determine_zoom(self.parameter['dpi'], page_image_info) + self.logger.info(f"Page '{page_id}' uses {dpi} DPI. 
Determined zoom={zoom}") if level == 'page': self.process_segment(page, page_image, page_xywh, zoom, diff --git a/ocrd_cis/ocropy/dewarp.py b/ocrd_cis/ocropy/dewarp.py index 17b69bc5..412724db 100644 --- a/ocrd_cis/ocropy/dewarp.py +++ b/ocrd_cis/ocropy/dewarp.py @@ -112,8 +112,8 @@ def process(self): page_image, page_xywh, page_image_info = self.workspace.image_from_page( page, page_id) - zoom = determine_zoom(self.parameter['dpi'], page_image_info) - self.logger.info(f"Page '{page_id}' uses {self.parameter['dpi']} DPI") + zoom, dpi = determine_zoom(self.parameter['dpi'], page_image_info) + self.logger.info(f"Page '{page_id}' uses {dpi} DPI. Determined zoom={zoom}") regions = page.get_AllRegions(classes=['Text'], order='reading-order') if not regions: diff --git a/ocrd_cis/ocropy/resegment.py b/ocrd_cis/ocropy/resegment.py index 2483411d..5bc9d008 100644 --- a/ocrd_cis/ocropy/resegment.py +++ b/ocrd_cis/ocropy/resegment.py @@ -117,8 +117,8 @@ def process(self): page_image, page_coords, page_image_info = self.workspace.image_from_page( page, page_id, feature_selector='binarized') - zoom = determine_zoom(self.parameter['dpi'], page_image_info) - self.logger.info(f"Page '{page_id}' uses {self.parameter['dpi']} DPI") + zoom, dpi = determine_zoom(self.parameter['dpi'], page_image_info) + self.logger.info(f"Page '{page_id}' uses {dpi} DPI. Determined zoom={zoom}") ignore = (page.get_ImageRegion() + page.get_LineDrawingRegion() + diff --git a/ocrd_cis/ocropy/segment.py b/ocrd_cis/ocropy/segment.py index 9a1b8e11..d171b6ed 100644 --- a/ocrd_cis/ocropy/segment.py +++ b/ocrd_cis/ocropy/segment.py @@ -339,8 +339,8 @@ def process(self): # TODO: also allow grayscale_normalized (try/except?) page_image, page_coords, page_image_info = self.workspace.image_from_page( page, page_id, feature_selector='binarized') - zoom = determine_zoom(self.parameter['dpi'], page_image_info) - self.logger.info(f"Page '{page_id}' uses {self.parameter['dpi']} DPI") + zoom, dpi = determine_zoom(self.parameter['dpi'], page_image_info) + self.logger.info(f"Page '{page_id}' uses {dpi} DPI. 
Determined zoom={zoom}") # aggregate existing regions so their foreground can be ignored ignore = (page.get_ImageRegion() + From 64f02a32f938a00e01d6d390993246a617cbab5e Mon Sep 17 00:00:00 2001 From: kba Date: Wed, 14 Aug 2024 11:14:31 +0200 Subject: [PATCH 19/97] binarize: don't conflate region/lines seg, pass output_file_id --- ocrd_cis/ocropy/binarize.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/ocrd_cis/ocropy/binarize.py b/ocrd_cis/ocropy/binarize.py index 61e959ca..817d4a8a 100644 --- a/ocrd_cis/ocropy/binarize.py +++ b/ocrd_cis/ocropy/binarize.py @@ -123,7 +123,8 @@ def process_page_pcgts(self, *input_pcgts, output_file_id: str = None, page_id: region, page_image, page_xywh, feature_filter='binarized') if level == 'region': try: - ret.append(self.process_region(region, region_image, region_xywh, zoom, page_id, file_id)) + ret.append(self.process_region(region, region_image, region_xywh, zoom, region.id, output_file_id)) + continue except ValueError as e: self.logger.exception(e) lines = region.get_TextLine() @@ -133,8 +134,8 @@ def process_page_pcgts(self, *input_pcgts, output_file_id: str = None, page_id: line_image, line_xywh = self.workspace.image_from_segment( line, region_image, region_xywh, feature_filter='binarized') try: - ret.append(self.process_line(line, line_image, line_xywh, zoom, page_id, region.id, file_id)) - except ValueError as e: + ret.append(self.process_line(line, line_image, line_xywh, zoom, page_id, region.id, output_file_id)) + except alueError as e: self.logger.exception(e) return ret From d7c15c7738cdad474eb1999718c41371192e0e14 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Wed, 14 Aug 2024 11:29:21 +0200 Subject: [PATCH 20/97] Update binarize.py --- ocrd_cis/ocropy/binarize.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ocrd_cis/ocropy/binarize.py b/ocrd_cis/ocropy/binarize.py index 817d4a8a..064a733e 100644 --- a/ocrd_cis/ocropy/binarize.py +++ b/ocrd_cis/ocropy/binarize.py @@ -135,7 +135,7 @@ def process_page_pcgts(self, *input_pcgts, output_file_id: str = None, page_id: line, region_image, region_xywh, feature_filter='binarized') try: ret.append(self.process_line(line, line_image, line_xywh, zoom, page_id, region.id, output_file_id)) - except alueError as e: + except ValueError as e: self.logger.exception(e) return ret From 19566c0567b5b23bdc4596384d3867601045ca57 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Wed, 14 Aug 2024 13:53:35 +0200 Subject: [PATCH 21/97] try to migrate recognize --- ocrd_cis/ocropy/recognize.py | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/ocrd_cis/ocropy/recognize.py b/ocrd_cis/ocropy/recognize.py index 40de2817..140a3c83 100644 --- a/ocrd_cis/ocropy/recognize.py +++ b/ocrd_cis/ocropy/recognize.py @@ -115,6 +115,30 @@ def get_model(self): self.parameter['model'], self.parameter['model']) exit(1) + def process_page_pcgts(self, *input_pcgts, output_file_id: str = None, page_id: str = None) -> OcrdPage: + maxlevel = self.parameter['textequiv_level'] + assert self.workspace + self.logger.debug(f'Max level: "{maxlevel}"') + + pcgts = input_pcgts[0] + page = pcgts.get_Page() + assert page + + page_image, page_coords, _ = self.workspace.image_from_page(page, page_id) + ret = [pcgts] + + self.logger.info(f"Recognizing text in page '{page_id}'") + # region, line, word, or glyph level: + regions = page.get_AllRegions(classes=['Text']) + if not regions: + self.logger.warning(f"Page '{page_id}' contains no text regions") + 
self.process_regions(regions, maxlevel, page_image, page_coords) + + file_path = join(self.output_file_grp, output_file_id + '.xml') + ret.append((output_file_id, file_path)) + return ret + + # TODO: remove when `process_page_pcgts` is validated to be correct def process(self): """Recognize lines / words / glyphs of the workspace. From 5f60976452011656fd05c1375055dd5ebd5f89d9 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Wed, 14 Aug 2024 13:59:33 +0200 Subject: [PATCH 22/97] fix: migrate recognize --- ocrd_cis/ocropy/recognize.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/ocrd_cis/ocropy/recognize.py b/ocrd_cis/ocropy/recognize.py index 140a3c83..9729b480 100644 --- a/ocrd_cis/ocropy/recognize.py +++ b/ocrd_cis/ocropy/recognize.py @@ -125,18 +125,13 @@ def process_page_pcgts(self, *input_pcgts, output_file_id: str = None, page_id: assert page page_image, page_coords, _ = self.workspace.image_from_page(page, page_id) - ret = [pcgts] - self.logger.info(f"Recognizing text in page '{page_id}'") # region, line, word, or glyph level: regions = page.get_AllRegions(classes=['Text']) if not regions: self.logger.warning(f"Page '{page_id}' contains no text regions") self.process_regions(regions, maxlevel, page_image, page_coords) - - file_path = join(self.output_file_grp, output_file_id + '.xml') - ret.append((output_file_id, file_path)) - return ret + return [pcgts] # TODO: remove when `process_page_pcgts` is validated to be correct def process(self): From e8b26035f0d4bd84e689ce92f8da805cb0adaf13 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Wed, 14 Aug 2024 14:35:53 +0200 Subject: [PATCH 23/97] fix: detect_zoom logging --- ocrd_cis/ocropy/binarize.py | 5 ++--- ocrd_cis/ocropy/clip.py | 4 ++-- ocrd_cis/ocropy/common.py | 5 +++-- ocrd_cis/ocropy/denoise.py | 3 +-- ocrd_cis/ocropy/dewarp.py | 3 +-- ocrd_cis/ocropy/resegment.py | 3 +-- ocrd_cis/ocropy/segment.py | 3 +-- 7 files changed, 11 insertions(+), 15 deletions(-) diff --git a/ocrd_cis/ocropy/binarize.py b/ocrd_cis/ocropy/binarize.py index 064a733e..387c51dc 100644 --- a/ocrd_cis/ocropy/binarize.py +++ b/ocrd_cis/ocropy/binarize.py @@ -102,9 +102,8 @@ def process_page_pcgts(self, *input_pcgts, output_file_id: str = None, page_id: assert page page_image, page_xywh, page_image_info = self.workspace.image_from_page(page, page_id, feature_filter='binarized') - zoom, dpi = determine_zoom(self.parameter['dpi'], page_image_info) - self.logger.info(f"Page '{page_id}' uses {dpi} DPI. Determined zoom={zoom}") - + zoom = determine_zoom(self.logger, self.parameter['dpi'], page_image_info) + ret = [pcgts] if level == 'page': try: diff --git a/ocrd_cis/ocropy/clip.py b/ocrd_cis/ocropy/clip.py index 3607399b..dd0de012 100644 --- a/ocrd_cis/ocropy/clip.py +++ b/ocrd_cis/ocropy/clip.py @@ -87,8 +87,8 @@ def process(self): page_image, page_coords, page_image_info = self.workspace.image_from_page( page, page_id, feature_selector='binarized') - zoom, dpi = determine_zoom(self.parameter['dpi'], page_image_info) - self.logger.info(f"Page '{page_id}' uses {dpi} DPI. Determined zoom={zoom}") + # TODO: zoom is not used anywhere, is it still useful to have this call here? + zoom = determine_zoom(self.logger, self.parameter['dpi'], page_image_info) # FIXME: what about text regions inside table regions? 
regions = list(page.get_TextRegion()) diff --git a/ocrd_cis/ocropy/common.py b/ocrd_cis/ocropy/common.py index 49e8f248..095de5eb 100644 --- a/ocrd_cis/ocropy/common.py +++ b/ocrd_cis/ocropy/common.py @@ -2103,14 +2103,15 @@ def find_topological(): # DSAVE('rlabels_closed', rlabels) return rlabels -def determine_zoom(dpi: float, page_image_info: OcrdExif) -> (float, float): +def determine_zoom(logger: logging.Logger, dpi: float, page_image_info: OcrdExif) -> float: if dpi > 0: zoom = 300.0/dpi elif page_image_info.resolution != 1: dpi = page_image_info.resolution if page_image_info.resolutionUnit == 'cm': dpi *= 2.54 + logger.info(f"Page '{page_id}' uses {dpi} DPI.") zoom = 300.0/dpi else: zoom = 1 - return zoom, dpi + return zoom diff --git a/ocrd_cis/ocropy/denoise.py b/ocrd_cis/ocropy/denoise.py index 713af889..78d11c28 100644 --- a/ocrd_cis/ocropy/denoise.py +++ b/ocrd_cis/ocropy/denoise.py @@ -63,8 +63,7 @@ def process(self): page, page_id, feature_selector='binarized' if level == 'page' else '') - zoom, dpi = determine_zoom(self.parameter['dpi'], page_image_info) - self.logger.info(f"Page '{page_id}' uses {dpi} DPI. Determined zoom={zoom}") + zoom = determine_zoom(self.logger, self.parameter['dpi'], page_image_info) if level == 'page': self.process_segment(page, page_image, page_xywh, zoom, diff --git a/ocrd_cis/ocropy/dewarp.py b/ocrd_cis/ocropy/dewarp.py index 412724db..9dddae44 100644 --- a/ocrd_cis/ocropy/dewarp.py +++ b/ocrd_cis/ocropy/dewarp.py @@ -112,8 +112,7 @@ def process(self): page_image, page_xywh, page_image_info = self.workspace.image_from_page( page, page_id) - zoom, dpi = determine_zoom(self.parameter['dpi'], page_image_info) - self.logger.info(f"Page '{page_id}' uses {dpi} DPI. Determined zoom={zoom}") + zoom = determine_zoom(self.logger, self.parameter['dpi'], page_image_info) regions = page.get_AllRegions(classes=['Text'], order='reading-order') if not regions: diff --git a/ocrd_cis/ocropy/resegment.py b/ocrd_cis/ocropy/resegment.py index 5bc9d008..e8c52a69 100644 --- a/ocrd_cis/ocropy/resegment.py +++ b/ocrd_cis/ocropy/resegment.py @@ -117,8 +117,7 @@ def process(self): page_image, page_coords, page_image_info = self.workspace.image_from_page( page, page_id, feature_selector='binarized') - zoom, dpi = determine_zoom(self.parameter['dpi'], page_image_info) - self.logger.info(f"Page '{page_id}' uses {dpi} DPI. Determined zoom={zoom}") + zoom = determine_zoom(self.logger, self.parameter['dpi'], page_image_info) ignore = (page.get_ImageRegion() + page.get_LineDrawingRegion() + diff --git a/ocrd_cis/ocropy/segment.py b/ocrd_cis/ocropy/segment.py index d171b6ed..c092718f 100644 --- a/ocrd_cis/ocropy/segment.py +++ b/ocrd_cis/ocropy/segment.py @@ -339,8 +339,7 @@ def process(self): # TODO: also allow grayscale_normalized (try/except?) page_image, page_coords, page_image_info = self.workspace.image_from_page( page, page_id, feature_selector='binarized') - zoom, dpi = determine_zoom(self.parameter['dpi'], page_image_info) - self.logger.info(f"Page '{page_id}' uses {dpi} DPI. 
Determined zoom={zoom}") + zoom = determine_zoom(self.logger, self.parameter['dpi'], page_image_info) # aggregate existing regions so their foreground can be ignored ignore = (page.get_ImageRegion() + From 7dfd4964be3f4e4db9bfe6ff548eda477ed36ae6 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Wed, 14 Aug 2024 14:38:05 +0200 Subject: [PATCH 24/97] update: test_lib base url --- tests/test_lib.bash | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_lib.bash b/tests/test_lib.bash index f28acb1e..c018d253 100644 --- a/tests/test_lib.bash +++ b/tests/test_lib.bash @@ -5,7 +5,7 @@ trap "trap 'echo exiting without removing $tmpdir' EXIT" ERR trap "rm -rf $tmpdir" EXIT OCRD_CIS_FILEGRP="OCR-D-GT-SEG-LINE" -data_url="https://github.com/OCR-D/gt_structure_text/releases/download/v1.2.4/" +data_url="https://github.com/OCR-D/gt_structure_text/releases/tag/v1.5.0/" function ocrd_cis_download_bagit() { local url="$data_url/$1" mkdir -p "$PWD/download" From 033c38ac3e3a6fdd9e74ab502d792878aad77439 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Wed, 14 Aug 2024 15:07:22 +0200 Subject: [PATCH 25/97] logging exception -> error --- ocrd_cis/ocropy/binarize.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/ocrd_cis/ocropy/binarize.py b/ocrd_cis/ocropy/binarize.py index 387c51dc..0ea170e4 100644 --- a/ocrd_cis/ocropy/binarize.py +++ b/ocrd_cis/ocropy/binarize.py @@ -109,7 +109,7 @@ def process_page_pcgts(self, *input_pcgts, output_file_id: str = None, page_id: try: ret.append(self.process_page(page, page_image, page_xywh, zoom, page_id, output_file_id)) except ValueError as e: - self.logger.exception(e) + self.logger.error(e) else: if level == 'table': regions = page.get_TableRegion() @@ -125,7 +125,7 @@ def process_page_pcgts(self, *input_pcgts, output_file_id: str = None, page_id: ret.append(self.process_region(region, region_image, region_xywh, zoom, region.id, output_file_id)) continue except ValueError as e: - self.logger.exception(e) + self.logger.error(e) lines = region.get_TextLine() if not lines: self.logger.warning(f"Page '{page_id}' region '{region.id}' contains no text lines") @@ -135,7 +135,7 @@ def process_page_pcgts(self, *input_pcgts, output_file_id: str = None, page_id: try: ret.append(self.process_line(line, line_image, line_xywh, zoom, page_id, region.id, output_file_id)) except ValueError as e: - self.logger.exception(e) + self.logger.error(e) return ret def process_page(self, page, page_image, page_xywh, zoom, page_id, file_id) -> Tuple[Image.Image, str, str]: From 46d84d58b7474adc3cb9f9b756b215efebd495e3 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Wed, 14 Aug 2024 15:50:10 +0200 Subject: [PATCH 26/97] refactor: logger as a first positional argument --- ocrd_cis/ocropy/binarize.py | 9 +++++--- ocrd_cis/ocropy/resegment.py | 18 +++++++-------- ocrd_cis/ocropy/segment.py | 43 +++++++++++++++--------------------- 3 files changed, 32 insertions(+), 38 deletions(-) diff --git a/ocrd_cis/ocropy/binarize.py b/ocrd_cis/ocropy/binarize.py index 0ea170e4..8f7d8d3a 100644 --- a/ocrd_cis/ocropy/binarize.py +++ b/ocrd_cis/ocropy/binarize.py @@ -22,9 +22,8 @@ from .common import array2pil, determine_zoom, pil2array, remove_noise -def binarize(pil_image, method='ocropy', maxskew=2, threshold=0.5, nrm=False, zoom=1.0): - LOG = getLogger('processor.OcropyBinarize') - LOG.debug('binarizing %dx%d image with method=%s', pil_image.width, pil_image.height, method) +def binarize(logger: Logger, pil_image, method='ocropy', maxskew=2, 
threshold=0.5, nrm=False, zoom=1.0): + logger.debug('binarizing %dx%d image with method=%s', pil_image.width, pil_image.height, method) if method == 'none': # useful if the images are already binary, # but lack image attribute `binarized` @@ -152,6 +151,7 @@ def process_page(self, page, page_image, page_xywh, zoom, page_id, file_id) -> T else: maxskew = self.parameter['maxskew'] bin_image, angle = binarize( + self.logger, page_image, method=self.parameter['method'], maxskew=maxskew, @@ -191,6 +191,7 @@ def process_region(self, region, region_image, region_xywh, zoom, page_id, file_ # orientation has already been annotated (by previous deskewing), # so skip deskewing here: bin_image, _ = binarize( + self.logger, region_image, method=self.parameter['method'], maxskew=0, @@ -198,6 +199,7 @@ def process_region(self, region, region_image, region_xywh, zoom, page_id, file_ zoom=zoom) else: bin_image, angle = binarize( + self.logger, region_image, method=self.parameter['method'], maxskew=self.parameter['maxskew'], @@ -235,6 +237,7 @@ def process_line( self.logger.info(f"About to binarize page '{page_id}' region '{region_id}' line '{line.id}'") features = line_xywh['features'] bin_image, angle = binarize( + self.logger, line_image, method=self.parameter['method'], maxskew=self.parameter['maxskew'], diff --git a/ocrd_cis/ocropy/resegment.py b/ocrd_cis/ocropy/resegment.py index e8c52a69..b18c0b5e 100644 --- a/ocrd_cis/ocropy/resegment.py +++ b/ocrd_cis/ocropy/resegment.py @@ -265,8 +265,8 @@ def _process_segment(self, parent, parent_image, parent_coords, page_id, zoom, l line_polygon[:, 0], parent_bin.shape) new_labels[line_y, line_x] = i + 1 - spread_dist(lines, line_labels, new_labels, parent_bin, components, parent_coords, - maxdist=maxdist or scale/2, loc=parent.id, threshold=threshold, logger=self.logger) + spread_dist(self.logger, lines, line_labels, new_labels, parent_bin, components, parent_coords, + maxdist=maxdist or scale/2, loc=parent.id, threshold=threshold) return try: # TODO: 'scale' passed as a param may not be always defined (mehmedGIT) @@ -280,9 +280,9 @@ def _process_segment(self, parent, parent_image, parent_coords, page_id, zoom, l self.logger.info("Found %d new line labels for %d existing lines on %s '%s'", new_line_labels.max(), len(lines), tag, parent.id) # polygonalize and prepare comparison - new_line_polygons, new_line_labels = masks2polygons( + new_line_polygons, new_line_labels = masks2polygons(self.logger, new_line_labels, new_baselines, parent_bin, '%s "%s"' % (tag, parent.id), - min_area=640/zoom/zoom, logger=self.logger) + min_area=640/zoom/zoom) DSAVE('line_labels', [np.argmax(np.insert(line_labels, 0, 0, axis=0), axis=0), parent_bin]) DSAVE('new_line_labels', [new_line_labels, parent_bin]) new_line_polygons, new_baselines = list(zip(*[(Polygon(poly), LineString(base)) @@ -392,8 +392,8 @@ def _process_segment(self, parent, parent_image, parent_coords, page_id, zoom, l self.logger.debug("joining %d new line polygons for '%s'", len(new_lines), line.id) new_polygon = join_polygons([new_line_polygons[i] #intersections[(i, j)] for i in new_lines], loc=line.id, scale=scale) - new_baseline = join_baselines([new_polygon.intersection(new_baselines[i]) - for i in new_lines], loc=line.id, logger=self.logger) + new_baseline = join_baselines(self.logger, [new_polygon.intersection(new_baselines[i]) + for i in new_lines], loc=line.id) # convert back to absolute (page) coordinates: line_polygon = coordinates_for_segment(new_polygon.exterior.coords[:-1], parent_image, parent_coords) 
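The signature changes in this commit all follow one pattern: helpers that previously took an optional trailing logger=None keyword (and had to raise if it was missing) now take the processor's logger as a mandatory first positional argument. A minimal before/after sketch with simplified stubs (not the actual ocrd_cis functions, whose bodies do real geometry work):

    from logging import Logger, getLogger

    # before: each helper had to guard against a missing logger itself
    def join_baselines_old(baselines, loc='', logger=None):
        if not logger:
            raise ValueError("Logger has not been passed by the caller")
        logger.debug("joining %d baselines for %s", len(baselines), loc)

    # after: the caller always supplies its processor logger up front
    def join_baselines_new(logger: Logger, baselines, loc=''):
        logger.debug("joining %d baselines for %s", len(baselines), loc)

    join_baselines_new(getLogger('processor.OcropyResegment'), [], loc='line1')
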
@@ -427,11 +427,9 @@ def _process_segment(self, parent, parent_image, parent_coords, page_id, zoom, l continue otherline.get_Coords().set_points(points_from_polygon(other_polygon)) -def spread_dist(lines, old_labels, new_labels, binarized, components, coords, - maxdist=43, loc='', threshold=0.9, logger = None): +def spread_dist(logger: Logger, lines, old_labels, new_labels, binarized, components, coords, + maxdist=43, loc='', threshold=0.9): """redefine line coordinates by contourizing spread of connected components propagated from new labels""" - if not logger: - raise ValueError(f"Logger has not been passed by the caller") DSAVE('seeds', [new_labels, (components>0)]) # allocate to connected components consistently # (ignoring smallest components like punctuation) diff --git a/ocrd_cis/ocropy/segment.py b/ocrd_cis/ocropy/segment.py index c092718f..782425cc 100644 --- a/ocrd_cis/ocropy/segment.py +++ b/ocrd_cis/ocropy/segment.py @@ -57,7 +57,7 @@ lines2regions ) -def masks2polygons(bg_labels, baselines, fg_bin, name, min_area=None, simplify=None, open_holes=False, reorder=True, logger=None): +def masks2polygons(logger: Logger, bg_labels, baselines, fg_bin, name, min_area=None, simplify=None, open_holes=False, reorder=True): """Convert label masks into polygon coordinates. Given a Numpy array of background labels ``bg_labels``, @@ -230,9 +230,9 @@ def getx(xy): # get baseline segments intersecting with this line mask # and concatenate them from left to right if baselines is not None: - base = join_baselines([baseline.intersection(polygon) + base = join_baselines(logger, [baseline.intersection(polygon) for baseline in baselines - if baseline.intersects(polygon)], name, logger) + if baseline.intersects(polygon)], name) if base is not None: base = base.coords else: @@ -416,7 +416,7 @@ def process(self): roelem = reading_order.get(region.id) # replace by empty group with same index and ref # (which can then take the cells as subregions) - reading_order[region.id] = page_subgroup_in_reading_order(roelem, self.logger) + reading_order[region.id] = page_subgroup_in_reading_order(self.logger, roelem) else: self.logger.warning('skipping table "%s" with existing TextRegions', region.id) continue @@ -434,7 +434,7 @@ def process(self): elif overwrite_order: # replace by empty ordered group with same (index and) ref # (which can then take the cells as subregions) - roelem = page_subgroup_in_reading_order(roelem, self.logger) + roelem = page_subgroup_in_reading_order(self.logger, roelem) reading_order[region.id] = roelem elif isinstance(roelem, (OrderedGroupType, OrderedGroupIndexedType)): self.logger.warning("Page '%s' table region '%s' already has an ordered group (%s)", @@ -446,7 +446,7 @@ def process(self): else: # replace regionRef(Indexed) by group with same index and ref # (which can then take the cells as subregions) - roelem = page_subgroup_in_reading_order(roelem, self.logger) + roelem = page_subgroup_in_reading_order(self.logger, roelem) reading_order[region.id] = roelem # go get TextRegions with TextLines (and SeparatorRegions) self._process_element(region, subignore, region_image, region_coords, @@ -661,16 +661,14 @@ def _process_element(self, element, ignore, image, coords, element_id, file_id, seps=np.maximum(sepmask, colseps)) region_mask |= region_line_labels > 0 # find contours for region (can be non-contiguous) - regions, _ = masks2polygons(region_mask * region_label, None, element_bin, + regions, _ = masks2polygons(self.logger, region_mask * region_label, None, element_bin, '%s "%s"' % 
(element_name, element_id), min_area=6000/zoom/zoom, - simplify=ignore_labels * ~(sep_bin), - logger=self.logger) + simplify=ignore_labels * ~(sep_bin)) # find contours for lines (can be non-contiguous) - lines, _ = masks2polygons(region_line_labels, baselines, element_bin, + lines, _ = masks2polygons(self.logger, region_line_labels, baselines, element_bin, 'region "%s"' % element_id, - min_area=640/zoom/zoom, - logger=self.logger) + min_area=640/zoom/zoom) # create new lines in new regions (allocating by intersection) line_polys = [Polygon(polygon) for _, polygon, _ in lines] for _, region_polygon, _ in regions: @@ -722,8 +720,8 @@ def _process_element(self, element, ignore, image, coords, element_id, file_id, # (e.g. drop-capitals or images) ... self.logger.info('Found %d large image regions for %s "%s"', images.max(), element_name, element_id) # find contours around region labels (can be non-contiguous): - image_polygons, _ = masks2polygons(images, None, element_bin, - '%s "%s"' % (element_name, element_id), self.logger) + image_polygons, _ = masks2polygons(self.logger, images, None, element_bin, + '%s "%s"' % (element_name, element_id)) for image_label, polygon, _ in image_polygons: # convert back to absolute (page) coordinates: region_polygon = coordinates_for_segment(polygon, image, coords) @@ -740,9 +738,9 @@ def _process_element(self, element, ignore, image, coords, element_id, file_id, # split detected separator labels into separator regions: self.logger.info('Found %d separators for %s "%s"', seplines.max(), element_name, element_id) # find contours around region labels (can be non-contiguous): - sep_polygons, _ = masks2polygons(seplines, None, element_bin, + sep_polygons, _ = masks2polygons(self.logger, seplines, None, element_bin, '%s "%s"' % (element_name, element_id), - open_holes=True, reorder=False, logger=self.logger) + open_holes=True, reorder=False) for sep_label, polygon, _ in sep_polygons: # convert back to absolute (page) coordinates: region_polygon = coordinates_for_segment(polygon, image, coords) @@ -774,9 +772,9 @@ def _process_element(self, element, ignore, image, coords, element_id, file_id, # ensure the new line labels do not extrude from the region: line_labels = line_labels * region_mask # find contours around labels (can be non-contiguous): - line_polygons, _ = masks2polygons(line_labels, baselines, element_bin, + line_polygons, _ = masks2polygons(self.logger, line_labels, baselines, element_bin, 'region "%s"' % element_id, - min_area=640/zoom/zoom, logger=self.logger) + min_area=640/zoom/zoom) line_no = 0 for line_label, polygon, baseline in line_polygons: # convert back to absolute (page) coordinates: @@ -918,9 +916,7 @@ def join_polygons(polygons, loc='', scale=20): jointp = make_valid(jointp) return jointp -def join_baselines(baselines, loc='', logger = None): - if not logger: - raise ValueError(f"Logger has not been passed by the caller") +def join_baselines(logger: Logger, baselines, loc=''): lines = [] for baseline in baselines: if (baseline.is_empty or @@ -1062,7 +1058,7 @@ def page_add_to_reading_order(rogroup, region_id, index=None): index += 1 return index -def page_subgroup_in_reading_order(roelem, logger = None): +def page_subgroup_in_reading_order(logger: Logger, roelem): """Replace given RO element by an equivalent OrderedGroup. Given a ReadingOrder element ``roelem`` (of any type), @@ -1076,9 +1072,6 @@ def page_subgroup_in_reading_order(roelem, logger = None): Return the new group object. 
""" - if not logger: - raise ValueError(f"Logger has not been passed by the caller") - if not roelem: logger.error('Cannot subgroup from empty ReadingOrder element') return roelem From f6fe4cf4caaf056ded182b498b44a610349627fc Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Wed, 14 Aug 2024 15:54:25 +0200 Subject: [PATCH 27/97] fix: test_lib.bash data url --- tests/test_lib.bash | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_lib.bash b/tests/test_lib.bash index c018d253..801be01a 100644 --- a/tests/test_lib.bash +++ b/tests/test_lib.bash @@ -5,7 +5,7 @@ trap "trap 'echo exiting without removing $tmpdir' EXIT" ERR trap "rm -rf $tmpdir" EXIT OCRD_CIS_FILEGRP="OCR-D-GT-SEG-LINE" -data_url="https://github.com/OCR-D/gt_structure_text/releases/tag/v1.5.0/" +data_url="https://github.com/OCR-D/gt_structure_text/releases/download/v1.5.0/" function ocrd_cis_download_bagit() { local url="$data_url/$1" mkdir -p "$PWD/download" From aed0f95ccdc0dfe4cc26982258ef1c8acd613e1e Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Wed, 14 Aug 2024 16:33:31 +0200 Subject: [PATCH 28/97] fix: recognize OcrdPage import --- ocrd_cis/ocropy/recognize.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ocrd_cis/ocropy/recognize.py b/ocrd_cis/ocropy/recognize.py index 9729b480..ccb019eb 100644 --- a/ocrd_cis/ocropy/recognize.py +++ b/ocrd_cis/ocropy/recognize.py @@ -19,7 +19,7 @@ ) from ocrd_modelfactory import page_from_file from ocrd_models.ocrd_page import ( - to_xml, TextEquivType, + to_xml, TextEquivType, OcrdPage, CoordsType, GlyphType, WordType ) from ocrd import Processor From 804f031221eb4e64649e167c2f554d26555d5637 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Wed, 14 Aug 2024 18:10:00 +0200 Subject: [PATCH 29/97] try to migrate clip --- ocrd_cis/ocropy/clip.py | 178 +++++++++++++++++++++++++++++++--------- 1 file changed, 138 insertions(+), 40 deletions(-) diff --git a/ocrd_cis/ocropy/clip.py b/ocrd_cis/ocropy/clip.py index dd0de012..0675257b 100644 --- a/ocrd_cis/ocropy/clip.py +++ b/ocrd_cis/ocropy/clip.py @@ -8,9 +8,7 @@ from shapely.prepared import prep from ocrd_modelfactory import page_from_file -from ocrd_models.ocrd_page import ( - to_xml, AlternativeImageType -) +from ocrd_models.ocrd_page import AlternativeImageType, OcrdPage, to_xml from ocrd import Processor from ocrd_utils import ( getLogger, @@ -39,6 +37,113 @@ def executable(self): def setup(self): self.logger = getLogger('processor.OcropyClip') + def process_page_pcgts(self, *input_pcgts, output_file_id: str = None, page_id: str = None) -> OcrdPage: + level = self.parameter['level-of-operation'] + assert self.workspace + self.logger.debug(f'Level of operation: "{level}"') + + pcgts = input_pcgts[0] + page = pcgts.get_Page() + assert page + + page_image, page_coords, page_image_info = self.workspace.image_from_page( + page, page_id, feature_selector='binarized') + # TODO: zoom is not used anywhere, is it still useful to have this call here? + zoom = determine_zoom(self.logger, self.parameter['dpi'], page_image_info) + ret = [pcgts] + + # FIXME: what about text regions inside table regions? 
+ regions = list(page.get_TextRegion()) + num_texts = len(regions) + regions += ( + page.get_AdvertRegion() + + page.get_ChartRegion() + + page.get_ChemRegion() + + page.get_GraphicRegion() + + page.get_ImageRegion() + + page.get_LineDrawingRegion() + + page.get_MathsRegion() + + page.get_MusicRegion() + + page.get_NoiseRegion() + + page.get_SeparatorRegion() + + page.get_TableRegion() + + page.get_UnknownRegion()) + if not num_texts: + self.logger.warning('Page "%s" contains no text regions', page_id) + background = ImageStat.Stat(page_image) + # workaround for Pillow#4925 + if len(background.bands) > 1: + background = tuple(background.median) + else: + background = background.median[0] + if level == 'region': + background_image = Image.new(page_image.mode, page_image.size, background) + page_array = pil2array(page_image) + page_bin = np.array(page_array <= midrange(page_array), np.uint8) + # in absolute coordinates merely for comparison/intersection + shapes = [Polygon(polygon_from_points(region.get_Coords().points)) for region in regions] + # in relative coordinates for mask/cropping + polygons = [coordinates_of_segment(region, page_image, page_coords) for region in regions] + for i, polygon in enumerate(polygons[num_texts:], num_texts): + # for non-text regions, extend mask by 3 pixels in each direction + # to ensure they do not leak components accidentally + # (accounts for bad cropping of such regions in GT): + polygon = Polygon(polygon).buffer(3).exterior.coords[:-1] # keep open + polygons[i] = polygon + masks = [pil2array(polygon_mask(page_image, polygon)).astype(np.uint8) for polygon in polygons] + for i, region in enumerate(regions): + if i >= num_texts: + break # keep non-text regions unchanged + if level == 'region': + if region.get_AlternativeImage(): + # FIXME: This should probably be an exception (bad workflow configuration). + self.logger.warning( + f'Page "{page_id}" region "{region.id}" already contains image data: skipping') + continue + shape = prep(shapes[i]) + neighbours = [(regionj, maskj) for shapej, regionj, maskj + in zip(shapes[:i] + shapes[i+1:], regions[:i] + regions[i+1:], masks[:i] + masks[i+1:]) + if shape.intersects(shapej)] + if neighbours: + segment_region_file_id = f"{output_file_id}_{region.id}" + ret.append(self.process_segment( + region, masks[i], polygons[i], neighbours, background_image, + page_image, page_coords, page_bin, page_id, segment_region_file_id)) + continue + # level == 'line': + lines = region.get_TextLine() + if not lines: + self.logger.warning(f'Page "{page_id}" region "{region.id}" contains no text lines') + continue + region_image, region_coords = self.workspace.image_from_segment( + region, page_image, page_coords, feature_selector='binarized') + background_image = Image.new(region_image.mode, region_image.size, background) + region_array = pil2array(region_image) + region_bin = np.array(region_array <= midrange(region_array), np.uint8) + # in absolute coordinates merely for comparison/intersection + shapes = [Polygon(polygon_from_points(line.get_Coords().points)) for line in lines] + # in relative coordinates for mask/cropping + polygons = [coordinates_of_segment(line, region_image, region_coords) for line in lines] + masks = [pil2array(polygon_mask(region_image, polygon)).astype(np.uint8) for polygon in polygons] + for j, line in enumerate(lines): + if line.get_AlternativeImage(): + # FIXME: This should probably be an exception (bad workflow configuration). 
+ self.logger.warning( + f'Page "{page_id}" region "{region.id}" line "{line.id}" already contains image ' + f'data: skipping') + continue + shape = prep(shapes[j]) + neighbours = [(linej, maskj) for shapej, linej, maskj + in zip(shapes[:j] + shapes[j+1:], lines[:j] + lines[j+1:], masks[:j] + masks[j+1:]) + if shape.intersects(shapej)] + if neighbours: + segment_line_file_id = f"{output_file_id}_{region.id}_{line.id}" + ret.append(self.process_segment( + line, masks[j], polygons[j], neighbours, background_image, + region_image, region_coords, region_bin, page_id, segment_line_file_id)) + return ret + + # TODO: remove when `process_page_pcgts` is validated to be correct def process(self): """Clip text regions / lines of the workspace at intersections with neighbours. @@ -119,27 +224,24 @@ def process(self): page_array = pil2array(page_image) page_bin = np.array(page_array <= midrange(page_array), np.uint8) # in absolute coordinates merely for comparison/intersection - shapes = [Polygon(polygon_from_points(region.get_Coords().points)) - for region in regions] + shapes = [Polygon(polygon_from_points(region.get_Coords().points)) for region in regions] # in relative coordinates for mask/cropping - polygons = [coordinates_of_segment(region, page_image, page_coords) - for region in regions] + polygons = [coordinates_of_segment(region, page_image, page_coords) for region in regions] for i, polygon in enumerate(polygons[num_texts:], num_texts): # for non-text regions, extend mask by 3 pixels in each direction # to ensure they do not leak components accidentally # (accounts for bad cropping of such regions in GT): polygon = Polygon(polygon).buffer(3).exterior.coords[:-1] # keep open polygons[i] = polygon - masks = [pil2array(polygon_mask(page_image, polygon)).astype(np.uint8) - for polygon in polygons] + masks = [pil2array(polygon_mask(page_image, polygon)).astype(np.uint8) for polygon in polygons] for i, region in enumerate(regions): if i >= num_texts: break # keep non-text regions unchanged if level == 'region': if region.get_AlternativeImage(): # FIXME: This should probably be an exception (bad workflow configuration). 
- self.logger.warning('Page "%s" region "%s" already contains image data: skipping', - page_id, region.id) + self.logger.warning( + f'Page "{page_id}" region "{region.id}" already contains image data: skipping') continue shape = prep(shapes[i]) neighbours = [(regionj, maskj) for shapej, regionj, maskj @@ -148,15 +250,15 @@ def process(self): masks[:i] + masks[i+1:]) if shape.intersects(shapej)] if neighbours: - self.process_segment(region, masks[i], polygons[i], - neighbours, background_image, - page_image, page_coords, page_bin, - input_file.pageId, file_id + '_' + region.id) + segment_region_file_id = f"{file_id}_{region.id}" + self.process_segment( + region, masks[i], polygons[i], neighbours, background_image, + page_image, page_coords, page_bin, input_file.pageId, segment_region_file_id) continue # level == 'line': lines = region.get_TextLine() if not lines: - self.logger.warning('Page "%s" region "%s" contains no text lines', page_id, region.id) + self.logger.warning(f'Page "{page_id}" region "{region.id}" contains no text lines') continue region_image, region_coords = self.workspace.image_from_segment( region, page_image, page_coords, feature_selector='binarized') @@ -164,18 +266,16 @@ def process(self): region_array = pil2array(region_image) region_bin = np.array(region_array <= midrange(region_array), np.uint8) # in absolute coordinates merely for comparison/intersection - shapes = [Polygon(polygon_from_points(line.get_Coords().points)) - for line in lines] + shapes = [Polygon(polygon_from_points(line.get_Coords().points)) for line in lines] # in relative coordinates for mask/cropping - polygons = [coordinates_of_segment(line, region_image, region_coords) - for line in lines] - masks = [pil2array(polygon_mask(region_image, polygon)).astype(np.uint8) - for polygon in polygons] + polygons = [coordinates_of_segment(line, region_image, region_coords) for line in lines] + masks = [pil2array(polygon_mask(region_image, polygon)).astype(np.uint8) for polygon in polygons] for j, line in enumerate(lines): if line.get_AlternativeImage(): # FIXME: This should probably be an exception (bad workflow configuration). 
- self.logger.warning('Page "%s" region "%s" line "%s" already contains image data: skipping', - page_id, region.id, line.id) + self.logger.warning( + f'Page "{page_id}" region "{region.id}" line "{line.id}" already contains image ' + f'data: skipping') continue shape = prep(shapes[j]) neighbours = [(linej, maskj) for shapej, linej, maskj @@ -184,10 +284,10 @@ def process(self): masks[:j] + masks[j+1:]) if shape.intersects(shapej)] if neighbours: - self.process_segment(line, masks[j], polygons[j], - neighbours, background_image, - region_image, region_coords, region_bin, - input_file.pageId, file_id + '_' + region.id + '_' + line.id) + segment_line_file_id = f"{file_id}_{region.id}_{line.id}" + self.process_segment( + line, masks[j], polygons[j], neighbours, background_image, + region_image, region_coords, region_bin, input_file.pageId, segment_line_file_id) # update METS (add the PAGE file): file_path = join(self.output_file_grp, file_id + '.xml') @@ -204,7 +304,7 @@ def process(self): def process_segment(self, segment, segment_mask, segment_polygon, neighbours, background_image, parent_image, parent_coords, parent_bin, - page_id, file_id): + page_id, file_id) -> Tuple[Image.Image, str, str]: # initialize AlternativeImage@comments classes from parent, except # for those operations that can apply on multiple hierarchy levels: features = ','.join( @@ -216,8 +316,8 @@ def process_segment(self, segment, segment_mask, segment_polygon, neighbours, segment_bbox = bbox_from_polygon(segment_polygon) for neighbour, neighbour_mask in neighbours: if not np.any(segment_mask > neighbour_mask): - self.logger.info('Ignoring enclosing neighbour "%s" of segment "%s" on page "%s"', - neighbour.id, segment.id, page_id) + self.logger.info( + f'Ignoring enclosing neighbour "{neighbour.id}" of segment "{segment.id}" on page "{page_id}"') continue # find connected components that (only) belong to the neighbour: intruders = segment_mask * morph.keep_marked(parent_bin, neighbour_mask > 0) # overlaps neighbour @@ -226,8 +326,9 @@ def process_segment(self, segment, segment_mask, segment_polygon, neighbours, num_foreground = np.count_nonzero(segment_mask * parent_bin) if not num_intruders: continue - self.logger.debug('segment "%s" vs neighbour "%s": suppressing %d of %d pixels on page "%s"', - segment.id, neighbour.id, num_intruders, num_foreground, page_id) + self.logger.debug( + f'segment "{segment.id}" vs neighbour "{neighbour.id}": suppressing {num_intruders} of ' + f'{num_foreground} pixels on page "{page_id}"') # suppress in segment_mask so these intruders can stay in the neighbours # (are not removed from both sides) segment_mask -= intruders @@ -241,11 +342,8 @@ def process_segment(self, segment, segment_mask, segment_polygon, neighbours, # recrop segment into rectangle, just as image_from_segment would do # (and also clipping with background colour): segment_image = crop_image(segment_image,box=segment_bbox) - # update METS (add the image file): - file_path = self.workspace.save_image_file( - segment_image, file_id + '.IMG-CLIP', self.output_file_grp, - page_id=page_id) + segment_image_id = file_id + '.IMG-CLIP' + segment_image_path = join(self.output_file_grp, f'{segment_image_id}.png') # update PAGE (reference the image file): - segment.add_AlternativeImage(AlternativeImageType( - filename=file_path, - comments=features)) + segment.add_AlternativeImage(AlternativeImageType(filename=segment_image_path, comments=features)) + return segment_image, segment_image_id, segment_image_path From 
7bdff31747ad2c9cdb834569b8b1adf8b90303d2 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Thu, 15 Aug 2024 11:51:42 +0200 Subject: [PATCH 30/97] remove: process() methods --- ocrd_cis/ocropy/clip.py | 194 +++++++---------------------------- ocrd_cis/ocropy/recognize.py | 65 +++--------- 2 files changed, 50 insertions(+), 209 deletions(-) diff --git a/ocrd_cis/ocropy/clip.py b/ocrd_cis/ocropy/clip.py index 0675257b..9e6d8d19 100644 --- a/ocrd_cis/ocropy/clip.py +++ b/ocrd_cis/ocropy/clip.py @@ -37,7 +37,42 @@ def executable(self): def setup(self): self.logger = getLogger('processor.OcropyClip') + # TODO: Adapt the docstring comment to process_page_pcgts def process_page_pcgts(self, *input_pcgts, output_file_id: str = None, page_id: str = None) -> OcrdPage: + """Clip text regions / lines of the workspace at intersections with neighbours. + + Open and deserialise PAGE input files and their respective images, + then iterate over the element hierarchy down to the requested + ``level-of-operation``. + + Next, get each segment image according to the layout annotation (by cropping + via coordinates into the higher-level image), as well as all its neighbours', + binarize them (without deskewing), and make a connected component analysis. + (Segments must not already have AlternativeImage annotated, otherwise they + will be skipped.) + + Then, for each section of overlap with a neighbour, re-assign components + which are only contained in the neighbour by clipping them to white (background), + and export the (final) result as image file. + + Add the new image file to the workspace along with the output fileGrp, + and using a file ID with suffix ``.IMG-CLIP`` along with further + identification of the input element. + + Reference each new image in the AlternativeImage of the element. + + Produce a new output file by serialising the resulting hierarchy. + """ + # This makes best sense for overlapping segmentation, like current GT + # or Tesseract layout analysis. Most notably, it can suppress graphics + # and separators within or across a region or line. It _should_ ideally + # be run after binarization (on page level for region-level clipping, + # and on the region level for line-level clipping), because the + # connected component analysis after implicit binarization could be + # suboptimal, and the explicit binarization after clipping could be, + # too. However, region-level clipping _must_ be run before region-level + # deskewing, because that would make segments incomensurable with their + # neighbours. level = self.parameter['level-of-operation'] assert self.workspace self.logger.debug(f'Level of operation: "{level}"') @@ -143,165 +178,6 @@ def process_page_pcgts(self, *input_pcgts, output_file_id: str = None, page_id: region_image, region_coords, region_bin, page_id, segment_line_file_id)) return ret - # TODO: remove when `process_page_pcgts` is validated to be correct - def process(self): - """Clip text regions / lines of the workspace at intersections with neighbours. - - Open and deserialise PAGE input files and their respective images, - then iterate over the element hierarchy down to the requested - ``level-of-operation``. - - Next, get each segment image according to the layout annotation (by cropping - via coordinates into the higher-level image), as well as all its neighbours', - binarize them (without deskewing), and make a connected component analysis. - (Segments must not already have AlternativeImage annotated, otherwise they - will be skipped.) 
- - Then, for each section of overlap with a neighbour, re-assign components - which are only contained in the neighbour by clipping them to white (background), - and export the (final) result as image file. - - Add the new image file to the workspace along with the output fileGrp, - and using a file ID with suffix ``.IMG-CLIP`` along with further - identification of the input element. - - Reference each new image in the AlternativeImage of the element. - - Produce a new output file by serialising the resulting hierarchy. - """ - # This makes best sense for overlapping segmentation, like current GT - # or Tesseract layout analysis. Most notably, it can suppress graphics - # and separators within or across a region or line. It _should_ ideally - # be run after binarization (on page level for region-level clipping, - # and on the region level for line-level clipping), because the - # connected component analysis after implicit binarization could be - # suboptimal, and the explicit binarization after clipping could be, - # too. However, region-level clipping _must_ be run before region-level - # deskewing, because that would make segments incomensurable with their - # neighbours. - level = self.parameter['level-of-operation'] - - for (n, input_file) in enumerate(self.input_files): - self.logger.info("INPUT FILE %i / %s", n, input_file.pageId or input_file.ID) - file_id = make_file_id(input_file, self.output_file_grp) - - pcgts = page_from_file(self.workspace.download_file(input_file)) - self.add_metadata(pcgts) - page_id = pcgts.pcGtsId or input_file.pageId or input_file.ID # (PageType has no id) - page = pcgts.get_Page() - - page_image, page_coords, page_image_info = self.workspace.image_from_page( - page, page_id, feature_selector='binarized') - # TODO: zoom is not used anywhere, is it still useful to have this call here? - zoom = determine_zoom(self.logger, self.parameter['dpi'], page_image_info) - - # FIXME: what about text regions inside table regions? 
- regions = list(page.get_TextRegion()) - num_texts = len(regions) - regions += ( - page.get_AdvertRegion() + - page.get_ChartRegion() + - page.get_ChemRegion() + - page.get_GraphicRegion() + - page.get_ImageRegion() + - page.get_LineDrawingRegion() + - page.get_MathsRegion() + - page.get_MusicRegion() + - page.get_NoiseRegion() + - page.get_SeparatorRegion() + - page.get_TableRegion() + - page.get_UnknownRegion()) - if not num_texts: - self.logger.warning('Page "%s" contains no text regions', page_id) - background = ImageStat.Stat(page_image) - # workaround for Pillow#4925 - if len(background.bands) > 1: - background = tuple(background.median) - else: - background = background.median[0] - if level == 'region': - background_image = Image.new(page_image.mode, page_image.size, background) - page_array = pil2array(page_image) - page_bin = np.array(page_array <= midrange(page_array), np.uint8) - # in absolute coordinates merely for comparison/intersection - shapes = [Polygon(polygon_from_points(region.get_Coords().points)) for region in regions] - # in relative coordinates for mask/cropping - polygons = [coordinates_of_segment(region, page_image, page_coords) for region in regions] - for i, polygon in enumerate(polygons[num_texts:], num_texts): - # for non-text regions, extend mask by 3 pixels in each direction - # to ensure they do not leak components accidentally - # (accounts for bad cropping of such regions in GT): - polygon = Polygon(polygon).buffer(3).exterior.coords[:-1] # keep open - polygons[i] = polygon - masks = [pil2array(polygon_mask(page_image, polygon)).astype(np.uint8) for polygon in polygons] - for i, region in enumerate(regions): - if i >= num_texts: - break # keep non-text regions unchanged - if level == 'region': - if region.get_AlternativeImage(): - # FIXME: This should probably be an exception (bad workflow configuration). - self.logger.warning( - f'Page "{page_id}" region "{region.id}" already contains image data: skipping') - continue - shape = prep(shapes[i]) - neighbours = [(regionj, maskj) for shapej, regionj, maskj - in zip(shapes[:i] + shapes[i+1:], - regions[:i] + regions[i+1:], - masks[:i] + masks[i+1:]) - if shape.intersects(shapej)] - if neighbours: - segment_region_file_id = f"{file_id}_{region.id}" - self.process_segment( - region, masks[i], polygons[i], neighbours, background_image, - page_image, page_coords, page_bin, input_file.pageId, segment_region_file_id) - continue - # level == 'line': - lines = region.get_TextLine() - if not lines: - self.logger.warning(f'Page "{page_id}" region "{region.id}" contains no text lines') - continue - region_image, region_coords = self.workspace.image_from_segment( - region, page_image, page_coords, feature_selector='binarized') - background_image = Image.new(region_image.mode, region_image.size, background) - region_array = pil2array(region_image) - region_bin = np.array(region_array <= midrange(region_array), np.uint8) - # in absolute coordinates merely for comparison/intersection - shapes = [Polygon(polygon_from_points(line.get_Coords().points)) for line in lines] - # in relative coordinates for mask/cropping - polygons = [coordinates_of_segment(line, region_image, region_coords) for line in lines] - masks = [pil2array(polygon_mask(region_image, polygon)).astype(np.uint8) for polygon in polygons] - for j, line in enumerate(lines): - if line.get_AlternativeImage(): - # FIXME: This should probably be an exception (bad workflow configuration). 
- self.logger.warning( - f'Page "{page_id}" region "{region.id}" line "{line.id}" already contains image ' - f'data: skipping') - continue - shape = prep(shapes[j]) - neighbours = [(linej, maskj) for shapej, linej, maskj - in zip(shapes[:j] + shapes[j+1:], - lines[:j] + lines[j+1:], - masks[:j] + masks[j+1:]) - if shape.intersects(shapej)] - if neighbours: - segment_line_file_id = f"{file_id}_{region.id}_{line.id}" - self.process_segment( - line, masks[j], polygons[j], neighbours, background_image, - region_image, region_coords, region_bin, input_file.pageId, segment_line_file_id) - - # update METS (add the PAGE file): - file_path = join(self.output_file_grp, file_id + '.xml') - pcgts.set_pcGtsId(file_id) - out = self.workspace.add_file( - ID=file_id, - file_grp=self.output_file_grp, - pageId=input_file.pageId, - local_filename=file_path, - mimetype=MIMETYPE_PAGE, - content=to_xml(pcgts)) - self.logger.info('created file ID: %s, file_grp: %s, path: %s', - file_id, self.output_file_grp, out.local_filename) - def process_segment(self, segment, segment_mask, segment_polygon, neighbours, background_image, parent_image, parent_coords, parent_bin, page_id, file_id) -> Tuple[Image.Image, str, str]: diff --git a/ocrd_cis/ocropy/recognize.py b/ocrd_cis/ocropy/recognize.py index ccb019eb..389cf8db 100644 --- a/ocrd_cis/ocropy/recognize.py +++ b/ocrd_cis/ocropy/recognize.py @@ -115,26 +115,8 @@ def get_model(self): self.parameter['model'], self.parameter['model']) exit(1) + # TODO: Adapt the docstring comment to process_page_pcgts def process_page_pcgts(self, *input_pcgts, output_file_id: str = None, page_id: str = None) -> OcrdPage: - maxlevel = self.parameter['textequiv_level'] - assert self.workspace - self.logger.debug(f'Max level: "{maxlevel}"') - - pcgts = input_pcgts[0] - page = pcgts.get_Page() - assert page - - page_image, page_coords, _ = self.workspace.image_from_page(page, page_id) - self.logger.info(f"Recognizing text in page '{page_id}'") - # region, line, word, or glyph level: - regions = page.get_AllRegions(classes=['Text']) - if not regions: - self.logger.warning(f"Page '{page_id}' contains no text regions") - self.process_regions(regions, maxlevel, page_image, page_coords) - return [pcgts] - - # TODO: remove when `process_page_pcgts` is validated to be correct - def process(self): """Recognize lines / words / glyphs of the workspace. Open and deserialise each PAGE input file and its respective image, @@ -160,38 +142,21 @@ def process(self): Produce a new output file by serialising the resulting hierarchy. 
""" maxlevel = self.parameter['textequiv_level'] + assert self.workspace + self.logger.debug(f'Max level: "{maxlevel}"') + + pcgts = input_pcgts[0] + page = pcgts.get_Page() + assert page - # self.logger.info("Using model %s in %s for recognition", model) - for (n, input_file) in enumerate(self.input_files): - self.logger.info("INPUT FILE %i / %s", n, input_file.pageId or input_file.ID) - pcgts = page_from_file(self.workspace.download_file(input_file)) - self.add_metadata(pcgts) - page_id = pcgts.pcGtsId or input_file.pageId or input_file.ID # (PageType has no id) - page = pcgts.get_Page() - - page_image, page_coords, _ = self.workspace.image_from_page( - page, page_id) - - self.logger.info("Recognizing text in page '%s'", page_id) - # region, line, word, or glyph level: - regions = page.get_AllRegions(classes=['Text']) - if not regions: - self.logger.warning("Page '%s' contains no text regions", page_id) - self.process_regions(regions, maxlevel, page_image, page_coords) - - # update METS (add the PAGE file): - file_id = make_file_id(input_file, self.output_file_grp) - file_path = join(self.output_file_grp, file_id + '.xml') - pcgts.set_pcGtsId(file_id) - out = self.workspace.add_file( - ID=file_id, - file_grp=self.output_file_grp, - pageId=input_file.pageId, - local_filename=file_path, - mimetype=MIMETYPE_PAGE, - content=to_xml(pcgts)) - self.logger.info('created file ID: %s, file_grp: %s, path: %s', - file_id, self.output_file_grp, out.local_filename) + page_image, page_coords, _ = self.workspace.image_from_page(page, page_id) + self.logger.info(f"Recognizing text in page '{page_id}'") + # region, line, word, or glyph level: + regions = page.get_AllRegions(classes=['Text']) + if not regions: + self.logger.warning(f"Page '{page_id}' contains no text regions") + self.process_regions(regions, maxlevel, page_image, page_coords) + return [pcgts] def process_regions(self, regions, maxlevel, page_image, page_coords): edits = 0 From 03c2f158fa02ddeae40baa93cee686be1fd0ca09 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Thu, 15 Aug 2024 11:57:36 +0200 Subject: [PATCH 31/97] adapt: docstring of process_page_pcgts --- ocrd_cis/ocropy/clip.py | 8 ++++---- ocrd_cis/ocropy/recognize.py | 17 ++++++++--------- 2 files changed, 12 insertions(+), 13 deletions(-) diff --git a/ocrd_cis/ocropy/clip.py b/ocrd_cis/ocropy/clip.py index 9e6d8d19..a5f4f705 100644 --- a/ocrd_cis/ocropy/clip.py +++ b/ocrd_cis/ocropy/clip.py @@ -39,9 +39,9 @@ def setup(self): # TODO: Adapt the docstring comment to process_page_pcgts def process_page_pcgts(self, *input_pcgts, output_file_id: str = None, page_id: str = None) -> OcrdPage: - """Clip text regions / lines of the workspace at intersections with neighbours. + """Clip text regions / lines of a page at intersections with neighbours. - Open and deserialise PAGE input files and their respective images, + Open and deserialize PAGE input file and its respective image, then iterate over the element hierarchy down to the requested ``level-of-operation``. @@ -61,7 +61,7 @@ def process_page_pcgts(self, *input_pcgts, output_file_id: str = None, page_id: Reference each new image in the AlternativeImage of the element. - Produce a new output file by serialising the resulting hierarchy. + Return the resulting OcrdPage. """ # This makes best sense for overlapping segmentation, like current GT # or Tesseract layout analysis. 
Most notably, it can suppress graphics @@ -71,7 +71,7 @@ def process_page_pcgts(self, *input_pcgts, output_file_id: str = None, page_id: # connected component analysis after implicit binarization could be # suboptimal, and the explicit binarization after clipping could be, # too. However, region-level clipping _must_ be run before region-level - # deskewing, because that would make segments incomensurable with their + # deskewing, because that would make segments incommensurable with their # neighbours. level = self.parameter['level-of-operation'] assert self.workspace diff --git a/ocrd_cis/ocropy/recognize.py b/ocrd_cis/ocropy/recognize.py index 389cf8db..69b374ec 100644 --- a/ocrd_cis/ocropy/recognize.py +++ b/ocrd_cis/ocropy/recognize.py @@ -115,18 +115,17 @@ def get_model(self): self.parameter['model'], self.parameter['model']) exit(1) - # TODO: Adapt the docstring comment to process_page_pcgts def process_page_pcgts(self, *input_pcgts, output_file_id: str = None, page_id: str = None) -> OcrdPage: - """Recognize lines / words / glyphs of the workspace. + """Recognize lines / words / glyphs of a page. - Open and deserialise each PAGE input file and its respective image, + Open and deserialize the PAGE input file and its respective image, then iterate over the element hierarchy down to the requested ``textequiv_level``. If any layout annotation below the line level already exists, then remove it (regardless of ``textequiv_level``). - Set up Ocropy to recognise each text line (via coordinates into + Set up Ocropy to recognize each text line (via coordinates into the higher-level image, or from the alternative image; the image - must have been binarised/grayscale-normalised, deskewed and dewarped + must have been binarized/grayscale-normalised, deskewed and dewarped already). Rescale and pad the image, then recognize. Create new elements below the line level, if necessary. @@ -139,11 +138,11 @@ def process_page_pcgts(self, *input_pcgts, output_file_id: str = None, page_id: Levenshtein distance. Aggregate these scores for each file and print the line-wise and the total character error rates (CER). - Produce a new output file by serialising the resulting hierarchy. + Return the resulting OcrdPage. 
""" - maxlevel = self.parameter['textequiv_level'] + max_level = self.parameter['textequiv_level'] assert self.workspace - self.logger.debug(f'Max level: "{maxlevel}"') + self.logger.debug(f'Max level: "{max_level}"') pcgts = input_pcgts[0] page = pcgts.get_Page() @@ -155,7 +154,7 @@ def process_page_pcgts(self, *input_pcgts, output_file_id: str = None, page_id: regions = page.get_AllRegions(classes=['Text']) if not regions: self.logger.warning(f"Page '{page_id}' contains no text regions") - self.process_regions(regions, maxlevel, page_image, page_coords) + self.process_regions(regions, max_level, page_image, page_coords) return [pcgts] def process_regions(self, regions, maxlevel, page_image, page_coords): From 90ac28e1f9c9b6c95492aac765aaf5183a045be2 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Thu, 15 Aug 2024 12:11:30 +0200 Subject: [PATCH 32/97] refactor: other small things --- ocrd_cis/ocropy/clip.py | 16 +++++------ ocrd_cis/ocropy/recognize.py | 52 +++++++++++++++--------------------- 2 files changed, 29 insertions(+), 39 deletions(-) diff --git a/ocrd_cis/ocropy/clip.py b/ocrd_cis/ocropy/clip.py index a5f4f705..75b4123f 100644 --- a/ocrd_cis/ocropy/clip.py +++ b/ocrd_cis/ocropy/clip.py @@ -37,7 +37,6 @@ def executable(self): def setup(self): self.logger = getLogger('processor.OcropyClip') - # TODO: Adapt the docstring comment to process_page_pcgts def process_page_pcgts(self, *input_pcgts, output_file_id: str = None, page_id: str = None) -> OcrdPage: """Clip text regions / lines of a page at intersections with neighbours. @@ -81,9 +80,9 @@ def process_page_pcgts(self, *input_pcgts, output_file_id: str = None, page_id: page = pcgts.get_Page() assert page - page_image, page_coords, page_image_info = self.workspace.image_from_page( + page_image, page_xywh, page_image_info = self.workspace.image_from_page( page, page_id, feature_selector='binarized') - # TODO: zoom is not used anywhere, is it still useful to have this call here? 
+ # The zoom is not used anywhere zoom = determine_zoom(self.logger, self.parameter['dpi'], page_image_info) ret = [pcgts] @@ -104,7 +103,7 @@ def process_page_pcgts(self, *input_pcgts, output_file_id: str = None, page_id: page.get_TableRegion() + page.get_UnknownRegion()) if not num_texts: - self.logger.warning('Page "%s" contains no text regions', page_id) + self.logger.warning(f'Page "{page_id}" contains no text regions') background = ImageStat.Stat(page_image) # workaround for Pillow#4925 if len(background.bands) > 1: @@ -118,7 +117,7 @@ def process_page_pcgts(self, *input_pcgts, output_file_id: str = None, page_id: # in absolute coordinates merely for comparison/intersection shapes = [Polygon(polygon_from_points(region.get_Coords().points)) for region in regions] # in relative coordinates for mask/cropping - polygons = [coordinates_of_segment(region, page_image, page_coords) for region in regions] + polygons = [coordinates_of_segment(region, page_image, page_xywh) for region in regions] for i, polygon in enumerate(polygons[num_texts:], num_texts): # for non-text regions, extend mask by 3 pixels in each direction # to ensure they do not leak components accidentally @@ -143,7 +142,7 @@ def process_page_pcgts(self, *input_pcgts, output_file_id: str = None, page_id: segment_region_file_id = f"{output_file_id}_{region.id}" ret.append(self.process_segment( region, masks[i], polygons[i], neighbours, background_image, - page_image, page_coords, page_bin, page_id, segment_region_file_id)) + page_image, page_xywh, page_bin, page_id, segment_region_file_id)) continue # level == 'line': lines = region.get_TextLine() @@ -151,7 +150,7 @@ def process_page_pcgts(self, *input_pcgts, output_file_id: str = None, page_id: self.logger.warning(f'Page "{page_id}" region "{region.id}" contains no text lines') continue region_image, region_coords = self.workspace.image_from_segment( - region, page_image, page_coords, feature_selector='binarized') + region, page_image, page_xywh, feature_selector='binarized') background_image = Image.new(region_image.mode, region_image.size, background) region_array = pil2array(region_image) region_bin = np.array(region_array <= midrange(region_array), np.uint8) @@ -164,8 +163,7 @@ def process_page_pcgts(self, *input_pcgts, output_file_id: str = None, page_id: if line.get_AlternativeImage(): # FIXME: This should probably be an exception (bad workflow configuration). self.logger.warning( - f'Page "{page_id}" region "{region.id}" line "{line.id}" already contains image ' - f'data: skipping') + f'Page "{page_id}" region "{region.id}" line "{line.id}" already contains image data: skipping') continue shape = prep(shapes[j]) neighbours = [(linej, maskj) for shapej, linej, maskj diff --git a/ocrd_cis/ocropy/recognize.py b/ocrd_cis/ocropy/recognize.py index 69b374ec..b9fc453f 100644 --- a/ocrd_cis/ocropy/recognize.py +++ b/ocrd_cis/ocropy/recognize.py @@ -101,18 +101,19 @@ def get_model(self): returns it. 
If not, it checks if the model can be found in the dirname(__file__)/models/ directory.""" canread = lambda p: isfile(p) and access(p, R_OK) + p_model = self.parameter['model'] try: - model = self.resolve_resource(self.parameter['model']) + model = self.resolve_resource(p_model) if canread(model): return model except SystemExit: ocropydir = dirname(abspath(__file__)) - path = join(ocropydir, 'models', self.parameter['model']) - self.logger.info("Failed to resolve model with OCR-D/core mechanism, trying %s", path) + path = join(ocropydir, 'models', p_model) + self.logger.info(f"Failed to resolve model with OCR-D/core mechanism, trying {path}") if canread(path): return path - self.logger.error("Could not find model %s. Try 'ocrd resmgr download ocrd-cis-ocropy-recognize %s", - self.parameter['model'], self.parameter['model']) + self.logger.error( + f"Could not find model {p_model}. Try 'ocrd resmgr download ocrd-cis-ocropy-recognize {p_model}") exit(1) def process_page_pcgts(self, *input_pcgts, output_file_id: str = None, page_id: str = None) -> OcrdPage: @@ -148,7 +149,7 @@ def process_page_pcgts(self, *input_pcgts, output_file_id: str = None, page_id: page = pcgts.get_Page() assert page - page_image, page_coords, _ = self.workspace.image_from_page(page, page_id) + page_image, page_xywh, _ = self.workspace.image_from_page(page, page_id) self.logger.info(f"Recognizing text in page '{page_id}'") # region, line, word, or glyph level: regions = page.get_AllRegions(classes=['Text']) @@ -157,37 +158,32 @@ def process_page_pcgts(self, *input_pcgts, output_file_id: str = None, page_id: self.process_regions(regions, max_level, page_image, page_coords) return [pcgts] - def process_regions(self, regions, maxlevel, page_image, page_coords): + def process_regions(self, regions, maxlevel, page_image, page_xywh): edits = 0 lengs = 0 for region in regions: - region_image, region_coords = self.workspace.image_from_segment( - region, page_image, page_coords) - - self.logger.info("Recognizing text in region '%s'", region.id) + region_image, region_xywh = self.workspace.image_from_segment(region, page_image, page_xywh) + self.logger.info(f"Recognizing text in region '{region.id}'") textlines = region.get_TextLine() if not textlines: - self.logger.warning("Region '%s' contains no text lines", region.id) + self.logger.warning(f"Region '{region.id}' contains no text lines") else: - edits_, lengs_ = self.process_lines(textlines, maxlevel, region_image, region_coords) + edits_, lengs_ = self.process_lines(textlines, maxlevel, region_image, region_xywh) edits += edits_ lengs += lengs_ # update region text by concatenation for consistency - region_unicode = u'\n'.join(line.get_TextEquiv()[0].Unicode - if line.get_TextEquiv() - else u'' for line in textlines) + region_unicode = u'\n'.join( + line.get_TextEquiv()[0].Unicode if line.get_TextEquiv() else u'' for line in textlines) region.set_TextEquiv([TextEquivType(Unicode=region_unicode)]) if lengs > 0: self.logger.info('CER: %.1f%%', 100.0 * edits / lengs) - def process_lines(self, textlines, maxlevel, region_image, region_coords): + def process_lines(self, textlines, maxlevel, region_image, region_xywh): edits = 0 lengs = 0 for line in textlines: - line_image, line_coords = self.workspace.image_from_segment( - line, region_image, region_coords) - - self.logger.info("Recognizing text in line '%s'", line.id) + line_image, line_coords = self.workspace.image_from_segment(line, region_image, region_xywh) + self.logger.info(f"Recognizing text in line '{line.id}'") if 
line.get_TextEquiv(): linegt = line.TextEquiv[0].Unicode else: @@ -198,19 +194,18 @@ def process_lines(self, textlines, maxlevel, region_image, region_coords): line.set_Word([]) if line_image.size[1] < 16: - self.logger.debug("ERROR: bounding box is too narrow at line %s", line.id) + self.logger.debug(f"ERROR: bounding box is too narrow at line {line.id}") continue # resize image to 48 pixel height final_img, scale = resize_keep_ratio(line_image) # process ocropy: try: - linepred, clist, rlist, confidlist = recognize( - final_img, self.pad, self.network, check=True) + linepred, clist, rlist, confidlist = recognize(final_img, self.pad, self.network, check=True) except Exception as err: - self.logger.debug('error processing line "%s": %s', line.id, err) + self.logger.debug(f'error processing line "{line.id}": {err}') continue - self.logger.debug("OCR '%s': '%s'", line.id, linepred) + self.logger.debug(f"OCR '{line.id}': '{linepred}'") edits += Levenshtein.distance(linepred, linegt) lengs += len(linegt) @@ -226,11 +221,9 @@ def process_lines(self, textlines, maxlevel, region_image, region_coords): found_char = True word_conf_list[w_no].append(confidlist[i]) word_r_list[w_no].append(rlist[i]) - if c == ' ' and found_char: if i == 0: word_r_list[0][0] = rlist[i] - elif i+1 <= len(clist)-1 and clist[i+1] != ' ': word_conf_list.append([]) word_r_list.append([rlist[i]]) @@ -244,8 +237,7 @@ def process_lines(self, textlines, maxlevel, region_image, region_coords): # conf for the line line_conf = (min(wordsconf) + max(wordsconf))/2 # line text - line.add_TextEquiv(TextEquivType( - Unicode=linepred, conf=line_conf)) + line.add_TextEquiv(TextEquivType(Unicode=linepred, conf=line_conf)) if maxlevel in ['word', 'glyph']: for word_no, word_str in enumerate(words): From f24f86b9e963e28f206662e464f8843c99deddf0 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Thu, 15 Aug 2024 12:33:04 +0200 Subject: [PATCH 33/97] fix: determine_zoom --- ocrd_cis/ocropy/binarize.py | 2 +- ocrd_cis/ocropy/clip.py | 3 ++- ocrd_cis/ocropy/common.py | 2 +- ocrd_cis/ocropy/denoise.py | 2 +- ocrd_cis/ocropy/dewarp.py | 2 +- ocrd_cis/ocropy/recognize.py | 2 +- ocrd_cis/ocropy/resegment.py | 2 +- ocrd_cis/ocropy/segment.py | 2 +- 8 files changed, 9 insertions(+), 8 deletions(-) diff --git a/ocrd_cis/ocropy/binarize.py b/ocrd_cis/ocropy/binarize.py index 8f7d8d3a..7478edb5 100644 --- a/ocrd_cis/ocropy/binarize.py +++ b/ocrd_cis/ocropy/binarize.py @@ -101,7 +101,7 @@ def process_page_pcgts(self, *input_pcgts, output_file_id: str = None, page_id: assert page page_image, page_xywh, page_image_info = self.workspace.image_from_page(page, page_id, feature_filter='binarized') - zoom = determine_zoom(self.logger, self.parameter['dpi'], page_image_info) + zoom = determine_zoom(self.logger, page_id, self.parameter['dpi'], page_image_info) ret = [pcgts] if level == 'page': diff --git a/ocrd_cis/ocropy/clip.py b/ocrd_cis/ocropy/clip.py index 75b4123f..400e9b54 100644 --- a/ocrd_cis/ocropy/clip.py +++ b/ocrd_cis/ocropy/clip.py @@ -1,5 +1,6 @@ from __future__ import absolute_import from logging import Logger +from typing import Tuple from os.path import join import numpy as np @@ -83,7 +84,7 @@ def process_page_pcgts(self, *input_pcgts, output_file_id: str = None, page_id: page_image, page_xywh, page_image_info = self.workspace.image_from_page( page, page_id, feature_selector='binarized') # The zoom is not used anywhere - zoom = determine_zoom(self.logger, self.parameter['dpi'], page_image_info) + zoom = determine_zoom(self.logger, page_id, 
self.parameter['dpi'], page_image_info) ret = [pcgts] # FIXME: what about text regions inside table regions? diff --git a/ocrd_cis/ocropy/common.py b/ocrd_cis/ocropy/common.py index 095de5eb..c6b7c49d 100644 --- a/ocrd_cis/ocropy/common.py +++ b/ocrd_cis/ocropy/common.py @@ -2103,7 +2103,7 @@ def find_topological(): # DSAVE('rlabels_closed', rlabels) return rlabels -def determine_zoom(logger: logging.Logger, dpi: float, page_image_info: OcrdExif) -> float: +def determine_zoom(logger: logging.Logger, page_id: str, dpi: float, page_image_info: OcrdExif) -> float: if dpi > 0: zoom = 300.0/dpi elif page_image_info.resolution != 1: diff --git a/ocrd_cis/ocropy/denoise.py b/ocrd_cis/ocropy/denoise.py index 78d11c28..cc622c24 100644 --- a/ocrd_cis/ocropy/denoise.py +++ b/ocrd_cis/ocropy/denoise.py @@ -63,7 +63,7 @@ def process(self): page, page_id, feature_selector='binarized' if level == 'page' else '') - zoom = determine_zoom(self.logger, self.parameter['dpi'], page_image_info) + zoom = determine_zoom(self.logger, page_id, self.parameter['dpi'], page_image_info) if level == 'page': self.process_segment(page, page_image, page_xywh, zoom, diff --git a/ocrd_cis/ocropy/dewarp.py b/ocrd_cis/ocropy/dewarp.py index 9dddae44..72efca45 100644 --- a/ocrd_cis/ocropy/dewarp.py +++ b/ocrd_cis/ocropy/dewarp.py @@ -112,7 +112,7 @@ def process(self): page_image, page_xywh, page_image_info = self.workspace.image_from_page( page, page_id) - zoom = determine_zoom(self.logger, self.parameter['dpi'], page_image_info) + zoom = determine_zoom(self.logger, page_id, self.parameter['dpi'], page_image_info) regions = page.get_AllRegions(classes=['Text'], order='reading-order') if not regions: diff --git a/ocrd_cis/ocropy/recognize.py b/ocrd_cis/ocropy/recognize.py index b9fc453f..bbb8e415 100644 --- a/ocrd_cis/ocropy/recognize.py +++ b/ocrd_cis/ocropy/recognize.py @@ -155,7 +155,7 @@ def process_page_pcgts(self, *input_pcgts, output_file_id: str = None, page_id: regions = page.get_AllRegions(classes=['Text']) if not regions: self.logger.warning(f"Page '{page_id}' contains no text regions") - self.process_regions(regions, max_level, page_image, page_coords) + self.process_regions(regions, max_level, page_image, page_xywh) return [pcgts] def process_regions(self, regions, maxlevel, page_image, page_xywh): diff --git a/ocrd_cis/ocropy/resegment.py b/ocrd_cis/ocropy/resegment.py index b18c0b5e..1e9f8c7f 100644 --- a/ocrd_cis/ocropy/resegment.py +++ b/ocrd_cis/ocropy/resegment.py @@ -117,7 +117,7 @@ def process(self): page_image, page_coords, page_image_info = self.workspace.image_from_page( page, page_id, feature_selector='binarized') - zoom = determine_zoom(self.logger, self.parameter['dpi'], page_image_info) + zoom = determine_zoom(self.logger, page_id, self.parameter['dpi'], page_image_info) ignore = (page.get_ImageRegion() + page.get_LineDrawingRegion() + diff --git a/ocrd_cis/ocropy/segment.py b/ocrd_cis/ocropy/segment.py index 782425cc..57368fe8 100644 --- a/ocrd_cis/ocropy/segment.py +++ b/ocrd_cis/ocropy/segment.py @@ -339,7 +339,7 @@ def process(self): # TODO: also allow grayscale_normalized (try/except?) 
page_image, page_coords, page_image_info = self.workspace.image_from_page( page, page_id, feature_selector='binarized') - zoom = determine_zoom(self.logger, self.parameter['dpi'], page_image_info) + zoom = determine_zoom(self.logger, page_id, self.parameter['dpi'], page_image_info) # aggregate existing regions so their foreground can be ignored ignore = (page.get_ImageRegion() + From 5f8e1dfb337d78cd757f4a6b5aff968829c2d4a1 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Thu, 15 Aug 2024 13:19:08 +0200 Subject: [PATCH 34/97] add missing Levenshtein req in setup --- setup.py | 1 + 1 file changed, 1 insertion(+) diff --git a/setup.py b/setup.py index 38f09abd..e3ee8213 100644 --- a/setup.py +++ b/setup.py @@ -41,6 +41,7 @@ 'ocrd>=3.0.0a1', 'click', 'scipy', + 'python-Levenshtein>=0.25.1', 'numpy>=1.17.0', 'pillow>=7.1.2', 'shapely>=1.7.1', From 9a14e1dddf44515630dadbcc23b62e6951eccc5d Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Thu, 15 Aug 2024 13:53:33 +0200 Subject: [PATCH 35/97] fix: remove version req for Levenshtein --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index e3ee8213..6b75d3a3 100644 --- a/setup.py +++ b/setup.py @@ -41,7 +41,7 @@ 'ocrd>=3.0.0a1', 'click', 'scipy', - 'python-Levenshtein>=0.25.1', + 'python-Levenshtein', 'numpy>=1.17.0', 'pillow>=7.1.2', 'shapely>=1.7.1', From 4ca4d1417030e40818327a7cc3571b22ad4ccda9 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Thu, 15 Aug 2024 13:59:33 +0200 Subject: [PATCH 36/97] fix: Levenshtein import --- ocrd_cis/align/cli.py | 2 +- setup.py | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/ocrd_cis/align/cli.py b/ocrd_cis/align/cli.py index ffe53fd8..7747622e 100644 --- a/ocrd_cis/align/cli.py +++ b/ocrd_cis/align/cli.py @@ -2,7 +2,7 @@ import click import json import os -import Levenshtein +from rapidfuzz.distance import Levenshtein from ocrd import Processor from ocrd.decorators import ocrd_cli_options from ocrd.decorators import ocrd_cli_wrap_processor diff --git a/setup.py b/setup.py index 6b75d3a3..38f09abd 100644 --- a/setup.py +++ b/setup.py @@ -41,7 +41,6 @@ 'ocrd>=3.0.0a1', 'click', 'scipy', - 'python-Levenshtein', 'numpy>=1.17.0', 'pillow>=7.1.2', 'shapely>=1.7.1', From fbaafcb4e3f982496aafdf561a4cd4713d859f5c Mon Sep 17 00:00:00 2001 From: kba Date: Thu, 15 Aug 2024 16:23:00 +0200 Subject: [PATCH 37/97] update ocrd-cis-binarize to be compatible with bertsky/core#8 --- ocrd_cis/ocropy/binarize.py | 70 ++++++++++++++++--------------------- ocrd_cis/ocropy/common.py | 3 +- 2 files changed, 33 insertions(+), 40 deletions(-) diff --git a/ocrd_cis/ocropy/binarize.py b/ocrd_cis/ocropy/binarize.py index 7478edb5..3c9583f9 100644 --- a/ocrd_cis/ocropy/binarize.py +++ b/ocrd_cis/ocropy/binarize.py @@ -1,21 +1,15 @@ from __future__ import absolute_import from logging import Logger +from typing import Optional import cv2 import numpy as np from PIL import Image -from os.path import abspath, dirname, join -from typing import Tuple +from ocrd.processor.ocrd_page_result import OcrdPageResult, OcrdPageResultImage -#import kraken.binarization - -from ocrd_utils import ( - getLogger, - make_file_id, - MIMETYPE_PAGE -) -from ocrd_models.ocrd_page import AlternativeImageType, OcrdPage, to_xml +from ocrd_utils import getLogger +from ocrd_models.ocrd_page import AlternativeImageType, OcrdPage from ocrd import Processor from . 
import common @@ -71,7 +65,7 @@ def setup(self): self.logger.critical(f'Requested method {method} does not support grayscale normalized output') raise ValueError('only method=ocropy allows grayscale=true') - def process_page_pcgts(self, *input_pcgts, output_file_id: str = None, page_id: str = None) -> OcrdPage: + def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional[str] = None) -> OcrdPageResult: """Binarize (and optionally deskew/despeckle) the pages/regions/lines of the workspace. Iterate over the PAGE-XML element hierarchy down to the requested @@ -97,16 +91,17 @@ def process_page_pcgts(self, *input_pcgts, output_file_id: str = None, page_id: self.logger.debug(f'Level of operation: "{level}"') pcgts = input_pcgts[0] + assert pcgts page = pcgts.get_Page() assert page page_image, page_xywh, page_image_info = self.workspace.image_from_page(page, page_id, feature_filter='binarized') zoom = determine_zoom(self.logger, page_id, self.parameter['dpi'], page_image_info) - ret = [pcgts] + result = OcrdPageResult(pcgts) if level == 'page': try: - ret.append(self.process_page(page, page_image, page_xywh, zoom, page_id, output_file_id)) + result.images.append(self.process_page(page, page_image, page_xywh, zoom, page_id)) except ValueError as e: self.logger.error(e) else: @@ -121,7 +116,7 @@ def process_page_pcgts(self, *input_pcgts, output_file_id: str = None, page_id: region, page_image, page_xywh, feature_filter='binarized') if level == 'region': try: - ret.append(self.process_region(region, region_image, region_xywh, zoom, region.id, output_file_id)) + result.images.append(self.process_region(region, region_image, region_xywh, zoom, region.id)) continue except ValueError as e: self.logger.error(e) @@ -132,12 +127,12 @@ def process_page_pcgts(self, *input_pcgts, output_file_id: str = None, page_id: line_image, line_xywh = self.workspace.image_from_segment( line, region_image, region_xywh, feature_filter='binarized') try: - ret.append(self.process_line(line, line_image, line_xywh, zoom, page_id, region.id, output_file_id)) + result.images.append(self.process_line(line, line_image, line_xywh, zoom, page_id, region.id)) except ValueError as e: self.logger.error(e) - return ret + return result - def process_page(self, page, page_image, page_xywh, zoom, page_id, file_id) -> Tuple[Image.Image, str, str]: + def process_page(self, page, page_image, page_xywh, zoom, page_id) -> OcrdPageResultImage: if not page_image.width or not page_image.height: raise ValueError(f"Skipping page '{page_id}' with zero size") self.logger.info(f"About to binarize page '{page_id}'") @@ -171,18 +166,17 @@ def process_page(self, page, page_image, page_xywh, zoom, page_id, file_id) -> T orientation = 180 - (180 - orientation) % 360 # map to [-179.999,180] page.set_orientation(orientation) if self.parameter['grayscale']: - file_id += '.IMG-NRM' + id_suffix = '.IMG-NRM' features += ',grayscale_normalized' else: - file_id += '.IMG-BIN' + id_suffix = '.IMG-BIN' features += ',binarized' - bin_image_id = f'{file_id}.IMG-BIN' - bin_image_path = join(self.output_file_grp, f'{bin_image_id}.png') # update PAGE (reference the image file): - page.add_AlternativeImage(AlternativeImageType(filename=bin_image_path, comments=features)) - return bin_image, bin_image_id, bin_image_path + alternative_image = AlternativeImageType(comments=features) + page.add_AlternativeImage(alternative_image) + return OcrdPageResultImage(bin_image, id_suffix, alternative_image) - def process_region(self, region, region_image, 
region_xywh, zoom, page_id, file_id) -> Tuple[Image.Image, str, str]: + def process_region(self, region, region_image, region_xywh, zoom, page_id) -> OcrdPageResultImage: if not region_image.width or not region_image.height: raise ValueError(f"Skipping region '{region.id}' with zero size") self.logger.info(f"About to binarize page '{page_id}' region '{region.id}'") @@ -217,21 +211,19 @@ def process_region(self, region, region_image, region_xywh, zoom, page_id, file_ orientation = -region_xywh['angle'] orientation = 180 - (180 - orientation) % 360 # map to [-179.999,180] region.set_orientation(orientation) - bin_image_id = f'{file_id}_{region.id}' + id_suffix = f'{region.id}' if self.parameter['grayscale']: - bin_image_id += '.IMG-NRM' + id_suffix += '.IMG-NRM' features += ',grayscale_normalized' else: - bin_image_id += '.IMG-BIN' + id_suffix += '.IMG-BIN' features += ',binarized' - bin_image_path = join(self.output_file_grp, f'{bin_image_id}.png') # update PAGE (reference the image file): - region.add_AlternativeImage(AlternativeImageType(filename=bin_image_path, comments=features)) - return bin_image, bin_image_id, bin_image_path + alternative_image = AlternativeImageType(comments=features) + region.add_AlternativeImage(alternative_image) + return OcrdPageResultImage(bin_image, id_suffix, alternative_image) - def process_line( - self, line, line_image, line_xywh, zoom, page_id, region_id, file_id - ) -> Tuple[Image.Image, str, str]: + def process_line(self, line, line_image, line_xywh, zoom, page_id, region_id) -> OcrdPageResultImage: if not line_image.width or not line_image.height: raise ValueError(f"Skipping line '{line.id}' with zero size") self.logger.info(f"About to binarize page '{page_id}' region '{region_id}' line '{line.id}'") @@ -256,14 +248,14 @@ def process_line( bin_image = remove_noise(bin_image, maxsize=self.parameter['noise_maxsize']) if self.parameter['noise_maxsize']: features += ',despeckled' - bin_image_id = f'{file_id}_{region_id}_{line.id}' + id_suffix = f'{region_id}_{line.id}' if self.parameter['grayscale']: - bin_image_id += '.IMG-NRM' + id_suffix += '.IMG-NRM' features += ',grayscale_normalized' else: - bin_image_id += '.IMG-BIN' + id_suffix += '.IMG-BIN' features += ',binarized' - bin_image_path = join(self.output_file_grp, f'{bin_image_id}.png') # update PAGE (reference the image file): - line.add_AlternativeImage(AlternativeImageType(filename=bin_image_path, comments=features)) - return bin_image, bin_image_id, bin_image_path + alternative_image = AlternativeImageType(comments=features) + line.add_AlternativeImage(alternative_image) + return OcrdPageResultImage(bin_image, id_suffix, alternative_image) diff --git a/ocrd_cis/ocropy/common.py b/ocrd_cis/ocropy/common.py index c6b7c49d..c5b56ed0 100644 --- a/ocrd_cis/ocropy/common.py +++ b/ocrd_cis/ocropy/common.py @@ -1,4 +1,5 @@ from __future__ import absolute_import +from typing import Optional import warnings import logging @@ -2103,7 +2104,7 @@ def find_topological(): # DSAVE('rlabels_closed', rlabels) return rlabels -def determine_zoom(logger: logging.Logger, page_id: str, dpi: float, page_image_info: OcrdExif) -> float: +def determine_zoom(logger: logging.Logger, page_id: Optional[str], dpi: float, page_image_info: OcrdExif) -> float: if dpi > 0: zoom = 300.0/dpi elif page_image_info.resolution != 1: From 516ce4ba4bd4f65dae975472b5632d8d3b6027c2 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> Date: Fri, 16 Aug 2024 00:58:16 +0200 Subject: [PATCH 38/97] binarize: 
use final v3 API --- ocrd_cis/ocropy/binarize.py | 69 +++++++++++++++++-------------------- 1 file changed, 32 insertions(+), 37 deletions(-) diff --git a/ocrd_cis/ocropy/binarize.py b/ocrd_cis/ocropy/binarize.py index 7478edb5..fa47e139 100644 --- a/ocrd_cis/ocropy/binarize.py +++ b/ocrd_cis/ocropy/binarize.py @@ -6,17 +6,15 @@ from PIL import Image from os.path import abspath, dirname, join -from typing import Tuple +from typing import Union, Optional #import kraken.binarization -from ocrd_utils import ( - getLogger, - make_file_id, - MIMETYPE_PAGE -) -from ocrd_models.ocrd_page import AlternativeImageType, OcrdPage, to_xml +from ocrd_utils import getLogger +from ocrd_models.ocrd_page import AlternativeImageType +from ocrd_models import OcrdFile, ClientSideOcrdFile, OcrdPage from ocrd import Processor +from ocrd.processor import OcrdPageResult, OcrdPageResultImage from . import common from .common import array2pil, determine_zoom, pil2array, remove_noise @@ -71,7 +69,7 @@ def setup(self): self.logger.critical(f'Requested method {method} does not support grayscale normalized output') raise ValueError('only method=ocropy allows grayscale=true') - def process_page_pcgts(self, *input_pcgts, output_file_id: str = None, page_id: str = None) -> OcrdPage: + def process_page_pcgts(self, *input_pcgts: Optional[Union[OcrdFile, ClientSideOcrdFile]], page_id: str = None) -> OcrdPageResult: """Binarize (and optionally deskew/despeckle) the pages/regions/lines of the workspace. Iterate over the PAGE-XML element hierarchy down to the requested @@ -90,7 +88,8 @@ def process_page_pcgts(self, *input_pcgts, output_file_id: str = None, page_id: Reference each new image in the AlternativeImage of the element. - Return a PAGE-XML with AlternativeImage and the arguments for ``workspace.save_image_file``. + Return a PAGE-XML with new AlternativeImage(s) and the arguments + for ``workspace.save_image_file``. 
""" level = self.parameter['level-of-operation'] assert self.workspace @@ -103,10 +102,10 @@ def process_page_pcgts(self, *input_pcgts, output_file_id: str = None, page_id: page_image, page_xywh, page_image_info = self.workspace.image_from_page(page, page_id, feature_filter='binarized') zoom = determine_zoom(self.logger, page_id, self.parameter['dpi'], page_image_info) - ret = [pcgts] + ret = OcrdPageResult(pcgts) if level == 'page': try: - ret.append(self.process_page(page, page_image, page_xywh, zoom, page_id, output_file_id)) + ret.images.append(self.process_page(page, page_image, page_xywh, zoom, page_id)) except ValueError as e: self.logger.error(e) else: @@ -121,7 +120,7 @@ def process_page_pcgts(self, *input_pcgts, output_file_id: str = None, page_id: region, page_image, page_xywh, feature_filter='binarized') if level == 'region': try: - ret.append(self.process_region(region, region_image, region_xywh, zoom, region.id, output_file_id)) + ret.images.append(self.process_region(region, region_image, region_xywh, zoom, region.id)) continue except ValueError as e: self.logger.error(e) @@ -132,16 +131,15 @@ def process_page_pcgts(self, *input_pcgts, output_file_id: str = None, page_id: line_image, line_xywh = self.workspace.image_from_segment( line, region_image, region_xywh, feature_filter='binarized') try: - ret.append(self.process_line(line, line_image, line_xywh, zoom, page_id, region.id, output_file_id)) + ret.images.append(self.process_line(line, line_image, line_xywh, zoom, page_id, region.id)) except ValueError as e: self.logger.error(e) return ret - def process_page(self, page, page_image, page_xywh, zoom, page_id, file_id) -> Tuple[Image.Image, str, str]: + def process_page(self, page, page_image, page_xywh, zoom, page_id) -> OcrdPageResultImage: if not page_image.width or not page_image.height: raise ValueError(f"Skipping page '{page_id}' with zero size") self.logger.info(f"About to binarize page '{page_id}'") - assert self.output_file_grp features = page_xywh['features'] if 'angle' in page_xywh and page_xywh['angle']: @@ -171,18 +169,17 @@ def process_page(self, page, page_image, page_xywh, zoom, page_id, file_id) -> T orientation = 180 - (180 - orientation) % 360 # map to [-179.999,180] page.set_orientation(orientation) if self.parameter['grayscale']: - file_id += '.IMG-NRM' + suffix = '.IMG-NRM' features += ',grayscale_normalized' else: - file_id += '.IMG-BIN' + suffix = '.IMG-BIN' features += ',binarized' - bin_image_id = f'{file_id}.IMG-BIN' - bin_image_path = join(self.output_file_grp, f'{bin_image_id}.png') # update PAGE (reference the image file): - page.add_AlternativeImage(AlternativeImageType(filename=bin_image_path, comments=features)) - return bin_image, bin_image_id, bin_image_path + alt_image = AlternativeImageType(comments=features) + page.add_AlternativeImage(alt_image) + return OcrdPageResultImage(bin_image, suffix, alt_image) - def process_region(self, region, region_image, region_xywh, zoom, page_id, file_id) -> Tuple[Image.Image, str, str]: + def process_region(self, region, region_image, region_xywh, zoom, page_id) -> OcrdPageResultImage: if not region_image.width or not region_image.height: raise ValueError(f"Skipping region '{region.id}' with zero size") self.logger.info(f"About to binarize page '{page_id}' region '{region.id}'") @@ -217,21 +214,19 @@ def process_region(self, region, region_image, region_xywh, zoom, page_id, file_ orientation = -region_xywh['angle'] orientation = 180 - (180 - orientation) % 360 # map to [-179.999,180] 
region.set_orientation(orientation) - bin_image_id = f'{file_id}_{region.id}' + suffix = region.id if self.parameter['grayscale']: - bin_image_id += '.IMG-NRM' + suffix += '.IMG-NRM' features += ',grayscale_normalized' else: - bin_image_id += '.IMG-BIN' + suffix += '.IMG-BIN' features += ',binarized' - bin_image_path = join(self.output_file_grp, f'{bin_image_id}.png') # update PAGE (reference the image file): - region.add_AlternativeImage(AlternativeImageType(filename=bin_image_path, comments=features)) - return bin_image, bin_image_id, bin_image_path + alt_image = AlternativeImageType(comments=features) + region.add_AlternativeImage(alt_image) + return OcrdPageResultImage(bin_image, suffix, alt_image) - def process_line( - self, line, line_image, line_xywh, zoom, page_id, region_id, file_id - ) -> Tuple[Image.Image, str, str]: + def process_line(self, line, line_image, line_xywh, zoom, page_id, region_id) -> OcrdPageResultImage: if not line_image.width or not line_image.height: raise ValueError(f"Skipping line '{line.id}' with zero size") self.logger.info(f"About to binarize page '{page_id}' region '{region_id}' line '{line.id}'") @@ -256,14 +251,14 @@ def process_line( bin_image = remove_noise(bin_image, maxsize=self.parameter['noise_maxsize']) if self.parameter['noise_maxsize']: features += ',despeckled' - bin_image_id = f'{file_id}_{region_id}_{line.id}' + suffix = f'{region_id}_{line.id}' if self.parameter['grayscale']: - bin_image_id += '.IMG-NRM' + suffix += '.IMG-NRM' features += ',grayscale_normalized' else: - bin_image_id += '.IMG-BIN' + suffix += '.IMG-BIN' features += ',binarized' - bin_image_path = join(self.output_file_grp, f'{bin_image_id}.png') # update PAGE (reference the image file): - line.add_AlternativeImage(AlternativeImageType(filename=bin_image_path, comments=features)) - return bin_image, bin_image_id, bin_image_path + alt_image = AlternativeImageType(comments=features) + line.add_AlternativeImage(alt_image) + return OcrdPageResultImage(bin_image, suffix, alt_image) From 2e4f26f04ec5b2070a0396015d4339493e365fa1 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> Date: Fri, 16 Aug 2024 01:05:17 +0200 Subject: [PATCH 39/97] binarize: use correct types --- ocrd_cis/ocropy/binarize.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/ocrd_cis/ocropy/binarize.py b/ocrd_cis/ocropy/binarize.py index fa47e139..ac499336 100644 --- a/ocrd_cis/ocropy/binarize.py +++ b/ocrd_cis/ocropy/binarize.py @@ -11,8 +11,7 @@ #import kraken.binarization from ocrd_utils import getLogger -from ocrd_models.ocrd_page import AlternativeImageType -from ocrd_models import OcrdFile, ClientSideOcrdFile, OcrdPage +from ocrd_models.ocrd_page import AlternativeImageType, OcrdPage from ocrd import Processor from ocrd.processor import OcrdPageResult, OcrdPageResultImage @@ -69,7 +68,7 @@ def setup(self): self.logger.critical(f'Requested method {method} does not support grayscale normalized output') raise ValueError('only method=ocropy allows grayscale=true') - def process_page_pcgts(self, *input_pcgts: Optional[Union[OcrdFile, ClientSideOcrdFile]], page_id: str = None) -> OcrdPageResult: + def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: str = None) -> OcrdPageResult: """Binarize (and optionally deskew/despeckle) the pages/regions/lines of the workspace. 
Iterate over the PAGE-XML element hierarchy down to the requested From 21be94106ac55d001cb5729f21138fb9c7715bcb Mon Sep 17 00:00:00 2001 From: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> Date: Fri, 16 Aug 2024 01:12:04 +0200 Subject: [PATCH 40/97] clip: use final v3 API --- ocrd_cis/ocropy/clip.py | 27 ++++++++++++--------------- 1 file changed, 12 insertions(+), 15 deletions(-) diff --git a/ocrd_cis/ocropy/clip.py b/ocrd_cis/ocropy/clip.py index 400e9b54..d0119544 100644 --- a/ocrd_cis/ocropy/clip.py +++ b/ocrd_cis/ocropy/clip.py @@ -8,19 +8,17 @@ from shapely.geometry import Polygon from shapely.prepared import prep -from ocrd_modelfactory import page_from_file -from ocrd_models.ocrd_page import AlternativeImageType, OcrdPage, to_xml +from ocrd_models.ocrd_page import AlternativeImageType, OcrdPage from ocrd import Processor +from ocrd.processor import OcrdPageResult, OcrdPageResultImage from ocrd_utils import ( getLogger, - make_file_id, coordinates_of_segment, polygon_from_points, bbox_from_polygon, image_from_polygon, polygon_mask, crop_image, - MIMETYPE_PAGE ) from .ocrolib import midrange, morph @@ -38,7 +36,7 @@ def executable(self): def setup(self): self.logger = getLogger('processor.OcropyClip') - def process_page_pcgts(self, *input_pcgts, output_file_id: str = None, page_id: str = None) -> OcrdPage: + def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: str = None) -> OcrdPageResult: """Clip text regions / lines of a page at intersections with neighbours. Open and deserialize PAGE input file and its respective image, @@ -85,7 +83,7 @@ def process_page_pcgts(self, *input_pcgts, output_file_id: str = None, page_id: page, page_id, feature_selector='binarized') # The zoom is not used anywhere zoom = determine_zoom(self.logger, page_id, self.parameter['dpi'], page_image_info) - ret = [pcgts] + ret = OcrdPageResult(pcgts) # FIXME: what about text regions inside table regions? 
regions = list(page.get_TextRegion()) @@ -141,9 +139,9 @@ def process_page_pcgts(self, *input_pcgts, output_file_id: str = None, page_id: if shape.intersects(shapej)] if neighbours: segment_region_file_id = f"{output_file_id}_{region.id}" - ret.append(self.process_segment( + ret.images.append(self.process_segment( region, masks[i], polygons[i], neighbours, background_image, - page_image, page_xywh, page_bin, page_id, segment_region_file_id)) + page_image, page_xywh, page_bin, page_id)) continue # level == 'line': lines = region.get_TextLine() @@ -172,14 +170,14 @@ def process_page_pcgts(self, *input_pcgts, output_file_id: str = None, page_id: if shape.intersects(shapej)] if neighbours: segment_line_file_id = f"{output_file_id}_{region.id}_{line.id}" - ret.append(self.process_segment( + ret.images.append(self.process_segment( line, masks[j], polygons[j], neighbours, background_image, - region_image, region_coords, region_bin, page_id, segment_line_file_id)) + region_image, region_coords, region_bin, page_id)) return ret def process_segment(self, segment, segment_mask, segment_polygon, neighbours, background_image, parent_image, parent_coords, parent_bin, - page_id, file_id) -> Tuple[Image.Image, str, str]: + page_id) -> OcrdPageResultImage: # initialize AlternativeImage@comments classes from parent, except # for those operations that can apply on multiple hierarchy levels: features = ','.join( @@ -217,8 +215,7 @@ def process_segment(self, segment, segment_mask, segment_polygon, neighbours, # recrop segment into rectangle, just as image_from_segment would do # (and also clipping with background colour): segment_image = crop_image(segment_image,box=segment_bbox) - segment_image_id = file_id + '.IMG-CLIP' - segment_image_path = join(self.output_file_grp, f'{segment_image_id}.png') # update PAGE (reference the image file): - segment.add_AlternativeImage(AlternativeImageType(filename=segment_image_path, comments=features)) - return segment_image, segment_image_id, segment_image_path + alternative_image = AlternativeImageType(comments=features) + segment.add_AlternativeImage(alternative_image) + return OcrdPageResultImage(segment_image, '.IMG-CLIP', alternative_image) From 9539ac9620776e335bbe107e57e92742027f02b3 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> Date: Fri, 16 Aug 2024 01:12:51 +0200 Subject: [PATCH 41/97] clip: use correct types --- ocrd_cis/ocropy/clip.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ocrd_cis/ocropy/clip.py b/ocrd_cis/ocropy/clip.py index d0119544..3ddd6a70 100644 --- a/ocrd_cis/ocropy/clip.py +++ b/ocrd_cis/ocropy/clip.py @@ -1,6 +1,6 @@ from __future__ import absolute_import from logging import Logger -from typing import Tuple +from typing import Optional from os.path import join import numpy as np From 734b5eb4ef9bfee2e24d8053966b17eaf6e9e1f9 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> Date: Fri, 16 Aug 2024 01:14:56 +0200 Subject: [PATCH 42/97] recognize: use final v3 API --- ocrd_cis/ocropy/recognize.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/ocrd_cis/ocropy/recognize.py b/ocrd_cis/ocropy/recognize.py index bbb8e415..7e4f2957 100644 --- a/ocrd_cis/ocropy/recognize.py +++ b/ocrd_cis/ocropy/recognize.py @@ -11,18 +11,16 @@ from ocrd_utils import ( getLogger, - make_file_id, coordinates_for_segment, polygon_from_bbox, points_from_polygon, - MIMETYPE_PAGE ) -from ocrd_modelfactory import page_from_file from 
ocrd_models.ocrd_page import ( - to_xml, TextEquivType, OcrdPage, + TextEquivType, OcrdPage, CoordsType, GlyphType, WordType ) from ocrd import Processor +from ocrd.processor import OcrdPageResult from .common import check_line, pil2array from .ocrolib import lstm, load_object, midrange @@ -116,7 +114,7 @@ def get_model(self): f"Could not find model {p_model}. Try 'ocrd resmgr download ocrd-cis-ocropy-recognize {p_model}") exit(1) - def process_page_pcgts(self, *input_pcgts, output_file_id: str = None, page_id: str = None) -> OcrdPage: + def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: str = None) -> OcrdPageResult: """Recognize lines / words / glyphs of a page. Open and deserialize the PAGE input file and its respective image, @@ -156,7 +154,7 @@ def process_page_pcgts(self, *input_pcgts, output_file_id: str = None, page_id: if not regions: self.logger.warning(f"Page '{page_id}' contains no text regions") self.process_regions(regions, max_level, page_image, page_xywh) - return [pcgts] + return OcrdPageResult(pcgts) def process_regions(self, regions, maxlevel, page_image, page_xywh): edits = 0 From 28ad585c94f9895b3f5011a72aabf36b73d71a8e Mon Sep 17 00:00:00 2001 From: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> Date: Fri, 16 Aug 2024 02:20:58 +0200 Subject: [PATCH 43/97] recognize: fix typing import --- ocrd_cis/ocropy/recognize.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ocrd_cis/ocropy/recognize.py b/ocrd_cis/ocropy/recognize.py index 7e4f2957..97fcc64d 100644 --- a/ocrd_cis/ocropy/recognize.py +++ b/ocrd_cis/ocropy/recognize.py @@ -1,7 +1,8 @@ from __future__ import absolute_import + from logging import Logger from sys import exit -from typing import Any +from typing import Any, Optional from os import access, R_OK from os.path import abspath, dirname, isfile, join import numpy as np From 9a7c10ab71f7df3783f44848536aa99dd9c8e483 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> Date: Fri, 16 Aug 2024 02:31:27 +0200 Subject: [PATCH 44/97] denoise: adapt to final v3 API --- ocrd_cis/ocropy/denoise.py | 122 +++++++++++++++---------------------- 1 file changed, 49 insertions(+), 73 deletions(-) diff --git a/ocrd_cis/ocropy/denoise.py b/ocrd_cis/ocropy/denoise.py index cc622c24..0f368fd5 100644 --- a/ocrd_cis/ocropy/denoise.py +++ b/ocrd_cis/ocropy/denoise.py @@ -1,17 +1,15 @@ from __future__ import absolute_import + +from typing import Optional from logging import Logger from os.path import join -from ocrd_utils import ( - getLogger, - make_file_id, - MIMETYPE_PAGE -) -from ocrd_modelfactory import page_from_file +from ocrd_utils import getLogger from ocrd_models.ocrd_page import ( - to_xml, AlternativeImageType + AlternativeImageType, OcrdPage ) from ocrd import Processor +from ocrd.processor import OcrdPageResult, OcrdPageResultImage from .common import ( # binarize, @@ -27,10 +25,10 @@ def executable(self): def setup(self): self.logger = getLogger('processor.OcropyDenoise') - def process(self): + def process_page_pcgts(self, *input_pcgts : Optional[OcrdPage], page_id : Optional[str] = None) -> OcrdPageResult: """Despeckle the pages / regions / lines of the workspace. - Open and deserialise PAGE input files and their respective images, + Open and deserialise PAGE input file and its respective images, then iterate over the element hierarchy down to the requested ``level-of-operation``. 
@@ -49,73 +47,51 @@ def process(self): Produce a new output file by serialising the resulting hierarchy. """ level = self.parameter['level-of-operation'] - - for (n, input_file) in enumerate(self.input_files): - self.logger.info("INPUT FILE %i / %s", n, input_file.pageId or input_file.ID) - file_id = make_file_id(input_file, self.output_file_grp) - - pcgts = page_from_file(self.workspace.download_file(input_file)) - self.add_metadata(pcgts) - page_id = pcgts.pcGtsId or input_file.pageId or input_file.ID # (PageType has no id) - page = pcgts.get_Page() - - page_image, page_xywh, page_image_info = self.workspace.image_from_page( - page, page_id, - feature_selector='binarized' if level == 'page' else '') - - zoom = determine_zoom(self.logger, page_id, self.parameter['dpi'], page_image_info) - - if level == 'page': - self.process_segment(page, page_image, page_xywh, zoom, - input_file.pageId, file_id) - else: - regions = page.get_AllRegions(classes=['Text'], order='reading-order') - if not regions: - self.logger.warning('Page "%s" contains no text regions', page_id) - for region in regions: - region_image, region_xywh = self.workspace.image_from_segment( - region, page_image, page_xywh, - feature_selector='binarized' if level == 'region' else '') - if level == 'region': - self.process_segment(region, region_image, region_xywh, zoom, - input_file.pageId, file_id + '_' + region.id) - continue - lines = region.get_TextLine() - if not lines: - self.logger.warning('Page "%s" region "%s" contains no text lines', page_id, region.id) - for line in lines: - line_image, line_xywh = self.workspace.image_from_segment( - line, region_image, region_xywh, - feature_selector='binarized') - self.process_segment(line, line_image, line_xywh, zoom, - input_file.pageId, - file_id + '_' + region.id + '_' + line.id) - - # update METS (add the PAGE file): - file_path = join(self.output_file_grp, file_id + '.xml') - pcgts.set_pcGtsId(file_id) - out = self.workspace.add_file( - ID=file_id, - file_grp=self.output_file_grp, - pageId=input_file.pageId, - local_filename=file_path, - mimetype=MIMETYPE_PAGE, - content=to_xml(pcgts)) - self.logger.info('created file ID: %s, file_grp: %s, path: %s', - file_id, self.output_file_grp, out.local_filename) - - def process_segment(self, segment, segment_image, segment_xywh, zoom, page_id, file_id): + pcgts = input_pcgts[0] + result = OcrdPageResult(pcgts) + page = pcgts.get_Page() + + page_image, page_xywh, page_image_info = self.workspace.image_from_page( + page, page_id, + feature_selector='binarized' if level == 'page' else '') + zoom = determine_zoom(self.logger, page_id, self.parameter['dpi'], page_image_info) + + if level == 'page': + image = self.process_segment(page, page_image, page_xywh, zoom) + if image: + result.images.append(image) + else: + regions = page.get_AllRegions(classes=['Text'], order='reading-order') + if not regions: + self.logger.warning('Page "%s" contains no text regions', page_id) + for region in regions: + region_image, region_xywh = self.workspace.image_from_segment( + region, page_image, page_xywh, + feature_selector='binarized' if level == 'region' else '') + if level == 'region': + image = self.process_segment(region, region_image, region_xywh, zoom) + if image: + result.images.append(image) + continue + lines = region.get_TextLine() + if not lines: + self.logger.warning('Page "%s" region "%s" contains no text lines', page_id, region.id) + for line in lines: + line_image, line_xywh = self.workspace.image_from_segment( + line, region_image, 
region_xywh, + feature_selector='binarized') + image = self.process_segment(line, line_image, line_xywh, zoom) + if image: + result.images.append(image) + + def process_segment(self, segment, segment_image, segment_xywh, zoom) -> Optional[OcrdPageResultImage]: if not segment_image.width or not segment_image.height: self.logger.warning("Skipping '%s' with zero size", file_id) - return + return None self.logger.info("About to despeckle '%s'", file_id) bin_image = remove_noise(segment_image, maxsize=self.parameter['noise_maxsize']/zoom*300/72) # in pt - # update METS (add the image file): - file_path = self.workspace.save_image_file( - bin_image, file_id + '.IMG-DESPECK', self.output_file_grp, - page_id=page_id) # update PAGE (reference the image file): - segment.add_AlternativeImage(AlternativeImageType( - filename=file_path, - comments=segment_xywh['features'] + ',despeckled')) + alt_image = AlternativeImageType(comments=segment_xywh['features'] + ',despeckled') + segment.add_AlternativeImage(alt_image) + return OcrdPageResultImage(bin_image, segment.id + '.IMG-DESPECK', alt_image) From 7c9f39fa4516401fe17e24d3ca67799c5b85d308 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> Date: Fri, 16 Aug 2024 02:40:41 +0200 Subject: [PATCH 45/97] deskew: adapt to final v3 API --- ocrd_cis/ocropy/deskew.py | 116 +++++++++++++++----------------------- 1 file changed, 47 insertions(+), 69 deletions(-) diff --git a/ocrd_cis/ocropy/deskew.py b/ocrd_cis/ocropy/deskew.py index 616864e1..fae0c90c 100644 --- a/ocrd_cis/ocropy/deskew.py +++ b/ocrd_cis/ocropy/deskew.py @@ -1,24 +1,21 @@ from __future__ import absolute_import + +from typing import Optional from logging import Logger from os.path import join -from ocrd_utils import ( - getLogger, - make_file_id, - MIMETYPE_PAGE -) -from ocrd_modelfactory import page_from_file +from ocrd_utils import getLogger from ocrd_models.ocrd_page import ( PageType, - to_xml, AlternativeImageType + AlternativeImageType, + OcrdPage ) from ocrd import Processor +from ocrd.processor import OcrdPageResult, OcrdPageResultImage from . import common from .common import pil2array -#sys.path.append(os.path.dirname(os.path.abspath(__file__))) - def deskew(pil_image, maxskew=2): array = pil2array(pil_image) _, angle = common.binarize(array, maxskew=maxskew) @@ -34,10 +31,10 @@ def executable(self): def setup(self): self.logger = getLogger('processor.OcropyDeskew') - def process(self): + def process_page_pcgts(self, *input_pcgts : Optional[OcrdPage], page_id : Optional[str] = None) -> OcrdPageResult: """Deskew the pages or regions of the workspace. - Open and deserialise PAGE input files and their respective images, + Open and deserialise PAGE input file and its respective images, then iterate over the element hierarchy down to the TextRegion level. Next, for each file, crop each region image according to the layout @@ -53,62 +50,45 @@ def process(self): Produce a new output file by serialising the resulting hierarchy. 
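
Note on the unit conversion in the despeckling call of patch 44 above: noise_maxsize is given in pt, while remove_noise expects px. Assuming zoom works out to 300/DPI (determine_zoom itself is not shown in this series), the expression noise_maxsize/zoom*300/72 is simply pt converted to px at the image's resolution:

    # e.g. noise_maxsize = 3.0 pt on a 300 DPI scan:
    noise_maxsize = 3.0
    dpi = 300
    zoom = 300 / dpi                                  # assumed definition; 1.0 here
    maxsize_px = noise_maxsize / zoom * 300 / 72      # == noise_maxsize * dpi / 72
    print(maxsize_px)                                 # 12.5 px
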
""" level = self.parameter['level-of-operation'] - - for (n, input_file) in enumerate(self.input_files): - self.logger.info("INPUT FILE %i / %s", n, input_file.pageId or input_file.ID) - file_id = make_file_id(input_file, self.output_file_grp) - - pcgts = page_from_file(self.workspace.download_file(input_file)) - self.add_metadata(pcgts) - page_id = pcgts.pcGtsId or input_file.pageId or input_file.ID # (PageType has no id) - page = pcgts.get_Page() - - page_image, page_coords, _ = self.workspace.image_from_page( - page, page_id, + pcgts = input_pcgts[0] + result = OcrdPageResult(pcgts) + page = pcgts.get_Page() + + page_image, page_coords, _ = self.workspace.image_from_page( + page, page_id, + # image must not have been rotated already, + # (we will overwrite @orientation anyway,) + # abort if no such image can be produced: + feature_filter='deskewed' if level == 'page' else '') + if level == 'page': + image = self._process_segment(page, page_image, page_coords, "page '%s'" % page_id, page_id) + if image: + result.images.append(image) + return result + if level == 'table': + regions = page.get_TableRegion() + else: # region + regions = page.get_AllRegions(classes=['Text'], order='reading-order') + if not regions: + self.logger.warning('Page "%s" contains no text regions', page_id) + for region in regions: + # process region: + region_image, region_coords = self.workspace.image_from_segment( + region, page_image, page_coords, # image must not have been rotated already, # (we will overwrite @orientation anyway,) # abort if no such image can be produced: - feature_filter='deskewed' if level == 'page' else '') - if level == 'page': - self._process_segment(page, page_image, page_coords, - "page '%s'" % page_id, input_file.pageId, - file_id) - else: - if level == 'table': - regions = page.get_TableRegion() - else: # region - regions = page.get_AllRegions(classes=['Text'], order='reading-order') - if not regions: - self.logger.warning('Page "%s" contains no text regions', page_id) - for region in regions: - # process region: - region_image, region_coords = self.workspace.image_from_segment( - region, page_image, page_coords, - # image must not have been rotated already, - # (we will overwrite @orientation anyway,) - # abort if no such image can be produced: - feature_filter='deskewed') - self._process_segment(region, region_image, region_coords, - "region '%s'" % region.id, input_file.pageId, - file_id + '_' + region.id) - - # update METS (add the PAGE file): - file_path = join(self.output_file_grp, file_id + '.xml') - pcgts.set_pcGtsId(file_id) - out = self.workspace.add_file( - ID=file_id, - file_grp=self.output_file_grp, - pageId=input_file.pageId, - local_filename=file_path, - mimetype=MIMETYPE_PAGE, - content=to_xml(pcgts)) - self.logger.info('created file ID: %s, file_grp: %s, path: %s', - file_id, self.output_file_grp, out.local_filename) - - def _process_segment(self, segment, segment_image, segment_coords, segment_id, page_id, file_id): + feature_filter='deskewed') + image = self._process_segment(region, region_image, region_coords, + "region '%s'" % region.id, page_id) + if image: + result.images.append(image) + return result + + def _process_segment(self, segment, segment_image, segment_coords, segment_id, page_id) -> Optional[OcrdPageResultImage]: if not segment_image.width or not segment_image.height: self.logger.warning("Skipping %s with zero size", segment_id) - return + return None angle0 = segment_coords['angle'] # deskewing (w.r.t. 
top image) already applied to segment_image self.logger.info("About to deskew %s", segment_id) angle = deskew(segment_image, maxskew=self.parameter['maxskew']) # additional angle to be applied @@ -123,20 +103,18 @@ def _process_segment(self, segment, segment_image, segment_coords, segment_id, p segment_image, segment_coords, _ = self.workspace.image_from_page( segment, page_id, fill='background', transparency=True) + suffix = '.IMG-DESKEW' else: segment_image, segment_coords = self.workspace.image_from_segment( segment, segment_image, segment_coords, fill='background', transparency=True) + suffix = segment.id + '.IMG-DESKEW' if not angle: # zero rotation does not change coordinates, # but assures consuming processors that the # workflow had deskewing segment_coords['features'] += ',deskewed' - # update METS (add the image file): - file_path = self.workspace.save_image_file( - segment_image, file_id + '.IMG-DESKEW', self.output_file_grp, - page_id=page_id) # update PAGE (reference the image file): - segment.add_AlternativeImage(AlternativeImageType( - filename=file_path, - comments=segment_coords['features'])) + alternative = AlternativeImageType(comments=segment_coords['features']) + segment.add_AlternativeImage(alternative) + return OcrdPageResultImage(segment_image, suffix, alternative) From 669866857395544ed10c0fbda5ea03abd1b31f14 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> Date: Fri, 16 Aug 2024 02:52:55 +0200 Subject: [PATCH 46/97] dewarp: adapt to final v3 API --- ocrd_cis/ocropy/dewarp.py | 129 +++++++++++++++----------------------- 1 file changed, 50 insertions(+), 79 deletions(-) diff --git a/ocrd_cis/ocropy/dewarp.py b/ocrd_cis/ocropy/dewarp.py index 72efca45..a063a05e 100644 --- a/ocrd_cis/ocropy/dewarp.py +++ b/ocrd_cis/ocropy/dewarp.py @@ -1,24 +1,22 @@ from __future__ import absolute_import + +from typing import Optional from logging import Logger from os.path import join + import numpy as np -from ocrd_utils import ( - getLogger, - make_file_id, -) -from ocrd_modelfactory import page_from_file +from ocrd_utils import getLogger from ocrd_models.ocrd_page import ( - to_xml, AlternativeImageType + AlternativeImageType, + OcrdPage ) from ocrd import Processor -from ocrd_utils import MIMETYPE_PAGE +from ocrd.processor import OcrdPageResult, OcrdPageResultImage from .ocrolib import lineest from .common import array2pil, check_line, determine_zoom, pil2array -#sys.path.append(os.path.dirname(os.path.abspath(__file__))) - class InvalidLine(Exception): """Line image does not allow dewarping and should be ignored.""" @@ -80,10 +78,10 @@ def setup(self): # and extra params) 0.3)) - def process(self): + def process_page_pcgts(self, *input_pcgts : Optional[OcrdPage], page_id : Optional[str] = None) -> OcrdPageResult: """Dewarp the lines of the workspace. - Open and deserialise PAGE input files and their respective images, + Open and deserialise PAGE input file and its respective images, then iterate over the element hierarchy down to the TextLine level. Next, get each line image according to the layout annotation (from @@ -99,71 +97,44 @@ def process(self): Produce a new output file by serialising the resulting hierarchy. 
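
Note: the zero-rotation branch above still appends ',deskewed' to the features because consuming processors select or reject images purely by these feature strings. A sketch of both directions, using the same workspace calls that appear throughout this series (page and page_id as in the hunks above):

    # require an already-deskewed variant:
    page_image, page_coords, _ = self.workspace.image_from_page(
        page, page_id, feature_selector='deskewed')
    # or refuse one, e.g. for a step that insists on doing its own rotation:
    page_image, page_coords, _ = self.workspace.image_from_page(
        page, page_id, feature_filter='deskewed')
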
""" - - for (n, input_file) in enumerate(self.input_files): - self.logger.info("INPUT FILE %i / %s", n, input_file.pageId or input_file.ID) - file_id = make_file_id(input_file, self.output_file_grp) - - pcgts = page_from_file(self.workspace.download_file(input_file)) - self.add_metadata(pcgts) - page_id = pcgts.pcGtsId or input_file.pageId or input_file.ID # (PageType has no id) - page = pcgts.get_Page() - - page_image, page_xywh, page_image_info = self.workspace.image_from_page( - page, page_id) - - zoom = determine_zoom(self.logger, page_id, self.parameter['dpi'], page_image_info) - - regions = page.get_AllRegions(classes=['Text'], order='reading-order') - if not regions: - self.logger.warning('Page "%s" contains no text regions', page_id) - for region in regions: - region_image, region_xywh = self.workspace.image_from_segment( - region, page_image, page_xywh) - - lines = region.get_TextLine() - if not lines: - self.logger.warning('Region %s contains no text lines', region.id) - for line in lines: - line_image, line_xywh = self.workspace.image_from_segment( - line, region_image, region_xywh) - - self.logger.info("About to dewarp page '%s' region '%s' line '%s'", - page_id, region.id, line.id) - try: - dew_image = dewarp(line_image, self.lnorm, check=True, - max_neighbour=self.parameter['max_neighbour'], - zoom=zoom) - except InvalidLine as err: - self.logger.error('cannot dewarp line "%s": %s', line.id, err) - continue - except InadequateLine as err: - self.logger.warning('cannot dewarp line "%s": %s', line.id, err) - # as a fallback, simply pad the image vertically - # (just as dewarping would do on average, so at least - # this line has similar margins as the others): - dew_image = padvert(line_image, self.parameter['range']) - # update METS (add the image file): - file_path = self.workspace.save_image_file( - dew_image, - file_id + '_' + region.id + '_' + line.id + '.IMG-DEWARP', - self.output_file_grp, - page_id=input_file.pageId) - # update PAGE (reference the image file): - alternative_image = line.get_AlternativeImage() - line.add_AlternativeImage(AlternativeImageType( - filename=file_path, - comments=line_xywh['features'] + ',dewarped')) - - # update METS (add the PAGE file): - file_path = join(self.output_file_grp, file_id + '.xml') - pcgts.set_pcGtsId(file_id) - out = self.workspace.add_file( - ID=file_id, - file_grp=self.output_file_grp, - pageId=input_file.pageId, - local_filename=file_path, - mimetype=MIMETYPE_PAGE, - content=to_xml(pcgts)) - self.logger.info('created file ID: %s, file_grp: %s, path: %s', - file_id, self.output_file_grp, out.local_filename) + pcgts = input_pcgts[0] + result = OcrdPageResult(pcgts) + page = pcgts.get_Page() + + page_image, page_xywh, page_image_info = self.workspace.image_from_page( + page, page_id) + zoom = determine_zoom(self.logger, page_id, self.parameter['dpi'], page_image_info) + + regions = page.get_AllRegions(classes=['Text'], order='reading-order') + if not regions: + self.logger.warning('Page "%s" contains no text regions', page_id) + for region in regions: + region_image, region_xywh = self.workspace.image_from_segment( + region, page_image, page_xywh) + + lines = region.get_TextLine() + if not lines: + self.logger.warning('Region %s contains no text lines', region.id) + for line in lines: + line_image, line_xywh = self.workspace.image_from_segment( + line, region_image, region_xywh) + + self.logger.info("About to dewarp page '%s' region '%s' line '%s'", + page_id, region.id, line.id) + try: + dew_image = dewarp(line_image, 
self.lnorm, check=True, + max_neighbour=self.parameter['max_neighbour'], + zoom=zoom) + except InvalidLine as err: + self.logger.error('cannot dewarp line "%s": %s', line.id, err) + continue + except InadequateLine as err: + self.logger.warning('cannot dewarp line "%s": %s', line.id, err) + # as a fallback, simply pad the image vertically + # (just as dewarping would do on average, so at least + # this line has similar margins as the others): + dew_image = padvert(line_image, self.parameter['range']) + # update PAGE (reference the image file): + alt_image = AlternativeImageType(comments=line_xywh['features'] + ',dewarped') + line.add_AlternativeImage(alternative_image) + return OcrdPageResultImage(dew_image, region.id + '_' + line.id + '.IMG-DEWARP', alt_image) From 48a3146a4e510b14899aafc80c7f9f05da05fc48 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> Date: Fri, 16 Aug 2024 03:07:40 +0200 Subject: [PATCH 47/97] resegment: adapt to final v3 API --- ocrd_cis/ocropy/resegment.py | 109 +++++++++++++++-------------------- 1 file changed, 45 insertions(+), 64 deletions(-) diff --git a/ocrd_cis/ocropy/resegment.py b/ocrd_cis/ocropy/resegment.py index 1e9f8c7f..05f17d4f 100644 --- a/ocrd_cis/ocropy/resegment.py +++ b/ocrd_cis/ocropy/resegment.py @@ -1,24 +1,25 @@ from __future__ import absolute_import + +from typing import Optional from logging import Logger from os.path import join + import numpy as np from skimage import draw, segmentation from shapely.geometry import Polygon, LineString from shapely.prepared import prep -from ocrd_modelfactory import page_from_file -from ocrd_models.ocrd_page import BaselineType, PageType, to_xml -from ocrd import Processor from ocrd_utils import ( getLogger, - make_file_id, coordinates_of_segment, coordinates_for_segment, points_from_polygon, polygon_from_points, transform_coordinates, - MIMETYPE_PAGE ) +from ocrd_models.ocrd_page import BaselineType, PageType, OcrdPage +from ocrd import Processor +from ocrd.processor import OcrdPageResult from .ocrolib import midrange, morph from .common import ( @@ -52,10 +53,10 @@ def executable(self): def setup(self): self.logger = getLogger('processor.OcropyResegment') - def process(self): + def process_page_pcgts(self, *input_pcgts : Optional[OcrdPage], page_id : Optional[str] = None) -> OcrdPageResult: """Resegment lines of the workspace. - Open and deserialise PAGE input files and their respective images, + Open and deserialise PAGE input file and its respective images, then iterate over the element hierarchy down to the line level. Next, get the page image according to the layout annotation (from @@ -104,67 +105,47 @@ def process(self): # accuracy crucially depends on a good estimate of the images' # pixel density (at least if source input is not 300 DPI). 
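
Note on the dewarp port of patch 46 above: one derived image arises per text line, so the result object collects many entries, in the same style as the denoise and deskew hunks. A compact sketch of that accumulation inside process_page_pcgts, with names taken from the hunk but otherwise illustrative only:

    result = OcrdPageResult(pcgts)
    for region in regions:
        # ... get region_image / region_xywh as above ...
        for line in region.get_TextLine():
            # ... dewarp (or pad) the line image into dew_image as above ...
            alt_image = AlternativeImageType(comments=line_xywh['features'] + ',dewarped')
            line.add_AlternativeImage(alt_image)
            result.images.append(OcrdPageResultImage(
                dew_image, region.id + '_' + line.id + '.IMG-DEWARP', alt_image))
    return result
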
level = self.parameter['level-of-operation'] + pcgts = input_pcgts[0] + page = pcgts.get_Page() - for n, input_file in enumerate(self.input_files): - self.logger.info("INPUT FILE %i / %s", n, input_file.pageId or input_file.ID) - file_id = make_file_id(input_file, self.output_file_grp) - - pcgts = page_from_file(self.workspace.download_file(input_file)) - self.add_metadata(pcgts) - page_id = pcgts.pcGtsId or input_file.pageId or input_file.ID - page = pcgts.get_Page() - - page_image, page_coords, page_image_info = self.workspace.image_from_page( - page, page_id, feature_selector='binarized') + page_image, page_coords, page_image_info = self.workspace.image_from_page( + page, page_id, feature_selector='binarized') + zoom = determine_zoom(self.logger, page_id, self.parameter['dpi'], page_image_info) - zoom = determine_zoom(self.logger, page_id, self.parameter['dpi'], page_image_info) - - ignore = (page.get_ImageRegion() + - page.get_LineDrawingRegion() + - page.get_GraphicRegion() + - page.get_ChartRegion() + - page.get_MapRegion() + - page.get_MathsRegion() + - page.get_ChemRegion() + - page.get_MusicRegion() + - page.get_AdvertRegion() + - page.get_NoiseRegion() + - page.get_SeparatorRegion() + - page.get_UnknownRegion() + - page.get_CustomRegion()) - regions = page.get_AllRegions(classes=['Text']) - if not regions: - self.logger.warning('Page "%s" contains no text regions', page_id) - elif level == 'page': - lines = [line for region in regions - for line in region.get_TextLine()] + ignore = (page.get_ImageRegion() + + page.get_LineDrawingRegion() + + page.get_GraphicRegion() + + page.get_ChartRegion() + + page.get_MapRegion() + + page.get_MathsRegion() + + page.get_ChemRegion() + + page.get_MusicRegion() + + page.get_AdvertRegion() + + page.get_NoiseRegion() + + page.get_SeparatorRegion() + + page.get_UnknownRegion() + + page.get_CustomRegion()) + regions = page.get_AllRegions(classes=['Text']) + if not regions: + self.logger.warning('Page "%s" contains no text regions', page_id) + elif level == 'page': + lines = [line for region in regions + for line in region.get_TextLine()] + if lines: + self._process_segment(page, page_image, page_coords, page_id, zoom, lines, ignore) + else: + self.logger.warning('Page "%s" contains no text regions with lines', page_id) + else: + for region in regions: + lines = region.get_TextLine() if lines: - self._process_segment(page, page_image, page_coords, page_id, zoom, lines, ignore) + region_image, region_coords = self.workspace.image_from_segment( + region, page_image, page_coords, feature_selector='binarized') + self._process_segment(region, region_image, region_coords, page_id, zoom, lines, ignore) else: - self.logger.warning('Page "%s" contains no text regions with lines', page_id) - else: - for region in regions: - lines = region.get_TextLine() - if lines: - region_image, region_coords = self.workspace.image_from_segment( - region, page_image, page_coords, feature_selector='binarized') - self._process_segment(region, region_image, region_coords, page_id, zoom, lines, ignore) - else: - self.logger.warning('Page "%s" region "%s" contains no text lines', page_id, region.id) - - # update METS (add the PAGE file): - file_path = join(self.output_file_grp, file_id + '.xml') - pcgts.set_pcGtsId(file_id) - out = self.workspace.add_file( - ID=file_id, - file_grp=self.output_file_grp, - pageId=input_file.pageId, - local_filename=file_path, - mimetype=MIMETYPE_PAGE, - content=to_xml(pcgts)) - self.logger.info('created file ID: %s, file_grp: %s, path: %s', - 
file_id, self.output_file_grp, out.local_filename) - + self.logger.warning('Page "%s" region "%s" contains no text lines', page_id, region.id) + return OcrdPageResult(pcgts) + def _process_segment(self, parent, parent_image, parent_coords, page_id, zoom, lines, ignore): threshold = self.parameter['min_fraction'] method = self.parameter['method'] From 0dd6fbac1a63965d241203cdc1dda85ca1fa4728 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Fri, 16 Aug 2024 11:04:23 +0200 Subject: [PATCH 48/97] ocropy_segment: implement process_page_pcgts --- ocrd_cis/ocropy/segment.py | 314 +++++++++++++++++++++++++++---------- 1 file changed, 229 insertions(+), 85 deletions(-) diff --git a/ocrd_cis/ocropy/segment.py b/ocrd_cis/ocropy/segment.py index 57368fe8..d2a7a727 100644 --- a/ocrd_cis/ocropy/segment.py +++ b/ocrd_cis/ocropy/segment.py @@ -1,6 +1,7 @@ from __future__ import absolute_import from logging import Logger from os.path import join +from typing import Optional import itertools import numpy as np from scipy.sparse.csgraph import minimum_spanning_tree @@ -16,6 +17,7 @@ from ocrd_modelfactory import page_from_file from ocrd_models.ocrd_page import ( to_xml, CoordsType, + OcrdPage, TextLineType, TextRegionType, SeparatorRegionType, @@ -35,6 +37,7 @@ ReadingOrderType ) from ocrd import Processor +from ocrd.processor import OcrdPageResult from ocrd_utils import ( getLogger, make_file_id, @@ -252,6 +255,168 @@ def executable(self): def setup(self): self.logger = getLogger('processor.OcropySegment') + def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional[str] = None) -> OcrdPageResult: + overwrite_lines = self.parameter['overwrite_lines'] + overwrite_regions = self.parameter['overwrite_regions'] + overwrite_separators = self.parameter['overwrite_separators'] + overwrite_order = self.parameter['overwrite_order'] + oplevel = self.parameter['level-of-operation'] + + pcgts = input_pcgts[0] + page = pcgts.get_Page() + + # TODO: also allow grayscale_normalized (try/except?) 
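
Note: one possible reading of the grayscale_normalized TODO above, purely as a sketch of what such a fallback could look like (not what the patch implements, and assuming image_from_page raises when the feature_selector cannot be satisfied, in line with the 'abort if no such image can be produced' comments elsewhere in this series):

    try:
        page_image, page_coords, page_image_info = self.workspace.image_from_page(
            page, page_id, feature_selector='binarized')
    except Exception:
        # hypothetical fallback suggested by the TODO:
        page_image, page_coords, page_image_info = self.workspace.image_from_page(
            page, page_id, feature_selector='grayscale_normalized')
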
+ page_image, page_coords, page_image_info = self.workspace.image_from_page( + page, page_id, feature_selector='binarized') + zoom = determine_zoom(self.logger, page_id, self.parameter['dpi'], page_image_info) + + # aggregate existing regions so their foreground can be ignored + ignore = (page.get_ImageRegion() + + page.get_LineDrawingRegion() + + page.get_GraphicRegion() + + page.get_ChartRegion() + + page.get_MapRegion() + + page.get_MathsRegion() + + page.get_ChemRegion() + + page.get_MusicRegion() + + page.get_AdvertRegion() + + page.get_NoiseRegion() + + page.get_UnknownRegion() + + page.get_CustomRegion()) + if oplevel == 'page' and overwrite_separators: + page.set_SeparatorRegion([]) + else: + ignore.extend(page.get_SeparatorRegion()) + # prepare reading order + reading_order = dict() + ro = page.get_ReadingOrder() + if ro: + rogroup = ro.get_OrderedGroup() or ro.get_UnorderedGroup() + if rogroup: + page_get_reading_order(reading_order, rogroup) + # get segments to process / overwrite + if oplevel == 'page': + ignore.extend(page.get_TableRegion()) + regions = list(page.get_TextRegion()) + if regions: + # page is already region-segmented + if overwrite_regions: + self.logger.info(f'Removing existing TextRegions in page "{page_id}"') + # we could remove all other region types as well, + # but this is more flexible (for workflows with + # specialized separator/image/table detectors): + page.set_TextRegion([]) + page.set_ReadingOrder(None) + ro = None + else: + self.logger.warning(f'Keeping existing TextRegions in page "{page_id}"') + ignore.extend(regions) + # create reading order if necessary + if not ro or overwrite_order: + ro = ReadingOrderType() + page.set_ReadingOrder(ro) + rogroup = ro.get_OrderedGroup() or ro.get_UnorderedGroup() + if not rogroup: + # new top-level group + rogroup = OrderedGroupType(id="reading-order") + ro.set_OrderedGroup(rogroup) + # go get TextRegions with TextLines (and SeparatorRegions): + self._process_element( + page, ignore, page_image, page_coords, page_id, file_id, page_id, zoom, rogroup=rogroup) + if (not rogroup.get_RegionRefIndexed() and + not rogroup.get_OrderedGroupIndexed() and + not rogroup.get_UnorderedGroupIndexed()): + # schema forbids empty OrderedGroup + ro.set_OrderedGroup(None) + elif oplevel == 'table': + ignore.extend(page.get_TextRegion()) + regions = list(page.get_TableRegion()) + if not regions: + self.logger.warning(f'Page "{page_id}" contains no table regions') + for region in regions: + subregions = region.get_TextRegion() + if subregions: + # table is already cell-segmented + if overwrite_regions: + self.logger.info(f'Removing existing TextRegions in table "{region.id}"') + region.set_TextRegion([]) + roelem = reading_order.get(region.id) + # replace by empty group with same index and ref + # (which can then take the cells as subregions) + reading_order[region.id] = page_subgroup_in_reading_order(self.logger, roelem) + else: + self.logger.warning(f'Skipping table "{region.id}" with existing TextRegions') + continue + # TODO: also allow grayscale_normalized (try/except?) 
+ region_image, region_coords = self.workspace.image_from_segment( + region, page_image, page_coords, feature_selector='binarized') + # ignore everything but the current table region + subignore = regions + ignore + subignore.remove(region) + # create reading order group if necessary + roelem = reading_order.get(region.id) + if not roelem: + self.logger.warning(f"Page '{page_id}' table region '{region.id}' is not referenced in reading " + f"order (no target to add cells to)") + elif overwrite_order: + # replace by empty ordered group with same (index and) ref + # (which can then take the cells as subregions) + roelem = page_subgroup_in_reading_order(self.logger, roelem) + reading_order[region.id] = roelem + elif isinstance(roelem, (OrderedGroupType, OrderedGroupIndexedType)): + self.logger.warning(f"Page '{page_id}' table region '{region.id}' already has an ordered " + f"group (cells will be appended)") + elif isinstance(roelem, (UnorderedGroupType, UnorderedGroupIndexedType)): + self.logger.warning(f"Page '{page_id}' table region '{region.id}' already has an unordered " + f"group (cells will not be appended)") + roelem = None + else: + # replace regionRef(Indexed) by group with same index and ref + # (which can then take the cells as subregions) + roelem = page_subgroup_in_reading_order(self.logger, roelem) + reading_order[region.id] = roelem + # go get TextRegions with TextLines (and SeparatorRegions) + self._process_element( + region, subignore, region_image, region_coords, region.id, file_id + '_' + region.id, + page_id, zoom, rogroup=roelem) + else: # 'region' + regions = list(page.get_TextRegion()) + # besides top-level text regions, line-segment any table cells, + # and for tables without any cells, add a pseudo-cell + for region in page.get_TableRegion(): + subregions = region.get_TextRegion() + if subregions: + regions.extend(subregions) + else: + subregion = TextRegionType( + id=region.id + '_text', + Coords=region.get_Coords(), + # as if generated from parser: + parent_object_=region) + region.add_TextRegion(subregion) + regions.append(subregion) + if not regions: + self.logger.warning('Page "%s" contains no text regions', page_id) + for region in regions: + if region.get_TextLine(): + if overwrite_lines: + self.logger.info(f'Removing existing TextLines in page "{page_id}" region "{region.id}"') + region.set_TextLine([]) + else: + self.logger.warning(f'Keeping existing TextLines in page "{page_id}" region "{region.id}"') + ignore.extend(region.get_TextLine()) + # TODO: also allow grayscale_normalized (try/except?) + region_image, region_coords = self.workspace.image_from_segment( + region, page_image, page_coords, feature_selector='binarized') + # if the region images have already been clipped against their neighbours specifically, + # then we don't need to suppress all neighbours' foreground generally here + if 'clipped' in region_coords['features'].split(','): + ignore = [] + # go get TextLines + self._process_element( + region, ignore, region_image, region_coords, region.id, file_id + '_' + region.id, page_id, zoom) + return OcrdPageResult(pcgts) + def process(self): """Segment pages into regions+lines, tables into cells+lines, or regions into lines. @@ -335,7 +500,7 @@ def process(self): self.add_metadata(pcgts) page_id = pcgts.pcGtsId or input_file.pageId or input_file.ID # (PageType has no id) page = pcgts.get_Page() - + # TODO: also allow grayscale_normalized (try/except?) 
page_image, page_coords, page_image_info = self.workspace.image_from_page( page, page_id, feature_selector='binarized') @@ -521,15 +686,14 @@ def _process_element(self, element, ignore, image, coords, element_id, file_id, newly detected separators to guide region segmentation. """ if not image.width or not image.height: - self.logger.warning("Skipping '%s' with zero size", element_id) + self.logger.warning(f"Skipping '{element_id}' with zero size") return element_array = pil2array(image) element_bin = np.array(element_array <= midrange(element_array), bool) sep_bin = np.zeros_like(element_bin, bool) ignore_labels = np.zeros_like(element_bin, int) for i, segment in enumerate(ignore): - self.logger.debug('masking foreground of %s "%s" for "%s"', - type(segment).__name__[:-4], segment.id, element_id) + self.logger.debug(f'Masking foreground of {type(segment).__name__[:-4]} "{segment.id}" for "{element_id}"') # mark these segments (e.g. separator regions, tables, images) # for workflows where they have been detected already; # these will be: @@ -540,13 +704,11 @@ def _process_element(self, element, ignore, image, coords, element_id, file_id, # negative/above-max indices), either fully or partially, # then this will silently ignore them. The caller does # not need to concern herself with this. + sp_row = segment_polygon[:, 1] + sp_column = segment_polygon[:, 0] if isinstance(segment, SeparatorRegionType): - sep_bin[draw.polygon(segment_polygon[:, 1], - segment_polygon[:, 0], - sep_bin.shape)] = True - ignore_labels[draw.polygon(segment_polygon[:, 1], - segment_polygon[:, 0], - ignore_labels.shape)] = i+1 # mapped back for RO + sep_bin[draw.polygon(sp_row, sp_column, sep_bin.shape)] = True + ignore_labels[draw.polygon(sp_row, sp_column, ignore_labels.shape)] = i + 1 # mapped back for RO if isinstance(element, PageType): element_name = 'page' fullpage = True @@ -562,7 +724,7 @@ def _process_element(self, element, ignore, image, coords, element_id, file_id, element_name = 'region' fullpage = False report = check_region(element_bin, zoom) - self.logger.info('computing line segmentation for %s "%s"', element_name, element_id) + self.logger.info(f'Computing line segmentation for {element_name} "{element_id}"') # TODO: we should downscale if DPI is large enough to save time try: if report: @@ -570,9 +732,9 @@ def _process_element(self, element, ignore, image, coords, element_id, file_id, line_labels, baselines, seplines, images, colseps, scale = compute_segmentation( # suppress separators and ignored regions for textline estimation # but keep them for h/v-line detection (in fullpage mode): - element_bin, seps=(sep_bin+ignore_labels)>0, + element_bin, seps=(sep_bin + ignore_labels) > 0, zoom=zoom, fullpage=fullpage, - spread_dist=round(self.parameter['spread']/zoom*300/72), # in pt + spread_dist=round(self.parameter['spread'] / zoom * 300 / 72), # in pt # these are ignored when not in fullpage mode: maxcolseps=self.parameter['maxcolseps'], maxseps=self.parameter['maxseps'], @@ -580,16 +742,13 @@ def _process_element(self, element, ignore, image, coords, element_id, file_id, csminheight=self.parameter['csminheight']) except Exception as err: if isinstance(element, TextRegionType): - self.logger.error('Cannot line-segment region "%s": %s', element_id, err) + self.logger.error(f'Cannot line-segment region "{element_id}": {err}') # as a fallback, add a single text line comprising the whole region: element.add_TextLine(TextLineType(id=element_id + "_line", Coords=element.get_Coords())) else: - 
self.logger.error('Cannot line-segment %s "%s": %s', element_name, element_id, err) + self.logger.error(f'Cannot line-segment {element_name} "{element_id}": {err}') return - - self.logger.info('Found %d text lines for %s "%s"', - len(np.unique(line_labels)) - 1, - element_name, element_id) + self.logger.info(f'Found {len(np.unique(line_labels)) - 1} text lines for {element_name} "{element_id}"') # post-process line labels if isinstance(element, (PageType, TableRegionType)): # aggregate text lines to text regions @@ -598,31 +757,28 @@ def _process_element(self, element, ignore, image, coords, element_id, file_id, # i.e. identical line and region labels # to detect their reading order among the others # (these cannot be split or grouped together with other regions) - line_labels = np.where(line_labels, line_labels+len(ignore), ignore_labels) + line_labels = np.where(line_labels, line_labels + len(ignore), ignore_labels) # suppress separators/images in fg and try to use for partitioning slices sepmask = np.maximum(sep_bin, np.maximum(seplines > 0, images > 0)) region_labels = lines2regions( element_bin, line_labels, rlabels=ignore_labels, - sepmask=np.maximum(sepmask, colseps), # add bg + sepmask=np.maximum(sepmask, colseps), # add bg # decide horizontal vs vertical cut when gaps of similar size prefer_vertical=not isinstance(element, TableRegionType), gap_height=self.parameter['gap_height'], gap_width=self.parameter['gap_width'], scale=scale, zoom=zoom) - self.logger.info('Found %d text regions for %s "%s"', - len(np.unique(region_labels)) - 1, - element_name, element_id) + self.logger.info( + f'Found {len(np.unique(region_labels)) - 1} text regions for {element_name} "{element_id}"') except Exception as err: - self.logger.error('Cannot region-segment %s "%s": %s', - element_name, element_id, err) + self.logger.error(f'Cannot region-segment {element_name} "{element_id}": {err}') region_labels = np.where(line_labels > len(ignore), 1 + len(ignore), line_labels) - # prepare reading order group index if rogroup: if isinstance(rogroup, (OrderedGroupType, OrderedGroupIndexedType)): index = 0 - # start counting from largest existing index + # start counting from the largest existing index for elem in (rogroup.get_RegionRefIndexed() + rogroup.get_OrderedGroupIndexed() + rogroup.get_UnorderedGroupIndexed()): @@ -634,7 +790,7 @@ def _process_element(self, element, ignore, image, coords, element_id, file_id, region_no = 0 for region_label in np.unique(region_labels): if not region_label: - continue # no bg + continue # no bg region_mask = region_labels == region_label region_line_labels = line_labels * region_mask region_line_labels0 = np.setdiff1d(region_line_labels, [0]) @@ -643,13 +799,12 @@ def _process_element(self, element, ignore, image, coords, element_id, file_id, # (no new region, no actual text lines) region_line_labels0 = np.intersect1d(region_line_labels0, ignore_labels) assert len(region_line_labels0) == 1, \ - "region label %d has both existing regions and new lines (%s)" % ( - region_label, str(region_line_labels0)) + (f"Region label {region_label} has both existing regions and new lines " + f"({str(region_line_labels0)})") region = ignore[region_line_labels0[0] - 1] if rogroup and region.parent_object_ is element and not isinstance(region, SeparatorRegionType): index = page_add_to_reading_order(rogroup, region.id, index) - self.logger.debug('Region label %d is for ignored region "%s"', - region_label, region.id) + self.logger.debug(f'Region label {region_label} is for ignored region 
"{region.id}"') continue # normal case: new lines inside new regions # remove binary-empty labels, and re-order locally @@ -657,18 +812,18 @@ def _process_element(self, element, ignore, image, coords, element_id, file_id, order[np.setdiff1d(region_line_labels0, element_bin * region_line_labels)] = 0 region_line_labels = order[region_line_labels] # avoid horizontal gaps - region_line_labels = hmerge_line_seeds(element_bin, region_line_labels, scale, - seps=np.maximum(sepmask, colseps)) + region_line_labels = hmerge_line_seeds( + element_bin, region_line_labels, scale, seps=np.maximum(sepmask, colseps)) region_mask |= region_line_labels > 0 # find contours for region (can be non-contiguous) - regions, _ = masks2polygons(self.logger, region_mask * region_label, None, element_bin, - '%s "%s"' % (element_name, element_id), - min_area=6000/zoom/zoom, - simplify=ignore_labels * ~(sep_bin)) + regions, _ = masks2polygons( + self.logger, region_mask * region_label, None, element_bin, + name=f'{element_name} "{element_id}"', min_area=6000 / zoom / zoom, + simplify=ignore_labels * ~(sep_bin)) # find contours for lines (can be non-contiguous) - lines, _ = masks2polygons(self.logger, region_line_labels, baselines, element_bin, - 'region "%s"' % element_id, - min_area=640/zoom/zoom) + lines, _ = masks2polygons( + self.logger, region_line_labels, baselines, element_bin, + name=f'region "{element_id}"', min_area=640 / zoom / zoom) # create new lines in new regions (allocating by intersection) line_polys = [Polygon(polygon) for _, polygon, _ in lines] for _, region_polygon, _ in regions: @@ -677,34 +832,31 @@ def _process_element(self, element, ignore, image, coords, element_id, file_id, region_polygon = coordinates_for_segment(region_polygon, image, coords) region_polygon = polygon_for_parent(region_polygon, element) if region_polygon is None: - self.logger.warning('Ignoring extant region contour for region label %d', region_label) + self.logger.warning(f'Ignoring extant region contour for region label {region_label}') continue # annotate result: region_no += 1 region_id = element_id + "_region%04d" % region_no - self.logger.debug('Region label %d becomes ID "%s"', region_label, region_id) - region = TextRegionType( - id=region_id, Coords=CoordsType( - points=points_from_polygon(region_polygon))) + self.logger.debug(f'Region label {region_label} becomes ID "{region_id}"') + region = TextRegionType(id=region_id, Coords=CoordsType(points=points_from_polygon(region_polygon))) # find out which line (contours) belong to which region (contours) line_no = 0 for i, line_poly in enumerate(line_polys): - if not region_poly.intersects(line_poly): # .contains + if not region_poly.intersects(line_poly): # .contains continue line_label, line_polygon, line_baseline = lines[i] # convert back to absolute (page) coordinates: line_polygon = coordinates_for_segment(line_polygon, image, coords) line_polygon = polygon_for_parent(line_polygon, region) if line_polygon is None: - self.logger.warning('Ignoring extant line contour for region label %d line label %d', - region_label, line_label) + self.logger.warning( + f'Ignoring extant line contour for region label {region_label} line label {line_label}') continue # annotate result: line_no += 1 line_id = region_id + "_line%04d" % line_no - self.logger.debug('Line label %d becomes ID "%s"', line_label, line_id) - line = TextLineType(id=line_id, - Coords=CoordsType(points=points_from_polygon(line_polygon))) + self.logger.debug(f'Line label {line_label} becomes ID "{line_id}"') + line 
= TextLineType(id=line_id, Coords=CoordsType(points=points_from_polygon(line_polygon))) if line_baseline: line_baseline = coordinates_for_segment(line_baseline, image, coords) line.set_Baseline(BaselineType(points=points_from_polygon(line_baseline))) @@ -712,95 +864,87 @@ def _process_element(self, element, ignore, image, coords, element_id, file_id, # if the region has received text lines, keep it if region.get_TextLine(): element.add_TextRegion(region) - self.logger.info('Added region "%s" with %d lines for %s "%s"', - region_id, line_no, element_name, element_id) + self.logger.info( + f'Added region "{region_id}" with {line_no} lines for {element_name} "{element_id}"') if rogroup: index = page_add_to_reading_order(rogroup, region.id, index) # add additional image/non-text regions from compute_segmentation # (e.g. drop-capitals or images) ... - self.logger.info('Found %d large image regions for %s "%s"', images.max(), element_name, element_id) + self.logger.info(f'Found {images.max()} large image regions for {element_name} "{element_id}"') # find contours around region labels (can be non-contiguous): - image_polygons, _ = masks2polygons(self.logger, images, None, element_bin, - '%s "%s"' % (element_name, element_id)) + image_polygons, _ = masks2polygons( + self.logger, images, None, element_bin, f'{element_name} "{element_id}"') for image_label, polygon, _ in image_polygons: # convert back to absolute (page) coordinates: region_polygon = coordinates_for_segment(polygon, image, coords) region_polygon = polygon_for_parent(region_polygon, element) if region_polygon is None: - self.logger.warning('Ignoring extant region contour for image label %d', image_label) + self.logger.warning(f'Ignoring extant region contour for image label {image_label}') continue region_no += 1 # annotate result: region_id = element_id + "_image%04d" % region_no element.add_ImageRegion(ImageRegionType( - id=region_id, Coords=CoordsType( - points=points_from_polygon(region_polygon)))) + id=region_id, Coords=CoordsType(points=points_from_polygon(region_polygon)))) # split detected separator labels into separator regions: - self.logger.info('Found %d separators for %s "%s"', seplines.max(), element_name, element_id) + self.logger.info(f'Found {seplines.max()} separators for {element_name} "{element_id}"') # find contours around region labels (can be non-contiguous): - sep_polygons, _ = masks2polygons(self.logger, seplines, None, element_bin, - '%s "%s"' % (element_name, element_id), - open_holes=True, reorder=False) + sep_polygons, _ = masks2polygons( + self.logger, seplines, None, element_bin, + name=f'{element_name} "{element_id}"', open_holes=True, reorder=False) for sep_label, polygon, _ in sep_polygons: # convert back to absolute (page) coordinates: region_polygon = coordinates_for_segment(polygon, image, coords) region_polygon = polygon_for_parent(region_polygon, element) if region_polygon is None: - self.logger.warning('Ignoring extant region contour for separator %d', sep_label) + self.logger.warning(f'Ignoring extant region contour for separator {sep_label}') continue # annotate result: region_no += 1 region_id = element_id + "_sep%04d" % region_no element.add_SeparatorRegion(SeparatorRegionType( - id=region_id, Coords=CoordsType( - points=points_from_polygon(region_polygon)))) + id=region_id, Coords=CoordsType(points=points_from_polygon(region_polygon)))) # annotate a text/image-separated image element_array[sepmask] = np.amax(element_array) # clip to white/bg image_clipped = array2pil(element_array) 
file_path = self.workspace.save_image_file( - image_clipped, file_id + '.IMG-CLIP', self.output_file_grp, - page_id=page_id) + image_clipped, file_id + '.IMG-CLIP', self.output_file_grp, page_id=page_id) element.add_AlternativeImage(AlternativeImageType( filename=file_path, comments=coords['features'] + ',clipped')) else: - # get mask from region polygon: + # get mask from region polygon: region_polygon = coordinates_of_segment(element, image, coords) region_mask = np.zeros_like(element_bin, bool) - region_mask[draw.polygon(region_polygon[:, 1], - region_polygon[:, 0], - region_mask.shape)] = True + region_mask[draw.polygon(region_polygon[:, 1], region_polygon[:, 0], region_mask.shape)] = True # ensure the new line labels do not extrude from the region: line_labels = line_labels * region_mask # find contours around labels (can be non-contiguous): - line_polygons, _ = masks2polygons(self.logger, line_labels, baselines, element_bin, - 'region "%s"' % element_id, - min_area=640/zoom/zoom) + line_polygons, _ = masks2polygons( + self.logger, line_labels, baselines, element_bin, + name=f'region "{element_id}"', min_area=640 / zoom / zoom) line_no = 0 for line_label, polygon, baseline in line_polygons: # convert back to absolute (page) coordinates: line_polygon = coordinates_for_segment(polygon, image, coords) line_polygon = polygon_for_parent(line_polygon, element) if line_polygon is None: - self.logger.warning('Ignoring extant line contour for line label %d', - line_label) + self.logger.warning(f'Ignoring extant line contour for line label {line_label}') continue # annotate result: line_no += 1 line_id = element_id + "_line%04d" % line_no - line = TextLineType(id=line_id, - Coords=CoordsType(points=points_from_polygon(line_polygon))) + line = TextLineType(id=line_id, Coords=CoordsType(points=points_from_polygon(line_polygon))) if baseline: line_baseline = coordinates_for_segment(baseline, image, coords) line.set_Baseline(BaselineType(points=points_from_polygon(line_baseline))) element.add_TextLine(line) if not sep_bin.any(): - return # no derived image + return # no derived image # annotate a text/image-separated image - element_array[sep_bin] = np.amax(element_array) # clip to white/bg + element_array[sep_bin] = np.amax(element_array) # clip to white/bg image_clipped = array2pil(element_array) file_path = self.workspace.save_image_file( - image_clipped, file_id + '.IMG-CLIP', self.output_file_grp, - page_id=page_id) + image_clipped, file_id + '.IMG-CLIP', self.output_file_grp, page_id=page_id) # update PAGE (reference the image file): element.add_AlternativeImage(AlternativeImageType( filename=file_path, comments=coords['features'] + ',clipped')) From ad5ac7c4ab7f2b52bf313563456feca0094761ce Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Fri, 16 Aug 2024 11:06:01 +0200 Subject: [PATCH 49/97] ocropy_segment: remove process --- ocrd_cis/ocropy/segment.py | 317 ++++++++----------------------------- 1 file changed, 67 insertions(+), 250 deletions(-) diff --git a/ocrd_cis/ocropy/segment.py b/ocrd_cis/ocropy/segment.py index d2a7a727..94b6ab1f 100644 --- a/ocrd_cis/ocropy/segment.py +++ b/ocrd_cis/ocropy/segment.py @@ -256,6 +256,73 @@ def setup(self): self.logger = getLogger('processor.OcropySegment') def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional[str] = None) -> OcrdPageResult: + """Segment pages into regions+lines, tables into cells+lines, or regions into lines. 
+ Open and deserialise PAGE input files and their respective images, + then iterate over the element hierarchy down to the requested level. + + Depending on ``level-of-operation``, consider existing segments: + - If ``overwrite_separators=True`` on ``page`` level, then + delete any SeparatorRegions. + - If ``overwrite_regions=True`` on ``page`` level, then + delete any top-level TextRegions (along with ReadingOrder). + - If ``overwrite_regions=True`` on ``table`` level, then + delete any TextRegions in TableRegions (along with their OrderGroup). + - If ``overwrite_lines=True`` on ``region`` level, then + delete any TextLines in TextRegions. + - If ``overwrite_order=True`` on ``page`` or ``table`` level, then + delete the reading order OrderedGroup entry corresponding + to the (page/table) segment. + + Next, get each element image according to the layout annotation (from + the alternative image of the page/region, or by cropping via coordinates + into the higher-level image) in binarized form, and represent it as an array + with non-text regions and (remaining) text neighbours suppressed. + + Then compute a text line segmentation for that array (as a label mask). + When ``level-of-operation`` is ``page`` or ``table``, this also entails + detecting + - up to ``maximages`` large foreground images, + - up to ``maxseps`` foreground line separators and + - up to ``maxcolseps`` background column separators + before text line segmentation itself, as well as aggregating text lines + to text regions afterwards. + + Text regions are detected via a hybrid variant recursive X-Y cut algorithm + (RXYC): RXYC partitions the binarized image in top-down manner by detecting + horizontal or vertical gaps. This implementation uses the bottom-up text line + segmentation to guide the search, and also uses both pre-existing and newly + detected separators to alternatively partition the respective boxes into + non-rectangular parts. + + During line segmentation, suppress the foreground of all previously annotated + regions (of any kind) and lines, except if just removed due to ``overwrite``. + During region aggregation however, combine the existing separators with the + new-found separators to guide the column search. + + All detected segments (both text line and text region) are sorted according + to their reading order (assuming a top-to-bottom, left-to-right ordering). + When ``level-of-operation`` is ``page``, prefer vertical (column-first) + succession of regions. When it is ``table``, prefer horizontal (row-first) + succession of cells. + + Then for each resulting segment label, convert its background mask into + polygon outlines by finding the outer contours consistent with the element's + polygon outline. Annotate the result by adding it as a new TextLine/TextRegion: + - If ``level-of-operation`` is ``region``, then append the new lines to the + parent region. + - If it is ``table``, then append the new lines to their respective regions, + and append the new regions to the parent table. + (Also, create an OrderedGroup for it as the parent's RegionRef.) + - If it is ``page``, then append the new lines to their respective regions, + and append the new regions to the page. + (Also, create an OrderedGroup for it in the ReadingOrder.) + + Produce a new output file by serialising the resulting hierarchy. 
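
Note: the docstring summarises the recursive X-Y cut idea only briefly. As a toy illustration of plain RXYC (the hybrid variant in lines2regions is guided by line seeds and separators and is considerably more involved), here is a self-contained sketch that recursively splits a binary page array at its widest empty gap; all names are illustrative, none of this is ocrd_cis code:

    import numpy as np

    def rxyc(binary, y0, y1, x0, x1, min_gap=10, boxes=None):
        """Recursively split binary[y0:y1, x0:x1] at the widest empty gap."""
        if boxes is None:
            boxes = []
        block = binary[y0:y1, x0:x1]
        for axis in (0, 1):                      # 0: horizontal cut, 1: vertical cut
            profile = block.sum(axis=1 - axis)   # foreground per row / per column
            empty = np.flatnonzero(profile == 0)
            if empty.size:
                # group consecutive empty rows/columns into runs, take the widest:
                runs = np.split(empty, np.flatnonzero(np.diff(empty) > 1) + 1)
                run = max(runs, key=len)
                if len(run) >= min_gap and run[0] > 0 and run[-1] < len(profile) - 1:
                    cut0, cut1 = int(run[0]), int(run[-1]) + 1
                    if axis == 0:
                        rxyc(binary, y0, y0 + cut0, x0, x1, min_gap, boxes)
                        rxyc(binary, y0 + cut1, y1, x0, x1, min_gap, boxes)
                    else:
                        rxyc(binary, y0, y1, x0, x0 + cut0, min_gap, boxes)
                        rxyc(binary, y0, y1, x0 + cut1, x1, min_gap, boxes)
                    return boxes
        # no admissible gap in either direction: this block is a leaf region
        boxes.append((y0, y1, x0, x1))
        return boxes

    # usage on a binary numpy array (foreground == 1):
    # boxes = rxyc(page_bin, 0, page_bin.shape[0], 0, page_bin.shape[1])
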
+ """ + # FIXME: allow passing a-priori info on reading order / textline order + # (and then pass on as ``bt`` and ``rl``; however, there may be a mixture + # of different scripts; also, vertical writing needs internal rotation + # because our line segmentation only works for horizontal writing) overwrite_lines = self.parameter['overwrite_lines'] overwrite_regions = self.parameter['overwrite_regions'] overwrite_separators = self.parameter['overwrite_separators'] @@ -417,256 +484,6 @@ def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional region, ignore, region_image, region_coords, region.id, file_id + '_' + region.id, page_id, zoom) return OcrdPageResult(pcgts) - def process(self): - """Segment pages into regions+lines, tables into cells+lines, or regions into lines. - - Open and deserialise PAGE input files and their respective images, - then iterate over the element hierarchy down to the requested level. - - Depending on ``level-of-operation``, consider existing segments: - - If ``overwrite_separators=True`` on ``page`` level, then - delete any SeparatorRegions. - - If ``overwrite_regions=True`` on ``page`` level, then - delete any top-level TextRegions (along with ReadingOrder). - - If ``overwrite_regions=True`` on ``table`` level, then - delete any TextRegions in TableRegions (along with their OrderGroup). - - If ``overwrite_lines=True`` on ``region`` level, then - delete any TextLines in TextRegions. - - If ``overwrite_order=True`` on ``page`` or ``table`` level, then - delete the reading order OrderedGroup entry corresponding - to the (page/table) segment. - - Next, get each element image according to the layout annotation (from - the alternative image of the page/region, or by cropping via coordinates - into the higher-level image) in binarized form, and represent it as an array - with non-text regions and (remaining) text neighbours suppressed. - - Then compute a text line segmentation for that array (as a label mask). - When ``level-of-operation`` is ``page`` or ``table``, this also entails - detecting - - up to ``maximages`` large foreground images, - - up to ``maxseps`` foreground line separators and - - up to ``maxcolseps`` background column separators - before text line segmentation itself, as well as aggregating text lines - to text regions afterwards. - - Text regions are detected via a hybrid variant recursive X-Y cut algorithm - (RXYC): RXYC partitions the binarized image in top-down manner by detecting - horizontal or vertical gaps. This implementation uses the bottom-up text line - segmentation to guide the search, and also uses both pre-existing and newly - detected separators to alternatively partition the respective boxes into - non-rectangular parts. - - During line segmentation, suppress the foreground of all previously annotated - regions (of any kind) and lines, except if just removed due to ``overwrite``. - During region aggregation however, combine the existing separators with the - new-found separators to guide the column search. - - All detected segments (both text line and text region) are sorted according - to their reading order (assuming a top-to-bottom, left-to-right ordering). - When ``level-of-operation`` is ``page``, prefer vertical (column-first) - succession of regions. When it is ``table``, prefer horizontal (row-first) - succession of cells. - - Then for each resulting segment label, convert its background mask into - polygon outlines by finding the outer contours consistent with the element's - polygon outline. 
Annotate the result by adding it as a new TextLine/TextRegion: - - If ``level-of-operation`` is ``region``, then append the new lines to the - parent region. - - If it is ``table``, then append the new lines to their respective regions, - and append the new regions to the parent table. - (Also, create an OrderedGroup for it as the parent's RegionRef.) - - If it is ``page``, then append the new lines to their respective regions, - and append the new regions to the page. - (Also, create an OrderedGroup for it in the ReadingOrder.) - - Produce a new output file by serialising the resulting hierarchy. - """ - # FIXME: allow passing a-priori info on reading order / textline order - # (and then pass on as ``bt`` and ``rl``; however, there may be a mixture - # of different scripts; also, vertical writing needs internal rotation - # because our line segmentation only works for horizontal writing) - overwrite_lines = self.parameter['overwrite_lines'] - overwrite_regions = self.parameter['overwrite_regions'] - overwrite_separators = self.parameter['overwrite_separators'] - overwrite_order = self.parameter['overwrite_order'] - oplevel = self.parameter['level-of-operation'] - - for (n, input_file) in enumerate(self.input_files): - self.logger.info("INPUT FILE %i / %s", n, input_file.pageId or input_file.ID) - file_id = make_file_id(input_file, self.output_file_grp) - - pcgts = page_from_file(self.workspace.download_file(input_file)) - self.add_metadata(pcgts) - page_id = pcgts.pcGtsId or input_file.pageId or input_file.ID # (PageType has no id) - page = pcgts.get_Page() - - # TODO: also allow grayscale_normalized (try/except?) - page_image, page_coords, page_image_info = self.workspace.image_from_page( - page, page_id, feature_selector='binarized') - zoom = determine_zoom(self.logger, page_id, self.parameter['dpi'], page_image_info) - - # aggregate existing regions so their foreground can be ignored - ignore = (page.get_ImageRegion() + - page.get_LineDrawingRegion() + - page.get_GraphicRegion() + - page.get_ChartRegion() + - page.get_MapRegion() + - page.get_MathsRegion() + - page.get_ChemRegion() + - page.get_MusicRegion() + - page.get_AdvertRegion() + - page.get_NoiseRegion() + - page.get_UnknownRegion() + - page.get_CustomRegion()) - if oplevel == 'page' and overwrite_separators: - page.set_SeparatorRegion([]) - else: - ignore.extend(page.get_SeparatorRegion()) - # prepare reading order - reading_order = dict() - ro = page.get_ReadingOrder() - if ro: - rogroup = ro.get_OrderedGroup() or ro.get_UnorderedGroup() - if rogroup: - page_get_reading_order(reading_order, rogroup) - - # get segments to process / overwrite - if oplevel == 'page': - ignore.extend(page.get_TableRegion()) - regions = list(page.get_TextRegion()) - if regions: - # page is already region-segmented - if overwrite_regions: - self.logger.info('removing existing TextRegions in page "%s"', page_id) - # we could remove all other region types as well, - # but this is more flexible (for workflows with - # specialized separator/image/table detectors): - page.set_TextRegion([]) - page.set_ReadingOrder(None) - ro = None - else: - self.logger.warning('keeping existing TextRegions in page "%s"', page_id) - ignore.extend(regions) - # create reading order if necessary - if not ro or overwrite_order: - ro = ReadingOrderType() - page.set_ReadingOrder(ro) - rogroup = ro.get_OrderedGroup() or ro.get_UnorderedGroup() - if not rogroup: - # new top-level group - rogroup = OrderedGroupType(id="reading-order") - ro.set_OrderedGroup(rogroup) - # go get 
TextRegions with TextLines (and SeparatorRegions): - self._process_element(page, ignore, page_image, page_coords, - page_id, file_id, - input_file.pageId, zoom, rogroup=rogroup) - if (not rogroup.get_RegionRefIndexed() and - not rogroup.get_OrderedGroupIndexed() and - not rogroup.get_UnorderedGroupIndexed()): - # schema forbids empty OrderedGroup - ro.set_OrderedGroup(None) - elif oplevel == 'table': - ignore.extend(page.get_TextRegion()) - regions = list(page.get_TableRegion()) - if not regions: - self.logger.warning('Page "%s" contains no table regions', page_id) - for region in regions: - subregions = region.get_TextRegion() - if subregions: - # table is already cell-segmented - if overwrite_regions: - self.logger.info('removing existing TextRegions in table "%s"', region.id) - region.set_TextRegion([]) - roelem = reading_order.get(region.id) - # replace by empty group with same index and ref - # (which can then take the cells as subregions) - reading_order[region.id] = page_subgroup_in_reading_order(self.logger, roelem) - else: - self.logger.warning('skipping table "%s" with existing TextRegions', region.id) - continue - # TODO: also allow grayscale_normalized (try/except?) - region_image, region_coords = self.workspace.image_from_segment( - region, page_image, page_coords, feature_selector='binarized') - # ignore everything but the current table region - subignore = regions + ignore - subignore.remove(region) - # create reading order group if necessary - roelem = reading_order.get(region.id) - if not roelem: - self.logger.warning("Page '%s' table region '%s' is not referenced in reading order (%s)", - page_id, region.id, "no target to add cells to") - elif overwrite_order: - # replace by empty ordered group with same (index and) ref - # (which can then take the cells as subregions) - roelem = page_subgroup_in_reading_order(self.logger, roelem) - reading_order[region.id] = roelem - elif isinstance(roelem, (OrderedGroupType, OrderedGroupIndexedType)): - self.logger.warning("Page '%s' table region '%s' already has an ordered group (%s)", - page_id, region.id, "cells will be appended") - elif isinstance(roelem, (UnorderedGroupType, UnorderedGroupIndexedType)): - self.logger.warning("Page '%s' table region '%s' already has an unordered group (%s)", - page_id, region.id, "cells will not be appended") - roelem = None - else: - # replace regionRef(Indexed) by group with same index and ref - # (which can then take the cells as subregions) - roelem = page_subgroup_in_reading_order(self.logger, roelem) - reading_order[region.id] = roelem - # go get TextRegions with TextLines (and SeparatorRegions) - self._process_element(region, subignore, region_image, region_coords, - region.id, file_id + '_' + region.id, - input_file.pageId, zoom, rogroup=roelem) - else: # 'region' - regions = list(page.get_TextRegion()) - # besides top-level text regions, line-segment any table cells, - # and for tables without any cells, add a pseudo-cell - for region in page.get_TableRegion(): - subregions = region.get_TextRegion() - if subregions: - regions.extend(subregions) - else: - subregion = TextRegionType(id=region.id + '_text', - Coords=region.get_Coords(), - # as if generated from parser: - parent_object_=region) - region.add_TextRegion(subregion) - regions.append(subregion) - if not regions: - self.logger.warning('Page "%s" contains no text regions', page_id) - for region in regions: - if region.get_TextLine(): - if overwrite_lines: - self.logger.info('removing existing TextLines in page "%s" region "%s"', 
page_id, region.id) - region.set_TextLine([]) - else: - self.logger.warning('keeping existing TextLines in page "%s" region "%s"', page_id, region.id) - ignore.extend(region.get_TextLine()) - # TODO: also allow grayscale_normalized (try/except?) - region_image, region_coords = self.workspace.image_from_segment( - region, page_image, page_coords, feature_selector='binarized') - # if the region images have already been clipped against their neighbours specifically, - # then we don't need to suppress all neighbours' foreground generally here - if 'clipped' in region_coords['features'].split(','): - ignore = [] - # go get TextLines - self._process_element(region, ignore, region_image, region_coords, - region.id, file_id + '_' + region.id, - input_file.pageId, zoom) - - # update METS (add the PAGE file): - file_path = join(self.output_file_grp, file_id + '.xml') - pcgts.set_pcGtsId(file_id) - out = self.workspace.add_file( - ID=file_id, - file_grp=self.output_file_grp, - pageId=input_file.pageId, - local_filename=file_path, - mimetype=MIMETYPE_PAGE, - content=to_xml(pcgts)) - self.logger.info('created file ID: %s, file_grp: %s, path: %s', - file_id, self.output_file_grp, out.local_filename) - def _process_element(self, element, ignore, image, coords, element_id, file_id, page_id, zoom=1.0, rogroup=None): """Add PAGE layout elements by segmenting an image. From 5d4007be9ec0e352520995302bd8b11e92e51aae Mon Sep 17 00:00:00 2001 From: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> Date: Fri, 16 Aug 2024 11:41:01 +0200 Subject: [PATCH 50/97] segment: adapt to final v3 API --- ocrd_cis/ocropy/segment.py | 252 +++++++++++++++++++------------------ 1 file changed, 133 insertions(+), 119 deletions(-) diff --git a/ocrd_cis/ocropy/segment.py b/ocrd_cis/ocropy/segment.py index 94b6ab1f..bdeb40dd 100644 --- a/ocrd_cis/ocropy/segment.py +++ b/ocrd_cis/ocropy/segment.py @@ -1,8 +1,10 @@ from __future__ import absolute_import + +from typing import Optional from logging import Logger from os.path import join -from typing import Optional import itertools + import numpy as np from scipy.sparse.csgraph import minimum_spanning_tree from skimage import draw @@ -14,15 +16,21 @@ from shapely.validation import explain_validity from shapely import set_precision -from ocrd_modelfactory import page_from_file +from ocrd_utils import ( + getLogger, + coordinates_of_segment, + coordinates_for_segment, + points_from_polygon, + polygon_from_points, +) from ocrd_models.ocrd_page import ( - to_xml, CoordsType, - OcrdPage, + CoordsType, TextLineType, TextRegionType, SeparatorRegionType, PageType, - AlternativeImageType + AlternativeImageType, + OcrdPage ) from ocrd_models.ocrd_page_generateds import ( BaselineType, @@ -37,16 +45,7 @@ ReadingOrderType ) from ocrd import Processor -from ocrd.processor import OcrdPageResult -from ocrd_utils import ( - getLogger, - make_file_id, - coordinates_of_segment, - coordinates_for_segment, - points_from_polygon, - polygon_from_points, - MIMETYPE_PAGE -) +from ocrd.processor import OcrdPageResult, OcrdPageResultImage from .ocrolib import midrange from .ocrolib import morph @@ -255,11 +254,12 @@ def executable(self): def setup(self): self.logger = getLogger('processor.OcropySegment') - def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional[str] = None) -> OcrdPageResult: + def process_page_pcgts(self, *input_pcgts : Optional[OcrdPage], page_id : Optional[str] = None) -> OcrdPageResult: """Segment pages into regions+lines, tables into cells+lines, or 
regions into lines. - Open and deserialise PAGE input files and their respective images, + + Open and deserialise PAGE input file and its respective images, then iterate over the element hierarchy down to the requested level. - + Depending on ``level-of-operation``, consider existing segments: - If ``overwrite_separators=True`` on ``page`` level, then delete any SeparatorRegions. @@ -272,12 +272,12 @@ def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional - If ``overwrite_order=True`` on ``page`` or ``table`` level, then delete the reading order OrderedGroup entry corresponding to the (page/table) segment. - + Next, get each element image according to the layout annotation (from the alternative image of the page/region, or by cropping via coordinates into the higher-level image) in binarized form, and represent it as an array with non-text regions and (remaining) text neighbours suppressed. - + Then compute a text line segmentation for that array (as a label mask). When ``level-of-operation`` is ``page`` or ``table``, this also entails detecting @@ -286,25 +286,25 @@ def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional - up to ``maxcolseps`` background column separators before text line segmentation itself, as well as aggregating text lines to text regions afterwards. - + Text regions are detected via a hybrid variant recursive X-Y cut algorithm (RXYC): RXYC partitions the binarized image in top-down manner by detecting horizontal or vertical gaps. This implementation uses the bottom-up text line segmentation to guide the search, and also uses both pre-existing and newly detected separators to alternatively partition the respective boxes into non-rectangular parts. - + During line segmentation, suppress the foreground of all previously annotated regions (of any kind) and lines, except if just removed due to ``overwrite``. During region aggregation however, combine the existing separators with the new-found separators to guide the column search. - + All detected segments (both text line and text region) are sorted according to their reading order (assuming a top-to-bottom, left-to-right ordering). When ``level-of-operation`` is ``page``, prefer vertical (column-first) succession of regions. When it is ``table``, prefer horizontal (row-first) succession of cells. - + Then for each resulting segment label, convert its background mask into polygon outlines by finding the outer contours consistent with the element's polygon outline. Annotate the result by adding it as a new TextLine/TextRegion: @@ -316,7 +316,7 @@ def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional - If it is ``page``, then append the new lines to their respective regions, and append the new regions to the page. (Also, create an OrderedGroup for it in the ReadingOrder.) - + Produce a new output file by serialising the resulting hierarchy. """ # FIXME: allow passing a-priori info on reading order / textline order @@ -330,6 +330,7 @@ def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional oplevel = self.parameter['level-of-operation'] pcgts = input_pcgts[0] + result = OcrdPageResult(pcgts) page = pcgts.get_Page() # TODO: also allow grayscale_normalized (try/except?) 
@@ -361,14 +362,15 @@ def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional rogroup = ro.get_OrderedGroup() or ro.get_UnorderedGroup() if rogroup: page_get_reading_order(reading_order, rogroup) - # get segments to process / overwrite + + # get segments to process / overwrite if oplevel == 'page': ignore.extend(page.get_TableRegion()) regions = list(page.get_TextRegion()) if regions: # page is already region-segmented if overwrite_regions: - self.logger.info(f'Removing existing TextRegions in page "{page_id}"') + self.logger.info('removing existing TextRegions in page "%s"', page_id) # we could remove all other region types as well, # but this is more flexible (for workflows with # specialized separator/image/table detectors): @@ -376,7 +378,7 @@ def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional page.set_ReadingOrder(None) ro = None else: - self.logger.warning(f'Keeping existing TextRegions in page "{page_id}"') + self.logger.warning('keeping existing TextRegions in page "%s"', page_id) ignore.extend(regions) # create reading order if necessary if not ro or overwrite_order: @@ -387,32 +389,36 @@ def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional # new top-level group rogroup = OrderedGroupType(id="reading-order") ro.set_OrderedGroup(rogroup) - # go get TextRegions with TextLines (and SeparatorRegions): - self._process_element( - page, ignore, page_image, page_coords, page_id, file_id, page_id, zoom, rogroup=rogroup) if (not rogroup.get_RegionRefIndexed() and - not rogroup.get_OrderedGroupIndexed() and - not rogroup.get_UnorderedGroupIndexed()): - # schema forbids empty OrderedGroup + not rogroup.get_OrderedGroupIndexed() and + not rogroup.get_UnorderedGroupIndexed()): + # schema forbids empty OrderedGroup ro.set_OrderedGroup(None) - elif oplevel == 'table': + # go get TextRegions with TextLines (and SeparatorRegions): + image = self._process_element(page, ignore, page_image, page_coords, + zoom=zoom, rogroup=rogroup) + if image: + result.images.append(image) + return result + + if oplevel == 'table': ignore.extend(page.get_TextRegion()) regions = list(page.get_TableRegion()) if not regions: - self.logger.warning(f'Page "{page_id}" contains no table regions') + self.logger.warning('Page "%s" contains no table regions', page_id) for region in regions: subregions = region.get_TextRegion() if subregions: # table is already cell-segmented if overwrite_regions: - self.logger.info(f'Removing existing TextRegions in table "{region.id}"') + self.logger.info('removing existing TextRegions in table "%s"', region.id) region.set_TextRegion([]) roelem = reading_order.get(region.id) # replace by empty group with same index and ref # (which can then take the cells as subregions) reading_order[region.id] = page_subgroup_in_reading_order(self.logger, roelem) else: - self.logger.warning(f'Skipping table "{region.id}" with existing TextRegions') + self.logger.warning('skipping table "%s" with existing TextRegions', region.id) continue # TODO: also allow grayscale_normalized (try/except?) 
region_image, region_coords = self.workspace.image_from_segment( @@ -423,19 +429,19 @@ def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional # create reading order group if necessary roelem = reading_order.get(region.id) if not roelem: - self.logger.warning(f"Page '{page_id}' table region '{region.id}' is not referenced in reading " - f"order (no target to add cells to)") + self.logger.warning("Page '%s' table region '%s' is not referenced in reading order (%s)", + page_id, region.id, "no target to add cells to") elif overwrite_order: # replace by empty ordered group with same (index and) ref # (which can then take the cells as subregions) roelem = page_subgroup_in_reading_order(self.logger, roelem) reading_order[region.id] = roelem elif isinstance(roelem, (OrderedGroupType, OrderedGroupIndexedType)): - self.logger.warning(f"Page '{page_id}' table region '{region.id}' already has an ordered " - f"group (cells will be appended)") + self.logger.warning("Page '%s' table region '%s' already has an ordered group (%s)", + page_id, region.id, "cells will be appended") elif isinstance(roelem, (UnorderedGroupType, UnorderedGroupIndexedType)): - self.logger.warning(f"Page '{page_id}' table region '{region.id}' already has an unordered " - f"group (cells will not be appended)") + self.logger.warning("Page '%s' table region '%s' already has an unordered group (%s)", + page_id, region.id, "cells will not be appended") roelem = None else: # replace regionRef(Indexed) by group with same index and ref @@ -443,10 +449,11 @@ def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional roelem = page_subgroup_in_reading_order(self.logger, roelem) reading_order[region.id] = roelem # go get TextRegions with TextLines (and SeparatorRegions) - self._process_element( - region, subignore, region_image, region_coords, region.id, file_id + '_' + region.id, - page_id, zoom, rogroup=roelem) - else: # 'region' + image = self._process_element(region, subignore, region_image, region_coords, + zoom=zoom, rogroup=roelem) + if image: + result.images.append(image) + else: # 'region' regions = list(page.get_TextRegion()) # besides top-level text regions, line-segment any table cells, # and for tables without any cells, add a pseudo-cell @@ -455,11 +462,10 @@ def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional if subregions: regions.extend(subregions) else: - subregion = TextRegionType( - id=region.id + '_text', - Coords=region.get_Coords(), - # as if generated from parser: - parent_object_=region) + subregion = TextRegionType(id=region.id + '_text', + Coords=region.get_Coords(), + # as if generated from parser: + parent_object_=region) region.add_TextRegion(subregion) regions.append(subregion) if not regions: @@ -467,10 +473,10 @@ def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional for region in regions: if region.get_TextLine(): if overwrite_lines: - self.logger.info(f'Removing existing TextLines in page "{page_id}" region "{region.id}"') + self.logger.info('removing existing TextLines in page "%s" region "%s"', page_id, region.id) region.set_TextLine([]) else: - self.logger.warning(f'Keeping existing TextLines in page "{page_id}" region "{region.id}"') + self.logger.warning('keeping existing TextLines in page "%s" region "%s"', page_id, region.id) ignore.extend(region.get_TextLine()) # TODO: also allow grayscale_normalized (try/except?) 
region_image, region_coords = self.workspace.image_from_segment( @@ -480,11 +486,13 @@ def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional if 'clipped' in region_coords['features'].split(','): ignore = [] # go get TextLines - self._process_element( - region, ignore, region_image, region_coords, region.id, file_id + '_' + region.id, page_id, zoom) - return OcrdPageResult(pcgts) + image = self._process_element(region, ignore, region_image, region_coords, zoom=zoom) + if image: + result.images.append(image) - def _process_element(self, element, ignore, image, coords, element_id, file_id, page_id, zoom=1.0, rogroup=None): + return result + + def _process_element(self, element, ignore, image, coords, zoom=1.0, rogroup=None) -> Optional[OcrdPageResultImage]: """Add PAGE layout elements by segmenting an image. Given a PageType, TableRegionType or TextRegionType ``element``, and @@ -503,14 +511,15 @@ def _process_element(self, element, ignore, image, coords, element_id, file_id, newly detected separators to guide region segmentation. """ if not image.width or not image.height: - self.logger.warning(f"Skipping '{element_id}' with zero size") - return + self.logger.warning(f"Skipping '{element.id}' with zero size") + return None element_array = pil2array(image) element_bin = np.array(element_array <= midrange(element_array), bool) sep_bin = np.zeros_like(element_bin, bool) ignore_labels = np.zeros_like(element_bin, int) for i, segment in enumerate(ignore): - self.logger.debug(f'Masking foreground of {type(segment).__name__[:-4]} "{segment.id}" for "{element_id}"') + self.logger.debug(f'masking foreground of {type(segment).__name__[:-4]} ' + f'"{segment.id}" for "{element.id}"') # mark these segments (e.g. separator regions, tables, images) # for workflows where they have been detected already; # these will be: @@ -522,14 +531,15 @@ def _process_element(self, element, ignore, image, coords, element_id, file_id, # then this will silently ignore them. The caller does # not need to concern herself with this. sp_row = segment_polygon[:, 1] - sp_column = segment_polygon[:, 0] + sp_col = segment_polygon[:, 0] if isinstance(segment, SeparatorRegionType): - sep_bin[draw.polygon(sp_row, sp_column, sep_bin.shape)] = True - ignore_labels[draw.polygon(sp_row, sp_column, ignore_labels.shape)] = i + 1 # mapped back for RO + sep_bin[draw.polygon(sp_row, sp_col, sep_bin.shape)] = True + ignore_labels[draw.polygon(sp_row, sp_col, ignore_labels.shape)] = i+1 # mapped back for RO if isinstance(element, PageType): element_name = 'page' fullpage = True report = check_page(element_bin, zoom) + suffix = '.IMG-CLIP' elif isinstance(element, TableRegionType) or ( # sole/congruent text region of a table region? 
element.id.endswith('_text') and @@ -537,11 +547,13 @@ def _process_element(self, element, ignore, image, coords, element_id, file_id, element_name = 'table' fullpage = True report = check_region(element_bin, zoom) + suffix = element.id + '.IMG-CLIP' else: element_name = 'region' fullpage = False report = check_region(element_bin, zoom) - self.logger.info(f'Computing line segmentation for {element_name} "{element_id}"') + suffix = element.id + '.IMG-CLIP' + self.logger.info(f'computing line segmentation for {element_name} "{element.id}"') # TODO: we should downscale if DPI is large enough to save time try: if report: @@ -551,7 +563,7 @@ def _process_element(self, element, ignore, image, coords, element_id, file_id, # but keep them for h/v-line detection (in fullpage mode): element_bin, seps=(sep_bin + ignore_labels) > 0, zoom=zoom, fullpage=fullpage, - spread_dist=round(self.parameter['spread'] / zoom * 300 / 72), # in pt + spread_dist=round(self.parameter['spread'] / zoom * 300 / 72), # in pt # these are ignored when not in fullpage mode: maxcolseps=self.parameter['maxcolseps'], maxseps=self.parameter['maxseps'], @@ -559,13 +571,15 @@ def _process_element(self, element, ignore, image, coords, element_id, file_id, csminheight=self.parameter['csminheight']) except Exception as err: if isinstance(element, TextRegionType): - self.logger.error(f'Cannot line-segment region "{element_id}": {err}') + self.logger.error(f'Cannot line-segment region "{element.id}": {err}') # as a fallback, add a single text line comprising the whole region: - element.add_TextLine(TextLineType(id=element_id + "_line", Coords=element.get_Coords())) + element.add_TextLine(TextLineType(id=element.id + "_line", Coords=element.get_Coords())) else: - self.logger.error(f'Cannot line-segment {element_name} "{element_id}": {err}') - return - self.logger.info(f'Found {len(np.unique(line_labels)) - 1} text lines for {element_name} "{element_id}"') + self.logger.error(f'Cannot line-segment {element_name} "{element.id}": {err}') + return None + + self.logger.info(f'Found {len(np.unique(line_labels)) - 1} text lines ' + f'for {element_name} "{element.id}"') # post-process line labels if isinstance(element, (PageType, TableRegionType)): # aggregate text lines to text regions @@ -580,17 +594,18 @@ def _process_element(self, element, ignore, image, coords, element_id, file_id, region_labels = lines2regions( element_bin, line_labels, rlabels=ignore_labels, - sepmask=np.maximum(sepmask, colseps), # add bg + sepmask=np.maximum(sepmask, colseps), # add bg # decide horizontal vs vertical cut when gaps of similar size prefer_vertical=not isinstance(element, TableRegionType), gap_height=self.parameter['gap_height'], gap_width=self.parameter['gap_width'], scale=scale, zoom=zoom) - self.logger.info( - f'Found {len(np.unique(region_labels)) - 1} text regions for {element_name} "{element_id}"') + self.logger.info(f'Found {len(np.unique(region_labels)) - 1} text regions ' + f'for {element_name} "{element.id}"') except Exception as err: - self.logger.error(f'Cannot region-segment {element_name} "{element_id}": {err}') + self.logger.error(f'Cannot region-segment {element_name} "{element.id}": {err}') region_labels = np.where(line_labels > len(ignore), 1 + len(ignore), line_labels) + # prepare reading order group index if rogroup: if isinstance(rogroup, (OrderedGroupType, OrderedGroupIndexedType)): @@ -607,7 +622,7 @@ def _process_element(self, element, ignore, image, coords, element_id, file_id, region_no = 0 for region_label in 
np.unique(region_labels): if not region_label: - continue # no bg + continue # no bg region_mask = region_labels == region_label region_line_labels = line_labels * region_mask region_line_labels0 = np.setdiff1d(region_line_labels, [0]) @@ -616,12 +631,12 @@ def _process_element(self, element, ignore, image, coords, element_id, file_id, # (no new region, no actual text lines) region_line_labels0 = np.intersect1d(region_line_labels0, ignore_labels) assert len(region_line_labels0) == 1, \ - (f"Region label {region_label} has both existing regions and new lines " - f"({str(region_line_labels0)})") + (f'region label "{region_label}" has both existing regions and new lines ' + f'({str(region_line_labels0)})') region = ignore[region_line_labels0[0] - 1] if rogroup and region.parent_object_ is element and not isinstance(region, SeparatorRegionType): index = page_add_to_reading_order(rogroup, region.id, index) - self.logger.debug(f'Region label {region_label} is for ignored region "{region.id}"') + self.logger.debug(f'Region label "{region_label}" is for ignored region "{region.id}"') continue # normal case: new lines inside new regions # remove binary-empty labels, and re-order locally @@ -629,18 +644,18 @@ def _process_element(self, element, ignore, image, coords, element_id, file_id, order[np.setdiff1d(region_line_labels0, element_bin * region_line_labels)] = 0 region_line_labels = order[region_line_labels] # avoid horizontal gaps - region_line_labels = hmerge_line_seeds( - element_bin, region_line_labels, scale, seps=np.maximum(sepmask, colseps)) + region_line_labels = hmerge_line_seeds(element_bin, region_line_labels, scale, + seps=np.maximum(sepmask, colseps)) region_mask |= region_line_labels > 0 # find contours for region (can be non-contiguous) - regions, _ = masks2polygons( - self.logger, region_mask * region_label, None, element_bin, - name=f'{element_name} "{element_id}"', min_area=6000 / zoom / zoom, - simplify=ignore_labels * ~(sep_bin)) + regions, _ = masks2polygons(self.logger, region_mask * region_label, None, element_bin, + name=f'{element_name} "{element.id}"', + min_area=6000 / zoom / zoom, + simplify=ignore_labels * ~(sep_bin)) # find contours for lines (can be non-contiguous) - lines, _ = masks2polygons( - self.logger, region_line_labels, baselines, element_bin, - name=f'region "{element_id}"', min_area=640 / zoom / zoom) + lines, _ = masks2polygons(self.logger, region_line_labels, baselines, element_bin, + name=f'region "{element.id}"', + min_area=640 / zoom / zoom) # create new lines in new regions (allocating by intersection) line_polys = [Polygon(polygon) for _, polygon, _ in lines] for _, region_polygon, _ in regions: @@ -653,13 +668,13 @@ def _process_element(self, element, ignore, image, coords, element_id, file_id, continue # annotate result: region_no += 1 - region_id = element_id + "_region%04d" % region_no + region_id = element.id + "_region%04d" % region_no self.logger.debug(f'Region label {region_label} becomes ID "{region_id}"') region = TextRegionType(id=region_id, Coords=CoordsType(points=points_from_polygon(region_polygon))) # find out which line (contours) belong to which region (contours) line_no = 0 for i, line_poly in enumerate(line_polys): - if not region_poly.intersects(line_poly): # .contains + if not region_poly.intersects(line_poly): # .contains continue line_label, line_polygon, line_baseline = lines[i] # convert back to absolute (page) coordinates: @@ -681,16 +696,16 @@ def _process_element(self, element, ignore, image, coords, element_id, file_id, 
# if the region has received text lines, keep it if region.get_TextLine(): element.add_TextRegion(region) - self.logger.info( - f'Added region "{region_id}" with {line_no} lines for {element_name} "{element_id}"') + self.logger.info(f'Added region "{region_id}" with {line_no} lines ' + f'for {element_name} "{element.id}"') if rogroup: index = page_add_to_reading_order(rogroup, region.id, index) # add additional image/non-text regions from compute_segmentation # (e.g. drop-capitals or images) ... - self.logger.info(f'Found {images.max()} large image regions for {element_name} "{element_id}"') + self.logger.info(f'Found {images.max()} large image regions for {element_name} "{element.id}"') # find contours around region labels (can be non-contiguous): - image_polygons, _ = masks2polygons( - self.logger, images, None, element_bin, f'{element_name} "{element_id}"') + image_polygons, _ = masks2polygons(self.logger, images, None, element_bin, + name=f'{element_name} "{element.id}"') for image_label, polygon, _ in image_polygons: # convert back to absolute (page) coordinates: region_polygon = coordinates_for_segment(polygon, image, coords) @@ -700,15 +715,15 @@ def _process_element(self, element, ignore, image, coords, element_id, file_id, continue region_no += 1 # annotate result: - region_id = element_id + "_image%04d" % region_no + region_id = element.id + "_image%04d" % region_no element.add_ImageRegion(ImageRegionType( id=region_id, Coords=CoordsType(points=points_from_polygon(region_polygon)))) # split detected separator labels into separator regions: - self.logger.info(f'Found {seplines.max()} separators for {element_name} "{element_id}"') + self.logger.info(f'Found {seplines.max()} separators for {element_name} "{element.id}"') # find contours around region labels (can be non-contiguous): - sep_polygons, _ = masks2polygons( - self.logger, seplines, None, element_bin, - name=f'{element_name} "{element_id}"', open_holes=True, reorder=False) + sep_polygons, _ = masks2polygons(self.logger, seplines, None, element_bin, + name=f'{element_name} "{element.id}"', + open_holes=True, reorder=False) for sep_label, polygon, _ in sep_polygons: # convert back to absolute (page) coordinates: region_polygon = coordinates_for_segment(polygon, image, coords) @@ -718,27 +733,28 @@ def _process_element(self, element, ignore, image, coords, element_id, file_id, continue # annotate result: region_no += 1 - region_id = element_id + "_sep%04d" % region_no + region_id = element.id + "_sep%04d" % region_no element.add_SeparatorRegion(SeparatorRegionType( id=region_id, Coords=CoordsType(points=points_from_polygon(region_polygon)))) # annotate a text/image-separated image element_array[sepmask] = np.amax(element_array) # clip to white/bg image_clipped = array2pil(element_array) - file_path = self.workspace.save_image_file( - image_clipped, file_id + '.IMG-CLIP', self.output_file_grp, page_id=page_id) - element.add_AlternativeImage(AlternativeImageType( - filename=file_path, comments=coords['features'] + ',clipped')) + image_ref = AlternativeImageType(comments=coords['features'] + ',clipped') + element.add_AlternativeImage(image_ref) + return OcrdPageResultImage(image_clipped, suffix, image_ref) else: - # get mask from region polygon: + # get mask from region polygon: region_polygon = coordinates_of_segment(element, image, coords) region_mask = np.zeros_like(element_bin, bool) - region_mask[draw.polygon(region_polygon[:, 1], region_polygon[:, 0], region_mask.shape)] = True + region_mask[draw.polygon(region_polygon[:, 
1], + region_polygon[:, 0], + region_mask.shape)] = True # ensure the new line labels do not extrude from the region: line_labels = line_labels * region_mask # find contours around labels (can be non-contiguous): - line_polygons, _ = masks2polygons( - self.logger, line_labels, baselines, element_bin, - name=f'region "{element_id}"', min_area=640 / zoom / zoom) + line_polygons, _ = masks2polygons(self.logger, line_labels, baselines, element_bin, + name=f'region "{element.id}"', + min_area=640 / zoom / zoom) line_no = 0 for line_label, polygon, baseline in line_polygons: # convert back to absolute (page) coordinates: @@ -749,22 +765,20 @@ def _process_element(self, element, ignore, image, coords, element_id, file_id, continue # annotate result: line_no += 1 - line_id = element_id + "_line%04d" % line_no + line_id = element.id + "_line%04d" % line_no line = TextLineType(id=line_id, Coords=CoordsType(points=points_from_polygon(line_polygon))) if baseline: line_baseline = coordinates_for_segment(baseline, image, coords) line.set_Baseline(BaselineType(points=points_from_polygon(line_baseline))) element.add_TextLine(line) if not sep_bin.any(): - return # no derived image + return None # no derived image # annotate a text/image-separated image - element_array[sep_bin] = np.amax(element_array) # clip to white/bg + element_array[sep_bin] = np.amax(element_array) # clip to white/bg image_clipped = array2pil(element_array) - file_path = self.workspace.save_image_file( - image_clipped, file_id + '.IMG-CLIP', self.output_file_grp, page_id=page_id) - # update PAGE (reference the image file): - element.add_AlternativeImage(AlternativeImageType( - filename=file_path, comments=coords['features'] + ',clipped')) + image_ref = AlternativeImageType(comments=coords['features'] + ',clipped') + element.add_AlternativeImage(image_ref) + return OcrdPageResultImage(image_clipped, suffix, image_ref) def polygon_for_parent(polygon, parent): """Clip polygon to parent polygon range. 
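For orientation between these patches: the "final v3 API" that segment.py here -- and train.py and binarize.py further below -- are being adapted to replaces the old per-file process() loop with a per-page process_page_pcgts() hook. The processor no longer downloads input files, saves derived images or adds METS entries itself; it returns an OcrdPageResult whose images list carries OcrdPageResultImage entries, and the OCR-D core then serialises the PAGE output and writes the image files. A minimal sketch of that calling convention, using only names visible in the hunks above plus a made-up processor and tool name for illustration:

from typing import Optional

from ocrd import Processor
from ocrd.processor import OcrdPageResult, OcrdPageResultImage
from ocrd_models.ocrd_page import AlternativeImageType, OcrdPage


class ExampleProcessor(Processor):

    @property
    def executable(self):
        # hypothetical tool name, for illustration only
        return 'ocrd-example-processor'

    def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage],
                           page_id: Optional[str] = None) -> OcrdPageResult:
        # one parsed PAGE document per input fileGrp; here a single input
        pcgts = input_pcgts[0]
        result = OcrdPageResult(pcgts)
        page = pcgts.get_Page()
        # page image plus coordinate metadata, resolved by the core
        page_image, page_coords, _ = self.workspace.image_from_page(page, page_id)
        # ... compute some new PIL image `new_image` from page_image here ...
        new_image = page_image
        # reference the derived image in PAGE and hand it back to the core
        # together with a file-name suffix, instead of calling
        # workspace.save_image_file() directly as the old process() did
        image_ref = AlternativeImageType(comments=page_coords['features'] + ',clipped')
        page.add_AlternativeImage(image_ref)
        result.images.append(OcrdPageResultImage(new_image, '.IMG-CLIP', image_ref))
        return result

That the core derives the output file name from the page file ID plus the given suffix and fills in the filename of the AlternativeImageType reference is an assumption here; the hunks above only show that the suffix and the reference are returned in the OcrdPageResult rather than being written out by the processor itself.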
From df1c35cbe1325a8da5dabd2c9227a7246439fd15 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> Date: Fri, 16 Aug 2024 14:42:57 +0200 Subject: [PATCH 51/97] train: adapt to final v3 API --- ocrd_cis/ocropy/train.py | 129 +++++++++++++++++++-------------------- 1 file changed, 64 insertions(+), 65 deletions(-) diff --git a/ocrd_cis/ocropy/train.py b/ocrd_cis/ocropy/train.py index 08b68693..5c57b2cf 100644 --- a/ocrd_cis/ocropy/train.py +++ b/ocrd_cis/ocropy/train.py @@ -1,12 +1,15 @@ from __future__ import absolute_import + +from typing import Optional from logging import Logger from sys import exit from os import getcwd, makedirs, remove from os.path import abspath, dirname, exists, join, isfile import tempfile -from ocrd_modelfactory import page_from_file -from ocrd import Processor +from ocrd_models import OcrdPage +from ocrd import Processor, Workspace +from ocrd.processor import OcrdPageResult from ocrd_utils import getLogger from .ocropus_rtrain import * @@ -37,80 +40,79 @@ def executable(self): def setup(self): self.logger = getLogger('processor.OcropyTrain') - self.old_cwd = getcwd() - #print(self.parameter) if 'model' in self.parameter: model = self.parameter['model'] try: - modelpath = self.resolve_resource(model) + self.modelpath = self.resolve_resource(model) except SystemExit: ocropydir = dirname(abspath(__file__)) - modelpath = join(ocropydir, 'models', model) - self.logger.info("Failed to resolve model '%s' path, trying '%s'", model, modelpath) + self.modelpath = join(ocropydir, 'models', model) + self.logger.error(f"Failed to resolve model '{model}' path, trying '{modelpath}'") if not isfile(modelpath): - self.logger.error("Could not find model '%s'. Try 'ocrd resmgr download ocrd-cis-ocropy-recognize %s'", - model, model) + self.logger.critical(f"Could not find model '{model}'.\n" + f"Try 'ocrd resmgr download ocrd-cis-ocropy-recognize {model}'") exit(1) - outputpath = join(self.old_cwd, 'output', model) - if 'outputpath' in self.parameter: - outputpath = join(self.parameter, model) + self.outputpath = join(self.parameter.get('outputpath', 'output'), model) else: - modelpath = None - outputpath = join(self.old_cwd, 'output', 'lstm') - if 'outputpath' in self.parameter: - outputpath = join(self.parameter, 'lstm') - makedirs(dirname(outputpath)) - self.modelpath = modelpath - self.outputpath = outputpath - - def process(self): + self.modelpath = None + self.outputpath = join(self.parameter.get('outputpath', 'output'), 'lstm') + makedirs(dirname(self.outputpath)) + self.filelist = None + + def process_workspace(self, workspace: Workspace) -> None: """ Trains a new model on the text lines from the input fileGrp, - extracted as temporary image-text file pairs. + extracted as image-text file pairs into the output fileGrp. + (If the output fileGrp already exists and these files should + be re-used, pass the `--overwrite` option when processing.) + + The model is written into `outputpath` (or just `output`) under + the same name as `model` (i.e. the start model, or just `lstm`). 
+ """ + self.filelist = [] + super().process_workspace(workspace) + self.logger.info(f"Training {self.outputpath} from {self.modelpath or 'scratch'} " + f"on {len(self.filelist)} file pairs") + rtrain(self.filelist, self.modelpath, self.outputpath, self.parameter['ntrain']) + # deletefiles(self.filelist) + + def process_page_pcgts(self, *input_pcgts : Optional[OcrdPage], page_id : Optional[str] = None) -> OcrdPageResult: + """ + Extracts pairs of plaintext and cropped image files for each text line + in the PAGE file (to be used during training). """ - filelist = [] - filepath = tempfile.mkdtemp(prefix='ocrd-cis-ocropy-train-') + pcgts = input_pcgts[0] #self.logger.info("Using model %s in %s for recognition", model) - for (n, input_file) in enumerate(self.input_files): - #self.logger.info("INPUT FILE %i / %s", n, input_file) - pcgts = page_from_file(self.workspace.download_file(input_file)) - page_id = pcgts.pcGtsId or input_file.pageId or input_file.ID # (PageType has no id) - page = pcgts.get_Page() - page_image, page_coords, _ = self.workspace.image_from_page(page, page_id) - - self.logger.info("Extracting from page '%s'", page_id) - for region in page.get_AllRegions(classes=['Text']): - textlines = region.get_TextLine() - self.logger.info("Extracting %i lines from region '%s'", len(textlines), region.id) - for line in textlines: - if self.parameter['textequiv_level'] == 'line': - path = join(filepath, page_id + region.id + line.id) - imgpath = self.extract_segment(path, line, page_image, page_coords) - if imgpath: - filelist.append(imgpath) + page = pcgts.get_Page() + page_image, page_coords, _ = self.workspace.image_from_page(page, page_id) + + self.logger.debug(f"Extracting from page '{page_id}'") + for region in page.get_AllRegions(classes=['Text']): + textlines = region.get_TextLine() + self.logger.debug(f"Extracting {len(textlines)} lines from region '{region.id}'") + for line in textlines: + if self.parameter['textequiv_level'] == 'line': + path = join(self.output_file_grp, f"{page_id}_{region.id}_{line.id}") + self.filelist.append(self.extract_segment(path, line, page_image, page_coords)) + continue + for word in line.get_Word(): + if self.parameter['textequiv_level'] == 'word': + path = join(self.output_file_grp, f"{page_id}_{region.id}_{line.id}_{word.id}") + self.filelist.append(self.extract_segment(path, word, page_image, page_coords)) continue - for word in line.get_Word(): - if self.parameter['textequiv_level'] == 'word': - path = join(filepath, page_id + region.id + line.id + word.id) - imgpath = self.extract_segment(path, word, page_image, page_coords) - if imgpath: - filelist.append(imgpath) - continue - for glyph in word.get_Glyph(): - path = join(filepath, page_id + region.id + line.id + glyph.id) - imgpath = self.extract_segment(path, glyph, page_image, page_coords) - if imgpath: - filelist.append(imgpath) - - self.logger.info("Training %s from %s on %i file pairs", - self.outputpath, - self.modelpath or 'scratch', - len(filelist)) - rtrain(filelist, self.modelpath, self.outputpath, self.parameter['ntrain']) - deletefiles(filelist) + for glyph in word.get_Glyph(): + path = join(self.output_file_grp, f"{page_id}_{region.id}_{line.id}_{word.id}_{glyph.id}") + self.filelist.append(self.extract_segment(path, glyph, page_image, page_coords)) + # FIXME: PAGE-XML not really needed, find a way around this (raising special exception?) 
+ return OcrdPageResult(pcgts) def extract_segment(self, path, segment, page_image, page_coords): - #ground truth + gtpath = path + '.gt.txt' + imgpath = path + '.png' + if exists(gtpath) and exists(imgpath): + self.logger.debug(f"Reusing {segment.__class__.__name__} '{segment.id}' file pair") + return imgpath + gt = segment.TextEquiv if not gt: return None @@ -118,11 +120,10 @@ def extract_segment(self, path, segment, page_image, page_coords): if not gt or not gt.strip(): return None gt = gt.strip() - gtpath = path + '.gt.txt' with open(gtpath, "w", encoding='utf-8') as f: f.write(gt) - self.logger.debug("Extracting %s '%s'", segment.__class__.__name__, segment.id) + self.logger.debug(f"Extracting {segment.__class__.__name__} '{segment.id}' file pair") image, coords = self.workspace.image_from_segment(segment, page_image, page_coords) if 'binarized' not in coords['features'].split(','): @@ -132,8 +133,6 @@ def extract_segment(self, path, segment, page_image, page_coords): # resize image to 48 pixel height image = resize_keep_ratio(image) - #save temp image - imgpath = path + '.png' image.save(imgpath) return imgpath From c08b623f9b0ad9daf4f8dc858b5b416b1212e018 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> Date: Fri, 16 Aug 2024 14:51:54 +0200 Subject: [PATCH 52/97] ocrd-tool.json: add v3 cardinalities --- ocrd_cis/ocrd-tool.json | 120 +++++++++++----------------------------- 1 file changed, 31 insertions(+), 89 deletions(-) diff --git a/ocrd_cis/ocrd-tool.json b/ocrd_cis/ocrd-tool.json index a93917da..c2e20268 100644 --- a/ocrd_cis/ocrd-tool.json +++ b/ocrd_cis/ocrd-tool.json @@ -12,17 +12,9 @@ "preprocessing/optimization/grayscale_normalization", "preprocessing/optimization/deskewing" ], - "input_file_grp": [ - "OCR-D-IMG", - "OCR-D-SEG-BLOCK", - "OCR-D-SEG-LINE" - ], - "output_file_grp": [ - "OCR-D-IMG-BIN", - "OCR-D-SEG-BLOCK", - "OCR-D-SEG-LINE" - ], - "description": "Binarize (and optionally deskew/despeckle) pages / regions / lines with ocropy", + "input_file_grp_cardinality": 1, + "output_file_grp_cardinality": 1, + "description": "Binarize (and optionally deskew/despeckle) pages / regions / lines with Ocropy v1", "parameters": { "method": { "type": "string", @@ -75,15 +67,9 @@ "steps": [ "preprocessing/optimization/deskewing" ], - "input_file_grp": [ - "OCR-D-SEG-BLOCK", - "OCR-D-SEG-LINE" - ], - "output_file_grp": [ - "OCR-D-SEG-BLOCK", - "OCR-D-SEG-LINE" - ], - "description": "Deskew regions with ocropy (by annotating orientation angle and adding AlternativeImage)", + "input_file_grp_cardinality": 1, + "output_file_grp_cardinality": 1, + "description": "Deskew regions with Ocropy v1 (by annotating orientation angle and adding AlternativeImage)", "parameters": { "maxskew": { "type": "number", @@ -106,17 +92,9 @@ "steps": [ "preprocessing/optimization/despeckling" ], - "input_file_grp": [ - "OCR-D-IMG", - "OCR-D-SEG-BLOCK", - "OCR-D-SEG-LINE" - ], - "output_file_grp": [ - "OCR-D-IMG-DESPECK", - "OCR-D-SEG-BLOCK", - "OCR-D-SEG-LINE" - ], - "description": "Despeckle pages / regions / lines with ocropy", + "input_file_grp_cardinality": 1, + "output_file_grp_cardinality": 1, + "description": "Despeckle pages / regions / lines with Ocropy v1", "parameters": { "noise_maxsize": { "type": "number", @@ -147,14 +125,8 @@ "layout/segmentation/region", "layout/segmentation/line" ], - "input_file_grp": [ - "OCR-D-SEG-BLOCK", - "OCR-D-SEG-LINE" - ], - "output_file_grp": [ - "OCR-D-SEG-BLOCK", - "OCR-D-SEG-LINE" - ], + "input_file_grp_cardinality": 
1, + "output_file_grp_cardinality": 1, "description": "Clip text regions / lines at intersections with neighbours", "parameters": { "level-of-operation": { @@ -185,12 +157,8 @@ "steps": [ "layout/segmentation/line" ], - "input_file_grp": [ - "OCR-D-SEG-LINE" - ], - "output_file_grp": [ - "OCR-D-SEG-LINE" - ], + "input_file_grp_cardinality": 1, + "output_file_grp_cardinality": 1, "description": "Improve coordinates of text lines", "parameters": { "level-of-operation": { @@ -245,12 +213,8 @@ "preprocessing/optimization/dewarping" ], "description": "Dewarp line images with ocropy", - "input_file_grp": [ - "OCR-D-SEG-LINE" - ], - "output_file_grp": [ - "OCR-D-SEG-LINE" - ], + "input_file_grp_cardinality": 1, + "output_file_grp_cardinality": 1, "parameters": { "dpi": { "type": "number", @@ -286,15 +250,9 @@ "steps": [ "recognition/text-recognition" ], - "description": "Recognize text in (binarized+deskewed+dewarped) lines with ocropy", - "input_file_grp": [ - "OCR-D-SEG-LINE", - "OCR-D-SEG-WORD", - "OCR-D-SEG-GLYPH" - ], - "output_file_grp": [ - "OCR-D-OCR-OCRO" - ], + "description": "Recognize text in (binarized+deskewed+dewarped) lines with Ocropy v1", + "input_file_grp_cardinality": 1, + "output_file_grp_cardinality": 1, "parameters": { "textequiv_level": { "type": "string", @@ -345,14 +303,9 @@ "layout/segmentation/region", "layout/segmentation/line" ], - "input_file_grp": [ - "OCR-D-GT-SEG-BLOCK", - "OCR-D-SEG-BLOCK" - ], - "output_file_grp": [ - "OCR-D-SEG-LINE" - ], - "description": "Segment pages into regions and lines, tables into cells and lines, or regions into lines with ocropy", + "input_file_grp_cardinality": 1, + "output_file_grp_cardinality": 1, + "description": "Segment pages into regions and lines, tables into cells and lines, or regions into lines with Ocropy v1", "parameters": { "dpi": { "type": "number", @@ -444,11 +397,9 @@ "steps": [ "recognition/text-recognition" ], - "input_file_grp": [ - "OCR-D-GT-SEG-BLOCK", - "OCR-D-SEG-BLOCK" - ], - "description": "train model with ground truth from mets data", + "input_file_grp_cardinality": 1, + "output_file_grp_cardinality": 1, + "description": "train Ocropy v1 text recognition model with PAGE ground truth from the input fileGrp extracted as file pairs into the output fileGrp", "parameters": { "textequiv_level": { "type": "string", @@ -470,7 +421,8 @@ }, "outputpath": { "type": "string", - "description": "(existing) path for the trained model" + "default": "output", + "description": "directory path for the trained model" } } }, @@ -482,15 +434,9 @@ "steps": [ "recognition/post-correction" ], - "input_file_grp": [ - "OCR-D-OCR-1", - "OCR-D-OCR-2", - "OCR-D-OCR-N" - ], - "output_file_grp": [ - "OCR-D-ALIGNED" - ], - "description": "Align multiple OCRs and/or GTs" + "input_file_grp_cardinality": [2, -1], + "output_file_grp_cardinality": 1, + "description": "Align multiple OCRs and/or GTs textually on line/word level" }, "ocrd-cis-postcorrect": { "executable": "ocrd-cis-postcorrect", @@ -501,12 +447,8 @@ "recognition/post-correction" ], "description": "Post correct OCR results", - "input_file_grp": [ - "OCR-D-LINE-ALIGNED" - ], - "output_file_grp": [ - "OCR-D-POST-CORRECTED" - ], + "input_file_grp_cardinality": 1, + "output_file_grp_cardinality": 1, "parameters": { "maxCandidates": { "description": "Maximum number of considered correction candidates per suspicious token", From a18307d4a8f50b0a4b081016c9d9db55cca63023 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Fri, 16 Aug 2024 15:27:09 +0200 Subject: [PATCH 53/97] fix: ocropy 
train errors --- ocrd_cis/ocropy/train.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/ocrd_cis/ocropy/train.py b/ocrd_cis/ocropy/train.py index 5c57b2cf..f5d70d6a 100644 --- a/ocrd_cis/ocropy/train.py +++ b/ocrd_cis/ocropy/train.py @@ -47,8 +47,8 @@ def setup(self): except SystemExit: ocropydir = dirname(abspath(__file__)) self.modelpath = join(ocropydir, 'models', model) - self.logger.error(f"Failed to resolve model '{model}' path, trying '{modelpath}'") - if not isfile(modelpath): + self.logger.error(f"Failed to resolve model '{model}' path, trying '{self.modelpath}'") + if not isfile(self.modelpath): self.logger.critical(f"Could not find model '{model}'.\n" f"Try 'ocrd resmgr download ocrd-cis-ocropy-recognize {model}'") exit(1) @@ -128,7 +128,7 @@ def extract_segment(self, path, segment, page_image, page_coords): if 'binarized' not in coords['features'].split(','): # binarize with nlbin - image, _ = binarize(image, maxskew=0) + image, _ = binarize(self.logger, image, maxskew=0) # resize image to 48 pixel height image = resize_keep_ratio(image) From 0ba6839c849688431fa2259da4cd934963724cfb Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Fri, 16 Aug 2024 15:39:09 +0200 Subject: [PATCH 54/97] remove: unused imports --- ocrd_cis/ocropy/binarize.py | 6 +----- ocrd_cis/ocropy/clip.py | 14 ++++++-------- ocrd_cis/ocropy/denoise.py | 10 ++-------- ocrd_cis/ocropy/deskew.py | 8 +------- ocrd_cis/ocropy/dewarp.py | 12 +++--------- ocrd_cis/ocropy/recognize.py | 12 ++---------- ocrd_cis/ocropy/resegment.py | 1 - ocrd_cis/ocropy/segment.py | 1 - ocrd_cis/ocropy/train.py | 9 +++++---- 9 files changed, 20 insertions(+), 53 deletions(-) diff --git a/ocrd_cis/ocropy/binarize.py b/ocrd_cis/ocropy/binarize.py index ac499336..271f01fa 100644 --- a/ocrd_cis/ocropy/binarize.py +++ b/ocrd_cis/ocropy/binarize.py @@ -1,14 +1,10 @@ from __future__ import absolute_import from logging import Logger +from typing import Optional import cv2 import numpy as np from PIL import Image -from os.path import abspath, dirname, join - -from typing import Union, Optional - -#import kraken.binarization from ocrd_utils import getLogger from ocrd_models.ocrd_page import AlternativeImageType, OcrdPage diff --git a/ocrd_cis/ocropy/clip.py b/ocrd_cis/ocropy/clip.py index 3ddd6a70..36ee4eb3 100644 --- a/ocrd_cis/ocropy/clip.py +++ b/ocrd_cis/ocropy/clip.py @@ -2,7 +2,6 @@ from logging import Logger from typing import Optional -from os.path import join import numpy as np from PIL import Image, ImageStat, ImageOps from shapely.geometry import Polygon @@ -12,19 +11,18 @@ from ocrd import Processor from ocrd.processor import OcrdPageResult, OcrdPageResultImage from ocrd_utils import ( - getLogger, - coordinates_of_segment, - polygon_from_points, bbox_from_polygon, + coordinates_of_segment, + crop_image, + getLogger, image_from_polygon, + polygon_from_points, polygon_mask, - crop_image, ) +from .common import array2pil, determine_zoom, pil2array from .ocrolib import midrange, morph -from .common import ( - # binarize, - array2pil, determine_zoom, pil2array) + class OcropyClip(Processor): logger: Logger diff --git a/ocrd_cis/ocropy/denoise.py b/ocrd_cis/ocropy/denoise.py index 0f368fd5..72757e0c 100644 --- a/ocrd_cis/ocropy/denoise.py +++ b/ocrd_cis/ocropy/denoise.py @@ -1,19 +1,13 @@ from __future__ import absolute_import - from typing import Optional from logging import Logger -from os.path import join from ocrd_utils import getLogger -from ocrd_models.ocrd_page import ( - AlternativeImageType, OcrdPage 
-) +from ocrd_models.ocrd_page import AlternativeImageType, OcrdPage from ocrd import Processor from ocrd.processor import OcrdPageResult, OcrdPageResultImage -from .common import ( - # binarize, - determine_zoom, remove_noise) +from .common import determine_zoom, remove_noise class OcropyDenoise(Processor): logger: Logger diff --git a/ocrd_cis/ocropy/deskew.py b/ocrd_cis/ocropy/deskew.py index fae0c90c..9f9f8b0a 100644 --- a/ocrd_cis/ocropy/deskew.py +++ b/ocrd_cis/ocropy/deskew.py @@ -1,15 +1,9 @@ from __future__ import absolute_import - from typing import Optional from logging import Logger -from os.path import join from ocrd_utils import getLogger -from ocrd_models.ocrd_page import ( - PageType, - AlternativeImageType, - OcrdPage -) +from ocrd_models.ocrd_page import AlternativeImageType, OcrdPage, PageType from ocrd import Processor from ocrd.processor import OcrdPageResult, OcrdPageResultImage diff --git a/ocrd_cis/ocropy/dewarp.py b/ocrd_cis/ocropy/dewarp.py index a063a05e..9902af95 100644 --- a/ocrd_cis/ocropy/dewarp.py +++ b/ocrd_cis/ocropy/dewarp.py @@ -1,18 +1,12 @@ from __future__ import absolute_import - -from typing import Optional from logging import Logger -from os.path import join - +from typing import Optional import numpy as np -from ocrd_utils import getLogger -from ocrd_models.ocrd_page import ( - AlternativeImageType, - OcrdPage -) from ocrd import Processor from ocrd.processor import OcrdPageResult, OcrdPageResultImage +from ocrd_utils import getLogger +from ocrd_models.ocrd_page import AlternativeImageType, OcrdPage from .ocrolib import lineest from .common import array2pil, check_line, determine_zoom, pil2array diff --git a/ocrd_cis/ocropy/recognize.py b/ocrd_cis/ocropy/recognize.py index 97fcc64d..41576e43 100644 --- a/ocrd_cis/ocropy/recognize.py +++ b/ocrd_cis/ocropy/recognize.py @@ -10,16 +10,8 @@ from rapidfuzz.distance import Levenshtein -from ocrd_utils import ( - getLogger, - coordinates_for_segment, - polygon_from_bbox, - points_from_polygon, -) -from ocrd_models.ocrd_page import ( - TextEquivType, OcrdPage, - CoordsType, GlyphType, WordType -) +from ocrd_utils import coordinates_for_segment, getLogger, points_from_polygon, polygon_from_bbox +from ocrd_models.ocrd_page import CoordsType, GlyphType, OcrdPage, TextEquivType, WordType from ocrd import Processor from ocrd.processor import OcrdPageResult diff --git a/ocrd_cis/ocropy/resegment.py b/ocrd_cis/ocropy/resegment.py index 05f17d4f..0ef64687 100644 --- a/ocrd_cis/ocropy/resegment.py +++ b/ocrd_cis/ocropy/resegment.py @@ -2,7 +2,6 @@ from typing import Optional from logging import Logger -from os.path import join import numpy as np from skimage import draw, segmentation diff --git a/ocrd_cis/ocropy/segment.py b/ocrd_cis/ocropy/segment.py index bdeb40dd..edb5751a 100644 --- a/ocrd_cis/ocropy/segment.py +++ b/ocrd_cis/ocropy/segment.py @@ -2,7 +2,6 @@ from typing import Optional from logging import Logger -from os.path import join import itertools import numpy as np diff --git a/ocrd_cis/ocropy/train.py b/ocrd_cis/ocropy/train.py index f5d70d6a..8f224b86 100644 --- a/ocrd_cis/ocropy/train.py +++ b/ocrd_cis/ocropy/train.py @@ -3,9 +3,8 @@ from typing import Optional from logging import Logger from sys import exit -from os import getcwd, makedirs, remove +from os import makedirs, remove from os.path import abspath, dirname, exists, join, isfile -import tempfile from ocrd_models import OcrdPage from ocrd import Processor, Workspace @@ -32,7 +31,9 @@ def resize_keep_ratio(image, baseheight=48): class 
OcropyTrain(Processor): logger: Logger + modelpath: str old_cwd: str + outputpath: str @property def executable(self): @@ -75,8 +76,8 @@ def process_workspace(self, workspace: Workspace) -> None: f"on {len(self.filelist)} file pairs") rtrain(self.filelist, self.modelpath, self.outputpath, self.parameter['ntrain']) # deletefiles(self.filelist) - - def process_page_pcgts(self, *input_pcgts : Optional[OcrdPage], page_id : Optional[str] = None) -> OcrdPageResult: + + def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional[str] = None) -> OcrdPageResult: """ Extracts pairs of plaintext and cropped image files for each text line in the PAGE file (to be used during training). From 6b06e8856addd3b4963961df6d6cb1fb29e126cf Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Fri, 16 Aug 2024 15:48:42 +0200 Subject: [PATCH 55/97] Update binarize.py --- ocrd_cis/ocropy/binarize.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ocrd_cis/ocropy/binarize.py b/ocrd_cis/ocropy/binarize.py index 3e87cf8a..e82dbc16 100644 --- a/ocrd_cis/ocropy/binarize.py +++ b/ocrd_cis/ocropy/binarize.py @@ -98,6 +98,7 @@ def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional page_image, page_xywh, page_image_info = self.workspace.image_from_page(page, page_id, feature_filter='binarized') zoom = determine_zoom(self.logger, page_id, self.parameter['dpi'], page_image_info) + result = OcrdPageResult(pcgts) if level == 'page': try: result.images.append(self.process_page(page, page_image, page_xywh, zoom, page_id)) @@ -256,4 +257,4 @@ def process_line(self, line, line_image, line_xywh, zoom, page_id, region_id) -> # update PAGE (reference the image file): alt_image = AlternativeImageType(comments=features) line.add_AlternativeImage(alt_image) - return OcrdPageResultImage(bin_image, suffix, alt_image) \ No newline at end of file + return OcrdPageResultImage(bin_image, suffix, alt_image) From d1a14b704c0d2559685b8f33ddd23d60c65563a7 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Fri, 16 Aug 2024 16:22:42 +0200 Subject: [PATCH 56/97] refactor: python strings v3 --- ocrd_cis/ocropy/binarize.py | 6 +-- ocrd_cis/ocropy/clip.py | 5 +-- ocrd_cis/ocropy/denoise.py | 8 ++-- ocrd_cis/ocropy/deskew.py | 7 ++-- ocrd_cis/ocropy/dewarp.py | 11 +++--- ocrd_cis/ocropy/recognize.py | 6 +-- ocrd_cis/ocropy/resegment.py | 72 +++++++++++++++------------------- ocrd_cis/ocropy/segment.py | 76 ++++++++++++++++++------------------ 8 files changed, 88 insertions(+), 103 deletions(-) diff --git a/ocrd_cis/ocropy/binarize.py b/ocrd_cis/ocropy/binarize.py index e82dbc16..782dd578 100644 --- a/ocrd_cis/ocropy/binarize.py +++ b/ocrd_cis/ocropy/binarize.py @@ -16,7 +16,7 @@ def binarize(logger: Logger, pil_image, method='ocropy', maxskew=2, threshold=0.5, nrm=False, zoom=1.0): - logger.debug('binarizing %dx%d image with method=%s', pil_image.width, pil_image.height, method) + logger.debug(f'Binarizing {pil_image.width}x{pil_image.height} image with method={method}') if method == 'none': # useful if the images are already binary, # but lack image attribute `binarized` @@ -242,8 +242,8 @@ def process_line(self, line, line_image, line_xywh, zoom, page_id, region_id) -> #orientation = -angle #orientation = 180 - (180 - orientation) % 360 # map to [-179.999,180] #line.set_orientation(orientation) # does not exist on line level! 
- self.logger.warning(f"cannot add orientation %.2f to page '{page_id}' region '{region_id}' line '{line.id}'", - -angle) + self.logger.warning( + f"Cannot add orientation %.2f to page '{page_id}' region '{region_id}' line '{line.id}'", -angle) bin_image = remove_noise(bin_image, maxsize=self.parameter['noise_maxsize']) if self.parameter['noise_maxsize']: features += ',despeckled' diff --git a/ocrd_cis/ocropy/clip.py b/ocrd_cis/ocropy/clip.py index 36ee4eb3..7f40a214 100644 --- a/ocrd_cis/ocropy/clip.py +++ b/ocrd_cis/ocropy/clip.py @@ -128,15 +128,13 @@ def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: str = No if level == 'region': if region.get_AlternativeImage(): # FIXME: This should probably be an exception (bad workflow configuration). - self.logger.warning( - f'Page "{page_id}" region "{region.id}" already contains image data: skipping') + self.logger.warning(f'Page "{page_id}" region "{region.id}" already contains image data: skipping') continue shape = prep(shapes[i]) neighbours = [(regionj, maskj) for shapej, regionj, maskj in zip(shapes[:i] + shapes[i+1:], regions[:i] + regions[i+1:], masks[:i] + masks[i+1:]) if shape.intersects(shapej)] if neighbours: - segment_region_file_id = f"{output_file_id}_{region.id}" ret.images.append(self.process_segment( region, masks[i], polygons[i], neighbours, background_image, page_image, page_xywh, page_bin, page_id)) @@ -167,7 +165,6 @@ def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: str = No in zip(shapes[:j] + shapes[j+1:], lines[:j] + lines[j+1:], masks[:j] + masks[j+1:]) if shape.intersects(shapej)] if neighbours: - segment_line_file_id = f"{output_file_id}_{region.id}_{line.id}" ret.images.append(self.process_segment( line, masks[j], polygons[j], neighbours, background_image, region_image, region_coords, region_bin, page_id)) diff --git a/ocrd_cis/ocropy/denoise.py b/ocrd_cis/ocropy/denoise.py index 72757e0c..b3c219fb 100644 --- a/ocrd_cis/ocropy/denoise.py +++ b/ocrd_cis/ocropy/denoise.py @@ -57,7 +57,7 @@ def process_page_pcgts(self, *input_pcgts : Optional[OcrdPage], page_id : Option else: regions = page.get_AllRegions(classes=['Text'], order='reading-order') if not regions: - self.logger.warning('Page "%s" contains no text regions', page_id) + self.logger.warning(f'Page "{page_id}" contains no text regions') for region in regions: region_image, region_xywh = self.workspace.image_from_segment( region, page_image, page_xywh, @@ -69,7 +69,7 @@ def process_page_pcgts(self, *input_pcgts : Optional[OcrdPage], page_id : Option continue lines = region.get_TextLine() if not lines: - self.logger.warning('Page "%s" region "%s" contains no text lines', page_id, region.id) + self.logger.warning(f'Page "{page_id}" region "{region.id}" contains no text lines') for line in lines: line_image, line_xywh = self.workspace.image_from_segment( line, region_image, region_xywh, @@ -80,9 +80,9 @@ def process_page_pcgts(self, *input_pcgts : Optional[OcrdPage], page_id : Option def process_segment(self, segment, segment_image, segment_xywh, zoom) -> Optional[OcrdPageResultImage]: if not segment_image.width or not segment_image.height: - self.logger.warning("Skipping '%s' with zero size", file_id) + self.logger.warning(f"Skipping '{segment.id}' with zero size") return None - self.logger.info("About to despeckle '%s'", file_id) + self.logger.info(f"About to despeckle '{segment.id}'") bin_image = remove_noise(segment_image, maxsize=self.parameter['noise_maxsize']/zoom*300/72) # in pt # update PAGE (reference the image 
file): diff --git a/ocrd_cis/ocropy/deskew.py b/ocrd_cis/ocropy/deskew.py index 9f9f8b0a..84475d81 100644 --- a/ocrd_cis/ocropy/deskew.py +++ b/ocrd_cis/ocropy/deskew.py @@ -73,8 +73,7 @@ def process_page_pcgts(self, *input_pcgts : Optional[OcrdPage], page_id : Option # (we will overwrite @orientation anyway,) # abort if no such image can be produced: feature_filter='deskewed') - image = self._process_segment(region, region_image, region_coords, - "region '%s'" % region.id, page_id) + image = self._process_segment(region, region_image, region_coords, f"region '{region.id}'", page_id) if image: result.images.append(image) return result @@ -84,14 +83,14 @@ def _process_segment(self, segment, segment_image, segment_coords, segment_id, p self.logger.warning("Skipping %s with zero size", segment_id) return None angle0 = segment_coords['angle'] # deskewing (w.r.t. top image) already applied to segment_image - self.logger.info("About to deskew %s", segment_id) + self.logger.info(f"About to deskew {segment_id}") angle = deskew(segment_image, maxskew=self.parameter['maxskew']) # additional angle to be applied # segment angle: PAGE orientation is defined clockwise, # whereas PIL/ndimage rotation is in mathematical direction: orientation = -(angle + angle0) orientation = 180 - (180 - orientation) % 360 # map to [-179.999,180] segment.set_orientation(orientation) # also removes all deskewed AlternativeImages - self.logger.info("Found angle for %s: %.1f", segment_id, angle) + self.logger.info(f"Found angle for {segment_id}: %.1f", angle) # delegate reflection, rotation and re-cropping to core: if isinstance(segment, PageType): segment_image, segment_coords, _ = self.workspace.image_from_page( diff --git a/ocrd_cis/ocropy/dewarp.py b/ocrd_cis/ocropy/dewarp.py index 9902af95..302cf2e0 100644 --- a/ocrd_cis/ocropy/dewarp.py +++ b/ocrd_cis/ocropy/dewarp.py @@ -101,29 +101,28 @@ def process_page_pcgts(self, *input_pcgts : Optional[OcrdPage], page_id : Option regions = page.get_AllRegions(classes=['Text'], order='reading-order') if not regions: - self.logger.warning('Page "%s" contains no text regions', page_id) + self.logger.warning(f'Page "{page_id}" contains no text regions') for region in regions: region_image, region_xywh = self.workspace.image_from_segment( region, page_image, page_xywh) lines = region.get_TextLine() if not lines: - self.logger.warning('Region %s contains no text lines', region.id) + self.logger.warning(f'Region {region.id} contains no text lines') for line in lines: line_image, line_xywh = self.workspace.image_from_segment( line, region_image, region_xywh) - self.logger.info("About to dewarp page '%s' region '%s' line '%s'", - page_id, region.id, line.id) + self.logger.info(f"About to dewarp page '{page_id}' region '{region.id}' line '{line.id}'") try: dew_image = dewarp(line_image, self.lnorm, check=True, max_neighbour=self.parameter['max_neighbour'], zoom=zoom) except InvalidLine as err: - self.logger.error('cannot dewarp line "%s": %s', line.id, err) + self.logger.error(f'Cannot dewarp line "{line.id}": {err}') continue except InadequateLine as err: - self.logger.warning('cannot dewarp line "%s": %s', line.id, err) + self.logger.warning(f'cannot dewarp line "{line.id}": {err}') # as a fallback, simply pad the image vertically # (just as dewarping would do on average, so at least # this line has similar margins as the others): diff --git a/ocrd_cis/ocropy/recognize.py b/ocrd_cis/ocropy/recognize.py index 41576e43..f0c4b520 100644 --- a/ocrd_cis/ocropy/recognize.py +++ 
b/ocrd_cis/ocropy/recognize.py @@ -179,13 +179,13 @@ def process_lines(self, textlines, maxlevel, region_image, region_xywh): linegt = line.TextEquiv[0].Unicode else: linegt = '' - self.logger.debug("GT '%s': '%s'", line.id, linegt) + self.logger.debug(f"GT '{line.id}': '{linegt}'") # remove existing annotation below line level: line.set_TextEquiv([]) line.set_Word([]) if line_image.size[1] < 16: - self.logger.debug(f"ERROR: bounding box is too narrow at line {line.id}") + self.logger.debug(f"Error: bounding box is too narrow at line {line.id}") continue # resize image to 48 pixel height final_img, scale = resize_keep_ratio(line_image) @@ -194,7 +194,7 @@ def process_lines(self, textlines, maxlevel, region_image, region_xywh): try: linepred, clist, rlist, confidlist = recognize(final_img, self.pad, self.network, check=True) except Exception as err: - self.logger.debug(f'error processing line "{line.id}": {err}') + self.logger.debug(f'Error processing line "{line.id}": {err}') continue self.logger.debug(f"OCR '{line.id}': '{linepred}'") edits += Levenshtein.distance(linepred, linegt) diff --git a/ocrd_cis/ocropy/resegment.py b/ocrd_cis/ocropy/resegment.py index 0ef64687..d429c1de 100644 --- a/ocrd_cis/ocropy/resegment.py +++ b/ocrd_cis/ocropy/resegment.py @@ -126,14 +126,14 @@ def process_page_pcgts(self, *input_pcgts : Optional[OcrdPage], page_id : Option page.get_CustomRegion()) regions = page.get_AllRegions(classes=['Text']) if not regions: - self.logger.warning('Page "%s" contains no text regions', page_id) + self.logger.warning(f'Page "{page_id}" contains no text regions') elif level == 'page': lines = [line for region in regions for line in region.get_TextLine()] if lines: self._process_segment(page, page_image, page_coords, page_id, zoom, lines, ignore) else: - self.logger.warning('Page "%s" contains no text regions with lines', page_id) + self.logger.warning(f'Page "{page_id}" contains no text regions with lines', ) else: for region in regions: lines = region.get_TextLine() @@ -142,7 +142,7 @@ def process_page_pcgts(self, *input_pcgts : Optional[OcrdPage], page_id : Option region, page_image, page_coords, feature_selector='binarized') self._process_segment(region, region_image, region_coords, page_id, zoom, lines, ignore) else: - self.logger.warning('Page "%s" region "%s" contains no text lines', page_id, region.id) + self.logger.warning(f'Page "{page_id}" region "{region.id}" contains no text lines') return OcrdPageResult(pcgts) def _process_segment(self, parent, parent_image, parent_coords, page_id, zoom, lines, ignore): @@ -163,8 +163,7 @@ def _process_segment(self, parent, parent_image, parent_coords, page_id, zoom, l fullpage = False report = check_region(parent_bin, zoom) if report: - self.logger.warning('Invalid %s "%s": %s', tag, - page_id if fullpage else parent.id, report) + self.logger.warning(f'Invalid {tag} "{page_id if fullpage else parent.id}": {report}') return # get existing line labels: line_labels = np.zeros_like(parent_bin, bool) @@ -191,8 +190,7 @@ def _process_segment(self, parent, parent_image, parent_coords, page_id, zoom, l line_labels[i, line_y, line_x] = True # only text region(s) may contain new text lines for i, region in enumerate(set(line.parent_object_ for line in lines)): - self.logger.debug('unmasking area of text region "%s" for "%s"', - region.id, page_id if fullpage else parent.id) + self.logger.debug(f'Unmasking area of text region "{region.id}" for "{page_id if fullpage else parent.id}"') region_polygon = coordinates_of_segment(region, 
parent_image, parent_coords) region_polygon = make_valid(Polygon(region_polygon)) region_polygon = np.array(region_polygon.exterior.coords, int)[:-1] @@ -201,14 +199,14 @@ def _process_segment(self, parent, parent_image, parent_coords, page_id, zoom, l parent_bin.shape)] = False # mask/ignore overlapping neighbours for i, segment in enumerate(ignore): - self.logger.debug('masking area of %s "%s" for "%s"', type(segment).__name__[:-4], - segment.id, page_id if fullpage else parent.id) + self.logger.debug(f'Masking area of {type(segment).__name__[:-4]} "{segment.id}" for ' + f'"{page_id if fullpage else parent.id}"') segment_polygon = coordinates_of_segment(segment, parent_image, parent_coords) ignore_bin[draw.polygon(segment_polygon[:, 1], segment_polygon[:, 0], parent_bin.shape)] = True if method != 'lineest': - self.logger.debug('calculating connected component and distance transforms for "%s"', parent.id) + self.logger.debug(f'Calculating connected component and distance transforms for "{parent.id}"') bin = parent_bin & ~ ignore_bin components, _ = morph.label(bin) # estimate glyph scale (roughly) @@ -217,7 +215,7 @@ def _process_segment(self, parent, parent_image, parent_coords, page_id, zoom, l counts = np.sqrt(3 * counts) scale = int(np.median(counts[(5/zoom < counts) & (counts < 100/zoom)])) components *= (counts > 15/zoom)[components] - self.logger.debug("estimated scale: %d", scale) + self.logger.debug(f"Estimated scale: {scale}") else: scale = 43 if method == 'ccomps': @@ -235,7 +233,7 @@ def _process_segment(self, parent, parent_image, parent_coords, page_id, zoom, l new_labels = np.zeros_like(parent_bin, np.uint8) for i, line in enumerate(lines): if line.Baseline is None: - self.logger.warning("Skipping '%s' without baseline", line.id) + self.logger.warning(f"Skipping '{line.id}' without baseline") new_labels[line_labels[i]] = i + 1 continue line_baseline = baseline_of_segment(line, parent_coords) @@ -254,14 +252,13 @@ def _process_segment(self, parent, parent_image, parent_coords, page_id, zoom, l parent_bin, seps=ignore_bin, zoom=zoom, spread_dist=maxdist or scale/2, fullpage=fullpage, maxseps=0, maxcolseps=len(ignore), maximages=0) except Exception as err: - self.logger.error('Cannot line-segment %s "%s": %s', - tag, page_id if fullpage else parent.id, err) + self.logger.error(f'Cannot line-segment {tag} "{page_id if fullpage else parent.id}": {err}') return - self.logger.info("Found %d new line labels for %d existing lines on %s '%s'", - new_line_labels.max(), len(lines), tag, parent.id) + self.logger.info( + f"Found {new_line_labels.max()} new line labels for {len(lines)} existing lines on {tag} '{parent.id}'") # polygonalize and prepare comparison new_line_polygons, new_line_labels = masks2polygons(self.logger, - new_line_labels, new_baselines, parent_bin, '%s "%s"' % (tag, parent.id), + new_line_labels, new_baselines, parent_bin, name=f'{tag} "{parent.id}"', min_area=640/zoom/zoom) DSAVE('line_labels', [np.argmax(np.insert(line_labels, 0, 0, axis=0), axis=0), parent_bin]) DSAVE('new_line_labels', [new_line_labels, parent_bin]) @@ -345,31 +342,29 @@ def _process_segment(self, parent, parent_image, parent_coords, page_id, zoom, l for j, line in enumerate(lines): new_lines = np.nonzero(assignments == j)[0] if not np.prod(new_lines.shape): - self.logger.debug("no lines for '%s' match or fit", line.id) + self.logger.debug(f"no lines for '{line.id}' match or fit", ) continue covers = np.sum(covers_bg[new_lines,j]) if covers < threshold / 3: - self.logger.debug("new lines for 
'%s' only cover %.1f%% bg", - line.id, covers * 100) + self.logger.debug(f"new lines for '{line.id}' only cover %.1f%% bg", covers * 100) continue covers = np.sum(covers_fg[new_lines,j]) if covers < threshold: - self.logger.debug("new lines for '%s' only cover %.1f%% fg", - line.id, covers * 100) + self.logger.debug(f"new lines for '{line.id}' only cover %.1f%% fg", covers * 100) continue looses = (assignments < 0) & (covers_bg[:,j] > 0.1) if looses.any(): covers = np.sum(covers_bg[np.nonzero(looses)[0],j]) - self.logger.debug("new lines for '%s' would loose %d non-matching segments totalling %.1f%% bg", - line.id, np.count_nonzero(looses), covers * 100) + self.logger.debug( + f"new lines for '{line.id}' would loose {np.count_nonzero(looses)} non-matching segments " + f"totalling %.1f%% bg", covers * 100) continue line_count = np.count_nonzero(line_labels[j] & parent_bin) new_count = covers * line_count - self.logger.debug('Black pixels before/after resegment of line "%s": %d/%d', - line.id, line_count, new_count) + self.logger.debug(f'Black pixels before/after resegment of line "{line.id}": {line_count}/{new_count}') # combine all assigned new lines to single outline polygon if len(new_lines) > 1: - self.logger.debug("joining %d new line polygons for '%s'", len(new_lines), line.id) + self.logger.debug(f"joining {len(new_lines)} new line polygons for '{line.id}'") new_polygon = join_polygons([new_line_polygons[i] #intersections[(i, j)] for i in new_lines], loc=line.id, scale=scale) new_baseline = join_baselines(self.logger, [new_polygon.intersection(new_baselines[i]) @@ -379,7 +374,7 @@ def _process_segment(self, parent, parent_image, parent_coords, page_id, zoom, l parent_image, parent_coords) line_polygon = polygon_for_parent(line_polygon, line.parent_object_) if line_polygon is None: - self.logger.warning("Ignoring extant new polygon for line '%s'", line.id) + self.logger.warning(f"Ignoring extant new polygon for line '{line.id}'") return # annotate result: line.get_Coords().set_points(points_from_polygon(line_polygon)) @@ -394,7 +389,7 @@ def _process_segment(self, parent, parent_image, parent_coords, page_id, zoom, l if j == otherj: continue otherline = lines[otherj] - self.logger.debug("subtracting new '%s' from overlapping '%s'", line.id, otherline.id) + self.logger.debug(f"subtracting new '{line.id}' from overlapping '{otherline.id}'") other_polygon = diff_polygons(line_polygons[otherj].context, new_polygon) if other_polygon.is_empty: continue @@ -403,7 +398,7 @@ def _process_segment(self, parent, parent_image, parent_coords, page_id, zoom, l parent_image, parent_coords) other_polygon = polygon_for_parent(other_polygon, otherline.parent_object_) if other_polygon is None: - self.logger.warning("Ignoring extant new polygon for line '%s'", otherline.id) + self.logger.warning(f"Ignoring extant new polygon for line '{otherline.id}'") continue otherline.get_Coords().set_points(points_from_polygon(other_polygon)) @@ -434,29 +429,26 @@ def spread_dist(logger: Logger, lines, old_labels, new_labels, binarized, compon continue count = np.count_nonzero(old_label) if not count: - logger.warning("skipping zero-area line '%s'", line.id) + logger.warning(f"skipping zero-area line '{line.id}'") continue covers = np.count_nonzero(new_label) / count if covers < threshold / 3: - logger.debug("new line for '%s' only covers %.1f%% bg", - line.id, covers * 100) + logger.debug(f"new line for '%s' only covers %.1f%% bg", covers * 100) continue count = np.count_nonzero(old_label * binarized) if not count: - 
logger.warning("skipping binary-empty line '%s'", line.id) + logger.warning(f"skipping binary-empty line '{line.id}'") continue covers = np.count_nonzero(new_label * binarized) / count if covers < threshold: - logger.debug("new line for '%s' only covers %.1f%% fg", - line.id, covers * 100) + logger.debug(f"new line for '{line.id}' only covers %.1f%% fg", covers * 100) continue - logger.debug('Black pixels before/after resegment of line "%s": %d/%d', - line.id, count, covers * count) + logger.debug(f'Black pixels before/after resegment of line "{line.id}": {count}/{covers * count}') contours = [contour[:,::-1] # get x,y order again for contour, area in morph.find_contours(new_label)] #LOG.debug("joining %d subsegments for %s", len(contours), line.id) if len(contours) == 0: - logger.warning("no contours for %s - keeping", line.id) + logger.warning(f"no contours for {line.id} - keeping") continue else: # get alpha shape @@ -468,7 +460,7 @@ def spread_dist(logger: Logger, lines, old_labels, new_labels, binarized, compon polygon = coordinates_for_segment(poly, None, coords) polygon = polygon_for_parent(polygon, line.parent_object_) if polygon is None: - logger.warning("Ignoring extant line for %s", line.id) + logger.warning(f"Ignoring extant line for {line.id}") continue line.get_Coords().set_points(points_from_polygon(polygon)) diff --git a/ocrd_cis/ocropy/segment.py b/ocrd_cis/ocropy/segment.py index edb5751a..e8c4a1ed 100644 --- a/ocrd_cis/ocropy/segment.py +++ b/ocrd_cis/ocropy/segment.py @@ -75,8 +75,6 @@ def masks2polygons(logger: Logger, bg_labels, baselines, fg_bin, name, min_area= - these polygons as a list of label, polygon, baseline tuples, and - a Numpy array of new background labels for that list. """ - if not logger: - raise ValueError(f"Logger has not been passed by the caller") # find sharp baseline if baselines is not None: def getx(xy): @@ -93,8 +91,7 @@ def getx(xy): bg_mask = np.array(bg_labels == label, bool) if not np.count_nonzero(bg_mask * fg_bin): # ignore if missing foreground - logger.debug('skipping label %d in %s due to empty fg', - label, name) + logger.debug(f'Skipping label {label} in {name} due to empty fg') continue # simplify to convex hull if simplify is not None: @@ -102,8 +99,8 @@ def getx(xy): conflicts = np.setdiff1d(hull * simplify, bg_mask * simplify) if conflicts.any(): - logger.debug('Cannot simplify %d: convex hull would create additional intersections %s', - label, str(conflicts)) + logger.debug( + f'Cannot simplify {label}: convex hull would create additional intersections {str(conflicts)}') else: bg_mask = hull if open_holes: @@ -131,8 +128,8 @@ def getx(xy): if len(hole) < 3: idx_hole = hier[0, idx_hole, 0] continue - logger.debug("label %d contour %d [%d pts] has hole %d [%d pts]", - label, idx, len(contour), idx_hole, len(hole)) + logger.debug( + f"Label {label} contour {idx} [{len(contour)} pts] has hole {idx_hole} [{len(hole)} pts]") #plot_poly(hole, 'blue') # cut child from outside... 
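An aside on the covers_fg / covers_bg checks in the resegment.py hunks above (and the similar test in spread_dist): the value compared against min_fraction is roughly the fraction of a line's foreground pixels that the newly computed label still covers. A toy sketch with made-up masks, purely to illustrate that arithmetic:

    import numpy as np

    threshold = 0.9                       # e.g. the min_fraction parameter
    old_label = np.zeros((6, 10), bool)
    old_label[1:5, 1:9] = True            # hypothetical existing line mask
    new_label = np.zeros((6, 10), bool)
    new_label[2:5, 1:9] = True            # hypothetical re-segmented line mask
    binarized = np.ones((6, 10), bool)    # hypothetical page foreground

    count = np.count_nonzero(old_label & binarized)            # fg pixels of the old line
    covers = np.count_nonzero(new_label & binarized) / count   # fraction still covered
    print(f"{covers:.1%} covered, keep new outline: {covers >= threshold}")

Below the threshold, the existing line polygon is kept unchanged, as in the hunks above.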
# first get nearest point on child @@ -173,7 +170,7 @@ def getx(xy): diff2 = (interpol[interpol_idx+1:interpol_idx+2] - cispoint2) // 5 cispoint1 = cispoint1 + diff1 cispoint2 = cispoint2 + diff2 - logger.debug("stitching at interpolation pos %d hole pos %d", interpol_idx, hole_idx) + logger.debug(f"Stitching at interpolation pos {interpol_idx} hole pos {hole_idx}") # now stitch together outer (up to cision), inner (re-arranged around cision), outer (rest) # (this works, because inner contours have inverse direction) contour = np.concatenate([contour[:contour_idx], cispoint1, @@ -182,7 +179,7 @@ def getx(xy): #plot_poly(contour, 'green') idx_hole = hier[0, idx_hole, 0] #plot_poly(contour, 'red') - logger.debug("adding label %d contour %d [%d pts]", label, idx, len(contour)) + logger.debug(f"Adding label {label} contour {idx} [{len(contour)} pts]") contours.append(contour) idx = hier[0, idx, 0] else: @@ -208,8 +205,7 @@ def getx(xy): contour = contours[i] area = areas[i] if min_area and area < min_area and area / total_area < 0.1: - logger.warning('Label %d contour %d is too small (%d/%d) in %s', - label, i, area, total_area, name) + logger.warning(f'Label {label} contour {i} is too small ({area}/{total_area}) in {name}') continue # simplify shape: # can produce invalid (self-intersecting) polygons: @@ -226,7 +222,7 @@ def getx(xy): logger.warning(explain_validity(polygon)) poly = polygon.exterior.coords[:-1] # keep open if len(poly) < 4: - logger.warning('Label %d contour %d for %s has less than 4 points', label, i, name) + logger.warning(f'Label {label} contour {i} for {name} has less than 4 points') continue # get baseline segments intersecting with this line mask # and concatenate them from left to right @@ -369,7 +365,7 @@ def process_page_pcgts(self, *input_pcgts : Optional[OcrdPage], page_id : Option if regions: # page is already region-segmented if overwrite_regions: - self.logger.info('removing existing TextRegions in page "%s"', page_id) + self.logger.info(f'Removing existing TextRegions in page "{page_id}"', ) # we could remove all other region types as well, # but this is more flexible (for workflows with # specialized separator/image/table detectors): @@ -377,7 +373,7 @@ def process_page_pcgts(self, *input_pcgts : Optional[OcrdPage], page_id : Option page.set_ReadingOrder(None) ro = None else: - self.logger.warning('keeping existing TextRegions in page "%s"', page_id) + self.logger.warning(f'Keeping existing TextRegions in page "{page_id}"', ) ignore.extend(regions) # create reading order if necessary if not ro or overwrite_order: @@ -404,20 +400,20 @@ def process_page_pcgts(self, *input_pcgts : Optional[OcrdPage], page_id : Option ignore.extend(page.get_TextRegion()) regions = list(page.get_TableRegion()) if not regions: - self.logger.warning('Page "%s" contains no table regions', page_id) + self.logger.warning(f'Page "{page_id}" contains no table regions') for region in regions: subregions = region.get_TextRegion() if subregions: # table is already cell-segmented if overwrite_regions: - self.logger.info('removing existing TextRegions in table "%s"', region.id) + self.logger.info(f'Removing existing TextRegions in table "{region.id}"') region.set_TextRegion([]) roelem = reading_order.get(region.id) # replace by empty group with same index and ref # (which can then take the cells as subregions) reading_order[region.id] = page_subgroup_in_reading_order(self.logger, roelem) else: - self.logger.warning('skipping table "%s" with existing TextRegions', region.id) + 
self.logger.warning(f'Skipping table "{region.id}" with existing TextRegions') continue # TODO: also allow grayscale_normalized (try/except?) region_image, region_coords = self.workspace.image_from_segment( @@ -428,19 +424,22 @@ def process_page_pcgts(self, *input_pcgts : Optional[OcrdPage], page_id : Option # create reading order group if necessary roelem = reading_order.get(region.id) if not roelem: - self.logger.warning("Page '%s' table region '%s' is not referenced in reading order (%s)", - page_id, region.id, "no target to add cells to") + self.logger.warning( + f"Page '{page_id}' table region '{region.id}' is not referenced in reading order " + f"(no target to add cells to)") elif overwrite_order: # replace by empty ordered group with same (index and) ref # (which can then take the cells as subregions) roelem = page_subgroup_in_reading_order(self.logger, roelem) reading_order[region.id] = roelem elif isinstance(roelem, (OrderedGroupType, OrderedGroupIndexedType)): - self.logger.warning("Page '%s' table region '%s' already has an ordered group (%s)", - page_id, region.id, "cells will be appended") + self.logger.warning( + f"Page '{page_id}' table region '{region.id}' already has an ordered group " + f"(cells will be appended)") elif isinstance(roelem, (UnorderedGroupType, UnorderedGroupIndexedType)): - self.logger.warning("Page '%s' table region '%s' already has an unordered group (%s)", - page_id, region.id, "cells will not be appended") + self.logger.warning( + f"Page '{page_id}' table region '{region.id}' already has an unordered group " + f"(cells will not be appended)") roelem = None else: # replace regionRef(Indexed) by group with same index and ref @@ -468,14 +467,14 @@ def process_page_pcgts(self, *input_pcgts : Optional[OcrdPage], page_id : Option region.add_TextRegion(subregion) regions.append(subregion) if not regions: - self.logger.warning('Page "%s" contains no text regions', page_id) + self.logger.warning(f'Page "{page_id}" contains no text regions') for region in regions: if region.get_TextLine(): if overwrite_lines: - self.logger.info('removing existing TextLines in page "%s" region "%s"', page_id, region.id) + self.logger.info(f'Removing existing TextLines in page "{page_id}" region "{region.id}"') region.set_TextLine([]) else: - self.logger.warning('keeping existing TextLines in page "%s" region "%s"', page_id, region.id) + self.logger.warning(f'Keeping existing TextLines in page "{page_id}" region "{region.id}"') ignore.extend(region.get_TextLine()) # TODO: also allow grayscale_normalized (try/except?) region_image, region_coords = self.workspace.image_from_segment( @@ -517,8 +516,8 @@ def _process_element(self, element, ignore, image, coords, zoom=1.0, rogroup=Non sep_bin = np.zeros_like(element_bin, bool) ignore_labels = np.zeros_like(element_bin, int) for i, segment in enumerate(ignore): - self.logger.debug(f'masking foreground of {type(segment).__name__[:-4]} ' - f'"{segment.id}" for "{element.id}"') + self.logger.debug( + f'Masking foreground of {type(segment).__name__[:-4]} "{segment.id}" for "{element.id}"') # mark these segments (e.g. 
separator regions, tables, images) # for workflows where they have been detected already; # these will be: @@ -552,7 +551,7 @@ def _process_element(self, element, ignore, image, coords, zoom=1.0, rogroup=Non fullpage = False report = check_region(element_bin, zoom) suffix = element.id + '.IMG-CLIP' - self.logger.info(f'computing line segmentation for {element_name} "{element.id}"') + self.logger.info(f'Computing line segmentation for {element_name} "{element.id}"') # TODO: we should downscale if DPI is large enough to save time try: if report: @@ -577,8 +576,7 @@ def _process_element(self, element, ignore, image, coords, zoom=1.0, rogroup=Non self.logger.error(f'Cannot line-segment {element_name} "{element.id}": {err}') return None - self.logger.info(f'Found {len(np.unique(line_labels)) - 1} text lines ' - f'for {element_name} "{element.id}"') + self.logger.info(f'Found {len(np.unique(line_labels)) - 1} text lines for {element_name} "{element.id}"') # post-process line labels if isinstance(element, (PageType, TableRegionType)): # aggregate text lines to text regions @@ -599,8 +597,8 @@ def _process_element(self, element, ignore, image, coords, zoom=1.0, rogroup=Non gap_height=self.parameter['gap_height'], gap_width=self.parameter['gap_width'], scale=scale, zoom=zoom) - self.logger.info(f'Found {len(np.unique(region_labels)) - 1} text regions ' - f'for {element_name} "{element.id}"') + self.logger.info( + f'Found {len(np.unique(region_labels)) - 1} text regions for {element_name} "{element.id}"') except Exception as err: self.logger.error(f'Cannot region-segment {element_name} "{element.id}": {err}') region_labels = np.where(line_labels > len(ignore), 1 + len(ignore), line_labels) @@ -630,7 +628,7 @@ def _process_element(self, element, ignore, image, coords, zoom=1.0, rogroup=Non # (no new region, no actual text lines) region_line_labels0 = np.intersect1d(region_line_labels0, ignore_labels) assert len(region_line_labels0) == 1, \ - (f'region label "{region_label}" has both existing regions and new lines ' + (f'Region label "{region_label}" has both existing regions and new lines ' f'({str(region_line_labels0)})') region = ignore[region_line_labels0[0] - 1] if rogroup and region.parent_object_ is element and not isinstance(region, SeparatorRegionType): @@ -907,9 +905,9 @@ def join_baselines(logger: Logger, baselines, loc=''): elif geom.geom_type == 'MultiLineString': lines.extend(geom) else: - logger.warning("ignoring baseline subtype %s in %s", geom.geom_type, loc) + logger.warning(f"Ignoring baseline subtype {geom.geom_type} in {loc}") else: - logger.warning("ignoring baseline type %s in %s", baseline.geom_type, loc) + logger.warning(f"Ignoring baseline type {baseline.geom_type} in {loc}") nlines = len(lines) if nlines == 0: return None @@ -971,7 +969,7 @@ def join_baselines(logger: Logger, baselines, loc=''): else: chains.append([prevl, nextl]) if len(chains) > 1: - logger.warning("baseline merge impossible (no spanning tree) in %s", loc) + logger.warning(f"Baseline merge impossible (no spanning tree) in {loc}") return None assert len(chains) == 1, chains assert len(chains[0]) == nlines, chains[0] @@ -983,7 +981,7 @@ def join_baselines(logger: Logger, baselines, loc=''): coords.extend(line.normalize().coords) result = LineString(coords) if result.is_empty: - logger.warning("baseline merge is empty in %s", loc) + logger.warning(f"Baseline merge is empty in {loc}") return None assert result.geom_type == 'LineString', result.wkt result = set_precision(result, 1.0) From 
d8542c20d5e39c1bf8670205a75c039f25198bf8 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Fri, 16 Aug 2024 16:39:43 +0200 Subject: [PATCH 57/97] spacing: train --- ocrd_cis/ocropy/train.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ocrd_cis/ocropy/train.py b/ocrd_cis/ocropy/train.py index 8f224b86..6c627231 100644 --- a/ocrd_cis/ocropy/train.py +++ b/ocrd_cis/ocropy/train.py @@ -19,8 +19,8 @@ def deletefiles(filelist): for file in filelist: if exists(file): remove(file) - if exists(file[:-3]+'gt.txt'): - remove(file[:-3]+'gt.txt') + if exists(file[:-3] + 'gt.txt'): + remove(file[:-3] + 'gt.txt') def resize_keep_ratio(image, baseheight=48): hpercent = (baseheight / float(image.size[1])) From d7859714ec6622a0b9294d9dc54d9f3e35f4606c Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Fri, 16 Aug 2024 16:39:54 +0200 Subject: [PATCH 58/97] spacing: segment --- ocrd_cis/ocropy/segment.py | 41 ++++++++++++++++++++------------------ 1 file changed, 22 insertions(+), 19 deletions(-) diff --git a/ocrd_cis/ocropy/segment.py b/ocrd_cis/ocropy/segment.py index e8c4a1ed..75be2a11 100644 --- a/ocrd_cis/ocropy/segment.py +++ b/ocrd_cis/ocropy/segment.py @@ -58,7 +58,9 @@ lines2regions ) -def masks2polygons(logger: Logger, bg_labels, baselines, fg_bin, name, min_area=None, simplify=None, open_holes=False, reorder=True): + +def masks2polygons(logger: Logger, bg_labels, baselines, fg_bin, name, min_area=None, simplify=None, open_holes=False, + reorder=True): """Convert label masks into polygon coordinates. Given a Numpy array of background labels ``bg_labels``, @@ -79,6 +81,7 @@ def masks2polygons(logger: Logger, bg_labels, baselines, fg_bin, name, min_area= if baselines is not None: def getx(xy): return xy[0] + baselines = [LineString(sorted([p[::-1] for p in line], key=getx)).simplify(5) for line in baselines if len(line) >= 2] @@ -96,8 +99,7 @@ def getx(xy): # simplify to convex hull if simplify is not None: hull = convex_hull_image(bg_mask.astype(np.uint8)).astype(bool) - conflicts = np.setdiff1d(hull * simplify, - bg_mask * simplify) + conflicts = np.setdiff1d(hull * simplify, bg_mask * simplify) if conflicts.any(): logger.debug( f'Cannot simplify {label}: convex hull would create additional intersections {str(conflicts)}') @@ -143,10 +145,10 @@ def getx(xy): contourtics = np.maximum(1, np.linalg.norm(contour2, axis=2).astype(int)[:,0] // 10) interpol = [] for i, ntics in enumerate(contourtics): - interpol.extend(np.array(contour[i:i+1] + - contour2[i:i+1] * - np.linspace(0, 1, ntics)[:,np.newaxis,np.newaxis], - int)) + interpol.extend(np.array( + contour[i:i + 1] + + contour2[i:i + 1] * + np.linspace(0, 1, ntics)[:, np.newaxis, np.newaxis], int)) interpol.append(contour[-1]) interpol = np.array(interpol) contourtics = np.insert(np.cumsum(contourtics), 0, 0) @@ -159,23 +161,24 @@ def getx(xy): contour_idx2 = contour_idx if contour_idx2 >= len(contour): contour_idx2 = 0 - cispoint1 = cispoint2 = interpol[interpol_idx:interpol_idx+1] + cispoint1 = cispoint2 = interpol[interpol_idx:interpol_idx + 1] if interpol_idx == 0: diff1 = (interpol[-1:] - cispoint1) // 5 else: - diff1 = (interpol[interpol_idx-1:interpol_idx] - cispoint1) // 5 + diff1 = (interpol[interpol_idx - 1: interpol_idx] - cispoint1) // 5 if interpol_idx + 1 >= len(interpol): diff2 = (interpol[0:1] - cispoint2) // 5 else: - diff2 = (interpol[interpol_idx+1:interpol_idx+2] - cispoint2) // 5 + diff2 = (interpol[interpol_idx + 1: interpol_idx + 2] - cispoint2) // 5 cispoint1 = cispoint1 + diff1 cispoint2 = cispoint2 
+ diff2 logger.debug(f"Stitching at interpolation pos {interpol_idx} hole pos {hole_idx}") # now stitch together outer (up to cision), inner (re-arranged around cision), outer (rest) # (this works, because inner contours have inverse direction) - contour = np.concatenate([contour[:contour_idx], cispoint1, - hole[hole_idx:], hole[:hole_idx], - cispoint2, contour[contour_idx:]]) + contour = np.concatenate( + [contour[:contour_idx], cispoint1, + hole[hole_idx:], hole[:hole_idx], + cispoint2, contour[contour_idx:]]) #plot_poly(contour, 'green') idx_hole = hier[0, idx_hole, 0] #plot_poly(contour, 'red') @@ -210,7 +213,7 @@ def getx(xy): # simplify shape: # can produce invalid (self-intersecting) polygons: #polygon = cv2.approxPolyDP(contour, 2, False)[:, 0, ::] # already ordered x,y - polygon = contour[:, 0, ::] # already ordered x,y + polygon = contour[:, 0, ::] # already ordered x,y # simplify and validate: polygon = Polygon(polygon) if not polygon.is_valid: @@ -220,22 +223,22 @@ def getx(xy): if not polygon.is_valid: #LOG.debug(polygon.wkt) logger.warning(explain_validity(polygon)) - poly = polygon.exterior.coords[:-1] # keep open + poly = polygon.exterior.coords[:-1] # keep open if len(poly) < 4: logger.warning(f'Label {label} contour {i} for {name} has less than 4 points') continue # get baseline segments intersecting with this line mask # and concatenate them from left to right if baselines is not None: - base = join_baselines(logger, [baseline.intersection(polygon) - for baseline in baselines - if baseline.intersects(polygon)], name) + base = join_baselines( + logger, + [baseline.intersection(polygon) for baseline in baselines if baseline.intersects(polygon)], name) if base is not None: base = base.coords else: base = None results.append((label, poly, base)) - result_labels[contour_labels == i+1] = len(results) + result_labels[contour_labels == i + 1] = len(results) return results, result_labels From 7ca78a97db34559ebf1a8dd819ea08e5415ec8d9 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Fri, 16 Aug 2024 16:40:08 +0200 Subject: [PATCH 59/97] spacing: resegment --- ocrd_cis/ocropy/resegment.py | 94 +++++++++++++++++------------------- 1 file changed, 43 insertions(+), 51 deletions(-) diff --git a/ocrd_cis/ocropy/resegment.py b/ocrd_cis/ocropy/resegment.py index d429c1de..48bb0d40 100644 --- a/ocrd_cis/ocropy/resegment.py +++ b/ocrd_cis/ocropy/resegment.py @@ -144,11 +144,11 @@ def process_page_pcgts(self, *input_pcgts : Optional[OcrdPage], page_id : Option else: self.logger.warning(f'Page "{page_id}" region "{region.id}" contains no text lines') return OcrdPageResult(pcgts) - + def _process_segment(self, parent, parent_image, parent_coords, page_id, zoom, lines, ignore): threshold = self.parameter['min_fraction'] method = self.parameter['method'] - maxdist = self.parameter['spread']/zoom*300/72 # in pt + maxdist = self.parameter['spread'] / zoom * 300 / 72 # in pt # prepare line segmentation parent_array = pil2array(parent_image) #parent_array, _ = common.binarize(parent_array, maxskew=0) # just in case still raw @@ -172,7 +172,7 @@ def _process_segment(self, parent, parent_image, parent_coords, page_id, zoom, l for i, line in enumerate(lines): if self.parameter['baseline_only'] and line.Baseline: line_base = baseline_of_segment(line, parent_coords) - line_poly = polygon_from_baseline(line_base, 30/zoom) + line_poly = polygon_from_baseline(line_base, 30 / zoom) else: line_poly = coordinates_of_segment(line, parent_image, parent_coords) line_poly = make_valid(Polygon(line_poly)) @@ 
-184,9 +184,7 @@ def _process_segment(self, parent, parent_image, parent_coords, page_id, zoom, l # (causing negative/above-max indices), either fully or partially, # then this will silently ignore them. The caller does not need # to concern herself with this. - line_y, line_x = draw.polygon(polygon[:, 1], - polygon[:, 0], - parent_bin.shape) + line_y, line_x = draw.polygon(polygon[:, 1], polygon[:, 0], parent_bin.shape) line_labels[i, line_y, line_x] = True # only text region(s) may contain new text lines for i, region in enumerate(set(line.parent_object_ for line in lines)): @@ -194,17 +192,13 @@ def _process_segment(self, parent, parent_image, parent_coords, page_id, zoom, l region_polygon = coordinates_of_segment(region, parent_image, parent_coords) region_polygon = make_valid(Polygon(region_polygon)) region_polygon = np.array(region_polygon.exterior.coords, int)[:-1] - ignore_bin[draw.polygon(region_polygon[:, 1], - region_polygon[:, 0], - parent_bin.shape)] = False + ignore_bin[draw.polygon(region_polygon[:, 1], region_polygon[:, 0], parent_bin.shape)] = False # mask/ignore overlapping neighbours for i, segment in enumerate(ignore): self.logger.debug(f'Masking area of {type(segment).__name__[:-4]} "{segment.id}" for ' f'"{page_id if fullpage else parent.id}"') segment_polygon = coordinates_of_segment(segment, parent_image, parent_coords) - ignore_bin[draw.polygon(segment_polygon[:, 1], - segment_polygon[:, 0], - parent_bin.shape)] = True + ignore_bin[draw.polygon(segment_polygon[:, 1], segment_polygon[:, 0], parent_bin.shape)] = True if method != 'lineest': self.logger.debug(f'Calculating connected component and distance transforms for "{parent.id}"') bin = parent_bin & ~ ignore_bin @@ -213,8 +207,8 @@ def _process_segment(self, parent, parent_image, parent_coords, page_id, zoom, l _, counts = np.unique(components, return_counts=True) if counts.shape[0] > 1: counts = np.sqrt(3 * counts) - scale = int(np.median(counts[(5/zoom < counts) & (counts < 100/zoom)])) - components *= (counts > 15/zoom)[components] + scale = int(np.median(counts[(5 / zoom < counts) & (counts < 100 / zoom)])) + components *= (counts > 15 / zoom)[components] self.logger.debug(f"Estimated scale: {scale}") else: scale = 43 @@ -244,12 +238,12 @@ def _process_segment(self, parent, parent_image, parent_coords, page_id, zoom, l parent_bin.shape) new_labels[line_y, line_x] = i + 1 spread_dist(self.logger, lines, line_labels, new_labels, parent_bin, components, parent_coords, - maxdist=maxdist or scale/2, loc=parent.id, threshold=threshold) + maxdist=maxdist or scale / 2, loc=parent.id, threshold=threshold) return try: # TODO: 'scale' passed as a param may not be always defined (mehmedGIT) new_line_labels, new_baselines, _, _, _, scale = compute_segmentation( - parent_bin, seps=ignore_bin, zoom=zoom, spread_dist=maxdist or scale/2, + parent_bin, seps=ignore_bin, zoom=zoom, spread_dist=maxdist or scale / 2, fullpage=fullpage, maxseps=0, maxcolseps=len(ignore), maximages=0) except Exception as err: self.logger.error(f'Cannot line-segment {tag} "{page_id if fullpage else parent.id}": {err}') @@ -257,13 +251,13 @@ def _process_segment(self, parent, parent_image, parent_coords, page_id, zoom, l self.logger.info( f"Found {new_line_labels.max()} new line labels for {len(lines)} existing lines on {tag} '{parent.id}'") # polygonalize and prepare comparison - new_line_polygons, new_line_labels = masks2polygons(self.logger, - new_line_labels, new_baselines, parent_bin, name=f'{tag} "{parent.id}"', - min_area=640/zoom/zoom) + 
new_line_polygons, new_line_labels = masks2polygons( + self.logger, new_line_labels, new_baselines, parent_bin, name=f'{tag} "{parent.id}"', + min_area=640 / zoom / zoom) DSAVE('line_labels', [np.argmax(np.insert(line_labels, 0, 0, axis=0), axis=0), parent_bin]) DSAVE('new_line_labels', [new_line_labels, parent_bin]) - new_line_polygons, new_baselines = list(zip(*[(Polygon(poly), LineString(base)) - for _, poly, base in new_line_polygons])) or ([], []) + new_line_polygons, new_baselines = list(zip( + *[(Polygon(poly), LineString(base)) for _, poly, base in new_line_polygons])) or ([], []) # polygons for intersecting pairs intersections = dict() # ratio of overlap between intersection and new line @@ -281,12 +275,12 @@ def _process_segment(self, parent, parent_image, parent_coords, page_id, zoom, l inter = make_intersection(line_poly.context, new_line_poly) if not inter: continue - new_line_mask = (new_line_labels == i+1) & parent_bin + new_line_mask = (new_line_labels == i + 1) & parent_bin line_mask = line_labels[j] & parent_bin inter_mask = new_line_mask & line_mask if (not np.count_nonzero(inter_mask) or - not np.count_nonzero(new_line_mask) or - not np.count_nonzero(line_mask)): + not np.count_nonzero(new_line_mask) or + not np.count_nonzero(line_mask)): continue intersections[(i, j)] = inter fits_bg[i, j] = inter.area / new_line_poly.area @@ -344,17 +338,17 @@ def _process_segment(self, parent, parent_image, parent_coords, page_id, zoom, l if not np.prod(new_lines.shape): self.logger.debug(f"no lines for '{line.id}' match or fit", ) continue - covers = np.sum(covers_bg[new_lines,j]) + covers = np.sum(covers_bg[new_lines, j]) if covers < threshold / 3: self.logger.debug(f"new lines for '{line.id}' only cover %.1f%% bg", covers * 100) continue - covers = np.sum(covers_fg[new_lines,j]) + covers = np.sum(covers_fg[new_lines, j]) if covers < threshold: self.logger.debug(f"new lines for '{line.id}' only cover %.1f%% fg", covers * 100) continue - looses = (assignments < 0) & (covers_bg[:,j] > 0.1) + looses = (assignments < 0) & (covers_bg[:, j] > 0.1) if looses.any(): - covers = np.sum(covers_bg[np.nonzero(looses)[0],j]) + covers = np.sum(covers_bg[np.nonzero(looses)[0], j]) self.logger.debug( f"new lines for '{line.id}' would loose {np.count_nonzero(looses)} non-matching segments " f"totalling %.1f%% bg", covers * 100) @@ -365,13 +359,12 @@ def _process_segment(self, parent, parent_image, parent_coords, page_id, zoom, l # combine all assigned new lines to single outline polygon if len(new_lines) > 1: self.logger.debug(f"joining {len(new_lines)} new line polygons for '{line.id}'") - new_polygon = join_polygons([new_line_polygons[i] #intersections[(i, j)] - for i in new_lines], loc=line.id, scale=scale) - new_baseline = join_baselines(self.logger, [new_polygon.intersection(new_baselines[i]) - for i in new_lines], loc=line.id) + # intersections[(i, j)] + new_polygon = join_polygons([new_line_polygons[i] for i in new_lines], loc=line.id, scale=scale) + new_baseline = join_baselines( + self.logger, [new_polygon.intersection(new_baselines[i]) for i in new_lines], loc=line.id) # convert back to absolute (page) coordinates: - line_polygon = coordinates_for_segment(new_polygon.exterior.coords[:-1], - parent_image, parent_coords) + line_polygon = coordinates_for_segment(new_polygon.exterior.coords[:-1], parent_image, parent_coords) line_polygon = polygon_for_parent(line_polygon, line.parent_object_) if line_polygon is None: self.logger.warning(f"Ignoring extant new polygon for line '{line.id}'") @@ 
-379,8 +372,7 @@ def _process_segment(self, parent, parent_image, parent_coords, page_id, zoom, l # annotate result: line.get_Coords().set_points(points_from_polygon(line_polygon)) if new_baseline is not None: - new_baseline = coordinates_for_segment(new_baseline.coords, - parent_image, parent_coords) + new_baseline = coordinates_for_segment(new_baseline.coords, parent_image, parent_coords) line.set_Baseline(BaselineType(points=points_from_polygon(new_baseline))) line_polygons[j] = prep(new_polygon) # now also ensure the assigned lines do not overlap other existing lines @@ -394,20 +386,22 @@ def _process_segment(self, parent, parent_image, parent_coords, page_id, zoom, l if other_polygon.is_empty: continue # convert back to absolute (page) coordinates: - other_polygon = coordinates_for_segment(other_polygon.exterior.coords[:-1], - parent_image, parent_coords) + other_polygon = coordinates_for_segment( + other_polygon.exterior.coords[:-1], parent_image, parent_coords) other_polygon = polygon_for_parent(other_polygon, otherline.parent_object_) if other_polygon is None: self.logger.warning(f"Ignoring extant new polygon for line '{otherline.id}'") continue otherline.get_Coords().set_points(points_from_polygon(other_polygon)) -def spread_dist(logger: Logger, lines, old_labels, new_labels, binarized, components, coords, - maxdist=43, loc='', threshold=0.9): + +def spread_dist( + logger: Logger, lines, old_labels, new_labels, binarized, components, coords, maxdist=43, loc='', + threshold=0.9): """redefine line coordinates by contourizing spread of connected components propagated from new labels""" DSAVE('seeds', [new_labels, (components>0)]) # allocate to connected components consistently - # (ignoring smallest components like punctuation) + # (ignoring the smallest components like punctuation) # but when there are conflicts, meet in the middle via watershed new_labels2 = morph.propagate_labels(components > 0, new_labels, conflict=0) new_labels2 = segmentation.watershed(new_labels2, markers=new_labels, mask=(components > 0)) @@ -415,7 +409,7 @@ def spread_dist(logger: Logger, lines, old_labels, new_labels, binarized, compon # dilate/grow labels from connected components against each other and bg new_labels = morph.spread_labels(new_labels2, maxdist=maxdist) DSAVE('spread', new_labels) - # now propagate again to catch smallest components like punctuation + # now propagate again to catch the smallest components like punctuation new_labels2 = morph.propagate_labels(binarized, new_labels, conflict=0) new_labels2 = segmentation.watershed(new_labels2, markers=new_labels, mask=binarized) DSAVE('propagated-again', [new_labels2, binarized & (new_labels2==0)]) @@ -444,7 +438,7 @@ def spread_dist(logger: Logger, lines, old_labels, new_labels, binarized, compon logger.debug(f"new line for '{line.id}' only covers %.1f%% fg", covers * 100) continue logger.debug(f'Black pixels before/after resegment of line "{line.id}": {count}/{covers * count}') - contours = [contour[:,::-1] # get x,y order again + contours = [contour[:, :: -1] # get x,y order again for contour, area in morph.find_contours(new_label)] #LOG.debug("joining %d subsegments for %s", len(contours), line.id) if len(contours) == 0: @@ -452,10 +446,9 @@ def spread_dist(logger: Logger, lines, old_labels, new_labels, binarized, compon continue else: # get alpha shape - poly = join_polygons([make_valid(Polygon(contour)) - for contour in contours - if len(contour) >= 4], - loc=line.id, scale=maxdist) + poly = join_polygons( + [make_valid(Polygon(contour)) 
for contour in contours if len(contour) >= 4], + loc=line.id, scale=maxdist) poly = poly.exterior.coords[:-1] polygon = coordinates_for_segment(poly, None, coords) polygon = polygon_for_parent(polygon, line.parent_object_) @@ -472,9 +465,8 @@ def baseline_of_segment(segment, coords): # zzz should go into core ocrd_utils def polygon_from_baseline(baseline, scale): - ltr = baseline[0,0] < baseline[-1,0] + ltr = baseline[0, 0] < baseline[-1, 0] # left-hand side if left-to-right, and vice versa - polygon = make_valid(join_polygons([LineString(baseline).buffer(scale * (-1) ** ltr, - single_sided=True)], - scale=scale)) + polygon = make_valid(join_polygons( + [LineString(baseline).buffer(scale * (-1) ** ltr, single_sided=True)], scale=scale)) return polygon From 1004b431e451be4288aa98054dff843bce3e306b Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Fri, 16 Aug 2024 16:52:51 +0200 Subject: [PATCH 60/97] spacing: rest --- ocrd_cis/ocropy/binarize.py | 11 ++++++----- ocrd_cis/ocropy/clip.py | 34 ++++++++++++++++++---------------- ocrd_cis/ocropy/denoise.py | 9 ++++----- ocrd_cis/ocropy/deskew.py | 22 +++++++++++----------- ocrd_cis/ocropy/dewarp.py | 21 ++++++++++----------- ocrd_cis/ocropy/recognize.py | 35 +++++++++++++++-------------------- ocrd_cis/ocropy/resegment.py | 2 +- ocrd_cis/ocropy/segment.py | 2 +- 8 files changed, 66 insertions(+), 70 deletions(-) diff --git a/ocrd_cis/ocropy/binarize.py b/ocrd_cis/ocropy/binarize.py index 782dd578..35b28c5a 100644 --- a/ocrd_cis/ocropy/binarize.py +++ b/ocrd_cis/ocropy/binarize.py @@ -38,14 +38,14 @@ def binarize(logger: Logger, pil_image, method='ocropy', maxskew=2, threshold=0. if method == 'global': # global thresholding - _, th = cv2.threshold(img,threshold*255,255,cv2.THRESH_BINARY) + _, th = cv2.threshold(img, threshold * 255, 255, cv2.THRESH_BINARY) elif method == 'otsu': # Otsu's thresholding - _, th = cv2.threshold(img,0,255,cv2.THRESH_BINARY+cv2.THRESH_OTSU) + _, th = cv2.threshold(img, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU) elif method == 'gauss-otsu': # Otsu's thresholding after Gaussian filtering blur = cv2.GaussianBlur(img, (5, 5), 0) - _, th = cv2.threshold(blur, 0, 255, cv2.THRESH_BINARY+cv2.THRESH_OTSU) + _, th = cv2.threshold(blur, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU) else: raise Exception('unknown binarization method %s' % method) return Image.fromarray(th), 0 @@ -95,7 +95,8 @@ def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional page = pcgts.get_Page() assert page - page_image, page_xywh, page_image_info = self.workspace.image_from_page(page, page_id, feature_filter='binarized') + page_image, page_xywh, page_image_info = self.workspace.image_from_page( + page, page_id, feature_filter='binarized') zoom = determine_zoom(self.logger, page_id, self.parameter['dpi'], page_image_info) result = OcrdPageResult(pcgts) @@ -162,7 +163,7 @@ def process_page(self, page, page_image, page_xywh, zoom, page_id) -> OcrdPageRe # to do consistent coordinate transforms, and non-consumers # to redo the rotation themselves): orientation = -page_xywh['angle'] - orientation = 180 - (180 - orientation) % 360 # map to [-179.999,180] + orientation = 180 - (180 - orientation) % 360 # map to [-179.999,180] page.set_orientation(orientation) if self.parameter['grayscale']: suffix = '.IMG-NRM' diff --git a/ocrd_cis/ocropy/clip.py b/ocrd_cis/ocropy/clip.py index 7f40a214..f5390dde 100644 --- a/ocrd_cis/ocropy/clip.py +++ b/ocrd_cis/ocropy/clip.py @@ -124,16 +124,17 @@ def process_page_pcgts(self, *input_pcgts: 
Optional[OcrdPage], page_id: str = No masks = [pil2array(polygon_mask(page_image, polygon)).astype(np.uint8) for polygon in polygons] for i, region in enumerate(regions): if i >= num_texts: - break # keep non-text regions unchanged + break # keep non-text regions unchanged if level == 'region': if region.get_AlternativeImage(): # FIXME: This should probably be an exception (bad workflow configuration). self.logger.warning(f'Page "{page_id}" region "{region.id}" already contains image data: skipping') continue shape = prep(shapes[i]) - neighbours = [(regionj, maskj) for shapej, regionj, maskj - in zip(shapes[:i] + shapes[i+1:], regions[:i] + regions[i+1:], masks[:i] + masks[i+1:]) - if shape.intersects(shapej)] + neighbours = [ + (regionj, maskj) for shapej, regionj, maskj in + zip(shapes[:i] + shapes[i + 1:], regions[:i] + regions[i + 1:], masks[:i] + masks[i + 1:]) + if shape.intersects(shapej)] if neighbours: ret.images.append(self.process_segment( region, masks[i], polygons[i], neighbours, background_image, @@ -161,24 +162,25 @@ def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: str = No f'Page "{page_id}" region "{region.id}" line "{line.id}" already contains image data: skipping') continue shape = prep(shapes[j]) - neighbours = [(linej, maskj) for shapej, linej, maskj - in zip(shapes[:j] + shapes[j+1:], lines[:j] + lines[j+1:], masks[:j] + masks[j+1:]) - if shape.intersects(shapej)] + neighbours = [ + (linej, maskj) for shapej, linej, maskj in + zip(shapes[:j] + shapes[j + 1:], lines[:j] + lines[j + 1:], masks[:j] + masks[j + 1:]) + if shape.intersects(shapej)] if neighbours: ret.images.append(self.process_segment( line, masks[j], polygons[j], neighbours, background_image, region_image, region_coords, region_bin, page_id)) return ret - def process_segment(self, segment, segment_mask, segment_polygon, neighbours, - background_image, parent_image, parent_coords, parent_bin, - page_id) -> OcrdPageResultImage: + def process_segment( + self, segment, segment_mask, segment_polygon, neighbours, background_image, parent_image, parent_coords, + parent_bin, page_id + ) -> OcrdPageResultImage: # initialize AlternativeImage@comments classes from parent, except # for those operations that can apply on multiple hierarchy levels: features = ','.join( [feature for feature in parent_coords['features'].split(',') - if feature in ['binarized', 'grayscale_normalized', - 'despeckled', 'dewarped']]) + ',clipped' + if feature in ['binarized', 'grayscale_normalized', 'despeckled', 'dewarped']]) + ',clipped' # mask segment within parent image: segment_image = image_from_polygon(parent_image, segment_polygon) segment_bbox = bbox_from_polygon(segment_polygon) @@ -188,8 +190,8 @@ def process_segment(self, segment, segment_mask, segment_polygon, neighbours, f'Ignoring enclosing neighbour "{neighbour.id}" of segment "{segment.id}" on page "{page_id}"') continue # find connected components that (only) belong to the neighbour: - intruders = segment_mask * morph.keep_marked(parent_bin, neighbour_mask > 0) # overlaps neighbour - intruders = morph.remove_marked(intruders, segment_mask > neighbour_mask) # but exclusively + intruders = segment_mask * morph.keep_marked(parent_bin, neighbour_mask > 0) # overlaps neighbour + intruders = morph.remove_marked(intruders, segment_mask > neighbour_mask) # but exclusively num_intruders = np.count_nonzero(intruders) num_foreground = np.count_nonzero(segment_mask * parent_bin) if not num_intruders: @@ -202,14 +204,14 @@ def process_segment(self, segment, 
segment_mask, segment_polygon, neighbours, segment_mask -= intruders # suppress in derived image result to be annotated clip_mask = array2pil(intruders) - segment_image.paste(background_image, mask=clip_mask) # suppress in raw image + segment_image.paste(background_image, mask=clip_mask) # suppress in raw image if segment_image.mode in ['RGB', 'L', 'RGBA', 'LA']: # for consumers that do not have to rely on our # guessed background color, but can cope with transparency: segment_image.putalpha(ImageOps.invert(clip_mask)) # recrop segment into rectangle, just as image_from_segment would do # (and also clipping with background colour): - segment_image = crop_image(segment_image,box=segment_bbox) + segment_image = crop_image(segment_image, box=segment_bbox) # update PAGE (reference the image file): alternative_image = AlternativeImageType(comments=features) segment.add_AlternativeImage(alternative_image) diff --git a/ocrd_cis/ocropy/denoise.py b/ocrd_cis/ocropy/denoise.py index b3c219fb..0dd14ef8 100644 --- a/ocrd_cis/ocropy/denoise.py +++ b/ocrd_cis/ocropy/denoise.py @@ -19,7 +19,7 @@ def executable(self): def setup(self): self.logger = getLogger('processor.OcropyDenoise') - def process_page_pcgts(self, *input_pcgts : Optional[OcrdPage], page_id : Optional[str] = None) -> OcrdPageResult: + def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional[str] = None) -> OcrdPageResult: """Despeckle the pages / regions / lines of the workspace. Open and deserialise PAGE input file and its respective images, @@ -72,8 +72,7 @@ def process_page_pcgts(self, *input_pcgts : Optional[OcrdPage], page_id : Option self.logger.warning(f'Page "{page_id}" region "{region.id}" contains no text lines') for line in lines: line_image, line_xywh = self.workspace.image_from_segment( - line, region_image, region_xywh, - feature_selector='binarized') + line, region_image, region_xywh, feature_selector='binarized') image = self.process_segment(line, line_image, line_xywh, zoom) if image: result.images.append(image) @@ -83,8 +82,8 @@ def process_segment(self, segment, segment_image, segment_xywh, zoom) -> Optiona self.logger.warning(f"Skipping '{segment.id}' with zero size") return None self.logger.info(f"About to despeckle '{segment.id}'") - bin_image = remove_noise(segment_image, - maxsize=self.parameter['noise_maxsize']/zoom*300/72) # in pt + bin_image = remove_noise( + segment_image, maxsize=self.parameter['noise_maxsize'] / zoom * 300 / 72) # in pt # update PAGE (reference the image file): alt_image = AlternativeImageType(comments=segment_xywh['features'] + ',despeckled') segment.add_AlternativeImage(alt_image) diff --git a/ocrd_cis/ocropy/deskew.py b/ocrd_cis/ocropy/deskew.py index 84475d81..7bdbba2d 100644 --- a/ocrd_cis/ocropy/deskew.py +++ b/ocrd_cis/ocropy/deskew.py @@ -25,7 +25,7 @@ def executable(self): def setup(self): self.logger = getLogger('processor.OcropyDeskew') - def process_page_pcgts(self, *input_pcgts : Optional[OcrdPage], page_id : Optional[str] = None) -> OcrdPageResult: + def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional[str] = None) -> OcrdPageResult: """Deskew the pages or regions of the workspace. 
Open and deserialise PAGE input file and its respective images, @@ -61,7 +61,7 @@ def process_page_pcgts(self, *input_pcgts : Optional[OcrdPage], page_id : Option return result if level == 'table': regions = page.get_TableRegion() - else: # region + else: # region regions = page.get_AllRegions(classes=['Text'], order='reading-order') if not regions: self.logger.warning('Page "%s" contains no text regions', page_id) @@ -78,29 +78,29 @@ def process_page_pcgts(self, *input_pcgts : Optional[OcrdPage], page_id : Option result.images.append(image) return result - def _process_segment(self, segment, segment_image, segment_coords, segment_id, page_id) -> Optional[OcrdPageResultImage]: + def _process_segment( + self, segment, segment_image, segment_coords, segment_id, page_id + ) -> Optional[OcrdPageResultImage]: if not segment_image.width or not segment_image.height: self.logger.warning("Skipping %s with zero size", segment_id) return None - angle0 = segment_coords['angle'] # deskewing (w.r.t. top image) already applied to segment_image + angle0 = segment_coords['angle'] # deskewing (w.r.t. top image) already applied to segment_image self.logger.info(f"About to deskew {segment_id}") - angle = deskew(segment_image, maxskew=self.parameter['maxskew']) # additional angle to be applied + angle = deskew(segment_image, maxskew=self.parameter['maxskew']) # additional angle to be applied # segment angle: PAGE orientation is defined clockwise, # whereas PIL/ndimage rotation is in mathematical direction: orientation = -(angle + angle0) - orientation = 180 - (180 - orientation) % 360 # map to [-179.999,180] - segment.set_orientation(orientation) # also removes all deskewed AlternativeImages + orientation = 180 - (180 - orientation) % 360 # map to [-179.999,180] + segment.set_orientation(orientation) # also removes all deskewed AlternativeImages self.logger.info(f"Found angle for {segment_id}: %.1f", angle) # delegate reflection, rotation and re-cropping to core: if isinstance(segment, PageType): segment_image, segment_coords, _ = self.workspace.image_from_page( - segment, page_id, - fill='background', transparency=True) + segment, page_id, fill='background', transparency=True) suffix = '.IMG-DESKEW' else: segment_image, segment_coords = self.workspace.image_from_segment( - segment, segment_image, segment_coords, - fill='background', transparency=True) + segment, segment_image, segment_coords, fill='background', transparency=True) suffix = segment.id + '.IMG-DESKEW' if not angle: # zero rotation does not change coordinates, diff --git a/ocrd_cis/ocropy/dewarp.py b/ocrd_cis/ocropy/dewarp.py index 302cf2e0..e06718c8 100644 --- a/ocrd_cis/ocropy/dewarp.py +++ b/ocrd_cis/ocropy/dewarp.py @@ -22,27 +22,27 @@ def dewarp(image, lnorm, check=True, max_neighbour=0.02, zoom=1.0): if not image.width or not image.height: raise InvalidLine('image size is zero') line = pil2array(image) - + if np.prod(line.shape) == 0: raise InvalidLine('image dimensions are zero') if np.amax(line) == np.amin(line): raise InvalidLine('image is blank') - - temp = np.amax(line)-line # inverse, zero-closed + + temp = np.amax(line) - line # inverse, zero-closed if check: report = check_line(temp, zoom=zoom) if report: raise InadequateLine(report) - - temp = temp * 1.0 / np.amax(temp) # normalized + + temp = temp * 1.0 / np.amax(temp) # normalized if check: report = lnorm.check(temp, max_ignore=max_neighbour) if report: raise InvalidLine(report) - lnorm.measure(temp) # find centerline + lnorm.measure(temp) # find centerline line = 
lnorm.dewarp(line, cval=np.amax(line)) - + return array2pil(line) # pad with white above and below (as a fallback for dewarp) @@ -72,7 +72,7 @@ def setup(self): # and extra params) 0.3)) - def process_page_pcgts(self, *input_pcgts : Optional[OcrdPage], page_id : Optional[str] = None) -> OcrdPageResult: + def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional[str] = None) -> OcrdPageResult: """Dewarp the lines of the workspace. Open and deserialise PAGE input file and its respective images, @@ -115,9 +115,8 @@ def process_page_pcgts(self, *input_pcgts : Optional[OcrdPage], page_id : Option self.logger.info(f"About to dewarp page '{page_id}' region '{region.id}' line '{line.id}'") try: - dew_image = dewarp(line_image, self.lnorm, check=True, - max_neighbour=self.parameter['max_neighbour'], - zoom=zoom) + dew_image = dewarp( + line_image, self.lnorm, check=True, max_neighbour=self.parameter['max_neighbour'], zoom=zoom) except InvalidLine as err: self.logger.error(f'Cannot dewarp line "{line.id}": {err}') continue diff --git a/ocrd_cis/ocropy/recognize.py b/ocrd_cis/ocropy/recognize.py index f0c4b520..02d29e7c 100644 --- a/ocrd_cis/ocropy/recognize.py +++ b/ocrd_cis/ocropy/recognize.py @@ -47,7 +47,7 @@ def recognize(image, pad, network, check=True): # getting confidence result = lstm.translate_back(network.outputs, pos=1) - scale = len(raw_line.T)*1.0/(len(network.outputs)-2*pad) + scale = len(raw_line.T) * 1.0 / (len(network.outputs) - 2 * pad) clist = [] rlist = [] @@ -57,7 +57,7 @@ def recognize(image, pad, network, check=True): if c != 0: confid = network.outputs[r, c] c = network.l2s([c]) - r = (r-pad)*scale + r = (r - pad) * scale confidlist.append(confid) clist.append(c) @@ -88,7 +88,7 @@ def setup(self): def get_model(self): """Search for the model file. First checks if parameter['model'] can - be resolved with OcrdResourceManager to a valid readeable file and + be resolved with OcrdResourceManager to a valid readable file and returns it. 
If not, it checks if the model can be found in the dirname(__file__)/models/ directory.""" canread = lambda p: isfile(p) and access(p, R_OK) @@ -202,8 +202,8 @@ def process_lines(self, textlines, maxlevel, region_image, region_xywh): words = [x.strip() for x in linepred.split(' ') if x.strip()] - word_r_list = [[0]] # r-positions of every glyph in every word - word_conf_list = [[]] # confidences of every glyph in every word + word_r_list = [[0]] # r-positions of every glyph in every word + word_conf_list = [[]] # confidences of every glyph in every word if words != []: w_no = 0 found_char = False @@ -215,7 +215,7 @@ def process_lines(self, textlines, maxlevel, region_image, region_xywh): if c == ' ' and found_char: if i == 0: word_r_list[0][0] = rlist[i] - elif i+1 <= len(clist)-1 and clist[i+1] != ' ': + elif i + 1 <= len(clist) - 1 and clist[i + 1] != ' ': word_conf_list.append([]) word_r_list.append([rlist[i]]) w_no += 1 @@ -224,9 +224,9 @@ def process_lines(self, textlines, maxlevel, region_image, region_xywh): word_r_list = [[0, line_image.width]] # conf for each word - wordsconf = [(min(x)+max(x))/2 for x in word_conf_list] + wordsconf = [(min(x) + max(x)) / 2 for x in word_conf_list] # conf for the line - line_conf = (min(wordsconf) + max(wordsconf))/2 + line_conf = (min(wordsconf) + max(wordsconf)) / 2 # line text line.add_TextEquiv(TextEquivType(Unicode=linepred, conf=line_conf)) @@ -235,32 +235,27 @@ def process_lines(self, textlines, maxlevel, region_image, region_xywh): word_points = points_from_polygon( coordinates_for_segment( np.array(polygon_from_bbox( - word_r_list[word_no][0] / scale, - 0, - word_r_list[word_no][-1] / scale, - 0 + line_image.height)), + word_r_list[word_no][0] / scale,0, + word_r_list[word_no][-1] / scale, 0 + line_image.height)), line_image, line_coords)) word_id = '%s_word%04d' % (line.id, word_no) word = WordType(id=word_id, Coords=CoordsType(word_points)) line.add_Word(word) - word.add_TextEquiv(TextEquivType( - Unicode=word_str, conf=wordsconf[word_no])) + word.add_TextEquiv(TextEquivType(Unicode=word_str, conf=wordsconf[word_no])) if maxlevel == 'glyph': for glyph_no, glyph_str in enumerate(word_str): glyph_points = points_from_polygon( coordinates_for_segment( np.array(polygon_from_bbox( - word_r_list[word_no][glyph_no] / scale, - 0, - word_r_list[word_no][glyph_no+1] / scale, - 0 + line_image.height)), + word_r_list[word_no][glyph_no] / scale, 0, + word_r_list[word_no][glyph_no + 1] / scale, 0 + line_image.height)), line_image, line_coords)) glyph_id = '%s_glyph%04d' % (word.id, glyph_no) glyph = GlyphType(id=glyph_id, Coords=CoordsType(glyph_points)) word.add_Glyph(glyph) - glyph.add_TextEquiv(TextEquivType( - Unicode=glyph_str, conf=word_conf_list[word_no][glyph_no])) + glyph.add_TextEquiv( + TextEquivType(Unicode=glyph_str, conf=word_conf_list[word_no][glyph_no])) return edits, lengs diff --git a/ocrd_cis/ocropy/resegment.py b/ocrd_cis/ocropy/resegment.py index 48bb0d40..5a8c7e96 100644 --- a/ocrd_cis/ocropy/resegment.py +++ b/ocrd_cis/ocropy/resegment.py @@ -52,7 +52,7 @@ def executable(self): def setup(self): self.logger = getLogger('processor.OcropyResegment') - def process_page_pcgts(self, *input_pcgts : Optional[OcrdPage], page_id : Optional[str] = None) -> OcrdPageResult: + def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional[str] = None) -> OcrdPageResult: """Resegment lines of the workspace. 
Open and deserialise PAGE input file and its respective images, diff --git a/ocrd_cis/ocropy/segment.py b/ocrd_cis/ocropy/segment.py index 75be2a11..6dc75056 100644 --- a/ocrd_cis/ocropy/segment.py +++ b/ocrd_cis/ocropy/segment.py @@ -252,7 +252,7 @@ def executable(self): def setup(self): self.logger = getLogger('processor.OcropySegment') - def process_page_pcgts(self, *input_pcgts : Optional[OcrdPage], page_id : Optional[str] = None) -> OcrdPageResult: + def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional[str] = None) -> OcrdPageResult: """Segment pages into regions+lines, tables into cells+lines, or regions into lines. Open and deserialise PAGE input file and its respective images, From c5498a0e8d8bc9a8e3fe3bf0848df9b135bae69c Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Fri, 16 Aug 2024 16:55:44 +0200 Subject: [PATCH 61/97] spacing: dewarp --- ocrd_cis/ocropy/dewarp.py | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/ocrd_cis/ocropy/dewarp.py b/ocrd_cis/ocropy/dewarp.py index e06718c8..89901efd 100644 --- a/ocrd_cis/ocropy/dewarp.py +++ b/ocrd_cis/ocropy/dewarp.py @@ -95,24 +95,19 @@ def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional result = OcrdPageResult(pcgts) page = pcgts.get_Page() - page_image, page_xywh, page_image_info = self.workspace.image_from_page( - page, page_id) + page_image, page_xywh, page_image_info = self.workspace.image_from_page(page, page_id) zoom = determine_zoom(self.logger, page_id, self.parameter['dpi'], page_image_info) regions = page.get_AllRegions(classes=['Text'], order='reading-order') if not regions: self.logger.warning(f'Page "{page_id}" contains no text regions') for region in regions: - region_image, region_xywh = self.workspace.image_from_segment( - region, page_image, page_xywh) - + region_image, region_xywh = self.workspace.image_from_segment(region, page_image, page_xywh) lines = region.get_TextLine() if not lines: self.logger.warning(f'Region {region.id} contains no text lines') for line in lines: - line_image, line_xywh = self.workspace.image_from_segment( - line, region_image, region_xywh) - + line_image, line_xywh = self.workspace.image_from_segment(line, region_image, region_xywh) self.logger.info(f"About to dewarp page '{page_id}' region '{region.id}' line '{line.id}'") try: dew_image = dewarp( From 31e124577faad71f2bb039a6b094900b6cdf9df1 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Fri, 16 Aug 2024 16:58:52 +0200 Subject: [PATCH 62/97] fix: dewarp return --- ocrd_cis/ocropy/dewarp.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/ocrd_cis/ocropy/dewarp.py b/ocrd_cis/ocropy/dewarp.py index 89901efd..17d0b4ce 100644 --- a/ocrd_cis/ocropy/dewarp.py +++ b/ocrd_cis/ocropy/dewarp.py @@ -123,5 +123,7 @@ def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional dew_image = padvert(line_image, self.parameter['range']) # update PAGE (reference the image file): alt_image = AlternativeImageType(comments=line_xywh['features'] + ',dewarped') - line.add_AlternativeImage(alternative_image) - return OcrdPageResultImage(dew_image, region.id + '_' + line.id + '.IMG-DEWARP', alt_image) + line.add_AlternativeImage(alt_image) + suffix = f"{region.id}_{line.id}.IMG-DEWARP" + result.images.append(OcrdPageResultImage(dew_image, suffix, alt_image)) + return result From f86c99391e987d4918b6d626dbf1b2f990d7712b Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Fri, 16 Aug 2024 17:21:14 +0200 Subject: [PATCH 63/97] 
improve str speed: precompute element_name_id --- ocrd_cis/ocropy/segment.py | 92 +++++++++++++++++--------------------- 1 file changed, 42 insertions(+), 50 deletions(-) diff --git a/ocrd_cis/ocropy/segment.py b/ocrd_cis/ocropy/segment.py index 6dc75056..9daf59de 100644 --- a/ocrd_cis/ocropy/segment.py +++ b/ocrd_cis/ocropy/segment.py @@ -388,13 +388,12 @@ def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional rogroup = OrderedGroupType(id="reading-order") ro.set_OrderedGroup(rogroup) if (not rogroup.get_RegionRefIndexed() and - not rogroup.get_OrderedGroupIndexed() and - not rogroup.get_UnorderedGroupIndexed()): - # schema forbids empty OrderedGroup + not rogroup.get_OrderedGroupIndexed() and + not rogroup.get_UnorderedGroupIndexed()): + # schema forbids empty OrderedGroup ro.set_OrderedGroup(None) # go get TextRegions with TextLines (and SeparatorRegions): - image = self._process_element(page, ignore, page_image, page_coords, - zoom=zoom, rogroup=rogroup) + image = self._process_element(page, ignore, page_image, page_coords, zoom=zoom, rogroup=rogroup) if image: result.images.append(image) return result @@ -450,11 +449,11 @@ def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional roelem = page_subgroup_in_reading_order(self.logger, roelem) reading_order[region.id] = roelem # go get TextRegions with TextLines (and SeparatorRegions) - image = self._process_element(region, subignore, region_image, region_coords, - zoom=zoom, rogroup=roelem) + image = self._process_element( + region, subignore, region_image, region_coords, zoom=zoom, rogroup=roelem) if image: result.images.append(image) - else: # 'region' + else: # 'region' regions = list(page.get_TextRegion()) # besides top-level text regions, line-segment any table cells, # and for tables without any cells, add a pseudo-cell @@ -463,10 +462,8 @@ def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional if subregions: regions.extend(subregions) else: - subregion = TextRegionType(id=region.id + '_text', - Coords=region.get_Coords(), - # as if generated from parser: - parent_object_=region) + subregion = TextRegionType( + id=f'{region.id}_text', Coords=region.get_Coords(), parent_object_=region) region.add_TextRegion(subregion) regions.append(subregion) if not regions: @@ -490,7 +487,6 @@ def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional image = self._process_element(region, ignore, region_image, region_coords, zoom=zoom) if image: result.images.append(image) - return result def _process_element(self, element, ignore, image, coords, zoom=1.0, rogroup=None) -> Optional[OcrdPageResultImage]: @@ -535,7 +531,7 @@ def _process_element(self, element, ignore, image, coords, zoom=1.0, rogroup=Non sp_col = segment_polygon[:, 0] if isinstance(segment, SeparatorRegionType): sep_bin[draw.polygon(sp_row, sp_col, sep_bin.shape)] = True - ignore_labels[draw.polygon(sp_row, sp_col, ignore_labels.shape)] = i+1 # mapped back for RO + ignore_labels[draw.polygon(sp_row, sp_col, ignore_labels.shape)] = i + 1 # mapped back for RO if isinstance(element, PageType): element_name = 'page' fullpage = True @@ -555,6 +551,7 @@ def _process_element(self, element, ignore, image, coords, zoom=1.0, rogroup=Non report = check_region(element_bin, zoom) suffix = element.id + '.IMG-CLIP' self.logger.info(f'Computing line segmentation for {element_name} "{element.id}"') + element_name_id = f'{element_name} "{element.id}"' # TODO: we should downscale if DPI is large enough to 
save time try: if report: @@ -564,7 +561,7 @@ def _process_element(self, element, ignore, image, coords, zoom=1.0, rogroup=Non # but keep them for h/v-line detection (in fullpage mode): element_bin, seps=(sep_bin + ignore_labels) > 0, zoom=zoom, fullpage=fullpage, - spread_dist=round(self.parameter['spread'] / zoom * 300 / 72), # in pt + spread_dist=round(self.parameter['spread'] / zoom * 300 / 72), # in pt # these are ignored when not in fullpage mode: maxcolseps=self.parameter['maxcolseps'], maxseps=self.parameter['maxseps'], @@ -576,10 +573,10 @@ def _process_element(self, element, ignore, image, coords, zoom=1.0, rogroup=Non # as a fallback, add a single text line comprising the whole region: element.add_TextLine(TextLineType(id=element.id + "_line", Coords=element.get_Coords())) else: - self.logger.error(f'Cannot line-segment {element_name} "{element.id}": {err}') + self.logger.error(f'Cannot line-segment {element_name_id}: {err}') return None - self.logger.info(f'Found {len(np.unique(line_labels)) - 1} text lines for {element_name} "{element.id}"') + self.logger.info(f'Found {len(np.unique(line_labels)) - 1} text lines for {element_name_id}') # post-process line labels if isinstance(element, (PageType, TableRegionType)): # aggregate text lines to text regions @@ -594,18 +591,18 @@ def _process_element(self, element, ignore, image, coords, zoom=1.0, rogroup=Non region_labels = lines2regions( element_bin, line_labels, rlabels=ignore_labels, - sepmask=np.maximum(sepmask, colseps), # add bg + sepmask=np.maximum(sepmask, colseps), # add bg # decide horizontal vs vertical cut when gaps of similar size prefer_vertical=not isinstance(element, TableRegionType), gap_height=self.parameter['gap_height'], gap_width=self.parameter['gap_width'], scale=scale, zoom=zoom) self.logger.info( - f'Found {len(np.unique(region_labels)) - 1} text regions for {element_name} "{element.id}"') + f'Found {len(np.unique(region_labels)) - 1} text regions for {element_name_id}') except Exception as err: - self.logger.error(f'Cannot region-segment {element_name} "{element.id}": {err}') + self.logger.error(f'Cannot region-segment {element_name_id}: {err}') region_labels = np.where(line_labels > len(ignore), 1 + len(ignore), line_labels) - + # prepare reading order group index if rogroup: if isinstance(rogroup, (OrderedGroupType, OrderedGroupIndexedType)): @@ -622,7 +619,7 @@ def _process_element(self, element, ignore, image, coords, zoom=1.0, rogroup=Non region_no = 0 for region_label in np.unique(region_labels): if not region_label: - continue # no bg + continue # no bg region_mask = region_labels == region_label region_line_labels = line_labels * region_mask region_line_labels0 = np.setdiff1d(region_line_labels, [0]) @@ -644,18 +641,17 @@ def _process_element(self, element, ignore, image, coords, zoom=1.0, rogroup=Non order[np.setdiff1d(region_line_labels0, element_bin * region_line_labels)] = 0 region_line_labels = order[region_line_labels] # avoid horizontal gaps - region_line_labels = hmerge_line_seeds(element_bin, region_line_labels, scale, - seps=np.maximum(sepmask, colseps)) + region_line_labels = hmerge_line_seeds( + element_bin, region_line_labels, scale, seps=np.maximum(sepmask, colseps)) region_mask |= region_line_labels > 0 # find contours for region (can be non-contiguous) - regions, _ = masks2polygons(self.logger, region_mask * region_label, None, element_bin, - name=f'{element_name} "{element.id}"', - min_area=6000 / zoom / zoom, - simplify=ignore_labels * ~(sep_bin)) + regions, _ = masks2polygons( + 
self.logger, region_mask * region_label, None, element_bin, name=element_name_id, + min_area=6000 / zoom / zoom, simplify=ignore_labels * ~(sep_bin)) # find contours for lines (can be non-contiguous) - lines, _ = masks2polygons(self.logger, region_line_labels, baselines, element_bin, - name=f'region "{element.id}"', - min_area=640 / zoom / zoom) + lines, _ = masks2polygons( + self.logger, region_line_labels, baselines, element_bin, name=f'region "{element.id}"', + min_area=640 / zoom / zoom) # create new lines in new regions (allocating by intersection) line_polys = [Polygon(polygon) for _, polygon, _ in lines] for _, region_polygon, _ in regions: @@ -674,7 +670,7 @@ def _process_element(self, element, ignore, image, coords, zoom=1.0, rogroup=Non # find out which line (contours) belong to which region (contours) line_no = 0 for i, line_poly in enumerate(line_polys): - if not region_poly.intersects(line_poly): # .contains + if not region_poly.intersects(line_poly): # .contains continue line_label, line_polygon, line_baseline = lines[i] # convert back to absolute (page) coordinates: @@ -696,16 +692,14 @@ def _process_element(self, element, ignore, image, coords, zoom=1.0, rogroup=Non # if the region has received text lines, keep it if region.get_TextLine(): element.add_TextRegion(region) - self.logger.info(f'Added region "{region_id}" with {line_no} lines ' - f'for {element_name} "{element.id}"') + self.logger.info(f'Added region "{region_id}" with {line_no} lines for {element_name_id}') if rogroup: index = page_add_to_reading_order(rogroup, region.id, index) # add additional image/non-text regions from compute_segmentation # (e.g. drop-capitals or images) ... - self.logger.info(f'Found {images.max()} large image regions for {element_name} "{element.id}"') + self.logger.info(f'Found {images.max()} large image regions for {element_name_id}') # find contours around region labels (can be non-contiguous): - image_polygons, _ = masks2polygons(self.logger, images, None, element_bin, - name=f'{element_name} "{element.id}"') + image_polygons, _ = masks2polygons(self.logger, images, None, element_bin, name=element_name_id) for image_label, polygon, _ in image_polygons: # convert back to absolute (page) coordinates: region_polygon = coordinates_for_segment(polygon, image, coords) @@ -719,11 +713,10 @@ def _process_element(self, element, ignore, image, coords, zoom=1.0, rogroup=Non element.add_ImageRegion(ImageRegionType( id=region_id, Coords=CoordsType(points=points_from_polygon(region_polygon)))) # split detected separator labels into separator regions: - self.logger.info(f'Found {seplines.max()} separators for {element_name} "{element.id}"') + self.logger.info(f'Found {seplines.max()} separators for {element_name_id}') # find contours around region labels (can be non-contiguous): - sep_polygons, _ = masks2polygons(self.logger, seplines, None, element_bin, - name=f'{element_name} "{element.id}"', - open_holes=True, reorder=False) + sep_polygons, _ = masks2polygons( + self.logger, seplines, None, element_bin, name=element_name_id, open_holes=True, reorder=False) for sep_label, polygon, _ in sep_polygons: # convert back to absolute (page) coordinates: region_polygon = coordinates_for_segment(polygon, image, coords) @@ -737,7 +730,7 @@ def _process_element(self, element, ignore, image, coords, zoom=1.0, rogroup=Non element.add_SeparatorRegion(SeparatorRegionType( id=region_id, Coords=CoordsType(points=points_from_polygon(region_polygon)))) # annotate a text/image-separated image - element_array[sepmask] 
= np.amax(element_array) # clip to white/bg image_clipped = array2pil(element_array) image_ref = AlternativeImageType(comments=coords['features'] + ',clipped') element.add_AlternativeImage(image_ref) @@ -746,15 +739,14 @@ def _process_element(self, element, ignore, image, coords, zoom=1.0, rogroup=Non # get mask from region polygon: region_polygon = coordinates_of_segment(element, image, coords) region_mask = np.zeros_like(element_bin, bool) - region_mask[draw.polygon(region_polygon[:, 1], - region_polygon[:, 0], - region_mask.shape)] = True + region_mask[draw.polygon( + region_polygon[:, 1], region_polygon[:, 0], region_mask.shape)] = True # ensure the new line labels do not extrude from the region: line_labels = line_labels * region_mask # find contours around labels (can be non-contiguous): - line_polygons, _ = masks2polygons(self.logger, line_labels, baselines, element_bin, - name=f'region "{element.id}"', - min_area=640 / zoom / zoom) + line_polygons, _ = masks2polygons( + self.logger, line_labels, baselines, element_bin, + name=f'region "{element.id}"', min_area=640 / zoom / zoom) line_no = 0 for line_label, polygon, baseline in line_polygons: # convert back to absolute (page) coordinates: @@ -772,9 +764,9 @@ def _process_element(self, element, ignore, image, coords, zoom=1.0, rogroup=Non line.set_Baseline(BaselineType(points=points_from_polygon(line_baseline))) element.add_TextLine(line) if not sep_bin.any(): - return None # no derived image + return None # no derived image # annotate a text/image-separated image - element_array[sep_bin] = np.amax(element_array) # clip to white/bg + element_array[sep_bin] = np.amax(element_array) # clip to white/bg image_clipped = array2pil(element_array) image_ref = AlternativeImageType(comments=coords['features'] + ',clipped') element.add_AlternativeImage(image_ref) From b8e3ad6207a832fad65bccf5ea4756c004bb1f96 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Fri, 16 Aug 2024 22:26:33 +0200 Subject: [PATCH 64/97] fix: clip suffix --- ocrd_cis/ocropy/clip.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ocrd_cis/ocropy/clip.py b/ocrd_cis/ocropy/clip.py index f5390dde..b81c731c 100644 --- a/ocrd_cis/ocropy/clip.py +++ b/ocrd_cis/ocropy/clip.py @@ -213,6 +213,7 @@ def process_segment( # (and also clipping with background colour): segment_image = crop_image(segment_image, box=segment_bbox) # update PAGE (reference the image file): + suffix = f'{segment.id}.IMG-CLIP' alternative_image = AlternativeImageType(comments=features) segment.add_AlternativeImage(alternative_image) - return OcrdPageResultImage(segment_image, '.IMG-CLIP', alternative_image) + return OcrdPageResultImage(segment_image, suffix, alternative_image) From 02724f2db8c1d29f739282a42330c1a9b14e27d2 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Fri, 16 Aug 2024 22:30:11 +0200 Subject: [PATCH 65/97] fix: denoise return --- ocrd_cis/ocropy/denoise.py | 1 + 1 file changed, 1 insertion(+) diff --git a/ocrd_cis/ocropy/denoise.py b/ocrd_cis/ocropy/denoise.py index 0dd14ef8..4ae883fd 100644 --- a/ocrd_cis/ocropy/denoise.py +++ b/ocrd_cis/ocropy/denoise.py @@ -76,6 +76,7 @@ def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional image = self.process_segment(line, line_image, line_xywh, zoom) if image: result.images.append(image) + return result def process_segment(self, segment, segment_image, segment_xywh, zoom) -> Optional[OcrdPageResultImage]: if not segment_image.width
or not segment_image.height: From aac6fe0989ccb483626af6b238e98162b780aac5 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Sat, 17 Aug 2024 00:50:08 +0200 Subject: [PATCH 66/97] try to fix: ocropy denoise --- ocrd_cis/ocropy/denoise.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/ocrd_cis/ocropy/denoise.py b/ocrd_cis/ocropy/denoise.py index 4ae883fd..fd9812f8 100644 --- a/ocrd_cis/ocropy/denoise.py +++ b/ocrd_cis/ocropy/denoise.py @@ -51,7 +51,7 @@ def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional zoom = determine_zoom(self.logger, page_id, self.parameter['dpi'], page_image_info) if level == 'page': - image = self.process_segment(page, page_image, page_xywh, zoom) + image = self.process_segment(page, page_image, page_xywh, zoom, page_id) if image: result.images.append(image) else: @@ -63,7 +63,7 @@ def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional region, page_image, page_xywh, feature_selector='binarized' if level == 'region' else '') if level == 'region': - image = self.process_segment(region, region_image, region_xywh, zoom) + image = self.process_segment(region, region_image, region_xywh, zoom, page_id) if image: result.images.append(image) continue @@ -73,12 +73,12 @@ def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional for line in lines: line_image, line_xywh = self.workspace.image_from_segment( line, region_image, region_xywh, feature_selector='binarized') - image = self.process_segment(line, line_image, line_xywh, zoom) + image = self.process_segment(line, line_image, line_xywh, zoom, page_id) if image: result.images.append(image) return result - def process_segment(self, segment, segment_image, segment_xywh, zoom) -> Optional[OcrdPageResultImage]: + def process_segment(self, segment, segment_image, segment_xywh, zoom, page_id) -> Optional[OcrdPageResultImage]: if not segment_image.width or not segment_image.height: self.logger.warning(f"Skipping '{segment.id}' with zero size") return None @@ -87,5 +87,6 @@ def process_segment(self, segment, segment_image, segment_xywh, zoom) -> Optiona segment_image, maxsize=self.parameter['noise_maxsize'] / zoom * 300 / 72) # in pt # update PAGE (reference the image file): alt_image = AlternativeImageType(comments=segment_xywh['features'] + ',despeckled') + suffix = f"{page_id}_{segment.id}.IMG-DESPECK" segment.add_AlternativeImage(alt_image) - return OcrdPageResultImage(bin_image, segment.id + '.IMG-DESPECK', alt_image) + return OcrdPageResultImage(bin_image, suffix, alt_image) From 5548d0e6043e32d7409fef9817775670b2d1b96f Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Sat, 17 Aug 2024 00:58:12 +0200 Subject: [PATCH 67/97] fix: ocropy denoise --- ocrd_cis/ocropy/denoise.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/ocrd_cis/ocropy/denoise.py b/ocrd_cis/ocropy/denoise.py index fd9812f8..eb3e7d23 100644 --- a/ocrd_cis/ocropy/denoise.py +++ b/ocrd_cis/ocropy/denoise.py @@ -63,7 +63,8 @@ def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional region, page_image, page_xywh, feature_selector='binarized' if level == 'region' else '') if level == 'region': - image = self.process_segment(region, region_image, region_xywh, zoom, page_id) + file_id = f"{page_id}_{region.id}" + image = self.process_segment(region, region_image, region_xywh, zoom, file_id) if image: result.images.append(image) continue @@ -73,12 +74,13 @@ def process_page_pcgts(self, *input_pcgts: 
Optional[OcrdPage], page_id: Optional for line in lines: line_image, line_xywh = self.workspace.image_from_segment( line, region_image, region_xywh, feature_selector='binarized') - image = self.process_segment(line, line_image, line_xywh, zoom, page_id) + file_id = f"{page_id}_{region.id}_{line.id}" + image = self.process_segment(line, line_image, line_xywh, zoom, file_id) if image: result.images.append(image) return result - def process_segment(self, segment, segment_image, segment_xywh, zoom, page_id) -> Optional[OcrdPageResultImage]: + def process_segment(self, segment, segment_image, segment_xywh, zoom, file_id) -> Optional[OcrdPageResultImage]: if not segment_image.width or not segment_image.height: self.logger.warning(f"Skipping '{segment.id}' with zero size") return None @@ -87,6 +89,6 @@ def process_segment(self, segment, segment_image, segment_xywh, zoom, page_id) - segment_image, maxsize=self.parameter['noise_maxsize'] / zoom * 300 / 72) # in pt # update PAGE (reference the image file): alt_image = AlternativeImageType(comments=segment_xywh['features'] + ',despeckled') - suffix = f"{page_id}_{segment.id}.IMG-DESPECK" + suffix = f"{file_id}.IMG-DESPECK" segment.add_AlternativeImage(alt_image) return OcrdPageResultImage(bin_image, suffix, alt_image) From c9f0f56787f2d34d718bc504ee3d07f7501dff75 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Sat, 17 Aug 2024 01:26:54 +0200 Subject: [PATCH 68/97] fix: resegment --- ocrd_cis/ocropy/resegment.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ocrd_cis/ocropy/resegment.py b/ocrd_cis/ocropy/resegment.py index 5a8c7e96..c1809569 100644 --- a/ocrd_cis/ocropy/resegment.py +++ b/ocrd_cis/ocropy/resegment.py @@ -427,7 +427,7 @@ def spread_dist( continue covers = np.count_nonzero(new_label) / count if covers < threshold / 3: - logger.debug(f"new line for '%s' only covers %.1f%% bg", covers * 100) + logger.debug(f"new line for '{line.id}' only covers %.1f%% bg", covers * 100) continue count = np.count_nonzero(old_label * binarized) if not count: From fff909746f1347fc9336f8413fd311ac4e3ce206 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Sat, 17 Aug 2024 01:27:05 +0200 Subject: [PATCH 69/97] optimize segment --- ocrd_cis/ocropy/segment.py | 48 ++++++++++++++++---------------------- 1 file changed, 20 insertions(+), 28 deletions(-) diff --git a/ocrd_cis/ocropy/segment.py b/ocrd_cis/ocropy/segment.py index 9daf59de..b363cbd2 100644 --- a/ocrd_cis/ocropy/segment.py +++ b/ocrd_cis/ocropy/segment.py @@ -544,14 +544,14 @@ def _process_element(self, element, ignore, image, coords, zoom=1.0, rogroup=Non element_name = 'table' fullpage = True report = check_region(element_bin, zoom) - suffix = element.id + '.IMG-CLIP' + suffix = f"{element.id}.IMG-CLIP" else: element_name = 'region' fullpage = False report = check_region(element_bin, zoom) - suffix = element.id + '.IMG-CLIP' - self.logger.info(f'Computing line segmentation for {element_name} "{element.id}"') + suffix = f"{element.id}.IMG-CLIP" element_name_id = f'{element_name} "{element.id}"' + self.logger.info(f'Computing line segmentation for {element_name_id}') # TODO: we should downscale if DPI is large enough to save time try: if report: @@ -571,7 +571,7 @@ def _process_element(self, element, ignore, image, coords, zoom=1.0, rogroup=Non if isinstance(element, TextRegionType): self.logger.error(f'Cannot line-segment region "{element.id}": {err}') # as a fallback, add a single text line comprising the whole region: - element.add_TextLine(TextLineType(id=element.id + 
"_line", Coords=element.get_Coords())) + element.add_TextLine(TextLineType(id=f"{element.id}_line", Coords=element.get_Coords())) else: self.logger.error(f'Cannot line-segment {element_name_id}: {err}') return None @@ -664,7 +664,7 @@ def _process_element(self, element, ignore, image, coords, zoom=1.0, rogroup=Non continue # annotate result: region_no += 1 - region_id = element.id + "_region%04d" % region_no + region_id = f"{element.id}_region%04d" % region_no self.logger.debug(f'Region label {region_label} becomes ID "{region_id}"') region = TextRegionType(id=region_id, Coords=CoordsType(points=points_from_polygon(region_polygon))) # find out which line (contours) belong to which region (contours) @@ -682,7 +682,7 @@ def _process_element(self, element, ignore, image, coords, zoom=1.0, rogroup=Non continue # annotate result: line_no += 1 - line_id = region_id + "_line%04d" % line_no + line_id = f"{region_id}_line%04d" % line_no self.logger.debug(f'Line label {line_label} becomes ID "{line_id}"') line = TextLineType(id=line_id, Coords=CoordsType(points=points_from_polygon(line_polygon))) if line_baseline: @@ -709,7 +709,7 @@ def _process_element(self, element, ignore, image, coords, zoom=1.0, rogroup=Non continue region_no += 1 # annotate result: - region_id = element.id + "_image%04d" % region_no + region_id = f"{element.id}_image%04d" % region_no element.add_ImageRegion(ImageRegionType( id=region_id, Coords=CoordsType(points=points_from_polygon(region_polygon)))) # split detected separator labels into separator regions: @@ -726,7 +726,7 @@ def _process_element(self, element, ignore, image, coords, zoom=1.0, rogroup=Non continue # annotate result: region_no += 1 - region_id = element.id + "_sep%04d" % region_no + region_id = f"{element.id}_sep%04d" % region_no element.add_SeparatorRegion(SeparatorRegionType( id=region_id, Coords=CoordsType(points=points_from_polygon(region_polygon)))) # annotate a text/image-separated image @@ -739,8 +739,7 @@ def _process_element(self, element, ignore, image, coords, zoom=1.0, rogroup=Non # get mask from region polygon: region_polygon = coordinates_of_segment(element, image, coords) region_mask = np.zeros_like(element_bin, bool) - region_mask[draw.polygon( - region_polygon[:, 1], region_polygon[:, 0], region_mask.shape)] = True + region_mask[draw.polygon(region_polygon[:, 1], region_polygon[:, 0], region_mask.shape)] = True # ensure the new line labels do not extrude from the region: line_labels = line_labels * region_mask # find contours around labels (can be non-contiguous): @@ -757,7 +756,7 @@ def _process_element(self, element, ignore, image, coords, zoom=1.0, rogroup=Non continue # annotate result: line_no += 1 - line_id = element.id + "_line%04d" % line_no + line_id = f"{element.id}_line%04d" % line_no line = TextLineType(id=line_id, Coords=CoordsType(points=points_from_polygon(line_polygon))) if baseline: line_baseline = coordinates_for_segment(baseline, image, coords) @@ -868,11 +867,12 @@ def join_polygons(polygons, loc='', scale=20): dists[j, i] = dist dists = minimum_spanning_tree(dists, overwrite=True) # add bridge polygons (where necessary) + max_dist = max(1.0, scale / 5) for prevp, nextp in zip(*dists.nonzero()): prevp = polygons[prevp] nextp = polygons[nextp] nearest = nearest_points(prevp, nextp) - bridgep = LineString(nearest).buffer(max(1, scale/5), resolution=1) + bridgep = LineString(nearest).buffer(max_dist, resolution=1) polygons.append(bridgep) jointp = unary_union(polygons) assert jointp.geom_type == 'Polygon', jointp.wkt @@ 
-1017,11 +1017,9 @@ def page_add_to_reading_order(rogroup, region_id, index=None): """ if rogroup: if index is None: - rogroup.add_RegionRef(RegionRefType( - regionRef=region_id)) + rogroup.add_RegionRef(RegionRefType(regionRef=region_id)) else: - rogroup.add_RegionRefIndexed(RegionRefIndexedType( - regionRef=region_id, index=index)) + rogroup.add_RegionRefIndexed(RegionRefIndexedType(regionRef=region_id, index=index)) index += 1 return index @@ -1045,36 +1043,30 @@ def page_subgroup_in_reading_order(logger: Logger, roelem): if not roelem.parent_object_: logger.error('Cannot subgroup from orphan ReadingOrder element') return roelem - if isinstance(roelem, (OrderedGroupType,OrderedGroupIndexedType)) and not ( + if isinstance(roelem, (OrderedGroupType, OrderedGroupIndexedType)) and not ( roelem.get_OrderedGroupIndexed() or roelem.get_UnorderedGroupIndexed() or roelem.get_RegionRefIndexed()): # is already a group and still empty return roelem - if isinstance(roelem, (OrderedGroupType, - UnorderedGroupType, - RegionRefType)): + if isinstance(roelem, (OrderedGroupType, UnorderedGroupType, RegionRefType)): getattr(roelem.parent_object_, { OrderedGroupType: 'get_OrderedGroup', UnorderedGroupType: 'get_UnorderedGroup', RegionRefType: 'get_RegionRef', }.get(roelem.__class__))().remove(roelem) - roelem2 = OrderedGroupType(id=roelem.regionRef + '_group', - regionRef=roelem.regionRef) + roelem2 = OrderedGroupType(id=f"{roelem.regionRef}_group", regionRef=roelem.regionRef) roelem.parent_object_.add_OrderedGroup(roelem2) roelem2.parent_object_ = roelem.parent_object_ return roelem2 - if isinstance(roelem, (OrderedGroupIndexedType, - UnorderedGroupIndexedType, - RegionRefIndexedType)): + if isinstance(roelem, (OrderedGroupIndexedType, UnorderedGroupIndexedType, RegionRefIndexedType)): getattr(roelem.parent_object_, { OrderedGroupIndexedType: 'get_OrderedGroupIndexed', UnorderedGroupIndexedType: 'get_UnorderedGroupIndexed', RegionRefIndexedType: 'get_RegionRefIndexed' }.get(roelem.__class__))().remove(roelem) - roelem2 = OrderedGroupIndexedType(id=roelem.regionRef + '_group', - index=roelem.index, - regionRef=roelem.regionRef) + roelem2 = OrderedGroupIndexedType( + id=f"{roelem.regionRef}_group", index=roelem.index, regionRef=roelem.regionRef) roelem.parent_object_.add_OrderedGroupIndexed(roelem2) roelem2.parent_object_ = roelem.parent_object_ return roelem2 From 8b9283232a57b7c49a78420b32c915b32992ee9a Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Sat, 17 Aug 2024 02:02:28 +0200 Subject: [PATCH 70/97] optimize ocropy common --- ocrd_cis/ocropy/common.py | 186 +++++++++++++++++++------------------- 1 file changed, 93 insertions(+), 93 deletions(-) diff --git a/ocrd_cis/ocropy/common.py b/ocrd_cis/ocropy/common.py index c5b56ed0..a5806517 100644 --- a/ocrd_cis/ocropy/common.py +++ b/ocrd_cis/ocropy/common.py @@ -184,16 +184,19 @@ def estimate_thresholds(flat, bignore=0.1, escale=1.0, lo=5, hi=90): d0, d1 = flat.shape o0, o1 = int(bignore * d0), int(bignore * d1) est = flat[o0:d0 - o0, o1:d1 - o1] + if escale > 0: # by default, we use only regions that contain # significant variance; this makes the percentile # based low and high estimates more reliable e = escale - v = est - filters.gaussian_filter(est, e * 20.0) - v = filters.gaussian_filter(v ** 2, e * 20.0) ** 0.5 + e_20_0 = e * 20.0 + e_50 = int(e * 50) + v = est - filters.gaussian_filter(est, e_20_0) + v = filters.gaussian_filter(v ** 2, e_20_0) ** 0.5 v = (v > 0.3 * np.amax(v)) - v = morphology.binary_dilation(v, structure=np.ones((int(e * 50), 
1))) - v = morphology.binary_dilation(v, structure=np.ones((1, int(e * 50)))) + v = morphology.binary_dilation(v, structure=np.ones((e_50, 1))) + v = morphology.binary_dilation(v, structure=np.ones((1, e_50))) est = est[v] lo = stats.scoreatpercentile(est.ravel(), lo) hi = stats.scoreatpercentile(est.ravel(), hi) @@ -310,24 +313,24 @@ def check_line(binary, zoom=1.0): Returns an error report, or None if valid. """ - if np.prod(binary.shape)==0: return "image dimensions are zero" - if len(binary.shape)==3: return "image is not monochrome %s"%(binary.shape,) + if np.prod(binary.shape) == 0: return "image dimensions are zero" + if len(binary.shape) == 3: return f"image is not monochrome {binary.shape}" if np.amax(binary)==np.amin(binary): return "image is blank" if np.mean(binary)<np.median(binary): return "image may be inverted" h,w = binary.shape - if h<20/zoom: return "image not tall enough for a text line %s"%(binary.shape,) - if h>200/zoom: return "image too tall for a text line %s"%(binary.shape,) + if h<20/zoom: return f"image not tall enough for a text line {binary.shape}" + if h>200/zoom: return f"image too tall for a text line {binary.shape}" ##if w<1.5*h: return "line too short %s"%(binary.shape,) - if w<1.5*h and w<32/zoom: return "image too short for a line image %s"%(binary.shape,) - if w>4000/zoom: return "image too long for a line image %s"%(binary.shape,) + if w<1.5*h and w<32/zoom: return f"image too short for a line image {binary.shape}" + if w>4000/zoom: return f"image too long for a line image {binary.shape}" return None ratio = w*1.0/h _, ncomps = measurements.label(binary) lo = int(0.5*ratio+0.5) hi = int(4*ratio)+1 - if ncomps<lo*ratio: return "too few connected components (got %d, wanted >=%d)"%(ncomps,lo) - ##if ncomps>hi*ratio: return "too many connected components (got %d, wanted <=%d)"%(ncomps,hi) + if ncomps<lo*ratio: return f"too few connected components (got {ncomps}, wanted >={lo})" + ##if ncomps>hi*ratio: return f"too many connected components (got {ncomps}, wanted <={hi})" - if ncomps>hi*ratio and ncomps>10: return "too many connected components (got %d, wanted <=%d)"%(ncomps,hi) + if ncomps>hi*ratio and ncomps>10: return f"too many connected components (got {ncomps}, wanted <={hi})" return None # inspired by ocropus-gpageseg check_page @@ -341,21 +344,21 @@ def check_region(binary, zoom=1.0): Returns an error report, or None if valid. 
""" - if np.prod(binary.shape)==0: return "image dimensions are zero" - if len(binary.shape)==3: return "image is not monochrome %s"%(binary.shape,) + if np.prod(binary.shape) == 0: return "image dimensions are zero" + if len(binary.shape) == 3: return f"image is not monochrome {binary.shape}" if np.amax(binary)==np.amin(binary): return "image is blank" if np.mean(binary)<np.median(binary): return "image may be inverted" h,w = binary.shape - if h<45/zoom: return "image not tall enough for a region image %s"%(binary.shape,) - if h>5000/zoom: return "image too tall for a region image %s"%(binary.shape,) - if w<100/zoom: return "image too narrow for a region image %s"%(binary.shape,) - if w>5000/zoom: return "image too wide for a region image %s"%(binary.shape,) + if h<45/zoom: return f"image not tall enough for a region image {binary.shape}" + if h>5000/zoom: return f"image too tall for a region image {binary.shape}" + if w<100/zoom: return f"image too narrow for a region image {binary.shape}" + if w>5000/zoom: return f"image too wide for a region image {binary.shape}" return None # zoom factor (DPI relative) and 4 (against fragmentation from binarization) slots = int(w*h*1.0/(30*30)*zoom*zoom) * 4 _,ncomps = measurements.label(binary) - if ncomps<5: return "too few connected components for a region image (got %d)"%(ncomps,) - if ncomps>slots and ncomps>10: return "too many connected components for a region image (%d > %d)"%(ncomps,slots) + if ncomps<5: return f"too few connected components for a region image (got {ncomps})" + if ncomps>slots and ncomps>10: return f"too many connected components for a region image ({ncomps} > {slots})" return None # from ocropus-gpageseg, but with zoom parameter @@ -369,21 +372,21 @@ def check_page(binary, zoom=1.0): Returns an error report, or None if valid. """ - if np.prod(binary.shape)==0: return "image dimensions are zero" - if len(binary.shape)==3: return "image not monochrome %s"%(binary.shape,) + if np.prod(binary.shape) == 0: return "image dimensions are zero" + if len(binary.shape) == 3: return f"image not monochrome {binary.shape}" if np.amax(binary)==np.amin(binary): return "image is blank" if np.mean(binary)<np.median(binary): return "image may be inverted" h,w = binary.shape - if h<600/zoom: return "image not tall enough for a page image %s"%(binary.shape,) - if h>20000/zoom: return "image too tall for a page image %s"%(binary.shape,) - if w<600/zoom: return "image too narrow for a page image %s"%(binary.shape,) - if w>20000/zoom: return "image too wide for a page image %s"%(binary.shape,) + if h<600/zoom: return f"image not tall enough for a page image {binary.shape}" + if h>20000/zoom: return f"image too tall for a page image {binary.shape}" + if w<600/zoom: return f"image too narrow for a page image {binary.shape}" + if w>20000/zoom: return f"image too wide for a page image {binary.shape}" return None # zoom factor (DPI relative) and 4 (against fragmentation from binarization) slots = int(w*h*1.0/(30*30)*zoom*zoom) * 4 _,ncomps = measurements.label(binary) - if ncomps<10: return "too few connected components for a page image (got %d)"%(ncomps,) - if ncomps>slots and ncomps>10: return "too many connected components for a page image (%d > %d)"%(ncomps,slots) + if ncomps<10: return f"too few connected components for a page image (got {ncomps})" + if ncomps>slots and ncomps>10: return f"too many connected components for a page image ({ncomps} > {slots})" return None def odd(num): @@ -476,8 +479,13 @@ def compute_images(binary, scale, maximages=5): #images = morph.rb_closing(images, (d0,d1)) #DSAVE('images1_closed', images+0.6*binary) # 1- filter largest connected components - images = morph.select_regions(images,sl.area,min=(4*scale)**2,nbest=2*maximages) - DSAVE('images1_large', images+0.6*binary) + binary_0_6 = 0.6 * binary + odd_scale = odd(scale) +
odd_half_scale = odd(scale / 2) + odd_doubled_scale = odd(2 * scale) + region_min = (4 * scale) ** 2 + images = morph.select_regions(images, sl.area, min=region_min, nbest=2 * maximages) + DSAVE('images1_large', images + binary_0_6) if not images.any(): return np.zeros_like(binary, int) # 2- open horizontally and vertically to suppress @@ -486,31 +494,31 @@ def compute_images(binary, scale, maximages=5): # single frame, because then the hull polygon # can cover/overlap large text/table parts which # we cannot discern from the actual image anymore - h_opened = morph.rb_opening(images, (1, odd(scale/2))) - DSAVE('images2_h-opened', h_opened+0.6*binary) - v_opened = morph.rb_opening(images, (odd(scale/2), 1)) - DSAVE('images2_v-opened', v_opened+0.6*binary) + h_opened = morph.rb_opening(images, (1, odd_half_scale)) + DSAVE('images2_h-opened', h_opened + binary_0_6) + v_opened = morph.rb_opening(images, (odd_half_scale, 1)) + DSAVE('images2_v-opened', v_opened + binary_0_6) # 3- close whatever remains - closed = morph.rb_closing(h_opened&v_opened, (odd(2*scale),odd(2*scale))) - DSAVE('images3_closed', closed+0.6*binary) + closed = morph.rb_closing(h_opened&v_opened, (odd_doubled_scale, odd_doubled_scale)) + DSAVE('images3_closed', closed + binary_0_6) # 4- reconstruct the losses up to a certain distance # to avoid creeping into pure h/v-lines again but still # cover most of the large object #images = np.where(images, closed, 2) #images = morph.spread_labels(images, maxdist=scale) % 2 | closed images = morph.rb_reconstruction(closed, images, step=2, maxsteps=scale) - DSAVE('images4_reconstructed', images+0.6*binary) + DSAVE('images4_reconstructed', images + binary_0_6) # 5- select nbest - images = morph.select_regions(images,sl.area,min=(4*scale)**2,nbest=maximages) - DSAVE('images5_selected', images+0.6*binary) + images = morph.select_regions(images, sl.area, min=region_min, nbest=maximages) + DSAVE('images5_selected', images + binary_0_6) if not images.any(): return np.zeros_like(binary, int) # 6- dilate a little to get a smooth contour without gaps - dilated = morph.r_dilation(images, (odd(scale),odd(scale))) + dilated = morph.r_dilation(images, (odd_scale, odd_scale)) images = morph.propagate_labels_majority(binary, dilated+1) images = morph.spread_labels(images, maxdist=scale)==2 images, _ = morph.label(images) - DSAVE('images6_dilated', images+0.6*binary) + DSAVE('images6_dilated', images + binary_0_6) # we could repeat reconstruct-dilate here... 
return images @@ -548,6 +556,7 @@ def compute_seplines(binary, scale, maxseps=0): sepsizes = [0] sepslices = [None] sepdists = [0] + doubled_scale = 2 * scale for label in range(1, nlabels + 1): labelslice = slices[label] labelmask = labels == label @@ -599,8 +608,8 @@ def compute_seplines(binary, scale, maxseps=0): binmask = sublabels == bin + 1 binlabels, nbinlabels = morph.label(binmask) _, binlabelcounts = np.unique(binlabels, return_counts=True) - largemask = (binlabelcounts > 2 * scale)[binlabels] - smallmask = (binlabelcounts <= 2 * scale)[binlabels] + largemask = (binlabelcounts > doubled_scale)[binlabels] + smallmask = (binlabelcounts <= doubled_scale)[binlabels] sublabels2[binmask & smallmask] = 1 if not np.any(binmask & largemask): continue @@ -1843,11 +1852,13 @@ def find_topological(): else: llab[box] = lbinary[box] # show projection at the sides - for i in range(int(scale/2)): - llab[box[0],box[1].start+i] = -10*np.log(y+1e-9) - llab[box[0],box[1].stop-1-i] = -10*np.log(y+1e-9) - llab[box[0].start+i,box[1]] = -10*np.log(x+1e-9) - llab[box[0].stop-1-i,box[1]] = -10*np.log(x+1e-9) + log_y = -10 * np.log(y + 1e-9) + log_x = -10 * np.log(x + 1e-9) + for i in range(int(scale / 2)): + llab[box[0], box[1].start + i] = log_y + llab[box[0], box[1].stop - 1 - i] = log_y + llab[box[0].start + i, box[1]] = log_x + llab[box[0].stop - 1 - i, box[1]] = log_x DSAVE('recursive_x_y_cut_' + (partition_type or 'sliced'), llab) gap_weights = list() for is_horizontal, profile in enumerate([y, x]): @@ -1877,19 +1888,19 @@ def find_topological(): weights = weights * (1 + 0.5 * props['peak_heights']/gap_height) gap_weights.append((gaps, weights)) if debug: - LOG.debug(' {} gaps {} {} weights {}'.format( - 'horizontal' if is_horizontal else 'vertical', - gaps, props, weights)) + orientation = 'horizontal' if is_horizontal else 'vertical' + LOG.debug(f' {orientation} gaps {gaps} {props} weights {weights}') if not gaps.shape[0]: continue + half_scale = int(scale / 2) for start, stop, height in sorted(zip( props['left_ips'].astype(int), props['right_ips'].astype(int), props['peak_heights']), key=lambda x: x[2]): if is_horizontal: - llab[box[0].start+int(scale/2):box[0].stop-int(scale/2),box[1].start+start:box[1].start+stop] = -10*np.log(-height+1e-9) + llab[box[0].start+half_scale:box[0].stop-half_scale,box[1].start+start:box[1].start+stop] = -10*np.log(-height+1e-9) else: - llab[box[0].start+start:box[0].start+stop,box[1].start+int(scale/2):box[1].stop-int(scale/2)] = -10*np.log(-height+1e-9) + llab[box[0].start+start:box[0].start+stop,box[1].start+half_scale:box[1].stop-half_scale] = -10*np.log(-height+1e-9) DSAVE('recursive_x_y_cut_gaps_' + ('h' if is_horizontal else 'v'), llab) # heuristic (not strict) decision on x or y cut, # factors to consider: @@ -1916,32 +1927,27 @@ def find_topological(): # are not allowed y_gaps, y_weights = gap_weights[0][0], gap_weights[0][1] x_gaps, x_weights = gap_weights[1][0], gap_weights[1][1] - if debug: LOG.debug(' all y_gaps {} x_gaps {}'.format(y_gaps, x_gaps)) + if debug: LOG.debug(f' all y_gaps {y_gaps} x_gaps {x_gaps}') # suppress cuts that significantly split any line labels + min_line_scale = min_line * scale y_allowed = [not(np.any(np.intersect1d( # significant line labels above - np.nonzero(np.bincount(lbin[:gap,:].flatten(), - minlength=len(objects))[1:] > min_line * scale)[0], + np.nonzero(np.bincount(lbin[:gap,:].flatten(), minlength=len(objects))[1:] > min_line_scale)[0], # significant line labels below - np.nonzero(np.bincount(lbin[gap:,:].flatten(), - 
minlength=len(objects))[1:] > min_line * scale)[0], - assume_unique=True))) - for gap in y_gaps] + np.nonzero(np.bincount(lbin[gap:,:].flatten(), minlength=len(objects))[1:] > min_line_scale)[0], + assume_unique=True))) for gap in y_gaps] x_allowed = [not(np.any(np.intersect1d( # significant line labels left - np.nonzero(np.bincount(lbin[:,:gap].flatten(), - minlength=len(objects))[1:] > min_line * scale)[0], + np.nonzero(np.bincount(lbin[:,:gap].flatten(), minlength=len(objects))[1:] > min_line_scale)[0], # significant line labels right - np.nonzero(np.bincount(lbin[:,gap:].flatten(), - minlength=len(objects))[1:] > min_line * scale)[0], - assume_unique=True))) - for gap in x_gaps] + np.nonzero(np.bincount(lbin[:,gap:].flatten(), minlength=len(objects))[1:] > min_line_scale)[0], + assume_unique=True))) for gap in x_gaps] y_gaps, y_weights = y_gaps[y_allowed], y_weights[y_allowed] x_gaps, x_weights = x_gaps[x_allowed], x_weights[x_allowed] - if debug: LOG.debug(' allowed y_gaps {} x_gaps {}'.format(y_gaps, x_gaps)) + if debug: LOG.debug(f' allowed y_gaps {y_gaps} x_gaps {x_gaps}') y_prominence = np.amax(y_weights, initial=0) x_prominence = np.amax(x_weights, initial=0) - if debug: LOG.debug(' y_prominence {} x_prominence {}'.format(y_prominence, x_prominence)) + if debug: LOG.debug(f' y_prominence {y_prominence} x_prominence {x_prominence}') # suppress less prominent peaks (another heuristic...) # they must compete with the other direction next time # (when already new cuts or partitions will become visible) @@ -1949,33 +1955,30 @@ def find_topological(): x_allowed = x_weights > 0.8 * x_prominence y_gaps, y_weights = y_gaps[y_allowed], y_weights[y_allowed] x_gaps, x_weights = x_gaps[x_allowed], x_weights[x_allowed] - if debug: LOG.debug(' prominent y_gaps {} x_gaps {}'.format(y_gaps, x_gaps)) + if debug: LOG.debug(f' prominent y_gaps {y_gaps} x_gaps {x_gaps}') if npartitions > 0: # TODO this can be avoided when backtracking below # suppress peaks creating fewer partitions than others -- # how large in our preferred direction will the new partitions # of sepmask in both slices created by each cut candidate # add up? 
- y_partitionscores = [sum(map(sl.height if prefer_vertical else sl.width, - morph.find_objects(morph.label( - partitions[:gap,:]>0)[0]) + - morph.find_objects(morph.label( - partitions[gap:,:]>0)[0]))) - for gap in y_gaps] - x_partitionscores = [sum(map(sl.height if prefer_vertical else sl.width, - morph.find_objects(morph.label( - partitions[:,:gap]>0)[0]) + - morph.find_objects(morph.label( - partitions[:,gap:]>0)[0]))) - for gap in x_gaps] - if debug: LOG.debug(' y_partitionscores {} x_partitionscores {}'.format( - y_partitionscores, x_partitionscores)) + y_partitionscores = [sum(map( + sl.height if prefer_vertical else sl.width, + morph.find_objects(morph.label(partitions[:gap, :] > 0)[0]) + + morph.find_objects(morph.label(partitions[gap:, :] > 0)[0]))) + for gap in y_gaps] + x_partitionscores = [sum(map( + sl.height if prefer_vertical else sl.width, + morph.find_objects(morph.label(partitions[:, : gap] > 0)[0]) + + morph.find_objects(morph.label(partitions[:, gap :] > 0)[0]))) + for gap in x_gaps] + if debug: LOG.debug(f' y_partitionscores {y_partitionscores} x_partitionscores {x_partitionscores}') # Now identify those gaps with the largest overall score y_allowed = y_partitionscores == np.max(y_partitionscores, initial=0) x_allowed = x_partitionscores == np.max(x_partitionscores, initial=0) y_gaps, y_weights = y_gaps[y_allowed], y_weights[y_allowed] x_gaps, x_weights = x_gaps[x_allowed], x_weights[x_allowed] - if debug: LOG.debug(' most partitioning y_gaps {} x_gaps {}'.format(y_gaps, x_gaps)) + if debug: LOG.debug(f' most partitioning y_gaps {y_gaps} x_gaps {x_gaps}') else: y_partitionscores = None x_partitionscores = None @@ -1986,7 +1989,7 @@ def find_topological(): x_allowed = x_weights > 0.9 * x_prominence y_gaps, y_weights = y_gaps[y_allowed], y_weights[y_allowed] x_gaps, x_weights = x_gaps[x_allowed], x_weights[x_allowed] - if debug: LOG.debug(' prominent y_gaps {} x_gaps {}'.format(y_gaps, x_gaps)) + if debug: LOG.debug(f' prominent y_gaps {y_gaps} x_gaps {x_gaps}') # decide which direction, x or y # TODO: this most likely needs a backtracking mechanism @@ -2052,7 +2055,7 @@ def find_topological(): llab2[box] = partitions DSAVE('recursive_x_y_cut_partitions', llab2) for label in range(1, npartitions+1): - LOG.debug('next partition %d on %s', label, box) + LOG.debug(f'next partition %d on %s', label, box) recursive_x_y_cut(box, mask=partitions==label, partition_type=new_partition_type) return @@ -2060,10 +2063,9 @@ def find_topological(): # no gaps left finalize() return + orientation = 'vertical' if choose_vertical else 'horizontal' # otherwise: cut on gaps - LOG.debug('cutting %s on %s into %s', 'vertically' - if choose_vertical else 'horizontally', - box, gaps) + LOG.debug(f'cutting {orientation}ly on {box} into {gaps}') cuts = list(zip(np.insert(gaps, 0, 0), np.append(gaps, lim))) if choose_vertical: if rl: @@ -2078,9 +2080,7 @@ def find_topological(): sub = sl.box(0, len(y), start, stop) else: # "cut in horizontal direction" sub = sl.box(start, stop, 0, len(x)) - LOG.debug('next %s block on %s is %s', 'horizontal' - if choose_vertical else 'vertical', - box, sub) + LOG.debug(f'next {orientation} block on {box} is {sub}') recursive_x_y_cut(sl.compose(box,sub), mask=sl.cut(mask,sub) if isinstance(mask, np.ndarray) else None) From fceaffe4e928bff7ea70aece7baa3d3717c03cff Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Sat, 17 Aug 2024 02:03:47 +0200 Subject: [PATCH 71/97] optimize ocrolib --- ocrd_cis/ocropy/ocrolib/morph.py | 18 ++++++++++-------- 
ocrd_cis/ocropy/ocrolib/toplevel.py | 26 ++++++++------------------
 2 files changed, 18 insertions(+), 26 deletions(-)

diff --git a/ocrd_cis/ocropy/ocrolib/morph.py b/ocrd_cis/ocropy/ocrolib/morph.py
index 7d6ffc85..b9619cca 100644
--- a/ocrd_cis/ocropy/ocrolib/morph.py
+++ b/ocrd_cis/ocropy/ocrolib/morph.py
@@ -292,8 +292,9 @@ def propagate_labels_majority(image,labels):
     with the largest overlap."""
     rlabels,_ = label(image)
    cors = correspondences(rlabels,labels)
-    outputs = zeros(amax(rlabels)+1,'i')
-    counts = zeros(amax(rlabels)+1,'i')
+    amax_rlabels = amax(rlabels) + 1
+    outputs = zeros(amax_rlabels,'i')
+    counts = zeros(amax_rlabels,'i')
     for rlabel, label_, count in cors.T:
         if not rlabel or not label_:
             # ignore background correspondences
@@ -347,12 +348,13 @@ def all_neighbors(image, dist=1, bg=NaN):
     """Given an image with labels, find all pairs of labels
     that are directly (up to ``dist``) neighboring each other, ignoring the label ``bg``."""
     q = 100000
-    assert amax(image)<q
-    assert amin(image)>=0
-    u = unique(q*image+shift(image,(dist,0),order=0,cval=bg))
-    d = unique(q*image+shift(image,(-dist,0),order=0,cval=bg))
-    l = unique(q*image+shift(image,(0,dist),order=0,cval=bg))
-    r = unique(q*image+shift(image,(0,-dist),order=0,cval=bg))
+    assert amax(image) < q
+    assert amin(image) >= 0
+    q_image = q * image
+    u = unique(q_image + shift(image, (dist, 0), order=0, cval=bg))
+    d = unique(q_image + shift(image, (-dist, 0), order=0, cval=bg))
+    l = unique(q_image + shift(image, (0, dist), order=0, cval=bg))
+    r = unique(q_image + shift(image, (0, -dist), order=0, cval=bg))
     all = unique(r_[u,d,l,r])
     all = all[all!=bg]
     all = c_[all//q,all%q]
diff --git a/ocrd_cis/ocropy/ocrolib/toplevel.py b/ocrd_cis/ocropy/ocrolib/toplevel.py
index 87ed18c5..72e397af 100644
--- a/ocrd_cis/ocropy/ocrolib/toplevel.py
+++ b/ocrd_cis/ocropy/ocrolib/toplevel.py
@@ -125,14 +125,10 @@ def __init__(self,*args,**kw):
         self.fun = kw.get("fun","?")
         self.var = kw.get("var","?")
         self.description = " ".join([strc(x) for x in args])
+
     def __str__(self):
-        result = "\nCheckError for argument "
-        result += str(self.var)
-        result += " of function "
-        result += str(self.fun)
-        result += "\n"
-        result += self.description
-        return result
+        return f"\nCheckError for argument {str(self.var)} of function {str(self.fun)}\n{self.description}"
+
 
 class CheckWarning(CheckError):
     def __init__(self,*args,**kw):
@@ -142,14 +138,8 @@ def __init__(self,*args,**kw):
         CheckError.__init__(self, *args, **kw)
 
     def __str__(self):
-        result = "\nCheckWarning for argument "
-        result += str(self.var)
-        result += " of function "
-        result += str(self.fun)
-        result += "\n"
-        result += self.description
-        result += "(This can happen occasionally during normal operations and isn't necessarily a bug or problem.)\n"
-        return result
+        return (f"\nCheckWarning for argument {str(self.var)} of function {str(self.fun)}\n{self.description} "
+                f"(This can happen occasionally during normal operations and isn't necessarily a bug or problem.)\n")
 
 def checktype(value,type_):
     """Check value against the type spec. 
If everything @@ -211,7 +201,7 @@ def argument_checks(*args,**kw): e.var = var raise e except: - LOG.critical("unknown exception while checking function: '%s'", name) + LOG.critical(f"unknown exception while checking function: '{name}'") raise result = f(*args,**kw) checktype(result,kw.get("_",True)) @@ -225,9 +215,9 @@ def decorator(f): def wrapper(arg): if not f(arg): if warning: - raise CheckWarning(strc(arg)+" of type "+str(type(arg))+": "+str(message)) + raise CheckWarning(f"{strc(arg)} of type {str(type(arg))}: {str(message)}") else: - raise CheckError(strc(arg)+" of type "+str(type(arg))+": "+str(message)) + raise CheckError(f"{strc(arg)} of type {str(type(arg))}: {str(message)}") return wrapper return decorator From 3de2585787ea2b59126a4a1c39d9df3e42d18362 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Sat, 17 Aug 2024 02:03:58 +0200 Subject: [PATCH 72/97] optimize align cli --- ocrd_cis/align/cli.py | 85 ++++++++++++++++++------------------------- 1 file changed, 35 insertions(+), 50 deletions(-) diff --git a/ocrd_cis/align/cli.py b/ocrd_cis/align/cli.py index 7747622e..7d6599c2 100644 --- a/ocrd_cis/align/cli.py +++ b/ocrd_cis/align/cli.py @@ -57,16 +57,16 @@ def process(self): def align(self, alignments, ift): """align the alignment objects with the according input file tuples""" for t in ift: - self.log.debug("tuple %s", os.path.basename(t.input_file.url)) + self.log.debug(f"tuple {os.path.basename(t.input_file.url)}") pcgtst = self.open_input_file_tuples(ift) i = 0 for mi, mr in enumerate(pcgtst[0].get_Page().get_TextRegion()): for mj, _ in enumerate(mr.get_TextLine()): for iiii, u in enumerate(mr.get_TextLine()[mj].get_TextEquiv()): - self.log.debug("[%d] %s", iiii, u.Unicode) + self.log.debug(f"[{iiii}] {u.Unicode}") for xx in mr.get_TextLine()[mj].get_Word(): for iiii, u in enumerate(xx.get_TextEquiv()): - self.log.debug("[%d] %s", iiii, u.Unicode) + self.log.debug(f"[{iiii}] {u.Unicode}") lines = [] for ii, t in enumerate(ift): @@ -88,23 +88,21 @@ def align_lines(self, lines): for i, line in enumerate(lines): if lines[0].region.get_TextEquiv() is None: lines[0].region.TextEquiv = [] - self.log.debug('line alignment: %s [%s - %s]', - get_textequiv_unicode(line.region), - line.region.get_id(), - line.input_file.input_file_group) - ddt = line.input_file.input_file_group + "/" + line.region.get_id() + self.log.debug(f'line alignment: {get_textequiv_unicode(line.region)} ' + f'[{line.region.get_id()} - {line.input_file.input_file_group}]') + ddt = f"{line.input_file.input_file_group}/{line.region.get_id()}" if i != 0: te = TextEquivType( Unicode=get_textequiv_unicode(line.region), conf=get_textequiv_conf(line.region), dataType="other", - dataTypeDetails="ocrd-cis-line-alignment:" + ddt) + dataTypeDetails=f"ocrd-cis-line-alignment:{ddt}") lines[0].region.add_TextEquiv(te) else: self.log.debug("len: %i, i: %i", len(lines[0].region.get_TextEquiv()), i) lines[0].region.get_TextEquiv()[i].set_dataType("other") lines[0].region.get_TextEquiv()[i].set_dataTypeDetails( - "ocrd-cis-line-alignment-master-ocr:" + ddt) + f"ocrd-cis-line-alignment-master-ocr:{ddt}") lines[0].region.get_TextEquiv()[i].set_index(i+1) self.align_words(lines) @@ -113,18 +111,18 @@ def align_words(self, lines): mregion = lines[0].region.get_Word() oregion = [lines[i].region.get_Word() for i in range(1, len(lines))] for word in lines[0].alignment['wordAlignments']: - self.log.debug("aligning word %s", word['master']) + self.log.debug(f"aligning word {word['master']}", ) master, rest = 
self.find_word([word['master']], mregion, "master") mregion = rest if master is None or len(master) != 1: - self.log.warn("cannot find {}; giving up".format(word['master'])) - # raise Exception("cannot find {}; giving up".format(word['master'])) + self.log.warn(f"cannot find {word['master']}; giving up") + # raise Exception(f"cannot find {word['master']}; giving up") return others = list() for i, other in enumerate(word['alignments']): match, rest = self.find_word(other, oregion[i]) if match is None: - self.log.warn("cannot find {}; giving up".format(other)) + self.log.warn(f"cannot find {other}; giving up") return others.append(match) oregion[i] = rest @@ -132,10 +130,7 @@ def align_words(self, lines): words.append( Alignment(lines[0].input_file, master, lines[0].alignment)) for i, other in enumerate(others): - words.append(Alignment( - lines[i+1].input_file, - other, - lines[i+1].alignment)) + words.append(Alignment(lines[i+1].input_file, other, lines[i+1].alignment)) self.align_word_regions(words) def align_word_regions(self, words): @@ -144,10 +139,8 @@ def te0(x): for i, word in enumerate(words): if not word.region: ifg = word.input_file.input_file_group - self.log.debug("(empty) word alignment: [%s]", ifg) - te = TextEquivType( - dataType="other", - dataTypeDetails="ocrd-cis-empty-word-alignment:" + ifg) + self.log.debug(f"(empty) word alignment: [{ifg}]") + te = TextEquivType(dataType="other", dataTypeDetails=f"ocrd-cis-empty-word-alignment:{ifg}") words[0].region[0].add_TextEquiv(te) words[0].region[0].get_TextEquiv()[i].set_index(i+1) continue @@ -157,46 +150,38 @@ def te0(x): ddt = word.input_file.input_file_group + "/" + _id # if conf is none it is most likely ground truth data conf = min([float(te0(x).get_conf() or "1.0") for x in word.region]) - self.log.debug("word alignment: %s [%s - %s]", _str, _id, ifg) + self.log.debug(f"word alignment: {_str} [{_id} - {ifg}]") if i != 0: te = TextEquivType( - Unicode=_str, - conf=conf, - dataType="other", - dataTypeDetails="ocrd-cis-word-alignment:" + ddt) + Unicode=_str, conf=conf, dataType="other", dataTypeDetails=f"ocrd-cis-word-alignment:{ddt}") words[0].region[0].add_TextEquiv(te) else: words[0].region[0].get_TextEquiv()[i].set_dataType("other") - words[0].region[0].get_TextEquiv()[i].set_dataTypeDetails( - "ocrd-cis-word-alignment-master-ocr:" + ddt) + words[0].region[0].get_TextEquiv()[i].set_dataTypeDetails(f"ocrd-cis-word-alignment-master-ocr:{ddt}") words[0].region[0].get_TextEquiv()[i].set_index(i+1) def find_word(self, tokens, regions, t="other"): - self.log.debug("tokens = %s [%s]", tokens, t) + tokens_str = f"tokens = {tokens} [{t}]" + self.log.debug(tokens_str) for i, _ in enumerate(regions): n = self.match_tokens(tokens, regions, i) if n == 0: continue return tuple([regions[i:n], regions[i:]]) # not found try again with levenshtein - self.log.warn( - "could not find tokens = %s [%s]; trying again", - tokens, t) + self.log.warn(f"could not find {tokens_str}; trying again") for i, _ in enumerate(regions): n = self.match_tokens_lev(tokens, regions, i) if n == 0: continue return tuple([regions[i:n], regions[i:]]) # not found try again to match token within another one - self.log.warn( - "could not find tokens = %s [%s]; trying again", - tokens, t) + self.log.warn(f"could not find {tokens_str}; trying again") for i, _ in enumerate(regions): n = self.match_tokens_within(tokens, regions, i) if n == 0: continue return tuple([regions[i:n], regions[i:]]) - # nothing could be found return tuple([None, regions]) @@ -212,7 +197,7 @@ 
def match_tokens_lev(self, tokens, regions, i): def f(a, b): k = 3 # int(len(a)/3) d = Levenshtein.distance(a, b) - self.log.debug("lev %s <=> %s: %d (%d)", a, b, d, d) + self.log.debug(f"lev {a} <=> {b}: {d} ({d})") return d <= 1 or d <= k return self.match_tokens_lambda(tokens, regions, i, f) @@ -227,14 +212,15 @@ def match_tokens_lambda(self, tokens, regions, i, f): Returns 0 if nothing could be matched. """ for j, token in enumerate(tokens): - if j + i >= len(regions): + sum_i_j = j + i + if sum_i_j >= len(regions): return 0 - if not regions[i+j].get_TextEquiv()[0].Unicode: - self.log.warn("cannot find %s", token) + unicode = regions[sum_i_j].get_TextEquiv()[0].Unicode + if not unicode: + self.log.warn(f"cannot find {token}") return 0 - self.log.debug('checking %s with %s', token, - regions[i+j].get_TextEquiv()[0].Unicode) - if f(token, regions[i+j].get_TextEquiv()[0].Unicode): + self.log.debug(f'checking {token} with {unicode}') + if f(token, unicode): continue if j == 0: return 0 @@ -259,19 +245,18 @@ def zip_input_files(self, ifgs): """Zip files of the given input file groups""" files = list() for ifg in ifgs: - self.log.info("input file group: %s", ifg) + self.log.info(f"input file group: {ifg}") ifiles = sorted( self.workspace.mets.find_files(fileGrp=ifg), key=lambda ifile: ifile.url) for i in ifiles: - self.log.debug("sorted file: %s %s", - os.path.basename(i.url), i.ID) + self.log.debug(f"sorted file: {os.path.basename(i.url)} {i.ID}") ifiles = [FileAlignment(self.workspace, x, ifg) for x in ifiles] files.append(ifiles) return zip(*files) def read_lines_from_input_file(self, ifile): - self.log.info("reading input file: %s", ifile) + self.log.info(f"reading input file: {ifile}") lines = list() pcgts = ifile.open() for region in pcgts.get_Page().get_TextRegion(): @@ -286,7 +271,7 @@ def run_java_aligner(self, ifs): lines = zip(*lines) _input = [x.strip() for t in lines for x in t] for i in _input: - self.log.debug("input line: %s", i) + self.log.debug(f"input line: {i}") n = len(ifs) self.log.debug("starting java client") p = JavaAligner(n, getLevelName(self.log.getEffectiveLevel())) @@ -300,7 +285,7 @@ def __init__(self, workspace, ifile, ifg): self.log = getLogger('cis.FileAlignment') def open(self): - self.log.info("opening: %s", os.path.basename(self.input_file.url)) + self.log.info(f"opening: {os.path.basename(self.input_file.url)}") return page_from_file(self.workspace.download_file(self.input_file)) From 0949277dbe049c1cd6776b3c701980c48cf2ebc8 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 22 Aug 2024 02:21:34 +0200 Subject: [PATCH 73/97] align: use final v3 API --- ocrd_cis/align/cli.py | 229 ++++++++++++++++-------------------------- 1 file changed, 85 insertions(+), 144 deletions(-) diff --git a/ocrd_cis/align/cli.py b/ocrd_cis/align/cli.py index 7d6599c2..f85b7348 100644 --- a/ocrd_cis/align/cli.py +++ b/ocrd_cis/align/cli.py @@ -1,97 +1,71 @@ from __future__ import absolute_import +from __future__ import annotations + import click import json import os +from typing import Optional, List, Dict, Type + from rapidfuzz.distance import Levenshtein -from ocrd import Processor + +from ocrd import Processor, OcrdPage, OcrdPageResult from ocrd.decorators import ocrd_cli_options from ocrd.decorators import ocrd_cli_wrap_processor -from ocrd_utils import MIMETYPE_PAGE from ocrd_utils import getLogger from ocrd_utils import getLevelName -from ocrd_utils import make_file_id -from ocrd_modelfactory import page_from_file -from ocrd_models.ocrd_page import to_xml -from 
ocrd_models.ocrd_page_generateds import TextEquivType +from ocrd_models.ocrd_page import TextRegionType, TextEquivType from ocrd_cis import JavaAligner -from ocrd_cis import get_ocrd_tool @click.command() @ocrd_cli_options def ocrd_cis_align(*args, **kwargs): - return ocrd_cli_wrap_processor(Aligner, *args, **kwargs) + return ocrd_cli_wrap_processor(CISAligner, *args, **kwargs) -class Aligner(Processor): - def __init__(self, *args, **kwargs): - ocrd_tool = get_ocrd_tool() - kwargs['ocrd_tool'] = ocrd_tool['tools']['ocrd-cis-align'] - kwargs['version'] = ocrd_tool['version'] - super(Aligner, self).__init__(*args, **kwargs) +class CISAligner(Processor): + @property + def executable(self): + return 'ocrd-cis-align' - if hasattr(self, 'workspace'): - self.log = getLogger('cis.Processor.Aligner') - - def process(self): - ifgs = self.input_file_grp.split(",") # input file groups - if len(ifgs) < 2: - raise Exception("need at least two input file groups to align") - ifts = self.zip_input_files(ifgs) # input file tuples - for _id, ift in enumerate(ifts): - alignments = json.loads(self.run_java_aligner(ift)) - pcgts = self.align(alignments, ift) - # keep the right part after OCR-D-...-filename - # and prepend output_file_grp - input_file = ift[0].input_file - file_id = make_file_id(input_file, self.output_file_grp) - pcgts.set_pcGtsId(file_id) - out = self.workspace.add_file( - ID=file_id, - file_grp=self.output_file_grp, - pageId=input_file.pageId, - local_filename=os.path.join(self.output_file_grp, file_id + '.xml'), - mimetype=MIMETYPE_PAGE, - content=to_xml(pcgts), - ) - self.log.info('created file %s', out) + def process_page_pcgts(self, *input_pcgts : Optional[OcrdPage], page_id : Optional[str] = None) -> OcrdPageResult: + assert len(input_pcgts) >= 2 + alignments = json.loads(self.run_java_aligner(input_pcgts)) + pcgts = self.align(alignments, input_pcgts) + return OcrdPageResult(pcgts) - def align(self, alignments, ift): + def align(self, alignments: List[Dict], pcgts: List[OcrdPage]) -> OcrdPage: """align the alignment objects with the according input file tuples""" - for t in ift: - self.log.debug(f"tuple {os.path.basename(t.input_file.url)}") - pcgtst = self.open_input_file_tuples(ift) i = 0 - for mi, mr in enumerate(pcgtst[0].get_Page().get_TextRegion()): + file_groups = self.input_file_grp.split(',') + for mi, mr in enumerate(pcgts[0].get_Page().get_AllRegions(classes=['Text'])): for mj, _ in enumerate(mr.get_TextLine()): - for iiii, u in enumerate(mr.get_TextLine()[mj].get_TextEquiv()): - self.log.debug(f"[{iiii}] {u.Unicode}") - for xx in mr.get_TextLine()[mj].get_Word(): - for iiii, u in enumerate(xx.get_TextEquiv()): - self.log.debug(f"[{iiii}] {u.Unicode}") - lines = [] - for ii, t in enumerate(ift): + for ii, page in enumerate(pcgts): if i >= len(alignments): break - tr = pcgtst[ii].get_Page().get_TextRegion() + tr = page.get_Page().get_AllRegions(classes=['Text']) region = tr[mi].get_TextLine()[mj] - lines.append(Alignment(t, region, alignments[i])) + lines.append(Alignment(file_groups[ii], page, region, alignments[i])) self.align_lines(lines) i += 1 - return pcgtst[0] + return pcgts[0] - def align_lines(self, lines): + def align_lines(self, lines: List[Alignment]) -> None: """align the given line alignment with the lines""" if not lines: return - if len(lines[0].region.get_TextEquiv()) > 1: - del lines[0].region.get_TextEquiv()[1:] + if len(lines[0].region.TextEquiv) > 1: + del lines[0].region.TextEquiv[1:] for i, line in enumerate(lines): if lines[0].region.get_TextEquiv() is 
None: lines[0].region.TextEquiv = [] - self.log.debug(f'line alignment: {get_textequiv_unicode(line.region)} ' - f'[{line.region.get_id()} - {line.input_file.input_file_group}]') - ddt = f"{line.input_file.input_file_group}/{line.region.get_id()}" - if i != 0: + self.logger.debug( + 'line alignment: %s [%s - %s]', + get_textequiv_unicode(line.region), + line.region.get_id(), + line.file_grp + ) + ddt = line.file_grp + "/" + line.region.get_id() + if i > 0: te = TextEquivType( Unicode=get_textequiv_unicode(line.region), conf=get_textequiv_conf(line.region), @@ -99,58 +73,64 @@ def align_lines(self, lines): dataTypeDetails=f"ocrd-cis-line-alignment:{ddt}") lines[0].region.add_TextEquiv(te) else: - self.log.debug("len: %i, i: %i", len(lines[0].region.get_TextEquiv()), i) - lines[0].region.get_TextEquiv()[i].set_dataType("other") - lines[0].region.get_TextEquiv()[i].set_dataTypeDetails( - f"ocrd-cis-line-alignment-master-ocr:{ddt}") - lines[0].region.get_TextEquiv()[i].set_index(i+1) + self.logger.debug("len: %i, i: %i", len(lines[0].region.TextEquiv), i) + lines[0].region.TextEquiv[i].set_dataType("other") + lines[0].region.TextEquiv[i].set_dataTypeDetails( + "ocrd-cis-line-alignment-master-ocr:" + ddt) + lines[0].region.TextEquiv[i].set_index(i+1) self.align_words(lines) - def align_words(self, lines): - # self.log.info(json.dumps(lines[0].alignment)) + def align_words(self, lines: List[Alignment]) -> None: + # self.logger.info(json.dumps(lines[0].alignment)) mregion = lines[0].region.get_Word() oregion = [lines[i].region.get_Word() for i in range(1, len(lines))] for word in lines[0].alignment['wordAlignments']: - self.log.debug(f"aligning word {word['master']}", ) + self.logger.debug("aligning word %s", word['master']) master, rest = self.find_word([word['master']], mregion, "master") mregion = rest if master is None or len(master) != 1: - self.log.warn(f"cannot find {word['master']}; giving up") - # raise Exception(f"cannot find {word['master']}; giving up") + self.logger.warn("cannot find {}; giving up".format(word['master'])) + # raise Exception("cannot find {}; giving up".format(word['master'])) return others = list() for i, other in enumerate(word['alignments']): match, rest = self.find_word(other, oregion[i]) if match is None: - self.log.warn(f"cannot find {other}; giving up") + self.logger.warn(f"cannot find {other}; giving up") return others.append(match) oregion[i] = rest words = list() words.append( - Alignment(lines[0].input_file, master, lines[0].alignment)) + Alignment(lines[0].file_grp, lines[0].pcgts, master, lines[0].alignment)) for i, other in enumerate(others): - words.append(Alignment(lines[i+1].input_file, other, lines[i+1].alignment)) + words.append(Alignment( + lines[i+1].file_grp, + lines[i+1].pcgts, + other, + lines[i+1].alignment)) self.align_word_regions(words) - def align_word_regions(self, words): + def align_word_regions(self, words: List[Alignment]) -> None: def te0(x): - return x.get_TextEquiv()[0] + return x.TextEquiv[0] for i, word in enumerate(words): if not word.region: - ifg = word.input_file.input_file_group - self.log.debug(f"(empty) word alignment: [{ifg}]") - te = TextEquivType(dataType="other", dataTypeDetails=f"ocrd-cis-empty-word-alignment:{ifg}") + ifg = word.file_grp + self.logger.debug("(empty) word alignment: [%s]", ifg) + te = TextEquivType( + dataType="other", + dataTypeDetails="ocrd-cis-empty-word-alignment:" + ifg) words[0].region[0].add_TextEquiv(te) words[0].region[0].get_TextEquiv()[i].set_index(i+1) continue _str = " 
".join([te0(x).Unicode for x in word.region]) _id = ",".join([x.get_id() for x in word.region]) - ifg = word.input_file.input_file_group - ddt = word.input_file.input_file_group + "/" + _id + ifg = word.file_grp + ddt = word.file_grp + "/" + _id # if conf is none it is most likely ground truth data conf = min([float(te0(x).get_conf() or "1.0") for x in word.region]) - self.log.debug(f"word alignment: {_str} [{_id} - {ifg}]") + self.logger.debug(f"word alignment: {_str} [{_id} - {ifg}]") if i != 0: te = TextEquivType( Unicode=_str, conf=conf, dataType="other", dataTypeDetails=f"ocrd-cis-word-alignment:{ddt}") @@ -162,21 +142,21 @@ def te0(x): def find_word(self, tokens, regions, t="other"): tokens_str = f"tokens = {tokens} [{t}]" - self.log.debug(tokens_str) + self.logger.debug(tokens_str) for i, _ in enumerate(regions): n = self.match_tokens(tokens, regions, i) if n == 0: continue return tuple([regions[i:n], regions[i:]]) # not found try again with levenshtein - self.log.warn(f"could not find {tokens_str}; trying again") + self.logger.warn(f"could not find {tokens_str}; trying again") for i, _ in enumerate(regions): n = self.match_tokens_lev(tokens, regions, i) if n == 0: continue return tuple([regions[i:n], regions[i:]]) # not found try again to match token within another one - self.log.warn(f"could not find {tokens_str}; trying again") + self.logger.warn(f"could not find {tokens_str}; trying again") for i, _ in enumerate(regions): n = self.match_tokens_within(tokens, regions, i) if n == 0: @@ -197,7 +177,7 @@ def match_tokens_lev(self, tokens, regions, i): def f(a, b): k = 3 # int(len(a)/3) d = Levenshtein.distance(a, b) - self.log.debug(f"lev {a} <=> {b}: {d} ({d})") + self.logger.debug(f"lev {a} <=> {b}: {d} ({d})") return d <= 1 or d <= k return self.match_tokens_lambda(tokens, regions, i, f) @@ -215,11 +195,11 @@ def match_tokens_lambda(self, tokens, regions, i, f): sum_i_j = j + i if sum_i_j >= len(regions): return 0 - unicode = regions[sum_i_j].get_TextEquiv()[0].Unicode + unicode = regions[sum_i_j].TextEquiv[0].Unicode if not unicode: - self.log.warn(f"cannot find {token}") + self.logger.warn(f"cannot find {token}") return 0 - self.log.debug(f'checking {token} with {unicode}') + self.logger.debug(f'checking {token} with {unicode}') if f(token, unicode): continue if j == 0: @@ -230,68 +210,29 @@ def match_tokens_lambda(self, tokens, regions, i, f): i += 1 return i + len(tokens) - def open_input_file_tuples(self, ift): - """ - opens all xml files of the given input file tuple - and returns them as tuples - """ - res = list() - for ifile in ift: - pcgts = ifile.open() - res.append(pcgts) - return tuple(res) - - def zip_input_files(self, ifgs): - """Zip files of the given input file groups""" - files = list() - for ifg in ifgs: - self.log.info(f"input file group: {ifg}") - ifiles = sorted( - self.workspace.mets.find_files(fileGrp=ifg), - key=lambda ifile: ifile.url) - for i in ifiles: - self.log.debug(f"sorted file: {os.path.basename(i.url)} {i.ID}") - ifiles = [FileAlignment(self.workspace, x, ifg) for x in ifiles] - files.append(ifiles) - return zip(*files) - - def read_lines_from_input_file(self, ifile): - self.log.info(f"reading input file: {ifile}") + def run_java_aligner(self, input_pcgts: List[OcrdPage]) -> str: lines = list() - pcgts = ifile.open() - for region in pcgts.get_Page().get_TextRegion(): - for line in region.get_TextLine(): - lines.append(get_textequiv_unicode(line)) - return lines - - def run_java_aligner(self, ifs): - lines = list() - for ifile in ifs: - 
lines.append(self.read_lines_from_input_file(ifile)) + for pcgts in input_pcgts: + lines.append([get_textequiv_unicode(line) + for line in pcgts.get_Page().get_AllTextLines()]) + # JavaAligner expects a strange input format lines = zip(*lines) _input = [x.strip() for t in lines for x in t] for i in _input: - self.log.debug(f"input line: {i}") - n = len(ifs) - self.log.debug("starting java client") - p = JavaAligner(n, getLevelName(self.log.getEffectiveLevel())) + self.logger.debug("input line: %s", i) + n = len(input_pcgts) + self.logger.debug("starting java client") + p = JavaAligner(n, getLevelName(self.logger.getEffectiveLevel())) return p.run("\n".join(_input)) -class FileAlignment: - def __init__(self, workspace, ifile, ifg): - self.workspace = workspace - self.input_file = ifile - self.input_file_group = ifg - self.log = getLogger('cis.FileAlignment') - - def open(self): - self.log.info(f"opening: {os.path.basename(self.input_file.url)}") - return page_from_file(self.workspace.download_file(self.input_file)) - - class Alignment: - def __init__(self, ifile, region, alignment): - self.input_file = ifile + file_grp: str + pcgts: OcrdPage + region: TextRegionType + alignment: Alignment + def __init__(self, file_grp: str, pcgts: OcrdPage, region: TextRegionType, alignment: Alignment): + self.file_grp = file_grp + self.pcgts = pcgts self.region = region self.alignment = alignment From d4f8483ffdefac50161e4376637b9f8e813c384f Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 22 Aug 2024 02:21:58 +0200 Subject: [PATCH 74/97] use ocrd_utils instead of pkg_resources --- ocrd_cis/data/__main__.py | 10 +++++----- ocrd_cis/javaprocess.py | 5 ++--- 2 files changed, 7 insertions(+), 8 deletions(-) diff --git a/ocrd_cis/data/__main__.py b/ocrd_cis/data/__main__.py index 3d8ef735..8fdcddd6 100644 --- a/ocrd_cis/data/__main__.py +++ b/ocrd_cis/data/__main__.py @@ -1,18 +1,18 @@ -import pkg_resources import sys +from ocrd_utils import resource_filename def main(): usage = 'usage: ' + sys.argv[0] + ' -jar|-3gs|-model|-config' if '-h' in sys.argv: print(usage) elif '-jar' in sys.argv: - print(pkg_resources.resource_filename('ocrd_cis', 'data/ocrd-cis.jar')) + print(resource_filename('ocrd_cis', 'data/ocrd-cis.jar')) elif '-3gs' in sys.argv: - print(pkg_resources.resource_filename('ocrd_cis', 'data/3gs.csv.gz')) + print(resource_filename('ocrd_cis', 'data/3gs.csv.gz')) elif '-model' in sys.argv: - print(pkg_resources.resource_filename('ocrd_cis', 'data/model.zip')) + print(resource_filename('ocrd_cis', 'data/model.zip')) elif '-config' in sys.argv: - print(pkg_resources.resource_filename('ocrd_cis', 'data/config.json')) + print(resource_filename('ocrd_cis', 'data/config.json')) else: raise ValueError(usage) diff --git a/ocrd_cis/javaprocess.py b/ocrd_cis/javaprocess.py index ce2f6bfd..72915d68 100644 --- a/ocrd_cis/javaprocess.py +++ b/ocrd_cis/javaprocess.py @@ -1,12 +1,11 @@ import subprocess import json -import pkg_resources -from ocrd_utils import getLogger +from ocrd_utils import getLogger, resource_filename from pathlib import Path MAIN = "de.lmu.cis.ocrd.cli.Main" -JAR = pkg_resources.resource_filename('ocrd_cis', 'data/ocrd-cis.jar') +JAR = str(resource_filename('ocrd_cis', 'data/ocrd-cis.jar')) def JavaAligner(n, loglvl): """Create a java process that calls -c align -D '{"n":n}'""" From ecc44c0358354c0c3c3ba6000e7de7413dc9cef1 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 22 Aug 2024 13:31:09 +0200 Subject: [PATCH 75/97] postcorrect: use final v3 API --- 
ocrd_cis/align/cli.py | 1 + ocrd_cis/postcorrect/cli.py | 22 ++++++++++------------ 2 files changed, 11 insertions(+), 12 deletions(-) diff --git a/ocrd_cis/align/cli.py b/ocrd_cis/align/cli.py index f85b7348..f5e47785 100644 --- a/ocrd_cis/align/cli.py +++ b/ocrd_cis/align/cli.py @@ -16,6 +16,7 @@ from ocrd_models.ocrd_page import TextRegionType, TextEquivType from ocrd_cis import JavaAligner + @click.command() @ocrd_cli_options def ocrd_cis_align(*args, **kwargs): diff --git a/ocrd_cis/postcorrect/cli.py b/ocrd_cis/postcorrect/cli.py index dc3ee48e..71fbaad1 100644 --- a/ocrd_cis/postcorrect/cli.py +++ b/ocrd_cis/postcorrect/cli.py @@ -1,14 +1,15 @@ from __future__ import absolute_import +import os + import click import json -import os + from ocrd import Processor -from ocrd.decorators import ocrd_cli_options -from ocrd.decorators import ocrd_cli_wrap_processor +from ocrd.decorators import ocrd_cli_options, ocrd_cli_wrap_processor from ocrd_utils import getLogger, getLevelName from ocrd_models.ocrd_mets import OcrdMets from ocrd_cis import JavaPostCorrector -from ocrd_cis import get_ocrd_tool + @click.command() @ocrd_cli_options @@ -16,26 +17,23 @@ def ocrd_cis_postcorrect(*args, **kwargs): return ocrd_cli_wrap_processor(PostCorrector, *args, **kwargs) class PostCorrector(Processor): - def __init__(self, *args, **kwargs): - ocrd_tool = get_ocrd_tool() - kwargs['ocrd_tool'] = ocrd_tool['tools']['ocrd-cis-postcorrect'] - kwargs['version'] = ocrd_tool['version'] - super(PostCorrector, self).__init__(*args, **kwargs) + @property + def executable(self): + return 'ocrd-cis-postcorrect' def process(self): - self.log = getLogger('processor.CISPostCorrector') profiler = {} profiler["path"] = self.parameter["profilerPath"] profiler["config"] = self.parameter["profilerConfig"] profiler["noCache"] = True self.parameter["profiler"] = profiler self.parameter["runDM"] = True - self.log.debug(json.dumps(self.parameter, indent=4)) + self.logger.debug(json.dumps(self.parameter, indent=4)) p = JavaPostCorrector(self.workspace.mets_target, self.input_file_grp, self.output_file_grp, self.parameter, - getLevelName(self.log.getEffectiveLevel())) + getLevelName(self.logger.getEffectiveLevel())) p.exe() # reload the mets file to prevent run_processor's save_mets # from overriding the results from the Java process From 2b310b4690b1a83be75cd93432ea38be7250ee35 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Fri, 23 Aug 2024 13:51:07 +0200 Subject: [PATCH 76/97] revert: ocropy.ocrolib changes --- ocrd_cis/ocropy/ocrolib/morph.py | 18 ++++++++---------- ocrd_cis/ocropy/ocrolib/toplevel.py | 26 ++++++++++++++++++-------- 2 files changed, 26 insertions(+), 18 deletions(-) diff --git a/ocrd_cis/ocropy/ocrolib/morph.py b/ocrd_cis/ocropy/ocrolib/morph.py index b9619cca..f7ccdc31 100644 --- a/ocrd_cis/ocropy/ocrolib/morph.py +++ b/ocrd_cis/ocropy/ocrolib/morph.py @@ -292,9 +292,8 @@ def propagate_labels_majority(image,labels): with the largest overlap.""" rlabels,_ = label(image) cors = correspondences(rlabels,labels) - amax_rlabels = amax(rlabels) + 1 - outputs = zeros(amax_rlabels,'i') - counts = zeros(amax_rlabels,'i') + outputs = zeros(amax(rlabels)+1,'i') + counts = zeros(amax(rlabels)+1,'i') for rlabel, label_, count in cors.T: if not rlabel or not label_: # ignore background correspondences @@ -348,13 +347,12 @@ def all_neighbors(image, dist=1, bg=NaN): """Given an image with labels, find all pairs of labels that are directly (up to ``dist``) neighboring each other, ignoring the label ``bg``.""" q = 100000 - 
assert amax(image) < q
-    assert amin(image) >= 0
-    q_image = q * image
-    u = unique(q_image + shift(image, (dist, 0), order=0, cval=bg))
-    d = unique(q_image + shift(image, (-dist, 0), order=0, cval=bg))
-    l = unique(q_image + shift(image, (0, dist), order=0, cval=bg))
-    r = unique(q_image + shift(image, (0, -dist), order=0, cval=bg))
+    assert amax(image)<q
+    assert amin(image)>=0
+    u = unique(q*image+shift(image, (dist, 0), order=0, cval=bg))
+    d = unique(q*image+shift(image, (-dist, 0), order=0, cval=bg))
+    l = unique(q*image+shift(image, (0, dist), order=0, cval=bg))
+    r = unique(q*image+shift(image, (0, -dist), order=0, cval=bg))
     all = unique(r_[u,d,l,r])
     all = all[all!=bg]
     all = c_[all//q,all%q]
diff --git a/ocrd_cis/ocropy/ocrolib/toplevel.py b/ocrd_cis/ocropy/ocrolib/toplevel.py
index 72e397af..87ed18c5 100644
--- a/ocrd_cis/ocropy/ocrolib/toplevel.py
+++ b/ocrd_cis/ocropy/ocrolib/toplevel.py
@@ -125,10 +125,14 @@ def __init__(self,*args,**kw):
         self.fun = kw.get("fun","?")
         self.var = kw.get("var","?")
         self.description = " ".join([strc(x) for x in args])
-
     def __str__(self):
-        return f"\nCheckError for argument {str(self.var)} of function {str(self.fun)}\n{self.description}"
-
+        result = "\nCheckError for argument "
+        result += str(self.var)
+        result += " of function "
+        result += str(self.fun)
+        result += "\n"
+        result += self.description
+        return result
 
 class CheckWarning(CheckError):
     def __init__(self,*args,**kw):
@@ -138,8 +142,14 @@ def __init__(self,*args,**kw):
         CheckError.__init__(self, *args, **kw)
 
     def __str__(self):
-        return (f"\nCheckWarning for argument {str(self.var)} of function {str(self.fun)}\n{self.description} "
-                f"(This can happen occasionally during normal operations and isn't necessarily a bug or problem.)\n")
+        result = "\nCheckWarning for argument "
+        result += str(self.var)
+        result += " of function "
+        result += str(self.fun)
+        result += "\n"
+        result += self.description
+        result += "(This can happen occasionally during normal operations and isn't necessarily a bug or problem.)\n"
+        return result
 
 def checktype(value,type_):
     """Check value against the type spec. 
If everything @@ -201,7 +211,7 @@ def argument_checks(*args,**kw): e.var = var raise e except: - LOG.critical(f"unknown exception while checking function: '{name}'") + LOG.critical("unknown exception while checking function: '%s'", name) raise result = f(*args,**kw) checktype(result,kw.get("_",True)) @@ -215,9 +225,9 @@ def decorator(f): def wrapper(arg): if not f(arg): if warning: - raise CheckWarning(f"{strc(arg)} of type {str(type(arg))}: {str(message)}") + raise CheckWarning(strc(arg)+" of type "+str(type(arg))+": "+str(message)) else: - raise CheckError(f"{strc(arg)} of type {str(type(arg))}: {str(message)}") + raise CheckError(strc(arg)+" of type "+str(type(arg))+": "+str(message)) return wrapper return decorator From 4420c6fa246c81f1fc7c14e7a1cb6dc1d2460e5f Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Fri, 23 Aug 2024 15:06:41 +0200 Subject: [PATCH 77/97] revert: ocropy.common changes --- ocrd_cis/ocropy/common.py | 186 +++++++++++++++++++------------------- 1 file changed, 93 insertions(+), 93 deletions(-) diff --git a/ocrd_cis/ocropy/common.py b/ocrd_cis/ocropy/common.py index a5806517..c23e89b9 100644 --- a/ocrd_cis/ocropy/common.py +++ b/ocrd_cis/ocropy/common.py @@ -184,19 +184,16 @@ def estimate_thresholds(flat, bignore=0.1, escale=1.0, lo=5, hi=90): d0, d1 = flat.shape o0, o1 = int(bignore * d0), int(bignore * d1) est = flat[o0:d0 - o0, o1:d1 - o1] - if escale > 0: # by default, we use only regions that contain # significant variance; this makes the percentile # based low and high estimates more reliable e = escale - e_20_0 = e * 20.0 - e_50 = int(e * 50) - v = est - filters.gaussian_filter(est, e_20_0) - v = filters.gaussian_filter(v ** 2, e_20_0) ** 0.5 + v = est - filters.gaussian_filter(est, e*20.0) + v = filters.gaussian_filter(v ** 2, e*20.0) ** 0.5 v = (v > 0.3 * np.amax(v)) - v = morphology.binary_dilation(v, structure=np.ones((e_50, 1))) - v = morphology.binary_dilation(v, structure=np.ones((1, e_50))) + v = morphology.binary_dilation(v, structure=np.ones((int(e * 50), 1))) + v = morphology.binary_dilation(v, structure=np.ones((1, int(e * 50)))) est = est[v] lo = stats.scoreatpercentile(est.ravel(), lo) hi = stats.scoreatpercentile(est.ravel(), hi) @@ -313,24 +310,24 @@ def check_line(binary, zoom=1.0): Returns an error report, or None if valid. 
""" - if np.prod(binary.shape) == 0: return "image dimensions are zero" - if len(binary.shape) == 3: return f"image is not monochrome {binary.shape}" + if np.prod(binary.shape)==0: return "image dimensions are zero" + if len(binary.shape)==3: return "image is not monochrome %s"%(binary.shape,) if np.amax(binary)==np.amin(binary): return "image is blank" if np.mean(binary)200/zoom: return f"image too tall for a text line {binary.shape}" + if h<20/zoom: return "image not tall enough for a text line %s"%(binary.shape,) + if h>200/zoom: return "image too tall for a text line %s"%(binary.shape,) ##if w<1.5*h: return "line too short %s"%(binary.shape,) - if w<1.5*h and w<32/zoom: return f"image too short for a line image {binary.shape}" - if w>4000/zoom: return f"image too long for a line image {binary.shape}" + if w<1.5*h and w<32/zoom: return "image too short for a line image %s"%(binary.shape,) + if w>4000/zoom: return "image too long for a line image %s"%(binary.shape,) return None ratio = w*1.0/h _, ncomps = measurements.label(binary) lo = int(0.5*ratio+0.5) hi = int(4*ratio)+1 - if ncomps={lo})" - ##if ncomps>hi*ratio: return f"too many connected components (got {ncomps}, wanted <={hi})" - if ncomps>hi*ratio and ncomps>10: return f"too many connected components (got {ncomps}, wanted <={hi})" + if ncomps=%d)"%(ncomps,lo) + ##if ncomps>hi*ratio: return "too many connected components (got %d, wanted <=%d)"%(ncomps,hi) + if ncomps>hi*ratio and ncomps>10: return "too many connected components (got %d, wanted <=%d)"%(ncomps,hi) return None # inspired by ocropus-gpageseg check_page @@ -344,21 +341,21 @@ def check_region(binary, zoom=1.0): Returns an error report, or None if valid. """ - if np.prod(binary.shape) == 0: return "image dimensions are zero" - if len(binary.shape) == 3: return f"image is not monochrome {binary.shape}" + if np.prod(binary.shape)==0: return "image dimensions are zero" + if len(binary.shape)==3: return "image is not monochrome %s"%(binary.shape,) if np.amax(binary)==np.amin(binary): return "image is blank" if np.mean(binary)5000/zoom: return f"image too tall for a region image {binary.shape}" - if w<100/zoom: return f"image too narrow for a region image {binary.shape}" - if w>5000/zoom: return f"image too wide for a region image {binary.shape}" + if h<45/zoom: return "image not tall enough for a region image %s"%(binary.shape,) + if h>5000/zoom: return "image too tall for a region image %s"%(binary.shape,) + if w<100/zoom: return "image too narrow for a region image %s"%(binary.shape,) + if w>5000/zoom: return "image too wide for a region image %s"%(binary.shape,) return None # zoom factor (DPI relative) and 4 (against fragmentation from binarization) slots = int(w*h*1.0/(30*30)*zoom*zoom) * 4 _,ncomps = measurements.label(binary) - if ncomps<5: return f"too few connected components for a region image (got {ncomps})" - if ncomps>slots and ncomps>10: return f"too many connected components for a region image ({ncomps} > {slots})" + if ncomps<5: return "too few connected components for a region image (got %d)"%(ncomps,) + if ncomps>slots and ncomps>10: return "too many connected components for a region image (%d > %d)"%(ncomps,slots) return None # from ocropus-gpageseg, but with zoom parameter @@ -372,21 +369,21 @@ def check_page(binary, zoom=1.0): Returns an error report, or None if valid. 
""" - if np.prod(binary.shape) == 0: return "image dimensions are zero" - if len(binary.shape) == 3: return f"image not monochrome {binary.shape}" + if np.prod(binary.shape)==0: return "image dimensions are zero" + if len(binary.shape)==3: return "image not monochrome %s"%(binary.shape,) if np.amax(binary)==np.amin(binary): return "image is blank" if np.mean(binary)20000/zoom: return f"image too tall for a page image {binary.shape}" - if w<600/zoom: return f"image too narrow for a page image {binary.shape}" - if w>20000/zoom: return f"image too wide for a page image {binary.shape}" + if h<600/zoom: return "image not tall enough for a page image %s"%(binary.shape,) + if h>20000/zoom: return "image too tall for a page image %s"%(binary.shape,) + if w<600/zoom: return "image too narrow for a page image %s"%(binary.shape,) + if w>20000/zoom: return "image too wide for a page image %s"%(binary.shape,) return None # zoom factor (DPI relative) and 4 (against fragmentation from binarization) slots = int(w*h*1.0/(30*30)*zoom*zoom) * 4 _,ncomps = measurements.label(binary) - if ncomps<10: return f"too few connected components for a page image (got {ncomps})" - if ncomps>slots and ncomps>10: return f"too many connected components for a page image ({ncomps} > {slots})" + if ncomps<10: return "too few connected components for a page image (got %d)"%(ncomps,) + if ncomps>slots and ncomps>10: return "too many connected components for a page image (%d > %d)"%(ncomps,slots) return None def odd(num): @@ -479,13 +476,8 @@ def compute_images(binary, scale, maximages=5): #images = morph.rb_closing(images, (d0,d1)) #DSAVE('images1_closed', images+0.6*binary) # 1- filter largest connected components - binary_0_6 = 0.6 * binary - odd_scale = odd(scale) - odd_half_scale = odd(scale / 2) - odd_doubled_scale = odd(2 * scale) - region_min = (4 * scale) ** 2 - images = morph.select_regions(images, sl.area, min=region_min, nbest=2 * maximages) - DSAVE('images1_large', images + binary_0_6) + images = morph.select_regions(images,sl.area,min=(4*scale)**2,nbest=2*maximages) + DSAVE('images1_large', images+0.6*binary) if not images.any(): return np.zeros_like(binary, int) # 2- open horizontally and vertically to suppress @@ -494,31 +486,31 @@ def compute_images(binary, scale, maximages=5): # single frame, because then the hull polygon # can cover/overlap large text/table parts which # we cannot discern from the actual image anymore - h_opened = morph.rb_opening(images, (1, odd_half_scale)) - DSAVE('images2_h-opened', h_opened + binary_0_6) - v_opened = morph.rb_opening(images, (odd_half_scale, 1)) - DSAVE('images2_v-opened', v_opened + binary_0_6) + h_opened = morph.rb_opening(images, (1, odd(scale/2))) + DSAVE('images2_h-opened', h_opened+0.6*binary) + v_opened = morph.rb_opening(images, (odd(scale/2), 1)) + DSAVE('images2_v-opened', v_opened+0.6*binary) # 3- close whatever remains - closed = morph.rb_closing(h_opened&v_opened, (odd_doubled_scale, odd_doubled_scale)) - DSAVE('images3_closed', closed + binary_0_6) + closed = morph.rb_closing(h_opened&v_opened, (odd(2*scale), odd(2*scale))) + DSAVE('images3_closed', closed + 0.6*binary) # 4- reconstruct the losses up to a certain distance # to avoid creeping into pure h/v-lines again but still # cover most of the large object #images = np.where(images, closed, 2) #images = morph.spread_labels(images, maxdist=scale) % 2 | closed images = morph.rb_reconstruction(closed, images, step=2, maxsteps=scale) - DSAVE('images4_reconstructed', images + binary_0_6) + 
DSAVE('images4_reconstructed', images+0.6*binary) # 5- select nbest - images = morph.select_regions(images, sl.area, min=region_min, nbest=maximages) - DSAVE('images5_selected', images + binary_0_6) + images = morph.select_regions(images, sl.area, min=(4*scale)**2, nbest=maximages) + DSAVE('images5_selected', images+0.6*binary) if not images.any(): return np.zeros_like(binary, int) # 6- dilate a little to get a smooth contour without gaps - dilated = morph.r_dilation(images, (odd_scale, odd_scale)) + dilated = morph.r_dilation(images, (odd(scale), odd(scale))) images = morph.propagate_labels_majority(binary, dilated+1) images = morph.spread_labels(images, maxdist=scale)==2 images, _ = morph.label(images) - DSAVE('images6_dilated', images + binary_0_6) + DSAVE('images6_dilated', images+0.6*binary) # we could repeat reconstruct-dilate here... return images @@ -556,7 +548,6 @@ def compute_seplines(binary, scale, maxseps=0): sepsizes = [0] sepslices = [None] sepdists = [0] - doubled_scale = 2 * scale for label in range(1, nlabels + 1): labelslice = slices[label] labelmask = labels == label @@ -608,8 +599,8 @@ def compute_seplines(binary, scale, maxseps=0): binmask = sublabels == bin + 1 binlabels, nbinlabels = morph.label(binmask) _, binlabelcounts = np.unique(binlabels, return_counts=True) - largemask = (binlabelcounts > doubled_scale)[binlabels] - smallmask = (binlabelcounts <= doubled_scale)[binlabels] + largemask = (binlabelcounts > 2 * scale)[binlabels] + smallmask = (binlabelcounts <= 2 * scale)[binlabels] sublabels2[binmask & smallmask] = 1 if not np.any(binmask & largemask): continue @@ -1852,13 +1843,11 @@ def find_topological(): else: llab[box] = lbinary[box] # show projection at the sides - log_y = -10 * np.log(y + 1e-9) - log_x = -10 * np.log(x + 1e-9) - for i in range(int(scale / 2)): - llab[box[0], box[1].start + i] = log_y - llab[box[0], box[1].stop - 1 - i] = log_y - llab[box[0].start + i, box[1]] = log_x - llab[box[0].stop - 1 - i, box[1]] = log_x + for i in range(int(scale/2)): + llab[box[0],box[1].start+i] = -10*np.log(y+1e-9) + llab[box[0],box[1].stop-1-i] = -10*np.log(y+1e-9) + llab[box[0].start+i,box[1]] = -10*np.log(x+1e-9) + llab[box[0].stop-1-i,box[1]] = -10*np.log(x+1e-9) DSAVE('recursive_x_y_cut_' + (partition_type or 'sliced'), llab) gap_weights = list() for is_horizontal, profile in enumerate([y, x]): @@ -1888,19 +1877,19 @@ def find_topological(): weights = weights * (1 + 0.5 * props['peak_heights']/gap_height) gap_weights.append((gaps, weights)) if debug: - orientation = 'horizontal' if is_horizontal else 'vertical' - LOG.debug(f' {orientation} gaps {gaps} {props} weights {weights}') + LOG.debug(' {} gaps {} {} weights {}'.format( + 'horizontal' if is_horizontal else 'vertical', + gaps, props, weights)) if not gaps.shape[0]: continue - half_scale = int(scale / 2) for start, stop, height in sorted(zip( props['left_ips'].astype(int), props['right_ips'].astype(int), props['peak_heights']), key=lambda x: x[2]): if is_horizontal: - llab[box[0].start+half_scale:box[0].stop-half_scale,box[1].start+start:box[1].start+stop] = -10*np.log(-height+1e-9) + llab[box[0].start+int(scale/2):box[0].stop-int(scale/2),box[1].start+start:box[1].start+stop] = -10*np.log(-height+1e-9) else: - llab[box[0].start+start:box[0].start+stop,box[1].start+half_scale:box[1].stop-half_scale] = -10*np.log(-height+1e-9) + llab[box[0].start+start:box[0].start+stop,box[1].start+int(scale/2):box[1].stop-int(scale/2)] = -10*np.log(-height+1e-9) DSAVE('recursive_x_y_cut_gaps_' + ('h' if is_horizontal 
else 'v'), llab) # heuristic (not strict) decision on x or y cut, # factors to consider: @@ -1927,27 +1916,32 @@ def find_topological(): # are not allowed y_gaps, y_weights = gap_weights[0][0], gap_weights[0][1] x_gaps, x_weights = gap_weights[1][0], gap_weights[1][1] - if debug: LOG.debug(f' all y_gaps {y_gaps} x_gaps {x_gaps}') + if debug: LOG.debug(' all y_gaps {} x_gaps {}'.format(y_gaps, x_gaps)) # suppress cuts that significantly split any line labels - min_line_scale = min_line * scale y_allowed = [not(np.any(np.intersect1d( # significant line labels above - np.nonzero(np.bincount(lbin[:gap,:].flatten(), minlength=len(objects))[1:] > min_line_scale)[0], + np.nonzero(np.bincount(lbin[:gap,:].flatten(), + minlength=len(objects))[1:] > min_line * scale)[0], # significant line labels below - np.nonzero(np.bincount(lbin[gap:,:].flatten(), minlength=len(objects))[1:] > min_line_scale)[0], - assume_unique=True))) for gap in y_gaps] + np.nonzero(np.bincount(lbin[gap:,:].flatten(), + minlength=len(objects))[1:] > min_line * scale)[0], + assume_unique=True))) + for gap in y_gaps] x_allowed = [not(np.any(np.intersect1d( # significant line labels left - np.nonzero(np.bincount(lbin[:,:gap].flatten(), minlength=len(objects))[1:] > min_line_scale)[0], + np.nonzero(np.bincount(lbin[:,:gap].flatten(), + minlength=len(objects))[1:] > min_line * scale)[0], # significant line labels right - np.nonzero(np.bincount(lbin[:,gap:].flatten(), minlength=len(objects))[1:] > min_line_scale)[0], - assume_unique=True))) for gap in x_gaps] + np.nonzero(np.bincount(lbin[:,gap:].flatten(), + minlength=len(objects))[1:] > min_line * scale)[0], + assume_unique=True))) + for gap in x_gaps] y_gaps, y_weights = y_gaps[y_allowed], y_weights[y_allowed] x_gaps, x_weights = x_gaps[x_allowed], x_weights[x_allowed] - if debug: LOG.debug(f' allowed y_gaps {y_gaps} x_gaps {x_gaps}') + if debug: LOG.debug(' allowed y_gaps {} x_gaps {}'.format(y_gaps, x_gaps)) y_prominence = np.amax(y_weights, initial=0) x_prominence = np.amax(x_weights, initial=0) - if debug: LOG.debug(f' y_prominence {y_prominence} x_prominence {x_prominence}') + if debug: LOG.debug(' y_prominence {} x_prominence {}'.format(y_prominence, x_prominence)) # suppress less prominent peaks (another heuristic...) # they must compete with the other direction next time # (when already new cuts or partitions will become visible) @@ -1955,30 +1949,33 @@ def find_topological(): x_allowed = x_weights > 0.8 * x_prominence y_gaps, y_weights = y_gaps[y_allowed], y_weights[y_allowed] x_gaps, x_weights = x_gaps[x_allowed], x_weights[x_allowed] - if debug: LOG.debug(f' prominent y_gaps {y_gaps} x_gaps {x_gaps}') + if debug: LOG.debug(' prominent y_gaps {} x_gaps {}'.format(y_gaps, x_gaps)) if npartitions > 0: # TODO this can be avoided when backtracking below # suppress peaks creating fewer partitions than others -- # how large in our preferred direction will the new partitions # of sepmask in both slices created by each cut candidate # add up? 
- y_partitionscores = [sum(map( - sl.height if prefer_vertical else sl.width, - morph.find_objects(morph.label(partitions[:gap, :] > 0)[0]) + - morph.find_objects(morph.label(partitions[gap:, :] > 0)[0]))) - for gap in y_gaps] - x_partitionscores = [sum(map( - sl.height if prefer_vertical else sl.width, - morph.find_objects(morph.label(partitions[:, : gap] > 0)[0]) + - morph.find_objects(morph.label(partitions[:, gap :] > 0)[0]))) - for gap in x_gaps] - if debug: LOG.debug(f' y_partitionscores {y_partitionscores} x_partitionscores {x_partitionscores}') + y_partitionscores = [sum(map(sl.height if prefer_vertical else sl.width, + morph.find_objects(morph.label( + partitions[:gap,:]>0)[0]) + + morph.find_objects(morph.label( + partitions[gap:,:]>0)[0]))) + for gap in y_gaps] + x_partitionscores = [sum(map(sl.height if prefer_vertical else sl.width, + morph.find_objects(morph.label( + partitions[:,:gap]>0)[0]) + + morph.find_objects(morph.label( + partitions[:,gap:]>0)[0]))) + for gap in x_gaps] + if debug: LOG.debug(' y_partitionscores {} x_partitionscores {}'.format( + y_partitionscores, x_partitionscores)) # Now identify those gaps with the largest overall score y_allowed = y_partitionscores == np.max(y_partitionscores, initial=0) x_allowed = x_partitionscores == np.max(x_partitionscores, initial=0) y_gaps, y_weights = y_gaps[y_allowed], y_weights[y_allowed] x_gaps, x_weights = x_gaps[x_allowed], x_weights[x_allowed] - if debug: LOG.debug(f' most partitioning y_gaps {y_gaps} x_gaps {x_gaps}') + if debug: LOG.debug(' most partitioning y_gaps {} x_gaps {}'.format(y_gaps, x_gaps)) else: y_partitionscores = None x_partitionscores = None @@ -1989,7 +1986,7 @@ def find_topological(): x_allowed = x_weights > 0.9 * x_prominence y_gaps, y_weights = y_gaps[y_allowed], y_weights[y_allowed] x_gaps, x_weights = x_gaps[x_allowed], x_weights[x_allowed] - if debug: LOG.debug(f' prominent y_gaps {y_gaps} x_gaps {x_gaps}') + if debug: LOG.debug(' prominent y_gaps {} x_gaps {}'.format(y_gaps, x_gaps)) # decide which direction, x or y # TODO: this most likely needs a backtracking mechanism @@ -2055,7 +2052,7 @@ def find_topological(): llab2[box] = partitions DSAVE('recursive_x_y_cut_partitions', llab2) for label in range(1, npartitions+1): - LOG.debug(f'next partition %d on %s', label, box) + LOG.debug('next partition %d on %s', label, box) recursive_x_y_cut(box, mask=partitions==label, partition_type=new_partition_type) return @@ -2063,9 +2060,10 @@ def find_topological(): # no gaps left finalize() return - orientation = 'vertical' if choose_vertical else 'horizontal' # otherwise: cut on gaps - LOG.debug(f'cutting {orientation}ly on {box} into {gaps}') + LOG.debug('cutting %s on %s into %s', 'vertically' + if choose_vertical else 'horizontally', + box, gaps) cuts = list(zip(np.insert(gaps, 0, 0), np.append(gaps, lim))) if choose_vertical: if rl: @@ -2080,7 +2078,9 @@ def find_topological(): sub = sl.box(0, len(y), start, stop) else: # "cut in horizontal direction" sub = sl.box(start, stop, 0, len(x)) - LOG.debug(f'next {orientation} block on {box} is {sub}') + LOG.debug('next %s block on %s is %s', 'horizontal' + if choose_vertical else 'vertical', + box, sub) recursive_x_y_cut(sl.compose(box,sub), mask=sl.cut(mask,sub) if isinstance(mask, np.ndarray) else None) From 2d8650ed51f5e9cc627d95ae5aea217b9f7bacb6 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Fri, 23 Aug 2024 15:15:50 +0200 Subject: [PATCH 78/97] remove whitespaces in ocropy.common and ocropy.ocrolib --- ocrd_cis/ocropy/common.py | 18 
+++++++++--------- ocrd_cis/ocropy/ocrolib/morph.py | 8 ++++---- 2 files changed, 13 insertions(+), 13 deletions(-) diff --git a/ocrd_cis/ocropy/common.py b/ocrd_cis/ocropy/common.py index c23e89b9..c5b56ed0 100644 --- a/ocrd_cis/ocropy/common.py +++ b/ocrd_cis/ocropy/common.py @@ -189,8 +189,8 @@ def estimate_thresholds(flat, bignore=0.1, escale=1.0, lo=5, hi=90): # significant variance; this makes the percentile # based low and high estimates more reliable e = escale - v = est - filters.gaussian_filter(est, e*20.0) - v = filters.gaussian_filter(v ** 2, e*20.0) ** 0.5 + v = est - filters.gaussian_filter(est, e * 20.0) + v = filters.gaussian_filter(v ** 2, e * 20.0) ** 0.5 v = (v > 0.3 * np.amax(v)) v = morphology.binary_dilation(v, structure=np.ones((int(e * 50), 1))) v = morphology.binary_dilation(v, structure=np.ones((1, int(e * 50)))) @@ -491,8 +491,8 @@ def compute_images(binary, scale, maximages=5): v_opened = morph.rb_opening(images, (odd(scale/2), 1)) DSAVE('images2_v-opened', v_opened+0.6*binary) # 3- close whatever remains - closed = morph.rb_closing(h_opened&v_opened, (odd(2*scale), odd(2*scale))) - DSAVE('images3_closed', closed + 0.6*binary) + closed = morph.rb_closing(h_opened&v_opened, (odd(2*scale),odd(2*scale))) + DSAVE('images3_closed', closed+0.6*binary) # 4- reconstruct the losses up to a certain distance # to avoid creeping into pure h/v-lines again but still # cover most of the large object @@ -501,12 +501,12 @@ def compute_images(binary, scale, maximages=5): images = morph.rb_reconstruction(closed, images, step=2, maxsteps=scale) DSAVE('images4_reconstructed', images+0.6*binary) # 5- select nbest - images = morph.select_regions(images, sl.area, min=(4*scale)**2, nbest=maximages) + images = morph.select_regions(images,sl.area,min=(4*scale)**2,nbest=maximages) DSAVE('images5_selected', images+0.6*binary) if not images.any(): return np.zeros_like(binary, int) # 6- dilate a little to get a smooth contour without gaps - dilated = morph.r_dilation(images, (odd(scale), odd(scale))) + dilated = morph.r_dilation(images, (odd(scale),odd(scale))) images = morph.propagate_labels_majority(binary, dilated+1) images = morph.spread_labels(images, maxdist=scale)==2 images, _ = morph.label(images) @@ -1969,7 +1969,7 @@ def find_topological(): partitions[:,gap:]>0)[0]))) for gap in x_gaps] if debug: LOG.debug(' y_partitionscores {} x_partitionscores {}'.format( - y_partitionscores, x_partitionscores)) + y_partitionscores, x_partitionscores)) # Now identify those gaps with the largest overall score y_allowed = y_partitionscores == np.max(y_partitionscores, initial=0) x_allowed = x_partitionscores == np.max(x_partitionscores, initial=0) @@ -2062,7 +2062,7 @@ def find_topological(): return # otherwise: cut on gaps LOG.debug('cutting %s on %s into %s', 'vertically' - if choose_vertical else 'horizontally', + if choose_vertical else 'horizontally', box, gaps) cuts = list(zip(np.insert(gaps, 0, 0), np.append(gaps, lim))) if choose_vertical: @@ -2079,7 +2079,7 @@ def find_topological(): else: # "cut in horizontal direction" sub = sl.box(start, stop, 0, len(x)) LOG.debug('next %s block on %s is %s', 'horizontal' - if choose_vertical else 'vertical', + if choose_vertical else 'vertical', box, sub) recursive_x_y_cut(sl.compose(box,sub), mask=sl.cut(mask,sub) if isinstance(mask, np.ndarray) diff --git a/ocrd_cis/ocropy/ocrolib/morph.py b/ocrd_cis/ocropy/ocrolib/morph.py index f7ccdc31..7d6ffc85 100644 --- a/ocrd_cis/ocropy/ocrolib/morph.py +++ b/ocrd_cis/ocropy/ocrolib/morph.py @@ -349,10 +349,10 
@@ def all_neighbors(image, dist=1, bg=NaN):
     q = 100000
     assert amax(image)<q
     assert amin(image)>=0
-    u = unique(q*image+shift(image, (dist, 0), order=0, cval=bg))
-    d = unique(q*image+shift(image, (-dist, 0), order=0, cval=bg))
-    l = unique(q*image+shift(image, (0, dist), order=0, cval=bg))
-    r = unique(q*image+shift(image, (0, -dist), order=0, cval=bg))
+    u = unique(q*image+shift(image,(dist,0),order=0,cval=bg))
+    d = unique(q*image+shift(image,(-dist,0),order=0,cval=bg))
+    l = unique(q*image+shift(image,(0,dist),order=0,cval=bg))
+    r = unique(q*image+shift(image,(0,-dist),order=0,cval=bg))
     all = unique(r_[u,d,l,r])
     all = all[all!=bg]
     all = c_[all//q,all%q]

From 9a153b079a3684bf875b306ba8eaad9e1637eeed Mon Sep 17 00:00:00 2001
From: Robert Sachunsky 
Date: Sun, 25 Aug 2024 02:01:36 +0200
Subject: [PATCH 79/97] postcorrect: adapt to frozendict Processor.parameter in v3

---
 ocrd_cis/__init__.py         |  1 -
 ocrd_cis/align/cli.py        |  1 -
 ocrd_cis/ocrd_tool.py        |  6 ----
 ocrd_cis/ocropy/binarize.py  |  6 +---
 ocrd_cis/ocropy/clip.py      |  9 +-----
 ocrd_cis/ocropy/denoise.py   |  8 +-----
 ocrd_cis/ocropy/deskew.py    |  8 +-----
 ocrd_cis/ocropy/dewarp.py    |  4 ---
 ocrd_cis/ocropy/recognize.py |  7 ++---
 ocrd_cis/ocropy/resegment.py |  9 +-----
 ocrd_cis/ocropy/segment.py   | 47 +++++++++++++++---------------
 ocrd_cis/ocropy/train.py     |  7 +----
 ocrd_cis/postcorrect/cli.py  | 55 +++++++++++++++++++++---------------
 13 files changed, 63 insertions(+), 105 deletions(-)
 delete mode 100644 ocrd_cis/ocrd_tool.py

diff --git a/ocrd_cis/__init__.py b/ocrd_cis/__init__.py
index 6f37f4f7..9d22fe3e 100644
--- a/ocrd_cis/__init__.py
+++ b/ocrd_cis/__init__.py
@@ -1,3 +1,2 @@
 from .javaprocess import JavaAligner
 from .javaprocess import JavaPostCorrector
-from .ocrd_tool import get_ocrd_tool
diff --git a/ocrd_cis/align/cli.py b/ocrd_cis/align/cli.py
index f5e47785..5706461e 100644
--- a/ocrd_cis/align/cli.py
+++ b/ocrd_cis/align/cli.py
@@ -11,7 +11,6 @@
return Image.fromarray(th), 0 class OcropyBinarize(Processor): - logger: Logger - @property def executable(self): return 'ocrd-cis-ocropy-binarize' def setup(self): - self.logger = getLogger('processor.OcropyBinarize') method = self.parameter['method'] if self.parameter['grayscale'] and method != 'ocropy': self.logger.critical(f'Requested method {method} does not support grayscale normalized output') diff --git a/ocrd_cis/ocropy/clip.py b/ocrd_cis/ocropy/clip.py index b81c731c..18a0c115 100644 --- a/ocrd_cis/ocropy/clip.py +++ b/ocrd_cis/ocropy/clip.py @@ -8,13 +8,11 @@ from shapely.prepared import prep from ocrd_models.ocrd_page import AlternativeImageType, OcrdPage -from ocrd import Processor -from ocrd.processor import OcrdPageResult, OcrdPageResultImage +from ocrd import Processor, OcrdPageResult, OcrdPageResultImage from ocrd_utils import ( bbox_from_polygon, coordinates_of_segment, crop_image, - getLogger, image_from_polygon, polygon_from_points, polygon_mask, @@ -25,15 +23,10 @@ class OcropyClip(Processor): - logger: Logger - @property def executable(self): return 'ocrd-cis-ocropy-clip' - def setup(self): - self.logger = getLogger('processor.OcropyClip') - def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: str = None) -> OcrdPageResult: """Clip text regions / lines of a page at intersections with neighbours. diff --git a/ocrd_cis/ocropy/denoise.py b/ocrd_cis/ocropy/denoise.py index eb3e7d23..eaed74df 100644 --- a/ocrd_cis/ocropy/denoise.py +++ b/ocrd_cis/ocropy/denoise.py @@ -4,21 +4,15 @@ from ocrd_utils import getLogger from ocrd_models.ocrd_page import AlternativeImageType, OcrdPage -from ocrd import Processor -from ocrd.processor import OcrdPageResult, OcrdPageResultImage +from ocrd import Processor, OcrdPageResult, OcrdPageResultImage from .common import determine_zoom, remove_noise class OcropyDenoise(Processor): - logger: Logger - @property def executable(self): return 'ocrd-cis-ocropy-denoise' - def setup(self): - self.logger = getLogger('processor.OcropyDenoise') - def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional[str] = None) -> OcrdPageResult: """Despeckle the pages / regions / lines of the workspace. diff --git a/ocrd_cis/ocropy/deskew.py b/ocrd_cis/ocropy/deskew.py index 7bdbba2d..b02c69d5 100644 --- a/ocrd_cis/ocropy/deskew.py +++ b/ocrd_cis/ocropy/deskew.py @@ -4,8 +4,7 @@ from ocrd_utils import getLogger from ocrd_models.ocrd_page import AlternativeImageType, OcrdPage, PageType -from ocrd import Processor -from ocrd.processor import OcrdPageResult, OcrdPageResultImage +from ocrd import Processor, OcrdPageResult, OcrdPageResultImage from . import common from .common import pil2array @@ -16,15 +15,10 @@ def deskew(pil_image, maxskew=2): return angle class OcropyDeskew(Processor): - logger: Logger - @property def executable(self): return 'ocrd-cis-ocropy-deskew' - def setup(self): - self.logger = getLogger('processor.OcropyDeskew') - def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional[str] = None) -> OcrdPageResult: """Deskew the pages or regions of the workspace. 
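The conversions above all follow the same ocrd v3 processor pattern: the executable property replaces the old module-level TOOL constant for the ocrd-tool.json lookup, self.logger comes from the Processor base class (hence the removed getLogger calls and logger: Logger annotations), and per-page work lives in process_page_pcgts. A minimal sketch of that pattern, assuming the v3 base-class behaviour implied by these patches; the class name and entry-point name below are illustrative only:

from typing import Optional
from ocrd import Processor, OcrdPageResult
from ocrd_models.ocrd_page import OcrdPage

class OcropyExample(Processor):
    """Hypothetical ocrd_cis-style processor, sketched for illustration."""

    @property
    def executable(self):
        # CLI entry-point name; the base class uses it to pick the matching
        # section of ocrd-tool.json (this replaces the old TOOL constant)
        return 'ocrd-cis-ocropy-example'

    def setup(self):
        # one-time initialisation; self.parameter is already validated here
        # and self.logger is provided by the base class
        self.logger.info("setup with parameters %s", dict(self.parameter))

    def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage],
                           page_id: Optional[str] = None) -> OcrdPageResult:
        # one PcGts per input file group; modify and return the first one
        pcgts = input_pcgts[0]
        self.logger.debug("processing page %s", page_id)
        return OcrdPageResult(pcgts)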
diff --git a/ocrd_cis/ocropy/dewarp.py b/ocrd_cis/ocropy/dewarp.py index 17d0b4ce..e33ce024 100644 --- a/ocrd_cis/ocropy/dewarp.py +++ b/ocrd_cis/ocropy/dewarp.py @@ -5,7 +5,6 @@ from ocrd import Processor from ocrd.processor import OcrdPageResult, OcrdPageResultImage -from ocrd_utils import getLogger from ocrd_models.ocrd_page import AlternativeImageType, OcrdPage from .ocrolib import lineest @@ -54,14 +53,11 @@ def padvert(image, range_): return array2pil(line) class OcropyDewarp(Processor): - logger: Logger - @property def executable(self): return 'ocrd-cis-ocropy-dewarp' def setup(self): - self.logger = getLogger('processor.OcropyDewarp') # defaults from ocrolib.lineest: self.lnorm = lineest.CenterNormalizer( params=(self.parameter['range'], diff --git a/ocrd_cis/ocropy/recognize.py b/ocrd_cis/ocropy/recognize.py index 02d29e7c..85a76585 100644 --- a/ocrd_cis/ocropy/recognize.py +++ b/ocrd_cis/ocropy/recognize.py @@ -10,10 +10,9 @@ from rapidfuzz.distance import Levenshtein -from ocrd_utils import coordinates_for_segment, getLogger, points_from_polygon, polygon_from_bbox +from ocrd_utils import coordinates_for_segment, points_from_polygon, polygon_from_bbox from ocrd_models.ocrd_page import CoordsType, GlyphType, OcrdPage, TextEquivType, WordType -from ocrd import Processor -from ocrd.processor import OcrdPageResult +from ocrd import Processor, OcrdPageResult from .common import check_line, pil2array from .ocrolib import lstm, load_object, midrange @@ -67,7 +66,6 @@ def recognize(image, pad, network, check=True): class OcropyRecognize(Processor): - logger: Logger network: Any pad: int @@ -76,7 +74,6 @@ def executable(self): return 'ocrd-cis-ocropy-recognize' def setup(self): - self.logger = getLogger('processor.OcropyRecognize') self.pad = 16 # from ocropus-rpred: self.network = load_object(self.get_model(), verbose=1) diff --git a/ocrd_cis/ocropy/resegment.py b/ocrd_cis/ocropy/resegment.py index c1809569..0fb133c0 100644 --- a/ocrd_cis/ocropy/resegment.py +++ b/ocrd_cis/ocropy/resegment.py @@ -9,7 +9,6 @@ from shapely.prepared import prep from ocrd_utils import ( - getLogger, coordinates_of_segment, coordinates_for_segment, points_from_polygon, @@ -17,8 +16,7 @@ transform_coordinates, ) from ocrd_models.ocrd_page import BaselineType, PageType, OcrdPage -from ocrd import Processor -from ocrd.processor import OcrdPageResult +from ocrd import Processor, OcrdPageResult from .ocrolib import midrange, morph from .common import ( @@ -43,15 +41,10 @@ ) class OcropyResegment(Processor): - logger: Logger - @property def executable(self): return 'ocrd-cis-ocropy-resegment' - def setup(self): - self.logger = getLogger('processor.OcropyResegment') - def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional[str] = None) -> OcrdPageResult: """Resegment lines of the workspace. 
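The "frozendict" in this patch's subject refers to self.parameter becoming a read-only mapping once it has been validated against the tool's parameter schema; the post-correction processor further below therefore copies it into a plain dict before adding runtime-only keys. A condensed sketch of that workaround, under the same assumption:

from ocrd import Processor

class PostCorrectLike(Processor):  # hypothetical subclass, for illustration only
    @property
    def executable(self):
        return 'ocrd-cis-postcorrect'  # as in the real processor below

    def setup(self):
        # self.parameter is frozen (read-only) once validated against the schema,
        # so copy it into a plain dict before adding keys the schema does not know
        self.params = dict(self.parameter)
        self.params["runDM"] = True  # extra runtime key, as the patch below adds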
diff --git a/ocrd_cis/ocropy/segment.py b/ocrd_cis/ocropy/segment.py index b363cbd2..493deb30 100644 --- a/ocrd_cis/ocropy/segment.py +++ b/ocrd_cis/ocropy/segment.py @@ -16,7 +16,6 @@ from shapely import set_precision from ocrd_utils import ( - getLogger, coordinates_of_segment, coordinates_for_segment, points_from_polygon, @@ -243,21 +242,17 @@ def getx(xy): class OcropySegment(Processor): - logger: Logger - @property def executable(self): return 'ocrd-cis-ocropy-segment' - def setup(self): - self.logger = getLogger('processor.OcropySegment') - def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional[str] = None) -> OcrdPageResult: """Segment pages into regions+lines, tables into cells+lines, or regions into lines. - + Open and deserialise PAGE input file and its respective images, then iterate over the element hierarchy down to the requested level. - + + \b Depending on ``level-of-operation``, consider existing segments: - If ``overwrite_separators=True`` on ``page`` level, then delete any SeparatorRegions. @@ -270,12 +265,13 @@ def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional - If ``overwrite_order=True`` on ``page`` or ``table`` level, then delete the reading order OrderedGroup entry corresponding to the (page/table) segment. - + Next, get each element image according to the layout annotation (from the alternative image of the page/region, or by cropping via coordinates into the higher-level image) in binarized form, and represent it as an array with non-text regions and (remaining) text neighbours suppressed. - + + \b Then compute a text line segmentation for that array (as a label mask). When ``level-of-operation`` is ``page`` or ``table``, this also entails detecting @@ -284,25 +280,26 @@ def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional - up to ``maxcolseps`` background column separators before text line segmentation itself, as well as aggregating text lines to text regions afterwards. - + Text regions are detected via a hybrid variant recursive X-Y cut algorithm (RXYC): RXYC partitions the binarized image in top-down manner by detecting horizontal or vertical gaps. This implementation uses the bottom-up text line segmentation to guide the search, and also uses both pre-existing and newly detected separators to alternatively partition the respective boxes into non-rectangular parts. - + During line segmentation, suppress the foreground of all previously annotated regions (of any kind) and lines, except if just removed due to ``overwrite``. During region aggregation however, combine the existing separators with the new-found separators to guide the column search. - + All detected segments (both text line and text region) are sorted according to their reading order (assuming a top-to-bottom, left-to-right ordering). When ``level-of-operation`` is ``page``, prefer vertical (column-first) succession of regions. When it is ``table``, prefer horizontal (row-first) succession of cells. - + + \b Then for each resulting segment label, convert its background mask into polygon outlines by finding the outer contours consistent with the element's polygon outline. Annotate the result by adding it as a new TextLine/TextRegion: @@ -314,7 +311,7 @@ def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional - If it is ``page``, then append the new lines to their respective regions, and append the new regions to the page. (Also, create an OrderedGroup for it in the ReadingOrder.) 
- + Produce a new output file by serialising the resulting hierarchy. """ # FIXME: allow passing a-priori info on reading order / textline order @@ -495,13 +492,13 @@ def _process_element(self, element, ignore, image, coords, zoom=1.0, rogroup=Non Given a PageType, TableRegionType or TextRegionType ``element``, and a corresponding binarized PIL.Image object ``image`` with coordinate metadata ``coords``, run line segmentation with Ocropy. - + If operating on the full page (or table), then also detect horizontal and vertical separators, and aggregate the lines into text regions afterwards. - + Add the resulting sub-segments to the parent ``element``. - + If ``ignore`` is not empty, then first suppress all foreground components in any of those segments' coordinates during segmentation, and if also in full page/table mode, then combine all separators among them with the @@ -773,7 +770,7 @@ def _process_element(self, element, ignore, image, coords, zoom=1.0, rogroup=Non def polygon_for_parent(polygon, parent): """Clip polygon to parent polygon range. - + (Should be moved to ocrd_utils.coordinates_for_segment.) """ childp = Polygon(polygon) @@ -986,7 +983,7 @@ def join_baselines(logger: Logger, baselines, loc=''): def page_get_reading_order(ro, rogroup): """Add all elements from the given reading order group to the given dictionary. - + Given a dict ``ro`` from layout element IDs to ReadingOrder element objects, and an object ``rogroup`` with additional ReadingOrder element objects, add all references to the dict, traversing the group recursively. @@ -1006,10 +1003,10 @@ def page_get_reading_order(ro, rogroup): def page_add_to_reading_order(rogroup, region_id, index=None): """Add a region reference to an un/ordered RO group. - + Given a ReadingOrder group ``rogroup`` (of any type), append a reference to region ``region_id`` to it. - + If ``index`` is given, use that as position and return incremented by one. (This must be an integer if ``rogroup`` is an OrderedGroup(Indexed). @@ -1025,16 +1022,16 @@ def page_add_to_reading_order(rogroup, region_id, index=None): def page_subgroup_in_reading_order(logger: Logger, roelem): """Replace given RO element by an equivalent OrderedGroup. - + Given a ReadingOrder element ``roelem`` (of any type), first look up its parent group. Remove it from the respective member list (of its region refs or un/ordered groups), even if it already was an OrderedGroup(Indexed). - + Then instantiate an empty OrderedGroup(Indexed), referencing the same region as ``roelem`` (and using the same index, if any). Add that group to the parent instead. - + Return the new group object. 
""" if not roelem: diff --git a/ocrd_cis/ocropy/train.py b/ocrd_cis/ocropy/train.py index 6c627231..78302f12 100644 --- a/ocrd_cis/ocropy/train.py +++ b/ocrd_cis/ocropy/train.py @@ -7,9 +7,7 @@ from os.path import abspath, dirname, exists, join, isfile from ocrd_models import OcrdPage -from ocrd import Processor, Workspace -from ocrd.processor import OcrdPageResult -from ocrd_utils import getLogger +from ocrd import Processor, Workspace, OcrdPageResult from .ocropus_rtrain import * from .binarize import binarize @@ -30,9 +28,7 @@ def resize_keep_ratio(image, baseheight=48): class OcropyTrain(Processor): - logger: Logger modelpath: str - old_cwd: str outputpath: str @property @@ -40,7 +36,6 @@ def executable(self): return 'ocrd-cis-ocropy-train' def setup(self): - self.logger = getLogger('processor.OcropyTrain') if 'model' in self.parameter: model = self.parameter['model'] try: diff --git a/ocrd_cis/postcorrect/cli.py b/ocrd_cis/postcorrect/cli.py index 71fbaad1..6759b96a 100644 --- a/ocrd_cis/postcorrect/cli.py +++ b/ocrd_cis/postcorrect/cli.py @@ -4,10 +4,9 @@ import click import json -from ocrd import Processor +from ocrd import Processor, Workspace from ocrd.decorators import ocrd_cli_options, ocrd_cli_wrap_processor -from ocrd_utils import getLogger, getLevelName -from ocrd_models.ocrd_mets import OcrdMets +from ocrd_utils import getLevelName, pushd_popd from ocrd_cis import JavaPostCorrector @@ -21,26 +20,38 @@ class PostCorrector(Processor): def executable(self): return 'ocrd-cis-postcorrect' - def process(self): + def setup(self): + # since ocrd v3.0 we cannot overwrite self.parameter anymore + # because that gets validated against the schema + # (so these additions would fail) + self.params = dict(self.parameter) profiler = {} profiler["path"] = self.parameter["profilerPath"] profiler["config"] = self.parameter["profilerConfig"] profiler["noCache"] = True - self.parameter["profiler"] = profiler - self.parameter["runDM"] = True - self.logger.debug(json.dumps(self.parameter, indent=4)) - p = JavaPostCorrector(self.workspace.mets_target, - self.input_file_grp, - self.output_file_grp, - self.parameter, - getLevelName(self.logger.getEffectiveLevel())) - p.exe() - # reload the mets file to prevent run_processor's save_mets - # from overriding the results from the Java process - self.workspace.reload_mets() - # workaround for cisocrgroup/ocrd-postcorrection#13 (absolute paths in output): - for output_file in self.workspace.find_files(file_grp=self.output_file_grp): - flocat = output_file._el.find('{http://www.loc.gov/METS/}FLocat') - flocat.attrib['LOCTYPE'] = 'OTHER' - flocat.attrib['OTHERLOCTYPE'] = 'FILE' - output_file.local_filename = os.path.relpath(output_file.local_filename, self.workspace.directory) + self.params["profiler"] = profiler + self.params["runDM"] = True + self.logger.debug(json.dumps(self.params, indent=4)) + + def process_workspace(self, workspace: Workspace): + with pushd_popd(workspace.directory): + self.workspace = workspace + self.verify() + # this CLI call mimics the OCR-D processor CLI itself + # we have no control over its interior + # (we get no page-wise error handling and input downloading) + p = JavaPostCorrector(self.workspace.mets_target, + self.input_file_grp, + self.output_file_grp, + self.params, + getLevelName(self.logger.getEffectiveLevel())) + p.exe() + # reload the mets file to prevent run_processor's save_mets + # from overriding the results from the Java process + self.workspace.reload_mets() + # workaround for cisocrgroup/ocrd-postcorrection#13 
(absolute paths in output): + for output_file in self.workspace.find_files(file_grp=self.output_file_grp): + flocat = output_file._el.find('{http://www.loc.gov/METS/}FLocat') + flocat.attrib['LOCTYPE'] = 'OTHER' + flocat.attrib['OTHERLOCTYPE'] = 'FILE' + output_file.local_filename = os.path.relpath(output_file.local_filename, self.workspace.directory) From bd0613a20fd4d7d88a466cc75f3e94be656f08bf Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Mon, 26 Aug 2024 11:36:53 +0200 Subject: [PATCH 80/97] require ocrd>=3.0.0b1 --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 38f09abd..83cf28bb 100644 --- a/setup.py +++ b/setup.py @@ -38,7 +38,7 @@ packages=find_packages(), include_package_data=True, install_requires=[ - 'ocrd>=3.0.0a1', + 'ocrd>=3.0.0b1', 'click', 'scipy', 'numpy>=1.17.0', From f6e437fc8d5ef7bbb51fa7b4f5d590a11c6fc627 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Tue, 27 Aug 2024 14:46:41 +0200 Subject: [PATCH 81/97] add: simple github actions workflow --- .github/workflow/tests.yml | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) create mode 100644 .github/workflow/tests.yml diff --git a/.github/workflow/tests.yml b/.github/workflow/tests.yml new file mode 100644 index 00000000..424409df --- /dev/null +++ b/.github/workflow/tests.yml @@ -0,0 +1,27 @@ +name: Test ocrd_cis installation and run tests + +on: + push: + branches: [ "master" ] + pull_request: + branches: [ "master" ] + +jobs: + build: + runs-on: ${{ matrix.os }} + strategy: + fail-fast: false + matrix: + python-version: [ "3.8", "3.9", "3.10", "3.11", "3.12" ] + os: [ "ubuntu-22.04" ] + + steps: + - uses: actions/checkout@v4 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + - name: Install ocrd_cis + run: make install + - name: Test ocrd_cis + run: make test From 403781a3c27e5fdb0cddcf311401dad1a24f83f8 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Tue, 27 Aug 2024 15:30:14 +0200 Subject: [PATCH 82/97] Update .github/workflow/tests.yml Co-authored-by: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> --- .github/workflow/tests.yml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/.github/workflow/tests.yml b/.github/workflow/tests.yml index 424409df..24fa0bc7 100644 --- a/.github/workflow/tests.yml +++ b/.github/workflow/tests.yml @@ -2,9 +2,8 @@ name: Test ocrd_cis installation and run tests on: push: - branches: [ "master" ] pull_request: - branches: [ "master" ] + workflow_dispatch: jobs: build: From 97083bb71e724276385058bde9244cbdd21dce64 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Tue, 27 Aug 2024 15:30:25 +0200 Subject: [PATCH 83/97] Update .github/workflow/tests.yml Co-authored-by: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> --- .github/workflow/tests.yml | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/.github/workflow/tests.yml b/.github/workflow/tests.yml index 24fa0bc7..559297dd 100644 --- a/.github/workflow/tests.yml +++ b/.github/workflow/tests.yml @@ -20,7 +20,11 @@ jobs: uses: actions/setup-python@v5 with: python-version: ${{ matrix.python-version }} + - uses: actions/setup-java@v4 + with: + distribution: 'zulu' + java-version: '11' - name: Install ocrd_cis run: make install - name: Test ocrd_cis - run: make test + run: make test V="" From 2b20e0c44da924a5b15379d86eb557acdf42b1f3 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Tue, 27 Aug 2024 
15:49:11 +0200 Subject: [PATCH 84/97] fix: checkout ref --- .github/workflow/tests.yml | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/.github/workflow/tests.yml b/.github/workflow/tests.yml index 559297dd..f95a09a4 100644 --- a/.github/workflow/tests.yml +++ b/.github/workflow/tests.yml @@ -15,7 +15,10 @@ jobs: os: [ "ubuntu-22.04" ] steps: - - uses: actions/checkout@v4 + - name: Checkout repository + uses: actions/checkout@v4 + with: + ref: ${{ github.head_ref }} - name: Set up Python ${{ matrix.python-version }} uses: actions/setup-python@v5 with: From 86a08eb5cc471eef536bc2d050e80f768a728e43 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Tue, 27 Aug 2024 16:08:48 +0200 Subject: [PATCH 85/97] Create GH Actions workflow: test.yml --- .github/workflows/test.yml | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) create mode 100644 .github/workflows/test.yml diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml new file mode 100644 index 00000000..f95a09a4 --- /dev/null +++ b/.github/workflows/test.yml @@ -0,0 +1,33 @@ +name: Test ocrd_cis installation and run tests + +on: + push: + pull_request: + workflow_dispatch: + +jobs: + build: + runs-on: ${{ matrix.os }} + strategy: + fail-fast: false + matrix: + python-version: [ "3.8", "3.9", "3.10", "3.11", "3.12" ] + os: [ "ubuntu-22.04" ] + + steps: + - name: Checkout repository + uses: actions/checkout@v4 + with: + ref: ${{ github.head_ref }} + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + - uses: actions/setup-java@v4 + with: + distribution: 'zulu' + java-version: '11' + - name: Install ocrd_cis + run: make install + - name: Test ocrd_cis + run: make test V="" From 1d7e9a0d5f72e66c92c07e15508ba330e130f6bb Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Tue, 27 Aug 2024 16:18:40 +0200 Subject: [PATCH 86/97] delete: wrong path for workflows --- .github/workflow/tests.yml | 33 --------------------------------- 1 file changed, 33 deletions(-) delete mode 100644 .github/workflow/tests.yml diff --git a/.github/workflow/tests.yml b/.github/workflow/tests.yml deleted file mode 100644 index f95a09a4..00000000 --- a/.github/workflow/tests.yml +++ /dev/null @@ -1,33 +0,0 @@ -name: Test ocrd_cis installation and run tests - -on: - push: - pull_request: - workflow_dispatch: - -jobs: - build: - runs-on: ${{ matrix.os }} - strategy: - fail-fast: false - matrix: - python-version: [ "3.8", "3.9", "3.10", "3.11", "3.12" ] - os: [ "ubuntu-22.04" ] - - steps: - - name: Checkout repository - uses: actions/checkout@v4 - with: - ref: ${{ github.head_ref }} - - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v5 - with: - python-version: ${{ matrix.python-version }} - - uses: actions/setup-java@v4 - with: - distribution: 'zulu' - java-version: '11' - - name: Install ocrd_cis - run: make install - - name: Test ocrd_cis - run: make test V="" From 224e86f5467c7506882792fa03397cbe032f69c9 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Tue, 27 Aug 2024 16:27:55 +0200 Subject: [PATCH 87/97] fix: NaN error for python3.9+ --- ocrd_cis/ocropy/ocrolib/morph.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ocrd_cis/ocropy/ocrolib/morph.py b/ocrd_cis/ocropy/ocrolib/morph.py index 7d6ffc85..1ebfb204 100644 --- a/ocrd_cis/ocropy/ocrolib/morph.py +++ b/ocrd_cis/ocropy/ocrolib/morph.py @@ -343,7 +343,7 @@ def select_regions(binary,f,min=0,nbest=100000): return 
keep[labels] @checks(SEGMENTATION) -def all_neighbors(image, dist=1, bg=NaN): +def all_neighbors(image, dist=1, bg=float('nan')): """Given an image with labels, find all pairs of labels that are directly (up to ``dist``) neighboring each other, ignoring the label ``bg``.""" q = 100000 From a397531e549532675341c15b6c4a6fbef1f96818 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Tue, 27 Aug 2024 16:29:37 +0200 Subject: [PATCH 88/97] fix: NaN in reading_order in morph.py --- ocrd_cis/ocropy/ocrolib/morph.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ocrd_cis/ocropy/ocrolib/morph.py b/ocrd_cis/ocropy/ocrolib/morph.py index 1ebfb204..4b626e83 100644 --- a/ocrd_cis/ocropy/ocrolib/morph.py +++ b/ocrd_cis/ocropy/ocrolib/morph.py @@ -429,7 +429,7 @@ def reading_order(seg,rl=False,bt=False): segmap[1:] = 1 return segmap def pos(f,l): - return array([f(x) if x else nan for x in l]) + return array([f(x) if x else float('nan') for x in l]) ys = pos(sl.ycenter,objects) yorder = argsort(ys)[::-1 if bt else 1] groups = [[yorder[0]]] From 9cf83051b2f1875b0757eb1d81ff0a29b7f63047 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sun, 1 Sep 2024 11:18:36 +0200 Subject: [PATCH 89/97] fix type hints --- ocrd_cis/align/cli.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ocrd_cis/align/cli.py b/ocrd_cis/align/cli.py index 5706461e..395f7b07 100644 --- a/ocrd_cis/align/cli.py +++ b/ocrd_cis/align/cli.py @@ -229,8 +229,8 @@ class Alignment: file_grp: str pcgts: OcrdPage region: TextRegionType - alignment: Alignment - def __init__(self, file_grp: str, pcgts: OcrdPage, region: TextRegionType, alignment: Alignment): + alignment: dict + def __init__(self, file_grp: str, pcgts: OcrdPage, region: TextRegionType, alignment: dict): self.file_grp = file_grp self.pcgts = pcgts self.region = region From a0c734dd3e357606bde1c121cd4e25c972087df6 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sun, 1 Sep 2024 11:19:29 +0200 Subject: [PATCH 90/97] dewarp: make thread-safe --- ocrd_cis/ocropy/dewarp.py | 25 ++++++++++++------------- ocrd_cis/ocropy/ocrolib/lineest.py | 2 +- 2 files changed, 13 insertions(+), 14 deletions(-) diff --git a/ocrd_cis/ocropy/dewarp.py b/ocrd_cis/ocropy/dewarp.py index e33ce024..a0d0ea5c 100644 --- a/ocrd_cis/ocropy/dewarp.py +++ b/ocrd_cis/ocropy/dewarp.py @@ -57,17 +57,6 @@ class OcropyDewarp(Processor): def executable(self): return 'ocrd-cis-ocropy-dewarp' - def setup(self): - # defaults from ocrolib.lineest: - self.lnorm = lineest.CenterNormalizer( - params=(self.parameter['range'], - self.parameter['smoothness'], - # let's not expose this for now - # (otherwise we must explain mutual - # dependency between smoothness - # and extra params) - 0.3)) - def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional[str] = None) -> OcrdPageResult: """Dewarp the lines of the workspace. 
@@ -94,6 +83,16 @@ def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional page_image, page_xywh, page_image_info = self.workspace.image_from_page(page, page_id) zoom = determine_zoom(self.logger, page_id, self.parameter['dpi'], page_image_info) + # defaults from ocrolib.lineest: + lnorm = lineest.CenterNormalizer( + params=(self.parameter['range'], + self.parameter['smoothness'], + # let's not expose this for now + # (otherwise we must explain mutual + # dependency between smoothness + # and extra params) + 0.3)) + regions = page.get_AllRegions(classes=['Text'], order='reading-order') if not regions: self.logger.warning(f'Page "{page_id}" contains no text regions') @@ -107,8 +106,8 @@ def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional self.logger.info(f"About to dewarp page '{page_id}' region '{region.id}' line '{line.id}'") try: dew_image = dewarp( - line_image, self.lnorm, check=True, max_neighbour=self.parameter['max_neighbour'], zoom=zoom) - except InvalidLine as err: + line_image, lnorm, check=True, max_neighbour=self.parameter['max_neighbour'], zoom=zoom) + except (InvalidLine, AssertionError) as err: self.logger.error(f'Cannot dewarp line "{line.id}": {err}') continue except InadequateLine as err: diff --git a/ocrd_cis/ocropy/ocrolib/lineest.py b/ocrd_cis/ocropy/ocrolib/lineest.py index 42ef2237..392c7e4a 100644 --- a/ocrd_cis/ocropy/ocrolib/lineest.py +++ b/ocrd_cis/ocropy/ocrolib/lineest.py @@ -75,7 +75,7 @@ def measure(self,line): plt.plot(self.center) plt.ginput(1,1000) def dewarp(self,img,cval=0,dtype=np.dtype('f')): - assert img.shape==self.shape + assert img.shape==self.shape, f"input shape {img.shape} deviates from measured shape {self.shape}" h,w = img.shape # The actual image img is embedded into a larger image by # adding vertical space on top and at the bottom (padding) From 66baaf07f60532185a41ea606c31964ee046c8ba Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sun, 1 Sep 2024 11:21:19 +0200 Subject: [PATCH 91/97] recognize: disallow multithreading (impossible with current lstm implementation) --- ocrd_cis/ocropy/recognize.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/ocrd_cis/ocropy/recognize.py b/ocrd_cis/ocropy/recognize.py index 85a76585..97bec8a7 100644 --- a/ocrd_cis/ocropy/recognize.py +++ b/ocrd_cis/ocropy/recognize.py @@ -45,7 +45,7 @@ def recognize(image, pad, network, check=True): pred = network.predictString(line) # getting confidence - result = lstm.translate_back(network.outputs, pos=1) + result = lstm.translate_back(network.outputs, pos=1) # raw positions scale = len(raw_line.T) * 1.0 / (len(network.outputs) - 2 * pad) clist = [] @@ -68,6 +68,8 @@ def recognize(image, pad, network, check=True): class OcropyRecognize(Processor): network: Any pad: int + # lstm is not thread-safe (.outputs, .last_n as side effects etc) + max_workers = 1 @property def executable(self): @@ -191,7 +193,7 @@ def process_lines(self, textlines, maxlevel, region_image, region_xywh): try: linepred, clist, rlist, confidlist = recognize(final_img, self.pad, self.network, check=True) except Exception as err: - self.logger.debug(f'Error processing line "{line.id}": {err}') + self.logger.debug(f'Error processing line "{line.id}": {str(err) or err.__class__.__name__}') continue self.logger.debug(f"OCR '{line.id}': '{linepred}'") edits += Levenshtein.distance(linepred, linegt) From 32ce6560d9c1e10fdfd00055e567b0fe13187404 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sun, 1 Sep 2024 11:22:14 
+0200 Subject: [PATCH 92/97] postcorrect: make work under METS Server --- ocrd_cis/postcorrect/cli.py | 25 ++++++++++++++++++++----- 1 file changed, 20 insertions(+), 5 deletions(-) diff --git a/ocrd_cis/postcorrect/cli.py b/ocrd_cis/postcorrect/cli.py index 6759b96a..70918de7 100644 --- a/ocrd_cis/postcorrect/cli.py +++ b/ocrd_cis/postcorrect/cli.py @@ -1,12 +1,14 @@ from __future__ import absolute_import import os +import json import click -import json from ocrd import Processor, Workspace from ocrd.decorators import ocrd_cli_options, ocrd_cli_wrap_processor from ocrd_utils import getLevelName, pushd_popd +from ocrd_models import OcrdMets + from ocrd_cis import JavaPostCorrector @@ -37,6 +39,8 @@ def process_workspace(self, workspace: Workspace): with pushd_popd(workspace.directory): self.workspace = workspace self.verify() + # ensure that input files are referenced in on-disk METS + self.workspace.save_mets() # this CLI call mimics the OCR-D processor CLI itself # we have no control over its interior # (we get no page-wise error handling and input downloading) @@ -46,12 +50,23 @@ def process_workspace(self, workspace: Workspace): self.params, getLevelName(self.logger.getEffectiveLevel())) p.exe() - # reload the mets file to prevent run_processor's save_mets - # from overriding the results from the Java process - self.workspace.reload_mets() # workaround for cisocrgroup/ocrd-postcorrection#13 (absolute paths in output): - for output_file in self.workspace.find_files(file_grp=self.output_file_grp): + # We cannot do that with this method, because our self.workspace.mets might be + # a ClientSideOcrdMets, which does not allow modifying or removing files: + # for output_file in self.workspace.find_files(file_grp=self.output_file_grp): + # flocat = output_file._el.find('{http://www.loc.gov/METS/}FLocat') + # flocat.attrib['LOCTYPE'] = 'OTHER' + # flocat.attrib['OTHERLOCTYPE'] = 'FILE' + # output_file.local_filename = os.path.relpath(output_file.local_filename, self.workspace.directory) + # So instead, let's post-process the local METS file result directly: + mets = OcrdMets(filename=self.workspace.mets_target) + for output_file in mets.find_files(fileGrp=self.output_file_grp): flocat = output_file._el.find('{http://www.loc.gov/METS/}FLocat') flocat.attrib['LOCTYPE'] = 'OTHER' flocat.attrib['OTHERLOCTYPE'] = 'FILE' output_file.local_filename = os.path.relpath(output_file.local_filename, self.workspace.directory) + with open(self.workspace.mets_target, 'w') as f: + f.write(mets.to_xml(xmllint=True).decode('utf-8')) + # reload the mets file to prevent run_processor's save_mets + # from overriding the results from the Java process + self.workspace.reload_mets() From c4a5999d905d23a8e347eed2b257363c0c2545af Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sun, 1 Sep 2024 11:24:41 +0200 Subject: [PATCH 93/97] tests: use METS Server if OCRD_MAX_PARALLEL_PAGES>1 --- tests/run_add_zip_test.bash | 5 +-- tests/run_alignment_test.bash | 5 +-- tests/run_image_preprocessing_test.bash | 15 +++++---- tests/run_ocr_test.bash | 7 ++-- tests/run_postcorrection_test.bash | 19 +++++------ tests/run_training_test.bash | 7 ++-- tests/test_lib.bash | 43 ++++++++++++++++++++----- 7 files changed, 68 insertions(+), 33 deletions(-) diff --git a/tests/run_add_zip_test.bash b/tests/run_add_zip_test.bash index 02de2db2..e2d44983 100644 --- a/tests/run_add_zip_test.bash +++ b/tests/run_add_zip_test.bash @@ -6,7 +6,7 @@ ocrd_cis_init_ws blumenbach_anatomie_1805.ocrd.zip # test if there are 3 gt files pushd "$tmpws" 
found_files=0 -for file in $(ocrd workspace find -G OCR-D-GT-SEG-LINE); do +for file in $(ocrd ${OCRD_LOG_ARGS[*]} workspace ${OCRD_WS_ARGS[*]} find -G OCR-D-GT-SEG-LINE); do [[ -f "$file" ]] || fail "cannot find ground truth file: $file" found_files=$((found_files + 1)) done @@ -16,9 +16,10 @@ popd # test if there are 3 gt files pushd "$tmpws" found_files=0 -for file in $(ocrd workspace find -G OCR-D-IMG); do +for file in $(ocrd ${OCRD_LOG_ARGS[*]} workspace ${OCRD_WS_ARGS[*]} find -G OCR-D-IMG); do [[ -f "$file" ]] || fail "cannot find ground truth file: $file" found_files=$((found_files + 1)) done (( found_files == 3 )) || fail "invalid number of files: $found_files" popd + diff --git a/tests/run_alignment_test.bash b/tests/run_alignment_test.bash index e8a3c79a..7a82254b 100644 --- a/tests/run_alignment_test.bash +++ b/tests/run_alignment_test.bash @@ -6,7 +6,7 @@ ocrd_cis_init_ws blumenbach_anatomie_1805.ocrd.zip # test if there are 3 gt files pushd "$tmpws" found_files=0 -for file in $(ocrd workspace find -G $OCRD_CIS_FILEGRP); do +for file in $(ocrd ${OCRD_LOG_ARGS[*]} workspace ${OCRD_WS_ARGS[*]} find -G $OCRD_CIS_FILEGRP); do [[ -f "$file" ]] || fail "cannot find ground truth file: $file" found_files=$((found_files + 1)) done @@ -17,9 +17,10 @@ ocrd_cis_align pushd $tmpws found_files=0 -for file in $(ocrd workspace find -G OCR-D-CIS-ALIGN); do +for file in $(ocrd ${OCRD_LOG_ARGS[*]} workspace ${OCRD_WS_ARGS[*]} find -G OCR-D-CIS-ALIGN); do [[ -f "$file" ]] || fail "cannot find aligned file group workspace" found_files=$((found_files + 1)) done (( found_files == 3 )) || fail "invalid number of files: $found_files" popd + diff --git a/tests/run_image_preprocessing_test.bash b/tests/run_image_preprocessing_test.bash index f80fc636..7a66a57b 100644 --- a/tests/run_image_preprocessing_test.bash +++ b/tests/run_image_preprocessing_test.bash @@ -7,16 +7,17 @@ ocrd_cis_init_ws "blumenbach_anatomie_1805.ocrd.zip" # test if there are 3 gt files pushd "$tmpws" found_files=0 -for file in $(ocrd workspace find -G $OCRD_CIS_FILEGRP); do +for file in $(ocrd ${OCRD_LOG_ARGS[*]} workspace ${OCRD_WS_ARGS[*]} find -G $OCRD_CIS_FILEGRP); do [[ -f "$file" ]] || fail "cannot find ground truth file: $file" found_files=$((found_files + 1)) done (( found_files == 3 )) || fail "invalid number of files: $found_files" -ocrd-cis-ocropy-binarize -l DEBUG -I $OCRD_CIS_FILEGRP -O OCR-D-CIS-IMG-BIN -ocrd-cis-ocropy-clip -l DEBUG -I OCR-D-CIS-IMG-BIN -O OCR-D-CIS-IMG-CLIP -ocrd-cis-ocropy-denoise -l DEBUG -I OCR-D-CIS-IMG-CLIP -O OCR-D-CIS-IMG-DEN -ocrd-cis-ocropy-deskew -l DEBUG -I OCR-D-CIS-IMG-DEN -O OCR-D-CIS-IMG-DES -ocrd-cis-ocropy-dewarp -l DEBUG -I OCR-D-CIS-IMG-DES -O OCR-D-CIS-IMG-DEW -ocrd-cis-ocropy-segment -l DEBUG -I OCR-D-CIS-IMG-DEW -O OCR-D-CIS-IMG-SEG +ARGS=(${OCRD_LOG_ARGS[*]} ${OCRD_WS_ARGS[*]}) +ocrd-cis-ocropy-binarize ${ARGS[*]} -I $OCRD_CIS_FILEGRP -O OCR-D-CIS-IMG-BIN +ocrd-cis-ocropy-clip ${ARGS[*]} -I OCR-D-CIS-IMG-BIN -O OCR-D-CIS-IMG-CLIP +ocrd-cis-ocropy-denoise ${ARGS[*]} -I OCR-D-CIS-IMG-CLIP -O OCR-D-CIS-IMG-DEN +ocrd-cis-ocropy-deskew ${ARGS[*]} -I OCR-D-CIS-IMG-DEN -O OCR-D-CIS-IMG-DES +ocrd-cis-ocropy-dewarp ${ARGS[*]} -I OCR-D-CIS-IMG-DES -O OCR-D-CIS-IMG-DEW +ocrd-cis-ocropy-segment ${ARGS[*]} -I OCR-D-CIS-IMG-DEW -O OCR-D-CIS-IMG-SEG popd diff --git a/tests/run_ocr_test.bash b/tests/run_ocr_test.bash index b10f6f6d..f737ae43 100644 --- a/tests/run_ocr_test.bash +++ b/tests/run_ocr_test.bash @@ -6,7 +6,7 @@ ocrd_cis_init_ws blumenbach_anatomie_1805.ocrd.zip # test if there are 
3 gt files pushd "$tmpws" found_files=0 -for file in $(ocrd workspace find -G $OCRD_CIS_FILEGRP); do +for file in $(ocrd ${OCRD_LOG_ARGS[*]} workspace ${OCRD_WS_ARGS[*]} find -G $OCRD_CIS_FILEGRP); do [[ -f "$file" ]] || fail "cannot find ground truth file: $file" found_files=$((found_files + 1)) done @@ -16,8 +16,9 @@ done ocrd resmgr download ocrd-cis-ocropy-recognize fraktur.pyrnn.gz # run ocr -ocrd-cis-ocropy-binarize -l DEBUG -I $OCRD_CIS_FILEGRP -O OCR-D-CIS-IMG-BIN -ocrd-cis-ocropy-recognize -l DEBUG -I OCR-D-CIS-IMG-BIN -O OCR-D-CIS-OCR \ +ARGS=(${OCRD_LOG_ARGS[*]} ${OCRD_WS_ARGS[*]}) +ocrd-cis-ocropy-binarize ${ARGS[*]} -I $OCRD_CIS_FILEGRP -O OCR-D-CIS-IMG-BIN +ocrd-cis-ocropy-recognize ${ARGS[*]} -I OCR-D-CIS-IMG-BIN -O OCR-D-CIS-OCR \ -P textequiv_level word -P model fraktur.pyrnn.gz popd diff --git a/tests/run_postcorrection_test.bash b/tests/run_postcorrection_test.bash index d7f34ace..859c8407 100644 --- a/tests/run_postcorrection_test.bash +++ b/tests/run_postcorrection_test.bash @@ -6,7 +6,7 @@ ocrd_cis_init_ws blumenbach_anatomie_1805.ocrd.zip # test if there are 3 gt files pushd "$tmpws" found_files=0 -for file in $(ocrd workspace find -G $OCRD_CIS_FILEGRP); do +for file in $(ocrd ${OCRD_LOG_ARGS[*]} workspace ${OCRD_WS_ARGS[*]} find -G $OCRD_CIS_FILEGRP); do [[ -f "$file" ]] || fail "cannot find ground truth file: $file" found_files=$((found_files + 1)) done @@ -15,25 +15,26 @@ popd ocrd_cis_align -mkdir "$tmpdir/bin" -cat > "$tmpdir/bin/profiler.bash" < "bin/profiler.bash" < /dev/null echo '{}' EOF -chmod a+x "$tmpdir/bin/profiler.bash" -ocrd-cis-postcorrect -l DEBUG \ +chmod a+x "bin/profiler.bash" + +ARGS=(${OCRD_LOG_ARGS[*]} ${OCRD_WS_ARGS[*]}) +ocrd-cis-postcorrect ${ARGS[*]} \ -I OCR-D-CIS-ALIGN \ -O OCR-D-CIS-POSTCORRECT \ - -m $tmpws/mets.xml \ - -P profilerPath $tmpdir/bin/profiler.bash \ + -P profilerPath bin/profiler.bash \ -P profilerConfig ignored \ -P model "$(ocrd-cis-data -model)" \ -P nOCR 2 -pushd $tmpws found_files=0 -for file in $(ocrd workspace find -G OCR-D-CIS-POSTCORRECT); do +for file in $(ocrd ${OCRD_LOG_ARGS[*]} workspace ${OCRD_WS_ARGS[*]} find -G OCR-D-CIS-POSTCORRECT); do [[ -f "$file" ]] || fail "$file: not a file" found_files=$((found_files + 1)) done diff --git a/tests/run_training_test.bash b/tests/run_training_test.bash index ade1b68e..5b96dc3e 100644 --- a/tests/run_training_test.bash +++ b/tests/run_training_test.bash @@ -6,7 +6,7 @@ ocrd_cis_init_ws blumenbach_anatomie_1805.ocrd.zip # test if there are 3 gt files pushd "$tmpws" found_files=0 -for file in $(ocrd workspace find -G $OCRD_CIS_FILEGRP); do +for file in $(ocrd ${OCRD_LOG_ARGS[*]} workspace ${OCRD_WS_ARGS[*]} find -G $OCRD_CIS_FILEGRP); do [[ -f "$file" ]] || fail "cannot find ground truth file: $file" found_files=$((found_files + 1)) done @@ -15,9 +15,12 @@ popd ocrd_cis_align +stopserver +OCRD_MAX_PARALLEL_PAGES=1 + # fix ocr for some entries (otherwise the training will fail) pushd $tmpws -for f in $(ocrd workspace find -G OCR-D-CIS-ALIGN); do +for f in $(ocrd ${OCRD_LOG_ARGS[*]} workspace find -G OCR-D-CIS-ALIGN); do sed -i -e 's#e.#Säugethiere.#' $f sed -i -e 's#E#Säugethieren#' $f done diff --git a/tests/test_lib.bash b/tests/test_lib.bash index 801be01a..76111d25 100644 --- a/tests/test_lib.bash +++ b/tests/test_lib.bash @@ -1,10 +1,27 @@ #/bin/bash tmpdir=$(mktemp -d) -trap "trap 'echo exiting without removing $tmpdir' EXIT" ERR -trap "rm -rf $tmpdir" EXIT +function stopserver() { + : +} +function failexit() { + stopserver +} +function cleanexit() { + stopserver + 
rm -rf $tmpdir +} +trap "trap failexit EXIT" ERR +trap cleanexit EXIT + +OCRD_LOG_ARGS=() +if test -v OCRD_OVERRIDE_LOGLEVEL; then + OCRD_LOG_ARGS+=(-l $OCRD_OVERRIDE_LOGLEVEL) +fi +OCRD_WS_ARGS=() # -m mets.xml OCRD_CIS_FILEGRP="OCR-D-GT-SEG-LINE" + data_url="https://github.com/OCR-D/gt_structure_text/releases/download/v1.5.0/" function ocrd_cis_download_bagit() { local url="$data_url/$1" @@ -16,22 +33,32 @@ function ocrd_cis_init_ws() { ocrd_cis_download_bagit "$1" ocrd zip spill -d "$tmpdir" "$PWD/download/$1" tmpws="$tmpdir/${1%.ocrd.zip}" + if ((${OCRD_MAX_PARALLEL_PAGES:-0} > 1)); then + echo starting METS server at $tmpws + ocrd workspace -d "$tmpws" -U "$tmpws/mets.sock" server start & + OCRD_WS_ARGS+=(-U "$tmpws/mets.sock") + sleep 1 + function stopserver() { + echo stopping METS server at $tmpws + ocrd workspace -d "$tmpws" -U "$tmpws/mets.sock" server stop || true + } + fi } + function ocrd_cis_align() { # download ocr models ocrd resmgr download ocrd-cis-ocropy-recognize fraktur.pyrnn.gz ocrd resmgr download ocrd-cis-ocropy-recognize fraktur-jze.pyrnn.gz # run ocr pushd $tmpws - ocrd-cis-ocropy-binarize -l DEBUG -I $OCRD_CIS_FILEGRP -O OCR-D-CIS-IMG-BIN - ocrd-cis-ocropy-recognize -l DEBUG \ - -I OCR-D-CIS-IMG-BIN -O OCR-D-CIS-OCR-1 \ + ARGS=(${OCRD_LOG_ARGS[*]} ${OCRD_WS_ARGS[*]}) + ocrd-cis-ocropy-binarize ${ARGS[*]} -I $OCRD_CIS_FILEGRP -O OCR-D-CIS-IMG-BIN + ocrd-cis-ocropy-recognize ${ARGS[*]} -I OCR-D-CIS-IMG-BIN -O OCR-D-CIS-OCR-1 \ -P textequiv_level word -P model fraktur.pyrnn.gz - ocrd-cis-ocropy-recognize -l DEBUG \ - -I OCR-D-CIS-IMG-BIN -O OCR-D-CIS-OCR-2 \ + ocrd-cis-ocropy-recognize ${ARGS[*]} -I OCR-D-CIS-IMG-BIN -O OCR-D-CIS-OCR-2 \ -P textequiv_level word -P model fraktur-jze.pyrnn.gz - ocrd-cis-align -l DEBUG -I OCR-D-CIS-OCR-1,OCR-D-CIS-OCR-2,$OCRD_CIS_FILEGRP \ + ocrd-cis-align ${ARGS[*]} -I OCR-D-CIS-OCR-1,OCR-D-CIS-OCR-2,$OCRD_CIS_FILEGRP \ -O OCR-D-CIS-ALIGN popd } From ae7dc671ab50104c0cf3f4dec6bf28fc3c1990ed Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sun, 1 Sep 2024 11:25:35 +0200 Subject: [PATCH 94/97] make test: run serially and parallel, show times --- Makefile | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/Makefile b/Makefile index a040cf9d..d1991df0 100644 --- a/Makefile +++ b/Makefile @@ -22,7 +22,17 @@ docker-push: docker-build TEST_SCRIPTS=$(sort $(filter-out tests/run_training_test.bash, $(wildcard tests/run_*.bash))) .PHONY: $(TEST_SCRIPTS) $(TEST_SCRIPTS): - bash $@ $V + OCRD_MAX_PARALLEL_PAGES=1 /usr/bin/time -o test_serially.log -a -f "$@: %Uuser %Ssystem %Eelapsed %PCPU (%Mmax)k" bash $@ $V + OCRD_MAX_PARALLEL_PAGES=4 /usr/bin/time -o test_parallel.log -a -f "$@: %Uuser %Ssystem %Eelapsed %PCPU (%Mmax)k" bash $@ $V + +test: export OCRD_OVERRIDE_LOGLEVEL=DEBUG +test: export OCRD_MISSING_OUTPUT=ABORT +test: export OCRD_MAX_MISSING_OUTPUTS=-1 test: $(TEST_SCRIPTS) - @echo $^ + @echo =====single-threaded test results===== + @cat test_serially.log + @echo =====4-page-parallel test results===== + @cat test_parallel.log + @$(RM) test_serially.log test_parallel.log + .PHONY: install install-devel uninstall test docker-build docker-push From e540b108e0c7f14c1cfcf8579dd0722a41069ead Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Mon, 2 Sep 2024 11:48:43 +0200 Subject: [PATCH 95/97] require ocrd>=3.0.0b4 --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 83cf28bb..e8ea1cf3 100644 --- a/setup.py +++ b/setup.py @@ -38,7 +38,7 @@ 
packages=find_packages(), include_package_data=True, install_requires=[ - 'ocrd>=3.0.0b1', + 'ocrd>=3.0.0b4', 'click', 'scipy', 'numpy>=1.17.0', From 99b348915bcf0c1d3ea0028ca43ac2448a0ee922 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 26 Sep 2024 01:28:50 +0000 Subject: [PATCH 96/97] segment: adapt to numpy deprecation --- ocrd_cis/ocropy/common.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ocrd_cis/ocropy/common.py b/ocrd_cis/ocropy/common.py index c5b56ed0..bae4dac0 100644 --- a/ocrd_cis/ocropy/common.py +++ b/ocrd_cis/ocropy/common.py @@ -644,7 +644,7 @@ def compute_seplines(binary, scale, maxseps=0): sepdists.append(np.median(subdistances)) #LOG.debug("adding sublabel %d as sep %d (size %d [%s])", sublabel, numsep, sublabelsize, str(sublabelslice)) sepsizes = np.array(sepsizes) - sepslices = np.array(sepslices) + sepslices = np.array(sepslices, dtype=object) LOG.debug("detected %d separator candidates", numsep) DSAVE("seps-raw", sepmap[labels]) # now dilate+erode to link neighbouring candidates, From dee1abf5c1cfcf3b8e111f4b3f8614e0f6fea214 Mon Sep 17 00:00:00 2001 From: kba Date: Fri, 11 Oct 2024 11:12:20 +0200 Subject: [PATCH 97/97] eval/stats: Levenshtein -> rapidfuzz.distance.Levenshtein --- ocrd_cis/div/eval.py | 2 +- ocrd_cis/div/stats.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/ocrd_cis/div/eval.py b/ocrd_cis/div/eval.py index 6efe90c6..f47682ff 100644 --- a/ocrd_cis/div/eval.py +++ b/ocrd_cis/div/eval.py @@ -1,6 +1,6 @@ import os from PIL import Image -from Levenshtein import distance +from rapidfuzz.distance.Levenshtein import distance path = '/mnt/c/Users/chris/Documents/projects/OCR-D/daten/gt/lines/' diff --git a/ocrd_cis/div/stats.py b/ocrd_cis/div/stats.py index ea385d98..6f9c9816 100644 --- a/ocrd_cis/div/stats.py +++ b/ocrd_cis/div/stats.py @@ -4,7 +4,7 @@ from ocrd import Processor from ocrd_cis import get_ocrd_tool from ocrd_models.ocrd_page_generateds import parse -from Levenshtein import distance +from rapidfuzz.distance import Levenshtein class Stats(Processor): @@ -81,7 +81,7 @@ def process(self): # print(line.get_TextEquiv()[2].dataType) unicodeline = line.get_TextEquiv()[i].Unicode - d[i] += distance(gtline, unicodeline) + d[i] += Levenshtein.distance(gtline, unicodeline) # words = line.get_Word() # for word in words:
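The final swap from the python-Levenshtein package to rapidfuzz.distance.Levenshtein consolidates on a dependency that recognize.py already imports above, and for plain edit distance it is a drop-in replacement. A small usage sketch under that assumption, with sample strings borrowed from the training-test fixtures:

from rapidfuzz.distance import Levenshtein

# same semantics as the old `from Levenshtein import distance` call
gt_line = "Säugethiere"
ocr_line = "Saugethiere"
print(Levenshtein.distance(gt_line, ocr_line))             # 1 edit (ä vs a)
print(Levenshtein.normalized_distance(gt_line, ocr_line))  # distance normalized to [0, 1]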