From 34761d42b159882488bc936a29378b7bdecd30f4 Mon Sep 17 00:00:00 2001
From: Robert Sachunsky
Date: Thu, 10 Sep 2020 14:18:30 +0200
Subject: [PATCH 01/12] repair (sanitize): ensure valid polygon by repeated
 simplification

---
 ocrd_segment/repair.py | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/ocrd_segment/repair.py b/ocrd_segment/repair.py
index 1f9eec9..9045bcc 100644
--- a/ocrd_segment/repair.py
+++ b/ocrd_segment/repair.py
@@ -240,11 +240,16 @@ def sanitize_page(self, page, page_id):
                     LOG.warning('Ignoring contour %d too small (%d/%d) in region "%s"',
                                 i, area, total_area, region.id)
                     continue
-                # simplify shape:
+                # simplify shape (until valid):
                 # can produce invalid (self-intersecting) polygons:
                 #polygon = cv2.approxPolyDP(contour, 2, False)[:, 0, ::] # already ordered x,y
                 polygon = contour[:, 0, ::] # already ordered x,y
-                polygon = Polygon(polygon).simplify(1).exterior.coords
+                polygon = Polygon(polygon)
+                for tolerance in range(2, int(polygon.area)):
+                    polygon = polygon.simplify(tolerance)
+                    if polygon.is_valid:
+                        break
+                polygon = polygon.exterior.coords[:-1] # keep open
                 if len(polygon) < 4:
                     LOG.warning('Ignoring contour %d less than 4 points in region "%s"',
                                 i, region.id)

From 0362d212a9d6ad9164186c8d1a5e75d3e3623040 Mon Sep 17 00:00:00 2001
From: Robert Sachunsky
Date: Thu, 10 Sep 2020 14:21:17 +0200
Subject: [PATCH 02/12] repair (plausibilize): ensure valid polygons after
 union by hull

---
 ocrd_segment/repair.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/ocrd_segment/repair.py b/ocrd_segment/repair.py
index 9045bcc..a8b9606 100644
--- a/ocrd_segment/repair.py
+++ b/ocrd_segment/repair.py
@@ -359,7 +359,10 @@ def _plausibilize_group(regionspolys, rogroup, mark_for_deletion, mark_for_mergi
                 # and use-cases in the future
                 superpoly = Polygon(polygon_from_points(superreg.get_Coords().points))
                 superpoly = superpoly.union(poly)
-                superreg.get_Coords().points = points_from_polygon(superpoly.exterior.coords)
+                if superpoly.type == 'MultiPolygon':
+                    superpoly = superpoly.convex_hull
+                superpoly = superpoly.exterior.coords[:-1] # keep open
+                superreg.get_Coords().points = points_from_polygon(superpoly)
                 # FIXME should we merge/mix attributes and features?
                 if region.get_orientation() != superreg.get_orientation():
                     LOG.warning('Merging region "%s" with orientation %f into "%s" with %f',

From 2a7367cff84fa7b303da91a165cb356ea70487ba Mon Sep 17 00:00:00 2001
From: Robert Sachunsky
Date: Thu, 10 Sep 2020 14:45:48 +0200
Subject: [PATCH 03/12] repair (plausibilize): ensure valid polygon by
 repeated simplification

---
 ocrd_segment/repair.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/ocrd_segment/repair.py b/ocrd_segment/repair.py
index a8b9606..34f1cab 100644
--- a/ocrd_segment/repair.py
+++ b/ocrd_segment/repair.py
@@ -245,7 +245,7 @@ def sanitize_page(self, page, page_id):
                 #polygon = cv2.approxPolyDP(contour, 2, False)[:, 0, ::] # already ordered x,y
                 polygon = contour[:, 0, ::] # already ordered x,y
                 polygon = Polygon(polygon)
-                for tolerance in range(2, int(polygon.area)):
+                for tolerance in range(1, int(polygon.area)):
                     polygon = polygon.simplify(tolerance)
                     if polygon.is_valid:
                         break
@@ -361,6 +361,10 @@ def _plausibilize_group(regionspolys, rogroup, mark_for_deletion, mark_for_mergi
                 superpoly = superpoly.union(poly)
                 if superpoly.type == 'MultiPolygon':
                     superpoly = superpoly.convex_hull
+                for tolerance in range(1, int(superpoly.area)):
+                    if superpoly.is_valid:
+                        break
+                    superpoly = superpoly.simplify(tolerance)
                 superpoly = superpoly.exterior.coords[:-1] # keep open
                 superreg.get_Coords().points = points_from_polygon(superpoly)
                 # FIXME should we merge/mix attributes and features?

From 967471202d853d4488dbe74eeccf2a399df3e3e2 Mon Sep 17 00:00:00 2001
From: Robert Sachunsky
Date: Thu, 10 Sep 2020 17:21:47 +0200
Subject: [PATCH 04/12] repair (plausibilize): ensure valid polygon by rounding

---
 ocrd_segment/repair.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/ocrd_segment/repair.py b/ocrd_segment/repair.py
index 34f1cab..440b187 100644
--- a/ocrd_segment/repair.py
+++ b/ocrd_segment/repair.py
@@ -6,7 +6,7 @@
 from scipy.ndimage import filters, morphology
 import cv2
 import numpy as np
-from shapely.geometry import Polygon, LineString
+from shapely.geometry import asPolygon, Polygon, LineString
 
 from ocrd import Processor
 from ocrd_utils import (
@@ -361,6 +361,8 @@ def _plausibilize_group(regionspolys, rogroup, mark_for_deletion, mark_for_mergi
                 superpoly = superpoly.union(poly)
                 if superpoly.type == 'MultiPolygon':
                     superpoly = superpoly.convex_hull
+                if superpoly.minimum_clearance < 1.0:
+                    superpoly = asPolygon(np.round(superpoly.exterior.coords))
                 for tolerance in range(1, int(superpoly.area)):
                     if superpoly.is_valid:
                         break

From 3236801e1a33cbfa1725c2230f17f0dc2f827aee Mon Sep 17 00:00:00 2001
From: Robert Sachunsky <38561704+bertsky@users.noreply.github.com>
Date: Mon, 14 Sep 2020 09:08:07 +0200
Subject: [PATCH 05/12] update shapely

---
 requirements.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/requirements.txt b/requirements.txt
index 1f9f008..88359b0 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,4 +1,4 @@
 ocrd >= 2.13.1
-shapely
+shapely >= 1.7.1
 scikit-image
 numpy

From d10022b9f45d91210fb72fbd9fd1b8afd57af51a Mon Sep 17 00:00:00 2001
From: Robert Sachunsky <38561704+bertsky@users.noreply.github.com>
Date: Mon, 14 Sep 2020 11:25:24 +0200
Subject: [PATCH 06/12] repair: ensure valid polygon via more robust
 simplification

---
 ocrd_segment/repair.py | 27 ++++++++++++++++++---------
 1 file changed, 18 insertions(+), 9 deletions(-)

diff --git a/ocrd_segment/repair.py b/ocrd_segment/repair.py
index 440b187..e8e8af0 100644
--- a/ocrd_segment/repair.py
+++ b/ocrd_segment/repair.py
@@ -244,11 +244,8 @@ def sanitize_page(self, page, page_id):
                 # can produce invalid (self-intersecting) polygons:
                 #polygon = cv2.approxPolyDP(contour, 2, False)[:, 0, ::] # already ordered x,y
                 polygon = contour[:, 0, ::] # already ordered x,y
-                polygon = Polygon(polygon)
-                for tolerance in range(1, int(polygon.area)):
-                    polygon = polygon.simplify(tolerance)
-                    if polygon.is_valid:
-                        break
+                polygon = Polygon(polygon).simplify(1)
+                polygon = make_valid(polygon)
                 polygon = polygon.exterior.coords[:-1] # keep open
                 if len(polygon) < 4:
                     LOG.warning('Ignoring contour %d less than 4 points in region "%s"',
@@ -363,10 +360,7 @@ def _plausibilize_group(regionspolys, rogroup, mark_for_deletion, mark_for_mergi
                     superpoly = superpoly.convex_hull
                 if superpoly.minimum_clearance < 1.0:
                     superpoly = asPolygon(np.round(superpoly.exterior.coords))
-                for tolerance in range(1, int(superpoly.area)):
-                    if superpoly.is_valid:
-                        break
-                    superpoly = superpoly.simplify(tolerance)
+                superpoly = make_valid(superpoly)
                 superpoly = superpoly.exterior.coords[:-1] # keep open
                 superreg.get_Coords().points = points_from_polygon(superpoly)
                 # FIXME should we merge/mix attributes and features?
@@ -413,3 +407,18 @@ def _plausibilize_group(regionspolys, rogroup, mark_for_deletion, mark_for_mergi
             if region.parent_object_:
                 # remove in-place
                 region.parent_object_.get_TextRegion().remove(region)
+
+def make_valid(polygon):
+    """Ensures shapely.geometry.Polygon object is valid by repeated simplification"""
+    for split in range(1, len(polygon.exterior.coords)-1):
+        if polygon.is_valid or polygon.simplify(polygon.area).is_valid:
+            break
+        # simplification may not be possible (at all) due to ordering
+        # in that case, try another starting point
+        polygon = Polygon(polygon.exterior.coords[-split:]+polygon.exterior.coords[:-split])
+    for tolerance in range(1, int(polygon.area)):
+        if polygon.is_valid:
+            break
+        # simplification may require a larger tolerance
+        polygon = polygon.simplify(tolerance)
+    return polygon

From b61ed0debe7fec1fd83930973b240a4b04758d2b Mon Sep 17 00:00:00 2001
From: Robert Sachunsky <38561704+bertsky@users.noreply.github.com>
Date: Mon, 14 Sep 2020 12:01:23 +0200
Subject: [PATCH 07/12] repair: delegate to functions now in core

---
 ocrd_segment/repair.py | 78 +++++------------------------------------
 1 file changed, 8 insertions(+), 70 deletions(-)

diff --git a/ocrd_segment/repair.py b/ocrd_segment/repair.py
index e8e8af0..91edf2b 100644
--- a/ocrd_segment/repair.py
+++ b/ocrd_segment/repair.py
@@ -23,8 +23,6 @@
 from ocrd_modelfactory import page_from_file
 from ocrd_models.ocrd_page import (
     CoordsType,
-    LabelType, LabelsType,
-    MetadataItemType,
     to_xml
 )
 from ocrd_models.ocrd_page_generateds import (
@@ -36,6 +34,7 @@
     UnorderedGroupIndexedType,
     ReadingOrderType
 )
+from ocrd_validators.page_validator import PageValidator
 from .config import OCRD_TOOL
 
 TOOL = 'ocrd-segment-repair'
@@ -68,23 +67,17 @@ def process(self):
             page_id = input_file.pageId or input_file.ID
             LOG.info("INPUT FILE %i / %s", n, page_id)
             pcgts = page_from_file(self.workspace.download_file(input_file))
+            self.add_metadata(pcgts)
             page = pcgts.get_Page()
-            metadata = pcgts.get_Metadata() # ensured by from_file()
-            metadata.add_MetadataItem(
-                MetadataItemType(type_="processingStep",
-                                 name=self.ocrd_tool['steps'][0],
-                                 value=TOOL,
-                                 Labels=[LabelsType(
-                                     externalModel="ocrd-tool",
-                                     externalId="parameters",
-                                     Label=[LabelType(type_=name,
-                                                      value=self.parameter[name])
-                                            for name in self.parameter.keys()])]))
-
+
             #
             # validate segmentation (warn of children extending beyond their parents)
             #
-            self.validate_coords(page, page_id)
+            report = PageValidator.validate(ocrd_page=pcgts,
+                                            page_textequiv_consistency='off',
+                                            check_baseline=False)
+            if not report.is_valid:
+                LOG.warning(report.to_xml())
 
             #
             # sanitize region segmentation (shrink to hull of lines)
@@ -261,61 +254,6 @@ def sanitize_page(self, page, page_id):
             LOG.info('Using new coordinates for region "%s"', region.id)
             region.get_Coords().points = points_from_polygon(region_polygon)
 
-    def validate_coords(self, page, page_id):
-        valid = True
-        regions = page.get_TextRegion()
-        if page.get_Border():
-            other_regions = (
-                page.get_AdvertRegion() +
-                page.get_ChartRegion() +
-                page.get_ChemRegion() +
-                page.get_GraphicRegion() +
-                page.get_ImageRegion() +
-                page.get_LineDrawingRegion() +
-                page.get_MathsRegion() +
-                page.get_MusicRegion() +
-                page.get_NoiseRegion() +
-                page.get_SeparatorRegion() +
-                page.get_TableRegion() +
-                page.get_UnknownRegion())
-            for region in regions + other_regions:
-                if not _child_within_parent(region, page.get_Border()):
-                    LOG.warning('Region "%s" extends beyond Border of page "%s"',
-                                region.id, page_id)
-                    valid = False
-        for region in regions:
-            lines = region.get_TextLine()
-            for line in lines:
-                if not _child_within_parent(line, region):
-                    LOG.warning('Line "%s" extends beyond region "%s" on page "%s"',
-                                line.id, region.id, page_id)
-                    valid = False
-                if line.get_Baseline():
-                    baseline = LineString(polygon_from_points(line.get_Baseline().points))
-                    linepoly = Polygon(polygon_from_points(line.get_Coords().points))
-                    if not baseline.within(linepoly):
-                        LOG.warning('Baseline extends beyond line "%s" in region "%s" on page "%s"',
-                                    line.id, region.id, page_id)
-                        valid = False
-                words = line.get_Word()
-                for word in words:
-                    if not _child_within_parent(word, line):
-                        LOG.warning('Word "%s" extends beyond line "%s" in region "%s" on page "%s"',
-                                    word.id, line.id, region.id, page_id)
-                        valid = False
-                    glyphs = word.get_Glyph()
-                    for glyph in glyphs:
-                        if not _child_within_parent(glyph, word):
-                            LOG.warning('Glyph "%s" extends beyond word "%s" in line "%s" of region "%s" on page "%s"',
-                                        glyph.id, word.id, line.id, region.id, page_id)
-                            valid = False
-        return valid
-
-def _child_within_parent(child, parent):
-    child_poly = Polygon(polygon_from_points(child.get_Coords().points))
-    parent_poly = Polygon(polygon_from_points(parent.get_Coords().points))
-    return child_poly.within(parent_poly)
-
 def _plausibilize_group(regionspolys, rogroup, mark_for_deletion, mark_for_merging):
     wait_for_deletion = list()
     reading_order = dict()
     regionrefs = list()
     ordered = False

From d3a75cbd47a6e6a53224ab500c90220bb5b51973 Mon Sep 17 00:00:00 2001
From: Robert Sachunsky
Date: Mon, 14 Sep 2020 13:29:05 +0200
Subject: [PATCH 08/12] =?UTF-8?q?repair=20(plausibilize):=20get=20all=20te?=
 =?UTF-8?q?xt=20regions=20recursively=E2=80=A6?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

…(but compare same parents only)
---
 ocrd_segment/repair.py | 134 ++++++++++++++++++++---------------------
 requirements.txt       |   2 +-
 2 files changed, 68 insertions(+), 68 deletions(-)

diff --git a/ocrd_segment/repair.py b/ocrd_segment/repair.py
index 91edf2b..3667128 100644
--- a/ocrd_segment/repair.py
+++ b/ocrd_segment/repair.py
@@ -88,80 +88,78 @@ def process(self):
             #
             # plausibilize region segmentation (remove redundant text regions)
             #
+            ro = page.get_ReadingOrder()
+            if ro:
+                rogroup = ro.get_OrderedGroup() or ro.get_UnorderedGroup()
+            else:
+                rogroup = None
             mark_for_deletion = list() # what regions get removed?
             mark_for_merging = dict() # what regions get merged into which regions?
+            # cover recursive region structure (but compare only at the same level)
+            parents = list(set([region.parent_object_
+                                for region in page.get_AllRegions(classes=['text'])]))
+            for parent in parents:
+                regions = parent.get_TextRegion()
+                # sort by area to ensure to arrive at a total ordering compatible
+                # with the topological sort along containment/equivalence arcs
+                # (so we can avoid substituting regions with superregions that have
+                #  themselves been substituted/deleted):
+                RegionPolygon = namedtuple('RegionPolygon', ['region', 'polygon'])
+                regionspolys = sorted([RegionPolygon(region, Polygon(polygon_from_points(region.get_Coords().points)))
+                                       for region in regions],
+                                      key=lambda x: x.polygon.area)
+                for i in range(0, len(regionspolys)):
+                    for j in range(i+1, len(regionspolys)):
+                        region1 = regionspolys[i].region
+                        region2 = regionspolys[j].region
+                        poly1 = regionspolys[i].polygon
+                        poly2 = regionspolys[j].polygon
+                        LOG.debug('Comparing regions "%s" and "%s"', region1.id, region2.id)
 
-            # TODO: cover recursive region structure (but compare only at the same level)
-            regions = page.get_TextRegion()
-            # sort by area to ensure to arrive at a total ordering compatible
-            # with the topological sort along containment/equivalence arcs
-            # (so we can avoid substituting regions with superregions that have
-            #  themselves been substituted/deleted):
-            RegionPolygon = namedtuple('RegionPolygon', ['region', 'polygon'])
-            regionspolys = sorted([RegionPolygon(region, Polygon(polygon_from_points(region.get_Coords().points)))
-                                   for region in regions],
-                                  key=lambda x: x.polygon.area)
-            for i in range(0, len(regionspolys)):
-                for j in range(i+1, len(regionspolys)):
-                    region1 = regionspolys[i].region
-                    region2 = regionspolys[j].region
-                    poly1 = regionspolys[i].polygon
-                    poly2 = regionspolys[j].polygon
-                    LOG.debug('Comparing regions "%s" and "%s"', region1.id, region2.id)
-
-                    if poly1.almost_equals(poly2):
-                        LOG.warning('Page "%s" region "%s" is almost equal to "%s" %s',
-                                    page_id, region2.id, region1.id,
-                                    '(removing)' if plausibilize else '')
-                        mark_for_deletion.append(region2.id)
-                    elif poly1.contains(poly2):
-                        LOG.warning('Page "%s" region "%s" is within "%s" %s',
-                                    page_id, region2.id, region1.id,
-                                    '(removing)' if plausibilize else '')
-                        mark_for_deletion.append(region2.id)
-                    elif poly2.contains(poly1):
-                        LOG.warning('Page "%s" region "%s" is within "%s" %s',
-                                    page_id, region1.id, region2.id,
-                                    '(removing)' if plausibilize else '')
-                        mark_for_deletion.append(region1.id)
-                    elif poly1.overlaps(poly2):
-                        inter_poly = poly1.intersection(poly2)
-                        union_poly = poly1.union(poly2)
-                        LOG.debug('Page "%s" region "%s" overlaps "%s" by %f/%f',
-                                  page_id, region1.id, region2.id, inter_poly.area/poly1.area, inter_poly.area/poly2.area)
-                        if union_poly.convex_hull.area >= poly1.area + poly2.area:
-                            # skip this pair -- combined polygon encloses previously free segments
-                            pass
-                        elif inter_poly.area / poly2.area > self.parameter['plausibilize_merge_min_overlap']:
-                            LOG.warning('Page "%s" region "%s" is almost within "%s" %s',
-                                        page_id, region2.id, region1.id,
-                                        '(merging)' if plausibilize else '')
-                            mark_for_merging[region2.id] = region1
-                        elif inter_poly.area / poly1.area > self.parameter['plausibilize_merge_min_overlap']:
-                            LOG.warning('Page "%s" region "%s" is almost within "%s" %s',
-                                        page_id, region1.id, region2.id,
-                                        '(merging)' if plausibilize else '')
-                            mark_for_merging[region1.id] = region2
+                        if poly1.almost_equals(poly2):
+                            LOG.warning('Page "%s" region "%s" is almost equal to "%s" %s',
+                                        page_id, region2.id, region1.id,
+                                        '(removing)' if plausibilize else '')
+                            mark_for_deletion.append(region2.id)
+                        elif poly1.contains(poly2):
+                            LOG.warning('Page "%s" region "%s" is within "%s" %s',
+                                        page_id, region2.id, region1.id,
+                                        '(removing)' if plausibilize else '')
+                            mark_for_deletion.append(region2.id)
+                        elif poly2.contains(poly1):
+                            LOG.warning('Page "%s" region "%s" is within "%s" %s',
+                                        page_id, region1.id, region2.id,
+                                        '(removing)' if plausibilize else '')
+                            mark_for_deletion.append(region1.id)
+                        elif poly1.overlaps(poly2):
+                            inter_poly = poly1.intersection(poly2)
+                            union_poly = poly1.union(poly2)
+                            LOG.debug('Page "%s" region "%s" overlaps "%s" by %f/%f',
+                                      page_id, region1.id, region2.id, inter_poly.area/poly1.area, inter_poly.area/poly2.area)
+                            if union_poly.convex_hull.area >= poly1.area + poly2.area:
+                                # skip this pair -- combined polygon encloses previously free segments
+                                pass
+                            elif inter_poly.area / poly2.area > self.parameter['plausibilize_merge_min_overlap']:
+                                LOG.warning('Page "%s" region "%s" is almost within "%s" %s',
+                                            page_id, region2.id, region1.id,
+                                            '(merging)' if plausibilize else '')
+                                mark_for_merging[region2.id] = region1
+                            elif inter_poly.area / poly1.area > self.parameter['plausibilize_merge_min_overlap']:
+                                LOG.warning('Page "%s" region "%s" is almost within "%s" %s',
+                                            page_id, region1.id, region2.id,
+                                            '(merging)' if plausibilize else '')
+                                mark_for_merging[region1.id] = region2
 
-                    # TODO: more merging cases...
-                    #LOG.info('Intersection %i', poly1.intersects(poly2))
-                    #LOG.info('Containment %i', poly1.contains(poly2))
-                    #if poly1.intersects(poly2):
-                    #    LOG.info('Area 1 %d', poly1.area)
-                    #    LOG.info('Area 2 %d', poly2.area)
-                    #    LOG.info('Area intersect %d', poly1.intersection(poly2).area)
-
+                        # TODO: more merging cases...
+                        #LOG.info('Intersection %i', poly1.intersects(poly2))
+                        #LOG.info('Containment %i', poly1.contains(poly2))
+                        #if poly1.intersects(poly2):
+                        #    LOG.info('Area 1 %d', poly1.area)
+                        #    LOG.info('Area 2 %d', poly2.area)
+                        #    LOG.info('Area intersect %d', poly1.intersection(poly2).area)
 
-            if plausibilize:
-                # the reading order does not have to include all regions
-                # but it may include all types of regions!
-                ro = page.get_ReadingOrder()
-                if ro:
-                    rogroup = ro.get_OrderedGroup() or ro.get_UnorderedGroup()
-                else:
-                    rogroup = None
-                # pass the regions sorted (see above)
-                _plausibilize_group(regionspolys, rogroup, mark_for_deletion, mark_for_merging)
+                if plausibilize:
+                    # pass the regions sorted (see above)
+                    _plausibilize_group(regionspolys, rogroup, mark_for_deletion, mark_for_merging)
 
             file_id = make_file_id(input_file, self.output_file_grp)
             self.workspace.add_file(
                 ID=file_id,
@@ -259,6 +257,8 @@ def _plausibilize_group(regionspolys, rogroup, mark_for_deletion, mark_for_mergi
     reading_order = dict()
     regionrefs = list()
     ordered = False
+    # the reading order does not have to include all regions
+    # but it may include all types of regions!
     if isinstance(rogroup, (OrderedGroupType, OrderedGroupIndexedType)):
         regionrefs = (rogroup.get_RegionRefIndexed() +
                       rogroup.get_OrderedGroupIndexed() +
diff --git a/requirements.txt b/requirements.txt
index 88359b0..b821bca 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,4 +1,4 @@
-ocrd >= 2.13.1
+ocrd >= 2.16.2
 shapely >= 1.7.1
 scikit-image
 numpy

From 3c20ebee142c9729b8e30b19824922678559618e Mon Sep 17 00:00:00 2001
From: Robert Sachunsky
Date: Mon, 14 Sep 2020 15:17:04 +0200
Subject: [PATCH 09/12] repair (sanitize): get all text regions recursively

---
 ocrd_segment/repair.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ocrd_segment/repair.py b/ocrd_segment/repair.py
index 3667128..48847fc 100644
--- a/ocrd_segment/repair.py
+++ b/ocrd_segment/repair.py
@@ -172,7 +172,7 @@ def process(self):
                 content=to_xml(pcgts))
 
     def sanitize_page(self, page, page_id):
-        regions = page.get_TextRegion()
+        regions = page.get_AllRegions(classes=['text'])
         page_image, page_coords, _ = self.workspace.image_from_page(
             page, page_id)
         for region in regions:

From 0134cf4c36cead9ac662dbcfeaf7b94022eb04f4 Mon Sep 17 00:00:00 2001
From: Robert Sachunsky
Date: Mon, 14 Sep 2020 15:20:24 +0200
Subject: [PATCH 10/12] repair: first plausibilize, then sanitize (if both)

---
 ocrd_segment/repair.py | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/ocrd_segment/repair.py b/ocrd_segment/repair.py
index 48847fc..1d46057 100644
--- a/ocrd_segment/repair.py
+++ b/ocrd_segment/repair.py
@@ -78,13 +78,8 @@ def process(self):
                                             check_baseline=False)
             if not report.is_valid:
                 LOG.warning(report.to_xml())
+            # TODO: maybe skip this page if report contains any CoordinateValidityError
 
-            #
-            # sanitize region segmentation (shrink to hull of lines)
-            #
-            if sanitize:
-                self.sanitize_page(page, page_id)
-
             #
             # plausibilize region segmentation (remove redundant text regions)
             #
@@ -161,6 +156,12 @@ def process(self):
                     # pass the regions sorted (see above)
                     _plausibilize_group(regionspolys, rogroup, mark_for_deletion, mark_for_merging)
 
+            #
+            # sanitize region segmentation (shrink to hull of lines)
+            #
+            if sanitize:
+                self.sanitize_page(page, page_id)
+
             file_id = make_file_id(input_file, self.output_file_grp)
             self.workspace.add_file(
                 ID=file_id,

From f3911d7dc57dfbf9b59916756bc274d5b12504bd Mon Sep 17 00:00:00 2001
From: Robert Sachunsky
Date: Mon, 14 Sep 2020 16:38:30 +0200
Subject: [PATCH 11/12] all: delegate to `add_metadata` in core

---
 .pylintrc                                 |  2 ++
 ocrd_segment/extract_lines.py             | 16 +---------------
 ocrd_segment/extract_pages.py             | 16 +---------------
 ocrd_segment/extract_regions.py           | 16 +---------------
 ocrd_segment/import_coco_segmentation.py  | 16 +---------------
 ocrd_segment/import_image_segmentation.py | 16 +---------------
 ocrd_segment/replace_original.py          | 14 +-------------
 7 files changed, 8 insertions(+), 88 deletions(-)

diff --git a/.pylintrc b/.pylintrc
index a5ecacb..0c805ee 100644
--- a/.pylintrc
+++ b/.pylintrc
@@ -3,6 +3,8 @@ ignored-modules=cv2
 
 [MESSAGES CONTROL]
 disable =
+    super-with-arguments,
+    trailing-whitespace,
     missing-docstring,
     no-self-use,
     superfluous-parens,
diff --git a/ocrd_segment/extract_lines.py b/ocrd_segment/extract_lines.py
index f15999e..150230a 100644
--- a/ocrd_segment/extract_lines.py
+++ b/ocrd_segment/extract_lines.py
@@ -11,10 +11,6 @@
     polygon_from_points,
     MIME_TO_EXT
 )
-from ocrd_models.ocrd_page import (
-    LabelsType, LabelType,
-    MetadataItemType
-)
 from ocrd_modelfactory import page_from_file
 
 from ocrd import Processor
@@ -75,18 +71,8 @@ def process(self):
             page_id = input_file.pageId or input_file.ID
             LOG.info("INPUT FILE %i / %s", n, page_id)
             pcgts = page_from_file(self.workspace.download_file(input_file))
+            self.add_metadata(pcgts)
             page = pcgts.get_Page()
-            metadata = pcgts.get_Metadata() # ensured by from_file()
-            metadata.add_MetadataItem(
-                MetadataItemType(type_="processingStep",
-                                 name=self.ocrd_tool['steps'][0],
-                                 value=TOOL,
-                                 Labels=[LabelsType(
-                                     externalModel="ocrd-tool",
-                                     externalId="parameters",
-                                     Label=[LabelType(type_=name,
-                                                      value=self.parameter[name])
-                                            for name in self.parameter.keys()])]))
 
             page_image, page_coords, page_image_info = self.workspace.image_from_page(
                 page, page_id, transparency=self.parameter['transparency'])
diff --git a/ocrd_segment/extract_pages.py b/ocrd_segment/extract_pages.py
index 42705c5..8963982 100644
--- a/ocrd_segment/extract_pages.py
+++ b/ocrd_segment/extract_pages.py
@@ -16,10 +16,6 @@
     xywh_from_polygon,
     MIME_TO_EXT
 )
-from ocrd_models.ocrd_page import (
-    LabelsType, LabelType,
-    MetadataItemType
-)
 from ocrd_modelfactory import page_from_file
 
 from ocrd import Processor
@@ -168,19 +164,9 @@ def process(self):
                 num_page_id = int(page_id.strip(page_id.strip("0123456789")))
             LOG.info("INPUT FILE %i / %s", n, page_id)
             pcgts = page_from_file(self.workspace.download_file(input_file))
+            self.add_metadata(pcgts)
             page = pcgts.get_Page()
             ptype = page.get_type()
-            metadata = pcgts.get_Metadata() # ensured by from_file()
-            metadata.add_MetadataItem(
-                MetadataItemType(type_="processingStep",
-                                 name=self.ocrd_tool['steps'][0],
-                                 value=TOOL,
-                                 Labels=[LabelsType(
-                                     externalModel="ocrd-tool",
-                                     externalId="parameters",
-                                     Label=[LabelType(type_=name,
-                                                      value=self.parameter[name])
-                                            for name in self.parameter])]))
 
             page_image, page_coords, page_image_info = self.workspace.image_from_page(
                 page, page_id,
                 feature_filter='binarized',
diff --git a/ocrd_segment/extract_regions.py b/ocrd_segment/extract_regions.py
index 118a1d6..b3deb92 100644
--- a/ocrd_segment/extract_regions.py
+++ b/ocrd_segment/extract_regions.py
@@ -10,10 +10,6 @@
     polygon_from_points,
     MIME_TO_EXT
 )
-from ocrd_models.ocrd_page import (
-    LabelsType, LabelType,
-    MetadataItemType
-)
 from ocrd_modelfactory import page_from_file
 
 from ocrd import Processor
@@ -72,18 +68,8 @@ def process(self):
             page_id = input_file.pageId or input_file.ID
             LOG.info("INPUT FILE %i / %s", n, page_id)
             pcgts = page_from_file(self.workspace.download_file(input_file))
+            self.add_metadata(pcgts)
             page = pcgts.get_Page()
-            metadata = pcgts.get_Metadata() # ensured by from_file()
-            metadata.add_MetadataItem(
-                MetadataItemType(type_="processingStep",
-                                 name=self.ocrd_tool['steps'][0],
-                                 value=TOOL,
-                                 Labels=[LabelsType(
-                                     externalModel="ocrd-tool",
-                                     externalId="parameters",
-                                     Label=[LabelType(type_=name,
-                                                      value=self.parameter[name])
-                                            for name in self.parameter])]))
 
             page_image, page_coords, page_image_info = self.workspace.image_from_page(
                 page, page_id, transparency=self.parameter['transparency'])
diff --git a/ocrd_segment/import_coco_segmentation.py b/ocrd_segment/import_coco_segmentation.py
index 332a765..b3fff4b 100644
--- a/ocrd_segment/import_coco_segmentation.py
+++ b/ocrd_segment/import_coco_segmentation.py
@@ -17,8 +17,6 @@
 # pragma pylint: disable=unused-import
 # (region types will be referenced indirectly via globals())
 from ocrd_models.ocrd_page import (
-    MetadataItemType,
-    LabelsType, LabelType,
     CoordsType,
     TextRegionType,
     ImageRegionType,
@@ -138,21 +136,9 @@ def process(self):
                 num_page_id = int(page_id.strip(page_id.strip("0123456789")))
LOG.info("INPUT FILE %i / %s", n, page_id) pcgts = page_from_file(self.workspace.download_file(input_file)) + self.add_metadata(pcgts) page = pcgts.get_Page() - # add metadata about this operation and its runtime parameters: - metadata = pcgts.get_Metadata() # ensured by from_file() - metadata.add_MetadataItem( - MetadataItemType(type_="processingStep", - name=self.ocrd_tool['steps'][0], - value=TOOL, - Labels=[LabelsType( - externalModel="ocrd-tool", - externalId="parameters", - Label=[LabelType(type_=name, - value=self.parameter[name]) - for name in self.parameter.keys()])])) - # find COCO image if page.imageFilename in images_by_filename: image = images_by_filename[page.imageFilename] diff --git a/ocrd_segment/import_image_segmentation.py b/ocrd_segment/import_image_segmentation.py index dda1f65..16d9d67 100644 --- a/ocrd_segment/import_image_segmentation.py +++ b/ocrd_segment/import_image_segmentation.py @@ -18,8 +18,6 @@ # pragma pylint: disable=unused-import # (region types will be referenced indirectly via globals()) from ocrd_models.ocrd_page import ( - MetadataItemType, - LabelsType, LabelType, CoordsType, TextRegionType, ImageRegionType, @@ -89,21 +87,9 @@ def process(self): input_file, segmentation_file = ift LOG.info("processing page %s", input_file.pageId) pcgts = page_from_file(self.workspace.download_file(input_file)) + self.add_metadata(pcgts) page = pcgts.get_Page() - # add metadata about this operation and its runtime parameters: - metadata = pcgts.get_Metadata() # ensured by from_file() - metadata.add_MetadataItem( - MetadataItemType(type_="processingStep", - name=self.ocrd_tool['steps'][0], - value=TOOL, - Labels=[LabelsType( - externalModel="ocrd-tool", - externalId="parameters", - Label=[LabelType(type_=name, - value=self.parameter[name]) - for name in self.parameter.keys()])])) - # import mask image segmentation_filename = self.workspace.download_file(segmentation_file).local_filename with pushd_popd(self.workspace.directory): diff --git a/ocrd_segment/replace_original.py b/ocrd_segment/replace_original.py index 2b07b59..2ff80ba 100644 --- a/ocrd_segment/replace_original.py +++ b/ocrd_segment/replace_original.py @@ -11,8 +11,6 @@ MIMETYPE_PAGE ) from ocrd_models.ocrd_page import ( - LabelsType, LabelType, - MetadataItemType, TextRegionType, to_xml ) @@ -57,18 +55,8 @@ def process(self): page_id = input_file.pageId or input_file.ID LOG.info("INPUT FILE %i / %s", n, page_id) pcgts = page_from_file(self.workspace.download_file(input_file)) + self.add_metadata(pcgts) page = pcgts.get_Page() - metadata = pcgts.get_Metadata() # ensured by from_file() - metadata.add_MetadataItem( - MetadataItemType(type_="processingStep", - name=self.ocrd_tool['steps'][0], - value=TOOL, - Labels=[LabelsType( - externalModel="ocrd-tool", - externalId="parameters", - Label=[LabelType(type_=name, - value=self.parameter[name]) - for name in self.parameter])])) page_image, page_coords, page_image_info = self.workspace.image_from_page( page, page_id, feature_filter=feature_filter, From 24d26cd96b4cf0e67f769325a9cb958532f46c83 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Mon, 14 Sep 2020 16:45:25 +0200 Subject: [PATCH 12/12] :package: 0.1.1 --- CHANGELOG.md | 11 +++++++++++ ocrd_segment/ocrd-tool.json | 2 +- 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 8b730b6..ad1447e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,17 @@ Versioned according to [Semantic Versioning](http://semver.org/). 
 
 ## Unreleased
 
+## [0.1.1] - 2020-09-14
+
+Changed:
+
+  * repair: traverse all text regions recursively
+
+Fixed:
+
+  * repair: be robust against invalid input polygons
+  * repair: be careful to make valid output polygons
+
 ## [0.1.0] - 2020-08-21
 
 Changed:
diff --git a/ocrd_segment/ocrd-tool.json b/ocrd_segment/ocrd-tool.json
index 29bd8c3..42ad62f 100644
--- a/ocrd_segment/ocrd-tool.json
+++ b/ocrd_segment/ocrd-tool.json
@@ -1,5 +1,5 @@
 {
-    "version": "0.1.0",
+    "version": "0.1.1",
     "git_url": "https://github.com/OCR-D/ocrd_segment",
     "tools": {
         "ocrd-segment-repair": {