From a821f3d41cbae928a237b3b6ec03ed7740163155 Mon Sep 17 00:00:00 2001 From: Vik Paruchuri Date: Wed, 27 Nov 2024 10:31:06 -0500 Subject: [PATCH] Zero height box fix --- marker/providers/pdf.py | 6 ++++-- marker/renderers/markdown.py | 8 ++++++++ marker/schema/blocks/equation.py | 2 +- marker/schema/groups/page.py | 3 ++- marker/schema/polygon.py | 6 +++++- marker/schema/text/span.py | 1 - 6 files changed, 20 insertions(+), 6 deletions(-) diff --git a/marker/providers/pdf.py b/marker/providers/pdf.py index c0a0dff7..079ff6ee 100644 --- a/marker/providers/pdf.py +++ b/marker/providers/pdf.py @@ -129,9 +129,10 @@ def pdftext_extraction(self) -> ProviderPageLines: font_name = span["font"]["name"] or "Unknown" font_weight = span["font"]["weight"] or 0 font_size = span["font"]["size"] or 0 + polygon = PolygonBox.from_bbox(span["bbox"], ensure_nonzero_area=True) spans.append( SpanClass( - polygon=PolygonBox.from_bbox(span["bbox"]), + polygon=polygon, text=span["text"], font=font_name, font_weight=font_weight, @@ -143,9 +144,10 @@ def pdftext_extraction(self) -> ProviderPageLines: text_extraction_method="pdftext" ) ) + polygon = PolygonBox.from_bbox(line["bbox"], ensure_nonzero_area=True) lines.append( ProviderOutput( - line=LineClass(polygon=PolygonBox.from_bbox(line["bbox"]), page_id=page_id), + line=LineClass(polygon=polygon, page_id=page_id), spans=spans ) ) diff --git a/marker/renderers/markdown.py b/marker/renderers/markdown.py index edb1ead5..0e0f2f80 100644 --- a/marker/renderers/markdown.py +++ b/marker/renderers/markdown.py @@ -1,3 +1,5 @@ +import re + import regex from markdownify import MarkdownConverter from pydantic import BaseModel @@ -5,6 +7,11 @@ from marker.renderers.html import HTMLRenderer from marker.schema.document import Document +def cleanup_text(full_text): + full_text = re.sub(r'\n{3,}', '\n\n', full_text) + full_text = re.sub(r'(\n\s){3,}', '\n\n', full_text) + return full_text + class Markdownify(MarkdownConverter): def __init__(self, paginate_output, page_separator, **kwargs): @@ -55,6 +62,7 @@ def __call__(self, document: Document) -> MarkdownOutput: sup_symbol="", ) markdown = md_cls.convert(full_html) + markdown = cleanup_text(markdown) return MarkdownOutput( markdown=markdown, images=images, diff --git a/marker/schema/blocks/equation.py b/marker/schema/blocks/equation.py index ef320839..08caf707 100644 --- a/marker/schema/blocks/equation.py +++ b/marker/schema/blocks/equation.py @@ -10,7 +10,7 @@ class Equation(Block): def assemble_html(self, child_blocks, parent_structure=None): if self.latex: - return f"

{html.escape(self.latex)}

" + return f"\n

{html.escape(self.latex)}

\n" else: template = super().assemble_html(child_blocks, parent_structure) return f"

{template}

" diff --git a/marker/schema/groups/page.py b/marker/schema/groups/page.py index 055d7590..1f216612 100644 --- a/marker/schema/groups/page.py +++ b/marker/schema/groups/page.py @@ -133,6 +133,7 @@ def identify_missing_blocks( else: new_blocks.append(new_block) new_block = [(line_idx, provider_outputs[line_idx])] + assigned_line_idxs.add(line_idx) if new_block: new_blocks.append(new_block) @@ -200,7 +201,7 @@ def merge_blocks( assigned_line_idxs = set() block_lines = defaultdict(list) for line_idx, provider_output in enumerate(provider_outputs): - if line_idx in max_intersections and max_intersections[line_idx][0] > 0.0: + if line_idx in max_intersections: block_id = max_intersections[line_idx][1] block_lines[block_id].append((line_idx, provider_output)) assigned_line_idxs.add(line_idx) diff --git a/marker/schema/polygon.py b/marker/schema/polygon.py index 0cd53074..494d7473 100644 --- a/marker/schema/polygon.py +++ b/marker/schema/polygon.py @@ -184,5 +184,9 @@ def merge(self, others: List[PolygonBox]) -> PolygonBox: return PolygonBox(polygon=corners) @classmethod - def from_bbox(cls, bbox: List[float]): + def from_bbox(cls, bbox: List[float], ensure_nonzero_area=False): + if ensure_nonzero_area: + bbox = list(bbox) + bbox[2] = max(bbox[2], bbox[0] + 1) + bbox[3] = max(bbox[3], bbox[1] + 1) return cls(polygon=[[bbox[0], bbox[1]], [bbox[2], bbox[1]], [bbox[2], bbox[3]], [bbox[0], bbox[3]]]) diff --git a/marker/schema/text/span.py b/marker/schema/text/span.py index cf9b77ca..f40ecca5 100644 --- a/marker/schema/text/span.py +++ b/marker/schema/text/span.py @@ -9,7 +9,6 @@ def cleanup_text(full_text): - full_text = re.sub(r'\n{3,}', '\n\n', full_text) full_text = re.sub(r'(\n\s){3,}', '\n\n', full_text) full_text = full_text.replace('\xa0', ' ') # Replace non-breaking spaces return full_text