Skip to content

Commit

Permalink
Zero height box fix
Browse files: browse the repository at this point in the history
  • Loading branch information
VikParuchuri committed Nov 27, 2024
1 parent 58dfbe9 commit a821f3d
Show file tree
Hide file tree
Showing 6 changed files with 20 additions and 6 deletions.
6 changes: 4 additions & 2 deletions marker/providers/pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -129,9 +129,10 @@ def pdftext_extraction(self) -> ProviderPageLines:
font_name = span["font"]["name"] or "Unknown"
font_weight = span["font"]["weight"] or 0
font_size = span["font"]["size"] or 0
polygon = PolygonBox.from_bbox(span["bbox"], ensure_nonzero_area=True)
spans.append(
SpanClass(
polygon=PolygonBox.from_bbox(span["bbox"]),
polygon=polygon,
text=span["text"],
font=font_name,
font_weight=font_weight,
Expand All @@ -143,9 +144,10 @@ def pdftext_extraction(self) -> ProviderPageLines:
text_extraction_method="pdftext"
)
)
polygon = PolygonBox.from_bbox(line["bbox"], ensure_nonzero_area=True)
lines.append(
ProviderOutput(
line=LineClass(polygon=PolygonBox.from_bbox(line["bbox"]), page_id=page_id),
line=LineClass(polygon=polygon, page_id=page_id),
spans=spans
)
)
Expand Down
8 changes: 8 additions & 0 deletions marker/renderers/markdown.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,17 @@
import re

import regex
from markdownify import MarkdownConverter
from pydantic import BaseModel

from marker.renderers.html import HTMLRenderer
from marker.schema.document import Document

def cleanup_text(full_text):
    """Collapse runs of blank lines in *full_text* down to one blank line.

    Three passes are applied in order:

    1. Three or more consecutive newlines become exactly two.
    2. Three or more consecutive "newline + one whitespace character"
       pairs become exactly two newlines.
    3. Three or more consecutive newlines are collapsed again.

    The third pass is needed because pass 2 can leave its two-newline
    replacement directly in front of another newline (one that was not
    followed by whitespace and so was not part of the match), which
    would otherwise yield three newlines in a row.

    Args:
        full_text: The text to clean up.

    Returns:
        The text with the excess blank lines collapsed.
    """
    full_text = re.sub(r'\n{3,}', '\n\n', full_text)
    full_text = re.sub(r'(\n\s){3,}', '\n\n', full_text)
    # Pass 2 may have produced a new run of 3+ newlines; collapse it too.
    full_text = re.sub(r'\n{3,}', '\n\n', full_text)
    return full_text


class Markdownify(MarkdownConverter):
def __init__(self, paginate_output, page_separator, **kwargs):
Expand Down Expand Up @@ -55,6 +62,7 @@ def __call__(self, document: Document) -> MarkdownOutput:
sup_symbol="<sup>",
)
markdown = md_cls.convert(full_html)
markdown = cleanup_text(markdown)
return MarkdownOutput(
markdown=markdown,
images=images,
Expand Down
2 changes: 1 addition & 1 deletion marker/schema/blocks/equation.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ class Equation(Block):

def assemble_html(self, child_blocks, parent_structure=None):
if self.latex:
return f"<p><math>{html.escape(self.latex)}</math></p>"
return f"\n<p><math>{html.escape(self.latex)}</math></p>\n"
else:
template = super().assemble_html(child_blocks, parent_structure)
return f"<p>{template}</p>"
3 changes: 2 additions & 1 deletion marker/schema/groups/page.py
Original file line number Diff line number Diff line change
Expand Up @@ -133,6 +133,7 @@ def identify_missing_blocks(
else:
new_blocks.append(new_block)
new_block = [(line_idx, provider_outputs[line_idx])]
assigned_line_idxs.add(line_idx)
if new_block:
new_blocks.append(new_block)

Expand Down Expand Up @@ -200,7 +201,7 @@ def merge_blocks(
assigned_line_idxs = set()
block_lines = defaultdict(list)
for line_idx, provider_output in enumerate(provider_outputs):
if line_idx in max_intersections and max_intersections[line_idx][0] > 0.0:
if line_idx in max_intersections:
block_id = max_intersections[line_idx][1]
block_lines[block_id].append((line_idx, provider_output))
assigned_line_idxs.add(line_idx)
Expand Down
6 changes: 5 additions & 1 deletion marker/schema/polygon.py
Original file line number Diff line number Diff line change
Expand Up @@ -184,5 +184,9 @@ def merge(self, others: List[PolygonBox]) -> PolygonBox:
return PolygonBox(polygon=corners)

@classmethod
def from_bbox(cls, bbox: List[float]):
def from_bbox(cls, bbox: List[float], ensure_nonzero_area=False):
    """Build an instance from a bounding box given as four coordinates.

    The first four entries of *bbox* are read as two opposite corners,
    (bbox[0], bbox[1]) and (bbox[2], bbox[3]). The result is constructed
    as ``cls(polygon=...)`` with four corner points in the order
    (0, 1), (2, 1), (2, 3), (0, 3).

    Args:
        bbox: The bounding box coordinates.
        ensure_nonzero_area: When true, work on a list copy of *bbox* and
            raise bbox[2] to at least bbox[0] + 1 and bbox[3] to at least
            bbox[1] + 1, so the box spans at least one unit on each axis.

    Returns:
        Whatever ``cls(polygon=...)`` returns for the four corners.
    """
    # Copy only when adjusting, so the caller's sequence is never modified.
    coords = list(bbox) if ensure_nonzero_area else bbox
    left, top, right, bottom = coords[0], coords[1], coords[2], coords[3]
    if ensure_nonzero_area:
        right = max(right, left + 1)
        bottom = max(bottom, top + 1)
    corners = [
        [left, top],
        [right, top],
        [right, bottom],
        [left, bottom],
    ]
    return cls(polygon=corners)
1 change: 0 additions & 1 deletion marker/schema/text/span.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@


def cleanup_text(full_text):
full_text = re.sub(r'\n{3,}', '\n\n', full_text)
full_text = re.sub(r'(\n\s){3,}', '\n\n', full_text)
full_text = full_text.replace('\xa0', ' ') # Replace non-breaking spaces
return full_text
Expand Down

0 comments on commit a821f3d

Please sign in to comment.