Skip to content

Commit

Permalink
filter out width 0 lines from heuristics
Browse files Browse the repository at this point in the history
  • Loading branch information
iammosespaulr committed Nov 26, 2024
1 parent b417dc6 commit 18c64b2
Showing 1 changed file with 4 additions and 3 deletions.
7 changes: 4 additions & 3 deletions marker/processors/text.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,20 +19,21 @@ class TextProcessor(BaseProcessor):
Default is 0.02.
"""
block_types = (BlockTypes.Text, BlockTypes.TextInlineMath)
column_gap_ratio = 0.02 # column gaps are at least 2% of the page width
column_gap_ratio = 0.02 # column gaps are at least 2% of the current column width

def __init__(self, config):
super().__init__(config)

def __call__(self, document: Document):
for page in document.pages:
column_gap = page.polygon.width * self.column_gap_ratio
for block in page.contained_blocks(document, self.block_types):
if block.structure is None:
continue

if not len(block.structure) >= 2: # Skip single lines
continue

column_gap = block.polygon.width * self.column_gap_ratio

column_break, page_break = False, False
next_block = page.get_next_block(block)
Expand Down Expand Up @@ -93,7 +94,7 @@ def __call__(self, document: Document):
min_x = math.ceil(min([l.polygon.x_start for l in new_block_lines]))
next_block_starts_indented = new_block_lines[0].polygon.x_start > min_x

lines: List[Line] = block.structure_blocks(document)
lines: List[Line] = [l for l in block.structure_blocks(document) if l.polygon.width > 0]
max_x = math.floor(max([l.polygon.x_end for l in lines]))
last_line_is_full_width = lines[-1].polygon.x_end >= max_x

Expand Down

0 comments on commit 18c64b2

Please sign in to comment.