Skip to content

Commit

Permalink
more ignoretext and line merging upgrades
Browse files Browse the repository at this point in the history
  • Loading branch information
iammosespaulr committed Nov 26, 2024
1 parent 28816ef commit eda0738
Show file tree
Hide file tree
Showing 2 changed files with 15 additions and 7 deletions.
3 changes: 2 additions & 1 deletion marker/processors/ignoretext.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,8 @@ class IgnoreTextProcessor(BaseProcessor):
"""
block_types = (
BlockTypes.Text, BlockTypes.PageHeader,
- BlockTypes.PageFooter, BlockTypes.SectionHeader
+ BlockTypes.PageFooter, BlockTypes.SectionHeader,
+ BlockTypes.TextInlineMath
)
common_element_threshold = .20
common_element_min_blocks = 3
Expand Down
19 changes: 13 additions & 6 deletions marker/processors/text.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,11 @@ def __call__(self, document: Document):

column_break, page_break = False, False
next_block = page.get_next_block(block)
- if next_block is not None: # we check for a column break
+
+ next_block_exists_and_isnt_last_page_header = next_block is not None and \
+ not (next_block.block_type == BlockTypes.PageHeader and page.structure[-1] == next_block.id)
+ if next_block_exists_and_isnt_last_page_header: # next block exists and it's not the last page header
+ # we check for a column break
column_break = (
math.floor(next_block.polygon.y_start) <= math.floor(block.polygon.y_start) and
next_block.polygon.x_start > (block.polygon.x_end + column_gap)
Expand Down Expand Up @@ -67,13 +71,16 @@ def __call__(self, document: Document):
for next_page_block_id in next_page.structure:
if next_page_block_id.block_type in [BlockTypes.PageHeader, BlockTypes.PageFooter]:
continue # skip headers and footers
- if next_page_block_id.block_type not in self.block_types:
- break # we found a non-text block, so we can stop looking

- # we have our text_block
+ # we have our block
  next_page_block = next_page.get_block(next_page_block_id)
- if next_page_block.structure is None:
- break # This is odd though, why do we have text blocks with no structure?
+ if next_page_block.ignore_for_output:
+ continue # skip ignored blocks
+
+ if not (next_page_block.structure is not None and \
+ next_page_block.block_type in self.block_types):
+ # we found a non-text block or an empty text block, so we can stop looking
+ break

new_block_lines = next_page_block.structure_blocks(document)

Expand Down

0 comments on commit eda0738

Please sign in to comment.