Skip to content

Commit

Permalink
Merge pull request #402 from VikParuchuri/dev-mose/list-joining-misc
Browse files Browse the repository at this point in the history
Add `ListGroup` joining processor and refactor `Text` joining processor
  • Loading branch information
VikParuchuri authored Nov 30, 2024
2 parents 9602cb4 + 9d0f0eb commit 1047d9c
Show file tree
Hide file tree
Showing 9 changed files with 115 additions and 71 deletions.
2 changes: 2 additions & 0 deletions marker/converters/pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
from marker.processors.footnote import FootnoteProcessor
from marker.processors.ignoretext import IgnoreTextProcessor
from marker.processors.line_numbers import LineNumbersProcessor
from marker.processors.list import ListProcessor
from marker.processors.page_header import PageHeaderProcessor
from marker.processors.sectionheader import SectionHeaderProcessor
from marker.processors.table import TableProcessor
Expand Down Expand Up @@ -58,6 +59,7 @@ def __init__(self, artifact_dict: Dict[str, Any], processor_list: List[str] | No
TableProcessor,
SectionHeaderProcessor,
TextProcessor,
ListProcessor,
CodeProcessor,
DocumentTOCProcessor,
IgnoreTextProcessor,
Expand Down
43 changes: 43 additions & 0 deletions marker/processors/list.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
import math

from marker.processors import BaseProcessor
from marker.schema import BlockTypes
from marker.schema.document import Document


class ListProcessor(BaseProcessor):
    """
    Flags list groups whose content carries on in the following list group.

    For every ``ListGroup`` block, the next block in the document is looked
    up (page headers and footers are skipped). When that block is also a
    usable list group, ``has_continuation`` is set on the current block
    according to where the follower sits: on the same page it must start at
    or before the current block's ``y_end`` (treated as a column break); on
    a different page it must start in the first quadrant of its page.
    """
    block_types = (BlockTypes.ListGroup,)
    ignored_block_types = (BlockTypes.PageHeader, BlockTypes.PageFooter)

    def __init__(self, config):
        super().__init__(config)

    def __call__(self, document: Document):
        """Set ``has_continuation`` on each list group that has a list-group follower."""
        for page in document.pages:
            for block in page.contained_blocks(document, self.block_types):
                follower = document.get_next_block(block, self.ignored_block_types)

                # Only a structured, rendered list group can continue this one.
                if follower is None or follower.block_type not in self.block_types:
                    continue
                if follower.structure is None or follower.ignore_for_output:
                    continue

                if follower.page_id == block.page_id:
                    # Same page: the follower starting no later (vertically) than
                    # this block ends is read as a jump to another column.
                    continues = follower.polygon.y_start <= block.polygon.y_end
                else:
                    # Different page: require the follower to begin within the
                    # first half of the page both horizontally and vertically.
                    follower_page = document.get_page(follower.page_id)
                    half_width = follower_page.polygon.width // 2
                    continues = (follower.polygon.x_start < half_width) and \
                                (follower.polygon.y_start < follower_page.polygon.height // 2)

                block.has_continuation = continues
76 changes: 24 additions & 52 deletions marker/processors/text.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,70 +33,42 @@ def __call__(self, document: Document):

if not len(block.structure) >= 2: # Skip single lines
continue


next_block = document.get_next_block(block, self.ignored_block_types)
if next_block is None: # we've reached the end of the document
continue
if next_block.block_type not in self.block_types:
continue # we found a non-text block
if next_block.structure is None:
continue # This is odd though, why do we have text blocks with no structure?
if next_block.ignore_for_output:
continue # skip ignored blocks

column_gap = block.polygon.width * self.column_gap_ratio

column_break, page_break = False, False
next_block = None

for next_block_id in page.structure[page.structure.index(block.id) + 1:]:
if next_block_id.block_type in self.ignored_block_types:
continue
next_block = page.get_block(next_block_id)
break
next_block_starts_indented = True
next_block_in_first_quadrant = False
last_line_is_full_width = False
last_line_is_hyphentated = False
new_block_lines = []

if next_block is not None: # next block exists
if next_block.page_id == block.page_id: # block on the same page
# we check for a column break
column_break = (
math.floor(next_block.polygon.y_start) <= math.floor(block.polygon.y_start) and
math.floor(next_block.polygon.y_start) <= math.ceil(block.polygon.y_start) and
next_block.polygon.x_start > (block.polygon.x_end + column_gap)
)
else: # It's a page break since we don't have a next block in the page
else:
page_break = True
next_page = document.get_page(next_block.page_id)
next_block_in_first_quadrant = (next_block.polygon.x_start < next_page.polygon.width // 2) and \
(next_block.polygon.y_start < next_page.polygon.height // 2)

if not (column_break or page_break):
continue

next_block_starts_indented = True
next_block_in_first_quadrant = False
last_line_is_full_width = False
last_line_is_hyphentated = False
new_block_lines = []

if column_break:
if next_block.block_type not in self.block_types:
continue
if next_block.structure is None: # This is odd though, why do we have text blocks with no structure?
continue

new_block_lines = next_block.structure_blocks(document)
else: # page break
next_page = document.get_next_page(page)
if next_page is None:
continue # we're on the last page, so we don't worry about merging

# Go through the next page only
for next_page_block_id in next_page.structure:
if next_page_block_id.block_type in self.ignored_block_types:
continue # skip headers and footers

# we have our block
next_page_block = next_page.get_block(next_page_block_id)
if next_page_block.ignore_for_output:
continue # skip ignored blocks

if not (next_page_block.structure is not None and \
next_page_block.block_type in self.block_types):
# we found a non-text block or an empty text block, so we can stop looking
break

new_block_lines = next_page_block.structure_blocks(document)

next_block_in_first_quadrant = (next_page_block.polygon.x_start < next_page.polygon.width // 2) and \
(next_page_block.polygon.y_start < next_page.polygon.height // 2)
break
else:
continue # we didn't break anywhere so we continue

new_block_lines = next_block.structure_blocks(document)

# we check for next_block indentation
if len(new_block_lines):
Expand Down
11 changes: 8 additions & 3 deletions marker/renderers/markdown.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from pydantic import BaseModel

from marker.renderers.html import HTMLRenderer
from marker.schema import BlockTypes
from marker.schema.document import Document


Expand Down Expand Up @@ -33,9 +34,13 @@ def convert_p(self, el, text, *args):
hyphens = r'-—¬'
has_continuation = el.has_attr('class') and 'has-continuation' in el['class']
if has_continuation:
if regex.compile(rf'.*[\p{{Ll}}|\d][{hyphens}]\s?$', regex.DOTALL).match(text): # handle hypenation across pages
return regex.split(rf"[{hyphens}]\s?$", text)[0]
return f"{text} "
block_type = BlockTypes[el['block-type']]
if block_type in [BlockTypes.TextInlineMath, BlockTypes.Text]:
if regex.compile(rf'.*[\p{{Ll}}|\d][{hyphens}]\s?$', regex.DOTALL).match(text): # handle hypenation across pages
return regex.split(rf"[{hyphens}]\s?$", text)[0]
return f"{text} "
if block_type == BlockTypes.ListGroup:
return f"{text}"
return f"{text}\n\n" if text else "" # default convert_p behavior


Expand Down
4 changes: 2 additions & 2 deletions marker/schema/blocks/inlinemath.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ def assemble_html(self, child_blocks, parent_structure):
template = super().assemble_html(child_blocks, parent_structure)
template = template.replace("\n", " ")

class_attr = ""
class_attr = f" block-type='{self.block_type}'"
if self.has_continuation:
class_attr = " class='has-continuation'"
class_attr += " class='has-continuation'"
return f"<p{class_attr}>{template}</p>"
2 changes: 1 addition & 1 deletion marker/schema/blocks/text.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ def assemble_html(self, child_blocks, parent_structure):
template = super().assemble_html(child_blocks, parent_structure)
template = template.replace("\n", " ")

class_attr = ""
class_attr = f" block-type='{self.block_type}'"
if self.has_continuation:
class_attr += " class='has-continuation'"
return f"<p{class_attr}>{template}</p>"
20 changes: 14 additions & 6 deletions marker/schema/document.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,15 +42,23 @@ def get_page(self, page_id):
return page
return None

def get_next_block(self, block: Block):
def get_next_block(self, block: Block, ignored_block_types: List[BlockTypes] = None):
if ignored_block_types is None:
ignored_block_types = []
next_block = None

# Try to find the next block in the current page
page = self.get_page(block.page_id)
next_block = page.get_next_block(block)
next_block = page.get_next_block(block, ignored_block_types)
if next_block:
return next_block
next_page = self.get_next_page(page)
if not next_page:
return None
return next_page.get_block(next_page.structure[0])

# If no block found, search subsequent pages
for page in self.pages[self.pages.index(page) + 1:]:
next_block = page.get_next_block(None, ignored_block_types)
if next_block:
return next_block
return None

def get_next_page(self, page: PageGroup):
page_idx = self.pages.index(page)
Expand Down
7 changes: 6 additions & 1 deletion marker/schema/groups/list.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,12 @@

class ListGroup(Group):
block_type: BlockTypes = BlockTypes.ListGroup
has_continuation: bool = False

def assemble_html(self, child_blocks, parent_structure):
template = super().assemble_html(child_blocks, parent_structure)
return f"<p><ul>{template}</ul></p>"

class_attr = f" block-type='{self.block_type}'"
if self.has_continuation:
class_attr += " class='has-continuation'"
return f"<p{class_attr}><ul>{template}</ul></p>"
21 changes: 15 additions & 6 deletions marker/schema/groups/page.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from collections import defaultdict
from typing import Any, Dict, List, Sequence, Tuple, Union
from typing import Any, Dict, List, Optional, Sequence, Tuple, Union

from PIL import Image

Expand Down Expand Up @@ -34,11 +34,20 @@ def add_child(self, block: Block):
else:
self.children.append(block)

def get_next_block(self, block: Block):
block_idx = self.structure.index(block.id)
if block_idx + 1 < len(self.structure):
return self.get_block(self.structure[block_idx + 1])
return None
def get_next_block(self, block: Optional[Block] = None, ignored_block_types: Optional[List[BlockTypes]] = None):
    """
    Return the first block after ``block`` in this page's structure whose
    type is not ignored.

    When ``block`` is ``None`` the search starts at the beginning of the
    structure. ``ignored_block_types`` defaults to ignoring nothing.
    Returns ``None`` when no remaining entry qualifies.
    """
    skipped_types = [] if ignored_block_types is None else ignored_block_types

    # Start right after the given block, or at the top of the page.
    start = 0 if block is None else self.structure.index(block.id) + 1

    candidate_id = next(
        (entry for entry in self.structure[start:] if entry.block_type not in skipped_types),
        None,
    )
    if candidate_id is None:
        return None  # No valid next block found
    return self.get_block(candidate_id)

def get_prev_block(self, block: Block):
block_idx = self.structure.index(block.id)
Expand Down

0 comments on commit 1047d9c

Please sign in to comment.