diff --git a/marker/converters/pdf.py b/marker/converters/pdf.py index 978a8651..5bb7f1cc 100644 --- a/marker/converters/pdf.py +++ b/marker/converters/pdf.py @@ -17,6 +17,7 @@ from marker.processors.footnote import FootnoteProcessor from marker.processors.ignoretext import IgnoreTextProcessor from marker.processors.line_numbers import LineNumbersProcessor +from marker.processors.list import ListProcessor from marker.processors.page_header import PageHeaderProcessor from marker.processors.sectionheader import SectionHeaderProcessor from marker.processors.table import TableProcessor @@ -58,6 +59,7 @@ def __init__(self, artifact_dict: Dict[str, Any], processor_list: List[str] | No TableProcessor, SectionHeaderProcessor, TextProcessor, + ListProcessor, CodeProcessor, DocumentTOCProcessor, IgnoreTextProcessor, diff --git a/marker/processors/list.py b/marker/processors/list.py new file mode 100644 index 00000000..56baa6cf --- /dev/null +++ b/marker/processors/list.py @@ -0,0 +1,43 @@ +import math + +from marker.processors import BaseProcessor +from marker.schema import BlockTypes +from marker.schema.document import Document + + +class ListProcessor(BaseProcessor): + """ + A processor for merging lists across pages and columns + """ + block_types = (BlockTypes.ListGroup,) + ignored_block_types = (BlockTypes.PageHeader, BlockTypes.PageFooter) + + def __init__(self, config): + super().__init__(config) + + def __call__(self, document: Document): + for page in document.pages: + for block in page.contained_blocks(document, self.block_types): + next_block = document.get_next_block(block, self.ignored_block_types) + if next_block is None: + continue + if next_block.block_type not in self.block_types: + continue + if next_block.structure is None: + continue + if next_block.ignore_for_output: + continue + + column_break, page_break = False, False + next_block_in_first_quadrant = False + + if next_block.page_id == block.page_id: # block on the same page + # we check for a column 
break + column_break = next_block.polygon.y_start <= block.polygon.y_end + else: + page_break = True + next_page = document.get_page(next_block.page_id) + next_block_in_first_quadrant = (next_block.polygon.x_start < next_page.polygon.width // 2) and \ + (next_block.polygon.y_start < next_page.polygon.height // 2) + + block.has_continuation = column_break or (page_break and next_block_in_first_quadrant) diff --git a/marker/processors/text.py b/marker/processors/text.py index c57487d2..e13d699b 100644 --- a/marker/processors/text.py +++ b/marker/processors/text.py @@ -33,70 +33,42 @@ def __call__(self, document: Document): if not len(block.structure) >= 2: # Skip single lines continue - + + next_block = document.get_next_block(block, self.ignored_block_types) + if next_block is None: # we've reached the end of the document + continue + if next_block.block_type not in self.block_types: + continue # we found a non-text block + if next_block.structure is None: + continue # This is odd though, why do we have text blocks with no structure? 
+ if next_block.ignore_for_output: + continue # skip ignored blocks + column_gap = block.polygon.width * self.column_gap_ratio column_break, page_break = False, False - next_block = None - - for next_block_id in page.structure[page.structure.index(block.id) + 1:]: - if next_block_id.block_type in self.ignored_block_types: - continue - next_block = page.get_block(next_block_id) - break + next_block_starts_indented = True + next_block_in_first_quadrant = False + last_line_is_full_width = False + last_line_is_hyphentated = False + new_block_lines = [] - if next_block is not None: # next block exists + if next_block.page_id == block.page_id: # block on the same page # we check for a column break column_break = ( - math.floor(next_block.polygon.y_start) <= math.floor(block.polygon.y_start) and + math.floor(next_block.polygon.y_start) <= math.ceil(block.polygon.y_start) and next_block.polygon.x_start > (block.polygon.x_end + column_gap) ) - else: # It's a page break since we don't have a next block in the page + else: page_break = True + next_page = document.get_page(next_block.page_id) + next_block_in_first_quadrant = (next_block.polygon.x_start < next_page.polygon.width // 2) and \ + (next_block.polygon.y_start < next_page.polygon.height // 2) if not (column_break or page_break): continue - - next_block_starts_indented = True - next_block_in_first_quadrant = False - last_line_is_full_width = False - last_line_is_hyphentated = False - new_block_lines = [] - - if column_break: - if next_block.block_type not in self.block_types: - continue - if next_block.structure is None: # This is odd though, why do we have text blocks with no structure? 
- continue - - new_block_lines = next_block.structure_blocks(document) - else: # page break - next_page = document.get_next_page(page) - if next_page is None: - continue # we're on the last page, so we don't worry about merging - - # Go through the next page only - for next_page_block_id in next_page.structure: - if next_page_block_id.block_type in self.ignored_block_types: - continue # skip headers and footers - - # we have our block - next_page_block = next_page.get_block(next_page_block_id) - if next_page_block.ignore_for_output: - continue # skip ignored blocks - - if not (next_page_block.structure is not None and \ - next_page_block.block_type in self.block_types): - # we found a non-text block or an empty text block, so we can stop looking - break - - new_block_lines = next_page_block.structure_blocks(document) - - next_block_in_first_quadrant = (next_page_block.polygon.x_start < next_page.polygon.width // 2) and \ - (next_page_block.polygon.y_start < next_page.polygon.height // 2) - break - else: - continue # we didn't break anywhere so we continue + + new_block_lines = next_block.structure_blocks(document) # we check for next_block indentation if len(new_block_lines): diff --git a/marker/renderers/markdown.py b/marker/renderers/markdown.py index 5cbc15bc..0cadaf16 100644 --- a/marker/renderers/markdown.py +++ b/marker/renderers/markdown.py @@ -5,6 +5,7 @@ from pydantic import BaseModel from marker.renderers.html import HTMLRenderer +from marker.schema import BlockTypes from marker.schema.document import Document @@ -33,9 +34,13 @@ def convert_p(self, el, text, *args): hyphens = r'-—¬' has_continuation = el.has_attr('class') and 'has-continuation' in el['class'] if has_continuation: - if regex.compile(rf'.*[\p{{Ll}}|\d][{hyphens}]\s?$', regex.DOTALL).match(text): # handle hypenation across pages - return regex.split(rf"[{hyphens}]\s?$", text)[0] - return f"{text} " + block_type = BlockTypes[el['block-type']] + if block_type in [BlockTypes.TextInlineMath, 
BlockTypes.Text]: + if regex.compile(rf'.*[\p{{Ll}}|\d][{hyphens}]\s?$', regex.DOTALL).match(text): # handle hypenation across pages + return regex.split(rf"[{hyphens}]\s?$", text)[0] + return f"{text} " + if block_type == BlockTypes.ListGroup: + return f"{text}" return f"{text}\n\n" if text else "" # default convert_p behavior diff --git a/marker/schema/blocks/inlinemath.py b/marker/schema/blocks/inlinemath.py index 99f46759..b0adbc6d 100644 --- a/marker/schema/blocks/inlinemath.py +++ b/marker/schema/blocks/inlinemath.py @@ -13,7 +13,7 @@ def assemble_html(self, child_blocks, parent_structure): template = super().assemble_html(child_blocks, parent_structure) template = template.replace("\n", " ") - class_attr = "" + class_attr = f" block-type='{self.block_type}'" if self.has_continuation: - class_attr = " class='has-continuation'" + class_attr += " class='has-continuation'" return f"<p{class_attr}>{template}</p>" diff --git a/marker/schema/blocks/text.py b/marker/schema/blocks/text.py index 89fba932..6a40407a 100644 --- a/marker/schema/blocks/text.py +++ b/marker/schema/blocks/text.py @@ -13,7 +13,7 @@ def assemble_html(self, child_blocks, parent_structure): template = super().assemble_html(child_blocks, parent_structure) template = template.replace("\n", " ") - class_attr = "" + class_attr = f" block-type='{self.block_type}'" if self.has_continuation: class_attr += " class='has-continuation'" return f"<p{class_attr}>{template}</p>" diff --git a/marker/schema/document.py b/marker/schema/document.py index cc718995..d7ca4c73 100644 --- a/marker/schema/document.py +++ b/marker/schema/document.py @@ -42,15 +42,23 @@ def get_page(self, page_id): return page return None - def get_next_block(self, block: Block): + def get_next_block(self, block: Block, ignored_block_types: List[BlockTypes] = None): + if ignored_block_types is None: + ignored_block_types = [] + next_block = None + + # Try to find the next block in the current page page = self.get_page(block.page_id) - next_block = page.get_next_block(block) + next_block = page.get_next_block(block, ignored_block_types) if next_block: return next_block - next_page = self.get_next_page(page) - if not next_page: - return None - return next_page.get_block(next_page.structure[0]) + + # If no block found, search subsequent pages + for page in self.pages[self.pages.index(page) + 1:]: + next_block = page.get_next_block(None, ignored_block_types) + if next_block: + return next_block + return None def get_next_page(self, page: PageGroup): page_idx = self.pages.index(page) diff --git a/marker/schema/groups/list.py b/marker/schema/groups/list.py index 0149211f..3adb47ff 100644 --- a/marker/schema/groups/list.py +++ b/marker/schema/groups/list.py @@ -4,7 +4,12 @@ class ListGroup(Group): block_type: BlockTypes = BlockTypes.ListGroup + has_continuation: bool = False def assemble_html(self, child_blocks, parent_structure): template = super().assemble_html(child_blocks, parent_structure) - return f"

" + + class_attr = f" block-type='{self.block_type}'" + if self.has_continuation: + class_attr += " class='has-continuation'" + return f"<p{class_attr}><ul>{template}</ul></p>" diff --git a/marker/schema/groups/page.py b/marker/schema/groups/page.py index a9f938fb..c00af1f5 100644 --- a/marker/schema/groups/page.py +++ b/marker/schema/groups/page.py @@ -1,5 +1,5 @@ from collections import defaultdict -from typing import Any, Dict, List, Sequence, Tuple, Union +from typing import Any, Dict, List, Optional, Sequence, Tuple, Union from PIL import Image @@ -34,11 +34,20 @@ def add_child(self, block: Block): else: self.children.append(block) - def get_next_block(self, block: Block): - block_idx = self.structure.index(block.id) - if block_idx + 1 < len(self.structure): - return self.get_block(self.structure[block_idx + 1]) - return None + def get_next_block(self, block: Optional[Block] = None, ignored_block_types: Optional[List[BlockTypes]] = None): + if ignored_block_types is None: + ignored_block_types = [] + + structure_idx = 0 + if block is not None: + structure_idx = self.structure.index(block.id) + 1 + + # Iterate over blocks following the given block + for next_block_id in self.structure[structure_idx:]: + if next_block_id.block_type not in ignored_block_types: + return self.get_block(next_block_id) + + return None # No valid next block found def get_prev_block(self, block: Block): block_idx = self.structure.index(block.id)