From a3d5ec297a4bbacc884120449351800325d1f1a6 Mon Sep 17 00:00:00 2001 From: Moses Paul R Date: Sat, 30 Nov 2024 15:41:39 +0000 Subject: [PATCH 1/6] refactor next block logic for reuse with list item processor --- marker/processors/text.py | 72 +++++++++++------------------------- marker/schema/document.py | 20 +++++++--- marker/schema/groups/page.py | 19 +++++++--- 3 files changed, 49 insertions(+), 62 deletions(-) diff --git a/marker/processors/text.py b/marker/processors/text.py index c57487d2..5165945b 100644 --- a/marker/processors/text.py +++ b/marker/processors/text.py @@ -37,66 +37,38 @@ def __call__(self, document: Document): column_gap = block.polygon.width * self.column_gap_ratio column_break, page_break = False, False - next_block = None - - for next_block_id in page.structure[page.structure.index(block.id) + 1:]: - if next_block_id.block_type in self.ignored_block_types: - continue - next_block = page.get_block(next_block_id) - break + next_block_starts_indented = True + next_block_in_first_quadrant = False + last_line_is_full_width = False + last_line_is_hyphentated = False + new_block_lines = [] - if next_block is not None: # next block exists + next_block = document.get_next_block(block, self.ignored_block_types) + if next_block is None: # we've reached the end of the document + continue + if next_block.block_type not in self.block_types: + continue # we found a non-text block + if next_block.structure is None: + continue # This is odd though, why do we have text blocks with no structure? + if next_block.ignore_for_output: + continue # skip ignored blocks + + if next_block.page_id == block.page_id: # block on the same page # we check for a column break column_break = ( math.floor(next_block.polygon.y_start) <= math.floor(block.polygon.y_start) and next_block.polygon.x_start > (block.polygon.x_end + column_gap) ) - else: # It's a page break since we don't have a next block in the page + else: page_break = True + next_page = document.get_page(next_block.page_id) + next_block_in_first_quadrant = (next_block.polygon.x_start < next_page.polygon.width // 2) and \ + (next_block.polygon.y_start < next_page.polygon.height // 2) if not (column_break or page_break): continue - - next_block_starts_indented = True - next_block_in_first_quadrant = False - last_line_is_full_width = False - last_line_is_hyphentated = False - new_block_lines = [] - - if column_break: - if next_block.block_type not in self.block_types: - continue - if next_block.structure is None: # This is odd though, why do we have text blocks with no structure? - continue - - new_block_lines = next_block.structure_blocks(document) - else: # page break - next_page = document.get_next_page(page) - if next_page is None: - continue # we're on the last page, so we don't worry about merging - - # Go through the next page only - for next_page_block_id in next_page.structure: - if next_page_block_id.block_type in self.ignored_block_types: - continue # skip headers and footers - - # we have our block - next_page_block = next_page.get_block(next_page_block_id) - if next_page_block.ignore_for_output: - continue # skip ignored blocks - - if not (next_page_block.structure is not None and \ - next_page_block.block_type in self.block_types): - # we found a non-text block or an empty text block, so we can stop looking - break - - new_block_lines = next_page_block.structure_blocks(document) - - next_block_in_first_quadrant = (next_page_block.polygon.x_start < next_page.polygon.width // 2) and \ - (next_page_block.polygon.y_start < next_page.polygon.height // 2) - break - else: - continue # we didn't break anywhere so we continue + + new_block_lines = next_block.structure_blocks(document) # we check for next_block indentation if len(new_block_lines): diff --git a/marker/schema/document.py b/marker/schema/document.py index cc718995..d7ca4c73 100644 --- a/marker/schema/document.py +++ b/marker/schema/document.py @@ -42,15 +42,23 @@ def get_page(self, page_id): return page return None - def get_next_block(self, block: Block): + def get_next_block(self, block: Block, ignored_block_types: List[BlockTypes] = None): + if ignored_block_types is None: + ignored_block_types = [] + next_block = None + + # Try to find the next block in the current page page = self.get_page(block.page_id) - next_block = page.get_next_block(block) + next_block = page.get_next_block(block, ignored_block_types) if next_block: return next_block - next_page = self.get_next_page(page) - if not next_page: - return None - return next_page.get_block(next_page.structure[0]) + + # If no block found, search subsequent pages + for page in self.pages[self.pages.index(page) + 1:]: + next_block = page.get_next_block(None, ignored_block_types) + if next_block: + return next_block + return None def get_next_page(self, page: PageGroup): page_idx = self.pages.index(page) diff --git a/marker/schema/groups/page.py b/marker/schema/groups/page.py index a9f938fb..a6d18459 100644 --- a/marker/schema/groups/page.py +++ b/marker/schema/groups/page.py @@ -1,5 +1,5 @@ from collections import defaultdict -from typing import Any, Dict, List, Sequence, Tuple, Union +from typing import Any, Dict, List, Optional, Sequence, Tuple, Union from PIL import Image @@ -34,11 +34,18 @@ def add_child(self, block: Block): else: self.children.append(block) - def get_next_block(self, block: Block): - block_idx = self.structure.index(block.id) - if block_idx + 1 < len(self.structure): - return self.get_block(self.structure[block_idx + 1]) - return None + def get_next_block(self, block: Optional[Block] = None, ignored_block_types: Optional[List[BlockTypes]] = None): + if ignored_block_types is None: + ignored_block_types = [] + if block is None: + return self.get_block(self.structure[0]) + + # Iterate over blocks following the given block + for next_block_id in self.structure[self.structure.index(block.id) + 1:]: + if next_block_id.block_type not in ignored_block_types: + return self.get_block(next_block_id) + + return None # No valid next block found def get_prev_block(self, block: Block): block_idx = self.structure.index(block.id) From 3a4dec180bac7be4a3e87f60d02d979fba5aa6e8 Mon Sep 17 00:00:00 2001 From: Moses Paul R Date: Sat, 30 Nov 2024 16:03:50 +0000 Subject: [PATCH 2/6] bugfix [skip ci] --- marker/schema/groups/page.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/marker/schema/groups/page.py b/marker/schema/groups/page.py index a6d18459..c00af1f5 100644 --- a/marker/schema/groups/page.py +++ b/marker/schema/groups/page.py @@ -37,11 +37,13 @@ def add_child(self, block: Block): def get_next_block(self, block: Optional[Block] = None, ignored_block_types: Optional[List[BlockTypes]] = None): if ignored_block_types is None: ignored_block_types = [] - if block is None: - return self.get_block(self.structure[0]) + + structure_idx = 0 + if block is not None: + structure_idx = self.structure.index(block.id) + 1 # Iterate over blocks following the given block - for next_block_id in self.structure[self.structure.index(block.id) + 1:]: + for next_block_id in self.structure[structure_idx:]: if next_block_id.block_type not in ignored_block_types: return self.get_block(next_block_id) From eb1550d887c62ab55b56cc08690f1d5ef98984a1 Mon Sep 17 00:00:00 2001 From: Moses Paul R Date: Sat, 30 Nov 2024 17:41:30 +0000 Subject: [PATCH 3/6] add list joining processor --- marker/converters/pdf.py | 2 ++ marker/processors/list.py | 29 +++++++++++++++++++++++++++++ marker/renderers/markdown.py | 11 ++++++++--- marker/schema/blocks/inlinemath.py | 4 ++-- marker/schema/blocks/text.py | 2 +- marker/schema/groups/list.py | 7 ++++++- 6 files changed, 48 insertions(+), 7 deletions(-) create mode 100644 marker/processors/list.py diff --git a/marker/converters/pdf.py b/marker/converters/pdf.py index 978a8651..5bb7f1cc 100644 --- a/marker/converters/pdf.py +++ b/marker/converters/pdf.py @@ -17,6 +17,7 @@ from marker.processors.footnote import FootnoteProcessor from marker.processors.ignoretext import IgnoreTextProcessor from marker.processors.line_numbers import LineNumbersProcessor +from marker.processors.list import ListProcessor from marker.processors.page_header import PageHeaderProcessor from marker.processors.sectionheader import SectionHeaderProcessor from marker.processors.table import TableProcessor @@ -58,6 +59,7 @@ def __init__(self, artifact_dict: Dict[str, Any], processor_list: List[str] | No TableProcessor, SectionHeaderProcessor, TextProcessor, + ListProcessor, CodeProcessor, DocumentTOCProcessor, IgnoreTextProcessor, diff --git a/marker/processors/list.py b/marker/processors/list.py new file mode 100644 index 00000000..d460f76f --- /dev/null +++ b/marker/processors/list.py @@ -0,0 +1,29 @@ +from marker.processors import BaseProcessor +from marker.schema import BlockTypes +from marker.schema.document import Document + + +class ListProcessor(BaseProcessor): + """ + A processor for merging lists across pages and columns + """ + block_types = (BlockTypes.ListGroup,) + ignored_block_types = (BlockTypes.PageHeader, BlockTypes.PageFooter) + + def __init__(self, config): + super().__init__(config) + + def __call__(self, document: Document): + for page in document.pages: + for block in page.contained_blocks(document, self.block_types): + next_block = document.get_next_block(block, self.ignored_block_types) + if next_block is None: + continue + if next_block.block_type not in self.block_types: + continue + if next_block.structure is None: + continue + if next_block.ignore_for_output: + continue + + block.has_continuation = True diff --git a/marker/renderers/markdown.py b/marker/renderers/markdown.py index 5cbc15bc..0cadaf16 100644 --- a/marker/renderers/markdown.py +++ b/marker/renderers/markdown.py @@ -5,6 +5,7 @@ from pydantic import BaseModel from marker.renderers.html import HTMLRenderer +from marker.schema import BlockTypes from marker.schema.document import Document @@ -33,9 +34,13 @@ def convert_p(self, el, text, *args): hyphens = r'-—¬' has_continuation = el.has_attr('class') and 'has-continuation' in el['class'] if has_continuation: - if regex.compile(rf'.*[\p{{Ll}}|\d][{hyphens}]\s?$', regex.DOTALL).match(text): # handle hypenation across pages - return regex.split(rf"[{hyphens}]\s?$", text)[0] - return f"{text} " + block_type = BlockTypes[el['block-type']] + if block_type in [BlockTypes.TextInlineMath, BlockTypes.Text]: + if regex.compile(rf'.*[\p{{Ll}}|\d][{hyphens}]\s?$', regex.DOTALL).match(text): # handle hypenation across pages + return regex.split(rf"[{hyphens}]\s?$", text)[0] + return f"{text} " + if block_type == BlockTypes.ListGroup: + return f"{text}" return f"{text}\n\n" if text else "" # default convert_p behavior diff --git a/marker/schema/blocks/inlinemath.py b/marker/schema/blocks/inlinemath.py index 99f46759..b0adbc6d 100644 --- a/marker/schema/blocks/inlinemath.py +++ b/marker/schema/blocks/inlinemath.py @@ -13,7 +13,7 @@ def assemble_html(self, child_blocks, parent_structure): template = super().assemble_html(child_blocks, parent_structure) template = template.replace("\n", " ") - class_attr = "" + class_attr = f" block-type='{self.block_type}'" if self.has_continuation: - class_attr = " class='has-continuation'" + class_attr += " class='has-continuation'" return f"{template}

" diff --git a/marker/schema/blocks/text.py b/marker/schema/blocks/text.py index 89fba932..6a40407a 100644 --- a/marker/schema/blocks/text.py +++ b/marker/schema/blocks/text.py @@ -13,7 +13,7 @@ def assemble_html(self, child_blocks, parent_structure): template = super().assemble_html(child_blocks, parent_structure) template = template.replace("\n", " ") - class_attr = "" + class_attr = f" block-type='{self.block_type}'" if self.has_continuation: class_attr += " class='has-continuation'" return f"{template}

" diff --git a/marker/schema/groups/list.py b/marker/schema/groups/list.py index 0149211f..3adb47ff 100644 --- a/marker/schema/groups/list.py +++ b/marker/schema/groups/list.py @@ -4,7 +4,12 @@ class ListGroup(Group): block_type: BlockTypes = BlockTypes.ListGroup + has_continuation: bool = False def assemble_html(self, child_blocks, parent_structure): template = super().assemble_html(child_blocks, parent_structure) - return f"

    {template}

" + + class_attr = f" block-type='{self.block_type}'" + if self.has_continuation: + class_attr += " class='has-continuation'" + return f"
    {template}

" From 6a786beb1492db46b6c3860b38c16923843b883a Mon Sep 17 00:00:00 2001 From: Moses Paul R Date: Sat, 30 Nov 2024 17:55:12 +0000 Subject: [PATCH 4/6] adjust heuristics --- marker/processors/list.py | 21 ++++++++++++++++++++- marker/processors/text.py | 20 ++++++++++---------- 2 files changed, 30 insertions(+), 11 deletions(-) diff --git a/marker/processors/list.py b/marker/processors/list.py index d460f76f..ae94c72b 100644 --- a/marker/processors/list.py +++ b/marker/processors/list.py @@ -1,3 +1,5 @@ +import math + from marker.processors import BaseProcessor from marker.schema import BlockTypes from marker.schema.document import Document @@ -9,6 +11,7 @@ class ListProcessor(BaseProcessor): """ block_types = (BlockTypes.ListGroup,) ignored_block_types = (BlockTypes.PageHeader, BlockTypes.PageFooter) + column_gap_ratio = 0.02 # column gaps are atleast 2% of the current column width def __init__(self, config): super().__init__(config) @@ -26,4 +29,20 @@ def __call__(self, document: Document): if next_block.ignore_for_output: continue - block.has_continuation = True + column_gap = block.polygon.width * self.column_gap_ratio + column_break, page_break = False, False + next_block_in_first_quadrant = False + + if next_block.page_id == block.page_id: # block on the same page + # we check for a column break + column_break = ( + math.floor(next_block.polygon.y_start) <= math.ceil(block.polygon.y_start) and + next_block.polygon.x_start > (block.polygon.x_end + column_gap) + ) + else: + page_break = True + next_page = document.get_page(next_block.page_id) + next_block_in_first_quadrant = (next_block.polygon.x_start < next_page.polygon.width // 2) and \ + (next_block.polygon.y_start < next_page.polygon.height // 2) + + block.has_continuation = column_break or (page_break and next_block_in_first_quadrant) diff --git a/marker/processors/text.py b/marker/processors/text.py index 5165945b..e13d699b 100644 --- a/marker/processors/text.py +++ b/marker/processors/text.py @@ -33,15 +33,6 @@ def __call__(self, document: Document): if not len(block.structure) >= 2: # Skip single lines continue - - column_gap = block.polygon.width * self.column_gap_ratio - - column_break, page_break = False, False - next_block_starts_indented = True - next_block_in_first_quadrant = False - last_line_is_full_width = False - last_line_is_hyphentated = False - new_block_lines = [] next_block = document.get_next_block(block, self.ignored_block_types) if next_block is None: # we've reached the end of the document @@ -53,10 +44,19 @@ def __call__(self, document: Document): if next_block.ignore_for_output: continue # skip ignored blocks + column_gap = block.polygon.width * self.column_gap_ratio + + column_break, page_break = False, False + next_block_starts_indented = True + next_block_in_first_quadrant = False + last_line_is_full_width = False + last_line_is_hyphentated = False + new_block_lines = [] + if next_block.page_id == block.page_id: # block on the same page # we check for a column break column_break = ( - math.floor(next_block.polygon.y_start) <= math.floor(block.polygon.y_start) and + math.floor(next_block.polygon.y_start) <= math.ceil(block.polygon.y_start) and next_block.polygon.x_start > (block.polygon.x_end + column_gap) ) else: From d9eb4dd74d6e3dd78960ecf7710eb621171682af Mon Sep 17 00:00:00 2001 From: Moses Paul R Date: Sat, 30 Nov 2024 18:08:26 +0000 Subject: [PATCH 5/6] make column break threshold more generous --- marker/processors/list.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/marker/processors/list.py b/marker/processors/list.py index ae94c72b..32565e49 100644 --- a/marker/processors/list.py +++ b/marker/processors/list.py @@ -36,7 +36,7 @@ def __call__(self, document: Document): if next_block.page_id == block.page_id: # block on the same page # we check for a column break column_break = ( - math.floor(next_block.polygon.y_start) <= math.ceil(block.polygon.y_start) and + next_block.polygon.y_start <= block.polygon.y_end and next_block.polygon.x_start > (block.polygon.x_end + column_gap) ) else: From 9d0f0eb4174e15e2dd0afb73718b75462b7bf797 Mon Sep 17 00:00:00 2001 From: Moses Paul R Date: Sat, 30 Nov 2024 18:13:19 +0000 Subject: [PATCH 6/6] restricting column break threshold doesn't work quite as well for list groups, loosen it --- marker/processors/list.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/marker/processors/list.py b/marker/processors/list.py index 32565e49..56baa6cf 100644 --- a/marker/processors/list.py +++ b/marker/processors/list.py @@ -11,7 +11,6 @@ class ListProcessor(BaseProcessor): """ block_types = (BlockTypes.ListGroup,) ignored_block_types = (BlockTypes.PageHeader, BlockTypes.PageFooter) - column_gap_ratio = 0.02 # column gaps are atleast 2% of the current column width def __init__(self, config): super().__init__(config) @@ -29,16 +28,12 @@ def __call__(self, document: Document): if next_block.ignore_for_output: continue - column_gap = block.polygon.width * self.column_gap_ratio column_break, page_break = False, False next_block_in_first_quadrant = False if next_block.page_id == block.page_id: # block on the same page # we check for a column break - column_break = ( - next_block.polygon.y_start <= block.polygon.y_end and - next_block.polygon.x_start > (block.polygon.x_end + column_gap) - ) + column_break = next_block.polygon.y_start <= block.polygon.y_end else: page_break = True next_page = document.get_page(next_block.page_id)