From a3d5ec297a4bbacc884120449351800325d1f1a6 Mon Sep 17 00:00:00 2001
From: Moses Paul R
Date: Sat, 30 Nov 2024 15:41:39 +0000
Subject: [PATCH 1/6] refactor next block logic for reuse with list item
processor
---
marker/processors/text.py | 72 +++++++++++-------------------------
marker/schema/document.py | 20 +++++++---
marker/schema/groups/page.py | 19 +++++++---
3 files changed, 49 insertions(+), 62 deletions(-)
diff --git a/marker/processors/text.py b/marker/processors/text.py
index c57487d2..5165945b 100644
--- a/marker/processors/text.py
+++ b/marker/processors/text.py
@@ -37,66 +37,38 @@ def __call__(self, document: Document):
column_gap = block.polygon.width * self.column_gap_ratio
column_break, page_break = False, False
- next_block = None
-
- for next_block_id in page.structure[page.structure.index(block.id) + 1:]:
- if next_block_id.block_type in self.ignored_block_types:
- continue
- next_block = page.get_block(next_block_id)
- break
+ next_block_starts_indented = True
+ next_block_in_first_quadrant = False
+ last_line_is_full_width = False
+ last_line_is_hyphentated = False
+ new_block_lines = []
- if next_block is not None: # next block exists
+ next_block = document.get_next_block(block, self.ignored_block_types)
+ if next_block is None: # we've reached the end of the document
+ continue
+ if next_block.block_type not in self.block_types:
+ continue # we found a non-text block
+ if next_block.structure is None:
+ continue # This is odd though, why do we have text blocks with no structure?
+ if next_block.ignore_for_output:
+ continue # skip ignored blocks
+
+ if next_block.page_id == block.page_id: # block on the same page
# we check for a column break
column_break = (
math.floor(next_block.polygon.y_start) <= math.floor(block.polygon.y_start) and
next_block.polygon.x_start > (block.polygon.x_end + column_gap)
)
- else: # It's a page break since we don't have a next block in the page
+ else:
page_break = True
+ next_page = document.get_page(next_block.page_id)
+ next_block_in_first_quadrant = (next_block.polygon.x_start < next_page.polygon.width // 2) and \
+ (next_block.polygon.y_start < next_page.polygon.height // 2)
if not (column_break or page_break):
continue
-
- next_block_starts_indented = True
- next_block_in_first_quadrant = False
- last_line_is_full_width = False
- last_line_is_hyphentated = False
- new_block_lines = []
-
- if column_break:
- if next_block.block_type not in self.block_types:
- continue
- if next_block.structure is None: # This is odd though, why do we have text blocks with no structure?
- continue
-
- new_block_lines = next_block.structure_blocks(document)
- else: # page break
- next_page = document.get_next_page(page)
- if next_page is None:
- continue # we're on the last page, so we don't worry about merging
-
- # Go through the next page only
- for next_page_block_id in next_page.structure:
- if next_page_block_id.block_type in self.ignored_block_types:
- continue # skip headers and footers
-
- # we have our block
- next_page_block = next_page.get_block(next_page_block_id)
- if next_page_block.ignore_for_output:
- continue # skip ignored blocks
-
- if not (next_page_block.structure is not None and \
- next_page_block.block_type in self.block_types):
- # we found a non-text block or an empty text block, so we can stop looking
- break
-
- new_block_lines = next_page_block.structure_blocks(document)
-
- next_block_in_first_quadrant = (next_page_block.polygon.x_start < next_page.polygon.width // 2) and \
- (next_page_block.polygon.y_start < next_page.polygon.height // 2)
- break
- else:
- continue # we didn't break anywhere so we continue
+
+ new_block_lines = next_block.structure_blocks(document)
# we check for next_block indentation
if len(new_block_lines):
diff --git a/marker/schema/document.py b/marker/schema/document.py
index cc718995..d7ca4c73 100644
--- a/marker/schema/document.py
+++ b/marker/schema/document.py
@@ -42,15 +42,23 @@ def get_page(self, page_id):
return page
return None
- def get_next_block(self, block: Block):
+ def get_next_block(self, block: Block, ignored_block_types: List[BlockTypes] = None):
+ if ignored_block_types is None:
+ ignored_block_types = []
+ next_block = None
+
+ # Try to find the next block in the current page
page = self.get_page(block.page_id)
- next_block = page.get_next_block(block)
+ next_block = page.get_next_block(block, ignored_block_types)
if next_block:
return next_block
- next_page = self.get_next_page(page)
- if not next_page:
- return None
- return next_page.get_block(next_page.structure[0])
+
+ # If no block found, search subsequent pages
+ for page in self.pages[self.pages.index(page) + 1:]:
+ next_block = page.get_next_block(None, ignored_block_types)
+ if next_block:
+ return next_block
+ return None
def get_next_page(self, page: PageGroup):
page_idx = self.pages.index(page)
diff --git a/marker/schema/groups/page.py b/marker/schema/groups/page.py
index a9f938fb..a6d18459 100644
--- a/marker/schema/groups/page.py
+++ b/marker/schema/groups/page.py
@@ -1,5 +1,5 @@
from collections import defaultdict
-from typing import Any, Dict, List, Sequence, Tuple, Union
+from typing import Any, Dict, List, Optional, Sequence, Tuple, Union
from PIL import Image
@@ -34,11 +34,18 @@ def add_child(self, block: Block):
else:
self.children.append(block)
- def get_next_block(self, block: Block):
- block_idx = self.structure.index(block.id)
- if block_idx + 1 < len(self.structure):
- return self.get_block(self.structure[block_idx + 1])
- return None
+ def get_next_block(self, block: Optional[Block] = None, ignored_block_types: Optional[List[BlockTypes]] = None):
+ if ignored_block_types is None:
+ ignored_block_types = []
+ if block is None:
+ return self.get_block(self.structure[0])
+
+ # Iterate over blocks following the given block
+ for next_block_id in self.structure[self.structure.index(block.id) + 1:]:
+ if next_block_id.block_type not in ignored_block_types:
+ return self.get_block(next_block_id)
+
+ return None # No valid next block found
def get_prev_block(self, block: Block):
block_idx = self.structure.index(block.id)
From 3a4dec180bac7be4a3e87f60d02d979fba5aa6e8 Mon Sep 17 00:00:00 2001
From: Moses Paul R
Date: Sat, 30 Nov 2024 16:03:50 +0000
Subject: [PATCH 2/6] bugfix: honor ignored_block_types in page.get_next_block when block is None [skip ci]
---
marker/schema/groups/page.py | 8 +++++---
1 file changed, 5 insertions(+), 3 deletions(-)
diff --git a/marker/schema/groups/page.py b/marker/schema/groups/page.py
index a6d18459..c00af1f5 100644
--- a/marker/schema/groups/page.py
+++ b/marker/schema/groups/page.py
@@ -37,11 +37,13 @@ def add_child(self, block: Block):
def get_next_block(self, block: Optional[Block] = None, ignored_block_types: Optional[List[BlockTypes]] = None):
if ignored_block_types is None:
ignored_block_types = []
- if block is None:
- return self.get_block(self.structure[0])
+
+ structure_idx = 0
+ if block is not None:
+ structure_idx = self.structure.index(block.id) + 1
# Iterate over blocks following the given block
- for next_block_id in self.structure[self.structure.index(block.id) + 1:]:
+ for next_block_id in self.structure[structure_idx:]:
if next_block_id.block_type not in ignored_block_types:
return self.get_block(next_block_id)
From eb1550d887c62ab55b56cc08690f1d5ef98984a1 Mon Sep 17 00:00:00 2001
From: Moses Paul R
Date: Sat, 30 Nov 2024 17:41:30 +0000
Subject: [PATCH 3/6] add list joining processor
---
marker/converters/pdf.py | 2 ++
marker/processors/list.py | 29 +++++++++++++++++++++++++++++
marker/renderers/markdown.py | 11 ++++++++---
marker/schema/blocks/inlinemath.py | 4 ++--
marker/schema/blocks/text.py | 2 +-
marker/schema/groups/list.py | 7 ++++++-
6 files changed, 48 insertions(+), 7 deletions(-)
create mode 100644 marker/processors/list.py
diff --git a/marker/converters/pdf.py b/marker/converters/pdf.py
index 978a8651..5bb7f1cc 100644
--- a/marker/converters/pdf.py
+++ b/marker/converters/pdf.py
@@ -17,6 +17,7 @@
from marker.processors.footnote import FootnoteProcessor
from marker.processors.ignoretext import IgnoreTextProcessor
from marker.processors.line_numbers import LineNumbersProcessor
+from marker.processors.list import ListProcessor
from marker.processors.page_header import PageHeaderProcessor
from marker.processors.sectionheader import SectionHeaderProcessor
from marker.processors.table import TableProcessor
@@ -58,6 +59,7 @@ def __init__(self, artifact_dict: Dict[str, Any], processor_list: List[str] | No
TableProcessor,
SectionHeaderProcessor,
TextProcessor,
+ ListProcessor,
CodeProcessor,
DocumentTOCProcessor,
IgnoreTextProcessor,
diff --git a/marker/processors/list.py b/marker/processors/list.py
new file mode 100644
index 00000000..d460f76f
--- /dev/null
+++ b/marker/processors/list.py
@@ -0,0 +1,29 @@
+from marker.processors import BaseProcessor
+from marker.schema import BlockTypes
+from marker.schema.document import Document
+
+
+class ListProcessor(BaseProcessor):
+ """
+ A processor for merging lists across pages and columns
+ """
+ block_types = (BlockTypes.ListGroup,)
+ ignored_block_types = (BlockTypes.PageHeader, BlockTypes.PageFooter)
+
+ def __init__(self, config):
+ super().__init__(config)
+
+ def __call__(self, document: Document):
+ for page in document.pages:
+ for block in page.contained_blocks(document, self.block_types):
+ next_block = document.get_next_block(block, self.ignored_block_types)
+ if next_block is None:
+ continue
+ if next_block.block_type not in self.block_types:
+ continue
+ if next_block.structure is None:
+ continue
+ if next_block.ignore_for_output:
+ continue
+
+ block.has_continuation = True
diff --git a/marker/renderers/markdown.py b/marker/renderers/markdown.py
index 5cbc15bc..0cadaf16 100644
--- a/marker/renderers/markdown.py
+++ b/marker/renderers/markdown.py
@@ -5,6 +5,7 @@
from pydantic import BaseModel
from marker.renderers.html import HTMLRenderer
+from marker.schema import BlockTypes
from marker.schema.document import Document
@@ -33,9 +34,13 @@ def convert_p(self, el, text, *args):
hyphens = r'-—¬'
has_continuation = el.has_attr('class') and 'has-continuation' in el['class']
if has_continuation:
- if regex.compile(rf'.*[\p{{Ll}}|\d][{hyphens}]\s?$', regex.DOTALL).match(text): # handle hypenation across pages
- return regex.split(rf"[{hyphens}]\s?$", text)[0]
- return f"{text} "
+ block_type = BlockTypes[el['block-type']]
+ if block_type in [BlockTypes.TextInlineMath, BlockTypes.Text]:
+ if regex.compile(rf'.*[\p{{Ll}}|\d][{hyphens}]\s?$', regex.DOTALL).match(text): # handle hyphenation across pages
+ return regex.split(rf"[{hyphens}]\s?$", text)[0]
+ return f"{text} "
+ if block_type == BlockTypes.ListGroup:
+ return f"{text}"
return f"{text}\n\n" if text else "" # default convert_p behavior
diff --git a/marker/schema/blocks/inlinemath.py b/marker/schema/blocks/inlinemath.py
index 99f46759..b0adbc6d 100644
--- a/marker/schema/blocks/inlinemath.py
+++ b/marker/schema/blocks/inlinemath.py
@@ -13,7 +13,7 @@ def assemble_html(self, child_blocks, parent_structure):
template = super().assemble_html(child_blocks, parent_structure)
template = template.replace("\n", " ")
- class_attr = ""
+ class_attr = f" block-type='{self.block_type}'"
if self.has_continuation:
- class_attr = " class='has-continuation'"
+ class_attr += " class='has-continuation'"
return f"{template}
"
diff --git a/marker/schema/blocks/text.py b/marker/schema/blocks/text.py
index 89fba932..6a40407a 100644
--- a/marker/schema/blocks/text.py
+++ b/marker/schema/blocks/text.py
@@ -13,7 +13,7 @@ def assemble_html(self, child_blocks, parent_structure):
template = super().assemble_html(child_blocks, parent_structure)
template = template.replace("\n", " ")
- class_attr = ""
+ class_attr = f" block-type='{self.block_type}'"
if self.has_continuation:
class_attr += " class='has-continuation'"
return f"{template}
"
diff --git a/marker/schema/groups/list.py b/marker/schema/groups/list.py
index 0149211f..3adb47ff 100644
--- a/marker/schema/groups/list.py
+++ b/marker/schema/groups/list.py
@@ -4,7 +4,12 @@
class ListGroup(Group):
block_type: BlockTypes = BlockTypes.ListGroup
+ has_continuation: bool = False
def assemble_html(self, child_blocks, parent_structure):
template = super().assemble_html(child_blocks, parent_structure)
- return f"
"
+
+ class_attr = f" block-type='{self.block_type}'"
+ if self.has_continuation:
+ class_attr += " class='has-continuation'"
+ return f""
From 6a786beb1492db46b6c3860b38c16923843b883a Mon Sep 17 00:00:00 2001
From: Moses Paul R
Date: Sat, 30 Nov 2024 17:55:12 +0000
Subject: [PATCH 4/6] adjust heuristics
---
marker/processors/list.py | 21 ++++++++++++++++++++-
marker/processors/text.py | 20 ++++++++++----------
2 files changed, 30 insertions(+), 11 deletions(-)
diff --git a/marker/processors/list.py b/marker/processors/list.py
index d460f76f..ae94c72b 100644
--- a/marker/processors/list.py
+++ b/marker/processors/list.py
@@ -1,3 +1,5 @@
+import math
+
from marker.processors import BaseProcessor
from marker.schema import BlockTypes
from marker.schema.document import Document
@@ -9,6 +11,7 @@ class ListProcessor(BaseProcessor):
"""
block_types = (BlockTypes.ListGroup,)
ignored_block_types = (BlockTypes.PageHeader, BlockTypes.PageFooter)
+ column_gap_ratio = 0.02 # column gaps are atleast 2% of the current column width
def __init__(self, config):
super().__init__(config)
@@ -26,4 +29,20 @@ def __call__(self, document: Document):
if next_block.ignore_for_output:
continue
- block.has_continuation = True
+ column_gap = block.polygon.width * self.column_gap_ratio
+ column_break, page_break = False, False
+ next_block_in_first_quadrant = False
+
+ if next_block.page_id == block.page_id: # block on the same page
+ # we check for a column break
+ column_break = (
+ math.floor(next_block.polygon.y_start) <= math.ceil(block.polygon.y_start) and
+ next_block.polygon.x_start > (block.polygon.x_end + column_gap)
+ )
+ else:
+ page_break = True
+ next_page = document.get_page(next_block.page_id)
+ next_block_in_first_quadrant = (next_block.polygon.x_start < next_page.polygon.width // 2) and \
+ (next_block.polygon.y_start < next_page.polygon.height // 2)
+
+ block.has_continuation = column_break or (page_break and next_block_in_first_quadrant)
diff --git a/marker/processors/text.py b/marker/processors/text.py
index 5165945b..e13d699b 100644
--- a/marker/processors/text.py
+++ b/marker/processors/text.py
@@ -33,15 +33,6 @@ def __call__(self, document: Document):
if not len(block.structure) >= 2: # Skip single lines
continue
-
- column_gap = block.polygon.width * self.column_gap_ratio
-
- column_break, page_break = False, False
- next_block_starts_indented = True
- next_block_in_first_quadrant = False
- last_line_is_full_width = False
- last_line_is_hyphentated = False
- new_block_lines = []
next_block = document.get_next_block(block, self.ignored_block_types)
if next_block is None: # we've reached the end of the document
@@ -53,10 +44,19 @@ def __call__(self, document: Document):
if next_block.ignore_for_output:
continue # skip ignored blocks
+ column_gap = block.polygon.width * self.column_gap_ratio
+
+ column_break, page_break = False, False
+ next_block_starts_indented = True
+ next_block_in_first_quadrant = False
+ last_line_is_full_width = False
+ last_line_is_hyphentated = False
+ new_block_lines = []
+
if next_block.page_id == block.page_id: # block on the same page
# we check for a column break
column_break = (
- math.floor(next_block.polygon.y_start) <= math.floor(block.polygon.y_start) and
+ math.floor(next_block.polygon.y_start) <= math.ceil(block.polygon.y_start) and
next_block.polygon.x_start > (block.polygon.x_end + column_gap)
)
else:
From d9eb4dd74d6e3dd78960ecf7710eb621171682af Mon Sep 17 00:00:00 2001
From: Moses Paul R
Date: Sat, 30 Nov 2024 18:08:26 +0000
Subject: [PATCH 5/6] make column break threshold more generous
---
marker/processors/list.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/marker/processors/list.py b/marker/processors/list.py
index ae94c72b..32565e49 100644
--- a/marker/processors/list.py
+++ b/marker/processors/list.py
@@ -36,7 +36,7 @@ def __call__(self, document: Document):
if next_block.page_id == block.page_id: # block on the same page
# we check for a column break
column_break = (
- math.floor(next_block.polygon.y_start) <= math.ceil(block.polygon.y_start) and
+ next_block.polygon.y_start <= block.polygon.y_end and
next_block.polygon.x_start > (block.polygon.x_end + column_gap)
)
else:
From 9d0f0eb4174e15e2dd0afb73718b75462b7bf797 Mon Sep 17 00:00:00 2001
From: Moses Paul R
Date: Sat, 30 Nov 2024 18:13:19 +0000
Subject: [PATCH 6/6] restricting column break threshold doesn't work quite as
well for list groups, loosen it
---
marker/processors/list.py | 7 +------
1 file changed, 1 insertion(+), 6 deletions(-)
diff --git a/marker/processors/list.py b/marker/processors/list.py
index 32565e49..56baa6cf 100644
--- a/marker/processors/list.py
+++ b/marker/processors/list.py
@@ -11,7 +11,6 @@ class ListProcessor(BaseProcessor):
"""
block_types = (BlockTypes.ListGroup,)
ignored_block_types = (BlockTypes.PageHeader, BlockTypes.PageFooter)
- column_gap_ratio = 0.02 # column gaps are atleast 2% of the current column width
def __init__(self, config):
super().__init__(config)
@@ -29,16 +28,12 @@ def __call__(self, document: Document):
if next_block.ignore_for_output:
continue
- column_gap = block.polygon.width * self.column_gap_ratio
column_break, page_break = False, False
next_block_in_first_quadrant = False
if next_block.page_id == block.page_id: # block on the same page
# we check for a column break
- column_break = (
- next_block.polygon.y_start <= block.polygon.y_end and
- next_block.polygon.x_start > (block.polygon.x_end + column_gap)
- )
+ column_break = next_block.polygon.y_start <= block.polygon.y_end
else:
page_break = True
next_page = document.get_page(next_block.page_id)