From a3d5ec297a4bbacc884120449351800325d1f1a6 Mon Sep 17 00:00:00 2001
From: Moses Paul R <iammosespaulr@gmail.com>
Date: Sat, 30 Nov 2024 15:41:39 +0000
Subject: [PATCH 1/6] refactor next block logic for reuse with list item
 processor

---
 marker/processors/text.py    | 72 +++++++++++-------------------------
 marker/schema/document.py    | 20 +++++++---
 marker/schema/groups/page.py | 19 +++++++---
 3 files changed, 49 insertions(+), 62 deletions(-)

diff --git a/marker/processors/text.py b/marker/processors/text.py
index c57487d2..5165945b 100644
--- a/marker/processors/text.py
+++ b/marker/processors/text.py
@@ -37,66 +37,38 @@ def __call__(self, document: Document):
                 column_gap = block.polygon.width * self.column_gap_ratio
 
                 column_break, page_break = False, False
-                next_block = None
-
-                for next_block_id in page.structure[page.structure.index(block.id) + 1:]:
-                    if next_block_id.block_type in self.ignored_block_types:
-                        continue
-                    next_block = page.get_block(next_block_id)
-                    break
+                next_block_starts_indented = True
+                next_block_in_first_quadrant = False
+                last_line_is_full_width = False
+                last_line_is_hyphentated = False
+                new_block_lines = []
 
-                if  next_block is not None: # next block exists
+                next_block = document.get_next_block(block, self.ignored_block_types)
+                if next_block is None: # we've reached the end of the document
+                    continue
+                if next_block.block_type not in self.block_types:
+                    continue # we found a non-text block
+                if next_block.structure is None:
+                    continue  # This is odd though, why do we have text blocks with no structure?
+                if next_block.ignore_for_output:
+                    continue # skip ignored blocks
+
+                if next_block.page_id == block.page_id: # block on the same page
                     # we check for a column break
                     column_break = (
                         math.floor(next_block.polygon.y_start) <= math.floor(block.polygon.y_start) and
                         next_block.polygon.x_start > (block.polygon.x_end + column_gap)
                     )
-                else:  # It's a page break since we don't have a next block in the page
+                else:
                     page_break = True
+                    next_page = document.get_page(next_block.page_id)
+                    next_block_in_first_quadrant = (next_block.polygon.x_start < next_page.polygon.width // 2) and \
+                                        (next_block.polygon.y_start < next_page.polygon.height // 2)
 
                 if not (column_break or page_break):
                     continue
-
-                next_block_starts_indented = True
-                next_block_in_first_quadrant = False
-                last_line_is_full_width = False
-                last_line_is_hyphentated = False
-                new_block_lines = []
-
-                if column_break:
-                    if next_block.block_type not in self.block_types:
-                        continue
-                    if next_block.structure is None:  # This is odd though, why do we have text blocks with no structure?
-                        continue
-
-                    new_block_lines = next_block.structure_blocks(document)
-                else:  # page break
-                    next_page = document.get_next_page(page)
-                    if next_page is None:
-                        continue  # we're on the last page, so we don't worry about merging
-
-                    # Go through the next page only
-                    for next_page_block_id in next_page.structure:
-                        if next_page_block_id.block_type in self.ignored_block_types:
-                            continue  # skip headers and footers
-
-                        # we have our block
-                        next_page_block = next_page.get_block(next_page_block_id)
-                        if next_page_block.ignore_for_output:
-                            continue # skip ignored blocks
-
-                        if not (next_page_block.structure is not None and \
-                            next_page_block.block_type in self.block_types): 
-                            # we found a non-text block or an empty text block, so we can stop looking
-                            break
-
-                        new_block_lines = next_page_block.structure_blocks(document)
-
-                        next_block_in_first_quadrant = (next_page_block.polygon.x_start < next_page.polygon.width // 2) and \
-                            (next_page_block.polygon.y_start < next_page.polygon.height // 2)
-                        break
-                    else:
-                        continue  # we didn't break anywhere so we continue
+    
+                new_block_lines = next_block.structure_blocks(document)
 
                 # we check for next_block indentation
                 if len(new_block_lines):
diff --git a/marker/schema/document.py b/marker/schema/document.py
index cc718995..d7ca4c73 100644
--- a/marker/schema/document.py
+++ b/marker/schema/document.py
@@ -42,15 +42,23 @@ def get_page(self, page_id):
                 return page
         return None
 
-    def get_next_block(self, block: Block):
+    def get_next_block(self, block: Block, ignored_block_types: List[BlockTypes] = None):
+        if ignored_block_types is None:
+            ignored_block_types = []
+        next_block = None
+
+        # Try to find the next block in the current page
         page = self.get_page(block.page_id)
-        next_block = page.get_next_block(block)
+        next_block = page.get_next_block(block, ignored_block_types)
         if next_block:
             return next_block
-        next_page = self.get_next_page(page)
-        if not next_page:
-            return None
-        return next_page.get_block(next_page.structure[0])
+
+        # If no block found, search subsequent pages
+        for page in self.pages[self.pages.index(page) + 1:]:
+            next_block = page.get_next_block(None, ignored_block_types)
+            if next_block:
+                return next_block
+        return None
 
     def get_next_page(self, page: PageGroup):
         page_idx = self.pages.index(page)
diff --git a/marker/schema/groups/page.py b/marker/schema/groups/page.py
index a9f938fb..a6d18459 100644
--- a/marker/schema/groups/page.py
+++ b/marker/schema/groups/page.py
@@ -1,5 +1,5 @@
 from collections import defaultdict
-from typing import Any, Dict, List, Sequence, Tuple, Union
+from typing import Any, Dict, List, Optional, Sequence, Tuple, Union
 
 from PIL import Image
 
@@ -34,11 +34,18 @@ def add_child(self, block: Block):
         else:
             self.children.append(block)
 
-    def get_next_block(self, block: Block):
-        block_idx = self.structure.index(block.id)
-        if block_idx + 1 < len(self.structure):
-            return self.get_block(self.structure[block_idx + 1])
-        return None
+    def get_next_block(self, block: Optional[Block] = None, ignored_block_types: Optional[List[BlockTypes]] = None):
+        if ignored_block_types is None:
+            ignored_block_types = []
+        if block is None:
+            return self.get_block(self.structure[0])
+
+        # Iterate over blocks following the given block
+        for next_block_id in self.structure[self.structure.index(block.id) + 1:]:
+            if next_block_id.block_type not in ignored_block_types:
+                return self.get_block(next_block_id)
+
+        return None  # No valid next block found
 
     def get_prev_block(self, block: Block):
         block_idx = self.structure.index(block.id)

From 3a4dec180bac7be4a3e87f60d02d979fba5aa6e8 Mon Sep 17 00:00:00 2001
From: Moses Paul R <iammosespaulr@gmail.com>
Date: Sat, 30 Nov 2024 16:03:50 +0000
Subject: [PATCH 2/6] bugfix: apply ignored_block_types when block is None [skip ci]

---
 marker/schema/groups/page.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/marker/schema/groups/page.py b/marker/schema/groups/page.py
index a6d18459..c00af1f5 100644
--- a/marker/schema/groups/page.py
+++ b/marker/schema/groups/page.py
@@ -37,11 +37,13 @@ def add_child(self, block: Block):
     def get_next_block(self, block: Optional[Block] = None, ignored_block_types: Optional[List[BlockTypes]] = None):
         if ignored_block_types is None:
             ignored_block_types = []
-        if block is None:
-            return self.get_block(self.structure[0])
+        
+        structure_idx = 0
+        if block is not None:
+            structure_idx = self.structure.index(block.id) + 1
 
         # Iterate over blocks following the given block
-        for next_block_id in self.structure[self.structure.index(block.id) + 1:]:
+        for next_block_id in self.structure[structure_idx:]:
             if next_block_id.block_type not in ignored_block_types:
                 return self.get_block(next_block_id)
 

From eb1550d887c62ab55b56cc08690f1d5ef98984a1 Mon Sep 17 00:00:00 2001
From: Moses Paul R <iammosespaulr@gmail.com>
Date: Sat, 30 Nov 2024 17:41:30 +0000
Subject: [PATCH 3/6] add list joining processor

---
 marker/converters/pdf.py           |  2 ++
 marker/processors/list.py          | 29 +++++++++++++++++++++++++++++
 marker/renderers/markdown.py       | 11 ++++++++---
 marker/schema/blocks/inlinemath.py |  4 ++--
 marker/schema/blocks/text.py       |  2 +-
 marker/schema/groups/list.py       |  7 ++++++-
 6 files changed, 48 insertions(+), 7 deletions(-)
 create mode 100644 marker/processors/list.py

diff --git a/marker/converters/pdf.py b/marker/converters/pdf.py
index 978a8651..5bb7f1cc 100644
--- a/marker/converters/pdf.py
+++ b/marker/converters/pdf.py
@@ -17,6 +17,7 @@
 from marker.processors.footnote import FootnoteProcessor
 from marker.processors.ignoretext import IgnoreTextProcessor
 from marker.processors.line_numbers import LineNumbersProcessor
+from marker.processors.list import ListProcessor
 from marker.processors.page_header import PageHeaderProcessor
 from marker.processors.sectionheader import SectionHeaderProcessor
 from marker.processors.table import TableProcessor
@@ -58,6 +59,7 @@ def __init__(self, artifact_dict: Dict[str, Any], processor_list: List[str] | No
                 TableProcessor,
                 SectionHeaderProcessor,
                 TextProcessor,
+                ListProcessor,
                 CodeProcessor,
                 DocumentTOCProcessor,
                 IgnoreTextProcessor,
diff --git a/marker/processors/list.py b/marker/processors/list.py
new file mode 100644
index 00000000..d460f76f
--- /dev/null
+++ b/marker/processors/list.py
@@ -0,0 +1,29 @@
+from marker.processors import BaseProcessor
+from marker.schema import BlockTypes
+from marker.schema.document import Document
+
+
+class ListProcessor(BaseProcessor):
+    """
+    A processor for merging lists across pages and columns
+    """
+    block_types = (BlockTypes.ListGroup,)
+    ignored_block_types = (BlockTypes.PageHeader, BlockTypes.PageFooter)
+
+    def __init__(self, config):
+        super().__init__(config)
+
+    def __call__(self, document: Document):
+        for page in document.pages:
+            for block in page.contained_blocks(document, self.block_types):
+                next_block = document.get_next_block(block, self.ignored_block_types)
+                if next_block is None:
+                    continue
+                if next_block.block_type not in self.block_types:
+                    continue
+                if next_block.structure is None:
+                    continue
+                if next_block.ignore_for_output:
+                    continue
+
+                block.has_continuation = True
diff --git a/marker/renderers/markdown.py b/marker/renderers/markdown.py
index 5cbc15bc..0cadaf16 100644
--- a/marker/renderers/markdown.py
+++ b/marker/renderers/markdown.py
@@ -5,6 +5,7 @@
 from pydantic import BaseModel
 
 from marker.renderers.html import HTMLRenderer
+from marker.schema import BlockTypes
 from marker.schema.document import Document
 
 
@@ -33,9 +34,13 @@ def convert_p(self, el, text, *args):
         hyphens = r'-—¬'
         has_continuation = el.has_attr('class') and 'has-continuation' in el['class']
         if has_continuation:
-            if regex.compile(rf'.*[\p{{Ll}}|\d][{hyphens}]\s?$', regex.DOTALL).match(text):  # handle hypenation across pages
-                return regex.split(rf"[{hyphens}]\s?$", text)[0]
-            return f"{text} "
+            block_type = BlockTypes[el['block-type']]
+            if block_type in [BlockTypes.TextInlineMath, BlockTypes.Text]:
+                if regex.compile(rf'.*[\p{{Ll}}|\d][{hyphens}]\s?$', regex.DOTALL).match(text):  # handle hyphenation across pages
+                    return regex.split(rf"[{hyphens}]\s?$", text)[0]
+                return f"{text} "
+            if block_type == BlockTypes.ListGroup:
+                return f"{text}"
         return f"{text}\n\n" if text else ""  # default convert_p behavior
 
 
diff --git a/marker/schema/blocks/inlinemath.py b/marker/schema/blocks/inlinemath.py
index 99f46759..b0adbc6d 100644
--- a/marker/schema/blocks/inlinemath.py
+++ b/marker/schema/blocks/inlinemath.py
@@ -13,7 +13,7 @@ def assemble_html(self, child_blocks, parent_structure):
         template = super().assemble_html(child_blocks, parent_structure)
         template = template.replace("\n", " ")
 
-        class_attr = ""
+        class_attr = f" block-type='{self.block_type}'"
         if self.has_continuation:
-            class_attr = " class='has-continuation'"
+            class_attr += " class='has-continuation'"
         return f"<p{class_attr}>{template}</p>"
diff --git a/marker/schema/blocks/text.py b/marker/schema/blocks/text.py
index 89fba932..6a40407a 100644
--- a/marker/schema/blocks/text.py
+++ b/marker/schema/blocks/text.py
@@ -13,7 +13,7 @@ def assemble_html(self, child_blocks, parent_structure):
         template = super().assemble_html(child_blocks, parent_structure)
         template = template.replace("\n", " ")
 
-        class_attr = ""
+        class_attr = f" block-type='{self.block_type}'"
         if self.has_continuation:
             class_attr += " class='has-continuation'"
         return f"<p{class_attr}>{template}</p>"
diff --git a/marker/schema/groups/list.py b/marker/schema/groups/list.py
index 0149211f..3adb47ff 100644
--- a/marker/schema/groups/list.py
+++ b/marker/schema/groups/list.py
@@ -4,7 +4,12 @@
 
 class ListGroup(Group):
     block_type: BlockTypes = BlockTypes.ListGroup
+    has_continuation: bool = False
 
     def assemble_html(self, child_blocks, parent_structure):
         template = super().assemble_html(child_blocks, parent_structure)
-        return f"<p><ul>{template}</ul></p>"
+
+        class_attr = f" block-type='{self.block_type}'"
+        if self.has_continuation:
+            class_attr += " class='has-continuation'"
+        return f"<p{class_attr}><ul>{template}</ul></p>"

From 6a786beb1492db46b6c3860b38c16923843b883a Mon Sep 17 00:00:00 2001
From: Moses Paul R <iammosespaulr@gmail.com>
Date: Sat, 30 Nov 2024 17:55:12 +0000
Subject: [PATCH 4/6] adjust column and page break heuristics for lists and text

---
 marker/processors/list.py | 21 ++++++++++++++++++++-
 marker/processors/text.py | 20 ++++++++++----------
 2 files changed, 30 insertions(+), 11 deletions(-)

diff --git a/marker/processors/list.py b/marker/processors/list.py
index d460f76f..ae94c72b 100644
--- a/marker/processors/list.py
+++ b/marker/processors/list.py
@@ -1,3 +1,5 @@
+import math
+
 from marker.processors import BaseProcessor
 from marker.schema import BlockTypes
 from marker.schema.document import Document
@@ -9,6 +11,7 @@ class ListProcessor(BaseProcessor):
     """
     block_types = (BlockTypes.ListGroup,)
     ignored_block_types = (BlockTypes.PageHeader, BlockTypes.PageFooter)
+    column_gap_ratio = 0.02  # column gaps are atleast 2% of the current column width
 
     def __init__(self, config):
         super().__init__(config)
@@ -26,4 +29,20 @@ def __call__(self, document: Document):
                 if next_block.ignore_for_output:
                     continue
 
-                block.has_continuation = True
+                column_gap = block.polygon.width * self.column_gap_ratio
+                column_break, page_break = False, False
+                next_block_in_first_quadrant = False
+
+                if next_block.page_id == block.page_id: # block on the same page
+                    # we check for a column break
+                    column_break = (
+                        math.floor(next_block.polygon.y_start) <= math.ceil(block.polygon.y_start) and
+                        next_block.polygon.x_start > (block.polygon.x_end + column_gap)
+                    )
+                else:
+                    page_break = True
+                    next_page = document.get_page(next_block.page_id)
+                    next_block_in_first_quadrant = (next_block.polygon.x_start < next_page.polygon.width // 2) and \
+                                        (next_block.polygon.y_start < next_page.polygon.height // 2)
+    
+                block.has_continuation = column_break or (page_break and next_block_in_first_quadrant)
diff --git a/marker/processors/text.py b/marker/processors/text.py
index 5165945b..e13d699b 100644
--- a/marker/processors/text.py
+++ b/marker/processors/text.py
@@ -33,15 +33,6 @@ def __call__(self, document: Document):
 
                 if not len(block.structure) >= 2:  # Skip single lines
                     continue
-                
-                column_gap = block.polygon.width * self.column_gap_ratio
-
-                column_break, page_break = False, False
-                next_block_starts_indented = True
-                next_block_in_first_quadrant = False
-                last_line_is_full_width = False
-                last_line_is_hyphentated = False
-                new_block_lines = []
 
                 next_block = document.get_next_block(block, self.ignored_block_types)
                 if next_block is None: # we've reached the end of the document
@@ -53,10 +44,19 @@ def __call__(self, document: Document):
                 if next_block.ignore_for_output:
                     continue # skip ignored blocks
 
+                column_gap = block.polygon.width * self.column_gap_ratio
+
+                column_break, page_break = False, False
+                next_block_starts_indented = True
+                next_block_in_first_quadrant = False
+                last_line_is_full_width = False
+                last_line_is_hyphentated = False
+                new_block_lines = []
+
                 if next_block.page_id == block.page_id: # block on the same page
                     # we check for a column break
                     column_break = (
-                        math.floor(next_block.polygon.y_start) <= math.floor(block.polygon.y_start) and
+                        math.floor(next_block.polygon.y_start) <= math.ceil(block.polygon.y_start) and
                         next_block.polygon.x_start > (block.polygon.x_end + column_gap)
                     )
                 else:

From d9eb4dd74d6e3dd78960ecf7710eb621171682af Mon Sep 17 00:00:00 2001
From: Moses Paul R <iammosespaulr@gmail.com>
Date: Sat, 30 Nov 2024 18:08:26 +0000
Subject: [PATCH 5/6] make column break threshold more generous

---
 marker/processors/list.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/marker/processors/list.py b/marker/processors/list.py
index ae94c72b..32565e49 100644
--- a/marker/processors/list.py
+++ b/marker/processors/list.py
@@ -36,7 +36,7 @@ def __call__(self, document: Document):
                 if next_block.page_id == block.page_id: # block on the same page
                     # we check for a column break
                     column_break = (
-                        math.floor(next_block.polygon.y_start) <= math.ceil(block.polygon.y_start) and
+                        next_block.polygon.y_start <= block.polygon.y_end and
                         next_block.polygon.x_start > (block.polygon.x_end + column_gap)
                     )
                 else:

From 9d0f0eb4174e15e2dd0afb73718b75462b7bf797 Mon Sep 17 00:00:00 2001
From: Moses Paul R <iammosespaulr@gmail.com>
Date: Sat, 30 Nov 2024 18:13:19 +0000
Subject: [PATCH 6/6] restricting column break threshold doesn't work quite as
 well for list groups, loosen it

---
 marker/processors/list.py | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

diff --git a/marker/processors/list.py b/marker/processors/list.py
index 32565e49..56baa6cf 100644
--- a/marker/processors/list.py
+++ b/marker/processors/list.py
@@ -11,7 +11,6 @@ class ListProcessor(BaseProcessor):
     """
     block_types = (BlockTypes.ListGroup,)
     ignored_block_types = (BlockTypes.PageHeader, BlockTypes.PageFooter)
-    column_gap_ratio = 0.02  # column gaps are atleast 2% of the current column width
 
     def __init__(self, config):
         super().__init__(config)
@@ -29,16 +28,12 @@ def __call__(self, document: Document):
                 if next_block.ignore_for_output:
                     continue
 
-                column_gap = block.polygon.width * self.column_gap_ratio
                 column_break, page_break = False, False
                 next_block_in_first_quadrant = False
 
                 if next_block.page_id == block.page_id: # block on the same page
                     # we check for a column break
-                    column_break = (
-                        next_block.polygon.y_start <= block.polygon.y_end and
-                        next_block.polygon.x_start > (block.polygon.x_end + column_gap)
-                    )
+                    column_break = next_block.polygon.y_start <= block.polygon.y_end
                 else:
                     page_break = True
                     next_page = document.get_page(next_block.page_id)