Merge pull request #402 from VikParuchuri/dev-mose/list-joining-misc

Add `ListGroup` joining processor and refactor `Text` joining processor
VikParuchuri · Nov 30, 2024 · 1047d9c · 1047d9c
2 parents 9602cb4 + 9d0f0eb
commit 1047d9c
Show file tree

Hide file tree

Showing 9 changed files with 115 additions and 71 deletions.
diff --git a/marker/converters/pdf.py b/marker/converters/pdf.py
@@ -17,6 +17,7 @@
 from marker.processors.footnote import FootnoteProcessor
 from marker.processors.ignoretext import IgnoreTextProcessor
 from marker.processors.line_numbers import LineNumbersProcessor
+from marker.processors.list import ListProcessor
 from marker.processors.page_header import PageHeaderProcessor
 from marker.processors.sectionheader import SectionHeaderProcessor
 from marker.processors.table import TableProcessor
@@ -58,6 +59,7 @@ def __init__(self, artifact_dict: Dict[str, Any], processor_list: List[str] | No
                 TableProcessor,
                 SectionHeaderProcessor,
                 TextProcessor,
+                ListProcessor,
                 CodeProcessor,
                 DocumentTOCProcessor,
                 IgnoreTextProcessor,

diff --git a/marker/processors/list.py b/marker/processors/list.py
@@ -0,0 +1,43 @@
+import math
+
+from marker.processors import BaseProcessor
+from marker.schema import BlockTypes
+from marker.schema.document import Document
+
+
+class ListProcessor(BaseProcessor):  # flags ListGroup blocks whose list carries on in the following ListGroup
+    """
+    A processor for merging lists across pages and columns
+    """
+    block_types = (BlockTypes.ListGroup,)  # only ListGroup blocks are examined and accepted as continuations
+    ignored_block_types = (BlockTypes.PageHeader, BlockTypes.PageFooter)  # skipped when looking up the next block
+
+    def __init__(self, config):
+        super().__init__(config)
+
+    def __call__(self, document: Document):  # sets block.has_continuation in place; blocks hitting a `continue` are left untouched
+        for page in document.pages:
+            for block in page.contained_blocks(document, self.block_types):
+                next_block = document.get_next_block(block, self.ignored_block_types)
+                if next_block is None:  # nothing follows this list in the document
+                    continue
+                if next_block.block_type not in self.block_types:  # the following block is not a ListGroup
+                    continue
+                if next_block.structure is None:  # the following ListGroup has no structure
+                    continue
+                if next_block.ignore_for_output:  # the following ListGroup is flagged ignore_for_output
+                    continue
+
+                column_break, page_break = False, False
+                next_block_in_first_quadrant = False
+
+                if next_block.page_id == block.page_id: # block on the same page
+                    # we check for a column break
+                    column_break = next_block.polygon.y_start <= block.polygon.y_end  # next list starts no lower than this one ends (assumes y grows downward - TODO confirm)
+                else:
+                    page_break = True
+                    next_page = document.get_page(next_block.page_id)
+                    next_block_in_first_quadrant = (next_block.polygon.x_start < next_page.polygon.width // 2) and \
+                                        (next_block.polygon.y_start < next_page.polygon.height // 2)  # x_start and y_start both under half the next page's width/height
+
+                block.has_continuation = column_break or (page_break and next_block_in_first_quadrant)  # a page break alone is not enough; the next list must also start in the first quadrant
diff --git a/marker/processors/text.py b/marker/processors/text.py
@@ -33,70 +33,42 @@ def __call__(self, document: Document):
 
                 if not len(block.structure) >= 2:  # Skip single lines
                     continue
-
+
+                next_block = document.get_next_block(block, self.ignored_block_types)
+                if next_block is None: # we've reached the end of the document
+                    continue
+                if next_block.block_type not in self.block_types:
+                    continue # we found a non-text block
+                if next_block.structure is None:
+                    continue  # This is odd though, why do we have text blocks with no structure?
+                if next_block.ignore_for_output:
+                    continue # skip ignored blocks
+
                 column_gap = block.polygon.width * self.column_gap_ratio
 
                 column_break, page_break = False, False
-                next_block = None
-
-                for next_block_id in page.structure[page.structure.index(block.id) + 1:]:
-                    if next_block_id.block_type in self.ignored_block_types:
-                        continue
-                    next_block = page.get_block(next_block_id)
-                    break
+                next_block_starts_indented = True
+                next_block_in_first_quadrant = False
+                last_line_is_full_width = False
+                last_line_is_hyphentated = False
+                new_block_lines = []
 
-                if  next_block is not None: # next block exists
+                if next_block.page_id == block.page_id: # block on the same page
                     # we check for a column break
                     column_break = (
-                        math.floor(next_block.polygon.y_start) <= math.floor(block.polygon.y_start) and
+                        math.floor(next_block.polygon.y_start) <= math.ceil(block.polygon.y_start) and
                         next_block.polygon.x_start > (block.polygon.x_end + column_gap)
                     )
-                else:  # It's a page break since we don't have a next block in the page
+                else:
                     page_break = True
+                    next_page = document.get_page(next_block.page_id)
+                    next_block_in_first_quadrant = (next_block.polygon.x_start < next_page.polygon.width // 2) and \
+                                        (next_block.polygon.y_start < next_page.polygon.height // 2)
 
                 if not (column_break or page_break):
                     continue
-
-                next_block_starts_indented = True
-                next_block_in_first_quadrant = False
-                last_line_is_full_width = False
-                last_line_is_hyphentated = False
-                new_block_lines = []
-
-                if column_break:
-                    if next_block.block_type not in self.block_types:
-                        continue
-                    if next_block.structure is None:  # This is odd though, why do we have text blocks with no structure?
-                        continue
-
-                    new_block_lines = next_block.structure_blocks(document)
-                else:  # page break
-                    next_page = document.get_next_page(page)
-                    if next_page is None:
-                        continue  # we're on the last page, so we don't worry about merging
-
-                    # Go through the next page only
-                    for next_page_block_id in next_page.structure:
-                        if next_page_block_id.block_type in self.ignored_block_types:
-                            continue  # skip headers and footers
-
-                        # we have our block
-                        next_page_block = next_page.get_block(next_page_block_id)
-                        if next_page_block.ignore_for_output:
-                            continue # skip ignored blocks
-
-                        if not (next_page_block.structure is not None and \
-                            next_page_block.block_type in self.block_types): 
-                            # we found a non-text block or an empty text block, so we can stop looking
-                            break
-
-                        new_block_lines = next_page_block.structure_blocks(document)
-
-                        next_block_in_first_quadrant = (next_page_block.polygon.x_start < next_page.polygon.width // 2) and \
-                            (next_page_block.polygon.y_start < next_page.polygon.height // 2)
-                        break
-                    else:
-                        continue  # we didn't break anywhere so we continue
+
+                new_block_lines = next_block.structure_blocks(document)
 
                 # we check for next_block indentation
                 if len(new_block_lines):

diff --git a/marker/renderers/markdown.py b/marker/renderers/markdown.py
@@ -5,6 +5,7 @@
 from pydantic import BaseModel
 
 from marker.renderers.html import HTMLRenderer
+from marker.schema import BlockTypes
 from marker.schema.document import Document
 
 
@@ -33,9 +34,13 @@ def convert_p(self, el, text, *args):
         hyphens = r'-—¬'
         has_continuation = el.has_attr('class') and 'has-continuation' in el['class']
         if has_continuation:
-            if regex.compile(rf'.*[\p{{Ll}}|\d][{hyphens}]\s?$', regex.DOTALL).match(text):  # handle hypenation across pages
-                return regex.split(rf"[{hyphens}]\s?$", text)[0]
-            return f"{text} "
+            block_type = BlockTypes[el['block-type']]
+            if block_type in [BlockTypes.TextInlineMath, BlockTypes.Text]:
+                if regex.compile(rf'.*[\p{{Ll}}|\d][{hyphens}]\s?$', regex.DOTALL).match(text):  # handle hypenation across pages
+                    return regex.split(rf"[{hyphens}]\s?$", text)[0]
+                return f"{text} "
+            if block_type == BlockTypes.ListGroup:
+                return f"{text}"
         return f"{text}\n\n" if text else ""  # default convert_p behavior
 
 

diff --git a/marker/schema/blocks/inlinemath.py b/marker/schema/blocks/inlinemath.py
@@ -13,7 +13,7 @@ def assemble_html(self, child_blocks, parent_structure):
         template = super().assemble_html(child_blocks, parent_structure)
         template = template.replace("\n", " ")
 
-        class_attr = ""
+        class_attr = f" block-type='{self.block_type}'"
         if self.has_continuation:
-            class_attr = " class='has-continuation'"
+            class_attr += " class='has-continuation'"
         return f"<p{class_attr}>{template}</p>"
diff --git a/marker/schema/blocks/text.py b/marker/schema/blocks/text.py
@@ -13,7 +13,7 @@ def assemble_html(self, child_blocks, parent_structure):
         template = super().assemble_html(child_blocks, parent_structure)
         template = template.replace("\n", " ")
 
-        class_attr = ""
+        class_attr = f" block-type='{self.block_type}'"
         if self.has_continuation:
             class_attr += " class='has-continuation'"
         return f"<p{class_attr}>{template}</p>"
diff --git a/marker/schema/document.py b/marker/schema/document.py
@@ -42,15 +42,23 @@ def get_page(self, page_id):
                 return page
         return None
 
-    def get_next_block(self, block: Block):
+    def get_next_block(self, block: Block, ignored_block_types: List[BlockTypes] = None):  # returns the next block whose type is not ignored, or None
+        if ignored_block_types is None:  # default: no block type is ignored
+            ignored_block_types = []
+        next_block = None
+
+        # Try to find the next block in the current page
         page = self.get_page(block.page_id)
-        next_block = page.get_next_block(block)
+        next_block = page.get_next_block(block, ignored_block_types)
         if next_block:
             return next_block
-        next_page = self.get_next_page(page)
-        if not next_page:
-            return None
-        return next_page.get_block(next_page.structure[0])
+
+        # If no block found, search subsequent pages
+        for page in self.pages[self.pages.index(page) + 1:]:
+            next_block = page.get_next_block(None, ignored_block_types)  # passing None scans that page from its first block
+            if next_block:
+                return next_block
+        return None  # no later page holds a non-ignored block
 
     def get_next_page(self, page: PageGroup):
         page_idx = self.pages.index(page)

diff --git a/marker/schema/groups/list.py b/marker/schema/groups/list.py
@@ -4,7 +4,12 @@
 
 class ListGroup(Group):
     block_type: BlockTypes = BlockTypes.ListGroup
+    has_continuation: bool = False  # when True, assemble_html adds class='has-continuation' to the wrapping <p>
 
     def assemble_html(self, child_blocks, parent_structure):
         template = super().assemble_html(child_blocks, parent_structure)
+
+        class_attr = f" block-type='{self.block_type}'"  # the block type is always emitted as an attribute on the <p>
+        if self.has_continuation:
+            class_attr += " class='has-continuation'"
+        return f"<p{class_attr}><ul>{template}</ul></p>"
diff --git a/marker/schema/groups/page.py b/marker/schema/groups/page.py
@@ -1,5 +1,5 @@
 from collections import defaultdict
-from typing import Any, Dict, List, Sequence, Tuple, Union
+from typing import Any, Dict, List, Optional, Sequence, Tuple, Union
 
 from PIL import Image
 
@@ -34,11 +34,20 @@ def add_child(self, block: Block):
         else:
             self.children.append(block)
 
-    def get_next_block(self, block: Block):
-        block_idx = self.structure.index(block.id)
-        if block_idx + 1 < len(self.structure):
-            return self.get_block(self.structure[block_idx + 1])
-        return None
+    def get_next_block(self, block: Optional[Block] = None, ignored_block_types: Optional[List[BlockTypes]] = None):
+        if ignored_block_types is None:  # default: no block type is ignored
+            ignored_block_types = []
+
+        structure_idx = 0  # with no block given, scan from the start of this page's structure
+        if block is not None:
+            structure_idx = self.structure.index(block.id) + 1  # otherwise start just after the given block
+
+        # Iterate over blocks following the given block
+        for next_block_id in self.structure[structure_idx:]:
+            if next_block_id.block_type not in ignored_block_types:  # the first id whose type is not ignored is returned
+                return self.get_block(next_block_id)
+
+        return None  # No valid next block found
 
     def get_prev_block(self, block: Block):
         block_idx = self.structure.index(block.id)