Merge pull request #387 from VikParuchuri/vik_v2

Vik v2
VikParuchuri · Nov 26, 2024 · f26834c · f26834c
2 parents 96d1b81 + 69233b7
commit f26834c
Show file tree

Hide file tree

Showing 25 changed files with 555 additions and 17,757 deletions.
diff --git a/README.md b/README.md
diff --git a/data/examples/marker/multicolcnn.md b/data/examples/marker/multicolcnn.md
diff --git a/data/examples/marker/switch_transformers.md b/data/examples/marker/switch_transformers.md
diff --git a/data/examples/marker/thinkos.md b/data/examples/marker/thinkos.md
diff --git a/data/examples/marker/thinkpython.md b/data/examples/marker/thinkpython.md
diff --git a/data/examples/nougat/multicolcnn.md b/data/examples/nougat/multicolcnn.md
diff --git a/data/examples/nougat/switch_transformers.md b/data/examples/nougat/switch_transformers.md
diff --git a/data/examples/nougat/thinkos.md b/data/examples/nougat/thinkos.md
diff --git a/data/examples/nougat/thinkpython.md b/data/examples/nougat/thinkpython.md
diff --git a/marker/builders/layout.py b/marker/builders/layout.py
@@ -80,6 +80,7 @@ def merge_blocks(self, document_pages: List[PageGroup], provider_page_lines: Pro
                 document_page.text_extraction_method = "surya"
                 continue
             document_page.merge_blocks(provider_lines, text_extraction_method="pdftext")
+            document_page.text_extraction_method = "pdftext"
 
     def check_layout_coverage(
         self,
@@ -88,6 +89,7 @@ def check_layout_coverage(
     ):
         covered_blocks = 0
         total_blocks = 0
+        large_text_blocks = 0
         for layout_block_id in document_page.structure:
             layout_block = document_page.get_block(layout_block_id)
             if layout_block.block_type in [BlockTypes.Figure, BlockTypes.Picture, BlockTypes.Table, BlockTypes.FigureGroup, BlockTypes.TableGroup, BlockTypes.PictureGroup]:
@@ -102,5 +104,13 @@ def check_layout_coverage(
             if intersecting_lines > self.layout_coverage_min_lines:
                 covered_blocks += 1
 
+            if layout_block.polygon.intersection_pct(document_page.polygon) > 0.8 and layout_block.block_type == BlockTypes.Text:
+                large_text_blocks += 1
+
         coverage_ratio = covered_blocks / total_blocks if total_blocks > 0 else 1
-        return coverage_ratio >= self.layout_coverage_threshold
+        text_okay = coverage_ratio >= self.layout_coverage_threshold
+
+        # Model will sometimes say there is a single block of text on the page when it is blank
+        if not text_okay and (total_blocks == 1 and large_text_blocks == 1):
+            text_okay = True
+        return text_okay
diff --git a/marker/converters/pdf.py b/marker/converters/pdf.py
@@ -1,5 +1,6 @@
 import os
 
+from marker.processors.footnote import FootnoteProcessor
 from marker.processors.line_numbers import LineNumbersProcessor
 
 os.environ["TOKENIZERS_PARALLELISM"] = "false" # disables a tokenizers warning
@@ -54,6 +55,7 @@ def __init__(self, artifact_dict: Dict[str, Any], processor_list: List[str] | No
             processor_list = strings_to_classes(processor_list)
         else:
             processor_list = [
+                FootnoteProcessor,
                 EquationProcessor,
                 TableProcessor,
                 SectionHeaderProcessor,

diff --git a/marker/processors/code.py b/marker/processors/code.py
@@ -15,25 +15,22 @@ def __call__(self, document: Document):
             for block in page.contained_blocks(document, self.block_types):
                 self.format_block(document, block)
 
+
     def format_block(self, document: Document, block: Code):
         min_left = 9999  # will contain x- coord of column 0
         total_width = 0
         total_chars = 0
-
-        if block.structure is None:
-            return
-
-        for line_id in block.structure:
-            line = document.get_block(line_id)
+
+        contained_lines = block.contained_blocks(document, (BlockTypes.Line,))
+        for line in contained_lines:
             min_left = min(line.polygon.bbox[0], min_left)
             total_width += line.polygon.width
             total_chars += len(line.raw_text(document))
 
         avg_char_width = total_width / max(total_chars, 1)
         code_text = ""
         is_new_line = False
-        for line_id in block.structure:
-            line = document.get_block(line_id)
+        for line in contained_lines:
             text = line.raw_text(document)
             if avg_char_width == 0:
                 prefix = ""
@@ -47,4 +44,4 @@ def format_block(self, document: Document, block: Code):
             code_text += text
             is_new_line = text.endswith("\n")
 
-        block.code = code_text
+        block.code = code_text.rstrip()
diff --git a/marker/processors/debug.py b/marker/processors/debug.py
@@ -54,6 +54,8 @@ def __call__(self, document: Document):
         if any([self.debug_layout_images, self.debug_pdf_images, self.debug_json]):
             os.makedirs(self.debug_folder, exist_ok=True)
 
+        document.debug_data_path = self.debug_folder
+
         if self.debug_layout_images:
             self.draw_layout_debug_images(document)
             print(f"Dumped layout debug images to {self.debug_data_folder}")

diff --git a/marker/processors/footnote.py b/marker/processors/footnote.py
@@ -0,0 +1,88 @@
+import re
+from collections import Counter
+from statistics import mean
+
+from marker.processors import BaseProcessor
+from marker.schema import BlockTypes
+from marker.schema.blocks import Footnote
+from marker.schema.document import Document
+
+from rapidfuzz import fuzz
+
+from marker.schema.groups import PageGroup
+
+
+class FootnoteProcessor(BaseProcessor):
+    """
+    A processor for pushing footnotes to the bottom, and relabeling mislabeled text blocks.
+
+    Attributes:
+        page_bottom_threshold (float):
+            The fraction of page height that is considered the bottom.
+            Default is .75
+
+        line_height_scaler (float):
+            The amount to scale line height by to consider a block a footnote. (from N to 1+(1-N))
+            Default is .99
+    """
+    block_types = (BlockTypes.Footnote,)
+    page_bottom_threshold = .75
+    line_height_scaler = .99
+
+    def __call__(self, document: Document):
+        """
+        Relabel footnote-like text blocks on every page, then move each page's
+        top-level footnotes to the end of its structure.
+        """
+        footnote_heights = self.compute_block_stats(document)
+        # Without any footnote lines there is no reference height to compare against,
+        # so relabeling is skipped instead of matching against a placeholder value.
+        avg_footnote_height = mean(footnote_heights) if footnote_heights else None
+        for page in document.pages:
+            if avg_footnote_height is not None:
+                self.relabel_texts_to_footnotes(page, document, avg_footnote_height)
+            self.push_footnotes_to_bottom(page, document)
+
+    def compute_block_stats(self, document: Document):
+        """Return the height of every line contained in a footnote block, across all pages."""
+        line_heights = []
+        for page in document.pages:
+            for footnote in page.contained_blocks(document, self.block_types):
+                contained_lines = footnote.contained_blocks(document, (BlockTypes.Line,))
+                line_heights.extend([line.polygon.height for line in contained_lines])
+        return line_heights
+
+    def relabel_texts_to_footnotes(self, page: PageGroup, document: Document, avg_footnote_height: float):
+        """
+        Replace text blocks with footnote blocks when they end below the page-bottom
+        threshold and their mean line height falls strictly inside the band
+        (avg * line_height_scaler, avg * (1 + (1 - line_height_scaler))).
+        """
+        height_gap = 1 - self.line_height_scaler
+        min_height = avg_footnote_height * self.line_height_scaler
+        max_height = avg_footnote_height * (1 + height_gap)
+        bottom_start = page.polygon.height * self.page_bottom_threshold
+
+        to_relabel = []
+        for text_block in page.contained_blocks(document, (BlockTypes.Text,)):
+            contained_lines = text_block.contained_blocks(document, (BlockTypes.Line,))
+            # A block with no lines has no measurable line height, so it is never relabeled.
+            if not contained_lines or text_block.polygon.y_end <= bottom_start:
+                continue
+            if min_height < mean([line.polygon.height for line in contained_lines]) < max_height:
+                to_relabel.append(text_block)
+
+        # Replace only after every block is measured, so structure edits cannot affect the stats.
+        for text_block in to_relabel:
+            page.replace_block(text_block, Footnote.from_block(text_block))
+
+    def push_footnotes_to_bottom(self, page: PageGroup, document: Document):
+        """Move every top-level footnote block to the bottom of the page structure."""
+        footnote_blocks = page.contained_blocks(document, self.block_types)
+
+        for block in footnote_blocks:
+            # Footnotes nested inside another block are left where they are.
+            if block.id in page.structure:
+                # Move to bottom: drop the id from the structure, then re-add the block.
+                page.structure.remove(block.id)
+                page.add_structure(block)
diff --git a/marker/renderers/__init__.py b/marker/renderers/__init__.py
@@ -1,6 +1,7 @@
 import base64
 import io
 import re
+from collections import Counter
 from typing import Optional
 
 from bs4 import BeautifulSoup
@@ -56,17 +57,23 @@ def replace_whitespace(match):
     def generate_page_stats(self, document, document_output):
         page_stats = []
         for page in document.pages:
+            block_counts = Counter([str(block.block_type) for block in page.children]).most_common()
             page_stats.append({
                 "page_id": page.page_id,
-                "text_extraction_method": page.text_extraction_method
+                "text_extraction_method": page.text_extraction_method,
+                "block_counts": block_counts,
             })
         return page_stats
 
     def generate_document_metadata(self, document, document_output):
-        return {
+        metadata =  {
             "table_of_contents": document.table_of_contents,
-            "page_stats": self.generate_page_stats(document, document_output)
+            "page_stats": self.generate_page_stats(document, document_output),
         }
+        if document.debug_data_path is not None:
+            metadata["debug_data_path"] = document.debug_data_path
+
+        return metadata
 
     def extract_block_html(self, document, block_output):
         soup = BeautifulSoup(block_output.html, 'html.parser')

diff --git a/marker/renderers/json.py b/marker/renderers/json.py
@@ -1,16 +1,12 @@
 from __future__ import annotations
 
-import base64
-import io
 from typing import List, Dict
 
-from bs4 import BeautifulSoup
 from pydantic import BaseModel
 
 from marker.schema.blocks import Block
 from marker.renderers import BaseRenderer
 from marker.schema import BlockTypes
-from marker.schema.blocks import BlockId
 from marker.schema.registry import get_block_class
 
 

diff --git a/marker/schema/blocks/base.py b/marker/schema/blocks/base.py
@@ -76,6 +76,11 @@ def id(self) -> BlockId:
             block_type=self.block_type
         )
 
+    @classmethod
+    def from_block(cls, block: Block) -> Block:
+        """Build a new `cls` instance from another block's fields, leaving out its id, block_id and block_type."""
+        return cls(**block.model_dump(exclude={"id", "block_id", "block_type"}))
+
     def structure_blocks(self, document_page: Document | PageGroup) -> List[Block]:
         if self.structure is None:
             return []
@@ -147,6 +152,13 @@ def contained_blocks(self, document: Document, block_types: Sequence[BlockTypes]
             blocks += block.contained_blocks(document, block_types)
         return blocks
 
+    def replace_block(self, block: Block, new_block: Block):
+        """Point the first structure entry equal to `block.id` at `new_block.id`; do nothing if absent."""
+        if self.structure is None or block.id not in self.structure:
+            return
+        # list.index locates the first match, so at most one entry is rewritten.
+        self.structure[self.structure.index(block.id)] = new_block.id
+
     def render(self, document: Document, parent_structure: Optional[List[str]], section_hierarchy=None):
         child_content = []
         if section_hierarchy is None:

diff --git a/marker/schema/document.py b/marker/schema/document.py
@@ -27,6 +27,7 @@ class Document(BaseModel):
     pages: List[PageGroup]
     block_type: BlockTypes = BlockTypes.Document
     table_of_contents: List[TocItem] | None = None
+    debug_data_path: str | None = None # Path that debug data was saved to
 
     def get_block(self, block_id: BlockId):
         page = self.get_page(block_id.page_id)

diff --git a/marker/schema/groups/page.py b/marker/schema/groups/page.py
@@ -1,4 +1,5 @@
-from typing import List
+from collections import defaultdict
+from typing import Dict, List, TYPE_CHECKING
 
 from PIL import Image
 
@@ -8,6 +9,9 @@
 from marker.schema.groups.base import Group
 from marker.schema.polygon import PolygonBox
 
+if TYPE_CHECKING:
+    from marker.schema.document import Document
+
 
 class PageGroup(Group):
     block_type: BlockTypes = BlockTypes.Page
@@ -80,6 +84,17 @@ def compute_line_block_intersections(self, provider_outputs: List[ProviderOutput
                     max_intersections[line_idx] = (intersection_pct, block_idx)
         return max_intersections
 
+    def replace_block(self, block: Block, new_block: Block):
+        """Add new_block to this page, then swap block's id for new_block's id in the page's and each child's structure."""
+        # Handles incrementing the id, so run it before new_block.id is written into any structure
+        self.add_full_block(new_block)
+        # Replace block id in structure
+        super().replace_block(block, new_block)
+
+        # Replace block in structure of children
+        for child in self.children:
+            child.replace_block(block, new_block)
+
     def merge_blocks(
         self,
         provider_outputs: List[ProviderOutput],
@@ -89,28 +104,21 @@ def merge_blocks(
         provider_line_idxs = set(range(len(provider_outputs)))
         max_intersections = self.compute_line_block_intersections(provider_outputs, excluded_block_types)
 
+        # Try to assign lines by intersection
         assigned_line_idxs = set()
+        block_lines = defaultdict(list)
         for line_idx, provider_output in enumerate(provider_outputs):
             if line_idx in max_intersections and max_intersections[line_idx][0] > 0.0:
-                line = provider_output.line
-                spans = provider_output.spans
-                self.add_full_block(line)
                 block_idx = max_intersections[line_idx][1]
-                block: Block = self.children[block_idx]
-                block.add_structure(line)
-                block.polygon = block.polygon.merge([line.polygon])
-                block.text_extraction_method = text_extraction_method
+                block_lines[block_idx].append((line_idx, provider_output))
                 assigned_line_idxs.add(line_idx)
-                for span in spans:
-                    self.add_full_block(span)
-                    line.add_structure(span)
 
+        # If no intersection, assign by distance
         for line_idx in provider_line_idxs.difference(assigned_line_idxs):
             min_dist = None
             min_dist_idx = None
             provider_output: ProviderOutput = provider_outputs[line_idx]
             line = provider_output.line
-            spans = provider_output.spans
             for block_idx, block in enumerate(self.children):
                 if block.block_type in excluded_block_types:
                     continue
@@ -120,12 +128,20 @@ def merge_blocks(
                     min_dist_idx = block_idx
 
             if min_dist_idx is not None:
-                self.add_full_block(line)
-                nearest_block = self.children[min_dist_idx]
-                nearest_block.add_structure(line)
-                nearest_block.polygon = nearest_block.polygon.merge([line.polygon])
-                nearest_block.text_extraction_method = text_extraction_method
+                block_lines[min_dist_idx].append((line_idx, provider_output))
                 assigned_line_idxs.add(line_idx)
+
+        # Add lines to the proper blocks, sorted in order
+        for block_idx, lines in block_lines.items():
+            lines = sorted(lines, key=lambda x: x[0])
+            block = self.children[block_idx]
+            for line_idx, provider_output in lines:
+                line = provider_output.line
+                spans = provider_output.spans
+                self.add_full_block(line)
+                block.add_structure(line)
+                block.polygon = block.polygon.merge([line.polygon])
+                block.text_extraction_method = text_extraction_method
                 for span in spans:
                     self.add_full_block(span)
                     line.add_structure(span)
diff --git a/marker/settings.py b/marker/settings.py
@@ -12,6 +12,7 @@ class Settings(BaseSettings):
     BASE_DIR: str = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
     OUTPUT_DIR: str = os.path.join(BASE_DIR, "conversion_results")
     FONT_DIR: str = os.path.join(BASE_DIR, "static", "fonts")
+    DEBUG_DATA_FOLDER: str = os.path.join(BASE_DIR, "debug_data")
 
     # General models
     TORCH_DEVICE: Optional[str] = None # Note: MPS device does not work for text detection, and will default to CPU