Skip to content

Commit

Permalink
Merge pull request #387 from VikParuchuri/vik_v2
Browse files Browse the repository at this point in the history
Vik v2
  • Loading branch information
VikParuchuri authored Nov 26, 2024
2 parents 96d1b81 + 69233b7 commit f26834c
Show file tree
Hide file tree
Showing 25 changed files with 555 additions and 17,757 deletions.
290 changes: 178 additions & 112 deletions README.md

Large diffs are not rendered by default.

350 changes: 0 additions & 350 deletions data/examples/marker/multicolcnn.md

This file was deleted.

925 changes: 0 additions & 925 deletions data/examples/marker/switch_transformers.md

This file was deleted.

2,248 changes: 0 additions & 2,248 deletions data/examples/marker/thinkos.md

This file was deleted.

6,369 changes: 0 additions & 6,369 deletions data/examples/marker/thinkpython.md

This file was deleted.

245 changes: 0 additions & 245 deletions data/examples/nougat/multicolcnn.md

This file was deleted.

528 changes: 0 additions & 528 deletions data/examples/nougat/switch_transformers.md

This file was deleted.

1,380 changes: 0 additions & 1,380 deletions data/examples/nougat/thinkos.md

This file was deleted.

5,394 changes: 0 additions & 5,394 deletions data/examples/nougat/thinkpython.md

This file was deleted.

12 changes: 11 additions & 1 deletion marker/builders/layout.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,7 @@ def merge_blocks(self, document_pages: List[PageGroup], provider_page_lines: Pro
document_page.text_extraction_method = "surya"
continue
document_page.merge_blocks(provider_lines, text_extraction_method="pdftext")
document_page.text_extraction_method = "pdftext"

def check_layout_coverage(
self,
Expand All @@ -88,6 +89,7 @@ def check_layout_coverage(
):
covered_blocks = 0
total_blocks = 0
large_text_blocks = 0
for layout_block_id in document_page.structure:
layout_block = document_page.get_block(layout_block_id)
if layout_block.block_type in [BlockTypes.Figure, BlockTypes.Picture, BlockTypes.Table, BlockTypes.FigureGroup, BlockTypes.TableGroup, BlockTypes.PictureGroup]:
Expand All @@ -102,5 +104,13 @@ def check_layout_coverage(
if intersecting_lines > self.layout_coverage_min_lines:
covered_blocks += 1

if layout_block.polygon.intersection_pct(document_page.polygon) > 0.8 and layout_block.block_type == BlockTypes.Text:
large_text_blocks += 1

coverage_ratio = covered_blocks / total_blocks if total_blocks > 0 else 1
return coverage_ratio >= self.layout_coverage_threshold
text_okay = coverage_ratio >= self.layout_coverage_threshold

# Model will sometimes say there is a single block of text on the page when it is blank
if not text_okay and (total_blocks == 1 and large_text_blocks == 1):
text_okay = True
return text_okay
2 changes: 2 additions & 0 deletions marker/converters/pdf.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import os

from marker.processors.footnote import FootnoteProcessor
from marker.processors.line_numbers import LineNumbersProcessor

os.environ["TOKENIZERS_PARALLELISM"] = "false" # disables a tokenizers warning
Expand Down Expand Up @@ -54,6 +55,7 @@ def __init__(self, artifact_dict: Dict[str, Any], processor_list: List[str] | No
processor_list = strings_to_classes(processor_list)
else:
processor_list = [
FootnoteProcessor,
EquationProcessor,
TableProcessor,
SectionHeaderProcessor,
Expand Down
15 changes: 6 additions & 9 deletions marker/processors/code.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,25 +15,22 @@ def __call__(self, document: Document):
for block in page.contained_blocks(document, self.block_types):
self.format_block(document, block)


def format_block(self, document: Document, block: Code):
min_left = 9999 # will contain x- coord of column 0
total_width = 0
total_chars = 0

if block.structure is None:
return

for line_id in block.structure:
line = document.get_block(line_id)

contained_lines = block.contained_blocks(document, (BlockTypes.Line,))
for line in contained_lines:
min_left = min(line.polygon.bbox[0], min_left)
total_width += line.polygon.width
total_chars += len(line.raw_text(document))

avg_char_width = total_width / max(total_chars, 1)
code_text = ""
is_new_line = False
for line_id in block.structure:
line = document.get_block(line_id)
for line in contained_lines:
text = line.raw_text(document)
if avg_char_width == 0:
prefix = ""
Expand All @@ -47,4 +44,4 @@ def format_block(self, document: Document, block: Code):
code_text += text
is_new_line = text.endswith("\n")

block.code = code_text
block.code = code_text.rstrip()
2 changes: 2 additions & 0 deletions marker/processors/debug.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,8 @@ def __call__(self, document: Document):
if any([self.debug_layout_images, self.debug_pdf_images, self.debug_json]):
os.makedirs(self.debug_folder, exist_ok=True)

document.debug_data_path = self.debug_folder

if self.debug_layout_images:
self.draw_layout_debug_images(document)
print(f"Dumped layout debug images to {self.debug_data_folder}")
Expand Down
88 changes: 88 additions & 0 deletions marker/processors/footnote.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
import re
from collections import Counter
from statistics import mean

from marker.processors import BaseProcessor
from marker.schema import BlockTypes
from marker.schema.blocks import Footnote
from marker.schema.document import Document

from rapidfuzz import fuzz

from marker.schema.groups import PageGroup


class FootnoteProcessor(BaseProcessor):
    """
    A processor for pushing footnotes to the bottom, and relabeling mislabeled text blocks.

    Attributes:
        page_bottom_threshold (float):
            The fraction of page height past which a block counts as being
            at the bottom of the page.
            Default is .75
        line_height_scaler (float):
            Tolerance used when comparing line heights. With a value of N, a
            text block matches when its mean line height lies strictly between
            N and 1 + (1 - N) times the average footnote line height.
            Default is .99
    """
    block_types = (BlockTypes.Footnote,)
    page_bottom_threshold = .75
    line_height_scaler = .99

    def __call__(self, document: Document):
        """
        Relabel footnote-like text blocks, then move footnotes to the end of each page.

        Relabeling is skipped entirely when the document has no footnote
        lines to measure, since there is then no reference height to compare
        text blocks against.
        """
        footnote_heights = self.compute_block_stats(document)
        # No footnote lines -> no reference height. Using a numeric sentinel
        # here would collide with any sentinel used for empty text blocks, so
        # the absence is modelled explicitly instead.
        avg_footnote_height = mean(footnote_heights) if footnote_heights else None

        for page in document.pages:
            if avg_footnote_height is not None:
                self.relabel_texts_to_footnotes(page, document, avg_footnote_height)
            self.push_footnotes_to_bottom(page, document)

    def compute_block_stats(self, document: Document):
        """
        Collect the height of every line contained in a footnote block.

        Returns:
            A flat list of line heights across all pages; empty when the
            document has no footnote lines.
        """
        line_heights = []
        for page in document.pages:
            for footnote in page.contained_blocks(document, self.block_types):
                contained_lines = footnote.contained_blocks(document, (BlockTypes.Line,))
                line_heights.extend(line.polygon.height for line in contained_lines)
        return line_heights

    def relabel_texts_to_footnotes(self, page: PageGroup, document: Document, avg_footnote_height: float):
        """
        Convert text blocks on `page` that look like footnotes into Footnote blocks.

        A text block is relabeled when it ends in the bottom region of the
        page (see `page_bottom_threshold`) and its mean line height is within
        the `line_height_scaler` tolerance of `avg_footnote_height`.

        Args:
            page: The page whose text blocks are inspected.
            document: The document that owns the page.
            avg_footnote_height: Mean line height of the document's footnotes.
        """
        height_gap = 1 - self.line_height_scaler
        min_height = avg_footnote_height * self.line_height_scaler
        max_height = avg_footnote_height * (1 + height_gap)
        bottom_start = page.polygon.height * self.page_bottom_threshold

        # contained_blocks is evaluated once, before any replacement happens,
        # so replacing blocks inside the loop does not disturb the iteration.
        for text_block in page.contained_blocks(document, (BlockTypes.Text,)):
            contained_lines = text_block.contained_blocks(document, (BlockTypes.Line,))
            if not contained_lines:
                # A block without lines has no line height to compare; it can
                # never be identified as a footnote by this heuristic.
                continue

            if not text_block.polygon.y_end > bottom_start:
                continue

            line_height = mean([line.polygon.height for line in contained_lines])
            if min_height < line_height < max_height:
                new_block = Footnote.from_block(text_block)
                page.replace_block(text_block, new_block)

    def push_footnotes_to_bottom(self, page: PageGroup, document: Document):
        """
        Move every top-level footnote of `page` to the end of the page structure.

        Footnotes nested inside other blocks (i.e. whose id is not listed
        directly in `page.structure`) are left where they are.
        """
        footnote_blocks = page.contained_blocks(document, self.block_types)

        for block in footnote_blocks:
            # Only top-level blocks appear in the page's own structure.
            if block.id in page.structure:
                # Remove and re-append so the footnote ends up last.
                page.structure.remove(block.id)
                page.add_structure(block)
13 changes: 10 additions & 3 deletions marker/renderers/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import base64
import io
import re
from collections import Counter
from typing import Optional

from bs4 import BeautifulSoup
Expand Down Expand Up @@ -56,17 +57,23 @@ def replace_whitespace(match):
def generate_page_stats(self, document, document_output):
page_stats = []
for page in document.pages:
block_counts = Counter([str(block.block_type) for block in page.children]).most_common()
page_stats.append({
"page_id": page.page_id,
"text_extraction_method": page.text_extraction_method
"text_extraction_method": page.text_extraction_method,
"block_counts": block_counts,
})
return page_stats

def generate_document_metadata(self, document, document_output):
return {
metadata = {
"table_of_contents": document.table_of_contents,
"page_stats": self.generate_page_stats(document, document_output)
"page_stats": self.generate_page_stats(document, document_output),
}
if document.debug_data_path is not None:
metadata["debug_data_path"] = document.debug_data_path

return metadata

def extract_block_html(self, document, block_output):
soup = BeautifulSoup(block_output.html, 'html.parser')
Expand Down
4 changes: 0 additions & 4 deletions marker/renderers/json.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,12 @@
from __future__ import annotations

import base64
import io
from typing import List, Dict

from bs4 import BeautifulSoup
from pydantic import BaseModel

from marker.schema.blocks import Block
from marker.renderers import BaseRenderer
from marker.schema import BlockTypes
from marker.schema.blocks import BlockId
from marker.schema.registry import get_block_class


Expand Down
12 changes: 12 additions & 0 deletions marker/schema/blocks/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,11 @@ def id(self) -> BlockId:
block_type=self.block_type
)

@classmethod
def from_block(cls, block: Block) -> Block:
    """
    Alternate constructor: build an instance of `cls` from another block.

    The source block is dumped with `model_dump`, leaving out the `id`,
    `block_id` and `block_type` entries, and the remaining fields are passed
    to `cls` as keyword arguments.
    """
    return cls(**block.model_dump(exclude=["id", "block_id", "block_type"]))

def structure_blocks(self, document_page: Document | PageGroup) -> List[Block]:
if self.structure is None:
return []
Expand Down Expand Up @@ -147,6 +152,13 @@ def contained_blocks(self, document: Document, block_types: Sequence[BlockTypes]
blocks += block.contained_blocks(document, block_types)
return blocks

def replace_block(self, block: Block, new_block: Block):
    """
    Point the first structure entry that equals `block.id` at `new_block.id`.

    Does nothing when this block has no structure or when `block.id` is not
    listed in it. Only the first matching entry is rewritten.
    """
    if self.structure is None:
        return

    # Position of the first entry equal to the old block's id, if any.
    match_idx = next(
        (idx for idx, entry in enumerate(self.structure) if entry == block.id),
        None,
    )
    if match_idx is not None:
        self.structure[match_idx] = new_block.id

def render(self, document: Document, parent_structure: Optional[List[str]], section_hierarchy=None):
child_content = []
if section_hierarchy is None:
Expand Down
1 change: 1 addition & 0 deletions marker/schema/document.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ class Document(BaseModel):
pages: List[PageGroup]
block_type: BlockTypes = BlockTypes.Document
table_of_contents: List[TocItem] | None = None
debug_data_path: str | None = None # Path that debug data was saved to

def get_block(self, block_id: BlockId):
page = self.get_page(block_id.page_id)
Expand Down
50 changes: 33 additions & 17 deletions marker/schema/groups/page.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from typing import List
from collections import defaultdict
from typing import Dict, List, TYPE_CHECKING

from PIL import Image

Expand All @@ -8,6 +9,9 @@
from marker.schema.groups.base import Group
from marker.schema.polygon import PolygonBox

if TYPE_CHECKING:
from marker.schema.document import Document


class PageGroup(Group):
block_type: BlockTypes = BlockTypes.Page
Expand Down Expand Up @@ -80,6 +84,17 @@ def compute_line_block_intersections(self, provider_outputs: List[ProviderOutput
max_intersections[line_idx] = (intersection_pct, block_idx)
return max_intersections

def replace_block(self, block: Block, new_block: Block):
    """
    Swap `block` for `new_block` on this page and in its children's structures.

    `new_block` is registered through `add_full_block` before any structure
    entry is rewritten; per the original author's note that call handles
    incrementing the id, so presumably the id must be assigned first.
    """
    # Register the replacement first (handles incrementing the id).
    self.add_full_block(new_block)

    # Rewrite the reference in the page's own structure.
    super().replace_block(block, new_block)

    # Rewrite the reference in the structure of each child block as well.
    for page_child in self.children:
        page_child.replace_block(block, new_block)

def merge_blocks(
self,
provider_outputs: List[ProviderOutput],
Expand All @@ -89,28 +104,21 @@ def merge_blocks(
provider_line_idxs = set(range(len(provider_outputs)))
max_intersections = self.compute_line_block_intersections(provider_outputs, excluded_block_types)

# Try to assign lines by intersection
assigned_line_idxs = set()
block_lines = defaultdict(list)
for line_idx, provider_output in enumerate(provider_outputs):
if line_idx in max_intersections and max_intersections[line_idx][0] > 0.0:
line = provider_output.line
spans = provider_output.spans
self.add_full_block(line)
block_idx = max_intersections[line_idx][1]
block: Block = self.children[block_idx]
block.add_structure(line)
block.polygon = block.polygon.merge([line.polygon])
block.text_extraction_method = text_extraction_method
block_lines[block_idx].append((line_idx, provider_output))
assigned_line_idxs.add(line_idx)
for span in spans:
self.add_full_block(span)
line.add_structure(span)

# If no intersection, assign by distance
for line_idx in provider_line_idxs.difference(assigned_line_idxs):
min_dist = None
min_dist_idx = None
provider_output: ProviderOutput = provider_outputs[line_idx]
line = provider_output.line
spans = provider_output.spans
for block_idx, block in enumerate(self.children):
if block.block_type in excluded_block_types:
continue
Expand All @@ -120,12 +128,20 @@ def merge_blocks(
min_dist_idx = block_idx

if min_dist_idx is not None:
self.add_full_block(line)
nearest_block = self.children[min_dist_idx]
nearest_block.add_structure(line)
nearest_block.polygon = nearest_block.polygon.merge([line.polygon])
nearest_block.text_extraction_method = text_extraction_method
block_lines[min_dist_idx].append((line_idx, provider_output))
assigned_line_idxs.add(line_idx)

# Add lines to the proper blocks, sorted in order
for block_idx, lines in block_lines.items():
lines = sorted(lines, key=lambda x: x[0])
block = self.children[block_idx]
for line_idx, provider_output in lines:
line = provider_output.line
spans = provider_output.spans
self.add_full_block(line)
block.add_structure(line)
block.polygon = block.polygon.merge([line.polygon])
block.text_extraction_method = text_extraction_method
for span in spans:
self.add_full_block(span)
line.add_structure(span)
1 change: 1 addition & 0 deletions marker/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ class Settings(BaseSettings):
BASE_DIR: str = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
OUTPUT_DIR: str = os.path.join(BASE_DIR, "conversion_results")
FONT_DIR: str = os.path.join(BASE_DIR, "static", "fonts")
DEBUG_DATA_FOLDER: str = os.path.join(BASE_DIR, "debug_data")

# General models
TORCH_DEVICE: Optional[str] = None # Note: MPS device does not work for text detection, and will default to CPU
Expand Down
Loading

0 comments on commit f26834c

Please sign in to comment.