Skip to content

Commit

Permalink
Merge pull request #403 from VikParuchuri/dev
Browse files Browse the repository at this point in the history
Marker Improvements and Bugfixes
  • Loading branch information
VikParuchuri authored Dec 3, 2024
2 parents ea3caac + 619f5b0 commit f446e56
Show file tree
Hide file tree
Showing 39 changed files with 408 additions and 144 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ There's a hosted API for marker available [here](https://www.datalab.to/):

- Supports PDFs, word documents, and powerpoints
- 1/4th the price of leading cloud-based competitors
- High uptime (99.99%), quality, and speed (.25s/page for 50 page doc)
- High uptime (99.99%), quality, and speed (around 15 seconds to convert a 250 page PDF)

# Community

Expand Down
4 changes: 3 additions & 1 deletion benchmarks/overall.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,8 @@
import subprocess
import shutil
from tabulate import tabulate

from marker.settings import settings
from scoring import score_text

configure_logging()
Expand Down Expand Up @@ -53,7 +55,7 @@ def main(in_folder: str, reference_folder: str, out_file: str, nougat: bool, md_
md_filename = fname.rsplit(".", 1)[0] + ".md"

reference_filename = os.path.join(reference_folder, md_filename)
with open(reference_filename, "r", encoding="utf-8") as f:
with open(reference_filename, "r") as f:
reference = f.read()

pdf_filename = os.path.join(in_folder, fname)
Expand Down
Binary file modified data/images/overall.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file modified data/images/per_doc.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
12 changes: 7 additions & 5 deletions marker/builders/document.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,13 +33,15 @@ def __call__(self, provider: PdfProvider, layout_builder: LayoutBuilder, ocr_bui

def build_document(self, provider: PdfProvider):
PageGroupClass: PageGroup = get_block_class(BlockTypes.Page)
lowres_images = provider.get_images(provider.page_range, self.lowres_image_dpi)
highres_images = provider.get_images(provider.page_range, self.highres_image_dpi)
initial_pages = [
PageGroupClass(
page_id=i,
lowres_image=provider.get_image(i, self.lowres_image_dpi),
highres_image=provider.get_image(i, self.highres_image_dpi),
polygon=provider.get_page_bbox(i)
) for i in provider.page_range
page_id=p,
lowres_image=lowres_images[i],
highres_image=highres_images[i],
polygon=provider.get_page_bbox(p)
) for i, p in enumerate(provider.page_range)
]
DocumentClass: Document = get_block_class(BlockTypes.Document)
return DocumentClass(filepath=provider.filepath, pages=initial_pages)
4 changes: 4 additions & 0 deletions marker/builders/layout.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,10 @@ def add_blocks_to_pages(self, pages: List[PageGroup], layout_results: List[Layou
layout_block.polygon = layout_block.polygon.rescale(layout_page_size, provider_page_size)
page.add_structure(layout_block)

# Ensure page.structure is always a list (possibly empty) rather than None
if page.structure is None:
page.structure = []

def merge_blocks(self, document_pages: List[PageGroup], provider_page_lines: ProviderPageLines):
good_pages = []
for document_page in document_pages:
Expand Down
7 changes: 4 additions & 3 deletions marker/builders/ocr.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
from typing import List

from ftfy import fix_text
from surya.model.detection.model import EfficientViTForSemanticSegmentation
from surya.model.recognition.encoderdecoder import OCREncoderDecoderModel
from surya.ocr import run_ocr

from marker.settings import settings
from marker.builders import BaseBuilder
from marker.providers import ProviderOutput, ProviderPageLines
from marker.providers.pdf import PdfProvider
Expand All @@ -14,6 +14,7 @@
from marker.schema.registry import get_block_class
from marker.schema.text.line import Line
from marker.schema.text.span import Span
from marker.settings import settings


class OcrBuilder(BaseBuilder):
Expand Down Expand Up @@ -96,13 +97,13 @@ def ocr_extraction(self, document: Document, provider: PdfProvider) -> ProviderP
)
spans = [
SpanClass(
text=ocr_line.text + "\n",
text=fix_text(ocr_line.text) + "\n",
formats=['plain'],
page_id=page_id,
polygon=polygon,
minimum_position=0,
maximum_position=0,
font='',
font='Unknown',
font_weight=0,
font_size=0,
)
Expand Down
8 changes: 8 additions & 0 deletions marker/config/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,8 @@ def common_options(fn):
help="Path to JSON file with additional configuration.")(fn)
fn = click.option("--languages", type=str, default=None, help="Comma separated list of languages to use for OCR.")(fn)
fn = click.option("--disable_multiprocessing", is_flag=True, default=False, help="Disable multiprocessing.")(fn)
fn = click.option("--paginate_output", is_flag=True, default=False, help="Paginate output.")(fn)
fn = click.option("--disable_image_extraction", is_flag=True, default=False, help="Disable image extraction.")(fn)
return fn

def generate_config_dict(self) -> Dict[str, any]:
Expand Down Expand Up @@ -61,6 +63,12 @@ def generate_config_dict(self) -> Dict[str, any]:
case "disable_multiprocessing":
if v:
config["pdftext_workers"] = 1
case "paginate_output":
if v:
config["paginate_output"] = True
case "disable_image_extraction":
if v:
config["extract_images"] = False
return config

def get_renderer(self):
Expand Down
16 changes: 10 additions & 6 deletions marker/converters/pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,13 +10,15 @@
from marker.builders.ocr import OcrBuilder
from marker.builders.structure import StructureBuilder
from marker.converters import BaseConverter
from marker.processors.blockquote import BlockquoteProcessor
from marker.processors.code import CodeProcessor
from marker.processors.debug import DebugProcessor
from marker.processors.document_toc import DocumentTOCProcessor
from marker.processors.equation import EquationProcessor
from marker.processors.footnote import FootnoteProcessor
from marker.processors.ignoretext import IgnoreTextProcessor
from marker.processors.line_numbers import LineNumbersProcessor
from marker.processors.list import ListProcessor
from marker.processors.page_header import PageHeaderProcessor
from marker.processors.sectionheader import SectionHeaderProcessor
from marker.processors.table import TableProcessor
Expand Down Expand Up @@ -52,16 +54,18 @@ def __init__(self, artifact_dict: Dict[str, Any], processor_list: List[str] | No
processor_list = strings_to_classes(processor_list)
else:
processor_list = [
FootnoteProcessor,
PageHeaderProcessor,
EquationProcessor,
TableProcessor,
SectionHeaderProcessor,
TextProcessor,
BlockquoteProcessor,
CodeProcessor,
DocumentTOCProcessor,
EquationProcessor,
FootnoteProcessor,
IgnoreTextProcessor,
LineNumbersProcessor,
ListProcessor,
PageHeaderProcessor,
SectionHeaderProcessor,
TableProcessor,
TextProcessor,
DebugProcessor,
]

Expand Down
8 changes: 5 additions & 3 deletions marker/output.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
from marker.renderers.html import HTMLOutput
from marker.renderers.json import JSONOutput
from marker.renderers.markdown import MarkdownOutput
from marker.settings import settings


def output_exists(output_dir: str, fname_base: str):
Expand All @@ -29,11 +30,12 @@ def text_from_rendered(rendered: BaseModel):

def save_output(rendered: BaseModel, output_dir: str, fname_base: str):
text, ext, images = text_from_rendered(rendered)
text = text.encode(settings.OUTPUT_ENCODING, errors='replace').decode(settings.OUTPUT_ENCODING)

with open(os.path.join(output_dir, f"{fname_base}.{ext}"), "w+") as f:
with open(os.path.join(output_dir, f"{fname_base}.{ext}"), "w+", encoding=settings.OUTPUT_ENCODING) as f:
f.write(text)
with open(os.path.join(output_dir, f"{fname_base}_meta.json"), "w+") as f:
with open(os.path.join(output_dir, f"{fname_base}_meta.json"), "w+", encoding=settings.OUTPUT_ENCODING) as f:
f.write(json.dumps(rendered.metadata, indent=2))

for img_name, img in images.items():
img.save(os.path.join(output_dir, img_name), "PNG")
img.save(os.path.join(output_dir, img_name), "PNG", optimize=False, compress_level=3)
49 changes: 49 additions & 0 deletions marker/processors/blockquote.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
from marker.processors import BaseProcessor
from marker.schema import BlockTypes
from marker.schema.document import Document


class BlockquoteProcessor(BaseProcessor):
    """
    A processor for tagging blockquotes.

    Each block of one of the handled types is compared with the block that
    follows it on the same page. Based on how the follower's polygon lines up
    with (or is indented relative to) the current block, the follower's
    ``blockquote`` flag and ``blockquote_level`` are set.
    """
    block_types = (BlockTypes.Text, BlockTypes.TextInlineMath)
    # All three thresholds are multiplied by the current block's width.
    min_x_indent = 0.05  # minimum left-edge shift that counts as an indent
    x_start_tolerance = 0.01  # allowed left-edge difference for "aligned"
    x_end_tolerance = 0.01  # allowed right-edge difference for "aligned"

    def __init__(self, config):
        super().__init__(config)

    def __call__(self, document: Document):
        """Walk every page and tag the follower of each qualifying block."""
        for page in document.pages:
            for current in page.contained_blocks(document, self.block_types):
                # Only blocks with at least two structure entries are considered.
                if current.structure is None or len(current.structure) < 2:
                    continue

                follower = page.get_next_block(current)
                if (
                    follower is None
                    or follower.block_type not in self.block_types
                    or follower.structure is None
                    or follower.ignore_for_output
                ):
                    continue

                cur_poly = current.polygon
                nxt_poly = follower.polygon
                cur_width = cur_poly.width

                right_edges_match = abs(nxt_poly.x_end - cur_poly.x_end) < self.x_end_tolerance * cur_width
                left_edges_match = abs(nxt_poly.x_start - cur_poly.x_start) < self.x_start_tolerance * cur_width
                shifted_right = nxt_poly.x_start > cur_poly.x_start + (self.min_x_indent * cur_width)
                starts_below = nxt_poly.y_start > cur_poly.y_end
                indented = shifted_right and starts_below

                if current.blockquote:
                    # The quote carries on when the follower has the same
                    # horizontal extent, or nests one level deeper when indented.
                    follower.blockquote = (right_edges_match and left_edges_match) or indented
                    follower.blockquote_level = current.blockquote_level + (1 if indented else 0)
                elif indented and len(follower.structure) >= 2:
                    # An indented follower after a non-quote block opens a new quote.
                    follower.blockquote = True
                    follower.blockquote_level = 1
10 changes: 5 additions & 5 deletions marker/processors/debug.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,7 @@ def __call__(self, document: Document):
print(f"Dumped block debug data to {self.debug_data_folder}")

def draw_pdf_debug_images(self, document: Document):
for idx, page in enumerate(document.pages):
for page in document.pages:
png_image = page.highres_image.copy()

line_bboxes = []
Expand All @@ -87,12 +87,12 @@ def draw_pdf_debug_images(self, document: Document):

png_image = self.render_layout_boxes(page, png_image)

debug_file = os.path.join(self.debug_folder, f"pdf_page_{idx}.png")
debug_file = os.path.join(self.debug_folder, f"pdf_page_{page.page_id}.png")
png_image.save(debug_file)


def draw_layout_debug_images(self, document: Document, pdf_mode=False):
for idx, page in enumerate(document.pages):
for page in document.pages:
img_size = page.highres_image.size
png_image = Image.new("RGB", img_size, color="white")

Expand All @@ -110,7 +110,7 @@ def draw_layout_debug_images(self, document: Document, pdf_mode=False):

png_image = self.render_layout_boxes(page, png_image)

debug_file = os.path.join(self.debug_folder, f"layout_page_{idx}.png")
debug_file = os.path.join(self.debug_folder, f"layout_page_{page.page_id}.png")
png_image.save(debug_file)


Expand Down Expand Up @@ -143,7 +143,7 @@ def render_layout_boxes(self, page, png_image):
def dump_block_debug_data(self, document: Document):
debug_file = os.path.join(self.debug_folder, f"blocks.json")
debug_data = []
for idx, page in enumerate(document.pages):
for page in document.pages:
page_data = page.model_dump(exclude=["lowres_image", "highres_image"])
debug_data.append(page_data)

Expand Down
6 changes: 3 additions & 3 deletions marker/processors/equation.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ class EquationProcessor(BaseProcessor):
"""
block_types = (BlockTypes.Equation, )
model_max_length = 384
batch_size = None
texify_batch_size = None
token_buffer = 256

def __init__(self, texify_model: GenerateVisionEncoderDecoderModel, config=None):
Expand Down Expand Up @@ -68,8 +68,8 @@ def __call__(self, document: Document):
block.latex = prediction

def get_batch_size(self):
if self.batch_size is not None:
return self.batch_size
if self.texify_batch_size is not None:
return self.texify_batch_size
elif settings.TORCH_DEVICE_MODEL == "cuda":
return 6
elif settings.TORCH_DEVICE_MODEL == "mps":
Expand Down
90 changes: 90 additions & 0 deletions marker/processors/list.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
from typing import List

from marker.processors import BaseProcessor
from marker.schema import BlockTypes
from marker.schema.blocks import ListItem
from marker.schema.document import Document


class ListProcessor(BaseProcessor):
    """
    A processor for merging lists across pages and columns.

    Two steps are run in order: list groups are flagged when the following
    list group looks like their continuation, then the items inside each
    group are given indent levels and nested under their parent items.
    """
    block_types = (BlockTypes.ListGroup,)
    ignored_block_types = (BlockTypes.PageHeader, BlockTypes.PageFooter)
    min_x_indent = 0.01  # multiplied by the page width to get the indent threshold

    def __init__(self, config):
        super().__init__(config)

    def __call__(self, document: Document):
        self.list_group_continuation(document)
        self.list_group_indentation(document)

    def list_group_continuation(self, document: Document):
        """Set ``has_continuation`` on list groups whose successor continues them."""
        for page in document.pages:
            for group in page.contained_blocks(document, self.block_types):
                successor = document.get_next_block(group, self.ignored_block_types)
                if (
                    successor is None
                    or successor.block_type not in self.block_types
                    or successor.structure is None
                    or successor.ignore_for_output
                ):
                    continue

                if successor.page_id == group.page_id:
                    # Same page: a successor that starts at or above the end of
                    # this group is treated as a column break.
                    continues = successor.polygon.y_start <= group.polygon.y_end
                else:
                    # Different page: the successor must start in the top-left
                    # quadrant of its own page.
                    successor_page = document.get_page(successor.page_id)
                    continues = (
                        successor.polygon.x_start < successor_page.polygon.width // 2
                        and successor.polygon.y_start < successor_page.polygon.height // 2
                    )

                group.has_continuation = continues

    def list_group_indentation(self, document: Document):
        """Assign indent levels to list items, then nest them under their parents."""
        for page in document.pages:
            for group in page.contained_blocks(document, self.block_types):
                if group.structure is None or group.ignore_for_output:
                    continue

                self._assign_indent_levels(page, group)
                self._nest_items(page, group)

    def _assign_indent_levels(self, page, group):
        # First pass: derive each item's list_indent_level from x/y positions.
        x_threshold = self.min_x_indent * page.polygon.width

        ancestors: List[ListItem] = [group.get_next_block(page, None)]
        for item_id in group.structure:
            item = page.get_block(item_id)
            item_x = item.polygon.x_start

            # Drop candidates that this item is not indented relative to.
            while ancestors and item_x <= ancestors[-1].polygon.x_start + x_threshold:
                ancestors.pop()

            if ancestors and item.polygon.y_start > ancestors[-1].polygon.y_start:
                parent = ancestors[-1]
                item.list_indent_level = parent.list_indent_level
                if item_x > parent.polygon.x_start + x_threshold:
                    item.list_indent_level += 1

            upcoming = group.get_next_block(page, item)
            if upcoming is not None and upcoming.polygon.x_start > item.polygon.x_end:
                ancestors = [upcoming]  # reset stack on column breaks
            else:
                ancestors.append(item)

    def _nest_items(self, page, group):
        # Second pass: move each item under the nearest preceding item that
        # has a strictly smaller indent level.
        ancestors: List[ListItem] = [group.get_next_block(page, None)]
        # Iterate over a snapshot because items are removed from the group's
        # structure inside the loop.
        for item_id in list(group.structure):
            item = page.get_block(item_id)

            while ancestors and item.list_indent_level <= ancestors[-1].list_indent_level:
                ancestors.pop()

            if ancestors:
                parent = ancestors[-1]
                parent.add_structure(item)
                parent.polygon = parent.polygon.merge([item.polygon])
                group.remove_structure_items([item_id])

            ancestors.append(item)
2 changes: 1 addition & 1 deletion marker/processors/page_header.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ class PageHeaderProcessor(BaseProcessor):
"""
A processor for moving PageHeaders to the top
"""
block_types = (BlockTypes.PageHeader, BlockTypes.PageFooter)
block_types = (BlockTypes.PageHeader)

def __call__(self, document: Document):
for page in document.pages:
Expand Down
Loading

0 comments on commit f446e56

Please sign in to comment.