Skip to content

Commit

Permalink
Merge pull request #392 from VikParuchuri/v2
Browse files Browse the repository at this point in the history
Marker v2
  • Loading branch information
VikParuchuri authored Nov 26, 2024
2 parents c198419 + c9ea515 commit c78f4af
Show file tree
Hide file tree
Showing 215 changed files with 184,036 additions and 21,310 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ env:
OCR_ENGINE: "surya"

jobs:
build:
benchmark:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v3
Expand All @@ -28,7 +28,7 @@ jobs:
- name: Run benchmark test
run: |
poetry run python benchmarks/overall.py benchmark_data/pdfs benchmark_data/references report.json
poetry run python scripts/verify_benchmark_scores.py report.json --type marker
poetry run python benchmarks/verify_scores.py report.json --type marker
25 changes: 25 additions & 0 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
name: CI tests

on: [push]

env:
TORCH_DEVICE: "cpu"
OCR_ENGINE: "surya"

jobs:
tests:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v3
- name: Set up Python 3.11
uses: actions/setup-python@v4
with:
python-version: 3.11
- name: Install python dependencies
run: |
pip install poetry
poetry install
- name: Run tests
env:
HF_TOKEN: ${{ secrets.HF_TOKEN }}
run: poetry run pytest
5 changes: 5 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,9 @@ wandb
report.json
benchmark_data
debug_data
temp.md
temp
conversion_results

# Byte-compiled / optimized / DLL files
__pycache__/
Expand Down Expand Up @@ -170,3 +173,5 @@ cython_debug/
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
.idea/

.vscode/
311 changes: 186 additions & 125 deletions README.md

Large diffs are not rendered by default.

89 changes: 31 additions & 58 deletions benchmarks/overall.py
Original file line number Diff line number Diff line change
@@ -1,42 +1,26 @@
import argparse
import tempfile
import time
from collections import defaultdict

import click
from tqdm import tqdm
import pypdfium2 as pdfium

from marker.convert import convert_single_pdf
from marker.config.parser import ConfigParser
from marker.converters.pdf import PdfConverter
from marker.logger import configure_logging
from marker.models import load_all_models
from marker.benchmark.scoring import score_text
from marker.pdf.extract_text import naive_get_text
from marker.models import create_model_dict
from pdftext.extraction import plain_text_output
import json
import os
import subprocess
import shutil
from tabulate import tabulate
import torch
from scoring import score_text

configure_logging()


def start_memory_profiling():
    """Begin recording CUDA memory history via torch's allocator hooks.

    The recorder is capped at 100000 entries (passed as ``max_entries``).
    """
    history_limit = 100000
    torch.cuda.memory._record_memory_history(max_entries=history_limit)


def stop_memory_profiling(memory_file):
    """Dump the recorded CUDA memory history to ``memory_file``, then stop recording.

    A failed dump is logged instead of raised, so recording is still
    switched off afterwards.

    Args:
        memory_file: Destination path handed to ``_dump_snapshot``.
    """
    # No ``logger`` name is defined at module level, so the original error
    # path raised NameError (masking the real failure and skipping the
    # stop call below). Resolve a logger locally instead.
    import logging

    try:
        torch.cuda.memory._dump_snapshot(memory_file)
    except Exception as e:
        # Best-effort: a failed snapshot should not abort the benchmark run.
        logging.getLogger(__name__).error(f"Failed to capture memory snapshot {e}")

    # Stop recording memory snapshot history.
    torch.cuda.memory._record_memory_history(enabled=None)


def nougat_prediction(pdf_filename, batch_size=1):
out_dir = tempfile.mkdtemp()
subprocess.run(["nougat", pdf_filename, "-o", out_dir, "--no-skipping", "--recompute", "--batchsize", str(batch_size)], check=True)
Expand All @@ -46,62 +30,51 @@ def nougat_prediction(pdf_filename, batch_size=1):
shutil.rmtree(out_dir)
return data


def main():
parser = argparse.ArgumentParser(description="Benchmark PDF to MD conversion. Needs source pdfs, and a reference folder with the correct markdown.")
parser.add_argument("in_folder", help="Input PDF files")
parser.add_argument("reference_folder", help="Reference folder with reference markdown files")
parser.add_argument("out_file", help="Output filename")
parser.add_argument("--nougat", action="store_true", help="Run nougat and compare", default=False)
# Nougat batch size 1 uses about as much VRAM as default marker settings
parser.add_argument("--marker_batch_multiplier", type=int, default=1, help="Batch size multiplier to use for marker when making predictions.")
parser.add_argument("--nougat_batch_size", type=int, default=1, help="Batch size to use for nougat when making predictions.")
parser.add_argument("--md_out_path", type=str, default=None, help="Output path for generated markdown files")
parser.add_argument("--profile_memory", action="store_true", help="Profile memory usage", default=False)

args = parser.parse_args()

@click.command(help="Benchmark PDF to MD conversion.")
@click.argument("in_folder", type=str)
@click.argument("reference_folder", type=str)
@click.argument("out_file", type=str)
@click.option("--nougat", is_flag=True, help="Run nougat and compare")
@click.option("--md_out_path", type=str, default=None, help="Output path for generated markdown files")
def main(in_folder: str, reference_folder: str, out_file: str, nougat: bool, md_out_path: str):
methods = ["marker"]
if args.nougat:
if nougat:
methods.append("nougat")

if args.profile_memory:
start_memory_profiling()

model_lst = load_all_models()

if args.profile_memory:
stop_memory_profiling("model_load.pickle")
model_dict = create_model_dict()

scores = defaultdict(dict)
benchmark_files = os.listdir(args.in_folder)
benchmark_files = os.listdir(in_folder)
benchmark_files = [b for b in benchmark_files if b.endswith(".pdf")]
times = defaultdict(dict)
pages = defaultdict(int)

for idx, fname in tqdm(enumerate(benchmark_files)):
md_filename = fname.rsplit(".", 1)[0] + ".md"

reference_filename = os.path.join(args.reference_folder, md_filename)
reference_filename = os.path.join(reference_folder, md_filename)
with open(reference_filename, "r", encoding="utf-8") as f:
reference = f.read()

pdf_filename = os.path.join(args.in_folder, fname)
pdf_filename = os.path.join(in_folder, fname)
doc = pdfium.PdfDocument(pdf_filename)
pages[fname] = len(doc)

config_parser = ConfigParser({"output_format": "markdown"})
for method in methods:
start = time.time()
if method == "marker":
if args.profile_memory:
start_memory_profiling()
full_text, _, out_meta = convert_single_pdf(pdf_filename, model_lst, batch_multiplier=args.marker_batch_multiplier)
if args.profile_memory:
stop_memory_profiling(f"marker_memory_{idx}.pickle")
converter = PdfConverter(
config=config_parser.generate_config_dict(),
artifact_dict=model_dict,
processor_list=None,
renderer=config_parser.get_renderer()
)
full_text = converter(pdf_filename).markdown
elif method == "nougat":
full_text = nougat_prediction(pdf_filename, batch_size=args.nougat_batch_size)
full_text = nougat_prediction(pdf_filename, batch_size=1)
elif method == "naive":
full_text = naive_get_text(doc)
full_text = plain_text_output(doc, workers=1)
else:
raise ValueError(f"Unknown method {method}")

Expand All @@ -110,13 +83,13 @@ def main():
score = score_text(full_text, reference)
scores[method][fname] = score

if args.md_out_path:
if md_out_path:
md_out_filename = f"{method}_{md_filename}"
with open(os.path.join(args.md_out_path, md_out_filename), "w+") as f:
with open(os.path.join(md_out_path, md_out_filename), "w+") as f:
f.write(full_text)

total_pages = sum(pages.values())
with open(args.out_file, "w+") as f:
with open(out_file, "w+") as f:
write_data = defaultdict(dict)
for method in methods:
total_time = sum(times[method].values())
Expand Down
4 changes: 0 additions & 4 deletions marker/benchmark/scoring.py → benchmarks/scoring.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,4 @@
import math

from rapidfuzz import fuzz
import re
import regex
from statistics import mean

CHUNK_MIN_CHARS = 25
Expand Down
File renamed without changes.
2 changes: 0 additions & 2 deletions chunk_convert.sh
Original file line number Diff line number Diff line change
Expand Up @@ -36,8 +36,6 @@ for (( i=0; i<$NUM_DEVICES; i++ )); do
export NUM_WORKERS
echo "Running convert.py on GPU $DEVICE_NUM"
cmd="CUDA_VISIBLE_DEVICES=$DEVICE_NUM marker $INPUT_FOLDER $OUTPUT_FOLDER --num_chunks $NUM_DEVICES --chunk_idx $DEVICE_NUM --workers $NUM_WORKERS"
[[ -n "$METADATA_FILE" ]] && cmd="$cmd --metadata_file $METADATA_FILE"
[[ -n "$MIN_LENGTH" ]] && cmd="$cmd --min_length $MIN_LENGTH"
eval $cmd &

sleep 5
Expand Down
Loading

0 comments on commit c78f4af

Please sign in to comment.