Skip to content

Commit

Permalink
Merge pull request #397 from VikParuchuri/vik_fixes
Browse files: browse the repository at this point in the history
Misc fixes
  • Loading branch information
VikParuchuri authored Nov 30, 2024
2 parents 1047d9c + c086efe commit 7f869ee
Show file tree
Hide file tree
Showing 11 changed files with 37 additions and 20 deletions.
4 changes: 3 additions & 1 deletion benchmarks/overall.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,8 @@
import subprocess
import shutil
from tabulate import tabulate

from marker.settings import settings
from scoring import score_text

configure_logging()
Expand Down Expand Up @@ -53,7 +55,7 @@ def main(in_folder: str, reference_folder: str, out_file: str, nougat: bool, md_
md_filename = fname.rsplit(".", 1)[0] + ".md"

reference_filename = os.path.join(reference_folder, md_filename)
with open(reference_filename, "r", encoding="utf-8") as f:
with open(reference_filename, "r") as f:
reference = f.read()

pdf_filename = os.path.join(in_folder, fname)
Expand Down
Binary file modified data/images/overall.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file modified data/images/per_doc.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
4 changes: 4 additions & 0 deletions marker/config/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ def common_options(fn):
help="Path to JSON file with additional configuration.")(fn)
fn = click.option("--languages", type=str, default=None, help="Comma separated list of languages to use for OCR.")(fn)
fn = click.option("--disable_multiprocessing", is_flag=True, default=False, help="Disable multiprocessing.")(fn)
fn = click.option("--paginate_output", is_flag=True, default=False, help="Paginate output.")(fn)
return fn

def generate_config_dict(self) -> Dict[str, any]:
Expand Down Expand Up @@ -61,6 +62,9 @@ def generate_config_dict(self) -> Dict[str, any]:
case "disable_multiprocessing":
if v:
config["pdftext_workers"] = 1
case "paginate_output":
if v:
config["paginate_output"] = True
return config

def get_renderer(self):
Expand Down
5 changes: 3 additions & 2 deletions marker/output.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
from marker.renderers.html import HTMLOutput
from marker.renderers.json import JSONOutput
from marker.renderers.markdown import MarkdownOutput
from marker.settings import settings


def output_exists(output_dir: str, fname_base: str):
Expand All @@ -30,9 +31,9 @@ def text_from_rendered(rendered: BaseModel):
def save_output(rendered: BaseModel, output_dir: str, fname_base: str):
text, ext, images = text_from_rendered(rendered)

with open(os.path.join(output_dir, f"{fname_base}.{ext}"), "w+") as f:
with open(os.path.join(output_dir, f"{fname_base}.{ext}"), "w+", encoding=settings.OUTPUT_ENCODING) as f:
f.write(text)
with open(os.path.join(output_dir, f"{fname_base}_meta.json"), "w+") as f:
with open(os.path.join(output_dir, f"{fname_base}_meta.json"), "w+", encoding=settings.OUTPUT_ENCODING) as f:
f.write(json.dumps(rendered.metadata, indent=2))

for img_name, img in images.items():
Expand Down
6 changes: 3 additions & 3 deletions marker/processors/equation.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ class EquationProcessor(BaseProcessor):
"""
block_types = (BlockTypes.Equation, )
model_max_length = 384
batch_size = None
texify_batch_size = None
token_buffer = 256

def __init__(self, texify_model: GenerateVisionEncoderDecoderModel, config=None):
Expand Down Expand Up @@ -68,8 +68,8 @@ def __call__(self, document: Document):
block.latex = prediction

def get_batch_size(self):
if self.batch_size is not None:
return self.batch_size
if self.texify_batch_size is not None:
return self.texify_batch_size
elif settings.TORCH_DEVICE_MODEL == "cuda":
return 6
elif settings.TORCH_DEVICE_MODEL == "mps":
Expand Down
3 changes: 2 additions & 1 deletion marker/renderers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
from marker.schema import BlockTypes
from marker.schema.blocks.base import BlockId, BlockOutput
from marker.schema.document import Document
from marker.settings import settings
from marker.util import assign_config


Expand All @@ -35,7 +36,7 @@ def extract_image(document: Document, image_id, to_base64=False):
if to_base64:
image_buffer = io.BytesIO()
cropped.save(image_buffer, format='PNG')
cropped = base64.b64encode(image_buffer.getvalue()).decode('utf-8')
cropped = base64.b64encode(image_buffer.getvalue()).decode(settings.OUTPUT_ENCODING)
return cropped

@staticmethod
Expand Down
3 changes: 3 additions & 0 deletions marker/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,9 @@ class Settings(BaseSettings):
FONT_DIR: str = os.path.join(BASE_DIR, "static", "fonts")
DEBUG_DATA_FOLDER: str = os.path.join(BASE_DIR, "debug_data")

# General
OUTPUT_ENCODING: str = "utf-8"

# General models
TORCH_DEVICE: Optional[str] = None # Note: MPS device does not work for text detection, and will default to CPU

Expand Down
25 changes: 15 additions & 10 deletions marker_app.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,8 +25,7 @@ def load_models():
return create_model_dict()


def convert_pdf(fname: str, **kwargs) -> (str, Dict[str, Any], dict):
config_parser = ConfigParser(kwargs)
def convert_pdf(fname: str, config_parser: ConfigParser) -> (str, Dict[str, Any], dict):
config_dict = config_parser.generate_config_dict()
config_dict["pdftext_workers"] = 1
converter = PdfConverter(
Expand Down Expand Up @@ -122,18 +121,24 @@ def page_count(pdf_file):
st.stop()

# Run Marker
with tempfile.NamedTemporaryFile(suffix=".pdf") as temp_pdf:
with tempfile.NamedTemporaryFile(suffix=".pdf", mode="wb+") as temp_pdf:
temp_pdf.write(in_file.getvalue())
temp_pdf.seek(0)
filename = temp_pdf.name
cli_options = {
"output_format": output_format,
"page_range": page_range,
"force_ocr": force_ocr,
"debug": debug,
"output_dir": settings.DEBUG_DATA_FOLDER if debug else None,
}
config_parser = ConfigParser(cli_options)
rendered = convert_pdf(
filename,
page_range=page_range,
force_ocr=force_ocr,
output_format=output_format,
output_dir=settings.DEBUG_DATA_FOLDER if debug else None,
debug=debug
config_parser
)
page_range = config_parser.generate_config_dict()["page_range"]
first_page = page_range[0] if page_range else 0

text, ext, images = text_from_rendered(rendered)
with col2:
Expand All @@ -149,10 +154,10 @@ def page_count(pdf_file):
with col1:
debug_data_path = rendered.metadata.get("debug_data_path")
if debug_data_path:
pdf_image_path = os.path.join(debug_data_path, f"pdf_page_0.png")
pdf_image_path = os.path.join(debug_data_path, f"pdf_page_{first_page}.png")
img = Image.open(pdf_image_path)
st.image(img, caption="PDF debug image", use_container_width=True)
layout_image_path = os.path.join(debug_data_path, f"layout_page_0.png")
layout_image_path = os.path.join(debug_data_path, f"layout_page_{first_page}.png")
img = Image.open(layout_image_path)
st.image(img, caption="Layout debug image", use_container_width=True)

5 changes: 3 additions & 2 deletions marker_server.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
from fastapi import FastAPI, Form, File, UploadFile
from marker.converters.pdf import PdfConverter
from marker.models import create_model_dict
from marker.settings import settings

app_data = {}

Expand Down Expand Up @@ -110,7 +111,7 @@ async def _convert_pdf(params: CommonParams):
for k, v in images.items():
byte_stream = io.BytesIO()
v.save(byte_stream, format="PNG")
encoded[k] = base64.b64encode(byte_stream.getvalue()).decode("utf-8")
encoded[k] = base64.b64encode(byte_stream.getvalue()).decode(settings.OUTPUT_ENCODING)

return {
"format": params.output_format,
Expand Down Expand Up @@ -140,7 +141,7 @@ async def convert_pdf_upload(
),
):
upload_path = os.path.join(UPLOAD_DIRECTORY, file.filename)
with open(upload_path, "wb") as upload_file:
with open(upload_path, "wb+") as upload_file:
file_contents = await file.read()
upload_file.write(file_contents)

Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "marker-pdf"
version = "1.0.0"
version = "1.0.1"
description = "Convert PDF to markdown with high speed and accuracy."
authors = ["Vik Paruchuri <[email protected]>"]
readme = "README.md"
Expand Down

0 comments on commit 7f869ee

Please sign in to comment.