# Patch: route text-file encodings through settings.OUTPUT_ENCODING and add
# the --paginate_output CLI flag.
#
# Review fixes relative to the earlier revision of this patch:
#   * marker_app.py: the change that passed encoding= to
#     tempfile.NamedTemporaryFile(suffix=".pdf") was dropped. That call uses
#     the default binary mode ("w+b"), and binary mode rejects an encoding
#     argument with ValueError.
#   * marker_server.py: the change that passed encoding= to
#     open(upload_path, "wb") was dropped for the same reason. The post-image
#     hash on that file's index line was not recomputed.
#
# NOTE(review): base64 output is pure ASCII, so decoding it with the
# configurable OUTPUT_ENCODING only works while that setting stays
# ASCII-compatible -- confirm this is intended.
# NOTE(review): leading whitespace and blank context lines were rebuilt from
# the hunk line counts; verify against the target tree before applying.
diff --git a/benchmarks/overall.py b/benchmarks/overall.py
index 6564b256..e9e5ea0e 100644
--- a/benchmarks/overall.py
+++ b/benchmarks/overall.py
@@ -16,6 +16,8 @@
 import subprocess
 import shutil
 from tabulate import tabulate
+
+from marker.settings import settings
 from scoring import score_text
 
 configure_logging()
@@ -53,7 +55,7 @@ def main(in_folder: str, reference_folder: str, out_file: str, nougat: bool, md_
         md_filename = fname.rsplit(".", 1)[0] + ".md"
 
         reference_filename = os.path.join(reference_folder, md_filename)
-        with open(reference_filename, "r", encoding="utf-8") as f:
+        with open(reference_filename, "r", encoding=settings.OUTPUT_ENCODING) as f:
             reference = f.read()
 
         pdf_filename = os.path.join(in_folder, fname)
diff --git a/data/images/overall.png b/data/images/overall.png
index 0b7f5318..0946421a 100644
Binary files a/data/images/overall.png and b/data/images/overall.png differ
diff --git a/data/images/per_doc.png b/data/images/per_doc.png
index 6c864a57..ed26cfb9 100644
Binary files a/data/images/per_doc.png and b/data/images/per_doc.png differ
diff --git a/marker/config/parser.py b/marker/config/parser.py
index 11601ac7..2aa56930 100644
--- a/marker/config/parser.py
+++ b/marker/config/parser.py
@@ -32,6 +32,7 @@ def common_options(fn):
                           help="Path to JSON file with additional configuration.")(fn)
         fn = click.option("--languages", type=str, default=None, help="Comma separated list of languages to use for OCR.")(fn)
         fn = click.option("--disable_multiprocessing", is_flag=True, default=False, help="Disable multiprocessing.")(fn)
+        fn = click.option("--paginate_output", is_flag=True, default=False, help="Paginate output.")(fn)
         return fn
 
     def generate_config_dict(self) -> Dict[str, any]:
@@ -61,6 +62,9 @@ def generate_config_dict(self) -> Dict[str, any]:
                 case "disable_multiprocessing":
                     if v:
                         config["pdftext_workers"] = 1
+                case "paginate_output":
+                    if v:
+                        config["paginate_output"] = True
         return config
 
     def get_renderer(self):
diff --git a/marker/output.py b/marker/output.py
index e47c861f..ca24400f 100644
--- a/marker/output.py
+++ b/marker/output.py
@@ -6,6 +6,7 @@
 from marker.renderers.html import HTMLOutput
 from marker.renderers.json import JSONOutput
 from marker.renderers.markdown import MarkdownOutput
+from marker.settings import settings
 
 
 def output_exists(output_dir: str, fname_base: str):
@@ -30,9 +31,9 @@ def text_from_rendered(rendered: BaseModel):
 def save_output(rendered: BaseModel, output_dir: str, fname_base: str):
     text, ext, images = text_from_rendered(rendered)
 
-    with open(os.path.join(output_dir, f"{fname_base}.{ext}"), "w+") as f:
+    with open(os.path.join(output_dir, f"{fname_base}.{ext}"), "w+", encoding=settings.OUTPUT_ENCODING) as f:
         f.write(text)
-    with open(os.path.join(output_dir, f"{fname_base}_meta.json"), "w+") as f:
+    with open(os.path.join(output_dir, f"{fname_base}_meta.json"), "w+", encoding=settings.OUTPUT_ENCODING) as f:
         f.write(json.dumps(rendered.metadata, indent=2))
 
     for img_name, img in images.items():
diff --git a/marker/renderers/__init__.py b/marker/renderers/__init__.py
index d2358188..f8837409 100644
--- a/marker/renderers/__init__.py
+++ b/marker/renderers/__init__.py
@@ -9,6 +9,7 @@
 
 from marker.schema import BlockTypes
 from marker.schema.blocks.base import BlockOutput, BlockId
+from marker.settings import settings
 from marker.util import assign_config
 
 
@@ -33,7 +34,7 @@ def extract_image(document, image_id, to_base64=False):
         if to_base64:
             image_buffer = io.BytesIO()
             cropped.save(image_buffer, format='PNG')
-            cropped = base64.b64encode(image_buffer.getvalue()).decode('utf-8')
+            cropped = base64.b64encode(image_buffer.getvalue()).decode(settings.OUTPUT_ENCODING)
         return cropped
 
     @staticmethod
diff --git a/marker/settings.py b/marker/settings.py
index 40a739d1..0a510568 100644
--- a/marker/settings.py
+++ b/marker/settings.py
@@ -14,6 +14,9 @@ class Settings(BaseSettings):
     FONT_DIR: str = os.path.join(BASE_DIR, "static", "fonts")
     DEBUG_DATA_FOLDER: str = os.path.join(BASE_DIR, "debug_data")
 
+    # General
+    OUTPUT_ENCODING: str = "utf-8"
+
     # General models
     TORCH_DEVICE: Optional[str] = None # Note: MPS device does not work for text detection, and will default to CPU
 
diff --git a/marker_server.py b/marker_server.py
index aa5a3178..1a4577d7 100644
--- a/marker_server.py
+++ b/marker_server.py
@@ -18,6 +18,7 @@
 from fastapi import FastAPI, Form, File, UploadFile
 from marker.converters.pdf import PdfConverter
 from marker.models import create_model_dict
+from marker.settings import settings
 
 app_data = {}
 
@@ -110,7 +111,7 @@ async def _convert_pdf(params: CommonParams):
     for k, v in images.items():
         byte_stream = io.BytesIO()
         v.save(byte_stream, format="PNG")
-        encoded[k] = base64.b64encode(byte_stream.getvalue()).decode("utf-8")
+        encoded[k] = base64.b64encode(byte_stream.getvalue()).decode(settings.OUTPUT_ENCODING)
 
     return {
         "format": params.output_format,