Skip to content

Commit

Permalink
Force utf8
Browse files: browse the repository at this point in the history
  • Loading branch information
VikParuchuri committed Nov 30, 2024
1 parent 93549e5 commit 4f75544
Show file tree
Hide file tree
Showing 9 changed files with 19 additions and 7 deletions.
4 changes: 3 additions & 1 deletion benchmarks/overall.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,8 @@
import subprocess
import shutil
from tabulate import tabulate

from marker.settings import settings
from scoring import score_text

configure_logging()
Expand Down Expand Up @@ -53,7 +55,7 @@ def main(in_folder: str, reference_folder: str, out_file: str, nougat: bool, md_
md_filename = fname.rsplit(".", 1)[0] + ".md"

reference_filename = os.path.join(reference_folder, md_filename)
with open(reference_filename, "r", encoding="utf-8") as f:
with open(reference_filename, "r", encoding=settings.OUTPUT_ENCODING) as f:
reference = f.read()

pdf_filename = os.path.join(in_folder, fname)
Expand Down
Binary file modified data/images/overall.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file modified data/images/per_doc.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
4 changes: 4 additions & 0 deletions marker/config/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ def common_options(fn):
help="Path to JSON file with additional configuration.")(fn)
fn = click.option("--languages", type=str, default=None, help="Comma separated list of languages to use for OCR.")(fn)
fn = click.option("--disable_multiprocessing", is_flag=True, default=False, help="Disable multiprocessing.")(fn)
fn = click.option("--paginate_output", is_flag=True, default=False, help="Paginate output.")(fn)
return fn

def generate_config_dict(self) -> Dict[str, any]:
Expand Down Expand Up @@ -61,6 +62,9 @@ def generate_config_dict(self) -> Dict[str, any]:
case "disable_multiprocessing":
if v:
config["pdftext_workers"] = 1
case "paginate_output":
if v:
config["paginate_output"] = True
return config

def get_renderer(self):
Expand Down
5 changes: 3 additions & 2 deletions marker/output.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
from marker.renderers.html import HTMLOutput
from marker.renderers.json import JSONOutput
from marker.renderers.markdown import MarkdownOutput
from marker.settings import settings


def output_exists(output_dir: str, fname_base: str):
Expand All @@ -30,9 +31,9 @@ def text_from_rendered(rendered: BaseModel):
def save_output(rendered: BaseModel, output_dir: str, fname_base: str):
text, ext, images = text_from_rendered(rendered)

with open(os.path.join(output_dir, f"{fname_base}.{ext}"), "w+") as f:
with open(os.path.join(output_dir, f"{fname_base}.{ext}"), "w+", encoding=settings.OUTPUT_ENCODING) as f:
f.write(text)
with open(os.path.join(output_dir, f"{fname_base}_meta.json"), "w+") as f:
with open(os.path.join(output_dir, f"{fname_base}_meta.json"), "w+", encoding=settings.OUTPUT_ENCODING) as f:
f.write(json.dumps(rendered.metadata, indent=2))

for img_name, img in images.items():
Expand Down
3 changes: 2 additions & 1 deletion marker/renderers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@

from marker.schema import BlockTypes
from marker.schema.blocks.base import BlockOutput, BlockId
from marker.settings import settings
from marker.util import assign_config


Expand All @@ -33,7 +34,7 @@ def extract_image(document, image_id, to_base64=False):
if to_base64:
image_buffer = io.BytesIO()
cropped.save(image_buffer, format='PNG')
cropped = base64.b64encode(image_buffer.getvalue()).decode('utf-8')
cropped = base64.b64encode(image_buffer.getvalue()).decode(settings.OUTPUT_ENCODING)
return cropped

@staticmethod
Expand Down
3 changes: 3 additions & 0 deletions marker/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,9 @@ class Settings(BaseSettings):
FONT_DIR: str = os.path.join(BASE_DIR, "static", "fonts")
DEBUG_DATA_FOLDER: str = os.path.join(BASE_DIR, "debug_data")

# General
OUTPUT_ENCODING: str = "utf-8"

# General models
TORCH_DEVICE: Optional[str] = None # Note: MPS device does not work for text detection, and will default to CPU

Expand Down
2 changes: 1 addition & 1 deletion marker_app.py
Original file line number Diff line number Diff line change
Expand Up @@ -122,7 +122,7 @@ def page_count(pdf_file):
st.stop()

# Run Marker
with tempfile.NamedTemporaryFile(suffix=".pdf") as temp_pdf:
with tempfile.NamedTemporaryFile(suffix=".pdf", encoding=settings.OUTPUT_ENCODING) as temp_pdf:
temp_pdf.write(in_file.getvalue())
temp_pdf.seek(0)
filename = temp_pdf.name
Expand Down
5 changes: 3 additions & 2 deletions marker_server.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
from fastapi import FastAPI, Form, File, UploadFile
from marker.converters.pdf import PdfConverter
from marker.models import create_model_dict
from marker.settings import settings

app_data = {}

Expand Down Expand Up @@ -110,7 +111,7 @@ async def _convert_pdf(params: CommonParams):
for k, v in images.items():
byte_stream = io.BytesIO()
v.save(byte_stream, format="PNG")
encoded[k] = base64.b64encode(byte_stream.getvalue()).decode("utf-8")
encoded[k] = base64.b64encode(byte_stream.getvalue()).decode(settings.OUTPUT_ENCODING)

return {
"format": params.output_format,
Expand Down Expand Up @@ -140,7 +141,7 @@ async def convert_pdf_upload(
),
):
upload_path = os.path.join(UPLOAD_DIRECTORY, file.filename)
with open(upload_path, "wb") as upload_file:
with open(upload_path, "wb", encoding=settings.OUTPUT_ENCODING) as upload_file:
file_contents = await file.read()
upload_file.write(file_contents)

Expand Down

0 comments on commit 4f75544

Please sign in to comment.