diff --git a/marker/output.py b/marker/output.py index ce209af..2a6ee12 100644 --- a/marker/output.py +++ b/marker/output.py @@ -38,4 +38,4 @@ def save_output(rendered: BaseModel, output_dir: str, fname_base: str): f.write(json.dumps(rendered.metadata, indent=2)) for img_name, img in images.items(): - img.save(os.path.join(output_dir, img_name), "PNG", optimize=False, compress_level=3) + img.save(os.path.join(output_dir, img_name), settings.OUTPUT_IMAGE_FORMAT) diff --git a/marker/renderers/__init__.py b/marker/renderers/__init__.py index c8bf79d..7372f3e 100644 --- a/marker/renderers/__init__.py +++ b/marker/renderers/__init__.py @@ -35,7 +35,7 @@ def extract_image(document: Document, image_id, to_base64=False): cropped = page_img.crop(image_box.bbox) if to_base64: image_buffer = io.BytesIO() - cropped.save(image_buffer, format='PNG') + cropped.save(image_buffer, format=settings.OUTPUT_IMAGE_FORMAT) cropped = base64.b64encode(image_buffer.getvalue()).decode(settings.OUTPUT_ENCODING) return cropped diff --git a/marker/renderers/html.py b/marker/renderers/html.py index 29ca6be..eb3d5f0 100644 --- a/marker/renderers/html.py +++ b/marker/renderers/html.py @@ -10,6 +10,9 @@ # Ignore beautifulsoup warnings import warnings + +from marker.settings import settings + warnings.filterwarnings("ignore", category=MarkupResemblesLocatorWarning) @@ -53,7 +56,7 @@ def extract_html(self, document, document_output, level=0): elif ref_block_id.block_type in self.image_blocks: if self.extract_images: image = self.extract_image(document, ref_block_id) - image_name = f"{ref_block_id.to_path()}.png" + image_name = f"{ref_block_id.to_path()}.{settings.OUTPUT_IMAGE_FORMAT.lower()}" images[image_name] = image ref.replace_with(BeautifulSoup(f"
", 'html.parser')) else: diff --git a/marker/settings.py b/marker/settings.py index 0a51056..8dc7a9a 100644 --- a/marker/settings.py +++ b/marker/settings.py @@ -16,6 +16,7 @@ class Settings(BaseSettings): # General OUTPUT_ENCODING: str = "utf-8" + OUTPUT_IMAGE_FORMAT: str = "JPEG" # General models TORCH_DEVICE: Optional[str] = None # Note: MPS device does not work for text detection, and will default to CPU diff --git a/marker_app.py b/marker_app.py index 208ddbe..762644b 100644 --- a/marker_app.py +++ b/marker_app.py @@ -44,10 +44,10 @@ def open_pdf(pdf_file): def img_to_html(img, img_alt): img_bytes = io.BytesIO() - img.save(img_bytes, format="PNG") + img.save(img_bytes, format=settings.OUTPUT_IMAGE_FORMAT) img_bytes = img_bytes.getvalue() encoded = base64.b64encode(img_bytes).decode() - img_html = f'' + img_html = f'' return img_html diff --git a/marker_server.py b/marker_server.py index 2f8bbe2..8092b3b 100644 --- a/marker_server.py +++ b/marker_server.py @@ -110,7 +110,7 @@ async def _convert_pdf(params: CommonParams): encoded = {} for k, v in images.items(): byte_stream = io.BytesIO() - v.save(byte_stream, format="PNG") + v.save(byte_stream, format=settings.OUTPUT_IMAGE_FORMAT) encoded[k] = base64.b64encode(byte_stream.getvalue()).decode(settings.OUTPUT_ENCODING) return { diff --git a/pyproject.toml b/pyproject.toml index 830d0d7..be4ef96 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "marker-pdf" -version = "1.0.1" +version = "1.0.2" description = "Convert PDF to markdown with high speed and accuracy." authors = ["Vik Paruchuri