From 9602cb46347915e34653850c39bdc18be4c0786c Mon Sep 17 00:00:00 2001 From: Moses Paul R Date: Fri, 29 Nov 2024 11:46:06 +0000 Subject: [PATCH] integrate extract_images into json output as well --- marker/renderers/__init__.py | 10 +++++----- marker/renderers/html.py | 1 - 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/marker/renderers/__init__.py b/marker/renderers/__init__.py index 688f7a19..dad32863 100644 --- a/marker/renderers/__init__.py +++ b/marker/renderers/__init__.py @@ -8,7 +8,7 @@ from pydantic import BaseModel from marker.schema import BlockTypes -from marker.schema.blocks.base import BlockId +from marker.schema.blocks.base import BlockId, BlockOutput from marker.schema.document import Document from marker.util import assign_config @@ -16,6 +16,7 @@ class BaseRenderer: remove_blocks: list = [BlockTypes.PageHeader, BlockTypes.PageFooter] image_blocks: list = [BlockTypes.Picture, BlockTypes.Figure] + extract_images: bool = True def __init__(self, config: Optional[BaseModel | dict] = None): assign_config(self, config) @@ -76,7 +77,7 @@ def generate_document_metadata(self, document: Document, document_output): return metadata - def extract_block_html(self, document: Document, block_output): + def extract_block_html(self, document: Document, block_output: BlockOutput): soup = BeautifulSoup(block_output.html, 'html.parser') content_refs = soup.find_all('content-ref') @@ -92,14 +93,13 @@ def extract_block_html(self, document: Document, block_output): ref_block_id: BlockId = item.id break - if ref_block_id.block_type in self.image_blocks: + if ref_block_id.block_type in self.image_blocks and self.extract_images: images[ref_block_id] = self.extract_image(document, ref_block_id, to_base64=True) else: images.update(sub_images) ref.replace_with(BeautifulSoup(content, 'html.parser')) - if block_output.id.block_type in self.image_blocks: + if block_output.id.block_type in self.image_blocks and self.extract_images: images[block_output.id] = self.extract_image(document, block_output.id, to_base64=True) return str(soup), images - diff --git a/marker/renderers/html.py b/marker/renderers/html.py index 5ec30b83..d1b32b8f 100644 --- a/marker/renderers/html.py +++ b/marker/renderers/html.py @@ -21,7 +21,6 @@ class HTMLOutput(BaseModel): class HTMLRenderer(BaseRenderer): page_blocks: list = [BlockTypes.Page] paginate_output: bool = False - extract_images: bool = True def extract_image(self, document, image_id): image_block = document.get_block(image_id)