Integrate `extract_images` into JSON output as well

VikParuchuri · Nov 29, 2024 · 9602cb4 · 9602cb4
1 parent a1065cb
commit 9602cb4
Show file tree

Hide file tree

Showing 2 changed files with 5 additions and 6 deletions.
diff --git a/marker/renderers/__init__.py b/marker/renderers/__init__.py
@@ -8,14 +8,15 @@
 from pydantic import BaseModel
 
 from marker.schema import BlockTypes
-from marker.schema.blocks.base import BlockId
+from marker.schema.blocks.base import BlockId, BlockOutput
 from marker.schema.document import Document
 from marker.util import assign_config
 
 
 class BaseRenderer:
     remove_blocks: list = [BlockTypes.PageHeader, BlockTypes.PageFooter]
     image_blocks: list = [BlockTypes.Picture, BlockTypes.Figure]
+    extract_images: bool = True
 
     def __init__(self, config: Optional[BaseModel | dict] = None):
         assign_config(self, config)
@@ -76,7 +77,7 @@ def generate_document_metadata(self, document: Document, document_output):
 
         return metadata
 
-    def extract_block_html(self, document: Document, block_output):
+    def extract_block_html(self, document: Document, block_output: BlockOutput):
         soup = BeautifulSoup(block_output.html, 'html.parser')
 
         content_refs = soup.find_all('content-ref')
@@ -92,14 +93,13 @@ def extract_block_html(self, document: Document, block_output):
                     ref_block_id: BlockId = item.id
                     break
 
-            if ref_block_id.block_type in self.image_blocks:
+            if ref_block_id.block_type in self.image_blocks and self.extract_images:
                 images[ref_block_id] = self.extract_image(document, ref_block_id, to_base64=True)
             else:
                 images.update(sub_images)
                 ref.replace_with(BeautifulSoup(content, 'html.parser'))
 
-        if block_output.id.block_type in self.image_blocks:
+        if block_output.id.block_type in self.image_blocks and self.extract_images:
             images[block_output.id] = self.extract_image(document, block_output.id, to_base64=True)
 
         return str(soup), images
-
diff --git a/marker/renderers/html.py b/marker/renderers/html.py
@@ -21,7 +21,6 @@ class HTMLOutput(BaseModel):
 class HTMLRenderer(BaseRenderer):
     page_blocks: list = [BlockTypes.Page]
     paginate_output: bool = False
-    extract_images: bool = True
 
     def extract_image(self, document, image_id):
         image_block = document.get_block(image_id)