Skip to content

Commit

Permalink
integrate extract_images into json output as well
Browse files (browse the repository at this point in the history)
  • Loading branch information
iammosespaulr committed Nov 29, 2024
1 parent a1065cb commit 9602cb4
Show file tree
Hide file tree
Showing 2 changed files with 5 additions and 6 deletions.
10 changes: 5 additions & 5 deletions marker/renderers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,14 +8,15 @@
from pydantic import BaseModel

from marker.schema import BlockTypes
from marker.schema.blocks.base import BlockId
from marker.schema.blocks.base import BlockId, BlockOutput
from marker.schema.document import Document
from marker.util import assign_config


class BaseRenderer:
remove_blocks: list = [BlockTypes.PageHeader, BlockTypes.PageFooter]
image_blocks: list = [BlockTypes.Picture, BlockTypes.Figure]
extract_images: bool = True

def __init__(self, config: Optional[BaseModel | dict] = None):
assign_config(self, config)
Expand Down Expand Up @@ -76,7 +77,7 @@ def generate_document_metadata(self, document: Document, document_output):

return metadata

def extract_block_html(self, document: Document, block_output):
def extract_block_html(self, document: Document, block_output: BlockOutput):
soup = BeautifulSoup(block_output.html, 'html.parser')

content_refs = soup.find_all('content-ref')
Expand All @@ -92,14 +93,13 @@ def extract_block_html(self, document: Document, block_output):
ref_block_id: BlockId = item.id
break

if ref_block_id.block_type in self.image_blocks:
if ref_block_id.block_type in self.image_blocks and self.extract_images:
images[ref_block_id] = self.extract_image(document, ref_block_id, to_base64=True)
else:
images.update(sub_images)
ref.replace_with(BeautifulSoup(content, 'html.parser'))

if block_output.id.block_type in self.image_blocks:
if block_output.id.block_type in self.image_blocks and self.extract_images:
images[block_output.id] = self.extract_image(document, block_output.id, to_base64=True)

return str(soup), images

1 change: 0 additions & 1 deletion marker/renderers/html.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,6 @@ class HTMLOutput(BaseModel):
class HTMLRenderer(BaseRenderer):
page_blocks: list = [BlockTypes.Page]
paginate_output: bool = False
extract_images: bool = True

def extract_image(self, document, image_id):
image_block = document.get_block(image_id)
Expand Down

0 comments on commit 9602cb4

Please sign in to comment.