From 59b622437ae7e437e3b464b494b52821c9ed6cac Mon Sep 17 00:00:00 2001 From: Vik Paruchuri Date: Tue, 19 Nov 2024 09:30:35 -0500 Subject: [PATCH 1/3] Initial chunk JSON output --- marker/v2/converters/pdf.py | 25 ++++-- marker/v2/renderers/__init__.py | 28 ++++++ marker/v2/renderers/html.py | 22 +---- marker/v2/renderers/json.py | 103 ++++++++++++++++++++++ marker/v2/schema/blocks/base.py | 26 +++++- marker/v2/schema/document.py | 5 +- marker/v2/schema/groups/base.py | 5 ++ marker/v2/schema/groups/figure.py | 4 +- marker/v2/schema/groups/list.py | 4 +- marker/v2/schema/groups/page.py | 3 +- marker/v2/schema/groups/picture.py | 4 +- marker/v2/schema/groups/table.py | 5 +- marker/v2/schema/text/line.py | 7 +- tests/schema/groups/test_list_grouping.py | 2 +- 14 files changed, 199 insertions(+), 44 deletions(-) create mode 100644 marker/v2/renderers/json.py create mode 100644 marker/v2/schema/groups/base.py diff --git a/marker/v2/converters/pdf.py b/marker/v2/converters/pdf.py index 55caad9e..abcf216e 100644 --- a/marker/v2/converters/pdf.py +++ b/marker/v2/converters/pdf.py @@ -1,6 +1,8 @@ from marker.v2.providers.pdf import PdfProvider import os +from marker.v2.renderers.json import JSONRenderer + os.environ["TOKENIZERS_PARALLELISM"] = "false" # disables a tokenizers warning import tempfile @@ -30,7 +32,7 @@ class PdfConverter(BaseConverter): override_map: Dict[BlockTypes, Type[Block]] = defaultdict() - def __init__(self, config=None): + def __init__(self, config=None, output_format="markdown"): super().__init__(config) for block_type, override_block_type in self.override_map.items(): @@ -42,6 +44,11 @@ def __init__(self, config=None): self.table_rec_model = setup_table_rec_model() self.detection_model = setup_detection_model() + if output_format == "markdown": + self.renderer = MarkdownRenderer(self.config) + elif output_format == "json": + self.renderer = JSONRenderer(self.config) + def __call__(self, filepath: str): pdf_provider = PdfProvider(filepath, self.config) @@ -62,18 +69,18 @@ def __call__(self, filepath: str): debug_processor = DebugProcessor(self.config) debug_processor(document) - renderer = MarkdownRenderer(self.config) - return renderer(document) + return self.renderer(document) @click.command() @click.option("--output", type=click.Path(exists=False), required=False, default="temp") @click.option("--fname", type=str, default="adversarial.pdf") @click.option("--debug", is_flag=True) -def main(output: str, fname: str, debug: bool): +@click.option("--output_format", type=click.Choice(["markdown", "json"]), default="markdown") +def main(output: str, fname: str, debug: bool, output_format: str): dataset = datasets.load_dataset("datalab-to/pdfs", split="train") idx = dataset['filename'].index(fname) - out_filename = fname.rsplit(".", 1)[0] + ".md" + fname_base = fname.rsplit(".", 1)[0] os.makedirs(output, exist_ok=True) config = {} @@ -86,14 +93,20 @@ def main(output: str, fname: str, debug: bool): temp_pdf.write(dataset['pdf'][idx]) temp_pdf.flush() - converter = PdfConverter() + converter = PdfConverter(config=config, output_format=output_format) rendered = converter(temp_pdf.name) + if output_format == "markdown": + out_filename = f"{fname_base}.md" with open(os.path.join(output, out_filename), "w+") as f: f.write(rendered.markdown) for img_name, img in rendered.images.items(): img.save(os.path.join(output, img_name), "PNG") + elif output_format == "json": + out_filename = f"{fname_base}.json" + with open(os.path.join(output, out_filename), "w+") as f: + f.write(rendered.model_dump_json(indent=2)) if __name__ == "__main__": diff --git a/marker/v2/renderers/__init__.py b/marker/v2/renderers/__init__.py index 6b61c25d..c0e6ec60 100644 --- a/marker/v2/renderers/__init__.py +++ b/marker/v2/renderers/__init__.py @@ -1,3 +1,4 @@ +import re from typing import Optional from pydantic import BaseModel @@ -15,3 +16,30 @@ def __init__(self, config: Optional[BaseModel | dict] = None): def __call__(self, document): # Children are in reading order raise NotImplementedError + + @staticmethod + def extract_image(document, image_id): + image_block = document.get_block(image_id) + page = document.get_page(image_block.page_id) + page_img = page.highres_image + image_box = image_block.polygon.rescale(page.polygon.size, page_img.size) + cropped = page_img.crop(image_box.bbox) + return cropped + + @staticmethod + def merge_consecutive_tags(html, tag): + if not html: + return html + + def replace_whitespace(match): + return match.group(1) + + pattern = fr'(\s*)<{tag}>' + + while True: + new_merged = re.sub(pattern, replace_whitespace, html) + if new_merged == html: + break + html = new_merged + + return html diff --git a/marker/v2/renderers/html.py b/marker/v2/renderers/html.py index e1cfe884..77f4fb4c 100644 --- a/marker/v2/renderers/html.py +++ b/marker/v2/renderers/html.py @@ -17,24 +17,6 @@ class HTMLOutput(BaseModel): images: dict -def merge_consecutive_tags(html, tag): - if not html: - return html - - def replace_whitespace(match): - return match.group(1) - - pattern = fr'(\s*)<{tag}>' - - while True: - new_merged = re.sub(pattern, replace_whitespace, html) - if new_merged == html: - break - html = new_merged - - return html - - class HTMLRenderer(BaseRenderer): remove_blocks: list = [BlockTypes.PageHeader, BlockTypes.PageFooter] image_blocks: list = [BlockTypes.Picture, BlockTypes.Figure] @@ -82,8 +64,8 @@ def extract_html(self, document, document_output, level=0): output = str(soup) if level == 0: - output = merge_consecutive_tags(output, 'b') - output = merge_consecutive_tags(output, 'i') + output = self.merge_consecutive_tags(output, 'b') + output = self.merge_consecutive_tags(output, 'i') return output, images diff --git a/marker/v2/renderers/json.py b/marker/v2/renderers/json.py new file mode 100644 index 00000000..739dbf4f --- /dev/null +++ b/marker/v2/renderers/json.py @@ -0,0 +1,103 @@ +from __future__ import annotations + +import base64 +import io +from typing import List, Dict + +from bs4 import BeautifulSoup +from pydantic import BaseModel + +from marker.v2.schema.blocks import Block +from marker.v2.renderers import BaseRenderer +from marker.v2.schema import BlockTypes +from marker.v2.schema.blocks import BlockId +from marker.v2.schema.registry import get_block_class + + +class JSONBlockOutput(BaseModel): + id: str + block_type: str + html: str + polygon: List[List[float]] + children: List[JSONBlockOutput] | None = None + section_hierarchy: Dict[int, str] | None = None + images: dict | None = None + + +class JSONOutput(BaseModel): + children: List[JSONBlockOutput] + block_type: BlockTypes = BlockTypes.Document + + +def reformat_section_hierarchy(section_hierarchy): + new_section_hierarchy = {} + for key, value in section_hierarchy.items(): + new_section_hierarchy[key] = str(value) + return new_section_hierarchy + + +class JSONRenderer(BaseRenderer): + image_blocks: list = [BlockTypes.Picture, BlockTypes.Figure] + page_blocks: list = [BlockTypes.Page] + + def extract_json(self, document, block_output): + cls = get_block_class(block_output.id.block_type) + if cls.__base__ == Block: + html, images = self.extract_html(document, block_output) + return JSONBlockOutput( + html=html, + polygon=block_output.polygon.polygon, + id=str(block_output.id), + block_type=str(block_output.id.block_type), + images=images, + section_hierarchy=reformat_section_hierarchy(block_output.section_hierarchy) + ) + else: + children = [] + for child in block_output.children: + child_output = self.extract_json(document, child) + children.append(child_output) + + return JSONBlockOutput( + html=block_output.html, + polygon=block_output.polygon.polygon, + id=str(block_output.id), + block_type=str(block_output.id.block_type), + children=children, + section_hierarchy=reformat_section_hierarchy(block_output.section_hierarchy) + ) + + def extract_html(self, document, block_output): + soup = BeautifulSoup(block_output.html, 'html.parser') + + content_refs = soup.find_all('content-ref') + ref_block_id = None + images = {} + for ref in content_refs: + src = ref.get('src') + sub_images = {} + for item in block_output.children: + if item.id == src: + content, sub_images = self.extract_html(document, item) + ref_block_id: BlockId = item.id + break + + if ref_block_id.block_type in self.image_blocks: + image = self.extract_image(document, ref_block_id) + image_buffer = io.BytesIO() + image.save(image_buffer, format='PNG') + images[ref_block_id] = base64.b64encode(image_buffer.getvalue()).decode('utf-8') + else: + images.update(sub_images) + ref.replace_with(BeautifulSoup(content, 'html.parser')) + + return str(soup), images + + def __call__(self, document) -> JSONOutput: + document_output = document.render() + json_output = [] + for page_output in document_output.children: + json_output.append(self.extract_json(document, page_output)) + return JSONOutput( + children=json_output, + ) diff --git a/marker/v2/schema/blocks/base.py b/marker/v2/schema/blocks/base.py index c09ac102..7f1db7b8 100644 --- a/marker/v2/schema/blocks/base.py +++ b/marker/v2/schema/blocks/base.py @@ -1,6 +1,6 @@ from __future__ import annotations -from typing import TYPE_CHECKING, Any, List, Literal, Optional +from typing import TYPE_CHECKING, Any, List, Literal, Optional, Dict from pydantic import BaseModel, ConfigDict, field_validator @@ -16,6 +16,7 @@ class BlockOutput(BaseModel): polygon: PolygonBox id: BlockId children: List[BlockOutput] | None = None + section_hierarchy: Dict[int, BlockId] | None = None class BlockId(BaseModel): @@ -115,16 +116,33 @@ def assemble_html(self, child_blocks, parent_structure=None): template += f"" return template - def render(self, document, parent_structure): + def assign_section_hierarchy(self, section_hierarchy): + if self.block_type == BlockTypes.SectionHeader and self.heading_level: + levels = list(section_hierarchy.keys()) + for level in levels: + if level >= self.heading_level: + del section_hierarchy[level] + section_hierarchy[self.heading_level] = self.id + + return section_hierarchy + + def render(self, document, parent_structure, section_hierarchy=None): child_content = [] + if section_hierarchy is None: + section_hierarchy = {} + section_hierarchy = self.assign_section_hierarchy(section_hierarchy) + if self.structure is not None and len(self.structure) > 0: for block_id in self.structure: block = document.get_block(block_id) - child_content.append(block.render(document, self.structure)) + rendered = block.render(document, self.structure, section_hierarchy) + section_hierarchy = rendered.section_hierarchy # Update the section hierarchy from the peer blocks + child_content.append(rendered) return BlockOutput( html=self.assemble_html(child_content, parent_structure), polygon=self.polygon, id=self.id, - children=child_content + children=child_content, + section_hierarchy=section_hierarchy ) diff --git a/marker/v2/schema/document.py b/marker/v2/schema/document.py index 8af0dfdf..ee90c1a1 100644 --- a/marker/v2/schema/document.py +++ b/marker/v2/schema/document.py @@ -41,8 +41,11 @@ def assemble_html(self, child_blocks: List[Block]): def render(self): child_content = [] + section_hierarchy = None for page in self.pages: - child_content.append(page.render(self, None)) + rendered = page.render(self, None, section_hierarchy) + section_hierarchy = rendered.section_hierarchy + child_content.append(rendered) return DocumentOutput( children=child_content, diff --git a/marker/v2/schema/groups/base.py b/marker/v2/schema/groups/base.py new file mode 100644 index 00000000..a22eaf83 --- /dev/null +++ b/marker/v2/schema/groups/base.py @@ -0,0 +1,5 @@ +from marker.v2.schema.blocks import Block + + +class Group(Block): + pass \ No newline at end of file diff --git a/marker/v2/schema/groups/figure.py b/marker/v2/schema/groups/figure.py index 93b47092..9ba28f0c 100644 --- a/marker/v2/schema/groups/figure.py +++ b/marker/v2/schema/groups/figure.py @@ -1,6 +1,6 @@ from marker.v2.schema import BlockTypes -from marker.v2.schema.blocks import Block +from marker.v2.schema.groups.base import Group -class FigureGroup(Block): +class FigureGroup(Group): block_type: BlockTypes = BlockTypes.FigureGroup diff --git a/marker/v2/schema/groups/list.py b/marker/v2/schema/groups/list.py index f5880bc4..d16439be 100644 --- a/marker/v2/schema/groups/list.py +++ b/marker/v2/schema/groups/list.py @@ -1,8 +1,8 @@ from marker.v2.schema import BlockTypes -from marker.v2.schema.blocks import Block +from marker.v2.schema.groups.base import Group -class ListGroup(Block): +class ListGroup(Group): block_type: BlockTypes = BlockTypes.ListGroup def assemble_html(self, child_blocks, parent_structure): diff --git a/marker/v2/schema/groups/page.py b/marker/v2/schema/groups/page.py index efca9f0d..1a2c3ec4 100644 --- a/marker/v2/schema/groups/page.py +++ b/marker/v2/schema/groups/page.py @@ -4,12 +4,13 @@ from marker.v2.schema import BlockTypes from marker.v2.schema.blocks import Block, BlockId +from marker.v2.schema.groups.base import Group from marker.v2.schema.polygon import PolygonBox from marker.v2.schema.text.line import Line from marker.v2.schema.text.span import Span -class PageGroup(Block): +class PageGroup(Group): block_type: BlockTypes = BlockTypes.Page lowres_image: Image.Image | None = None highres_image: Image.Image | None = None diff --git a/marker/v2/schema/groups/picture.py b/marker/v2/schema/groups/picture.py index d9a3a1ee..0ab6cf49 100644 --- a/marker/v2/schema/groups/picture.py +++ b/marker/v2/schema/groups/picture.py @@ -1,6 +1,6 @@ from marker.v2.schema import BlockTypes -from marker.v2.schema.blocks import Block +from marker.v2.schema.groups.base import Group -class PictureGroup(Block): +class PictureGroup(Group): block_type: BlockTypes = BlockTypes.PictureGroup diff --git a/marker/v2/schema/groups/table.py b/marker/v2/schema/groups/table.py index b1b1f2d6..53a7f6eb 100644 --- a/marker/v2/schema/groups/table.py +++ b/marker/v2/schema/groups/table.py @@ -1,5 +1,6 @@ from marker.v2.schema import BlockTypes -from marker.v2.schema.blocks import Block +from marker.v2.schema.groups.base import Group -class TableGroup(Block): + +class TableGroup(Group): block_type: BlockTypes = BlockTypes.TableGroup diff --git a/marker/v2/schema/text/line.py b/marker/v2/schema/text/line.py index 5f8eb27d..f28b04c5 100644 --- a/marker/v2/schema/text/line.py +++ b/marker/v2/schema/text/line.py @@ -49,16 +49,17 @@ def assemble_html(self, document, child_blocks, parent_structure): template = strip_trailing_hyphens(raw_text, next_line_raw_text, template) return template - def render(self, document, parent_structure): + def render(self, document, parent_structure, section_hierarchy=None): child_content = [] if self.structure is not None and len(self.structure) > 0: for block_id in self.structure: block = document.get_block(block_id) - child_content.append(block.render(document, parent_structure)) + child_content.append(block.render(document, parent_structure, section_hierarchy)) return BlockOutput( html=self.assemble_html(document, child_content, parent_structure), polygon=self.polygon, id=self.id, - children=[] + children=[], + section_hierarchy=section_hierarchy ) diff --git a/tests/schema/groups/test_list_grouping.py b/tests/schema/groups/test_list_grouping.py index 1a50cc4b..5b903b4b 100644 --- a/tests/schema/groups/test_list_grouping.py +++ b/tests/schema/groups/test_list_grouping.py @@ -9,7 +9,7 @@ def test_list_grouping(pdf_document): structure = StructureBuilder() structure(pdf_document) - page = pdf_document.pags[0] + page = pdf_document.page[0] list_groups = [] for block in page.children: if block.block_type == BlockTypes.ListGroup: From 43cbd2c4577ab2180cfba22945a375a8065fdcbf Mon Sep 17 00:00:00 2001 From: Vik Paruchuri Date: Tue, 19 Nov 2024 10:52:27 -0500 Subject: [PATCH 2/3] Compute TOC, fix image output --- marker/v2/builders/ocr.py | 2 +- marker/v2/converters/pdf.py | 10 ++++++ marker/v2/renderers/__init__.py | 62 +++++++++++++++++++++++++++++++-- marker/v2/renderers/html.py | 7 ++-- marker/v2/renderers/json.py | 30 ++-------------- marker/v2/renderers/markdown.py | 4 ++- 6 files changed, 81 insertions(+), 34 deletions(-) diff --git a/marker/v2/builders/ocr.py b/marker/v2/builders/ocr.py index 9f40da9a..64e1dac5 100644 --- a/marker/v2/builders/ocr.py +++ b/marker/v2/builders/ocr.py @@ -35,7 +35,7 @@ def get_recognition_batch_size(self): if self.recognition_batch_size is not None: return self.recognition_batch_size elif settings.TORCH_DEVICE_MODEL == "cuda": - return 32 + return 128 elif settings.TORCH_DEVICE_MODEL == "mps": return 32 return 32 diff --git a/marker/v2/converters/pdf.py b/marker/v2/converters/pdf.py index 59bb1fd8..785b296f 100644 --- a/marker/v2/converters/pdf.py +++ b/marker/v2/converters/pdf.py @@ -1,3 +1,5 @@ +import json + from marker.v2.providers.pdf import PdfProvider import os @@ -99,6 +101,10 @@ def main(output: str, fname: str, debug: bool, output_format: str): with open(os.path.join(output, out_filename), "w+") as f: f.write(rendered.markdown) + meta_filename = f"{fname_base}_meta.json" + with open(os.path.join(output, meta_filename), "w+") as f: + f.write(json.dumps(rendered.metadata, indent=2)) + for img_name, img in rendered.images.items(): img.save(os.path.join(output, img_name), "PNG") elif output_format == "json": @@ -106,6 +112,10 @@ def main(output: str, fname: str, debug: bool, output_format: str): with open(os.path.join(output, out_filename), "w+") as f: f.write(rendered.model_dump_json(indent=2)) + meta_filename = f"{fname_base}_meta.json" + with open(os.path.join(output, meta_filename), "w+") as f: + f.write(json.dumps(rendered.metadata, indent=2)) + if __name__ == "__main__": main() diff --git a/marker/v2/renderers/__init__.py b/marker/v2/renderers/__init__.py index c0e6ec60..8837ca3c 100644 --- a/marker/v2/renderers/__init__.py +++ b/marker/v2/renderers/__init__.py @@ -1,14 +1,19 @@ +import base64 +import io import re from typing import Optional +from bs4 import BeautifulSoup from pydantic import BaseModel from marker.v2.schema import BlockTypes +from marker.v2.schema.blocks.base import BlockOutput, BlockId from marker.v2.util import assign_config class BaseRenderer: - block_type: BlockTypes | None = None + remove_blocks: list = [BlockTypes.PageHeader, BlockTypes.PageFooter] + image_blocks: list = [BlockTypes.Picture, BlockTypes.Figure] def __init__(self, config: Optional[BaseModel | dict] = None): assign_config(self, config) @@ -18,12 +23,16 @@ def __call__(self, document): raise NotImplementedError @staticmethod - def extract_image(document, image_id): + def extract_image(document, image_id, to_base64=False): image_block = document.get_block(image_id) page = document.get_page(image_block.page_id) page_img = page.highres_image image_box = image_block.polygon.rescale(page.polygon.size, page_img.size) cropped = page_img.crop(image_box.bbox) + if to_base64: + image_buffer = io.BytesIO() + cropped.save(image_buffer, format='PNG') + cropped = base64.b64encode(image_buffer.getvalue()).decode('utf-8') return cropped @staticmethod @@ -43,3 +52,52 @@ def replace_whitespace(match): html = new_merged return html + + def compute_toc(self, document, block_output: BlockOutput): + toc = [] + if hasattr(block_output, "id") and block_output.id.block_type == BlockTypes.SectionHeader: + toc.append({ + "title": self.extract_block_html(document, block_output), + "level": document.get_block(block_output.id).heading_level, + "page": block_output.id.page_id + }) + + for child in block_output.children: + child_toc = self.compute_toc(document, child) + if child_toc: + toc.extend(child_toc) + return toc + + def generate_document_metadata(self, document, document_output): + toc = self.compute_toc(document, document_output) + return { + "table_of_contents": toc + } + + def extract_block_html(self, document, block_output): + soup = BeautifulSoup(block_output.html, 'html.parser') + + content_refs = soup.find_all('content-ref') + ref_block_id = None + images = {} + for ref in content_refs: + src = ref.get('src') + sub_images = {} + for item in block_output.children: + if item.id == src: + content, sub_images_ = self.extract_block_html(document, item) + sub_images.update(sub_images_) + ref_block_id: BlockId = item.id + break + + if ref_block_id.block_type in self.image_blocks: + images[ref_block_id] = self.extract_image(document, ref_block_id, to_base64=True) + else: + images.update(sub_images) + ref.replace_with(BeautifulSoup(content, 'html.parser')) + + if block_output.id.block_type in self.image_blocks: + images[block_output.id] = self.extract_image(document, block_output.id, to_base64=True) + + return str(soup), images + diff --git a/marker/v2/renderers/html.py b/marker/v2/renderers/html.py index 77f4fb4c..1f1a4416 100644 --- a/marker/v2/renderers/html.py +++ b/marker/v2/renderers/html.py @@ -15,11 +15,10 @@ class HTMLOutput(BaseModel): html: str images: dict + metadata: dict class HTMLRenderer(BaseRenderer): - remove_blocks: list = [BlockTypes.PageHeader, BlockTypes.PageFooter] - image_blocks: list = [BlockTypes.Picture, BlockTypes.Figure] page_blocks: list = [BlockTypes.Page] paginate_output: bool = False @@ -42,7 +41,8 @@ def extract_html(self, document, document_output, level=0): sub_images = {} for item in document_output.children: if item.id == src: - content, sub_images = self.extract_html(document, item, level + 1) + content, sub_images_ = self.extract_html(document, item, level + 1) + sub_images.update(sub_images_) ref_block_id: BlockId = item.id break @@ -75,4 +75,5 @@ def __call__(self, document) -> HTMLOutput: return HTMLOutput( html=full_html, images=images, + metadata=self.generate_document_metadata(document, document_output) ) diff --git a/marker/v2/renderers/json.py b/marker/v2/renderers/json.py index 739dbf4f..9e32774e 100644 --- a/marker/v2/renderers/json.py +++ b/marker/v2/renderers/json.py @@ -27,6 +27,7 @@ class JSONBlockOutput(BaseModel): class JSONOutput(BaseModel): children: List[JSONBlockOutput] block_type: BlockTypes = BlockTypes.Document + metadata: dict def reformat_section_hierarchy(section_hierarchy): @@ -43,7 +44,7 @@ class JSONRenderer(BaseRenderer): def extract_json(self, document, block_output): cls = get_block_class(block_output.id.block_type) if cls.__base__ == Block: - html, images = self.extract_html(document, block_output) + html, images = self.extract_block_html(document, block_output) return JSONBlockOutput( html=html, polygon=block_output.polygon.polygon, @@ -67,32 +68,6 @@ def extract_json(self, document, block_output): section_hierarchy=reformat_section_hierarchy(block_output.section_hierarchy) ) - def extract_html(self, document, block_output): - soup = BeautifulSoup(block_output.html, 'html.parser') - - content_refs = soup.find_all('content-ref') - ref_block_id = None - images = {} - for ref in content_refs: - src = ref.get('src') - sub_images = {} - for item in block_output.children: - if item.id == src: - content, sub_images = self.extract_html(document, item) - ref_block_id: BlockId = item.id - break - - if ref_block_id.block_type in self.image_blocks: - image = self.extract_image(document, ref_block_id) - image_buffer = io.BytesIO() - image.save(image_buffer, format='PNG') - images[ref_block_id] = base64.b64encode(image_buffer.getvalue()).decode('utf-8') - else: - images.update(sub_images) - ref.replace_with(BeautifulSoup(content, 'html.parser')) - - return str(soup), images - def __call__(self, document) -> JSONOutput: document_output = document.render() json_output = [] @@ -100,4 +75,5 @@ def __call__(self, document) -> JSONOutput: json_output.append(self.extract_json(document, page_output)) return JSONOutput( children=json_output, + metadata=self.generate_document_metadata(document, document_output) ) diff --git a/marker/v2/renderers/markdown.py b/marker/v2/renderers/markdown.py index 904cd4bc..b6739cea 100644 --- a/marker/v2/renderers/markdown.py +++ b/marker/v2/renderers/markdown.py @@ -24,6 +24,7 @@ def convert_div(self, el, text, convert_as_inline): class MarkdownOutput(BaseModel): markdown: str images: dict + metadata: dict class MarkdownRenderer(HTMLRenderer): @@ -43,5 +44,6 @@ def __call__(self, document: Document) -> MarkdownOutput: markdown = md_cls.convert(full_html) return MarkdownOutput( markdown=markdown, - images=images + images=images, + metadata=self.generate_document_metadata(document, document_output) ) From 4591f317e8c2bc9c6c9eb5c3422b3828bca4b193 Mon Sep 17 00:00:00 2001 From: Vik Paruchuri Date: Tue, 19 Nov 2024 10:57:28 -0500 Subject: [PATCH 3/3] Add json renderer tests --- marker/v2/renderers/__init__.py | 2 +- tests/renderers/test_json_renderer.py | 13 +++++++++++++ tests/renderers/test_markdown_renderer.py | 11 ++++++++++- 3 files changed, 24 insertions(+), 2 deletions(-) create mode 100644 tests/renderers/test_json_renderer.py diff --git a/marker/v2/renderers/__init__.py b/marker/v2/renderers/__init__.py index 8837ca3c..e87ee655 100644 --- a/marker/v2/renderers/__init__.py +++ b/marker/v2/renderers/__init__.py @@ -57,7 +57,7 @@ def compute_toc(self, document, block_output: BlockOutput): toc = [] if hasattr(block_output, "id") and block_output.id.block_type == BlockTypes.SectionHeader: toc.append({ - "title": self.extract_block_html(document, block_output), + "title": self.extract_block_html(document, block_output)[0], "level": document.get_block(block_output.id).heading_level, "page": block_output.id.page_id }) diff --git a/tests/renderers/test_json_renderer.py b/tests/renderers/test_json_renderer.py new file mode 100644 index 00000000..d42ab79d --- /dev/null +++ b/tests/renderers/test_json_renderer.py @@ -0,0 +1,13 @@ +import pytest + +from marker.v2.renderers.json import JSONRenderer + + +@pytest.mark.config({"page_range": [0]}) +def test_markdown_renderer_pagination(pdf_document): + renderer = JSONRenderer() + pages = renderer(pdf_document).children + + assert len(pages) == 1 + assert pages[0].block_type == "Page" + assert pages[0].children[0].block_type == "SectionHeader" \ No newline at end of file diff --git a/tests/renderers/test_markdown_renderer.py b/tests/renderers/test_markdown_renderer.py index ba28402d..e44e0b54 100644 --- a/tests/renderers/test_markdown_renderer.py +++ b/tests/renderers/test_markdown_renderer.py @@ -18,4 +18,13 @@ def test_markdown_renderer_pagination(pdf_document): md = renderer(pdf_document).markdown assert "{0}-" in md - assert "{1}-" in md \ No newline at end of file + assert "{1}-" in md + + +@pytest.mark.config({"page_range": [0, 1]}) +def test_markdown_renderer_metadata(pdf_document): + renderer = MarkdownRenderer({"paginate_output": True}) + metadata = renderer(pdf_document).metadata + assert "table_of_contents" in metadata + + assert "Subspace Adversarial Training" in metadata["table_of_contents"][0]["title"] \ No newline at end of file