From 78cd08bb604dc406f6ce86d4e9d2b55da5fee1cd Mon Sep 17 00:00:00 2001 From: Moses Paul R Date: Fri, 6 Dec 2024 18:31:37 +0000 Subject: [PATCH] fix all kinds of weird pdfium bbox issues --- marker/builders/layout.py | 5 ++- marker/providers/pdf.py | 36 ++++--------------- marker/providers/pdf_parsing.py | 62 +++++++++++++++++++++++++++++---- 3 files changed, 65 insertions(+), 38 deletions(-) diff --git a/marker/builders/layout.py b/marker/builders/layout.py index 8a17bdda..32008b02 100644 --- a/marker/builders/layout.py +++ b/marker/builders/layout.py @@ -75,7 +75,7 @@ def add_blocks_to_pages(self, pages: List[PageGroup], layout_results: List[Layou for page, layout_result in zip(pages, layout_results): layout_page_size = PolygonBox.from_bbox(layout_result.image_bbox).size provider_page_size = page.polygon.size - page.layout_sliced = layout_result.sliced # This indicates if the page was sliced by the layout model + page.layout_sliced = layout_result.sliced # This indicates if the page was sliced by the layout model for bbox in sorted(layout_result.bboxes, key=lambda x: x.position): block_cls = get_block_class(BlockTypes[bbox.label]) layout_block = page.add_block(block_cls, PolygonBox(polygon=bbox.polygon)) @@ -124,7 +124,7 @@ def check_layout_coverage( total_blocks += 1 intersecting_lines = np.count_nonzero(intersection_matrix[idx] > 0) - if intersecting_lines > self.layout_coverage_min_lines: + if intersecting_lines >= self.layout_coverage_min_lines: covered_blocks += 1 if layout_block.polygon.intersection_pct(document_page.polygon) > 0.8 and layout_block.block_type == BlockTypes.Text: @@ -137,4 +137,3 @@ def check_layout_coverage( if not text_okay and (total_blocks == 1 and large_text_blocks == 1): text_okay = True return text_okay - diff --git a/marker/providers/pdf.py b/marker/providers/pdf.py index ef8f7c06..3b73e7c7 100644 --- a/marker/providers/pdf.py +++ b/marker/providers/pdf.py @@ -9,7 +9,7 @@ from PIL import Image from marker.providers import BaseProvider, ProviderOutput, ProviderPageLines -from marker.providers.pdf_parsing import get_blocks, get_chars, get_lines, get_spans +from marker.providers.pdf_parsing import get_pages from marker.providers.utils import alphanum_ratio from marker.schema import BlockTypes from marker.schema.polygon import PolygonBox @@ -18,31 +18,6 @@ from marker.schema.text.span import Span -def get_pages(pdf: pdfium.PdfDocument, page_range: range): - pages = [] - for page_idx in page_range: - page = pdf.get_page(page_idx) - textpage = page.get_textpage() - - page_bbox = page.get_bbox() - page_width = math.ceil(abs(page_bbox[2] - page_bbox[0])) - page_height = math.ceil(abs(page_bbox[1] - page_bbox[3])) - - chars = get_chars(textpage, page_width, page_height) - spans = get_spans(chars) - lines = get_lines(spans) - blocks = get_blocks(lines) - - pages.append({ - "page": page_idx, - "bbox": page_bbox, - "width": page_width, - "height": page_height, - "blocks": blocks - }) - return pages - - class PdfProvider(BaseProvider): page_range: List[int] | None = None pdftext_workers: int = 4 @@ -57,6 +32,9 @@ def __init__(self, filepath: str, config=None): super().__init__(filepath, config) self.doc: pdfium.PdfDocument = pdfium.PdfDocument(self.filepath) + if self.flatten_pdf: + self.doc.init_forms() + self.page_lines: ProviderPageLines = {i: [] for i in range(len(self.doc))} if self.page_range is None: @@ -133,12 +111,12 @@ def font_names_to_format(self, font_name: str | None) -> Set[str]: def pdftext_extraction(self) -> ProviderPageLines: page_lines: ProviderPageLines = {} - page_char_blocks = get_pages(self.doc, self.page_range) - self.page_bboxes = {i: [0, 0, page["width"], page["height"]] for i, page in zip(self.page_range, page_char_blocks)} + page_blocks = get_pages(self.doc, self.page_range, self.flatten_pdf) + self.page_bboxes = {i: [0, 0, page["width"], page["height"]] for i, page in zip(self.page_range, page_blocks)} SpanClass: Span = get_block_class(BlockTypes.Span) LineClass: Line = get_block_class(BlockTypes.Line) - for page in page_char_blocks: + for page in page_blocks: page_id = page["page"] lines: List[ProviderOutput] = [] for block in page["blocks"]: diff --git a/marker/providers/pdf_parsing.py b/marker/providers/pdf_parsing.py index 8014b96a..73e1a051 100644 --- a/marker/providers/pdf_parsing.py +++ b/marker/providers/pdf_parsing.py @@ -5,6 +5,12 @@ import pypdfium2.raw as pdfium_c +def flatten(page, flag=pdfium_c.FLAT_NORMALDISPLAY): + rc = pdfium_c.FPDFPage_Flatten(page, flag) + if rc == pdfium_c.FLATTEN_FAIL: + raise pdfium.PdfiumError("Failed to flatten annotations / form fields.") + + def get_fontname(textpage, i): font_name_str = "" flags = 0 @@ -26,16 +32,32 @@ def get_fontname(textpage, i): return font_name_str, flags -def get_chars(textpage, page_width, page_height, loose=False): +def get_chars(page, textpage, loose=False): chars = [] start_idx = 0 end_idx = 1 + + x_start, y_start, x_end, y_end = page.get_bbox() + page_width = math.ceil(abs(x_end - x_start)) + page_height = math.ceil(abs(y_end - y_start)) + for i in range(textpage.count_chars()): fontname, fontflag = get_fontname(textpage, i) text = chr(pdfium_c.FPDFText_GetUnicode(textpage, i)) end_idx = start_idx + len(text) - bbox = [round(coord, 2) for coord in textpage.get_charbox(i, loose=loose)] - bbox = [bbox[0], page_height - bbox[3], bbox[2], page_height - bbox[1]] + + char_box = textpage.get_charbox(i, loose=loose) + cx_start, cy_start, cx_end, cy_end = char_box + + cx_start -= x_start + cx_end -= x_start + cy_start -= y_start + cy_end -= y_start + + ty_start = page_height - cy_start + ty_end = page_height - cy_end + + bbox = [round(cx_start, 2), round(min(ty_start, ty_end), 2), round(cx_end, 2), round(max(ty_start, ty_end), 2)] chars.append({ "bbox": bbox, @@ -131,8 +153,37 @@ def get_blocks(lines): return blocks +def get_pages(pdf: pdfium.PdfDocument, page_range: range, flatten_pdf: bool = True): + pages = [] + for page_idx in page_range: + page = pdf.get_page(page_idx) + if flatten_pdf: + flatten(page) + page = pdf.get_page(page_idx) + + textpage = page.get_textpage() + + page_bbox = page.get_bbox() + page_width = math.ceil(abs(page_bbox[2] - page_bbox[0])) + page_height = math.ceil(abs(page_bbox[1] - page_bbox[3])) + + chars = get_chars(page, textpage) + spans = get_spans(chars) + lines = get_lines(spans) + blocks = get_blocks(lines) + + pages.append({ + "page": page_idx, + "bbox": page_bbox, + "width": page_width, + "height": page_height, + "blocks": blocks + }) + return pages + + if __name__ == "__main__": - pdf_path = '/home/ubuntu/surya-test/pdfs/adversarial.pdf' + pdf_path = '/home/ubuntu/surya-test/pdfs/nested-lists.pdf' pdf = pdfium.PdfDocument(pdf_path) for page_idx in range(len(pdf)): @@ -153,5 +204,4 @@ def get_blocks(lines): text = "" for span_idx, span in enumerate(line["spans"]): text += span["text"] - if 'accuracy against strong' in text: - breakpoint() + print(text, [span["text"] for span in line["spans"]])