fix all kinds of weird pdfium bbox issues

VikParuchuri · Dec 6, 2024 · 78cd08b
1 parent 2b324e7
commit 78cd08b
Show file tree

Hide file tree

Showing 3 changed files with 65 additions and 38 deletions.
diff --git a/marker/builders/layout.py b/marker/builders/layout.py
@@ -75,7 +75,7 @@ def add_blocks_to_pages(self, pages: List[PageGroup], layout_results: List[Layou
         for page, layout_result in zip(pages, layout_results):
             layout_page_size = PolygonBox.from_bbox(layout_result.image_bbox).size
             provider_page_size = page.polygon.size
-            page.layout_sliced = layout_result.sliced # This indicates if the page was sliced by the layout model
+            page.layout_sliced = layout_result.sliced  # This indicates if the page was sliced by the layout model
             for bbox in sorted(layout_result.bboxes, key=lambda x: x.position):
                 block_cls = get_block_class(BlockTypes[bbox.label])
                 layout_block = page.add_block(block_cls, PolygonBox(polygon=bbox.polygon))
@@ -124,7 +124,7 @@ def check_layout_coverage(
             total_blocks += 1
             intersecting_lines = np.count_nonzero(intersection_matrix[idx] > 0)
 
-            if intersecting_lines > self.layout_coverage_min_lines:
+            if intersecting_lines >= self.layout_coverage_min_lines:
                 covered_blocks += 1
 
             if layout_block.polygon.intersection_pct(document_page.polygon) > 0.8 and layout_block.block_type == BlockTypes.Text:
@@ -137,4 +137,3 @@ def check_layout_coverage(
         if not text_okay and (total_blocks == 1 and large_text_blocks == 1):
             text_okay = True
         return text_okay
-
diff --git a/marker/providers/pdf.py b/marker/providers/pdf.py
@@ -9,7 +9,7 @@
 from PIL import Image
 
 from marker.providers import BaseProvider, ProviderOutput, ProviderPageLines
-from marker.providers.pdf_parsing import get_blocks, get_chars, get_lines, get_spans
+from marker.providers.pdf_parsing import get_pages
 from marker.providers.utils import alphanum_ratio
 from marker.schema import BlockTypes
 from marker.schema.polygon import PolygonBox
@@ -18,31 +18,6 @@
 from marker.schema.text.span import Span
 
 
-def get_pages(pdf: pdfium.PdfDocument, page_range: range):
-    pages = []
-    for page_idx in page_range:
-        page = pdf.get_page(page_idx)
-        textpage = page.get_textpage()
-
-        page_bbox = page.get_bbox()
-        page_width = math.ceil(abs(page_bbox[2] - page_bbox[0]))
-        page_height = math.ceil(abs(page_bbox[1] - page_bbox[3]))
-
-        chars = get_chars(textpage, page_width, page_height)
-        spans = get_spans(chars)
-        lines = get_lines(spans)
-        blocks = get_blocks(lines)
-
-        pages.append({
-            "page": page_idx,
-            "bbox": page_bbox,
-            "width": page_width,
-            "height": page_height,
-            "blocks": blocks
-        })
-    return pages
-
-
 class PdfProvider(BaseProvider):
     page_range: List[int] | None = None
     pdftext_workers: int = 4
@@ -57,6 +32,9 @@ def __init__(self, filepath: str, config=None):
         super().__init__(filepath, config)
 
         self.doc: pdfium.PdfDocument = pdfium.PdfDocument(self.filepath)
+        if self.flatten_pdf:
+            self.doc.init_forms()
+
         self.page_lines: ProviderPageLines = {i: [] for i in range(len(self.doc))}
 
         if self.page_range is None:
@@ -133,12 +111,12 @@ def font_names_to_format(self, font_name: str | None) -> Set[str]:
 
     def pdftext_extraction(self) -> ProviderPageLines:
         page_lines: ProviderPageLines = {}
-        page_char_blocks = get_pages(self.doc, self.page_range)
-        self.page_bboxes = {i: [0, 0, page["width"], page["height"]] for i, page in zip(self.page_range, page_char_blocks)}
+        page_blocks = get_pages(self.doc, self.page_range, self.flatten_pdf)
+        self.page_bboxes = {i: [0, 0, page["width"], page["height"]] for i, page in zip(self.page_range, page_blocks)}
 
         SpanClass: Span = get_block_class(BlockTypes.Span)
         LineClass: Line = get_block_class(BlockTypes.Line)
-        for page in page_char_blocks:
+        for page in page_blocks:
             page_id = page["page"]
             lines: List[ProviderOutput] = []
             for block in page["blocks"]:

diff --git a/marker/providers/pdf_parsing.py b/marker/providers/pdf_parsing.py
@@ -5,6 +5,12 @@
 import pypdfium2.raw as pdfium_c
 
 
+def flatten(page, flag=pdfium_c.FLAT_NORMALDISPLAY):
+    rc = pdfium_c.FPDFPage_Flatten(page, flag)
+    if rc == pdfium_c.FLATTEN_FAIL:
+        raise pdfium.PdfiumError("Failed to flatten annotations / form fields.")
+
+
 def get_fontname(textpage, i):
     font_name_str = ""
     flags = 0
@@ -26,16 +32,32 @@ def get_fontname(textpage, i):
     return font_name_str, flags
 
 
-def get_chars(textpage, page_width, page_height, loose=False):
+def get_chars(page, textpage, loose=False):
     chars = []
     start_idx = 0
     end_idx = 1
+
+    x_start, y_start, x_end, y_end = page.get_bbox()
+    page_width = math.ceil(abs(x_end - x_start))
+    page_height = math.ceil(abs(y_end - y_start))
+
     for i in range(textpage.count_chars()):
         fontname, fontflag = get_fontname(textpage, i)
         text = chr(pdfium_c.FPDFText_GetUnicode(textpage, i))
         end_idx = start_idx + len(text)
-        bbox = [round(coord, 2) for coord in textpage.get_charbox(i, loose=loose)]
-        bbox = [bbox[0], page_height - bbox[3], bbox[2], page_height - bbox[1]]
+
+        char_box = textpage.get_charbox(i, loose=loose)
+        cx_start, cy_start, cx_end, cy_end = char_box
+
+        cx_start -= x_start
+        cx_end -= x_start
+        cy_start -= y_start
+        cy_end -= y_start
+
+        ty_start = page_height - cy_start
+        ty_end = page_height - cy_end
+
+        bbox = [round(cx_start, 2), round(min(ty_start, ty_end), 2), round(cx_end, 2), round(max(ty_start, ty_end), 2)]
 
         chars.append({
             "bbox": bbox,
@@ -131,8 +153,37 @@ def get_blocks(lines):
     return blocks
 
 
+def get_pages(pdf: pdfium.PdfDocument, page_range: range, flatten_pdf: bool = True):
+    pages = []
+    for page_idx in page_range:
+        page = pdf.get_page(page_idx)
+        if flatten_pdf:
+            flatten(page)
+            page = pdf.get_page(page_idx)
+
+        textpage = page.get_textpage()
+
+        page_bbox = page.get_bbox()
+        page_width = math.ceil(abs(page_bbox[2] - page_bbox[0]))
+        page_height = math.ceil(abs(page_bbox[1] - page_bbox[3]))
+
+        chars = get_chars(page, textpage)
+        spans = get_spans(chars)
+        lines = get_lines(spans)
+        blocks = get_blocks(lines)
+
+        pages.append({
+            "page": page_idx,
+            "bbox": page_bbox,
+            "width": page_width,
+            "height": page_height,
+            "blocks": blocks
+        })
+    return pages
+
+
 if __name__ == "__main__":
-    pdf_path = '/home/ubuntu/surya-test/pdfs/adversarial.pdf'
+    pdf_path = '/home/ubuntu/surya-test/pdfs/nested-lists.pdf'
     pdf = pdfium.PdfDocument(pdf_path)
 
     for page_idx in range(len(pdf)):
@@ -153,5 +204,4 @@ def get_blocks(lines):
                 text = ""
                 for span_idx, span in enumerate(line["spans"]):
                     text += span["text"]
-                if 'accuracy against strong' in text:
-                    breakpoint()
+                print(text, [span["text"] for span in line["spans"]])