Skip to content

Commit

Permalink
fix all kinds of weird pdfium bbox issues
Browse files: browse the repository at this point in the history
  • Loading branch information
iammosespaulr committed Dec 6, 2024
1 parent 2b324e7 commit 78cd08b
Show file tree
Hide file tree
Showing 3 changed files with 65 additions and 38 deletions.
5 changes: 2 additions & 3 deletions marker/builders/layout.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,7 @@ def add_blocks_to_pages(self, pages: List[PageGroup], layout_results: List[Layou
for page, layout_result in zip(pages, layout_results):
layout_page_size = PolygonBox.from_bbox(layout_result.image_bbox).size
provider_page_size = page.polygon.size
page.layout_sliced = layout_result.sliced # This indicates if the page was sliced by the layout model
page.layout_sliced = layout_result.sliced # This indicates if the page was sliced by the layout model
for bbox in sorted(layout_result.bboxes, key=lambda x: x.position):
block_cls = get_block_class(BlockTypes[bbox.label])
layout_block = page.add_block(block_cls, PolygonBox(polygon=bbox.polygon))
Expand Down Expand Up @@ -124,7 +124,7 @@ def check_layout_coverage(
total_blocks += 1
intersecting_lines = np.count_nonzero(intersection_matrix[idx] > 0)

if intersecting_lines > self.layout_coverage_min_lines:
if intersecting_lines >= self.layout_coverage_min_lines:
covered_blocks += 1

if layout_block.polygon.intersection_pct(document_page.polygon) > 0.8 and layout_block.block_type == BlockTypes.Text:
Expand All @@ -137,4 +137,3 @@ def check_layout_coverage(
if not text_okay and (total_blocks == 1 and large_text_blocks == 1):
text_okay = True
return text_okay

36 changes: 7 additions & 29 deletions marker/providers/pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
from PIL import Image

from marker.providers import BaseProvider, ProviderOutput, ProviderPageLines
from marker.providers.pdf_parsing import get_blocks, get_chars, get_lines, get_spans
from marker.providers.pdf_parsing import get_pages
from marker.providers.utils import alphanum_ratio
from marker.schema import BlockTypes
from marker.schema.polygon import PolygonBox
Expand All @@ -18,31 +18,6 @@
from marker.schema.text.span import Span


def get_pages(pdf: pdfium.PdfDocument, page_range: range):
    """Collect page geometry and parsed text blocks for the requested pages.

    For every index in ``page_range`` the page and its text page are loaded,
    the page size is derived from the page bounding box, and the extracted
    characters are grouped into spans, then lines, then blocks.

    Args:
        pdf: Document the pages are read from.
        page_range: Page indices to process, in output order.

    Returns:
        A list with one dict per page, holding the keys ``page``, ``bbox``,
        ``width``, ``height`` and ``blocks``.
    """
    def describe_page(index):
        pdf_page = pdf.get_page(index)
        text_page = pdf_page.get_textpage()

        bounds = pdf_page.get_bbox()
        # Extents are taken as absolute differences and rounded up to whole units.
        width = math.ceil(abs(bounds[2] - bounds[0]))
        height = math.ceil(abs(bounds[1] - bounds[3]))

        # chars -> spans -> lines -> blocks
        grouped = get_blocks(get_lines(get_spans(get_chars(text_page, width, height))))

        return {
            "page": index,
            "bbox": bounds,
            "width": width,
            "height": height,
            "blocks": grouped,
        }

    return [describe_page(index) for index in page_range]


class PdfProvider(BaseProvider):
page_range: List[int] | None = None
pdftext_workers: int = 4
Expand All @@ -57,6 +32,9 @@ def __init__(self, filepath: str, config=None):
super().__init__(filepath, config)

self.doc: pdfium.PdfDocument = pdfium.PdfDocument(self.filepath)
if self.flatten_pdf:
self.doc.init_forms()

self.page_lines: ProviderPageLines = {i: [] for i in range(len(self.doc))}

if self.page_range is None:
Expand Down Expand Up @@ -133,12 +111,12 @@ def font_names_to_format(self, font_name: str | None) -> Set[str]:

def pdftext_extraction(self) -> ProviderPageLines:
page_lines: ProviderPageLines = {}
page_char_blocks = get_pages(self.doc, self.page_range)
self.page_bboxes = {i: [0, 0, page["width"], page["height"]] for i, page in zip(self.page_range, page_char_blocks)}
page_blocks = get_pages(self.doc, self.page_range, self.flatten_pdf)
self.page_bboxes = {i: [0, 0, page["width"], page["height"]] for i, page in zip(self.page_range, page_blocks)}

SpanClass: Span = get_block_class(BlockTypes.Span)
LineClass: Line = get_block_class(BlockTypes.Line)
for page in page_char_blocks:
for page in page_blocks:
page_id = page["page"]
lines: List[ProviderOutput] = []
for block in page["blocks"]:
Expand Down
62 changes: 56 additions & 6 deletions marker/providers/pdf_parsing.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,12 @@
import pypdfium2.raw as pdfium_c


def flatten(page, flag=pdfium_c.FLAT_NORMALDISPLAY):
    """Flatten ``page`` in place through ``FPDFPage_Flatten``.

    Args:
        page: Page handle passed straight to ``FPDFPage_Flatten``.
        flag: Flatten mode forwarded to pdfium; defaults to
            ``FLAT_NORMALDISPLAY``.

    Raises:
        pdfium.PdfiumError: If pdfium reports ``FLATTEN_FAIL``.
    """
    # Any return code other than FLATTEN_FAIL is treated as success.
    if pdfium_c.FPDFPage_Flatten(page, flag) == pdfium_c.FLATTEN_FAIL:
        raise pdfium.PdfiumError("Failed to flatten annotations / form fields.")


def get_fontname(textpage, i):
font_name_str = ""
flags = 0
Expand All @@ -26,16 +32,32 @@ def get_fontname(textpage, i):
return font_name_str, flags


def get_chars(textpage, page_width, page_height, loose=False):
def get_chars(page, textpage, loose=False):
chars = []
start_idx = 0
end_idx = 1

x_start, y_start, x_end, y_end = page.get_bbox()
page_width = math.ceil(abs(x_end - x_start))
page_height = math.ceil(abs(y_end - y_start))

for i in range(textpage.count_chars()):
fontname, fontflag = get_fontname(textpage, i)
text = chr(pdfium_c.FPDFText_GetUnicode(textpage, i))
end_idx = start_idx + len(text)
bbox = [round(coord, 2) for coord in textpage.get_charbox(i, loose=loose)]
bbox = [bbox[0], page_height - bbox[3], bbox[2], page_height - bbox[1]]

char_box = textpage.get_charbox(i, loose=loose)
cx_start, cy_start, cx_end, cy_end = char_box

cx_start -= x_start
cx_end -= x_start
cy_start -= y_start
cy_end -= y_start

ty_start = page_height - cy_start
ty_end = page_height - cy_end

bbox = [round(cx_start, 2), round(min(ty_start, ty_end), 2), round(cx_end, 2), round(max(ty_start, ty_end), 2)]

chars.append({
"bbox": bbox,
Expand Down Expand Up @@ -131,8 +153,37 @@ def get_blocks(lines):
return blocks


def get_pages(pdf: pdfium.PdfDocument, page_range: range, flatten_pdf: bool = True):
    """Build page geometry and parsed text blocks for each requested page.

    Args:
        pdf: Document the pages are read from.
        page_range: Page indices to process, in output order.
        flatten_pdf: When true, each page is passed to ``flatten`` and then
            fetched again before any text is extracted.

    Returns:
        A list with one dict per page, holding the keys ``page``, ``bbox``,
        ``width``, ``height`` and ``blocks``.
    """
    results = []
    for idx in page_range:
        pdf_page = pdf.get_page(idx)
        if flatten_pdf:
            flatten(pdf_page)
            # NOTE(review): the page is fetched again after flattening,
            # presumably so the handle reflects the flattened content - confirm.
            pdf_page = pdf.get_page(idx)

        text_page = pdf_page.get_textpage()

        bounds = pdf_page.get_bbox()
        # Extents are taken as absolute differences and rounded up to whole units.
        width = math.ceil(abs(bounds[2] - bounds[0]))
        height = math.ceil(abs(bounds[1] - bounds[3]))

        # chars -> spans -> lines -> blocks
        grouped = get_blocks(get_lines(get_spans(get_chars(pdf_page, text_page))))

        results.append({
            "page": idx,
            "bbox": bounds,
            "width": width,
            "height": height,
            "blocks": grouped,
        })
    return results


if __name__ == "__main__":
pdf_path = '/home/ubuntu/surya-test/pdfs/adversarial.pdf'
pdf_path = '/home/ubuntu/surya-test/pdfs/nested-lists.pdf'
pdf = pdfium.PdfDocument(pdf_path)

for page_idx in range(len(pdf)):
Expand All @@ -153,5 +204,4 @@ def get_blocks(lines):
text = ""
for span_idx, span in enumerate(line["spans"]):
text += span["text"]
if 'accuracy against strong' in text:
breakpoint()
print(text, [span["text"] for span in line["spans"]])

0 comments on commit 78cd08b

Please sign in to comment.