Skip to content

Commit

Permalink
disable loosebox for quotes specifically in the pdf provider (don't change tables)
Browse files Browse the repository at this point in the history
  • Loading branch information
iammosespaulr committed Dec 3, 2024
1 parent 91862fb commit c653494
Showing 1 changed file with 5 additions and 4 deletions.
9 changes: 5 additions & 4 deletions marker/providers/pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,15 +39,15 @@ def __init__(self, filepath: str, config=None):
if self.page_range is None:
self.page_range = range(len(self.doc))

assert max(self.page_range) < len(self.doc) and min(self.page_range) >= 0, f"Invalid page range, values must be between 0 and {len(self.doc) - 1}. Min of provided page range is {min(self.page_range)} and max is {max(self.page_range)}."
assert max(self.page_range) < len(self.doc) and min(self.page_range) >= 0, \
f"Invalid page range, values must be between 0 and {len(self.doc) - 1}. Min of provided page range is {min(self.page_range)} and max is {max(self.page_range)}."

if self.force_ocr:
# Manually assign page bboxes, since we can't get them from pdftext
self.page_bboxes = {i: self.doc[i].get_bbox() for i in self.page_range}
else:
self.page_lines = self.pdftext_extraction()


atexit.register(self.cleanup_pdf_doc)

def __len__(self) -> int:
Expand Down Expand Up @@ -115,7 +115,8 @@ def pdftext_extraction(self) -> ProviderPageLines:
page_range=self.page_range,
keep_chars=False,
workers=self.pdftext_workers,
flatten_pdf=self.flatten_pdf
flatten_pdf=self.flatten_pdf,
quote_loosebox=False
)
self.page_bboxes = {i: [0, 0, page["width"], page["height"]] for i, page in zip(self.page_range, page_char_blocks)}

Expand Down Expand Up @@ -216,4 +217,4 @@ def get_page_bbox(self, idx: int) -> PolygonBox | None:
return PolygonBox.from_bbox(bbox)

def get_page_lines(self, idx: int) -> List[ProviderOutput]:
return self.page_lines[idx]
return self.page_lines[idx]

0 comments on commit c653494

Please sign in to comment.