Skip to content

Commit

Permalink
fix(text_extraction): Update formatting and docstrings
Browse files Browse the repository at this point in the history
  • Loading branch information
flooie committed May 29, 2024
1 parent c246bef commit f9c0b3d
Showing 1 changed file with 44 additions and 28 deletions.
72 changes: 44 additions & 28 deletions doctor/lib/text_extraction.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,15 +44,17 @@ def get_page_text(page: pdfplumber.PDF.pages, strip_margin: bool) -> str:
if strip_margin:
# Crop margins and remove skewed text
_, _, width, height = page.bbox
pixels_per_inch = width / 8.5
bbox = (
(1 / 8.5 * width),
(1 / 11 * height),
(7.5 / 8.5 * width),
(10 / 11 * height),
pixels_per_inch * 1, # 1 inch from left edge
pixels_per_inch * 1, # 1 inch down from top
pixels_per_inch
* 7.5, # 7.5 inches from left edge (1 inch from right)
pixels_per_inch * 10, # 10 inches from top (1 inch from bottom)
)
doc_text = (
page.crop(bbox)
.filter(deskew)
.filter(is_skewed)
.extract_text(
layout=True, keep_blank_chars=True, y_tolerance=5, y_density=25
)
Expand All @@ -65,18 +67,18 @@ def get_page_text(page: pdfplumber.PDF.pages, strip_margin: bool) -> str:


def page_images(page: pdfplumber.pdf.Page) -> bool:
"""Does the page have images of a certain size
"""Does the page have images that are large enough to contain text
:param page: pdf plumber
:param page: pdf plumber page
:return: True if page contains images of a certain size
"""
for img in page.images:
if (
img.get("width") / page.width * img.get("height") / page.height
> 0.1
) or img.get("width") * img.get("width") > 10:
return True
return False
return any(
[
image
for image in page.images
if image["width"] > 10 and image["height"] > 10
]
)


def page_annotations(page: pdfplumber.pdf.Page) -> bool:
Expand Down Expand Up @@ -154,25 +156,23 @@ def page_needs_ocr(page: pdfplumber.pdf.Page, page_text: str) -> bool:
def convert_pdf_page_to_image(
page: pdfplumber.pdf.Page, strip_margin: bool
) -> Image:
"""Conver page to image and crop margin if applicable
"""Convert page to image and crop margin if applicable
:param page:Pdf Plumber
:param strip_margin: bool
:return: The formatted image
:param page: the pdf page
:param strip_margin: whether to crop the margin
:return: The cropped page image
"""
img = page.to_image(resolution=300)
_, _, w, h = page.bbox
width = w * img.scale
height = h * img.scale

if strip_margin == True:
# Because this is OCR - I think its reasonable to crop half the standard
# 1 inch margin - this leads to much better results and helps reduce
pixels_per_inch = width / 8.5
bbox = (
(1 / (8.5 * 2) * width),
(1 / (11 * 2) * height),
(16 / (8.5 * 2) * width),
(21 / (11 * 2) * height),
pixels_per_inch * 0.5, # .5" from left edge
pixels_per_inch * 0.5, # .5" down from top
pixels_per_inch * 8, # 8" from left edge (.5" from right)
pixels_per_inch * 10.5, # 10.5" from top (.5" from bottom)
)
image = img.original.crop(bbox)
else:
Expand Down Expand Up @@ -321,9 +321,23 @@ def get_word(word_dict: dict, width: float, strip_margin: bool) -> str:
def cleanup_content(content: str, page_number: int) -> str:
"""Reduce legal document line clutter
Scans containing line numbers or bad scans have pipe issues this simply
tries to reduce the noise and artifacts and align caption lines
if easy
This function performs several operations to clean up the text extracted from legal documents:
1. On the first page, it smooths out vertical lines if they are detected.
2. It removes pipes ('|') that might start a line repeatedly.
3. It removes artifacts that appear at the end of a line of text, specifically single characters
following at least 10 whitespace characters, reducing right margin edge artifacts.
4. It removes excess left margin whitespace to improve readability and formatting.
Example:
If the pipes below represent the page edge (not characters):
| we can remove the
| the left whitespace
| and shift this entire
| page over four characters
| which keeps formatting and
| makes the text easier to
| read and process with the API.
:param content: the page content extracted
:param page_number: the page number
Expand All @@ -341,6 +355,7 @@ def cleanup_content(content: str, page_number: int) -> str:
pattern = r"\s{10,}[a-zA-Z0-9|] $"
content = re.sub(pattern, "", content, flags=re.MULTILINE)

# shift text left if possible and remove excess start and end whitespace
content = remove_excess_whitespace(content)
return f"{content}\n"

Expand All @@ -349,6 +364,7 @@ def remove_excess_whitespace(document: str) -> str:
"""Remove excess whitespace from OCR
This function removes empty lines of text at the start and end of a document
and shifts the page left if possible
:param document: text of the document
:return: Document with excess whitespace removed
Expand Down

0 comments on commit f9c0b3d

Please sign in to comment.