Commit 12e3161 (1 parent: 3f90b65)
Showing 2 changed files with 206 additions and 0 deletions.
ynr/apps/sopn_parsing/management/commands/sopn_parsing_aws_textract.py
164 additions & 0 deletions
import os

import boto3
import trp.trp2 as t2
from django.core.management.base import BaseCommand
from textractcaller.t_call import Textract_Features, call_textract
from textractprettyprinter.t_pretty_print import (
    Textract_Pretty_Print,
    get_string,
)
from trp.t_pipeline import pipeline_merge_tables
from trp.t_tables import HeaderFooterType, MergeOptions

accepted_file_types = (".pdf", ".png", ".jpg", ".jpeg")
output_path = (
    "ynr/apps/sopn_parsing/management/commands/test_sopns/converted_sopns"
)

class Command(BaseCommand):
    def handle(self, *args, **options):
        # TO DO

        # check the number of pages and the file size
        # if greater than 11 pages or 5MB, split the document into separate pdfs
        # (one possible check is sketched in needs_splitting, below this method)

        # check the file type
        # if it is not a pdf, convert it to a pdf

        # STOP HERE
        # do we interrupt the Textract process here to match the wards,
        # and then only focus on that ward/pdf?
        # doing so at this point makes the pdf smaller and easier to process
        # we might even be able to do more robust matching at this stage and then
        # do an s3 lookup for other SoPNs that are yet to have uploads
        # CONTINUE ONCE WE HAVE A MATCH

        # upload the pdf to s3
        # call textract on the pdf
        # extract the tables from the textract response
        # store the tables in the database
        # delete the pdf from s3?

        # the pdf below is greater than 11 pages, so it needs to be split
        # before being sent to Textract; this is the case for any file
        # greater than 11 pages
        self.extract_tables()

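    # --- Editorial sketch (not part of the original commit): one possible
    # implementation of the "check the number of pages and the file size"
    # step from the TO DO notes in handle(). It reuses PyPDF2, which the
    # commented-out divide_pdf helper below already relies on; the 11-page
    # and 5 MB limits are taken from those notes.
    def needs_splitting(self, pdf_path, max_pages=11, max_bytes=5_000_000):
        """Return True if the PDF should be split before being sent to Textract."""
        import PyPDF2  # local import so this sketch stays self-contained

        too_large = os.path.getsize(pdf_path) >= max_bytes
        with open(pdf_path, "rb") as pdf_file:
            page_count = PyPDF2.PdfFileReader(pdf_file).numPages
        return too_large or page_count > max_pages
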
    # def format_for_textract(self, original_document):
    #     """Convert the file to a pdf that can be sent to textract"""
    #     # check the file type and length
    #     too_big = os.path.getsize(original_document) >= 5000000 or len(original_document) >= 11
    #     if too_big and original_document.endswith(".pdf"):
    #         self.divide_pdf(original_document, output_path)
    #     elif not too_big and original_document.endswith(accepted_file_types):
    #         # skip to textract
    #         self.extract_tables_from_multipage_sopn_image(original_document)
    #     else:
    #         self.convert_sopn(original_document, output_path)
    #         # go back to the start of the function
    #         self.format_for_textract(original_document)

    # def divide_pdf(self, original_document, output_folder):
    #     """Split the sopn into individual pages"""
    #     input_pdf_file = original_document
    #     with open(input_pdf_file, "rb") as pdf_file:
    #         pdf_reader = PyPDF2.PdfFileReader(pdf_file)

    #         # Iterate through each page in the PDF
    #         for page_number in range(pdf_reader.numPages):
    #             # Create a new PDF writer for each page
    #             pdf_writer = PyPDF2.PdfFileWriter()
    #             output_pdf_file = f"page_{page_number + 1}.pdf"

    #             # Add the current page to the new PDF writer
    #             pdf_writer.addPage(pdf_reader.getPage(page_number))

    #             # Write the current page to a new PDF file
    #             with open(output_pdf_file, "wb") as output_file:
    #                 pdf_writer.write(output_file)

    #             print(f"Page {page_number + 1} saved to {output_pdf_file}")

    # def convert_to_pdf(self, output_folder):
    #     """Convert the sopn of any accepted format to a pdf"""
    #     # convert each image in the output_folder to a pdf
    #     pass

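    # --- Editorial sketch (not part of the original commit): one possible body
    # for the convert_to_pdf stub above, assuming Pillow is available. Pillow
    # can write a multi-page PDF straight from PNG/JPEG page images.
    def convert_images_to_pdf(self, output_folder, image_count, pdf_name="converted_sopn.pdf"):
        """Combine page_1.png .. page_<image_count>.png into a single PDF."""
        from PIL import Image  # local import so this sketch stays self-contained

        pages = [
            Image.open(f"{output_folder}/page_{i}.png").convert("RGB")
            for i in range(1, image_count + 1)
        ]
        pdf_path = f"{output_folder}/{pdf_name}"
        # the first image is saved as the PDF and the rest are appended as pages
        pages[0].save(pdf_path, "PDF", save_all=True, append_images=pages[1:])
        return pdf_path
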
    # def upload_to_s3(self, file_name, bucket):
    #     """Upload a file to an S3 bucket"""
    #     s3_client = boto3.client("s3")
    #     try:
    #         response = s3_client.upload_file(file_name, bucket, file_name)
    #     except Exception as e:
    #         print(e)
    #         return False

    # def append_images(self, output_folder, image_count):
    #     """Append all the images together to make one long vertical image"""
    #     images = []
    #     for i in range(1, image_count + 1):
    #         images.append(Image.open(f"{output_folder}/page_{i}.png"))

    #     max_width = max(i.size[0] for i in images)
    #     total_height = sum(i.size[1] for i in images)

    #     new_im = Image.new("RGB", (max_width, total_height))
    #     y_offset = 0
    #     for im in images:
    #         new_im.paste(im, (0, y_offset))
    #         y_offset += im.size[1]
    #     new_im.save(f"{output_folder}/all_pages.png", "PNG", quality=80)

    #     # store this pdf in s3
    #     # self.upload_to_s3(f"{output_folder}/all_pages.pdf", "sopn-parsing")

    #     # delete individual images in the folder
    #     # for i in range(1, image_count + 1):
    #     #     os.remove(f"{output_folder}/page_{i}.png")
    #     s3_uri_of_documents = f"{output_folder}/all_pages.pdf"
    #     self.extract_tables_from_multipage_sopn_image(s3_uri_of_documents)

    # def append_split_table_pdf(self, output_folder):
    #     """Append 'page_7.png' and 'page_8.png' together to make one long vertical image"""
    #     images = []
    #     for i in range(7, 9):
    #         images.append(Image.open(f"{output_folder}/page_{i}.png"))

    #     max_width = max(i.size[0] for i in images)
    #     total_height = sum(i.size[1] for i in images)

    #     new_im = Image.new("RGB", (max_width, total_height))
    #     y_offset = 0
    #     for im in images:
    #         new_im.paste(im, (0, y_offset))
    #         y_offset += im.size[1]
    #     new_im.save(f"{output_folder}/7-8.png")

    def extract_tables(self, s3_uri_of_documents=None):
        if s3_uri_of_documents is None:
            s3_uri_of_documents = "ynr/apps/sopn_parsing/management/commands/test_sopns/converted_sopns/converted_sopn_1.pdf"
        session = boto3.session.Session(
            aws_access_key_id=os.environ.get("AWS_ACCESS_KEY_ID"),
            aws_secret_access_key=os.environ.get("AWS_SECRET_ACCESS_KEY"),
            aws_session_token=os.environ.get("AWS_SECURITY_TOKEN"),
        )
        textract_client = boto3.client("textract", region_name="us-west-1")
        textract_json = call_textract(
            input_document=s3_uri_of_documents,
            features=[Textract_Features.TABLES],
            boto3_textract_client=textract_client,
        )

        # this output is the same as the output from the demo tool
        print(
            get_string(
                textract_json=textract_json,
                output_type=[Textract_Pretty_Print.TABLES],
            )
        )

        t_document: t2.TDocument = t2.TDocumentSchema().load(textract_json)
        t_document = pipeline_merge_tables(
            t_document, MergeOptions.MERGE, None, HeaderFooterType.NONE, session
        )
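
        # --- Editorial sketch (not part of the original commit): one way the
        # merged tables could be read back out, assuming the v1 trp.Document
        # helpers from the same amazon-textract-response-parser package.
        import trp  # local import so this sketch stays self-contained

        merged_json = t2.TDocumentSchema().dump(t_document)
        doc = trp.Document(merged_json)
        for page in doc.pages:
            for table in page.tables:
                for row in table.rows:
                    print([cell.text for cell in row.cells])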

Second changed file: 42 additions & 0 deletions

```mermaid
graph TD
    subgraph Start
        A[Upload the SoPN file]
    end
    subgraph Check File Type
        B[Is it a valid PDF, JPEG, or PNG?]
        C[Is it another file type?]
    end
    subgraph Check File Length
        D[Is it more than one page?]
    end
    subgraph Processing Steps
        E[Try to convert it to a PDF]
        F[Send to Textract to extract tables]
        G[Start internal page extraction and matching]
        H[Failure: Save to S3 and process manually]
        I[Success: Proceed to parsing]
    end
    A --> B
    A --> C
    B --> D
    C --> E
    D -->|Yes| G
    D -->|No| F
    E --> B
    F --> H
    F --> I
    G --> H
    G --> I
```
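
Read as code, the diagram above amounts to a small routing decision. The sketch below is an illustrative, non-authoritative rendering of that flow in Python; the callables it accepts (`convert_to_pdf`, `send_to_textract`, `extract_and_match_pages`, `save_for_manual_processing`) are hypothetical placeholders for the real processing steps, not functions that exist in this repository.

```python
from pathlib import Path

ACCEPTED_TYPES = {".pdf", ".jpeg", ".jpg", ".png"}


def route_sopn(
    path,
    page_count,
    convert_to_pdf,
    send_to_textract,
    extract_and_match_pages,
    save_for_manual_processing,
):
    """Route an uploaded SoPN following the flow in the diagram above.

    All callables are hypothetical placeholders for the real steps.
    """
    suffix = Path(path).suffix.lower()
    if suffix not in ACCEPTED_TYPES:
        # "Is it another file type?": try to convert it, then re-check
        path = convert_to_pdf(path)
        suffix = Path(path).suffix.lower()
        if suffix not in ACCEPTED_TYPES:
            # conversion failed: save to S3 and process manually
            return save_for_manual_processing(path)

    if page_count > 1:
        # multi-page documents go through page extraction and ward matching first
        return extract_and_match_pages(path)

    # single-page documents are sent straight to Textract for table extraction
    return send_to_textract(path)
```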