From 12e3161aa825ec6b2a89ac52379f8b3d1eb713bc Mon Sep 17 00:00:00 2001
From: Virginia Dooley
Date: Thu, 21 Sep 2023 12:28:06 +0100
Subject: [PATCH] Visualise SoPN workflow

---
 .../commands/sopn_parsing_aws_textract.py     | 184 ++++++++++++++++++
 ynr/apps/sopn_parsing/sopn_processing.md      |  42 +++++
 2 files changed, 226 insertions(+)
 create mode 100644 ynr/apps/sopn_parsing/management/commands/sopn_parsing_aws_textract.py
 create mode 100644 ynr/apps/sopn_parsing/sopn_processing.md

diff --git a/ynr/apps/sopn_parsing/management/commands/sopn_parsing_aws_textract.py b/ynr/apps/sopn_parsing/management/commands/sopn_parsing_aws_textract.py
new file mode 100644
index 000000000..4da016517
--- /dev/null
+++ b/ynr/apps/sopn_parsing/management/commands/sopn_parsing_aws_textract.py
@@ -0,0 +1,184 @@
+import os
+
+import boto3
+import trp.trp2 as t2
+from django.core.management.base import BaseCommand
+from textractcaller.t_call import Textract_Features, call_textract
+from textractprettyprinter.t_pretty_print import (
+    Textract_Pretty_Print,
+    get_string,
+)
+from trp.t_pipeline import pipeline_merge_tables
+from trp.t_tables import HeaderFooterType, MergeOptions
+
+accepted_file_types = (".pdf", ".png", ".jpg", ".jpeg")
+output_path = (
+    "ynr/apps/sopn_parsing/management/commands/test_sopns/converted_sopns"
+)
+
+
+class Command(BaseCommand):
+    """Prototype command that sends a test SoPN to AWS Textract."""
+
+    def handle(self, *args, **options):
+        """Run table extraction against the default test SoPN."""
+        # TO DO
+
+        # check the number of pages and the file size
+        # if greater than 11 pages or 5MB, split the document into separate pdfs
+
+        # check the file type
+        # if it is not a pdf, convert it to a pdf
+
+        # STOP HERE
+        # do we interrupt the textract process here to match the wards?
+        # and then only focus on that ward/pdf?
+        # doing so at this point makes the pdf smaller and easier to process
+        # we might even be able to do more robust matching at this stage and then
+        # do an s3 lookup for other sopns that are yet to have uploads
+        # CONTINUE ONCE WE HAVE A MATCH
+
+        # upload the pdf to s3
+        # call textract on the pdf
+        # extract the tables from the textract response
+        # store the tables in the database
+        # delete the pdf from s3?
+
+        # the pdf below is greater than 11 pages so needs to
+        # be split before being sent to textract
+        # this is the case for any file greater than 11 pages
+        self.extract_tables()
+
+    # def format_for_textract(self, original_document):
+    #     """Convert the file to a pdf that can be sent to textract"""
+    #     # check the file type and length
+    #     too_big = os.path.getsize(original_document) >= 5000000 or len(original_document) >= 11
+    #     if too_big and original_document.endswith(".pdf"):
+    #         self.divide_pdf(original_document, output_path)
+    #     elif not too_big and original_document.endswith(accepted_file_types):
+    #         # skip to textract
+    #         self.extract_tables_from_multipage_sopn_image(original_document)
+    #     else:
+    #         self.convert_sopn(original_document, output_path)
+    #         # go back to the start of the function
+    #         self.format_for_textract(original_document)
+
+    # def divide_pdf(self, original_document, output_folder):
+    #     """Split the sopn into individual pages"""
+    #     input_pdf_file = original_document
+    #     with open(input_pdf_file, "rb") as pdf_file:
+    #         pdf_reader = PyPDF2.PdfFileReader(pdf_file)
+
+    #         # Iterate through each page in the PDF
+    #         for page_number in range(pdf_reader.numPages):
+    #             # Create a new PDF writer for each page
+    #             pdf_writer = PyPDF2.PdfFileWriter()
+    #             output_pdf_file = f"page_{page_number + 1}.pdf"
+
+    #             # Add the current page to the new PDF writer
+    #             pdf_writer.addPage(pdf_reader.getPage(page_number))
+
+    #             # Write the current page to a new PDF file
+    #             with open(output_pdf_file, "wb") as output_file:
+    #                 pdf_writer.write(output_file)
+
+    #             print(f"Page {page_number + 1} saved to {output_pdf_file}")
+
+    # def convert_to_pdf(self, output_folder):
+    #     """Convert the sopn of any accepted format to a pdf"""
+    #     # convert each image in the output_folder to a pdf
+    #     pass
+
+    # def upload_to_s3(self, file_name, bucket):
+    #     """Upload a file to an S3 bucket
+    #     """
+    #     s3_client = boto3.client("s3")
+    #     try:
+    #         response = s3_client.upload_file(file_name, bucket, file_name)
+    #     except Exception as e:
+    #         print(e)
+    #         return False
+
+    # def append_images(self, output_folder, image_count):
+    #     """Append all the images together to make one long vertical image"""
+    #     images = []
+    #     for i in range(1, image_count + 1):
+    #         images.append(Image.open(f"{output_folder}/page_{i}.png"))
+
+    #     max_width = max(i.size[0] for i in images)
+    #     total_height = sum(i.size[1] for i in images)
+
+    #     new_im = Image.new("RGB", (max_width, total_height))
+    #     y_offset = 0
+    #     for im in images:
+    #         new_im.paste(im, (0, y_offset))
+    #         y_offset += im.size[1]
+    #     new_im.save(f"{output_folder}/all_pages.png", "PNG", quality=80)
+
+    #     # store this pdf in s3
+    #     # self.upload_to_s3(f"{output_folder}/all_pages.pdf", "sopn-parsing")
+
+    #     # delete individual images in the folder
+    #     # for i in range(1, image_count + 1):
+    #     #     os.remove(f"{output_folder}/page_{i}.png")
+    #     s3_uri_of_documents = f"{output_folder}/all_pages.pdf"
+    #     self.extract_tables_from_multipage_sopn_image(s3_uri_of_documents)
+
+    # def append_split_table_pdf(self, output_folder):
+    #     """Append 'page_7.png' and "page_8.png" together to make one long vertical image"""
+    #     images = []
+    #     for i in range(7, 9):
+    #         images.append(Image.open(f"{output_folder}/page_{i}.png"))
+
+    #     max_width = max(i.size[0] for i in images)
+    #     total_height = sum(i.size[1] for i in images)
+
+    #     new_im = Image.new("RGB", (max_width, total_height))
+    #     y_offset = 0
+    #     for im in images:
+    #         new_im.paste(im, (0, y_offset))
+    #         y_offset += im.size[1]
+    #     new_im.save(f"{output_folder}/7-8.png")
+
+    def extract_tables(self, s3_uri_of_documents=None):
+        """
+        Run Textract table analysis on a document and merge split tables.
+
+        s3_uri_of_documents is handed to call_textract as the input
+        document. When it is omitted the bundled test SoPN is used, so
+        the argument-less call in handle() works.
+
+        Prints the pretty-printed Textract response and returns the
+        TDocument produced by pipeline_merge_tables.
+        """
+        if s3_uri_of_documents is None:
+            s3_uri_of_documents = f"{output_path}/converted_sopn_1.pdf"
+        session = boto3.session.Session(
+            aws_access_key_id=os.environ.get("AWS_ACCESS_KEY_ID"),
+            aws_secret_access_key=os.environ.get("AWS_SECRET_ACCESS_KEY"),
+            # AWS_SESSION_TOKEN is the current variable name; the legacy
+            # AWS_SECURITY_TOKEN is kept as a fallback
+            aws_session_token=os.environ.get("AWS_SESSION_TOKEN")
+            or os.environ.get("AWS_SECURITY_TOKEN"),
+        )
+        # build the client from the session so the credentials above are used
+        textract_client = session.client("textract", region_name="us-west-1")
+        textract_json = call_textract(
+            input_document=s3_uri_of_documents,
+            features=[Textract_Features.TABLES],
+            boto3_textract_client=textract_client,
+        )
+
+        # this output is the same as the output from the demo tool
+        print(
+            get_string(
+                textract_json=textract_json, output_type=Textract_Pretty_Print
+            )
+        )
+
+        t_document: t2.TDocument = t2.TDocumentSchema().load(textract_json)
+        # the fifth positional argument of pipeline_merge_tables is
+        # accuracy_percentage, not a boto3 session, so it is left at its default
+        return pipeline_merge_tables(
+            t_document, MergeOptions.MERGE, None, HeaderFooterType.NONE
+        )
diff --git a/ynr/apps/sopn_parsing/sopn_processing.md b/ynr/apps/sopn_parsing/sopn_processing.md
new file mode 100644
index 000000000..739c1e7e5
--- /dev/null
+++ b/ynr/apps/sopn_parsing/sopn_processing.md
@@ -0,0 +1,42 @@
+
+```mermaid
+graph TD
+    subgraph Start
+    A[Upload the SoPN file]
+end
+
+subgraph Check File Type
+    B[Is it a valid PDF, JPEG, or PNG?]
+    C[Is it another file type?]
+end
+
+subgraph Check File Length
+    D[Is it more than one page?]
+end
+
+subgraph Processing Steps
+    E[Try to convert it to a PDF]
+    F[Send to Textract to extract tables]
+    G[Start internal page extraction and matching]
+    H[Failure: Save to S3 and process manually]
+    I[Success: Proceed to parsing]
+
+end
+
+A --> B
+A --> C
+B --> D
+C --> E
+
+
+D -->|Yes| G
+D -->|No| F
+
+E --> B
+F --> H
+F --> I
+G --> H
+G --> I
+
+
+```
\ No newline at end of file