Commit 12e3161 (1 parent: 3f90b65)
Showing 2 changed files with 206 additions and 0 deletions.
ynr/apps/sopn_parsing/management/commands/sopn_parsing_aws_textract.py
164 additions & 0 deletions
import os

import boto3
import trp.trp2 as t2
from django.core.management.base import BaseCommand
from textractcaller.t_call import Textract_Features, call_textract
from textractprettyprinter.t_pretty_print import (
    Textract_Pretty_Print,
    get_string,
)
from trp.t_pipeline import pipeline_merge_tables
from trp.t_tables import HeaderFooterType, MergeOptions

accepted_file_types = (".pdf", ".png", ".jpg", ".jpeg")
output_path = (
    "ynr/apps/sopn_parsing/management/commands/test_sopns/converted_sopns"
)

class Command(BaseCommand):
    def handle(self, *args, **options):
        # TO DO

        # check the number of pages and the file size
        # if greater than 11 pages or 5MB, split the document into separate pdfs
        # (one possible check is sketched in needs_splitting, below this method)

        # check the file type
        # if it is not a pdf, convert it to a pdf

        # STOP HERE
        # do we interrupt the Textract process here to match the wards,
        # and then only focus on that ward/pdf?
        # doing so at this point makes the pdf smaller and easier to process
        # we might even be able to do more robust matching at this stage and then
        # do an s3 lookup for other SoPNs that are yet to have uploads
        # CONTINUE ONCE WE HAVE A MATCH

        # upload the pdf to s3
        # call textract on the pdf
        # extract the tables from the textract response
        # store the tables in the database
        # delete the pdf from s3?

        # the pdf below is greater than 11 pages, so it needs to be split
        # before being sent to Textract; this is the case for any file
        # greater than 11 pages
        self.extract_tables()

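    # --- Editorial sketch (not part of the original commit): one possible
    # implementation of the "check the number of pages and the file size"
    # step from the TO DO notes in handle(). It reuses PyPDF2, which the
    # commented-out divide_pdf helper below already relies on; the 11-page
    # and 5 MB limits are taken from those notes.
    def needs_splitting(self, pdf_path, max_pages=11, max_bytes=5_000_000):
        """Return True if the PDF should be split before being sent to Textract."""
        import PyPDF2  # local import so this sketch stays self-contained

        too_large = os.path.getsize(pdf_path) >= max_bytes
        with open(pdf_path, "rb") as pdf_file:
            page_count = PyPDF2.PdfFileReader(pdf_file).numPages
        return too_large or page_count > max_pages
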
    # def format_for_textract(self, original_document):
    #     """Convert the file to a pdf that can be sent to textract"""
    #     # check the file type and length
    #     too_big = os.path.getsize(original_document) >= 5000000 or len(original_document) >= 11
    #     if too_big and original_document.endswith(".pdf"):
    #         self.divide_pdf(original_document, output_path)
    #     elif not too_big and original_document.endswith(accepted_file_types):
    #         # skip to textract
    #         self.extract_tables_from_multipage_sopn_image(original_document)
    #     else:
    #         self.convert_sopn(original_document, output_path)
    #         # go back to the start of the function
    #         self.format_for_textract(original_document)

    # def divide_pdf(self, original_document, output_folder):
    #     """Split the sopn into individual pages"""
    #     input_pdf_file = original_document
    #     with open(input_pdf_file, "rb") as pdf_file:
    #         pdf_reader = PyPDF2.PdfFileReader(pdf_file)

    #         # Iterate through each page in the PDF
    #         for page_number in range(pdf_reader.numPages):
    #             # Create a new PDF writer for each page
    #             pdf_writer = PyPDF2.PdfFileWriter()
    #             output_pdf_file = f"page_{page_number + 1}.pdf"

    #             # Add the current page to the new PDF writer
    #             pdf_writer.addPage(pdf_reader.getPage(page_number))

    #             # Write the current page to a new PDF file
    #             with open(output_pdf_file, "wb") as output_file:
    #                 pdf_writer.write(output_file)

    #             print(f"Page {page_number + 1} saved to {output_pdf_file}")

    # def convert_to_pdf(self, output_folder):
    #     """Convert the sopn of any accepted format to a pdf"""
    #     # convert each image in the output_folder to a pdf
    #     pass

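    # --- Editorial sketch (not part of the original commit): one possible body
    # for the convert_to_pdf stub above, assuming Pillow is available. Pillow
    # can write a multi-page PDF straight from PNG/JPEG page images.
    def convert_images_to_pdf(self, output_folder, image_count, pdf_name="converted_sopn.pdf"):
        """Combine page_1.png .. page_<image_count>.png into a single PDF."""
        from PIL import Image  # local import so this sketch stays self-contained

        pages = [
            Image.open(f"{output_folder}/page_{i}.png").convert("RGB")
            for i in range(1, image_count + 1)
        ]
        pdf_path = f"{output_folder}/{pdf_name}"
        # the first image is saved as the PDF and the rest are appended as pages
        pages[0].save(pdf_path, "PDF", save_all=True, append_images=pages[1:])
        return pdf_path
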
    # def upload_to_s3(self, file_name, bucket):
    #     """Upload a file to an S3 bucket"""
    #     s3_client = boto3.client("s3")
    #     try:
    #         response = s3_client.upload_file(file_name, bucket, file_name)
    #     except Exception as e:
    #         print(e)
    #         return False

    # def append_images(self, output_folder, image_count):
    #     """Append all the images together to make one long vertical image"""
    #     images = []
    #     for i in range(1, image_count + 1):
    #         images.append(Image.open(f"{output_folder}/page_{i}.png"))

    #     max_width = max(i.size[0] for i in images)
    #     total_height = sum(i.size[1] for i in images)

    #     new_im = Image.new("RGB", (max_width, total_height))
    #     y_offset = 0
    #     for im in images:
    #         new_im.paste(im, (0, y_offset))
    #         y_offset += im.size[1]
    #     new_im.save(f"{output_folder}/all_pages.png", "PNG", quality=80)

    #     # store this pdf in s3
    #     # self.upload_to_s3(f"{output_folder}/all_pages.pdf", "sopn-parsing")

    #     # delete individual images in the folder
    #     # for i in range(1, image_count + 1):
    #     #     os.remove(f"{output_folder}/page_{i}.png")
    #     s3_uri_of_documents = f"{output_folder}/all_pages.pdf"
    #     self.extract_tables_from_multipage_sopn_image(s3_uri_of_documents)

    # def append_split_table_pdf(self, output_folder):
    #     """Append 'page_7.png' and 'page_8.png' together to make one long vertical image"""
    #     images = []
    #     for i in range(7, 9):
    #         images.append(Image.open(f"{output_folder}/page_{i}.png"))

    #     max_width = max(i.size[0] for i in images)
    #     total_height = sum(i.size[1] for i in images)

    #     new_im = Image.new("RGB", (max_width, total_height))
    #     y_offset = 0
    #     for im in images:
    #         new_im.paste(im, (0, y_offset))
    #         y_offset += im.size[1]
    #     new_im.save(f"{output_folder}/7-8.png")

    def extract_tables(self, s3_uri_of_documents=None):
        if s3_uri_of_documents is None:
            s3_uri_of_documents = "ynr/apps/sopn_parsing/management/commands/test_sopns/converted_sopns/converted_sopn_1.pdf"
        session = boto3.session.Session(
            aws_access_key_id=os.environ.get("AWS_ACCESS_KEY_ID"),
            aws_secret_access_key=os.environ.get("AWS_SECRET_ACCESS_KEY"),
            aws_session_token=os.environ.get("AWS_SECURITY_TOKEN"),
        )
        textract_client = boto3.client("textract", region_name="us-west-1")
        textract_json = call_textract(
            input_document=s3_uri_of_documents,
            features=[Textract_Features.TABLES],
            boto3_textract_client=textract_client,
        )

        # this output is the same as the output from the demo tool
        print(
            get_string(
                textract_json=textract_json,
                output_type=[Textract_Pretty_Print.TABLES],
            )
        )

        t_document: t2.TDocument = t2.TDocumentSchema().load(textract_json)
        t_document = pipeline_merge_tables(
            t_document, MergeOptions.MERGE, None, HeaderFooterType.NONE, session
        )
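
        # --- Editorial sketch (not part of the original commit): one way the
        # merged tables could be read back out, assuming the v1 trp.Document
        # helpers from the same amazon-textract-response-parser package.
        import trp  # local import so this sketch stays self-contained

        merged_json = t2.TDocumentSchema().dump(t_document)
        doc = trp.Document(merged_json)
        for page in doc.pages:
            for table in page.tables:
                for row in table.rows:
                    print([cell.text for cell in row.cells])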

Second changed file: 42 additions & 0 deletions

```mermaid
graph TD
    subgraph Start
        A[Upload the SoPN file]
    end
    subgraph Check File Type
        B[Is it a valid PDF, JPEG, or PNG?]
        C[Is it another file type?]
    end
    subgraph Check File Length
        D[Is it more than one page?]
    end
    subgraph Processing Steps
        E[Try to convert it to a PDF]
        F[Send to Textract to extract tables]
        G[Start internal page extraction and matching]
        H[Failure: Save to S3 and process manually]
        I[Success: Proceed to parsing]
    end
    A --> B
    A --> C
    B --> D
    C --> E
    D -->|Yes| G
    D -->|No| F
    E --> B
    F --> H
    F --> I
    G --> H
    G --> I
```
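
Read as code, the diagram above amounts to a small routing decision. The sketch below is an illustrative, non-authoritative rendering of that flow in Python; the callables it accepts (`convert_to_pdf`, `send_to_textract`, `extract_and_match_pages`, `save_for_manual_processing`) are hypothetical placeholders for the real processing steps, not functions that exist in this repository.

```python
from pathlib import Path

ACCEPTED_TYPES = {".pdf", ".jpeg", ".jpg", ".png"}


def route_sopn(
    path,
    page_count,
    convert_to_pdf,
    send_to_textract,
    extract_and_match_pages,
    save_for_manual_processing,
):
    """Route an uploaded SoPN following the flow in the diagram above.

    All callables are hypothetical placeholders for the real steps.
    """
    suffix = Path(path).suffix.lower()
    if suffix not in ACCEPTED_TYPES:
        # "Is it another file type?": try to convert it, then re-check
        path = convert_to_pdf(path)
        suffix = Path(path).suffix.lower()
        if suffix not in ACCEPTED_TYPES:
            # conversion failed: save to S3 and process manually
            return save_for_manual_processing(path)

    if page_count > 1:
        # multi-page documents go through page extraction and ward matching first
        return extract_and_match_pages(path)

    # single-page documents are sent straight to Textract for table extraction
    return send_to_textract(path)
```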