Skip to content

Commit

Permalink
Visualise SoPN workflow
Browse files Browse the repository at this point in the history
  • Loading branch information
VirginiaDooley committed Sep 25, 2023
1 parent 3f90b65 commit 12e3161
Show file tree
Hide file tree
Showing 2 changed files with 206 additions and 0 deletions.
164 changes: 164 additions & 0 deletions ynr/apps/sopn_parsing/management/commands/sopn_parsing_aws_textract.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,164 @@
import os

import boto3
import trp.trp2 as t2
from django.core.management.base import BaseCommand
from textractcaller.t_call import Textract_Features, call_textract
from textractprettyprinter.t_pretty_print import (
Textract_Pretty_Print,
get_string,
)
from trp.t_pipeline import pipeline_merge_tables
from trp.t_tables import HeaderFooterType, MergeOptions

# File extensions treated as acceptable SoPN uploads. In the visible part of
# this module the tuple is only referenced by the commented-out draft helpers.
accepted_file_types = (".pdf", ".png", ".jpg", ".jpeg")

# Directory (relative to the project root) that holds converted test SoPNs.
output_path = "/".join(
    (
        "ynr/apps/sopn_parsing/management/commands",
        "test_sopns",
        "converted_sopns",
    )
)


class Command(BaseCommand):
    """Prototype command: send a SoPN document to AWS Textract and merge its tables.

    The workflow is still being designed; the TODO notes in ``handle`` and the
    commented-out draft helpers below record the intended steps.
    """

    help = "Extract tables from a SoPN document using AWS Textract (prototype)"

    # Document used when ``extract_tables`` is called without an explicit one.
    default_document_path = f"{output_path}/converted_sopn_1.pdf"

    def handle(self, *args, **options):
        """Run the table extraction against the default test document."""
        # TO DO

        # check the number of pages and the file size
        # if greater than 11 pages or 5MB, split the document into separate pdfs

        # check the file type
        # if it is not a pdf, convert it to a pdf

        # STOP HERE
        # do we interrupt the textract process here to match the wards?
        # and then only focus on that ward/pdf?
        # doing so at this point makes the pdf smaller and easier to process
        # we might even be able to do more robust matching at this stage and then
        # do an s3 lookup for other sopns that are yet to have uploads
        # CONTINUE ONCE WE HAVE A MATCH

        # upload the pdf to s3
        # call textract on the pdf
        # extract the tables from the textract response
        # store the tables in the database
        # delete the pdf from s3?

        # the pdf below is greater than 11 pages so needs to
        # be split before being sent to textract
        # this is the case for any file greater than 11 pages
        self.extract_tables()

    # def format_for_textract(self, original_document):
    #     """Convert the file to a pdf that can be sent to textract"""
    #     # check the file type and length
    #     too_big = os.path.getsize(original_document) >= 5000000 or len(original_document) >= 11
    #     if too_big and original_document.endswith(".pdf"):
    #         self.divide_pdf(original_document, output_path)
    #     elif not too_big and original_document.endswith(accepted_file_types):
    #         # skip to textract
    #         self.extract_tables_from_multipage_sopn_image(original_document)
    #     else:
    #         self.convert_sopn(original_document, output_path)
    #         # go back to the start of the function
    #         self.format_for_textract(original_document)

    # def divide_pdf(self, original_document, output_folder):
    #     """Split the sopn into individual pages"""
    #     input_pdf_file = original_document
    #     with open(input_pdf_file, "rb") as pdf_file:
    #         pdf_reader = PyPDF2.PdfFileReader(pdf_file)

    #         # Iterate through each page in the PDF
    #         for page_number in range(pdf_reader.numPages):
    #             # Create a new PDF writer for each page
    #             pdf_writer = PyPDF2.PdfFileWriter()
    #             output_pdf_file = f"page_{page_number + 1}.pdf"

    #             # Add the current page to the new PDF writer
    #             pdf_writer.addPage(pdf_reader.getPage(page_number))

    #             # Write the current page to a new PDF file
    #             with open(output_pdf_file, "wb") as output_file:
    #                 pdf_writer.write(output_file)

    #             print(f"Page {page_number + 1} saved to {output_pdf_file}")

    # def convert_to_pdf(self, output_folder):
    #     """Convert the sopn of any accepted format to a pdf"""
    #     # convert each image in the output_folder to a pdf
    #     pass

    # def upload_to_s3(self, file_name, bucket):
    #     """Upload a file to an S3 bucket
    #     """
    #     s3_client = boto3.client("s3")
    #     try:
    #         response = s3_client.upload_file(file_name, bucket, file_name)
    #     except Exception as e:
    #         print(e)
    #         return False

    # def append_images(self, output_folder, image_count):
    #     """Append all the images together to make one long vertical image"""
    #     images = []
    #     for i in range(1, image_count + 1):
    #         images.append(Image.open(f"{output_folder}/page_{i}.png"))

    #     max_width = max(i.size[0] for i in images)
    #     total_height = sum(i.size[1] for i in images)

    #     new_im = Image.new("RGB", (max_width, total_height))
    #     y_offset = 0
    #     for im in images:
    #         new_im.paste(im, (0, y_offset))
    #         y_offset += im.size[1]
    #     new_im.save(f"{output_folder}/all_pages.png", "PNG", quality=80)

    #     # store this pdf in s3
    #     # self.upload_to_s3(f"{output_folder}/all_pages.pdf", "sopn-parsing")

    #     # delete individual images in the folder
    #     # for i in range(1, image_count + 1):
    #     #     os.remove(f"{output_folder}/page_{i}.png")
    #     s3_uri_of_documents = f"{output_folder}/all_pages.pdf"
    #     self.extract_tables_from_multipage_sopn_image(s3_uri_of_documents)

    # def append_split_table_pdf(self, output_folder):
    #     """Append 'page_7.png' and "page_8.png" together to make one long vertical image"""
    #     images = []
    #     for i in range(7, 9):
    #         images.append(Image.open(f"{output_folder}/page_{i}.png"))

    #     max_width = max(i.size[0] for i in images)
    #     total_height = sum(i.size[1] for i in images)

    #     new_im = Image.new("RGB", (max_width, total_height))
    #     y_offset = 0
    #     for im in images:
    #         new_im.paste(im, (0, y_offset))
    #         y_offset += im.size[1]
    #     new_im.save(f"{output_folder}/7-8.png")

    def extract_tables(self, s3_uri_of_documents=None):
        """Call Textract on a document, print the result and merge its tables.

        Args:
            s3_uri_of_documents: Value handed to ``call_textract`` as
                ``input_document``. When omitted, ``default_document_path``
                is used so the command can run without arguments.

        Returns:
            The ``TDocument`` produced by ``pipeline_merge_tables``.
        """
        if s3_uri_of_documents is None:
            s3_uri_of_documents = self.default_document_path

        session = boto3.session.Session(
            aws_access_key_id=os.environ.get("AWS_ACCESS_KEY_ID"),
            aws_secret_access_key=os.environ.get("AWS_SECRET_ACCESS_KEY"),
            aws_session_token=os.environ.get("AWS_SECURITY_TOKEN"),
        )
        # Create the client from the session so the credentials read above
        # are the ones actually used for the Textract call.
        textract_client = session.client("textract", region_name="us-west-1")
        textract_json = call_textract(
            input_document=s3_uri_of_documents,
            features=[Textract_Features.TABLES],
            boto3_textract_client=textract_client,
        )

        # this output is the same as the output from the demo tool
        self.stdout.write(
            get_string(
                textract_json=textract_json, output_type=Textract_Pretty_Print
            )
        )

        t_document: t2.TDocument = t2.TDocumentSchema().load(textract_json)
        # The boto3 session is deliberately not passed here: the fifth
        # positional parameter of pipeline_merge_tables is a numeric accuracy
        # threshold, not a session, so the library default is used instead.
        t_document = pipeline_merge_tables(
            t_document, MergeOptions.MERGE, None, HeaderFooterType.NONE
        )
        return t_document
42 changes: 42 additions & 0 deletions ynr/apps/sopn_parsing/sopn_processing.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@

```mermaid
graph TD
subgraph Start
A[Upload the SoPN file]
end
subgraph Check File Type
B[Is it a valid PDF, JPEG, or PNG?]
C[Is it another file type?]
end
subgraph Check File Length
D[Is it more than one page?]
end
subgraph Processing Steps
E[Try to convert it to a PDF]
F[Send to Textract to extract tables]
G[Start internal page extraction and matching]
H[Failure: Save to S3 and process manually]
I[Success: Proceed to parsing]
end
A --> B
A --> C
B --> D
C --> E
D -->|Yes| G
D -->|No| F
E --> B
F --> H
F --> I
G --> H
G --> I
```

0 comments on commit 12e3161

Please sign in to comment.