From e4d7d0772d028180c901e74081c954fb15783c8a Mon Sep 17 00:00:00 2001 From: cccs-rs <62077998+cccs-rs@users.noreply.github.com> Date: Wed, 27 Mar 2024 16:05:06 +0000 Subject: [PATCH 1/2] Page browser back after every print to PDF --- document_preview/document_preview.py | 26 ++++++++------------------ 1 file changed, 8 insertions(+), 18 deletions(-) diff --git a/document_preview/document_preview.py b/document_preview/document_preview.py index 7678f0d..b87149b 100644 --- a/document_preview/document_preview.py +++ b/document_preview/document_preview.py @@ -39,9 +39,9 @@ def __init__(self, config=None): browser_options = ChromeOptions() # Set brower options depending on service configuration - browser_cfg = config.get('browser_options', {}) - [browser_options.add_argument(arg) for arg in browser_cfg.get('arguments', [])] - [browser_options.set_capability(cap_n, cap_v) for cap_n, cap_v in browser_cfg.get('capabilities', {}).items()] + browser_cfg = config.get("browser_options", {}) + [browser_options.add_argument(arg) for arg in browser_cfg.get("arguments", [])] + [browser_options.set_capability(cap_n, cap_v) for cap_n, cap_v in browser_cfg.get("capabilities", {}).items()] # Run browser in offline mode only self.browser = Chrome(options=browser_options, service=ChromeService(executable_path="/usr/bin/chromedriver")) @@ -99,24 +99,14 @@ def html_render(self, file_contents, max_pages): # Load file into browser self.browser.get(f"file://{tmp_html.name}") - # Prepare command to perform Print to PDF - resource = "/session/%s/chromium/send_command_and_get_result" % self.browser.session_id - print_options = { - "landscape": False, - "displayHeaderFooter": False, - "printBackground": True, - "preferCSSPageSize": True, - } - # Execute command and save PDF content to disk for image conversion - resp = self.browser.command_executor._request( - "POST", - url=self.browser.command_executor._url + resource, - body=json.dumps({"cmd": "Page.printToPDF", "params": print_options}), - ) - tmp_pdf.write(b64decode(resp["value"]["data"])) + tmp_pdf.write(b64decode(self.browser.print_page())) tmp_pdf.flush() + # Page browser back to the beginning (in theory we shouldn't have to go far but just in case) + while self.browser.current_url != "data:,": + self.browser.back() + # Render PDF to images self.pdf_to_images(tmp_pdf.name, max_pages) From 81793f00f80ae26bc418460ca7c2c1b5ad3a595d Mon Sep 17 00:00:00 2001 From: cccs-rs <62077998+cccs-rs@users.noreply.github.com> Date: Wed, 27 Mar 2024 16:49:20 +0000 Subject: [PATCH 2/2] Optimization: load file contents directly into browser without intermediate file --- document_preview/document_preview.py | 35 ++++++++++++---------------- 1 file changed, 15 insertions(+), 20 deletions(-) diff --git a/document_preview/document_preview.py b/document_preview/document_preview.py index b87149b..ac2c4d1 100644 --- a/document_preview/document_preview.py +++ b/document_preview/document_preview.py @@ -1,4 +1,3 @@ -import json import os import subprocess import tempfile @@ -8,7 +7,7 @@ from assemblyline_v4_service.common.request import ServiceRequest as Request from assemblyline_v4_service.common.result import Heuristic, Result, ResultImageSection, ResultTextSection -from base64 import b64decode +from base64 import b64decode, b64encode from selenium.webdriver import Chrome, ChromeOptions, ChromeService from natsort import natsorted @@ -91,24 +90,20 @@ def office_conversion(self, file, orientation="portrait", page_range_end=2): return output_path def html_render(self, file_contents, max_pages): - # Create a temporary file containing the '.html' extension so Chrome can render the document properly - with tempfile.NamedTemporaryFile(suffix=".html") as tmp_html: - tmp_html.write(file_contents) - tmp_html.flush() - with tempfile.NamedTemporaryFile(suffix=".pdf") as tmp_pdf: - # Load file into browser - self.browser.get(f"file://{tmp_html.name}") - - # Execute command and save PDF content to disk for image conversion - tmp_pdf.write(b64decode(self.browser.print_page())) - tmp_pdf.flush() - - # Page browser back to the beginning (in theory we shouldn't have to go far but just in case) - while self.browser.current_url != "data:,": - self.browser.back() - - # Render PDF to images - self.pdf_to_images(tmp_pdf.name, max_pages) + with tempfile.NamedTemporaryFile(suffix=".pdf") as tmp_pdf: + # Load base64'd contents directly into browser as HTML + self.browser.get(f"data:text/html;base64,{b64encode(file_contents).decode()}") + + # Execute command and save PDF content to disk for image conversion + tmp_pdf.write(b64decode(self.browser.print_page())) + tmp_pdf.flush() + + # Page browser back to the beginning (in theory we shouldn't have to go far but just in case) + while self.browser.current_url != "data:,": + self.browser.back() + + # Render PDF to images + self.pdf_to_images(tmp_pdf.name, max_pages) def pdf_to_images(self, file, max_pages=None): convert_from_path(file, self.working_directory, first_page=1, last_page=max_pages)