Merge pull request #115 from CybercentreCanada/hotfix/leaky_results

Page browser back after every print to PDF
CybercentreCanada · Mar 27, 2024 · 3a7ec5d · 3a7ec5d
2 parents 4364d1d + 81793f0
commit 3a7ec5d
Showing 1 changed file with 17 additions and 32 deletions.
diff --git a/document_preview/document_preview.py b/document_preview/document_preview.py
@@ -1,4 +1,3 @@
-import json
 import os
 import subprocess
 import tempfile
@@ -8,7 +7,7 @@
 from assemblyline_v4_service.common.request import ServiceRequest as Request
 from assemblyline_v4_service.common.result import Heuristic, Result, ResultImageSection, ResultTextSection
 
-from base64 import b64decode
+from base64 import b64decode, b64encode
 from selenium.webdriver import Chrome, ChromeOptions, ChromeService
 from natsort import natsorted
 
@@ -39,9 +38,9 @@ def __init__(self, config=None):
         browser_options = ChromeOptions()
 
         # Set brower options depending on service configuration
-        browser_cfg = config.get('browser_options', {})
-        [browser_options.add_argument(arg) for arg in browser_cfg.get('arguments', [])]
-        [browser_options.set_capability(cap_n, cap_v) for cap_n, cap_v in browser_cfg.get('capabilities', {}).items()]
+        browser_cfg = config.get("browser_options", {})
+        [browser_options.add_argument(arg) for arg in browser_cfg.get("arguments", [])]
+        [browser_options.set_capability(cap_n, cap_v) for cap_n, cap_v in browser_cfg.get("capabilities", {}).items()]
 
         # Run browser in offline mode only
         self.browser = Chrome(options=browser_options, service=ChromeService(executable_path="/usr/bin/chromedriver"))
@@ -91,34 +90,20 @@ def office_conversion(self, file, orientation="portrait", page_range_end=2):
             return output_path
 
     def html_render(self, file_contents, max_pages):
-        # Create a temporary file containing the '.html' extension so Chrome can render the document properly
-        with tempfile.NamedTemporaryFile(suffix=".html") as tmp_html:
-            tmp_html.write(file_contents)
-            tmp_html.flush()
-            with tempfile.NamedTemporaryFile(suffix=".pdf") as tmp_pdf:
-                # Load file into browser
-                self.browser.get(f"file://{tmp_html.name}")
-
-                # Prepare command to perform Print to PDF
-                resource = "/session/%s/chromium/send_command_and_get_result" % self.browser.session_id
-                print_options = {
-                    "landscape": False,
-                    "displayHeaderFooter": False,
-                    "printBackground": True,
-                    "preferCSSPageSize": True,
-                }
-
-                # Execute command and save PDF content to disk for image conversion
-                resp = self.browser.command_executor._request(
-                    "POST",
-                    url=self.browser.command_executor._url + resource,
-                    body=json.dumps({"cmd": "Page.printToPDF", "params": print_options}),
-                )
-                tmp_pdf.write(b64decode(resp["value"]["data"]))
-                tmp_pdf.flush()
+        with tempfile.NamedTemporaryFile(suffix=".pdf") as tmp_pdf:
+            # Load base64'd contents directly into browser as HTML
+            self.browser.get(f"data:text/html;base64,{b64encode(file_contents).decode()}")
+
+            # Execute command and save PDF content to disk for image conversion
+            tmp_pdf.write(b64decode(self.browser.print_page()))
+            tmp_pdf.flush()
+
+            # Page browser back to the beginning (in theory we shouldn't have to go far but just in case)
+            while self.browser.current_url != "data:,":
+                self.browser.back()
 
-                # Render PDF to images
-                self.pdf_to_images(tmp_pdf.name, max_pages)
+            # Render PDF to images
+            self.pdf_to_images(tmp_pdf.name, max_pages)
 
     def pdf_to_images(self, file, max_pages=None):
         convert_from_path(file, self.working_directory, first_page=1, last_page=max_pages)