Skip to content

Commit

Permalink
Merge pull request #115 from CybercentreCanada/hotfix/leaky_results
Browse files Browse the repository at this point in the history
Page browser back after every print to PDF
  • Loading branch information
cccs-rs authored Mar 27, 2024
2 parents 4364d1d + 81793f0 commit 3a7ec5d
Showing 1 changed file with 17 additions and 32 deletions.
49 changes: 17 additions & 32 deletions document_preview/document_preview.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
import json
import os
import subprocess
import tempfile
Expand All @@ -8,7 +7,7 @@
from assemblyline_v4_service.common.request import ServiceRequest as Request
from assemblyline_v4_service.common.result import Heuristic, Result, ResultImageSection, ResultTextSection

from base64 import b64decode
from base64 import b64decode, b64encode
from selenium.webdriver import Chrome, ChromeOptions, ChromeService
from natsort import natsorted

Expand Down Expand Up @@ -39,9 +38,9 @@ def __init__(self, config=None):
browser_options = ChromeOptions()

# Set brower options depending on service configuration
browser_cfg = config.get('browser_options', {})
[browser_options.add_argument(arg) for arg in browser_cfg.get('arguments', [])]
[browser_options.set_capability(cap_n, cap_v) for cap_n, cap_v in browser_cfg.get('capabilities', {}).items()]
browser_cfg = config.get("browser_options", {})
[browser_options.add_argument(arg) for arg in browser_cfg.get("arguments", [])]
[browser_options.set_capability(cap_n, cap_v) for cap_n, cap_v in browser_cfg.get("capabilities", {}).items()]

# Run browser in offline mode only
self.browser = Chrome(options=browser_options, service=ChromeService(executable_path="/usr/bin/chromedriver"))
Expand Down Expand Up @@ -91,34 +90,20 @@ def office_conversion(self, file, orientation="portrait", page_range_end=2):
return output_path

def html_render(self, file_contents, max_pages):
# Create a temporary file containing the '.html' extension so Chrome can render the document properly
with tempfile.NamedTemporaryFile(suffix=".html") as tmp_html:
tmp_html.write(file_contents)
tmp_html.flush()
with tempfile.NamedTemporaryFile(suffix=".pdf") as tmp_pdf:
# Load file into browser
self.browser.get(f"file://{tmp_html.name}")

# Prepare command to perform Print to PDF
resource = "/session/%s/chromium/send_command_and_get_result" % self.browser.session_id
print_options = {
"landscape": False,
"displayHeaderFooter": False,
"printBackground": True,
"preferCSSPageSize": True,
}

# Execute command and save PDF content to disk for image conversion
resp = self.browser.command_executor._request(
"POST",
url=self.browser.command_executor._url + resource,
body=json.dumps({"cmd": "Page.printToPDF", "params": print_options}),
)
tmp_pdf.write(b64decode(resp["value"]["data"]))
tmp_pdf.flush()
with tempfile.NamedTemporaryFile(suffix=".pdf") as tmp_pdf:
# Load base64'd contents directly into browser as HTML
self.browser.get(f"data:text/html;base64,{b64encode(file_contents).decode()}")

# Execute command and save PDF content to disk for image conversion
tmp_pdf.write(b64decode(self.browser.print_page()))
tmp_pdf.flush()

# Page browser back to the beginning (in theory we shouldn't have to go far but just in case)
while self.browser.current_url != "data:,":
self.browser.back()

# Render PDF to images
self.pdf_to_images(tmp_pdf.name, max_pages)
# Render PDF to images
self.pdf_to_images(tmp_pdf.name, max_pages)

def pdf_to_images(self, file, max_pages=None):
convert_from_path(file, self.working_directory, first_page=1, last_page=max_pages)
Expand Down

0 comments on commit 3a7ec5d

Please sign in to comment.