From 81793f00f80ae26bc418460ca7c2c1b5ad3a595d Mon Sep 17 00:00:00 2001
From: cccs-rs <62077998+cccs-rs@users.noreply.github.com>
Date: Wed, 27 Mar 2024 16:49:20 +0000
Subject: [PATCH] Optimization: load file contents directly into browser
 without intermediate file

---
 document_preview/document_preview.py | 35 ++++++++++++----------------
 1 file changed, 15 insertions(+), 20 deletions(-)

diff --git a/document_preview/document_preview.py b/document_preview/document_preview.py
index b87149b..ac2c4d1 100644
--- a/document_preview/document_preview.py
+++ b/document_preview/document_preview.py
@@ -1,4 +1,3 @@
-import json
 import os
 import subprocess
 import tempfile
@@ -8,7 +7,7 @@
 from assemblyline_v4_service.common.request import ServiceRequest as Request
 from assemblyline_v4_service.common.result import Heuristic, Result, ResultImageSection, ResultTextSection
 
-from base64 import b64decode
+from base64 import b64decode, b64encode
 from selenium.webdriver import Chrome, ChromeOptions, ChromeService
 from natsort import natsorted
 
@@ -91,24 +90,20 @@ def office_conversion(self, file, orientation="portrait", page_range_end=2):
             return output_path
 
     def html_render(self, file_contents, max_pages):
-        # Create a temporary file containing the '.html' extension so Chrome can render the document properly
-        with tempfile.NamedTemporaryFile(suffix=".html") as tmp_html:
-            tmp_html.write(file_contents)
-            tmp_html.flush()
-            with tempfile.NamedTemporaryFile(suffix=".pdf") as tmp_pdf:
-                # Load file into browser
-                self.browser.get(f"file://{tmp_html.name}")
-
-                # Execute command and save PDF content to disk for image conversion
-                tmp_pdf.write(b64decode(self.browser.print_page()))
-                tmp_pdf.flush()
-
-                # Page browser back to the beginning (in theory we shouldn't have to go far but just in case)
-                while self.browser.current_url != "data:,":
-                    self.browser.back()
-
-                # Render PDF to images
-                self.pdf_to_images(tmp_pdf.name, max_pages)
+        with tempfile.NamedTemporaryFile(suffix=".pdf") as tmp_pdf:
+            # Load the contents directly into the browser as a base64-encoded HTML data URL
+            self.browser.get(f"data:text/html;base64,{b64encode(file_contents).decode()}")
+
+            # Print the loaded page to PDF and save the PDF content to disk for image conversion
+            tmp_pdf.write(b64decode(self.browser.print_page()))
+            tmp_pdf.flush()
+
+            # Step the browser back through its history to the starting page (in theory not far, but loop just in case)
+            while self.browser.current_url != "data:,":
+                self.browser.back()
+
+            # Render PDF to images
+            self.pdf_to_images(tmp_pdf.name, max_pages)
 
     def pdf_to_images(self, file, max_pages=None):
         convert_from_path(file, self.working_directory, first_page=1, last_page=max_pages)