From 906c4ba940055ec775cfbfcd674914f95332b799 Mon Sep 17 00:00:00 2001 From: cccs-rs Date: Wed, 22 Feb 2023 19:43:34 +0000 Subject: [PATCH] Use headless Chrome to render HTML documents --- Dockerfile | 4 ++++ document_preview/document_preview.py | 9 +++++++++ service_manifest.yml | 2 +- 3 files changed, 14 insertions(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index cbfa231..21f950f 100644 --- a/Dockerfile +++ b/Dockerfile @@ -22,6 +22,10 @@ RUN dpkg -i LibreOffice_${LIBRE_BUILD_VERSION}*/DEBS/*.deb && rm -rf LibreOffice RUN apt-get install -y libdbus-1-3 libcups2 libsm6 libice6 RUN ln -n -s /opt/libreoffice${LIBRE_VERSION} /usr/lib/libreoffice +# Install Chrome for headless rendering of HTML documents +RUN wget https://dl.google.com/linux/direct/google-chrome-stable_current_amd64.deb && \ + apt install -y ./google-chrome-stable_current_amd64.deb && rm -f ./google-chrome-stable_current_amd64.deb + # Switch to assemblyline user USER assemblyline diff --git a/document_preview/document_preview.py b/document_preview/document_preview.py index 428c394..2f7a92f 100644 --- a/document_preview/document_preview.py +++ b/document_preview/document_preview.py @@ -68,6 +68,15 @@ def render_documents(self, request: Request, max_pages=1): eml2image(file_contents, self.working_directory, self.log, load_ext_images=self.service_attributes.docker_config.allow_internet_access, load_images=request.get_param('load_email_images')) + # HTML + elif request.file_type == "code/html": + with tempfile.NamedTemporaryFile(suffix=".html") as tmp_html: + tmp_html.write(request.file_contents) + tmp_html.flush() + with tempfile.NamedTemporaryFile(suffix=".pdf") as tmp_pdf: + subprocess.run(['google-chrome', '--headless', '--no-sandbox', '--hide-scrollbars', + f'--print-to-pdf={tmp_pdf.name}', tmp_html.name], capture_output=True) + self.pdf_to_images(tmp_pdf.name, max_pages) def execute(self, request): start = time() diff --git a/service_manifest.yml b/service_manifest.yml index a6a7e07..57d3f77 100644 --- a/service_manifest.yml +++ b/service_manifest.yml @@ -2,7 +2,7 @@ name: DocumentPreview version: $SERVICE_TAG description: Use OCR to detect for signs of malicious behaviour in Office and PDF files -accepts: document/(pdf$|office/.*|email) +accepts: document/(pdf$|office/.*|email)|code/html rejects: empty|metadata/.*|document/office/onenote stage: CORE