Skip to content

Commit

Permalink
Merge pull request #73 from CybercentreCanada/offline_html_rendering
Browse files Browse the repository at this point in the history
Attempt to render HTML in Chrome offline
  • Loading branch information
cccs-rs authored Sep 18, 2023
2 parents ea50182 + 5e69148 commit e2c0169
Show file tree
Hide file tree
Showing 4 changed files with 38 additions and 50 deletions.
9 changes: 8 additions & 1 deletion .vscode/settings.json
Original file line number Diff line number Diff line change
Expand Up @@ -21,5 +21,12 @@
"[python]": {
"editor.defaultFormatter": "ms-python.black-formatter"
},
"python.formatting.provider": "none",
"black-formatter.args": [
"--line-length=120"
],
"flake8.args": [
"--max-line-length=120",
// Ignore E203 for now; see https://github.com/PyCQA/pycodestyle/issues/373
"--ignore=E203,W503"
],
}
4 changes: 2 additions & 2 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,8 @@ ARG branch=latest
FROM cccs/assemblyline-v4-service-base:$branch

ENV SERVICE_PATH document_preview.document_preview.DocumentPreview
ENV LIBRE_VERSION=7.5
ENV LIBRE_BUILD_VERSION=${LIBRE_VERSION}.5
ENV LIBRE_VERSION=7.6
ENV LIBRE_BUILD_VERSION=${LIBRE_VERSION}.1

USER root

Expand Down
70 changes: 26 additions & 44 deletions document_preview/document_preview.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@
from time import time

from assemblyline_v4_service.common.base import ServiceBase
from assemblyline_v4_service.common.helper import get_service_manifest
from assemblyline_v4_service.common.request import ServiceRequest as Request
from assemblyline_v4_service.common.result import Result, ResultImageSection
from natsort import natsorted
Expand All @@ -16,14 +15,7 @@
class DocumentPreview(ServiceBase):
def __init__(self, config=None):
super(DocumentPreview, self).__init__(config)
self.has_internet_access = (
get_service_manifest()
.get("docker_config", {})
.get("allow_internet_access", False)
)
self.log.info(
f"Service is configured {'with' if self.has_internet_access else 'without'} internet access"
)
self.html_render_timeout = config.get("html_render_timeout", 30)

def start(self):
self.log.debug("Document preview service started")
Expand Down Expand Up @@ -70,15 +62,9 @@ def render_documents(self, request: Request, max_pages=1):
for ms_product in ["word", "excel", "powerpoint", "rtf"]
):
orientation = (
"landscape"
if any(
request.file_type.endswith(type) for type in ["excel", "powerpoint"]
)
else "portrait"
)
converted = self.office_conversion(
request.file_path, orientation, max_pages
"landscape" if any(request.file_type.endswith(type) for type in ["excel", "powerpoint"]) else "portrait"
)
converted = self.office_conversion(request.file_path, orientation, max_pages)
if converted[0]:
self.pdf_to_images(self.working_directory + "/" + converted[1])
# PDF
Expand All @@ -102,27 +88,32 @@ def render_documents(self, request: Request, max_pages=1):
file_contents,
self.working_directory,
self.log,
load_ext_images=self.has_internet_access,
load_ext_images=False,
load_images=request.get_param("load_email_images"),
)
# HTML
elif request.file_type == "code/html" and self.has_internet_access:
elif request.file_type == "code/html":
with tempfile.NamedTemporaryFile(suffix=".html") as tmp_html:
tmp_html.write(request.file_contents)
tmp_html.flush()
with tempfile.NamedTemporaryFile(suffix=".pdf") as tmp_pdf:
subprocess.run(
[
"google-chrome",
"--headless",
"--no-sandbox",
"--hide-scrollbars",
f"--print-to-pdf={tmp_pdf.name}",
tmp_html.name,
],
capture_output=True,
)
self.pdf_to_images(tmp_pdf.name, max_pages)
try:
subprocess.run(
[
"google-chrome",
"--headless",
"--no-sandbox",
"--hide-scrollbars",
f"--print-to-pdf={tmp_pdf.name}",
tmp_html.name,
],
capture_output=True,
timeout=self.html_render_timeout,
)
self.pdf_to_images(tmp_pdf.name, max_pages)
except subprocess.TimeoutExpired:
# Unable to render HTML in given time
pass

def execute(self, request):
start = time()
Expand All @@ -141,16 +132,12 @@ def execute(self, request):
# Create an image gallery section to show the renderings
if any("output" in s for s in os.listdir(self.working_directory)):
previews = [s for s in os.listdir(self.working_directory) if "output" in s]
image_section = ResultImageSection(
request, "Successfully extracted the preview."
)
image_section = ResultImageSection(request, "Successfully extracted the preview.")
run_ocr_on_first_n_pages = request.get_param("run_ocr_on_first_n_pages")
for i, preview in enumerate(natsorted(previews)):
# Trigger OCR on the first N pages as specified in the submission
# Otherwise, just add the image without performing OCR analysis
ocr_heur_id = (
1 if request.deep_scan or (i < run_ocr_on_first_n_pages) else None
)
ocr_heur_id = 1 if request.deep_scan or (i < run_ocr_on_first_n_pages) else None
ocr_io = tempfile.NamedTemporaryFile("w", delete=False)
img_name = f"page_{str(i).zfill(3)}.jpeg"
image_section.add_image(
Expand All @@ -165,10 +152,7 @@ def execute(self, request):
with open(ocr_io.name, "r") as fp:
ocr_content = fp.read()
try:
if (
pdfinfo_from_path(request.file_path)["Pages"] == 1
and "click" in ocr_content.lower()
):
if pdfinfo_from_path(request.file_path)["Pages"] == 1 and "click" in ocr_content.lower():
# Suspected document is part of a phishing campaign
image_section.set_heuristic(2)
except Exception:
Expand All @@ -193,9 +177,7 @@ def execute(self, request):
description="OCR Output",
)
else:
self.log.warning(
f"Unknown save method for OCR given: {save_ocr_output}"
)
self.log.warning(f"Unknown save method for OCR given: {save_ocr_output}")

result.add_section(image_section)
request.result = result
Expand Down
5 changes: 2 additions & 3 deletions service_manifest.yml
Original file line number Diff line number Diff line change
Expand Up @@ -38,8 +38,7 @@ submission_params:
type: list
value: "no"
default: "no"
list: ['no', 'as_extracted', 'as_supplementary']

list: ["no", "as_extracted", "as_supplementary"]

config:
# List of OCR terms to override defaults in service base for detection
Expand All @@ -48,7 +47,7 @@ config:
banned: [] # Banned terms
macros: [] # Terms that indicate macros
ransomware: [] # Terms that indicate ransomware

html_render_timeout: 30

heuristics:
- heur_id: 1
Expand Down

0 comments on commit e2c0169

Please sign in to comment.