Skip to content

Commit

Permalink
Merge pull request #41 from CybercentreCanada/AL-2188
Browse files Browse the repository at this point in the history
Al 2188
  • Loading branch information
cccs-rs authored Feb 16, 2023
2 parents 62a6f69 + 8a34718 commit d146055
Show file tree
Hide file tree
Showing 4 changed files with 73 additions and 65 deletions.
31 changes: 14 additions & 17 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -2,39 +2,36 @@ ARG branch=latest
FROM cccs/assemblyline-v4-service-base:$branch

ENV SERVICE_PATH document_preview.document_preview.DocumentPreview
ENV LIBRE_VERSION=7.4
ENV LIBRE_BUILD_VERSION=${LIBRE_VERSION}.4

USER root

# Ensure the man1 directory exists before packages are installed
# (presumably required by a package installed below — TODO confirm).
# The previous form repeated the command text inside its own argument list,
# which also created a stray /usr/share/man/man1mkdir directory.
RUN mkdir -p /usr/share/man/man1
RUN apt-get update && apt-get install -y wget tesseract-ocr libemail-outlook-message-perl libgdiplus unzip
RUN wget https://github.com/wkhtmltopdf/packaging/releases/download/0.12.6-1/wkhtmltox_0.12.6-1.buster_amd64.deb
RUN apt-get install -y poppler-utils ./wkhtmltox_0.12.6-1.buster_amd64.deb --no-install-recommends
RUN apt-get install -y poppler-utils ./wkhtmltox_0.12.6-1.buster_amd64.deb --no-install-recommends &&\
rm -f ./wkhtmltox_0.12.6-1.buster_amd64.deb
RUN pip install pdf2image Pillow natsort imgkit compoundfiles compressed_rtf pytesseract

# Install Aspose Suite for handling documents
RUN pip install aspose-cells-python aspose-words==22.10 aspose.Slides

# Install Wine to run OneNoteAnalyzer (C# app using Aspose)
RUN dpkg --add-architecture i386 && mkdir -pm755 /etc/apt/keyrings && \
wget -O /etc/apt/keyrings/winehq-archive.key https://dl.winehq.org/wine-builds/winehq.key
RUN wget -NP /etc/apt/sources.list.d/ https://dl.winehq.org/wine-builds/debian/dists/buster/winehq-buster.sources && \
apt update && apt install -y --install-recommends winehq-stable

RUN wget https://github.com/knight0x07/OneNoteAnalyzer/releases/download/OneNoteAnalyzer/OneNoteAnalyzer.zip && \
unzip OneNoteAnalyzer.zip -d /opt/al_service/OneNoteAnalyzer && rm -f OneNoteAnalyzer.zip
RUN wget -O /opt/al_service/dotNetFx40_Full_x86_x64.exe 'http://download.microsoft.com/download/9/5/A/95A9616B-7A37-4AF6-BC36-D6EA96C8DAAE/dotNetFx40_Full_x86_x64.exe'
# Install Libreoffice
RUN pip install unoconv
RUN wget https://tdf.mirror.rafal.ca/libreoffice/stable/${LIBRE_BUILD_VERSION}/deb/x86_64/LibreOffice_${LIBRE_BUILD_VERSION}_Linux_x86-64_deb.tar.gz
RUN tar zxvf LibreOffice_${LIBRE_BUILD_VERSION}_Linux_x86-64_deb.tar.gz && rm -f LibreOffice_${LIBRE_BUILD_VERSION}_Linux_x86-64_deb.tar.gz
RUN dpkg -i LibreOffice_${LIBRE_BUILD_VERSION}*/DEBS/*.deb && rm -rf LibreOffice_${LIBRE_BUILD_VERSION}*
RUN apt-get install -y libdbus-1-3 libcups2 libsm6 libice6
RUN ln -n -s /opt/libreoffice${LIBRE_VERSION} /usr/lib/libreoffice

# Switch to assemblyline user
USER assemblyline

# Copy DocPreview service code
WORKDIR /opt/al_service
# Install dotnet under the AL user in Wine
RUN wine dotNetFx40_Full_x86_x64.exe /q

COPY . .

ARG version=4.0.0.dev1
USER root
RUN rm -f dotNetFx40_Full_x86_x64.exe
RUN sed -i -e "s/\$SERVICE_TAG/$version/g" service_manifest.yml
RUN unoconv --listener &

USER assemblyline
88 changes: 41 additions & 47 deletions document_preview/document_preview.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,7 @@
import os
import shutil
import subprocess
import tempfile

from aspose.cells import SaveFormat as WorkbookSaveFormat, Workbook
from aspose.slides import Presentation
from aspose.slides.export import SaveFormat as PresentationSaveFormat
from natsort import natsorted
from pdf2image import convert_from_path
from time import time
Expand All @@ -16,11 +12,6 @@

from document_preview.helper.emlrender import processEml as eml2image

from aspose.words import Document, SaveFormat as WordsSaveFormat


WEBP_MAX_SIZE = 16383


class DocumentPreview(ServiceBase):
def __init__(self, config=None):
Expand All @@ -32,6 +23,18 @@ def start(self):
def stop(self):
self.log.debug("Document preview service ended")

def office_conversion(self, file, orientation="portrait", page_range_end=2):
    """Convert an Office document to PDF with unoconv and report the outcome.

    Invokes ``unoconv`` on ``file``, requesting a PDF export limited to pages
    1..``page_range_end`` on A3 paper in the given ``orientation``, with the
    output directed to ``self.working_directory``.

    Args:
        file: Path of the document handed to unoconv.
        orientation: Value passed as the ``PaperOrientation`` printer option.
        page_range_end: Last page number used in the ``PageRange`` export option.

    Returns:
        ``(True, name)`` where ``name`` is the file name (not the full path)
        of a PDF found in the working directory, or ``(False, None)`` when no
        PDF is present after the conversion attempt.
    """
    completed = subprocess.run(["unoconv", "-f", "pdf",
                                "-e", f"PageRange=1-{page_range_end}",
                                "-P", f"PaperOrientation={orientation}",
                                "-P", "PaperFormat=A3",
                                "-o", f"{self.working_directory}/", file], capture_output=True)
    if completed.returncode != 0:
        # Output is captured, so surface failures instead of dropping them silently.
        self.log.warning(f"unoconv exited with code {completed.returncode}: {completed.stderr!r}")

    # Match on the extension rather than a substring so names such as
    # "report.pdf.txt" are not mistaken for the converted document, and sort
    # so the chosen file does not depend on directory listing order.
    pdf_names = sorted(name for name in os.listdir(self.working_directory) if name.endswith(".pdf"))
    if pdf_names:
        return (True, pdf_names[0])
    return (False, None)

def pdf_to_images(self, file, max_pages=None):
pages = convert_from_path(file, first_page=1, last_page=max_pages)

Expand All @@ -41,9 +44,15 @@ def pdf_to_images(self, file, max_pages=None):
i += 1

def render_documents(self, request: Request, max_pages=1):

if request.file_type == 'document/pdf':
# PDF
# Word/Excel/Powerpoint
if any(request.file_type == f'document/office/{ms_product}' for ms_product in ['word', 'excel', 'powerpoint']):
orientation = "landscape" if any(request.file_type.endswith(type)
for type in ['excel', 'powerpoint']) else "portrait"
converted = self.office_conversion(request.file_path, orientation, max_pages)
if converted[0]:
self.pdf_to_images(self.working_directory + "/" + converted[1])
# PDF
elif request.file_type == 'document/pdf':
self.pdf_to_images(request.file_path, max_pages)
# EML/MSG
elif request.file_type.endswith('email'):
Expand All @@ -54,49 +63,19 @@ def render_documents(self, request: Request, max_pages=1):
subprocess.run(['msgconvert', '-outfile', tmp.name, request.file_path])
tmp.seek(0)
file_contents = tmp.read()

# Render EML as PNG
# If we have internet access, we'll attempt to load external images
eml2image(file_contents, self.working_directory, self.log,
load_ext_images=self.service_attributes.docker_config.allow_internet_access,
load_images=request.get_param('load_email_images'))
elif request.file_type == 'document/office/onenote':
with tempfile.NamedTemporaryFile() as temp_file:
temp_file.write(request.file_contents)
temp_file.flush()
subprocess.run(['wine', 'OneNoteAnalyzer/OneNoteAnalyzer.exe', '--file', temp_file.name],
capture_output=True)

expected_output_dir = f'{temp_file.name}_content/'
if os.path.exists(expected_output_dir):
# Copy to working directory under presumed output filenames
shutil.copyfile(
os.path.join(expected_output_dir, f'ConvertImage_{os.path.basename(temp_file.name)}.png'),
os.path.join(self.working_directory, f'output_{0}'))
else:
# Word/Excel/Powerpoint
aspose_cls, save_format_cls = {
'document/office/excel': (Workbook, WorkbookSaveFormat),
'document/office/word': (Document, WordsSaveFormat),
'document/office/powerpoint': (Presentation, PresentationSaveFormat),
}.get(request.file_type, (None, None))

if not aspose_cls and request.file_type.startswith('document/office'):
self.log.warning(f'Aspose unable to handle: {request.file_type}')
return

with tempfile.NamedTemporaryFile() as tmp_file:
doc = aspose_cls(request.file_path)
doc.save(tmp_file.name, save_format_cls.PDF)
tmp_file.seek(0)
self.pdf_to_images(tmp_file.name, max_pages)

def execute(self, request):
start = time()
result = Result()

# Attempt to render documents given and dump them to the working directory
max_pages = int(request.get_param('max_pages_rendered'))
save_ocr_output = request.get_param('save_ocr_output').lower()
try:
self.render_documents(request, max_pages)
except Exception as e:
Expand All @@ -109,10 +88,25 @@ def execute(self, request):
previews = [s for s in os.listdir(self.working_directory) if "output" in s]
image_section = ResultImageSection(request, "Successfully extracted the preview.")
heur_id = 1 if request.deep_scan or request.get_param('run_ocr') else None
[image_section.add_image(f"{self.working_directory}/{preview}",
name=f"page_{str(i).zfill(3)}.jpeg", description=f"Here's the preview for page {i}",
ocr_heuristic_id=heur_id)
for i, preview in enumerate(natsorted(previews))]
for i, preview in enumerate(natsorted(previews)):
ocr_io = tempfile.NamedTemporaryFile('w', delete=False) if save_ocr_output != 'no' else None
img_name = f"page_{str(i).zfill(3)}.jpeg"
image_section.add_image(f"{self.working_directory}/{preview}", name=img_name,
description=f"Here's the preview for page {i}",
ocr_heuristic_id=heur_id, ocr_io=ocr_io)
# Write OCR output as specified by submissions params
if save_ocr_output == 'no':
continue
else:
# Write content to disk to be uploaded
if save_ocr_output == 'as_extracted':
request.add_extracted(ocr_io.name, f'{img_name}_ocr_output',
description="OCR Output")
elif save_ocr_output == 'as_supplementary':
request.add_supplementary(ocr_io.name, f'{img_name}_ocr_output',
description="OCR Output")
else:
self.log.warning(f'Unknown save method for OCR given: {save_ocr_output}')

result.add_section(image_section)
request.result = result
Expand Down
1 change: 1 addition & 0 deletions document_preview/helper/emlrender.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@

try:
    from PIL import Image
    # Raise Pillow's pixel-count limit to 2**31 - 1 so very large images are
    # accepted (presumably for tall rendered e-mail screenshots — TODO confirm).
    Image.MAX_IMAGE_PIXELS = 2147483647
except ImportError:
    # Only a missing/unimportable Pillow should produce this message; a bare
    # except would also trap SystemExit/KeyboardInterrupt and mislabel them.
    print('[ERROR] pillow module not installed ("pip install pillow")')
    sys.exit(1)
Expand Down
18 changes: 17 additions & 1 deletion service_manifest.yml
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ version: $SERVICE_TAG
description: Use OCR to detect for signs of malicious behaviour in Office and PDF files

accepts: document/(pdf$|office/.*|email)
rejects: empty|metadata/.*
rejects: empty|metadata/.*|document/office/onenote

stage: CORE
category: Static Analysis
Expand Down Expand Up @@ -33,6 +33,22 @@ submission_params:
value: false
default: false

- name: save_ocr_output
type: list
value: "no"
default: "no"
list: ['no', 'as_extracted', 'as_supplementary']


config:
# List of OCR terms to override defaults in service base for detection
# See: https://github.com/CybercentreCanada/assemblyline-v4-service/blob/master/assemblyline_v4_service/common/extractor/ocr.py
ocr:
banned: [] # Banned terms
macros: [] # Terms that indicate macros
ransomware: [] # Terms that indicate ransomware


heuristics:
- heur_id: 1
name: OCR Detection found
Expand Down

0 comments on commit d146055

Please sign in to comment.