Skip to content

Commit

Permalink
Merge pull request #35 from CybercentreCanada/aspose
Browse files Browse the repository at this point in the history
Switch LibreOffice to Aspose Python Packages
  • Loading branch information
cccs-rs authored Feb 6, 2023
2 parents 032aec1 + 55ca786 commit c331a0a
Show file tree
Hide file tree
Showing 3 changed files with 29 additions and 47 deletions.
21 changes: 2 additions & 19 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,6 @@ ARG branch=latest
FROM cccs/assemblyline-v4-service-base:$branch

# Dotted Python path of the service class (presumably consumed by the base image's service runner; confirm).
ENV SERVICE_PATH=document_preview.document_preview.DocumentPreview
# LibreOffice release series and full build version, referenced by the download and symlink steps below.
ENV LIBRE_VERSION=7.4
ENV LIBRE_BUILD_VERSION=${LIBRE_VERSION}.4

# The package installation steps that follow need root.
USER root

Expand All @@ -12,24 +10,10 @@ RUN apt-get update && apt-get install -y wget tesseract-ocr libemail-outlook-mes
# Download the wkhtmltopdf package, install it together with poppler-utils, and
# delete the .deb in the same layer so the download does not remain in the image.
RUN wget https://github.com/wkhtmltopdf/packaging/releases/download/0.12.6-1/wkhtmltox_0.12.6-1.buster_amd64.deb && \
    apt-get install -y --no-install-recommends poppler-utils ./wkhtmltox_0.12.6-1.buster_amd64.deb && \
    rm -f wkhtmltox_0.12.6-1.buster_amd64.deb
# Python dependencies; --no-cache-dir keeps pip's download cache out of the layer.
RUN pip install --no-cache-dir pdf2image Pillow natsort imgkit compoundfiles compressed_rtf pytesseract
# The requirement specifier must be quoted: unquoted, the shell parses `>=0.9.0`
# as a redirection of stdout to a file named "=0.9.0" and the version floor is dropped.
RUN pip install --no-cache-dir 'unoconv>=0.9.0'

# Install Libreoffice
# Download, unpack, install and clean up in a single layer: files deleted by a
# later RUN are still stored in the earlier layer that created them.
RUN wget https://tdf.mirror.rafal.ca/libreoffice/stable/${LIBRE_BUILD_VERSION}/deb/x86_64/LibreOffice_${LIBRE_BUILD_VERSION}_Linux_x86-64_deb.tar.gz && \
    tar zxvf LibreOffice_${LIBRE_BUILD_VERSION}_Linux_x86-64_deb.tar.gz && \
    rm -f LibreOffice_${LIBRE_BUILD_VERSION}_Linux_x86-64_deb.tar.gz && \
    dpkg -i LibreOffice_${LIBRE_BUILD_VERSION}*/DEBS/*.deb && \
    rm -rf LibreOffice_${LIBRE_BUILD_VERSION}*
# Shared libraries installed alongside LibreOffice.
RUN apt-get install -y libdbus-1-3 libcups2 libsm6 libice6
# Expose the versioned /opt install under /usr/lib/libreoffice.
RUN ln -n -s /opt/libreoffice${LIBRE_VERSION} /usr/lib/libreoffice
# Install Aspose Suite for handling documents
# --no-cache-dir keeps pip's download cache out of the image layer.
# NOTE(review): only aspose-words is version-pinned; consider pinning the other two for reproducible builds.
RUN pip install --no-cache-dir aspose-cells-python aspose-words==22.10 aspose.Slides

# Install one2html for OneNote conversion
# Remove the distribution's rustc before installing the toolchain through rustup.
RUN apt-get remove --purge -y rustc
RUN apt-get install -y curl build-essential && curl https://sh.rustup.rs -sSf | sh -s -- -y
# From here on, RUN commands execute in a bash login shell.
SHELL ["bash", "-lc"]
# Source the cargo environment in the same RUN that needs it: shell state set
# by one RUN instruction does not carry over to the next one.
RUN source $HOME/.cargo/env && cargo install --git https://github.com/cccs-rs/one2html.git --no-default-features

# Copy the built binary into the assemblyline user's local bin directory and hand over ownership.
RUN cp /root/.cargo/bin/one2html /var/lib/assemblyline/.local/bin/one2html && \
    chown assemblyline:assemblyline /var/lib/assemblyline/.local/bin/one2html

USER assemblyline

Expand All @@ -39,6 +23,5 @@ COPY . .
# Build-time version string; override with --build-arg version=...
ARG version=4.0.0.dev1
USER root
# Replace the $SERVICE_TAG placeholder in the manifest with the build version.
RUN sed -i -e "s/\$SERVICE_TAG/$version/g" service_manifest.yml
# No `unoconv --listener &` here: a process backgrounded inside a RUN step ends
# with that build step and is never running in the final container.

USER assemblyline
53 changes: 26 additions & 27 deletions document_preview/document_preview.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,12 @@
from assemblyline_v4_service.common.request import ServiceRequest as Request

from document_preview.helper.emlrender import processEml as eml2image
from aspose.cells import SaveFormat as WorkbookSaveFormat, Workbook
from aspose.slides import Presentation
from aspose.slides.export import SaveFormat as PresentationSaveFormat

from aspose.words import Document, SaveFormat as WordsSaveFormat


WEBP_MAX_SIZE = 16383

Expand Down Expand Up @@ -61,15 +67,9 @@ def pdf_to_images(self, file, max_pages=None):
i += 1

def render_documents(self, request: Request, max_pages=1):
# Word/Excel/Powerpoint
if any(request.file_type == f'document/office/{ms_product}' for ms_product in ['word', 'excel', 'powerpoint']):
orientation = "landscape" if any(request.file_type.endswith(type)
for type in ['excel', 'powerpoint']) else "portrait"
converted = self.office_conversion(request.file_path, orientation, max_pages)
if converted[0]:
self.pdf_to_images(self.working_directory + "/" + converted[1])
# PDF
elif request.file_type == 'document/pdf':

if request.file_type == 'document/pdf':
# PDF
self.pdf_to_images(request.file_path, max_pages)
# EML/MSG
elif request.file_type.endswith('email'):
Expand All @@ -86,24 +86,23 @@ def render_documents(self, request: Request, max_pages=1):
eml2image(file_contents, self.working_directory, self.log,
load_ext_images=self.service_attributes.docker_config.allow_internet_access,
load_images=request.get_param('load_email_images'))

elif request.file_type.endswith('emf'):
self.libreoffice_conversion(request.file_path, convert_to="png")
elif request.file_type == 'document/office/onenote':
with open(os.path.join(self.working_directory, request.file_name), 'wb+') as temp_one:
temp_one.write(request.file_contents)
temp_one.flush()
subprocess.run(['one2html', '-i', temp_one.name, '-o', self.working_directory],
capture_output=True)
# Cleanup files
os.remove(os.path.join(self.working_directory, request.file_name))
for root, _, files in os.walk(self.working_directory):
for file in files:
file_path = os.path.join(root, file)
dir = os.path.dirname(file_path)
if self.working_directory.endswith(dir):
dir = ''
imgkit.from_file(file_path, os.path.join(self.working_directory, f'{dir}_{file}_output.jpg'))
else:
# Word/Excel/Powerpoint
aspose_cls, save_format_cls = {
'document/office/excel': (Workbook, WorkbookSaveFormat),
'document/office/word': (Document, WordsSaveFormat),
'document/office/powerpoint': (Presentation, PresentationSaveFormat),
}.get(request.file_type, (None, None))

if not aspose_cls and request.file_type.startswith('document/office'):
self.log.warning(f'Aspose unable to handle: {request.file_type}')
return

with tempfile.NamedTemporaryFile() as tmp_file:
doc = aspose_cls(request.file_path)
doc.save(tmp_file.name, save_format_cls.PDF)
tmp_file.seek(0)
self.pdf_to_images(tmp_file.name, max_pages)

def execute(self, request):
start = time()
Expand Down
2 changes: 1 addition & 1 deletion service_manifest.yml
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ name: DocumentPreview
version: $SERVICE_TAG
description: Use OCR to detect signs of malicious behaviour in Office and PDF files

accepts: document/(pdf$|office/.*|email)|image/emf
accepts: document/(pdf$|office/.*|email)
rejects: empty|metadata/.*

stage: CORE
Expand Down

0 comments on commit c331a0a

Please sign in to comment.