diff --git a/Dockerfile b/Dockerfile index 2f40f06..82919f8 100644 --- a/Dockerfile +++ b/Dockerfile @@ -2,8 +2,6 @@ ARG branch=latest FROM cccs/assemblyline-v4-service-base:$branch ENV SERVICE_PATH document_preview.document_preview.DocumentPreview -ENV LIBRE_VERSION=7.4 -ENV LIBRE_BUILD_VERSION=${LIBRE_VERSION}.4 USER root @@ -12,24 +10,10 @@ RUN apt-get update && apt-get install -y wget tesseract-ocr libemail-outlook-mes RUN wget https://github.com/wkhtmltopdf/packaging/releases/download/0.12.6-1/wkhtmltox_0.12.6-1.buster_amd64.deb RUN apt-get install -y poppler-utils ./wkhtmltox_0.12.6-1.buster_amd64.deb --no-install-recommends RUN pip install pdf2image Pillow natsort imgkit compoundfiles compressed_rtf pytesseract -RUN pip install unoconv>=0.9.0 -# Install Libreoffice -RUN wget https://tdf.mirror.rafal.ca/libreoffice/stable/${LIBRE_BUILD_VERSION}/deb/x86_64/LibreOffice_${LIBRE_BUILD_VERSION}_Linux_x86-64_deb.tar.gz -RUN tar zxvf LibreOffice_${LIBRE_BUILD_VERSION}_Linux_x86-64_deb.tar.gz && rm -f LibreOffice_${LIBRE_BUILD_VERSION}_Linux_x86-64_deb.tar.gz -RUN dpkg -i LibreOffice_${LIBRE_BUILD_VERSION}*/DEBS/*.deb && rm -rf LibreOffice_${LIBRE_BUILD_VERSION}* -RUN apt-get install -y libdbus-1-3 libcups2 libsm6 libice6 -RUN ln -n -s /opt/libreoffice${LIBRE_VERSION} /usr/lib/libreoffice +# Install Aspose Suite for handling documents +RUN pip install aspose-cells-python aspose-words==22.10 aspose.Slides -# Install one2html for OneNote conversion -RUN apt remove --purge -y rustc -RUN apt-get install -y curl build-essential && curl https://sh.rustup.rs -sSf | sh -s -- -y -SHELL ["bash", "-lc"] -RUN source $HOME/.cargo/env -RUN cargo install --git https://github.com/cccs-rs/one2html.git --no-default-features - -RUN cp /root/.cargo/bin/one2html /var/lib/assemblyline/.local/bin/one2html -RUN chown assemblyline:assemblyline /var/lib/assemblyline/.local/bin/one2html USER assemblyline @@ -39,6 +23,5 @@ COPY . . ARG version=4.0.0.dev1 USER root RUN sed -i -e "s/\$SERVICE_TAG/$version/g" service_manifest.yml -RUN unoconv --listener & USER assemblyline diff --git a/document_preview/document_preview.py b/document_preview/document_preview.py index 593bb7d..619dcea 100644 --- a/document_preview/document_preview.py +++ b/document_preview/document_preview.py @@ -12,6 +12,12 @@ from assemblyline_v4_service.common.request import ServiceRequest as Request from document_preview.helper.emlrender import processEml as eml2image +from aspose.cells import SaveFormat as WorkbookSaveFormat, Workbook +from aspose.slides import Presentation +from aspose.slides.export import SaveFormat as PresentationSaveFormat + +from aspose.words import Document, SaveFormat as WordsSaveFormat + WEBP_MAX_SIZE = 16383 @@ -61,15 +67,9 @@ def pdf_to_images(self, file, max_pages=None): i += 1 def render_documents(self, request: Request, max_pages=1): - # Word/Excel/Powerpoint - if any(request.file_type == f'document/office/{ms_product}' for ms_product in ['word', 'excel', 'powerpoint']): - orientation = "landscape" if any(request.file_type.endswith(type) - for type in ['excel', 'powerpoint']) else "portrait" - converted = self.office_conversion(request.file_path, orientation, max_pages) - if converted[0]: - self.pdf_to_images(self.working_directory + "/" + converted[1]) - # PDF - elif request.file_type == 'document/pdf': + + if request.file_type == 'document/pdf': + # PDF self.pdf_to_images(request.file_path, max_pages) # EML/MSG elif request.file_type.endswith('email'): @@ -86,24 +86,23 @@ def render_documents(self, request: Request, max_pages=1): eml2image(file_contents, self.working_directory, self.log, load_ext_images=self.service_attributes.docker_config.allow_internet_access, load_images=request.get_param('load_email_images')) - - elif request.file_type.endswith('emf'): - self.libreoffice_conversion(request.file_path, convert_to="png") - elif request.file_type == 'document/office/onenote': - with open(os.path.join(self.working_directory, request.file_name), 'wb+') as temp_one: - temp_one.write(request.file_contents) - temp_one.flush() - subprocess.run(['one2html', '-i', temp_one.name, '-o', self.working_directory], - capture_output=True) - # Cleanup files - os.remove(os.path.join(self.working_directory, request.file_name)) - for root, _, files in os.walk(self.working_directory): - for file in files: - file_path = os.path.join(root, file) - dir = os.path.dirname(file_path) - if self.working_directory.endswith(dir): - dir = '' - imgkit.from_file(file_path, os.path.join(self.working_directory, f'{dir}_{file}_output.jpg')) + else: + # Word/Excel/Powerpoint + aspose_cls, save_format_cls = { + 'document/office/excel': (Workbook, WorkbookSaveFormat), + 'document/office/word': (Document, WordsSaveFormat), + 'document/office/powerpoint': (Presentation, PresentationSaveFormat), + }.get(request.file_type, (None, None)) + + if not aspose_cls and request.file_type.startswith('document/office'): + self.log.warning(f'Aspose unable to handle: {request.file_type}') + return + + with tempfile.NamedTemporaryFile() as tmp_file: + doc = aspose_cls(request.file_path) + doc.save(tmp_file.name, save_format_cls.PDF) + tmp_file.seek(0) + self.pdf_to_images(tmp_file.name, max_pages) def execute(self, request): start = time() diff --git a/service_manifest.yml b/service_manifest.yml index cfd08bc..1dbefa0 100644 --- a/service_manifest.yml +++ b/service_manifest.yml @@ -2,7 +2,7 @@ name: DocumentPreview version: $SERVICE_TAG description: Use OCR to detect for signs of malicious behaviour in Office and PDF files -accepts: document/(pdf$|office/.*|email)|image/emf +accepts: document/(pdf$|office/.*|email) rejects: empty|metadata/.* stage: CORE