Merge pull request #41 from CybercentreCanada/AL-2188

Al 2188
CybercentreCanada · Feb 16, 2023 · d146055 · d146055
2 parents 62a6f69 + 8a34718
commit d146055
Show file tree

Hide file tree

Showing 4 changed files with 73 additions and 65 deletions.
diff --git a/Dockerfile b/Dockerfile
@@ -2,39 +2,36 @@ ARG branch=latest
 FROM cccs/assemblyline-v4-service-base:$branch
 
 ENV SERVICE_PATH document_preview.document_preview.DocumentPreview
+ENV LIBRE_VERSION=7.4
+ENV LIBRE_BUILD_VERSION=${LIBRE_VERSION}.4
 
 USER root
 
 RUN mkdir -p /usr/share/man/man1mkdir -p /usr/share/man/man1
 RUN apt-get update && apt-get install -y wget tesseract-ocr libemail-outlook-message-perl libgdiplus unzip
 RUN wget https://github.com/wkhtmltopdf/packaging/releases/download/0.12.6-1/wkhtmltox_0.12.6-1.buster_amd64.deb
-RUN apt-get install -y poppler-utils ./wkhtmltox_0.12.6-1.buster_amd64.deb --no-install-recommends
+RUN apt-get install -y poppler-utils ./wkhtmltox_0.12.6-1.buster_amd64.deb --no-install-recommends &&\
+    rm -f ./wkhtmltox_0.12.6-1.buster_amd64.deb
 RUN pip install pdf2image Pillow natsort imgkit compoundfiles compressed_rtf pytesseract
 
-# Install Aspose Suite for handling documents
-RUN pip install aspose-cells-python aspose-words==22.10 aspose.Slides
-
-# Install Wine to run OneNoteAnalyzer (C# app using Aspose)
-RUN dpkg --add-architecture i386 && mkdir -pm755 /etc/apt/keyrings && \
-    wget -O /etc/apt/keyrings/winehq-archive.key https://dl.winehq.org/wine-builds/winehq.key
-RUN wget -NP /etc/apt/sources.list.d/ https://dl.winehq.org/wine-builds/debian/dists/buster/winehq-buster.sources && \
-    apt update && apt install -y --install-recommends winehq-stable
-
-RUN wget https://github.com/knight0x07/OneNoteAnalyzer/releases/download/OneNoteAnalyzer/OneNoteAnalyzer.zip && \
-    unzip OneNoteAnalyzer.zip -d /opt/al_service/OneNoteAnalyzer && rm -f OneNoteAnalyzer.zip
-RUN wget -O /opt/al_service/dotNetFx40_Full_x86_x64.exe 'http://download.microsoft.com/download/9/5/A/95A9616B-7A37-4AF6-BC36-D6EA96C8DAAE/dotNetFx40_Full_x86_x64.exe'
+# Install LibreOffice and unoconv, used by the service to convert Office documents to PDF
+RUN pip install unoconv
+# Download, unpack, install and clean up in ONE layer: files removed by a later
+# RUN instruction are still stored in the earlier layers of the image.
+RUN wget https://tdf.mirror.rafal.ca/libreoffice/stable/${LIBRE_BUILD_VERSION}/deb/x86_64/LibreOffice_${LIBRE_BUILD_VERSION}_Linux_x86-64_deb.tar.gz && \
+    tar zxvf LibreOffice_${LIBRE_BUILD_VERSION}_Linux_x86-64_deb.tar.gz && \
+    dpkg -i LibreOffice_${LIBRE_BUILD_VERSION}*/DEBS/*.deb && \
+    rm -rf LibreOffice_${LIBRE_BUILD_VERSION}*
+# Extra shared libraries (presumably needed by LibreOffice at runtime - confirm)
+RUN apt-get install -y libdbus-1-3 libcups2 libsm6 libice6
+# Symlink the versioned install directory to a version-independent path
+RUN ln -n -s /opt/libreoffice${LIBRE_VERSION} /usr/lib/libreoffice
 
+# Switch to assemblyline user
 USER assemblyline
 
+# Copy DocPreview service code
 WORKDIR /opt/al_service
-# Install dotnet under the AL user in Wine
-RUN wine dotNetFx40_Full_x86_x64.exe /q
-
 COPY . .
 
 ARG version=4.0.0.dev1
 USER root
-RUN rm -f dotNetFx40_Full_x86_x64.exe
+# Replace the $SERVICE_TAG placeholder in the manifest with the build version
 RUN sed -i -e "s/\$SERVICE_TAG/$version/g" service_manifest.yml
+# NOTE: no `RUN unoconv --listener &` here. A process backgrounded in a RUN
+# instruction only lives for that build step, so it is never running in the
+# final container. unoconv is expected to start its own office instance when
+# no listener is reachable (see the unoconv README) - confirm for the pinned version.
 
 USER assemblyline
diff --git a/document_preview/document_preview.py b/document_preview/document_preview.py
@@ -1,11 +1,7 @@
 import os
-import shutil
 import subprocess
 import tempfile
 
-from aspose.cells import SaveFormat as WorkbookSaveFormat, Workbook
-from aspose.slides import Presentation
-from aspose.slides.export import SaveFormat as PresentationSaveFormat
 from natsort import natsorted
 from pdf2image import convert_from_path
 from time import time
@@ -16,11 +12,6 @@
 
 from document_preview.helper.emlrender import processEml as eml2image
 
-from aspose.words import Document, SaveFormat as WordsSaveFormat
-
-
-WEBP_MAX_SIZE = 16383
-
 
 class DocumentPreview(ServiceBase):
     def __init__(self, config=None):
@@ -32,6 +23,18 @@ def start(self):
     def stop(self):
         self.log.debug("Document preview service ended")
 
+    def office_conversion(self, file, orientation="portrait", page_range_end=2):
+        subprocess.run(["unoconv", "-f", "pdf",
+                        "-e", f"PageRange=1-{page_range_end}",
+                        "-P", f"PaperOrientation={orientation}",
+                        "-P", "PaperFormat=A3",
+                        "-o", f"{self.working_directory}/", file], capture_output=True)
+        converted_file = [s for s in os.listdir(self.working_directory) if ".pdf" in s]
+        if converted_file:
+            return (True, converted_file[0])
+        else:
+            return (False, None)
+
     def pdf_to_images(self, file, max_pages=None):
         pages = convert_from_path(file, first_page=1, last_page=max_pages)
 
@@ -41,9 +44,15 @@ def pdf_to_images(self, file, max_pages=None):
             i += 1
 
     def render_documents(self, request: Request, max_pages=1):
-
-        if request.file_type == 'document/pdf':
-            # PDF
+        # Word/Excel/Powerpoint
+        if any(request.file_type == f'document/office/{ms_product}' for ms_product in ['word', 'excel', 'powerpoint']):
+            orientation = "landscape" if any(request.file_type.endswith(type)
+                                             for type in ['excel', 'powerpoint']) else "portrait"
+            converted = self.office_conversion(request.file_path, orientation, max_pages)
+            if converted[0]:
+                self.pdf_to_images(self.working_directory + "/" + converted[1])
+        # PDF
+        elif request.file_type == 'document/pdf':
             self.pdf_to_images(request.file_path, max_pages)
         # EML/MSG
         elif request.file_type.endswith('email'):
@@ -54,49 +63,19 @@ def render_documents(self, request: Request, max_pages=1):
                     subprocess.run(['msgconvert', '-outfile', tmp.name, request.file_path])
                     tmp.seek(0)
                     file_contents = tmp.read()
-
             # Render EML as PNG
             # If we have internet access, we'll attempt to load external images
             eml2image(file_contents, self.working_directory, self.log,
                       load_ext_images=self.service_attributes.docker_config.allow_internet_access,
                       load_images=request.get_param('load_email_images'))
-        elif request.file_type == 'document/office/onenote':
-            with tempfile.NamedTemporaryFile() as temp_file:
-                temp_file.write(request.file_contents)
-                temp_file.flush()
-                subprocess.run(['wine', 'OneNoteAnalyzer/OneNoteAnalyzer.exe', '--file', temp_file.name],
-                               capture_output=True)
-
-                expected_output_dir = f'{temp_file.name}_content/'
-                if os.path.exists(expected_output_dir):
-                    # Copy to working directory under presumed output filenames
-                    shutil.copyfile(
-                        os.path.join(expected_output_dir, f'ConvertImage_{os.path.basename(temp_file.name)}.png'),
-                        os.path.join(self.working_directory, f'output_{0}'))
-        else:
-            # Word/Excel/Powerpoint
-            aspose_cls, save_format_cls = {
-                'document/office/excel': (Workbook, WorkbookSaveFormat),
-                'document/office/word': (Document, WordsSaveFormat),
-                'document/office/powerpoint': (Presentation, PresentationSaveFormat),
-            }.get(request.file_type, (None, None))
-
-            if not aspose_cls and request.file_type.startswith('document/office'):
-                self.log.warning(f'Aspose unable to handle: {request.file_type}')
-                return
-
-            with tempfile.NamedTemporaryFile() as tmp_file:
-                doc = aspose_cls(request.file_path)
-                doc.save(tmp_file.name, save_format_cls.PDF)
-                tmp_file.seek(0)
-                self.pdf_to_images(tmp_file.name, max_pages)
 
     def execute(self, request):
         start = time()
         result = Result()
 
         # Attempt to render documents given and dump them to the working directory
         max_pages = int(request.get_param('max_pages_rendered'))
+        save_ocr_output = request.get_param('save_ocr_output').lower()
         try:
             self.render_documents(request, max_pages)
         except Exception as e:
@@ -109,10 +88,25 @@ def execute(self, request):
             previews = [s for s in os.listdir(self.working_directory) if "output" in s]
             image_section = ResultImageSection(request,  "Successfully extracted the preview.")
             heur_id = 1 if request.deep_scan or request.get_param('run_ocr') else None
-            [image_section.add_image(f"{self.working_directory}/{preview}",
-                                     name=f"page_{str(i).zfill(3)}.jpeg", description=f"Here's the preview for page {i}",
-                                     ocr_heuristic_id=heur_id)
-             for i, preview in enumerate(natsorted(previews))]
+            for i, preview in enumerate(natsorted(previews)):
+                ocr_io = tempfile.NamedTemporaryFile('w', delete=False) if save_ocr_output != 'no' else None
+                img_name = f"page_{str(i).zfill(3)}.jpeg"
+                image_section.add_image(f"{self.working_directory}/{preview}", name=img_name,
+                                        description=f"Here's the preview for page {i}",
+                                        ocr_heuristic_id=heur_id, ocr_io=ocr_io)
+                # Write OCR output as specified by submissions params
+                if save_ocr_output == 'no':
+                    continue
+                else:
+                    # Write content to disk to be uploaded
+                    if save_ocr_output == 'as_extracted':
+                        request.add_extracted(ocr_io.name, f'{img_name}_ocr_output',
+                                              description="OCR Output")
+                    elif save_ocr_output == 'as_supplementary':
+                        request.add_supplementary(ocr_io.name, f'{img_name}_ocr_output',
+                                                  description="OCR Output")
+                    else:
+                        self.log.warning(f'Unknown save method for OCR given: {save_ocr_output}')
 
             result.add_section(image_section)
         request.result = result

diff --git a/document_preview/helper/emlrender.py b/document_preview/helper/emlrender.py
@@ -27,6 +27,7 @@
 
 try:
     from PIL import Image
+    Image.MAX_IMAGE_PIXELS = 2147483647
 except:
     print('[ERROR] pillow module not installed ("pip install pillow")')
     sys.exit(1)

diff --git a/service_manifest.yml b/service_manifest.yml
@@ -3,7 +3,7 @@ version: $SERVICE_TAG
 description: Use OCR to detect for signs of malicious behaviour in Office and PDF files
 
 accepts: document/(pdf$|office/.*|email)
-rejects: empty|metadata/.*
+rejects: empty|metadata/.*|document/office/onenote
 
 stage: CORE
 category: Static Analysis
@@ -33,6 +33,22 @@ submission_params:
     value: false
     default: false
 
+  - name: save_ocr_output  # Whether and how the OCR output of each preview image is attached to the result
+    type: list
+    value: "no"
+    default: "no"  # "no": the service skips attaching OCR output
+    list: ['no', 'as_extracted', 'as_supplementary']  # as_extracted -> request.add_extracted(), as_supplementary -> request.add_supplementary()
+
+
+config:
+  # List of OCR terms to override defaults in service base for detection
+  # See: https://github.com/CybercentreCanada/assemblyline-v4-service/blob/master/assemblyline_v4_service/common/extractor/ocr.py
+  ocr:
+    banned: [] # Banned terms
+    macros: [] # Terms that indicate macros
+    ransomware: [] # Terms that indicate ransomware
+
+
 heuristics:
   - heur_id: 1
     name: OCR Detection found