add OCR analysis

CybercentreCanada · Mar 3, 2022 · a57ce37 · a57ce37
1 parent 6674fa2
commit a57ce37
Show file tree

Hide file tree

Showing 9 changed files with 161 additions and 101 deletions.
diff --git a/Dockerfile b/Dockerfile
@@ -1,15 +1,19 @@
 ARG branch=latest
 FROM cccs/assemblyline-v4-service-base:$branch
 
-ENV SERVICE_PATH document_preview.DocumentPreview
+ENV SERVICE_PATH document_preview.document_preview.DocumentPreview
 
 USER root
 
 RUN mkdir -p /usr/share/man/man1mkdir -p /usr/share/man/man1
 RUN apt-get update && apt-get install -y wget
 RUN wget https://github.com/wkhtmltopdf/packaging/releases/download/0.12.6-1/wkhtmltox_0.12.6-1.buster_amd64.deb
 RUN apt-get install -y poppler-utils libreoffice  ./wkhtmltox_0.12.6-1.buster_amd64.deb --no-install-recommends
-RUN pip3 install pdf2image Pillow natsort imgkit compoundfiles compressed_rtf
+RUN pip install pdf2image Pillow natsort imgkit compoundfiles compressed_rtf
+
+# Install Image/Science libraries for Python & Tesseract OCR engine/ Language plug-ins
+RUN apt-get install -y libjpeg-dev zlib1g-dev imagemagick tesseract-ocr && rm -rf /var/lib/apt/lists/*
+RUN pip install numpy scipy matplotlib pytesseract
 
 USER assemblyline
 

diff --git a/document_preview.py b/document_preview.py
diff --git a/document_preview/__pycache__/document_preview.cpython-39.pyc b/document_preview/__pycache__/document_preview.cpython-39.pyc
diff --git a/document_preview/document_preview.py b/document_preview/document_preview.py
@@ -0,0 +1,140 @@
+import json
+import os
+import pytesseract
+import re
+import shutil
+import subprocess
+
+from natsort import natsorted
+from pdf2image import convert_from_path
+from PIL import Image
+
+from assemblyline_v4_service.common.base import ServiceBase
+from assemblyline_v4_service.common.result import BODY_FORMAT, Result, ResultImageSection, ResultSection, Heuristic
+
+from document_preview.helper.emlrender import processEml as eml2image
+from document_preview.helper.outlookmsgfile import load as msg2eml
+
+# TODO: Would prefer this mapping to be dynamic from trusted sources (ie. import from library), but will copy-paste for now
+INDICATORS_MAPPING = {
+    ('ransomware', 1):   re.compile('|'.join([
+        # https://github.com/cuckoosandbox/community/blob/master/modules/signatures/windows/ransomware_message.py
+        "your files", "your data", "your documents", "restore files",
+        "restore data", "restore the files", "restore the data", "recover files",
+        "recover data", "recover the files", "recover the data", "has been locked",
+        "pay fine", "pay a fine", "pay the fine", "decrypt", "encrypt",
+        "recover files", "recover data", "recover them", "recover your",
+        "recover personal", "bitcoin", "secret server", "secret internet server",
+        "install tor", "download tor", "tor browser", "tor gateway",
+        "tor-browser", "tor-gateway", "torbrowser", "torgateway", "torproject.org",
+        "ransom", "bootkit", "rootkit", "payment", "victim", "AES128", "AES256",
+        "AES 128", "AES 256", "AES-128", "AES-256", "RSA1024", "RSA2048",
+        "RSA4096", "RSA 1024", "RSA 2048", "RSA 4096", "RSA-1024", "RSA-2048",
+        "RSA-4096", "private key", "personal key", "your code", "private code",
+        "personal code", "enter code", "your key", "unique key"
+    ])),
+    ('macros', 2): re.compile('|'.join([
+        # https://github.com/cuckoosandbox/community/blob/17d57d46ccbca0327a8299cb93abba8604b74df7/modules/signatures/windows/office_enablecontent_ocr.py
+        "enable macro",
+        "enable content",
+        "enable editing",
+    ]))
+}
+
+
+class DocumentPreview(ServiceBase):
+    def __init__(self, config=None):
+        super(DocumentPreview, self).__init__(config)
+
+    def start(self):
+        self.log.debug("Document preview service started")
+
+    def stop(self):
+        self.log.debug("Document preview service ended")
+
+    def libreoffice_conversion(self, file):
+        subprocess.check_output(
+            "libreoffice --headless --convert-to pdf --outdir " + self.working_directory + " " + file, shell=True)
+
+        pdf_file = [s for s in os.listdir(self.working_directory) if ".pdf" in s][0]
+
+        if pdf_file:
+            return (True, pdf_file)
+        else:
+            return False
+
+    def pdf_to_images(self, file):
+        pages = convert_from_path(file)
+
+        i = 0
+        for page in pages:
+            page.save(self.working_directory + "/output_" + str(i) + ".jpeg")
+            i += 1
+
+    def render_documents(self, file_type, file, file_contents):
+        # Word/Excel/Powerpoint
+        if any(file_type == f'document/office/{ms_product}' for ms_product in ['word', 'excel', 'powerpoint']):
+            converted = self.libreoffice_conversion(file)
+            if converted[0]:
+                self.pdf_to_images(self.working_directory + "/" + converted[1])
+        # PDF
+        elif file_type == 'document/pdf':
+            self.pdf_to_images(file)
+        # EML/MSG
+        elif file_type.endswith('email'):
+            # Convert MSG to EML where applicable
+            file_contents = msg2eml(file).as_bytes() if file_type == 'document/office/email' else file_contents
+
+            # Render EML as PNG
+            eml2image(file_contents, self.working_directory, self.log)
+
+        # Images don't required to be rendered, however could still be useful for OCR analysis
+        elif file_type.startswith('image'):
+            shutil.move(file, os.path.join(self.working_directory, 'output_0'))
+
+    def execute(self, request):
+        result = Result()
+
+        # Attempt to render documents given and dump them to the working directory
+        self.render_documents(request.file_type, request.file_path, request.file_contents)
+        images = list()
+
+        # Create an image gallery section to show the renderings
+        if any("output" in s for s in os.listdir(self.working_directory)):
+            image_section = ResultImageSection(request, "Successfully extracted the preview.")
+
+            i = 0
+            previews = [s for s in os.listdir(self.working_directory) if "output" in s]
+            for preview in natsorted(previews):
+                image_path = f"{self.working_directory}/{preview}"
+                images.append(image_path)
+                title = f"preview_{i}.jpeg"
+                desc = f"Here's the preview for page {i}"
+                image_section.add_image(image_path, title, desc)
+                i += 1
+
+            result.add_section(image_section)
+
+        # Proceed with analysis of output images
+        for image_path in images:
+            ocr_output = ''
+            with Image.open(image_path) as img:
+                ocr_output = pytesseract.image_to_string(img)
+            parent = ResultSection(f'OCR Analyis on {os.path.basename(image_path)}')
+            for indicator, regex_exp in INDICATORS_MAPPING.items():
+                search_results = regex_exp.findall(ocr_output) + regex_exp.findall(ocr_output.lower())
+                if search_results:
+                    self.log.info(f'Found {indicator[0]}')
+                    body = {
+                        term: [line for line in ocr_output.split('\n') + ocr_output.lower().split('\n') if term in line]
+                        for term in set(search_results)}
+                    ResultSection(
+                        f'OCR Detection: {indicator[0]}', body=json.dumps(body),
+                        body_format=BODY_FORMAT.JSON,
+                        heuristic=Heuristic(heur_id=indicator[1],
+                                            frequency=len(search_results)),
+                        parent=parent)
+            if parent.subsections:
+                result.add_section(parent)
+
+        request.result = result
diff --git a/document_preview/helper/__pycache__/emlrender.cpython-39.pyc b/document_preview/helper/__pycache__/emlrender.cpython-39.pyc
diff --git a/document_preview/helper/__pycache__/outlookmsgfile.cpython-39.pyc b/document_preview/helper/__pycache__/outlookmsgfile.cpython-39.pyc
diff --git a/helper/emlrender.py → document_preview/helper/emlrender.py b/helper/emlrender.py → document_preview/helper/emlrender.py
diff --git a/helper/outlookmsgfile.py → document_preview/helper/outlookmsgfile.py b/helper/outlookmsgfile.py → document_preview/helper/outlookmsgfile.py
diff --git a/service_manifest.yml b/service_manifest.yml
@@ -1,12 +1,12 @@
 name: DocumentPreview
 version: $SERVICE_TAG
-description: Automatically extract the first page of a document as an image
+description: Use OCR to detect for signs of malicious behaviour
 
-accepts: (document/pdf|document/office/.*)
+accepts: (document/pdf|document/office/.*|image/.*)
 rejects: empty|metadata/.*
 
 stage: CORE
-category: Extraction
+category: Static Analysis
 
 file_required: true
 timeout: 60
@@ -16,11 +16,18 @@ enabled: true
 is_external: false
 licence_count: 0
 
-submission_params:
-  - default: false
-    value: false
-    type: bool
-    name: analyze_output
+heuristics:
+  - heur_id: 1
+    name: Potential Ransomware
+    filetype: "*"
+    score: 100
+    description: Ransomware verbage found in OCR inspection.
+  - heur_id: 2
+    name: Potential Macros
+    filetype: "*"
+    score: 100
+    description: Macros verbage found in OCR inspection.
+
 
 docker_config:
   image: cccs/assemblyline-service-document-preview:$SERVICE_TAG