diff --git a/Dockerfile b/Dockerfile index 1713d58..38a3cf8 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,7 +1,7 @@ ARG branch=latest FROM cccs/assemblyline-v4-service-base:$branch -ENV SERVICE_PATH document_preview.DocumentPreview +ENV SERVICE_PATH document_preview.document_preview.DocumentPreview USER root @@ -9,7 +9,11 @@ RUN mkdir -p /usr/share/man/man1mkdir -p /usr/share/man/man1 RUN apt-get update && apt-get install -y wget RUN wget https://github.com/wkhtmltopdf/packaging/releases/download/0.12.6-1/wkhtmltox_0.12.6-1.buster_amd64.deb RUN apt-get install -y poppler-utils libreoffice ./wkhtmltox_0.12.6-1.buster_amd64.deb --no-install-recommends -RUN pip3 install pdf2image Pillow natsort imgkit compoundfiles compressed_rtf +RUN pip install pdf2image Pillow natsort imgkit compoundfiles compressed_rtf + +# Install Image/Science libraries for Python & Tesseract OCR engine/ Language plug-ins +RUN apt-get install -y libjpeg-dev zlib1g-dev imagemagick tesseract-ocr && rm -rf /var/lib/apt/lists/* +RUN pip install numpy scipy matplotlib pytesseract USER assemblyline diff --git a/document_preview.py b/document_preview.py deleted file mode 100644 index 8f9d8b6..0000000 --- a/document_preview.py +++ /dev/null @@ -1,91 +0,0 @@ -import os -import subprocess - -from natsort import natsorted -from pdf2image import convert_from_path - -from assemblyline_v4_service.common.base import ServiceBase -from assemblyline_v4_service.common.result import Heuristic, Result, ResultImageSection - -from helper.emlrender import processEml as eml2image -from helper.outlookmsgfile import load as msg2eml - - -class DocumentPreview(ServiceBase): - def __init__(self, config=None): - super(DocumentPreview, self).__init__(config) - - def start(self): - self.log.debug("Document preview service started") - - def stop(self): - self.log.debug("Document preview service ended") - - def libreoffice_conversion(self, file): - subprocess.check_output( - "libreoffice --headless --convert-to pdf --outdir " + self.working_directory + " " + file, shell=True) - - pdf_file = [s for s in os.listdir(self.working_directory) if ".pdf" in s][0] - - if pdf_file: - return (True, pdf_file) - else: - return False - - def pdf_to_images(self, file): - pages = convert_from_path(file) - - i = 0 - for page in pages: - page.save(self.working_directory + "/output_" + str(i) + ".jpeg") - i += 1 - - def execute(self, request): - result = Result() - - file = request.file_path - file_type = request.file_type - - # Word/Excel/Powerpoint - if any(file_type == f'document/office/{ms_product}' for ms_product in ['word', 'excel', 'powerpoint']): - converted = self.libreoffice_conversion(file) - if converted[0]: - self.pdf_to_images(self.working_directory + "/" + converted[1]) - # PDF - elif file_type == 'document/pdf': - self.pdf_to_images(file) - # EML/MSG - elif file_type.endswith('email'): - # Convert MSG to EML where applicable - file_contents = msg2eml(file).as_bytes() if file_type == 'document/office/email' else request.file_contents - - # Render EML as PNG - eml2image(file_contents, self.working_directory, self.log) - - # Attempt to preview unknown document format - else: - try: - converted = self.libreoffice_conversion(file) - if converted[0]: - self.pdf_to_images(self.working_directory + "/" + converted[1]) - except: - # Conversion not successfull - pass - - if any("output" in s for s in os.listdir(self.working_directory)): - image_section = ResultImageSection(request, "Successfully extracted the preview.") - - i = 0 - previews = [s for s in os.listdir(self.working_directory) if "output" in s] - for preview in natsorted(previews): - image_path = f"{self.working_directory}/{preview}" - title = f"preview_{i}.jpeg" - desc = f"Here's the preview for page {i}" - if request.get_param('analyze_output'): - request.add_extracted(image_path, title, desc) - image_section.add_image(image_path, title, desc) - i += 1 - - result.add_section(image_section) - - request.result = result diff --git a/document_preview/__pycache__/document_preview.cpython-39.pyc b/document_preview/__pycache__/document_preview.cpython-39.pyc new file mode 100644 index 0000000..c353039 Binary files /dev/null and b/document_preview/__pycache__/document_preview.cpython-39.pyc differ diff --git a/document_preview/document_preview.py b/document_preview/document_preview.py new file mode 100644 index 0000000..51e0339 --- /dev/null +++ b/document_preview/document_preview.py @@ -0,0 +1,140 @@ +import json +import os +import pytesseract +import re +import shutil +import subprocess + +from natsort import natsorted +from pdf2image import convert_from_path +from PIL import Image + +from assemblyline_v4_service.common.base import ServiceBase +from assemblyline_v4_service.common.result import BODY_FORMAT, Result, ResultImageSection, ResultSection, Heuristic + +from document_preview.helper.emlrender import processEml as eml2image +from document_preview.helper.outlookmsgfile import load as msg2eml + +# TODO: Would prefer this mapping to be dynamic from trusted sources (ie. import from library), but will copy-paste for now +INDICATORS_MAPPING = { + ('ransomware', 1): re.compile('|'.join([ + # https://github.com/cuckoosandbox/community/blob/master/modules/signatures/windows/ransomware_message.py + "your files", "your data", "your documents", "restore files", + "restore data", "restore the files", "restore the data", "recover files", + "recover data", "recover the files", "recover the data", "has been locked", + "pay fine", "pay a fine", "pay the fine", "decrypt", "encrypt", + "recover files", "recover data", "recover them", "recover your", + "recover personal", "bitcoin", "secret server", "secret internet server", + "install tor", "download tor", "tor browser", "tor gateway", + "tor-browser", "tor-gateway", "torbrowser", "torgateway", "torproject.org", + "ransom", "bootkit", "rootkit", "payment", "victim", "AES128", "AES256", + "AES 128", "AES 256", "AES-128", "AES-256", "RSA1024", "RSA2048", + "RSA4096", "RSA 1024", "RSA 2048", "RSA 4096", "RSA-1024", "RSA-2048", + "RSA-4096", "private key", "personal key", "your code", "private code", + "personal code", "enter code", "your key", "unique key" + ])), + ('macros', 2): re.compile('|'.join([ + # https://github.com/cuckoosandbox/community/blob/17d57d46ccbca0327a8299cb93abba8604b74df7/modules/signatures/windows/office_enablecontent_ocr.py + "enable macro", + "enable content", + "enable editing", + ])) +} + + +class DocumentPreview(ServiceBase): + def __init__(self, config=None): + super(DocumentPreview, self).__init__(config) + + def start(self): + self.log.debug("Document preview service started") + + def stop(self): + self.log.debug("Document preview service ended") + + def libreoffice_conversion(self, file): + subprocess.check_output( + "libreoffice --headless --convert-to pdf --outdir " + self.working_directory + " " + file, shell=True) + + pdf_file = [s for s in os.listdir(self.working_directory) if ".pdf" in s][0] + + if pdf_file: + return (True, pdf_file) + else: + return False + + def pdf_to_images(self, file): + pages = convert_from_path(file) + + i = 0 + for page in pages: + page.save(self.working_directory + "/output_" + str(i) + ".jpeg") + i += 1 + + def render_documents(self, file_type, file, file_contents): + # Word/Excel/Powerpoint + if any(file_type == f'document/office/{ms_product}' for ms_product in ['word', 'excel', 'powerpoint']): + converted = self.libreoffice_conversion(file) + if converted[0]: + self.pdf_to_images(self.working_directory + "/" + converted[1]) + # PDF + elif file_type == 'document/pdf': + self.pdf_to_images(file) + # EML/MSG + elif file_type.endswith('email'): + # Convert MSG to EML where applicable + file_contents = msg2eml(file).as_bytes() if file_type == 'document/office/email' else file_contents + + # Render EML as PNG + eml2image(file_contents, self.working_directory, self.log) + + # Images don't required to be rendered, however could still be useful for OCR analysis + elif file_type.startswith('image'): + shutil.move(file, os.path.join(self.working_directory, 'output_0')) + + def execute(self, request): + result = Result() + + # Attempt to render documents given and dump them to the working directory + self.render_documents(request.file_type, request.file_path, request.file_contents) + images = list() + + # Create an image gallery section to show the renderings + if any("output" in s for s in os.listdir(self.working_directory)): + image_section = ResultImageSection(request, "Successfully extracted the preview.") + + i = 0 + previews = [s for s in os.listdir(self.working_directory) if "output" in s] + for preview in natsorted(previews): + image_path = f"{self.working_directory}/{preview}" + images.append(image_path) + title = f"preview_{i}.jpeg" + desc = f"Here's the preview for page {i}" + image_section.add_image(image_path, title, desc) + i += 1 + + result.add_section(image_section) + + # Proceed with analysis of output images + for image_path in images: + ocr_output = '' + with Image.open(image_path) as img: + ocr_output = pytesseract.image_to_string(img) + parent = ResultSection(f'OCR Analyis on {os.path.basename(image_path)}') + for indicator, regex_exp in INDICATORS_MAPPING.items(): + search_results = regex_exp.findall(ocr_output) + regex_exp.findall(ocr_output.lower()) + if search_results: + self.log.info(f'Found {indicator[0]}') + body = { + term: [line for line in ocr_output.split('\n') + ocr_output.lower().split('\n') if term in line] + for term in set(search_results)} + ResultSection( + f'OCR Detection: {indicator[0]}', body=json.dumps(body), + body_format=BODY_FORMAT.JSON, + heuristic=Heuristic(heur_id=indicator[1], + frequency=len(search_results)), + parent=parent) + if parent.subsections: + result.add_section(parent) + + request.result = result diff --git a/document_preview/helper/__pycache__/emlrender.cpython-39.pyc b/document_preview/helper/__pycache__/emlrender.cpython-39.pyc new file mode 100644 index 0000000..413b6df Binary files /dev/null and b/document_preview/helper/__pycache__/emlrender.cpython-39.pyc differ diff --git a/document_preview/helper/__pycache__/outlookmsgfile.cpython-39.pyc b/document_preview/helper/__pycache__/outlookmsgfile.cpython-39.pyc new file mode 100644 index 0000000..fa39c03 Binary files /dev/null and b/document_preview/helper/__pycache__/outlookmsgfile.cpython-39.pyc differ diff --git a/helper/emlrender.py b/document_preview/helper/emlrender.py similarity index 100% rename from helper/emlrender.py rename to document_preview/helper/emlrender.py diff --git a/helper/outlookmsgfile.py b/document_preview/helper/outlookmsgfile.py similarity index 100% rename from helper/outlookmsgfile.py rename to document_preview/helper/outlookmsgfile.py diff --git a/service_manifest.yml b/service_manifest.yml index a3625d2..c0081a8 100644 --- a/service_manifest.yml +++ b/service_manifest.yml @@ -1,12 +1,12 @@ name: DocumentPreview version: $SERVICE_TAG -description: Automatically extract the first page of a document as an image +description: Use OCR to detect for signs of malicious behaviour -accepts: (document/pdf|document/office/.*) +accepts: (document/pdf|document/office/.*|image/.*) rejects: empty|metadata/.* stage: CORE -category: Extraction +category: Static Analysis file_required: true timeout: 60 @@ -16,11 +16,18 @@ enabled: true is_external: false licence_count: 0 -submission_params: - - default: false - value: false - type: bool - name: analyze_output +heuristics: + - heur_id: 1 + name: Potential Ransomware + filetype: "*" + score: 100 + description: Ransomware verbage found in OCR inspection. + - heur_id: 2 + name: Potential Macros + filetype: "*" + score: 100 + description: Macros verbage found in OCR inspection. + docker_config: image: cccs/assemblyline-service-document-preview:$SERVICE_TAG