Skip to content

Commit

Permalink
Merge pull request #80 from CybercentreCanada/AL-2766
Browse files Browse the repository at this point in the history
Al 2766
  • Loading branch information
cccs-rs authored Nov 8, 2023
2 parents 206479c + 8c6a055 commit 0578544
Show file tree
Hide file tree
Showing 2 changed files with 19 additions and 8 deletions.
2 changes: 1 addition & 1 deletion Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ RUN apt-get install -y tesseract-ocr libemail-outlook-message-perl libgdiplus un
RUN wget https://github.com/wkhtmltopdf/packaging/releases/download/0.12.6-1/wkhtmltox_0.12.6-1.buster_amd64.deb
RUN apt-get install -y poppler-utils ./wkhtmltox_0.12.6-1.buster_amd64.deb --no-install-recommends &&\
rm -f ./wkhtmltox_0.12.6-1.buster_amd64.deb
RUN pip install pdf2image Pillow==9.5.0 natsort imgkit compoundfiles compressed_rtf pytesseract
RUN pip install Pillow==9.5.0 natsort imgkit compoundfiles compressed_rtf pytesseract

# Install Chrome for headless rendering of HTML documents
RUN wget https://dl.google.com/linux/direct/google-chrome-stable_current_amd64.deb && \
Expand Down
25 changes: 18 additions & 7 deletions document_preview/document_preview.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,11 +7,27 @@
from assemblyline_v4_service.common.request import ServiceRequest as Request
from assemblyline_v4_service.common.result import Heuristic, Result, ResultImageSection, ResultTextSection
from natsort import natsorted
from pdf2image import convert_from_path, pdfinfo_from_path

from document_preview.helper.emlrender import processEml as eml2image


def pdfinfo_from_path(fp: str):
    """Return the metadata that the ``pdfinfo`` command prints for a PDF.

    ``pdfinfo`` is run on *fp* and each ``Key: value`` line of its standard
    output becomes one dictionary entry.

    Args:
        fp: Path of the PDF file handed to ``pdfinfo``.

    Returns:
        A dict mapping each reported key to its value (both ``str``), with the
        padding after the colon removed. The dict is empty when the command
        prints nothing on stdout (for example because it failed).
    """
    completed = subprocess.run(["pdfinfo", fp], capture_output=True)
    # Undecodable bytes are replaced rather than raising, so one odd metadata
    # value cannot prevent the remaining fields from being read.
    output = completed.stdout.strip().decode(errors="replace")

    pdfinfo = {}
    # Split on "\n" only: str.splitlines() would also break on characters such
    # as form feed or U+2028 that may legitimately appear inside a value.
    for info in output.split("\n"):
        # Only the first colon separates key from value; values such as
        # timestamps contain further colons.
        key, separator, value = info.partition(":")
        if not separator:
            # Blank output or a line that is not in "Key: value" form.
            continue
        # Clean up the column-alignment spacing that follows the colon.
        pdfinfo[key] = value.lstrip()
    return pdfinfo


def convert_from_path(fp: str, output_directory: str, first_page=1, last_page=None):
    """Render pages of a PDF to JPEG files with the ``pdftoppm`` command.

    The output prefix passed to ``pdftoppm`` is ``<output_directory>/output``,
    so the generated files land in *output_directory* with names starting with
    ``output``.

    Args:
        fp: Path of the PDF file to render.
        output_directory: Directory that receives the rendered images.
        first_page: First page to render (passed as ``-f``).
        last_page: Last page to render (passed as ``-l``); when falsy, no
            ``-l`` option is given.

    Returns:
        None. The command's output and exit status are captured and discarded.
    """
    # subprocess only accepts str/bytes/path-like arguments, so page numbers
    # supplied as ints (including the default first_page=1) must be converted.
    pdf_conv_command = ["pdftoppm", "-jpeg", "-f", str(first_page)]
    if last_page:
        pdf_conv_command += ["-l", str(last_page)]
    subprocess.run(pdf_conv_command + [fp, os.path.join(output_directory, "output")], capture_output=True)


class DocumentPreview(ServiceBase):
def __init__(self, config=None):
super(DocumentPreview, self).__init__(config)
Expand Down Expand Up @@ -48,12 +64,7 @@ def office_conversion(self, file, orientation="portrait", page_range_end=2):
return (False, None)

def pdf_to_images(self, file, max_pages=None):
    """Render the pages of a PDF into this service's working directory.

    Delegates to ``convert_from_path``, starting at page 1 and stopping at
    *max_pages* when one is given.

    Args:
        file: Path of the PDF to render.
        max_pages: Last page to render; ``None`` places no upper bound.
    """
    # The earlier pdf2image flow (iterate returned pages and save each one)
    # no longer applies: convert_from_path now needs the output directory,
    # writes the images itself and returns nothing to iterate over.
    convert_from_path(file, self.working_directory, first_page=1, last_page=max_pages)

def render_documents(self, request: Request, max_pages=1):
# Word/Excel/Powerpoint/RTF
Expand Down

0 comments on commit 0578544

Please sign in to comment.