diff --git a/dangerzone/conversion/pixels_to_pdf.py b/dangerzone/conversion/pixels_to_pdf.py index 0243858d8..42884a0a0 100644 --- a/dangerzone/conversion/pixels_to_pdf.py +++ b/dangerzone/conversion/pixels_to_pdf.py @@ -64,8 +64,32 @@ async def convert( tessdata=get_tessdata_dir(), ) else: - # XXX method signature changed in v1.22.5 to add tessdata arg - # TODO remove after oldest distro has PyMuPDF >= v1.22.5 + # XXX: In PyMuPDF v1.22.5, the function signature of + # `pdfocr_tobytes()` / `pdfocr_save()` was extended with an argument + # to explicitly set the Tesseract data dir [1]. + # + # In earlier versions, the PyMuPDF developers recommend setting this + # path via the TESSDATA_PREFIX environment variable. In practice, + # this environment variable is read at import time, so subsequent + # changes to the environment variable are not tracked [2]. + # + # To make things worse, any attempt to alter the internal attribute + # (`fitz.TESSDATA_PREFIX`) makes no difference either when using + # the OCR functions. That's due to the way imports work in `fitz`, + # where somehow the internal `fitz.fitz` module is shadowed. + # + # A hacky solution is to grab the `fitz.fitz` module from + # `sys.modules`, and set the TESSDATA_PREFIX variable there. We can + # get away with this hack because we have a proper solution for + # subsequent PyMuPDF versions, and we know that nothing will change + # in older versions. 
+ # + # TODO: Remove after oldest distro has PyMuPDF >= v1.22.5 + # + # [1]: https://pymupdf.readthedocs.io/en/latest/pixmap.html#Pixmap.pdfocr_save + # [2]: https://github.com/pymupdf/PyMuPDF/blob/0368e56cfa6afb55bcf6c726e7f51a2a16a5ccba/fitz/fitz.i#L308 + sys.modules["fitz.fitz"].TESSDATA_PREFIX = get_tessdata_dir() # type: ignore [attr-defined] + page_pdf_bytes = pixmap.pdfocr_tobytes( compress=True, language=ocr_lang, diff --git a/dev_scripts/dangerzone b/dev_scripts/dangerzone index ba2ad48c2..09fe82f5e 100755 --- a/dev_scripts/dangerzone +++ b/dev_scripts/dangerzone @@ -4,10 +4,6 @@ import os import sys -# XXX workaround lack of tessdata path arg for PyMuPDF < v1.22.5 -# for context see https://github.com/freedomofpress/dangerzone/issues/682 -os.environ["TESSDATA_PREFIX"] = os.environ.get("TESSDATA_PREFIX", "/usr/share/tesseract/tessdata") - # Load dangerzone module and resources from the source code tree sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) sys.dangerzone_dev = True