From f75d471ec8ddb7d795c751a9dbf876ac7b175931 Mon Sep 17 00:00:00 2001 From: Alex Pyrgiotis Date: Mon, 4 Mar 2024 16:41:55 +0200 Subject: [PATCH] Fix OCR bug in Qubes Fedora 38 templates Provide a fix for an OCR bug that affected Fedora 38 templates of Qubes OS. In that specific configuration, the PyMuPDF version accepts the Tesseract data directory only from the `TESSDATA_PREFIX` environment variable. Our mistake was that we were setting this environment variable in a dev script, instead of setting it for all configurations. In this commit, we set an attribute in the fitz.fitz module, so that both dev scripts and end-user installations can work. This is hacky, but it targets an old PyMuPDF release after all, so we don't expect things to break in the long run. Fixes #737 --- dangerzone/conversion/pixels_to_pdf.py | 28 ++++++++++++++++++++++++-- dev_scripts/dangerzone | 4 ---- 2 files changed, 26 insertions(+), 6 deletions(-) diff --git a/dangerzone/conversion/pixels_to_pdf.py b/dangerzone/conversion/pixels_to_pdf.py index 0243858..42884a0 100644 --- a/dangerzone/conversion/pixels_to_pdf.py +++ b/dangerzone/conversion/pixels_to_pdf.py @@ -64,8 +64,32 @@ class PixelsToPDF(DangerzoneConverter): tessdata=get_tessdata_dir(), ) else: - # XXX method signature changed in v1.22.5 to add tessdata arg - # TODO remove after oldest distro has PyMuPDF >= v1.22.5 + # XXX: In PyMuPDF v1.22.5, the function signature of + # `pdfocr_tobytes()` / `pdfocr_save()` was extended with an argument + # to explicitly set the Tesseract data dir [1]. + # + # In earlier versions, the PyMuPDF developers recommend setting this + # path via the TESSDATA_PREFIX environment variable. In practice, + # this environment variable is read at import time, so subsequent + # changes to the environment variable are not tracked [2]. + # + # To make things worse, any attempt to alter the internal attribute + # (`fitz.TESSDATA_PREFIX`) makes no difference as well, when using + # the OCR functions. 
That's due to the way imports work in `fitz`, + # where somehow the internal `fitz.fitz` module is shadowed. + # + # A hacky solution is to grab the `fitz.fitz` module from + # `sys.modules`, and set the TESSDATA_PREFIX variable there. We can + # get away with this hack because we have a proper solution for + # subsequent PyMuPDF versions, and we know that nothing will change + # in older versions. + # + # TODO: Remove after oldest distro has PyMuPDF >= v1.22.5 + # + # [1]: https://pymupdf.readthedocs.io/en/latest/pixmap.html#Pixmap.pdfocr_save + # [2]: https://github.com/pymupdf/PyMuPDF/blob/0368e56cfa6afb55bcf6c726e7f51a2a16a5ccba/fitz/fitz.i#L308 + sys.modules["fitz.fitz"].TESSDATA_PREFIX = get_tessdata_dir() # type: ignore [attr-defined] + page_pdf_bytes = pixmap.pdfocr_tobytes( compress=True, language=ocr_lang, diff --git a/dev_scripts/dangerzone b/dev_scripts/dangerzone index ba2ad48..09fe82f 100755 --- a/dev_scripts/dangerzone +++ b/dev_scripts/dangerzone @@ -4,10 +4,6 @@ import os import sys -# XXX workaround lack of tessdata path arg for PyMuPDF < v1.22.5 -# for context see https://github.com/freedomofpress/dangerzone/issues/682 -os.environ["TESSDATA_PREFIX"] = os.environ.get("TESSDATA_PREFIX", "/usr/share/tesseract/tessdata") - # Load dangerzone module and resources from the source code tree sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) sys.dangerzone_dev = True