def ocr_page(self, pixmap: fitz.Pixmap, ocr_lang: str) -> bytes:
    """Get a single page as pixels, OCR it, and return a PDF as bytes.

    This operation is particularly tricky, since we have to handle various PyMuPDF
    versions.

    Args:
        pixmap: The page pixels to run OCR on.
        ocr_lang: The language handed to PyMuPDF's OCR call.

    Returns:
        The bytes returned by `pixmap.pdfocr_tobytes()`.
    """
    # NOTE(review): the threshold below is presumably the build timestamp of
    # PyMuPDF v1.22.5, the release that added the `tessdata` argument - confirm.
    if int(fitz.version[2]) >= 20230621000001:
        # This PyMuPDF version lets us pass the Tesseract data dir explicitly.
        return pixmap.pdfocr_tobytes(
            compress=True,
            language=ocr_lang,
            tessdata=get_tessdata_dir(),
        )
    else:
        # XXX: In PyMuPDF v1.22.5, the function signature of
        # `pdfocr_tobytes()` / `pdfocr_save()` was extended with an argument
        # to explicitly set the Tesseract data dir [1].
        #
        # In earlier versions, the PyMuPDF developers recommend setting this
        # path via the TESSDATA_PREFIX environment variable. In practice,
        # this environment variable is read at import time, so subsequent
        # changes to the environment variable are not tracked [2].
        #
        # To make things worse, any attempt to alter the internal attribute
        # (`fitz.TESSDATA_PREFIX`) makes no difference as well, when using
        # the OCR functions. That's due to the way imports work in `fitz`,
        # where somehow the internal `fitz.fitz` module is shadowed.
        #
        # A hacky solution is to grab the `fitz.fitz` module from
        # `sys.modules`, and set there the TESSDATA_PREFIX variable. We can
        # get away with this hack because we have a proper solution for
        # subsequent PyMuPDF versions, and we know that nothing will change
        # in older versions.
        #
        # TODO: Remove after oldest distro has PyMuPDF >= v1.22.5
        #
        # [1]: https://pymupdf.readthedocs.io/en/latest/pixmap.html#Pixmap.pdfocr_save
        # [2]: https://github.com/pymupdf/PyMuPDF/blob/0368e56cfa6afb55bcf6c726e7f51a2a16a5ccba/fitz/fitz.i#L308
        sys.modules["fitz.fitz"].TESSDATA_PREFIX = get_tessdata_dir()  # type: ignore [attr-defined]

        # Do NOT pass `tessdata` here: this branch only runs on versions that
        # predate that argument, so the keyword would be rejected with a
        # TypeError. The data dir is conveyed via TESSDATA_PREFIX instead.
        return pixmap.pdfocr_tobytes(
            compress=True,
            language=ocr_lang,
        )