FIXUP: Remove stale code for PyMuPDF < 1.22.5

2025-04-29 18:22:37 +02:00 · 2024-09-11 14:24:28 +03:00 · 2024-09-11 14:24:28 +03:00 · f68721637c
commit f68721637c
parent 0d80cf1f0c
2 changed files with 7 additions and 46 deletions
--- a/BUILD.md
+++ b/BUILD.md
@@ -268,10 +268,7 @@ test it.
   cd dangerzone
   ```
-2. Follow the Fedora instructions for setting up the development environment with the particularity of running the following instead of `poetry install`:
+2. Follow the Fedora instructions for setting up the development environment.
   ```
   poetry install
   ```
 3. Build a dangerzone `.rpm` for qubes with the command
--- a/dangerzone/isolation_provider/base.py
+++ b/dangerzone/isolation_provider/base.py
@@ -125,48 +125,12 @@ class IsolationProvider(ABC):
            document.mark_as_failed()
    def ocr_page(self, pixmap: fitz.Pixmap, ocr_lang: str) -> bytes:
-        """Get a single page as pixels, OCR it, and return a PDF as bytes.
+        """Get a single page as pixels, OCR it, and return a PDF as bytes."""
        This operation is particularly tricky, since we have to handle various PyMuPDF
        versions.
        """
        if int(fitz.version[2]) >= 20230621000001:
        return pixmap.pdfocr_tobytes(
            compress=True,
            language=ocr_lang,
            tessdata=get_tessdata_dir(),
        )
        else:
            # XXX: In PyMuPDF v1.22.5, the function signature of
            # `pdfocr_tobytes()` / `pdfocr_save()` was extended with an argument
            # to explicitly set the Tesseract data dir [1].
            #
            # In earlier versions, the PyMuPDF developers recommend setting this
            # path via the TESSDATA_PREFIX environment variable. In practice,
            # this environment variable is read at import time, so subsequent
            # changes to the environment variable are not tracked [2].
            #
            # To make things worse, any attempt to alter the internal attribute
            # (`fitz.TESSDATA_PREFIX`) makes no difference as well, when using
            # the OCR functions. That's due to the way imports work in `fitz`,
            # where somehow the internal `fitz.fitz` module is shadowed.
            #
            # A hacky solution is to grab the `fitz.fitz` module from
            # `sys.modules`, and set there the TESSDATA_PREFIX variable. We can
            # get away with this hack because we have a proper solution for
            # subsequent PyMuPDF versions, and we know that nothing will change
            # in older versions.
            #
            # TODO: Remove after oldest distro has PyMuPDF >= v1.22.5
            #
            # [1]: https://pymupdf.readthedocs.io/en/latest/pixmap.html#Pixmap.pdfocr_save
            # [2]: https://github.com/pymupdf/PyMuPDF/blob/0368e56cfa6afb55bcf6c726e7f51a2a16a5ccba/fitz/fitz.i#L308
            sys.modules["fitz.fitz"].TESSDATA_PREFIX = get_tessdata_dir()  # type: ignore [attr-defined]
            return pixmap.pdfocr_tobytes(
                compress=True,
                language=ocr_lang,
            )
    def pixels_to_pdf_page(
        self,