diff --git a/BUILD.md b/BUILD.md index ba70ecb..c88e466 100644 --- a/BUILD.md +++ b/BUILD.md @@ -268,10 +268,7 @@ test it. cd dangerzone ``` -2. Follow the Fedora instructions for setting up the development environment with the particularity of running the following instead of `poetry install`: - ``` - poetry install - ``` +2. Follow the Fedora instructions for setting up the development environment. 3. Build a dangerzone `.rpm` for qubes with the command diff --git a/dangerzone/isolation_provider/base.py b/dangerzone/isolation_provider/base.py index 693b4c7..d5816a8 100644 --- a/dangerzone/isolation_provider/base.py +++ b/dangerzone/isolation_provider/base.py @@ -125,48 +125,12 @@ class IsolationProvider(ABC): document.mark_as_failed() def ocr_page(self, pixmap: fitz.Pixmap, ocr_lang: str) -> bytes: - """Get a single page as pixels, OCR it, and return a PDF as bytes. - - This operation is particularly tricky, since we have to handle various PyMuPDF - versions. - """ - if int(fitz.version[2]) >= 20230621000001: - return pixmap.pdfocr_tobytes( - compress=True, - language=ocr_lang, - tessdata=get_tessdata_dir(), - ) - else: - # XXX: In PyMuPDF v1.22.5, the function signature of - # `pdfocr_tobytes()` / `pdfocr_save()` was extended with an argument - # to explicitly set the Tesseract data dir [1]. - # - # In earlier versions, the PyMuPDF developers recommend setting this - # path via the TESSDATA_PREFIX environment variable. In practice, - # this environment variable is read at import time, so subsequent - # changes to the environment variable are not tracked [2]. - # - # To make things worse, any attempt to alter the internal attribute - # (`fitz.TESSDATA_PREFIX`) makes no difference as well, when using - # the OCR functions. That's due to the way imports work in `fitz`, - # where somehow the internal `fitz.fitz` module is shadowed. - # - # A hacky solution is to grab the `fitz.fitz` module from - # `sys.modules`, and set there the TESSDATA_PREFIX variable. We can - # get away with this hack because we have a proper solution for - # subsequent PyMuPDF versions, and we know that nothing will change - # in older versions. - # - # TODO: Remove after oldest distro has PyMuPDF >= v1.22.5 - # - # [1]: https://pymupdf.readthedocs.io/en/latest/pixmap.html#Pixmap.pdfocr_save - # [2]: https://github.com/pymupdf/PyMuPDF/blob/0368e56cfa6afb55bcf6c726e7f51a2a16a5ccba/fitz/fitz.i#L308 - sys.modules["fitz.fitz"].TESSDATA_PREFIX = get_tessdata_dir() # type: ignore [attr-defined] - - return pixmap.pdfocr_tobytes( - compress=True, - language=ocr_lang, - ) + """Get a single page as pixels, OCR it, and return a PDF as bytes.""" + return pixmap.pdfocr_tobytes( + compress=True, + language=ocr_lang, + tessdata=get_tessdata_dir(), + ) def pixels_to_pdf_page( self,