From 6006beeb037fadf6432c71467dac4b528a93522c Mon Sep 17 00:00:00 2001 From: deeplow Date: Wed, 24 Jan 2024 13:50:22 +0000 Subject: [PATCH] Fix OCR on Qubes: PyMuPDF required TESSDATA_PREFIX PyMuPDF versions 1.22.5 and higher take the tesseract data path as an argument to `pixmap.pdfocr_tobytes()` [1], but lower versions instead require setting the TESSDATA_PREFIX environment variable [2]. Because on Qubes the pixels to pdf conversion happens on the host and Qubes has a lower PyMuPDF package version, we need to pass it via the environment variable instead. NOTE: the TESSDATA_PREFIX env. variable was set in dangerzone-cli instead of closer to the calling method in `doc_to_pixels.py` since PyMuPDF reads this variable as soon as the fitz module is imported [3][4]. [1]: https://pymupdf.readthedocs.io/en/latest/pixmap.html#Pixmap.pdfocr_tobytes [2]: https://pymupdf.readthedocs.io/en/latest/installation.html#enabling-integrated-ocr-support [3]: https://github.com/pymupdf/PyMuPDF/discussions/2439 [4]: https://github.com/pymupdf/PyMuPDF/blob/5d6a7db/src/__init__.py#L159 Fixes #682 --- dangerzone/conversion/pixels_to_pdf.py | 18 +++++++++++++----- dev_scripts/dangerzone | 6 +++++- 2 files changed, 18 insertions(+), 6 deletions(-) diff --git a/dangerzone/conversion/pixels_to_pdf.py b/dangerzone/conversion/pixels_to_pdf.py index 362a769..0243858 100644 --- a/dangerzone/conversion/pixels_to_pdf.py +++ b/dangerzone/conversion/pixels_to_pdf.py @@ -57,11 +57,19 @@ class PixelsToPDF(DangerzoneConverter): self.update_progress( f"Converting page {page_num}/{num_pages} from pixels to searchable PDF" ) - page_pdf_bytes = pixmap.pdfocr_tobytes( - compress=True, - language=ocr_lang, - tessdata=get_tessdata_dir(), - ) + if int(fitz.version[2]) >= 20230621000001: + page_pdf_bytes = pixmap.pdfocr_tobytes( + compress=True, + language=ocr_lang, + tessdata=get_tessdata_dir(), + ) + else: + # XXX method signature changed in v1.22.5 to add tessdata arg + # TODO remove after oldest distro has PyMuPDF 
>= v1.22.5 + page_pdf_bytes = pixmap.pdfocr_tobytes( + compress=True, + language=ocr_lang, + ) ocr_pdf = fitz.open("pdf", page_pdf_bytes) else: # Don't OCR self.update_progress( diff --git a/dev_scripts/dangerzone b/dev_scripts/dangerzone index ab3fe70..ba2ad48 100755 --- a/dev_scripts/dangerzone +++ b/dev_scripts/dangerzone @@ -1,10 +1,14 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- -# Load dangerzone module and resources from the source code tree import os import sys +# XXX workaround lack of tessdata path arg for PyMuPDF < v1.22.5 +# for context see https://github.com/freedomofpress/dangerzone/issues/682 +os.environ["TESSDATA_PREFIX"] = os.environ.get("TESSDATA_PREFIX", "/usr/share/tesseract/tessdata") + +# Load dangerzone module and resources from the source code tree sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) sys.dangerzone_dev = True