From 6006beeb037fadf6432c71467dac4b528a93522c Mon Sep 17 00:00:00 2001
From: deeplow <deeplower@protonmail.com>
Date: Wed, 24 Jan 2024 13:50:22 +0000
Subject: [PATCH] Fix OCR on Qubes: PyMuPDF required TESSDATA_PREFIX

PyMuPDF versions 1.22.5 and higher accept the tesseract data path as
an argument to `pixmap.pdfocr_tobytes()` [1], but lower versions require
setting the TESSDATA_PREFIX environment variable instead [2].

Because on Qubes the pixels-to-PDF conversion happens on the host and
Qubes has a lower PyMuPDF package version, we need to pass the path via
the environment variable instead.

NOTE: the TESSDATA_PREFIX environment variable is set in dangerzone-cli
rather than closer to the calling method in `pixels_to_pdf.py`, because
PyMuPDF reads this variable as soon as the fitz module is imported
[3][4].

[1]: https://pymupdf.readthedocs.io/en/latest/pixmap.html#Pixmap.pdfocr_tobytes
[2]: https://pymupdf.readthedocs.io/en/latest/installation.html#enabling-integrated-ocr-support
[3]: https://github.com/pymupdf/PyMuPDF/discussions/2439
[4]: https://github.com/pymupdf/PyMuPDF/blob/5d6a7db/src/__init__.py#L159

Fixes #682
---
 dangerzone/conversion/pixels_to_pdf.py | 18 +++++++++++++-----
 dev_scripts/dangerzone                 |  6 +++++-
 2 files changed, 18 insertions(+), 6 deletions(-)

diff --git a/dangerzone/conversion/pixels_to_pdf.py b/dangerzone/conversion/pixels_to_pdf.py
index 362a769..0243858 100644
--- a/dangerzone/conversion/pixels_to_pdf.py
+++ b/dangerzone/conversion/pixels_to_pdf.py
@@ -57,11 +57,19 @@ class PixelsToPDF(DangerzoneConverter):
                 self.update_progress(
                     f"Converting page {page_num}/{num_pages} from pixels to searchable PDF"
                 )
-                page_pdf_bytes = pixmap.pdfocr_tobytes(
-                    compress=True,
-                    language=ocr_lang,
-                    tessdata=get_tessdata_dir(),
-                )
+                if int(fitz.version[2]) >= 20230621000001:
+                    page_pdf_bytes = pixmap.pdfocr_tobytes(
+                        compress=True,
+                        language=ocr_lang,
+                        tessdata=get_tessdata_dir(),
+                    )
+                else:
+                    # XXX method signature changed in v1.22.5 to add tessdata arg
+                    # TODO remove after oldest distro has PyMuPDF >= v1.22.5
+                    page_pdf_bytes = pixmap.pdfocr_tobytes(
+                        compress=True,
+                        language=ocr_lang,
+                    )
                 ocr_pdf = fitz.open("pdf", page_pdf_bytes)
             else:  # Don't OCR
                 self.update_progress(
diff --git a/dev_scripts/dangerzone b/dev_scripts/dangerzone
index ab3fe70..ba2ad48 100755
--- a/dev_scripts/dangerzone
+++ b/dev_scripts/dangerzone
@@ -1,10 +1,14 @@
 #!/usr/bin/env python3
 # -*- coding: utf-8 -*-
 
-# Load dangerzone module and resources from the source code tree
 import os
 import sys
 
+# XXX workaround lack of tessdata path arg for PyMuPDF < v1.22.5
+# for context see https://github.com/freedomofpress/dangerzone/issues/682
+os.environ["TESSDATA_PREFIX"] = os.environ.get("TESSDATA_PREFIX", "/usr/share/tesseract/tessdata")
+
+# Load dangerzone module and resources from the source code tree
 sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
 sys.dangerzone_dev = True