mirror of
https://github.com/freedomofpress/dangerzone.git
synced 2025-04-28 18:02:38 +02:00
Fix OCR bug in Qubes Fedora 38 templates
Provide a fix for an OCR bug that affected Fedora 38 templates of Qubes OS. In that specific configuration, the PyMuPDF version accepts the Tesseract data directory only from the `TESSDATA_PREFIX` environment variable. Our mistake was that we set this environment variable only in a dev script, instead of setting it for all configurations. In this commit, we set an attribute in the `fitz.fitz` module instead, so that OCR works in both dev scripts and end-user installations. This is hacky, but it targets an old PyMuPDF release, so we don't expect things to break in the long run. Fixes #737
This commit is contained in:
parent
d35eb56b4b
commit
f75d471ec8
2 changed files with 26 additions and 6 deletions
|
@ -64,8 +64,32 @@ class PixelsToPDF(DangerzoneConverter):
|
||||||
tessdata=get_tessdata_dir(),
|
tessdata=get_tessdata_dir(),
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
# XXX method signature changed in v1.22.5 to add tessdata arg
|
# XXX: In PyMuPDF v1.22.5, the function signature of
|
||||||
# TODO remove after oldest distro has PyMuPDF >= v1.22.5
|
# `pdfocr_tobytes()` / `pdfocr_save()` was extended with an argument
|
||||||
|
# to explicitly set the Tesseract data dir [1].
|
||||||
|
#
|
||||||
|
# In earlier versions, the PyMuPDF developers recommend setting this
|
||||||
|
# path via the TESSDATA_PREFIX environment variable. In practice,
|
||||||
|
# this environment variable is read at import time, so subsequent
|
||||||
|
# changes to the environment variable are not tracked [2].
|
||||||
|
#
|
||||||
|
# To make things worse, any attempt to alter the internal attribute
|
||||||
|
# (`fitz.TESSDATA_PREFIX`) makes no difference either when using
|
||||||
|
# the OCR functions. That's due to the way imports work in `fitz`,
|
||||||
|
# where somehow the internal `fitz.fitz` module is shadowed.
|
||||||
|
#
|
||||||
|
# A hacky solution is to grab the `fitz.fitz` module from
|
||||||
|
# `sys.modules` and set the TESSDATA_PREFIX variable there. We can
|
||||||
|
# get away with this hack because we have a proper solution for
|
||||||
|
# subsequent PyMuPDF versions, and we know that nothing will change
|
||||||
|
# in older versions.
|
||||||
|
#
|
||||||
|
# TODO: Remove after oldest distro has PyMuPDF >= v1.22.5
|
||||||
|
#
|
||||||
|
# [1]: https://pymupdf.readthedocs.io/en/latest/pixmap.html#Pixmap.pdfocr_save
|
||||||
|
# [2]: https://github.com/pymupdf/PyMuPDF/blob/0368e56cfa6afb55bcf6c726e7f51a2a16a5ccba/fitz/fitz.i#L308
|
||||||
|
sys.modules["fitz.fitz"].TESSDATA_PREFIX = get_tessdata_dir() # type: ignore [attr-defined]
|
||||||
|
|
||||||
page_pdf_bytes = pixmap.pdfocr_tobytes(
|
page_pdf_bytes = pixmap.pdfocr_tobytes(
|
||||||
compress=True,
|
compress=True,
|
||||||
language=ocr_lang,
|
language=ocr_lang,
|
||||||
|
|
|
@ -4,10 +4,6 @@
|
||||||
import os
|
import os
|
||||||
import sys
|
import sys
|
||||||
|
|
||||||
# XXX workaround lack of tessdata path arg for PyMuPDF < v1.22.5
|
|
||||||
# for context see https://github.com/freedomofpress/dangerzone/issues/682
|
|
||||||
os.environ["TESSDATA_PREFIX"] = os.environ.get("TESSDATA_PREFIX", "/usr/share/tesseract/tessdata")
|
|
||||||
|
|
||||||
# Load dangerzone module and resources from the source code tree
|
# Load dangerzone module and resources from the source code tree
|
||||||
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
||||||
sys.dangerzone_dev = True
|
sys.dangerzone_dev = True
|
||||||
|
|
Loading…
Reference in a new issue