mirror of
https://github.com/freedomofpress/dangerzone.git
synced 2025-04-28 18:02:38 +02:00
Fix OCR on Qubes: PyMuPDF required TESSDATA_PREFIX
PyMuPDF versions 1.22.5 and higher accept the tesseract data path as an argument to `pixmap.pdfocr_tobytes()` [1], but lower versions require setting the TESSDATA_PREFIX environment variable instead [2]. Because on Qubes the pixels-to-PDF conversion happens on the host, and Qubes ships a lower PyMuPDF package version, we need to pass the path via the environment variable there. NOTE: the TESSDATA_PREFIX environment variable is set in dangerzone-cli, rather than closer to the calling method in `doc_to_pixels.py`, because PyMuPDF reads this variable as soon as the fitz module is imported [3][4]. [1]: https://pymupdf.readthedocs.io/en/latest/pixmap.html#Pixmap.pdfocr_tobytes [2]: https://pymupdf.readthedocs.io/en/latest/installation.html#enabling-integrated-ocr-support [3]: https://github.com/pymupdf/PyMuPDF/discussions/2439 [4]: https://github.com/pymupdf/PyMuPDF/blob/5d6a7db/src/__init__.py#L159 Fixes #682
This commit is contained in:
parent
d1afe4c30a
commit
6006beeb03
2 changed files with 18 additions and 6 deletions
|
@ -57,11 +57,19 @@ class PixelsToPDF(DangerzoneConverter):
|
||||||
self.update_progress(
|
self.update_progress(
|
||||||
f"Converting page {page_num}/{num_pages} from pixels to searchable PDF"
|
f"Converting page {page_num}/{num_pages} from pixels to searchable PDF"
|
||||||
)
|
)
|
||||||
page_pdf_bytes = pixmap.pdfocr_tobytes(
|
if int(fitz.version[2]) >= 20230621000001:
|
||||||
compress=True,
|
page_pdf_bytes = pixmap.pdfocr_tobytes(
|
||||||
language=ocr_lang,
|
compress=True,
|
||||||
tessdata=get_tessdata_dir(),
|
language=ocr_lang,
|
||||||
)
|
tessdata=get_tessdata_dir(),
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
# XXX method signature changed in v1.22.5 to add tessdata arg
|
||||||
|
# TODO remove after oldest distro has PyMuPDF >= v1.22.5
|
||||||
|
page_pdf_bytes = pixmap.pdfocr_tobytes(
|
||||||
|
compress=True,
|
||||||
|
language=ocr_lang,
|
||||||
|
)
|
||||||
ocr_pdf = fitz.open("pdf", page_pdf_bytes)
|
ocr_pdf = fitz.open("pdf", page_pdf_bytes)
|
||||||
else: # Don't OCR
|
else: # Don't OCR
|
||||||
self.update_progress(
|
self.update_progress(
|
||||||
|
|
|
@ -1,10 +1,14 @@
|
||||||
#!/usr/bin/env python3
|
#!/usr/bin/env python3
|
||||||
# -*- coding: utf-8 -*-
|
# -*- coding: utf-8 -*-
|
||||||
|
|
||||||
# Load dangerzone module and resources from the source code tree
|
|
||||||
import os
|
import os
|
||||||
import sys
|
import sys
|
||||||
|
|
||||||
|
# XXX workaround lack of tessdata path arg for PyMuPDF < v1.22.5
|
||||||
|
# for context see https://github.com/freedomofpress/dangerzone/issues/682
|
||||||
|
os.environ["TESSDATA_PREFIX"] = os.environ.get("TESSDATA_PREFIX", "/usr/share/tesseract/tessdata")
|
||||||
|
|
||||||
|
# Load dangerzone module and resources from the source code tree
|
||||||
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
||||||
sys.dangerzone_dev = True
|
sys.dangerzone_dev = True
|
||||||
|
|
||||||
|
|
Loading…
Reference in a new issue