mirror of
https://github.com/freedomofpress/dangerzone.git
synced 2025-04-29 18:22:37 +02:00
FIXUP: Remove stale code for PyMuPDF < 1.22.5
This commit is contained in:
parent
0d80cf1f0c
commit
f68721637c
2 changed files with 7 additions and 46 deletions
5
BUILD.md
5
BUILD.md
|
@ -268,10 +268,7 @@ test it.
|
||||||
cd dangerzone
|
cd dangerzone
|
||||||
```
|
```
|
||||||
|
|
||||||
2. Follow the Fedora instructions for setting up the development environment with the particularity of running the following instead of `poetry install`:
|
2. Follow the Fedora instructions for setting up the development environment.
|
||||||
```
|
|
||||||
poetry install
|
|
||||||
```
|
|
||||||
|
|
||||||
3. Build a dangerzone `.rpm` for qubes with the command
|
3. Build a dangerzone `.rpm` for qubes with the command
|
||||||
|
|
||||||
|
|
|
@ -125,48 +125,12 @@ class IsolationProvider(ABC):
|
||||||
document.mark_as_failed()
|
document.mark_as_failed()
|
||||||
|
|
||||||
def ocr_page(self, pixmap: fitz.Pixmap, ocr_lang: str) -> bytes:
|
def ocr_page(self, pixmap: fitz.Pixmap, ocr_lang: str) -> bytes:
|
||||||
"""Get a single page as pixels, OCR it, and return a PDF as bytes.
|
"""Get a single page as pixels, OCR it, and return a PDF as bytes."""
|
||||||
|
|
||||||
This operation is particularly tricky, since we have to handle various PyMuPDF
|
|
||||||
versions.
|
|
||||||
"""
|
|
||||||
if int(fitz.version[2]) >= 20230621000001:
|
|
||||||
return pixmap.pdfocr_tobytes(
|
return pixmap.pdfocr_tobytes(
|
||||||
compress=True,
|
compress=True,
|
||||||
language=ocr_lang,
|
language=ocr_lang,
|
||||||
tessdata=get_tessdata_dir(),
|
tessdata=get_tessdata_dir(),
|
||||||
)
|
)
|
||||||
else:
|
|
||||||
# XXX: In PyMuPDF v1.22.5, the function signature of
|
|
||||||
# `pdfocr_tobytes()` / `pdfocr_save()` was extended with an argument
|
|
||||||
# to explicitly set the Tesseract data dir [1].
|
|
||||||
#
|
|
||||||
# In earlier versions, the PyMuPDF developers recommend setting this
|
|
||||||
# path via the TESSDATA_PREFIX environment variable. In practice,
|
|
||||||
# this environment variable is read at import time, so subsequent
|
|
||||||
# changes to the environment variable are not tracked [2].
|
|
||||||
#
|
|
||||||
# To make things worse, any attempt to alter the internal attribute
|
|
||||||
# (`fitz.TESSDATA_PREFIX`) makes no difference as well, when using
|
|
||||||
# the OCR functions. That's due to the way imports work in `fitz`,
|
|
||||||
# where somehow the internal `fitz.fitz` module is shadowed.
|
|
||||||
#
|
|
||||||
# A hacky solution is to grab the `fitz.fitz` module from
|
|
||||||
# `sys.modules`, and set there the TESSDATA_PREFIX variable. We can
|
|
||||||
# get away with this hack because we have a proper solution for
|
|
||||||
# subsequent PyMuPDF versions, and we know that nothing will change
|
|
||||||
# in older versions.
|
|
||||||
#
|
|
||||||
# TODO: Remove after oldest distro has PyMuPDF >= v1.22.5
|
|
||||||
#
|
|
||||||
# [1]: https://pymupdf.readthedocs.io/en/latest/pixmap.html#Pixmap.pdfocr_save
|
|
||||||
# [2]: https://github.com/pymupdf/PyMuPDF/blob/0368e56cfa6afb55bcf6c726e7f51a2a16a5ccba/fitz/fitz.i#L308
|
|
||||||
sys.modules["fitz.fitz"].TESSDATA_PREFIX = get_tessdata_dir() # type: ignore [attr-defined]
|
|
||||||
|
|
||||||
return pixmap.pdfocr_tobytes(
|
|
||||||
compress=True,
|
|
||||||
language=ocr_lang,
|
|
||||||
)
|
|
||||||
|
|
||||||
def pixels_to_pdf_page(
|
def pixels_to_pdf_page(
|
||||||
self,
|
self,
|
||||||
|
|
Loading…
Reference in a new issue