dangerzone/tests/test_ocr.py
deeplow f676891482
Remove Dockerfile dependencies replaced by PyMuPDF
PyMuPDF replaced the need for almost all dependencies, which this commit
now removes.

We are also removing tesseract-ocr as a dependency since
(to our surprise) PyMuPDF ships directly with tesseract binaries [1].
However, now that tesseract-ocr is not available directly as a binary
tool, the `test_ocr.py` needed to be changed.

Fixes #658

[1]: https://github.com/freedomofpress/dangerzone/issues/658#issuecomment-1861033149
2024-01-03 12:58:36 +00:00

56 lines
2 KiB
Python

import platform
import subprocess
from pathlib import Path
import pytest
from dangerzone.isolation_provider.container import Container
from dangerzone.logic import DangerzoneCore
# TODO: Perform an equivalent test on Qubes.
# NOTE: We skip running this test on Windows/macOS, because our current CI cannot run
# Docker on these platforms. It's not a problem anyway, because the result should be
# the same on all container-based platforms.
@pytest.mark.skipif(platform.system() != "Linux", reason="Container-specific")
def test_ocr_ommisions() -> None:
    """Check that the container image ships exactly the OCR languages we offer.

    Lists the Tesseract ``*.traineddata`` files under ``/usr/share/tessdata/``
    inside the container image, derives a language code from each file name, and
    compares the resulting set with the values of ``DangerzoneCore.ocr_languages``.

    Raises:
        subprocess.CalledProcessError: If the container command exits with a
            non-zero status (``check=True``).
        AssertionError: If the installed and the offered language sets differ.
    """
    # Create the command that will list all the language data files installed in the
    # container image. The `tesseract` binary is not available as a standalone tool,
    # so we look for its trained-data files directly.
    runtime = Container.get_runtime()
    command = [
        runtime,
        "run",
        # Remove the container once the command exits, so that repeated test runs do
        # not pile up stopped containers.
        "--rm",
        Container.CONTAINER_NAME,
        "find",
        "/usr/share/tessdata/",
        "-name",
        "*.traineddata",
    ]

    # Run the command and collect its output, which is one file path per line, e.g.:
    #
    #     /usr/share/tessdata/eng.traineddata
    #
    result = subprocess.run(command, text=True, check=True, stdout=subprocess.PIPE)

    # Turn each path into its language code ("eng" for the example above). Blank
    # lines are skipped, so that empty output yields an empty set instead of a bogus
    # "" language.
    installed_langs = {
        Path(line.strip()).stem for line in result.stdout.splitlines() if line.strip()
    }

    # Remove the "osd" and "equ" languages from the list of installed languages, since
    # they are not an actual language. Read more in:
    # https://pyimagesearch.com/2021/11/15/tesseract-page-segmentation-modes-psms-explained-how-to-improve-your-ocr-accuracy/
    installed_langs -= {"osd", "equ"}

    # Grab the languages that Dangerzone offers to the user through the GUI/CLI.
    offered_langs = set(DangerzoneCore(Container(True)).ocr_languages.values())

    # Ensure that both the installed languages and the ones we offer to the user are the
    # same.
    assert installed_langs == offered_langs