def get_tessdata_dir() -> str:
    """Return the directory that holds the Tesseract language data.

    Development builds (``sys.dangerzone_dev`` set to a truthy value) and
    Windows/macOS platforms always use the ``tessdata`` resource resolved by
    ``get_resource_path``. On any other platform, the first existing system
    directory out of a fixed candidate list is returned.

    Raises:
        RuntimeError: If none of the candidate system directories exists.
    """
    use_bundled_data = getattr(sys, "dangerzone_dev", False) or (
        platform.system() in ("Windows", "Darwin")
    )
    if use_bundled_data:
        return get_resource_path("tessdata")

    # Candidates are probed in order; the first one that exists wins.
    system_candidates = (
        "/usr/share/tesseract/tessdata/",  # Fedora layout
        "/usr/share/tessdata/",  # Debian layout
    )
    for candidate in system_candidates:
        if os.path.isdir(candidate):
            return candidate

    raise RuntimeError("Tesseract language data are not installed in the system")
def test_ocr_ommisions() -> None:
    """Check that the offered OCR languages match the available Tesseract data.

    The set of language names found in the Tesseract data directory must be
    equal to the set of languages that Dangerzone offers to the user.
    """
    # Grab the languages that are available in the Tesseract data dir. Only
    # "*.traineddata" entries are considered, so that subdirectories or other
    # auxiliary files in that directory are not mistaken for languages.
    tessdata_dir = pathlib.Path(get_tessdata_dir())
    available_langs = {f.stem for f in tessdata_dir.glob("*.traineddata")}

    # Grab the languages that Dangerzone offers to the user through the GUI/CLI.
    offered_langs = set(DangerzoneCore(Dummy()).ocr_languages.values())

    # Ensure that both the available languages and the ones we offer to the user are the
    # same.
    assert available_langs == offered_langs