From fc977da9640deea68116a23ea1da1fe7233137a1 Mon Sep 17 00:00:00 2001 From: Alex Pyrgiotis Date: Thu, 14 Mar 2024 10:33:24 +0200 Subject: [PATCH] Add new way to detect tessdata dir Add a new way to detect where the Tesseract data are stored on a user's system. On Linux, the Tesseract data should be installed via the package manager. On macOS and Windows, they should be bundled with the Dangerzone application. The exception is running Dangerzone locally (a development build): in that case, even on Linux, we should get the Tesseract data from the Dangerzone share/ folder. --- dangerzone/util.py | 21 +++++++++++++++++ tests/test_ocr.py | 57 +++++++++------------------------------------- 2 files changed, 32 insertions(+), 46 deletions(-) diff --git a/dangerzone/util.py b/dangerzone/util.py index 311288c..bbedfdb 100644 --- a/dangerzone/util.py +++ b/dangerzone/util.py @@ -1,3 +1,4 @@ +import os import pathlib import platform import subprocess @@ -45,6 +46,26 @@ def get_resource_path(filename: str) -> str: return str(resource_path) +def get_tessdata_dir() -> str: + if ( + getattr(sys, "dangerzone_dev", False) + or platform.system() == "Windows" + or platform.system() == "Darwin" + ): + # Always use the tessdata path from the Dangerzone ./share directory, for + # development builds, or in Windows/macOS platforms. 
+ return get_resource_path("tessdata") + + fedora_tessdata_dir = "/usr/share/tesseract/tessdata/" + debian_tessdata_dir = "/usr/share/tessdata/" + if os.path.isdir(fedora_tessdata_dir): + return fedora_tessdata_dir + if os.path.isdir(debian_tessdata_dir): + return debian_tessdata_dir + else: + raise RuntimeError("Tesseract language data are not installed in the system") + + def get_version() -> str: try: with open(get_resource_path("version.txt")) as f: diff --git a/tests/test_ocr.py b/tests/test_ocr.py index 29d50f8..1335df4 100644 --- a/tests/test_ocr.py +++ b/tests/test_ocr.py @@ -1,59 +1,24 @@ +import pathlib import platform import subprocess from pathlib import Path import pytest -from dangerzone.isolation_provider.container import Container -from dangerzone.isolation_provider.qubes import is_qubes_native_conversion +from dangerzone.isolation_provider.dummy import Dummy from dangerzone.logic import DangerzoneCore +from dangerzone.util import get_tessdata_dir -# TODO: Perform an equivalent test on Qubes. -# NOTE: We skip running this test on Windows/MacOS, because our current CI cannot run -# Docker in these platforms. It's not a problem anyways, because the result should be -# the same in all container-based platforms. -@pytest.mark.skipif( - platform.system() != "Linux" or is_qubes_native_conversion(), - reason="Container-specific", -) -def test_ocr_omissions() -> None: - # Create the command that will list all the installed languages in the container - # image. - command = [Container.get_runtime(), "run"] - command += Container.get_runtime_security_args() - command += [ - Container.CONTAINER_NAME, - "find", - "/usr/share/tessdata/", - "-name", - "*.traineddata", - ] - - # Run the command, strip any extra whitespace, and remove the following first line - # from the result: - # - # List of available languages in "/usr/share/tessdata/" ... 
- installed_langs_filenames = ( - subprocess.run(command, text=True, check=True, stdout=subprocess.PIPE) - .stdout.strip() - .split("\n") - ) - installed_langs = set( - [ - Path(filename).name.split(".traineddata")[0] - for filename in installed_langs_filenames - ] - ) - - # Remove the "osd" and "equ" languages from the list of installed languages, since - # they are not an actual language. Read more in: - # https://pyimagesearch.com/2021/11/15/tesseract-page-segmentation-modes-psms-explained-how-to-improve-your-ocr-accuracy/ - installed_langs -= {"osd", "equ"} +def test_ocr_ommisions() -> None: + # Grab the languages that are available in the Tesseract data dir. + tessdata_dir = pathlib.Path(get_tessdata_dir()) + suffix_len = len(".traineddata") + available_langs = {f.name[:-suffix_len] for f in tessdata_dir.iterdir()} # Grab the languages that Dangerzone offers to the user through the GUI/CLI. - offered_langs = set(DangerzoneCore(Container()).ocr_languages.values()) + offered_langs = set(DangerzoneCore(Dummy()).ocr_languages.values()) - # Ensure that both the installed languages and the ones we offer to the user are the + # Ensure that both the available languages and the ones we offer to the user are the # same. - assert installed_langs == offered_langs + assert available_langs == offered_langs