Add new way to detect tessdata dir

Add a new way to detect where the Tesseract data is stored on the user's
system. On Linux, the Tesseract data should be installed via the package
manager. On macOS and Windows, it should be bundled with the
Dangerzone application.

The one exception is running Dangerzone from a local development
checkout: in that case, even on Linux, we should get the Tesseract data
from the Dangerzone share/ folder.
This commit is contained in:
Alex Pyrgiotis 2024-03-14 10:33:24 +02:00
parent 9d2b2b2a47
commit fc977da964
No known key found for this signature in database
GPG key ID: B6C15EBA0357C9AA
2 changed files with 32 additions and 46 deletions

View file

@ -1,3 +1,4 @@
import os
import pathlib import pathlib
import platform import platform
import subprocess import subprocess
@ -45,6 +46,26 @@ def get_resource_path(filename: str) -> str:
return str(resource_path) return str(resource_path)
def get_tessdata_dir() -> str:
    """Return the directory that holds the Tesseract language data.

    Development builds (``sys.dangerzone_dev`` set to a truthy value) and
    Windows/macOS platforms always use the ``tessdata`` resource resolved by
    ``get_resource_path``. On any other platform, the first existing directory
    from a list of well-known system locations is returned.

    Returns:
        The path of the Tesseract data directory, as a string.

    Raises:
        RuntimeError: If none of the known system locations exists.
    """
    if getattr(sys, "dangerzone_dev", False) or platform.system() in (
        "Windows",
        "Darwin",
    ):
        # Always use the tessdata path from the Dangerzone ./share directory, for
        # development builds, or in Windows/macOS platforms.
        return get_resource_path("tessdata")

    # Candidate locations, in lookup order. The first two are the ones this
    # function has always checked (same order, so existing results do not
    # change). The rest are the additional locations that the Tesseract
    # installation docs list, plus the versioned layout of Tesseract 5 packages.
    system_tessdata_dirs = (
        "/usr/share/tesseract/tessdata/",  # Fedora
        "/usr/share/tessdata/",  # Debian
        "/usr/share/tesseract-ocr/tessdata/",
        "/usr/share/tesseract-ocr/4.00/tessdata/",
        "/usr/share/tesseract-ocr/5/tessdata/",
    )
    for tessdata_dir in system_tessdata_dirs:
        if os.path.isdir(tessdata_dir):
            return tessdata_dir

    raise RuntimeError("Tesseract language data are not installed in the system")
def get_version() -> str: def get_version() -> str:
try: try:
with open(get_resource_path("version.txt")) as f: with open(get_resource_path("version.txt")) as f:

View file

@ -1,59 +1,24 @@
import pathlib
import platform import platform
import subprocess import subprocess
from pathlib import Path from pathlib import Path
import pytest import pytest
from dangerzone.isolation_provider.container import Container from dangerzone.isolation_provider.dummy import Dummy
from dangerzone.isolation_provider.qubes import is_qubes_native_conversion
from dangerzone.logic import DangerzoneCore from dangerzone.logic import DangerzoneCore
from dangerzone.util import get_tessdata_dir
# TODO: Perform an equivalent test on Qubes. def test_ocr_ommisions() -> None:
# NOTE: We skip running this test on Windows/MacOS, because our current CI cannot run # Grab the languages that are available in the Tesseract data dir.
# Docker in these platforms. It's not a problem anyways, because the result should be tessdata_dir = pathlib.Path(get_tessdata_dir())
# the same in all container-based platforms. suffix_len = len(".traineddata")
@pytest.mark.skipif( available_langs = {f.name[:-suffix_len] for f in tessdata_dir.iterdir()}
platform.system() != "Linux" or is_qubes_native_conversion(),
reason="Container-specific",
)
def test_ocr_omissions() -> None:
# Create the command that will list all the installed languages in the container
# image.
command = [Container.get_runtime(), "run"]
command += Container.get_runtime_security_args()
command += [
Container.CONTAINER_NAME,
"find",
"/usr/share/tessdata/",
"-name",
"*.traineddata",
]
# Run the command, strip any extra whitespace, and remove the following first line
# from the result:
#
# List of available languages in "/usr/share/tessdata/" ...
installed_langs_filenames = (
subprocess.run(command, text=True, check=True, stdout=subprocess.PIPE)
.stdout.strip()
.split("\n")
)
installed_langs = set(
[
Path(filename).name.split(".traineddata")[0]
for filename in installed_langs_filenames
]
)
# Remove the "osd" and "equ" languages from the list of installed languages, since
# they are not an actual language. Read more in:
# https://pyimagesearch.com/2021/11/15/tesseract-page-segmentation-modes-psms-explained-how-to-improve-your-ocr-accuracy/
installed_langs -= {"osd", "equ"}
# Grab the languages that Dangerzone offers to the user through the GUI/CLI. # Grab the languages that Dangerzone offers to the user through the GUI/CLI.
offered_langs = set(DangerzoneCore(Container()).ocr_languages.values()) offered_langs = set(DangerzoneCore(Dummy()).ocr_languages.values())
# Ensure that both the installed languages and the ones we offer to the user are the # Ensure that both the available languages and the ones we offer to the user are the
# same. # same.
assert installed_langs == offered_langs assert available_langs == offered_langs