diff --git a/dangerzone/isolation_provider/base.py b/dangerzone/isolation_provider/base.py
index a855f34..8ad64f1 100644
--- a/dangerzone/isolation_provider/base.py
+++ b/dangerzone/isolation_provider/base.py
@@ -126,7 +126,7 @@ class IsolationProvider(ABC):
         return pixmap.pdfocr_tobytes(
             compress=True,
             language=ocr_lang,
-            tessdata=get_tessdata_dir(),
+            tessdata=str(get_tessdata_dir()),
         )
 
     def pixels_to_pdf_page(
diff --git a/dangerzone/util.py b/dangerzone/util.py
index ef86664..200f25c 100644
--- a/dangerzone/util.py
+++ b/dangerzone/util.py
@@ -1,4 +1,3 @@
-import os
 import pathlib
 import platform
 import subprocess
@@ -34,23 +33,25 @@ def get_resource_path(filename: str) -> str:
     return str(resource_path)
 
 
-def get_tessdata_dir() -> str:
+def get_tessdata_dir() -> pathlib.Path:
     if getattr(sys, "dangerzone_dev", False) or platform.system() in (
         "Windows",
         "Darwin",
     ):
         # Always use the tessdata path from the Dangerzone ./share directory, for
         # development builds, or in Windows/macOS platforms.
-        return get_resource_path("tessdata")
+        return pathlib.Path(get_resource_path("tessdata"))
 
-    fedora_tessdata_dir = "/usr/share/tesseract/tessdata/"
-    debian_tessdata_dir = "/usr/share/tessdata/"
-    if os.path.isdir(fedora_tessdata_dir):
-        return fedora_tessdata_dir
-    if os.path.isdir(debian_tessdata_dir):
-        return debian_tessdata_dir
-    else:
-        raise RuntimeError("Tesseract language data are not installed in the system")
+    tessdata_dirs = [
+        pathlib.Path("/usr/share/tessdata/"),  # on debian
+        pathlib.Path("/usr/share/tesseract/tessdata/"),  # on fedora
+    ]
+
+    for candidate in tessdata_dirs:
+        if candidate.is_dir():
+            return candidate
+
+    raise RuntimeError("Tesseract language data are not installed in the system")
 
 
 def get_version() -> str:
diff --git a/tests/test_ocr.py b/tests/test_ocr.py
index 1335df4..2b8836f 100644
--- a/tests/test_ocr.py
+++ b/tests/test_ocr.py
@@ -1,10 +1,3 @@
-import pathlib
-import platform
-import subprocess
-from pathlib import Path
-
-import pytest
-
 from dangerzone.isolation_provider.dummy import Dummy
 from dangerzone.logic import DangerzoneCore
 from dangerzone.util import get_tessdata_dir
@@ -12,9 +5,8 @@ from dangerzone.util import get_tessdata_dir
 
 def test_ocr_ommisions() -> None:
     # Grab the languages that are available in the Tesseract data dir.
-    tessdata_dir = pathlib.Path(get_tessdata_dir())
     suffix_len = len(".traineddata")
-    available_langs = {f.name[:-suffix_len] for f in tessdata_dir.iterdir()}
+    available_langs = {f.name[:-suffix_len] for f in get_tessdata_dir().iterdir()}
 
     # Grab the languages that Dangerzone offers to the user through the GUI/CLI.
     offered_langs = set(DangerzoneCore(Dummy()).ocr_languages.values())