dangerzone/dangerzone/util.py
Alex Pyrgiotis 4a48a2551b
Drop Ubuntu 20.04 (Focal) support
Drop Ubuntu 20.04 (Focal) support, because it's nearing its end-of-life
date. By doing so, we can remove several workarounds and notices we had
in place for this version, and most importantly, remove the pin to our
vendored PyMuPDF package.

Refs #1018
Refs #965
2025-03-17 15:40:25 +02:00

133 lines
4.6 KiB
Python
Raw Blame History

import pathlib
import platform
import subprocess
import sys
import traceback
import unicodedata
try:
import platformdirs
except ImportError:
import appdirs as platformdirs
def get_config_dir() -> str:
return platformdirs.user_config_dir("dangerzone")
def get_resource_path(filename: str) -> str:
if getattr(sys, "dangerzone_dev", False):
# Look for resources directory relative to python file
project_root = pathlib.Path(__file__).parent.parent
prefix = project_root / "share"
else:
if platform.system() == "Darwin":
bin_path = pathlib.Path(sys.executable)
app_path = bin_path.parent.parent
prefix = app_path / "Resources" / "share"
elif platform.system() == "Linux":
prefix = pathlib.Path(sys.prefix) / "share" / "dangerzone"
elif platform.system() == "Windows":
exe_path = pathlib.Path(sys.executable)
dz_install_path = exe_path.parent
prefix = dz_install_path / "share"
else:
raise NotImplementedError(f"Unsupported system {platform.system()}")
resource_path = prefix / filename
return str(resource_path)
def get_tessdata_dir() -> pathlib.Path:
if getattr(sys, "dangerzone_dev", False) or platform.system() in (
"Windows",
"Darwin",
):
# Always use the tessdata path from the Dangerzone ./share directory, for
# development builds, or in Windows/macOS platforms.
return pathlib.Path(get_resource_path("tessdata"))
# In case of Linux systems, grab the Tesseract data from any of the following
# locations. We have found some of the locations through trial and error, whereas
# others are taken from the docs:
#
# [...] Possibilities are /usr/share/tesseract-ocr/tessdata or
# /usr/share/tessdata or /usr/share/tesseract-ocr/4.00/tessdata. [1]
#
# [1] https://tesseract-ocr.github.io/tessdoc/Installation.html
tessdata_dirs = [
pathlib.Path("/usr/share/tessdata/"), # on some Debian
pathlib.Path("/usr/share/tesseract/tessdata/"), # on Fedora
pathlib.Path("/usr/share/tesseract-ocr/tessdata/"), # ? (documented)
pathlib.Path("/usr/share/tesseract-ocr/4.00/tessdata/"), # on Debian Bullseye
pathlib.Path("/usr/share/tesseract-ocr/5/tessdata/"), # on Debian Trixie
]
for dir in tessdata_dirs:
if dir.is_dir():
return dir
raise RuntimeError("Tesseract language data are not installed in the system")
def get_version() -> str:
try:
with open(get_resource_path("version.txt")) as f:
version = f.read().strip()
except FileNotFoundError:
# In dev mode, in Windows, get_resource_path doesn't work properly for the container, but luckily
# it doesn't need to know the version
version = "unknown"
return version
def get_subprocess_startupinfo(): # type: ignore [no-untyped-def]
if platform.system() == "Windows":
startupinfo = subprocess.STARTUPINFO()
startupinfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW
return startupinfo
else:
return None
def replace_control_chars(untrusted_str: str, keep_newlines: bool = False) -> str:
"""Remove control characters from string. Protects a terminal emulator
from obscure control characters.
Control characters are replaced by <20> U+FFFD Replacement Character.
If a user wants to keep the newline character (e.g., because they are sanitizing a
multi-line text), they must pass `keep_newlines=True`.
"""
def is_safe(chr: str) -> bool:
"""Return whether Unicode character is safe to print in a terminal
emulator, based on its General Category.
The following General Category values are considered unsafe:
* C* - all control character categories (Cc, Cf, Cs, Co, Cn)
* Zl - U+2028 LINE SEPARATOR only
* Zp - U+2029 PARAGRAPH SEPARATOR only
"""
categ = unicodedata.category(chr)
if categ.startswith("C") or categ in ("Zl", "Zp"):
return False
return True
sanitized_str = ""
for char in untrusted_str:
if (keep_newlines and char == "\n") or is_safe(char):
sanitized_str += char
else:
sanitized_str += "<EFBFBD>"
return sanitized_str
def format_exception(e: Exception) -> str:
# The signature of traceback.format_exception has changed in python 3.10
if sys.version_info < (3, 10):
output = traceback.format_exception(*sys.exc_info())
else:
output = traceback.format_exception(e)
return "".join(output)