mirror of
https://github.com/freedomofpress/dangerzone.git
synced 2025-04-28 18:02:38 +02:00
119 lines
4.2 KiB
Python
119 lines
4.2 KiB
Python
import pathlib
|
||
import platform
|
||
import subprocess
|
||
import sys
|
||
import unicodedata
|
||
|
||
import appdirs
|
||
|
||
|
||
def get_config_dir() -> str:
|
||
return appdirs.user_config_dir("dangerzone")
|
||
|
||
|
||
def get_resource_path(filename: str) -> str:
|
||
if getattr(sys, "dangerzone_dev", False):
|
||
# Look for resources directory relative to python file
|
||
project_root = pathlib.Path(__file__).parent.parent
|
||
prefix = project_root / "share"
|
||
else:
|
||
if platform.system() == "Darwin":
|
||
bin_path = pathlib.Path(sys.executable)
|
||
app_path = bin_path.parent.parent
|
||
prefix = app_path / "Resources" / "share"
|
||
elif platform.system() == "Linux":
|
||
prefix = pathlib.Path(sys.prefix) / "share" / "dangerzone"
|
||
elif platform.system() == "Windows":
|
||
exe_path = pathlib.Path(sys.executable)
|
||
dz_install_path = exe_path.parent
|
||
prefix = dz_install_path / "share"
|
||
else:
|
||
raise NotImplementedError(f"Unsupported system {platform.system()}")
|
||
resource_path = prefix / filename
|
||
return str(resource_path)
|
||
|
||
|
||
def get_tessdata_dir() -> pathlib.Path:
|
||
if getattr(sys, "dangerzone_dev", False) or platform.system() in (
|
||
"Windows",
|
||
"Darwin",
|
||
):
|
||
# Always use the tessdata path from the Dangerzone ./share directory, for
|
||
# development builds, or in Windows/macOS platforms.
|
||
return pathlib.Path(get_resource_path("tessdata"))
|
||
|
||
# In case of Linux systems, grab the Tesseract data from any of the following
|
||
# locations. We have found some of the locations through trial and error, whereas
|
||
# others are taken from the docs:
|
||
#
|
||
# [...] Possibilities are /usr/share/tesseract-ocr/tessdata or
|
||
# /usr/share/tessdata or /usr/share/tesseract-ocr/4.00/tessdata. [1]
|
||
#
|
||
# [1] https://tesseract-ocr.github.io/tessdoc/Installation.html
|
||
tessdata_dirs = [
|
||
pathlib.Path("/usr/share/tessdata/"), # on some Debian
|
||
pathlib.Path("/usr/share/tesseract/tessdata/"), # on Fedora
|
||
pathlib.Path("/usr/share/tesseract-ocr/tessdata/"), # ? (documented)
|
||
pathlib.Path("/usr/share/tesseract-ocr/4.00/tessdata/"), # on Ubuntu Focal
|
||
pathlib.Path("/usr/share/tesseract-ocr/5/tessdata/"), # on Debian Trixie
|
||
]
|
||
|
||
for dir in tessdata_dirs:
|
||
if dir.is_dir():
|
||
return dir
|
||
|
||
raise RuntimeError("Tesseract language data are not installed in the system")
|
||
|
||
|
||
def get_version() -> str:
|
||
try:
|
||
with open(get_resource_path("version.txt")) as f:
|
||
version = f.read().strip()
|
||
except FileNotFoundError:
|
||
# In dev mode, in Windows, get_resource_path doesn't work properly for the container, but luckily
|
||
# it doesn't need to know the version
|
||
version = "unknown"
|
||
return version
|
||
|
||
|
||
def get_subprocess_startupinfo(): # type: ignore [no-untyped-def]
|
||
if platform.system() == "Windows":
|
||
startupinfo = subprocess.STARTUPINFO()
|
||
startupinfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW
|
||
return startupinfo
|
||
else:
|
||
return None
|
||
|
||
|
||
def replace_control_chars(untrusted_str: str, keep_newlines: bool = False) -> str:
|
||
"""Remove control characters from string. Protects a terminal emulator
|
||
from obscure control characters.
|
||
|
||
Control characters are replaced by <20> U+FFFD Replacement Character.
|
||
|
||
If a user wants to keep the newline character (e.g., because they are sanitizing a
|
||
multi-line text), they must pass `keep_newlines=True`.
|
||
"""
|
||
|
||
def is_safe(chr: str) -> bool:
|
||
"""Return whether Unicode character is safe to print in a terminal
|
||
emulator, based on its General Category.
|
||
|
||
The following General Category values are considered unsafe:
|
||
|
||
* C* - all control character categories (Cc, Cf, Cs, Co, Cn)
|
||
* Zl - U+2028 LINE SEPARATOR only
|
||
* Zp - U+2029 PARAGRAPH SEPARATOR only
|
||
"""
|
||
categ = unicodedata.category(chr)
|
||
if categ.startswith("C") or categ in ("Zl", "Zp"):
|
||
return False
|
||
return True
|
||
|
||
sanitized_str = ""
|
||
for char in untrusted_str:
|
||
if (keep_newlines and char == "\n") or is_safe(char):
|
||
sanitized_str += char
|
||
else:
|
||
sanitized_str += "<EFBFBD>"
|
||
return sanitized_str
|