diff --git a/dangerzone/util.py b/dangerzone/util.py index dc188f2..5f25da0 100644 --- a/dangerzone/util.py +++ b/dangerzone/util.py @@ -3,6 +3,7 @@ import platform import string import subprocess import sys +import unicodedata from typing import Optional import appdirs @@ -67,8 +68,27 @@ def get_subprocess_startupinfo(): # type: ignore [no-untyped-def] def replace_control_chars(untrusted_str: str) -> str: """Remove control characters from string. Protects a terminal emulator - from obcure control characters""" + from obscure control characters. + + Control characters are replaced by � U+FFFD Replacement Character. + """ + + def is_safe(chr: str) -> bool: + """Return whether Unicode character is safe to print in a terminal + emulator, based on its General Category. + + The following General Category values are considered unsafe: + + * C* - all control character categories (Cc, Cf, Cs, Co, Cn) + * Zl - U+2028 LINE SEPARATOR only + * Zp - U+2029 PARAGRAPH SEPARATOR only + """ + categ = unicodedata.category(chr) + if categ.startswith("C") or categ in ("Zl", "Zp"): + return False + return True + sanitized_str = "" for char in untrusted_str: - sanitized_str += char if char in string.printable else "_" + sanitized_str += char if is_safe(char) else "�" return sanitized_str diff --git a/tests/__init__.py b/tests/__init__.py index 70e7c21..8648c63 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -114,8 +114,9 @@ def uncommon_text() -> str: * A Unicode control character that is not part of ASCII: zero-width joiner (U+200D) * An emoji: Cross Mark (U+274C) + * A surrogate escape used to decode an invalid UTF-8 sequence 0xF0 (U+DCF0) """ - return "\033[31;1;4m BaD TeΧt \u200d ❌ \033[0m" + return "\033[31;1;4m BaD TeΧt \u200d ❌ \udcf0 \033[0m" @pytest.fixture @@ -136,5 +137,9 @@ def uncommon_filename(uncommon_text: str) -> str: @pytest.fixture def sanitized_text() -> str: - """Return a sanitized version of the uncommon_text.""" - return "_[31;1;4m BaD Te_t _ _ _[0m" + 
"""Return a sanitized version of the uncommon_text. + + Take the uncommon text string and replace all the control/invalid characters with + "�". The rest of the characters (emojis and non-English letters) are retained as is. + """ + return "�[31;1;4m BaD TeΧt � ❌ � �[0m"