Relax the restrictions of util.replace_control_chars

The `util.replace_control_chars()` function was overly strict, and would replace every non-ASCII character with "_". This included both control characters, as well as normal characters in a non-English alphabet. Relax these restrictions by checking each character and deciding if it's a Unicode control character, using the `unicodedata` Python package. With this change, emojis and non-English letters are now allowed.
2025-04-28 18:02:38 +02:00 · 2024-04-25 13:31:08 +03:00 · 2024-04-25 13:31:08 +03:00 · 52ced04507
commit 52ced04507
parent 2fa592eb69
2 changed files with 30 additions and 5 deletions
--- a/dangerzone/util.py
+++ b/dangerzone/util.py
@ -3,6 +3,7 @@ import platform
 import string
 import subprocess
 import sys
 import unicodedata
 from typing import Optional
 import appdirs
@ -67,8 +68,27 @@ def get_subprocess_startupinfo():  # type: ignore [no-untyped-def]
 def replace_control_chars(untrusted_str: str) -> str:
    """Remove control characters from string. Protects a terminal emulator
-    from obcure control characters"""
+    from obscure control characters.
    Control characters are replaced by <EFBFBD> U+FFFD Replacement Character.
    """
    def is_safe(chr: str) -> bool:
        """Return whether Unicode character is safe to print in a terminal
        emulator, based on its General Category.
        The following General Category values are considered unsafe:
        * C* - all control character categories (Cc, Cf, Cs, Co, Cn)
        * Zl - U+2028 LINE SEPARATOR only
        * Zp - U+2029 PARAGRAPH SEPARATOR only
        """
        categ = unicodedata.category(chr)
        if categ.startswith("C") or categ in ("Zl", "Zp"):
            return False
        return True
    sanitized_str = ""
    for char in untrusted_str:
-        sanitized_str += char if char in string.printable else "_"
+        sanitized_str += char if is_safe(char) else "<EFBFBD>"
    return sanitized_str
--- a/tests/init.py
+++ b/tests/init.py
@ -114,8 +114,9 @@ def uncommon_text() -> str:
    * A Unicode control character that is not part of ASCII: zero-width joiner
      (U+200D)
    * An emoji: Cross Mark (U+274C)
    * A surrogate escape used to decode an invalid UTF-8 sequence 0xF0 (U+DCF0)
    """
-    return "\033[31;1;4m BaD TeΧt \u200d ❌ \033[0m"
+    return "\033[31;1;4m BaD TeΧt \u200d ❌ \udcf0 \033[0m"
@pytest.fixture
@ -136,5 +137,9 @@ def uncommon_filename(uncommon_text: str) -> str:
@pytest.fixture
 def sanitized_text() -> str:
-    """Return a sanitized version of the uncommon_text."""
+    """Return a sanitized version of the uncommon_text.
-    return "_[31;1;4m BaD Te_t _ _ _[0m"
+
    Take the uncommon text string and replace all the control/invalid characters with
    "<EFBFBD>". The rest of the characters (emojis and non-English leters) are retained as is.
    """
    return "<EFBFBD>[31;1;4m BaD TeΧt <20> ❌ <20> <20>[0m"