Relax the restrictions of util.replace_control_chars

The `util.replace_control_chars()` function was overly strict and replaced every non-ASCII character with "_". This affected not only control characters but also ordinary characters from non-English alphabets. Relax these restrictions by checking each character individually and deciding whether it is a Unicode control character, using Python's `unicodedata` module. With this change, emojis and non-English letters are now allowed.
2025-04-28 18:02:38 +02:00 · 2024-04-25 13:31:08 +03:00 · 2024-04-25 13:31:08 +03:00 · 52ced04507
commit 52ced04507
parent 2fa592eb69
2 changed files with 30 additions and 5 deletions
--- a/dangerzone/util.py
+++ b/dangerzone/util.py
@ -3,6 +3,7 @@ import platform
 import string
 import subprocess
 import sys
+import unicodedata
 from typing import Optional

 import appdirs
@ -67,8 +68,27 @@ def get_subprocess_startupinfo():  # type: ignore [no-untyped-def]

 def replace_control_chars(untrusted_str: str) -> str:
    """Remove control characters from string. Protects a terminal emulator
-    from obcure control characters"""
+    from obscure control characters.
+
+    Control characters are replaced by <EFBFBD> U+FFFD Replacement Character.
+    """
+
+    def is_safe(chr: str) -> bool:
+        """Return whether Unicode character is safe to print in a terminal
+        emulator, based on its General Category.
+
+        The following General Category values are considered unsafe:
+
+        * C* - all control character categories (Cc, Cf, Cs, Co, Cn)
+        * Zl - U+2028 LINE SEPARATOR only
+        * Zp - U+2029 PARAGRAPH SEPARATOR only
+        """
+        categ = unicodedata.category(chr)
+        if categ.startswith("C") or categ in ("Zl", "Zp"):
+            return False
+        return True
+
    sanitized_str = ""
    for char in untrusted_str:
-        sanitized_str += char if char in string.printable else "_"
+        sanitized_str += char if is_safe(char) else "<EFBFBD>"
    return sanitized_str
--- a/tests/init.py
+++ b/tests/init.py
@ -114,8 +114,9 @@ def uncommon_text() -> str:
    * A Unicode control character that is not part of ASCII: zero-width joiner
      (U+200D)
    * An emoji: Cross Mark (U+274C)
+    * A surrogate escape used to decode an invalid UTF-8 sequence 0xF0 (U+DCF0)
    """
-    return "\033[31;1;4m BaD TeΧt \u200d ❌ \033[0m"
+    return "\033[31;1;4m BaD TeΧt \u200d ❌ \udcf0 \033[0m"


@pytest.fixture
@ -136,5 +137,9 @@ def uncommon_filename(uncommon_text: str) -> str:

@pytest.fixture
 def sanitized_text() -> str:
-    """Return a sanitized version of the uncommon_text."""
-    return "_[31;1;4m BaD Te_t _ _ _[0m"
+    """Return a sanitized version of the uncommon_text.
+
+    Take the uncommon text string and replace all the control/invalid characters with
+    "<EFBFBD>". The rest of the characters (emojis and non-English leters) are retained as is.
+    """
+    return "<EFBFBD>[31;1;4m BaD TeΧt <20> ❌ <20> <20>[0m"