mirror of
https://github.com/freedomofpress/dangerzone.git
synced 2025-04-28 18:02:38 +02:00
Relax the restrictions of util.replace_control_chars
The `util.replace_control_chars()` function was overly strict, and would replace every character outside the printable ASCII set with "_". This included not only control characters, but also normal characters from non-English alphabets. Relax these restrictions by checking each character and deciding whether it is a Unicode control character, using Python's `unicodedata` module. With this change, emojis and non-English letters are now allowed.
This commit is contained in:
parent
2fa592eb69
commit
52ced04507
2 changed files with 30 additions and 5 deletions
|
@ -3,6 +3,7 @@ import platform
|
|||
import string
|
||||
import subprocess
|
||||
import sys
|
||||
import unicodedata
|
||||
from typing import Optional
|
||||
|
||||
import appdirs
|
||||
|
@ -67,8 +68,27 @@ def get_subprocess_startupinfo(): # type: ignore [no-untyped-def]
|
|||
|
||||
def replace_control_chars(untrusted_str: str) -> str:
    """Replace control characters in a string with U+FFFD.

    Protects a terminal emulator from obscure control characters that may be
    embedded in untrusted text (e.g. ANSI escape sequences).

    Every character deemed unsafe is replaced by U+FFFD REPLACEMENT
    CHARACTER. All other characters, including emojis and letters of
    non-English alphabets, are kept as is. Note that newlines and tabs are
    control characters (category Cc) and are therefore replaced as well.

    Args:
        untrusted_str: The string to sanitize. May be empty.

    Returns:
        A string of the same length as the input, with each unsafe character
        substituted by U+FFFD.
    """

    def is_safe(char: str) -> bool:
        """Return whether a Unicode character is safe to print in a terminal
        emulator, based on its General Category.

        The following General Category values are considered unsafe:

        * C* - all "Other" categories (Cc, Cf, Cs, Co, Cn), i.e. control,
          format, surrogate, private-use and unassigned code points
        * Zl - U+2028 LINE SEPARATOR only
        * Zp - U+2029 PARAGRAPH SEPARATOR only
        """
        categ = unicodedata.category(char)
        return not (categ.startswith("C") or categ in ("Zl", "Zp"))

    # Spell the replacement character as an escape so that the source file
    # survives tools that mangle non-ASCII bytes.
    return "".join(
        char if is_safe(char) else "\ufffd" for char in untrusted_str
    )
|
||||
|
|
|
@ -114,8 +114,9 @@ def uncommon_text() -> str:
|
|||
* A Unicode control character that is not part of ASCII: zero-width joiner
|
||||
(U+200D)
|
||||
* An emoji: Cross Mark (U+274C)
|
||||
* A surrogate escape used to decode an invalid UTF-8 sequence 0xF0 (U+DCF0)
|
||||
"""
|
||||
return "\033[31;1;4m BaD TeΧt \u200d ❌ \033[0m"
|
||||
return "\033[31;1;4m BaD TeΧt \u200d ❌ \udcf0 \033[0m"
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
|
@ -136,5 +137,9 @@ def uncommon_filename(uncommon_text: str) -> str:
|
|||
|
||||
@pytest.fixture
|
||||
def sanitized_text() -> str:
    """Return a sanitized version of the uncommon_text.

    Take the uncommon text string and replace all the control/invalid
    characters with U+FFFD REPLACEMENT CHARACTER. The rest of the characters
    (emojis and non-English letters) are retained as is.
    """
    # U+FFFD is written as an escape so the expectation stays intact even if
    # the file passes through a tool that mangles non-ASCII bytes.
    return "\ufffd[31;1;4m BaD TeΧt \ufffd ❌ \ufffd \ufffd[0m"
|
||||
|
|
Loading…
Reference in a new issue