mirror of
https://github.com/freedomofpress/dangerzone.git
synced 2025-04-28 18:02:38 +02:00
Relax the restrictions of util.replace_control_chars
The `util.replace_control_chars()` function was overly strict, and would replace every non-ASCII character with "_". This included both control characters and normal characters in non-English alphabets. Relax these restrictions by checking each character and deciding if it's a Unicode control character, using the `unicodedata` Python package. With this change, emojis and non-English letters are now allowed.
This commit is contained in:
parent
2fa592eb69
commit
52ced04507
2 changed files with 30 additions and 5 deletions
|
@ -3,6 +3,7 @@ import platform
|
||||||
import string
|
import string
|
||||||
import subprocess
|
import subprocess
|
||||||
import sys
|
import sys
|
||||||
|
import unicodedata
|
||||||
from typing import Optional
|
from typing import Optional
|
||||||
|
|
||||||
import appdirs
|
import appdirs
|
||||||
|
@ -67,8 +68,27 @@ def get_subprocess_startupinfo(): # type: ignore [no-untyped-def]
|
||||||
|
|
||||||
def replace_control_chars(untrusted_str: str) -> str:
|
def replace_control_chars(untrusted_str: str) -> str:
|
||||||
"""Remove control characters from string. Protects a terminal emulator
|
"""Remove control characters from string. Protects a terminal emulator
|
||||||
from obcure control characters"""
|
from obscure control characters.
|
||||||
|
|
||||||
|
Control characters are replaced by <EFBFBD> U+FFFD Replacement Character.
|
||||||
|
"""
|
||||||
|
|
||||||
|
def is_safe(chr: str) -> bool:
|
||||||
|
"""Return whether Unicode character is safe to print in a terminal
|
||||||
|
emulator, based on its General Category.
|
||||||
|
|
||||||
|
The following General Category values are considered unsafe:
|
||||||
|
|
||||||
|
* C* - all control character categories (Cc, Cf, Cs, Co, Cn)
|
||||||
|
* Zl - U+2028 LINE SEPARATOR only
|
||||||
|
* Zp - U+2029 PARAGRAPH SEPARATOR only
|
||||||
|
"""
|
||||||
|
categ = unicodedata.category(chr)
|
||||||
|
if categ.startswith("C") or categ in ("Zl", "Zp"):
|
||||||
|
return False
|
||||||
|
return True
|
||||||
|
|
||||||
sanitized_str = ""
|
sanitized_str = ""
|
||||||
for char in untrusted_str:
|
for char in untrusted_str:
|
||||||
sanitized_str += char if char in string.printable else "_"
|
sanitized_str += char if is_safe(char) else "<EFBFBD>"
|
||||||
return sanitized_str
|
return sanitized_str
|
||||||
|
|
|
@ -114,8 +114,9 @@ def uncommon_text() -> str:
|
||||||
* A Unicode control character that is not part of ASCII: zero-width joiner
|
* A Unicode control character that is not part of ASCII: zero-width joiner
|
||||||
(U+200D)
|
(U+200D)
|
||||||
* An emoji: Cross Mark (U+274C)
|
* An emoji: Cross Mark (U+274C)
|
||||||
|
* A surrogate escape used to decode an invalid UTF-8 sequence 0xF0 (U+DCF0)
|
||||||
"""
|
"""
|
||||||
return "\033[31;1;4m BaD TeΧt \u200d ❌ \033[0m"
|
return "\033[31;1;4m BaD TeΧt \u200d ❌ \udcf0 \033[0m"
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture
|
@pytest.fixture
|
||||||
|
@ -136,5 +137,9 @@ def uncommon_filename(uncommon_text: str) -> str:
|
||||||
|
|
||||||
@pytest.fixture
|
@pytest.fixture
|
||||||
def sanitized_text() -> str:
|
def sanitized_text() -> str:
|
||||||
"""Return a sanitized version of the uncommon_text."""
|
"""Return a sanitized version of the uncommon_text.
|
||||||
return "_[31;1;4m BaD Te_t _ _ _[0m"
|
|
||||||
|
Take the uncommon text string and replace all the control/invalid characters with
|
||||||
|
"<EFBFBD>". The rest of the characters (emojis and non-English leters) are retained as is.
|
||||||
|
"""
|
||||||
|
return "<EFBFBD>[31;1;4m BaD TeΧt <20> ❌ <20> <20>[0m"
|
||||||
|
|
Loading…
Reference in a new issue