Relax the restrictions of util.replace_control_chars

The `util.replace_control_chars()` function was overly strict and
would replace every non-ASCII character with "_". This affected both
control characters and ordinary characters from non-English
alphabets.

Relax these restrictions by checking each character and deciding if it's
a Unicode control character, using the `unicodedata` Python package.
With this change, emojis and non-English letters are now allowed.
This commit is contained in:
Naglis Jonaitis 2024-04-25 13:31:08 +03:00 committed by Alex Pyrgiotis
parent 2fa592eb69
commit 52ced04507
No known key found for this signature in database
GPG key ID: B6C15EBA0357C9AA
2 changed files with 30 additions and 5 deletions

View file

@ -3,6 +3,7 @@ import platform
import string
import subprocess
import sys
import unicodedata
from typing import Optional
import appdirs
@ -67,8 +68,27 @@ def get_subprocess_startupinfo(): # type: ignore [no-untyped-def]
def replace_control_chars(untrusted_str: str) -> str:
    """Remove control characters from string.

    Protects a terminal emulator from obscure control characters. Each
    character that is considered unsafe is replaced by U+FFFD REPLACEMENT
    CHARACTER; every other character (including emojis and non-English
    letters) is kept as is.

    Note that ASCII control characters such as newline and tab belong to
    the "Cc" category, so they are replaced as well.

    Args:
        untrusted_str: The string to sanitize before printing it.

    Returns:
        A string with one character per input character, where every unsafe
        character has been substituted with U+FFFD.
    """

    def is_safe(char: str) -> bool:
        """Return whether Unicode character is safe to print in a terminal
        emulator, based on its General Category.

        The following General Category values are considered unsafe:

        * C* - all control character categories (Cc, Cf, Cs, Co, Cn)
        * Zl - U+2028 LINE SEPARATOR only
        * Zp - U+2029 PARAGRAPH SEPARATOR only
        """
        categ = unicodedata.category(char)
        return not (categ.startswith("C") or categ in ("Zl", "Zp"))

    # The replacement is written as an escape so that the literal cannot be
    # mangled by editors or tools that mishandle U+FFFD itself.
    return "".join(
        char if is_safe(char) else "\ufffd" for char in untrusted_str
    )

View file

@ -114,8 +114,9 @@ def uncommon_text() -> str:
* A Unicode control character that is not part of ASCII: zero-width joiner
(U+200D)
* An emoji: Cross Mark (U+274C)
* A surrogate escape used to decode an invalid UTF-8 sequence 0xF0 (U+DCF0)
"""
return "\033[31;1;4m BaD TeΧt \u200d\033[0m"
return "\033[31;1;4m BaD TeΧt \u200d\udcf0 \033[0m"
@pytest.fixture
@ -136,5 +137,9 @@ def uncommon_filename(uncommon_text: str) -> str:
@pytest.fixture
def sanitized_text() -> str:
    """Return a sanitized version of the uncommon_text.

    Take the uncommon text string and replace all the control/invalid
    characters with U+FFFD REPLACEMENT CHARACTER. The rest of the characters
    (emojis and non-English letters) are retained as is.
    """
    # Spelled with \ufffd escapes so the expected value survives tools that
    # mangle a literal replacement character.
    return "\ufffd[31;1;4m BaD TeΧt \ufffd\ufffd \ufffd[0m"