Skip to content

Commit 48a82a8

Browse files
committed
fix(cmd): improve character encoding detection for sub-commands
1 parent 3123fec commit 48a82a8

File tree

4 files changed

+77
-3
lines changed

4 files changed

+77
-3
lines changed

‎commitizen/cmd.py

+17-2
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,8 @@
33

44
from charset_normalizer import from_bytes
55

6+
from commitizen.exceptions import CharacterSetDecodeError
7+
68

79
class Command(NamedTuple):
810
out: str
@@ -12,6 +14,19 @@ class Command(NamedTuple):
1214
return_code: int
1315

1416

17+
def _try_decode(bytes_: bytes) -> str:
    """Decode raw bytes to text, trying UTF-8 before guessing an encoding.

    UTF-8 is attempted first. Only when that fails is ``charset_normalizer``
    asked for its best match, whose encoding is then used for a second
    decoding attempt.

    Args:
        bytes_: The raw bytes to decode.

    Returns:
        The decoded string.

    Raises:
        CharacterSetDecodeError: If no encoding could be guessed, or if
            decoding with the guessed encoding fails as well.
    """
    try:
        return bytes_.decode("utf-8")
    except UnicodeDecodeError:
        # Not valid UTF-8: fall back to charset detection.
        best_guess = from_bytes(bytes_).best()
        if best_guess is None:
            raise CharacterSetDecodeError()
        try:
            text = bytes_.decode(best_guess.encoding)
        except UnicodeDecodeError as decode_error:
            raise CharacterSetDecodeError() from decode_error
        else:
            return text
28+
29+
1530
def run(cmd: str) -> Command:
1631
process = subprocess.Popen(
1732
cmd,
@@ -23,8 +38,8 @@ def run(cmd: str) -> Command:
2338
stdout, stderr = process.communicate()
2439
return_code = process.returncode
2540
return Command(
26-
str(from_bytes(stdout).best()),
27-
str(from_bytes(stderr).best()),
41+
_try_decode(stdout),
42+
_try_decode(stderr),
2843
stdout,
2944
stderr,
3045
return_code,

‎commitizen/exceptions.py

+5
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@ class ExitCode(enum.IntEnum):
2626
INVALID_CONFIGURATION = 19
2727
NOT_ALLOWED = 20
2828
NO_INCREMENT = 21
29+
UNRECOGNIZED_CHARACTERSET_ENCODING = 22
2930

3031

3132
class CommitizenException(Exception):
@@ -148,3 +149,7 @@ class InvalidConfigurationError(CommitizenException):
148149

149150
class NotAllowed(CommitizenException):
    """Error mapped to ``ExitCode.NOT_ALLOWED``.

    NOTE(review): docs/exit_codes.md describes this as "`--incremental`
    cannot be combined with a `rev_range`" -- confirm against the raisers.
    """

    exit_code = ExitCode.NOT_ALLOWED
152+
153+
154+
class CharacterSetDecodeError(CommitizenException):
    """Raised when the bytes of a command's output cannot be decoded to text.

    Maps to ``ExitCode.UNRECOGNIZED_CHARACTERSET_ENCODING``.
    """

    exit_code = ExitCode.UNRECOGNIZED_CHARACTERSET_ENCODING

‎docs/exit_codes.md

+2-1
Original file line numberDiff line numberDiff line change
@@ -28,4 +28,5 @@ These exit codes can be found in `commitizen/exceptions.py::ExitCode`.
2828
| InvalidCommandArgumentError | 18 | The argument provided to the command is invalid (e.g. `cz check --commit-msg-file filename --rev-range master..`) |
2929
| InvalidConfigurationError | 19 | An error was found in the Commitizen Configuration, such as duplicates in `change_type_order` |
3030
| NotAllowed | 20 | `--incremental` cannot be combined with a `rev_range` |
31-
| NoneIncrementExit | 21 | The commits found are not elegible to be bumped |
31+
| NoneIncrementExit | 21 | The commits found are not eligible to be bumped |
32+
| CharacterSetDecodeError | 22 | The character encoding of the command output could not be determined |

‎tests/test_cmd.py

+53
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
import pytest
2+
3+
from commitizen import cmd
4+
from commitizen.exceptions import CharacterSetDecodeError
5+
6+
7+
# https://docs.python.org/3/howto/unicode.html
8+
def test_valid_utf8_encoded_strings():
    """Each UTF-8 encoded sample must decode back to the original string."""
    samples = ("", "ascii", "🤦🏻‍♂️", "﷽", "\u0000")
    for text in samples:
        decoded = cmd._try_decode(text.encode("utf-8"))
        assert text == decoded
17+
18+
19+
# A word of caution: just because an encoding can be guessed for a given
20+
# sequence of bytes and because that guessed encoding may yield a decoded
21+
# string, does not mean that that string was the original! For more, see:
22+
# https://docs.python.org/3/library/codecs.html#standard-encodings
23+
24+
25+
# Pick a random, non-utf8 encoding to test.
26+
def test_valid_cp1250_encoded_strings():
    """cp1250-encoded samples must decode to a string without raising.

    The guessed encoding is not guaranteed to be cp1250, so the decoded text
    may differ from the original; only the result type is checked. The
    previous ``assert <result> or True`` could never fail, so it verified
    nothing beyond "no exception was raised".
    """
    valid_strings = (
        "",
        "ascii",
        "äöüß",
        "ça va",
        "jak se máte",
    )
    for s in valid_strings:
        decoded = cmd._try_decode(s.encode("cp1250"))
        # An empty result is legitimate (e.g. for ""), so test the type
        # rather than truthiness.
        assert isinstance(decoded, str)
36+
37+
38+
def test_invalid_bytes():
    """Byte sequences expected to be undecodable must raise the domain error."""
    undecodable_payloads = (b"\x73\xe2\x9d\xff\x00",)
    for payload in undecodable_payloads:
        with pytest.raises(CharacterSetDecodeError):
            cmd._try_decode(payload)
43+
44+
45+
def test_always_fail_decode():
    """A bytes object whose ``decode`` always fails must raise the domain error."""

    class _AlwaysFailingBytes(bytes):
        # Every decoding attempt fails, whichever encoding is requested.
        def decode(self, encoding="utf-8", errors="strict"):
            reason = "Failing intentionally for testing"
            raise UnicodeDecodeError(encoding, self, 0, 0, reason)

    payload = _AlwaysFailingBytes()
    with pytest.raises(CharacterSetDecodeError):
        cmd._try_decode(payload)

0 commit comments

Comments
 (0)