From e1a4e6ebf1568935a57ba8cec48e43dd7c1ebcd3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Damir=20Jeli=C4=87?= Date: Tue, 18 Jun 2019 13:38:22 +0200 Subject: compat: Add a method to convert bytes to a string that handles unicode errors. --- python/olm/_compat.py | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) (limited to 'python/olm/_compat.py') diff --git a/python/olm/_compat.py b/python/olm/_compat.py index 91e4d1b..d81bdb5 100644 --- a/python/olm/_compat.py +++ b/python/olm/_compat.py @@ -18,6 +18,8 @@ from builtins import bytes, str from typing import AnyStr +from future.utils import bytes_to_native_str, native_str + try: import secrets URANDOM = secrets.token_bytes # pragma: no cover @@ -44,3 +46,24 @@ def to_bytes(string): return bytes(string, "utf-8") raise TypeError("Invalid type {}".format(type(string))) + + +def to_native_str(byte_string, errors="replace"): + """Turn a byte string into a native string decoding it as UTF-8. + + Args: + byte_string (bytes): The bytestring that will be converted to a native + string. + errors (str, optional): The error handling scheme that should be used + to handle unicode decode errors. Can be one of "strict" (raise a + UnicodeDecodeError exception), "ignore" (remove the offending + characters), "replace" (replace the offending character with + U+FFFD), or any other name registered + with codecs.register_error that can handle UnicodeDecodeErrors. + + Returns the decoded native string. + """ + try: + return native_str(byte_string, errors=errors) + except TypeError: + return bytes(byte_string).decode(errors=errors) -- cgit v1.2.3 From ba65551d5f9985d947e768787ae05664514ce1e2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Damir=20Jeli=C4=87?= Date: Wed, 19 Jun 2019 14:42:58 +0200 Subject: _compat: Remove unused import. 
--- python/olm/_compat.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'python/olm/_compat.py') diff --git a/python/olm/_compat.py b/python/olm/_compat.py index d81bdb5..762371b 100644 --- a/python/olm/_compat.py +++ b/python/olm/_compat.py @@ -18,7 +18,7 @@ from builtins import bytes, str from typing import AnyStr -from future.utils import bytes_to_native_str, native_str +from future.utils import native_str try: import secrets -- cgit v1.2.3 From 5e24c605d2926e23273089058741fe69e1b3030a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Damir=20Jeli=C4=87?= Date: Wed, 19 Jun 2019 14:45:20 +0200 Subject: _compat: Change the to_native_str into a to_unicode_str function. The to_native_str function was supposed to produce Unicode decoded native strings for python2 and python3. Upon further consideration this doesn't make much sense since under python2 it would need to decode the bytes into a Unicode string and turn it back into a python2 str. The ability to use the replacement character requires us to use a Unicode string under python2 as well. --- python/olm/_compat.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'python/olm/_compat.py') diff --git a/python/olm/_compat.py b/python/olm/_compat.py index 762371b..e1c0d63 100644 --- a/python/olm/_compat.py +++ b/python/olm/_compat.py @@ -48,8 +48,11 @@ def to_bytes(string): raise TypeError("Invalid type {}".format(type(string))) -def to_native_str(byte_string, errors="replace"): - """Turn a byte string into a native string decoding it as UTF-8. +def to_unicode_str(byte_string, errors="replace"): + """Turn a byte string into a unicode string. + + Should be used everywhere where the input byte string might not be trusted + and may contain invalid unicode values. Args: byte_string (bytes): The bytestring that will be converted to a native @@ -63,7 +66,4 @@ def to_native_str(byte_string, errors="replace"): Returns the decoded native string. 
""" - try: - return native_str(byte_string, errors=errors) - except TypeError: - return bytes(byte_string).decode(errors=errors) + return byte_string.decode(errors=errors) -- cgit v1.2.3 From fec41f9540665345418b9aa1184cb1a5fc7ed0ce Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Damir=20Jeli=C4=87?= Date: Wed, 19 Jun 2019 15:07:14 +0200 Subject: _compat: Remove the now unused native_str. --- python/olm/_compat.py | 2 -- 1 file changed, 2 deletions(-) (limited to 'python/olm/_compat.py') diff --git a/python/olm/_compat.py b/python/olm/_compat.py index e1c0d63..29a80d4 100644 --- a/python/olm/_compat.py +++ b/python/olm/_compat.py @@ -18,8 +18,6 @@ from builtins import bytes, str from typing import AnyStr -from future.utils import native_str - try: import secrets URANDOM = secrets.token_bytes # pragma: no cover -- cgit v1.2.3 From c4d703ac3dc6f7d4962d26d252d1ac2afbec0015 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Damir=20Jeli=C4=87?= Date: Thu, 20 Jun 2019 12:24:08 +0200 Subject: _compat: Make the encoding argument explicit in to_unicode_str(). --- python/olm/_compat.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'python/olm/_compat.py') diff --git a/python/olm/_compat.py b/python/olm/_compat.py index 29a80d4..2ceaa33 100644 --- a/python/olm/_compat.py +++ b/python/olm/_compat.py @@ -64,4 +64,4 @@ def to_unicode_str(byte_string, errors="replace"): Returns the decoded native string. """ - return byte_string.decode(errors=errors) + return byte_string.decode(encoding="utf-8", errors=errors) -- cgit v1.2.3