From e1a4e6ebf1568935a57ba8cec48e43dd7c1ebcd3 Mon Sep 17 00:00:00 2001
From: Damir Jelić <poljar@termina.org.uk>
Date: Tue, 18 Jun 2019 13:38:22 +0200
Subject: compat: Add a method to convert bytes to a string that handles
 unicode errors.

---
 python/olm/_compat.py | 23 +++++++++++++++++++++++
 1 file changed, 23 insertions(+)

(limited to 'python/olm')

diff --git a/python/olm/_compat.py b/python/olm/_compat.py
index 91e4d1b..d81bdb5 100644
--- a/python/olm/_compat.py
+++ b/python/olm/_compat.py
@@ -18,6 +18,8 @@
 from builtins import bytes, str
 from typing import AnyStr
 
+from future.utils import bytes_to_native_str, native_str
+
 try:
     import secrets
     URANDOM = secrets.token_bytes  # pragma: no cover
@@ -44,3 +46,24 @@ def to_bytes(string):
         return bytes(string, "utf-8")
 
     raise TypeError("Invalid type {}".format(type(string)))
+
+
+def to_native_str(byte_string, errors="replace"):
+    """Turn a byte string into a native string decoding it as UTF-8.
+
+    Args:
+        byte_string (bytes): The bytestring that will be converted to a native
+            string.
+        errors (str, optional): The error handling scheme that should be used
+            to handle unicode decode errors. Can be one of "strict" (raise a
+            UnicodeDecodeError exception), "ignore" (remove the offending
+            characters), "replace" (replace the offending character with
+            U+FFFD), as well as any other name registered with
+            codecs.register_error that can handle UnicodeDecodeErrors.
+
+    Returns the decoded native string.
+    """
+    try:
+        return native_str(byte_string, errors=errors)
+    except TypeError:
+        return bytes(byte_string).decode(errors=errors)
-- 
cgit v1.2.3-70-g09d2


From 2f5590bf38e5995a36f770c04cfbf31eb9070eca Mon Sep 17 00:00:00 2001
From: Damir Jelić <poljar@termina.org.uk>
Date: Tue, 18 Jun 2019 13:46:57 +0200
Subject: olm: Allow decryption functions to define how to handle unicode
 decode errors.

This patch changes the decryption functions so that they no longer fail
when a Unicode decode error occurs while converting the decrypted
plaintext bytes into a native Python string.

Characters that cannot be decoded as unicode are now replaced with the
unicode replacement character (U+FFFD).

The old behaviour of raising a UnicodeDecodeError can be achieved by
passing the "strict" error handling scheme to the decrypt function.
---
 python/olm/group_session.py        | 21 ++++++++++++++-------
 python/olm/pk.py                   | 15 +++++++++++----
 python/olm/sas.py                  |  4 ++--
 python/olm/session.py              | 21 +++++++++++++++------
 python/olm/utility.py              |  1 +
 python/tests/group_session_test.py | 17 +++++++++++++++++
 python/tests/pk_test.py            | 10 ++++++++++
 python/tests/session_test.py       | 11 +++++++++++
 8 files changed, 81 insertions(+), 19 deletions(-)

(limited to 'python/olm')

diff --git a/python/olm/group_session.py b/python/olm/group_session.py
index 737d9ef..88f87f0 100644
--- a/python/olm/group_session.py
+++ b/python/olm/group_session.py
@@ -33,7 +33,7 @@ from future.utils import bytes_to_native_str
 # pylint: disable=no-name-in-module
 from _libolm import ffi, lib  # type: ignore
 
-from ._compat import URANDOM, to_bytearray, to_bytes
+from ._compat import URANDOM, to_bytearray, to_bytes, to_native_str
 from ._finalize import track_for_finalization
 
 
@@ -176,8 +176,8 @@ class InboundGroupSession(object):
 
         raise OlmGroupSessionError(last_error)
 
-    def decrypt(self, ciphertext):
-        # type: (AnyStr) -> Tuple[str, int]
+    def decrypt(self, ciphertext, errors="replace"):
+        # type: (AnyStr, str) -> Tuple[str, int]
         """Decrypt a message
 
         Returns a tuple of the decrypted plain-text and the message index of
@@ -197,6 +197,13 @@ class InboundGroupSession(object):
         Args:
             ciphertext(str): Base64 encoded ciphertext containing the encrypted
                 message
+            unicode_errors(str, optional): The error handling scheme to use for
+                unicode decoding errors. The default is "replace" meaning that
+                the character that was unable to decode will be replaced with
+                the unicode replacement character (U+FFFD). Other possible
+                values are "strict" and "ignore", as well as any other name
+                registered with codecs.register_error that can handle
+                UnicodeDecodeErrors.
         """
         if not ciphertext:
             raise ValueError("Ciphertext can't be empty.")
@@ -223,10 +230,10 @@ class InboundGroupSession(object):
 
         self._check_error(plaintext_length)
 
-        plaintext = bytes_to_native_str(ffi.unpack(
-            plaintext_buffer,
-            plaintext_length
-        ))
+        plaintext = to_native_str(
+            ffi.unpack(plaintext_buffer, plaintext_length),
+            errors=errors
+        )
 
         # clear out copies of the plaintext
         lib.memset(plaintext_buffer, 0, max_plaintext_length)
diff --git a/python/olm/pk.py b/python/olm/pk.py
index 193aba5..158c78d 100644
--- a/python/olm/pk.py
+++ b/python/olm/pk.py
@@ -40,7 +40,7 @@ from future.utils import bytes_to_native_str
 
 from _libolm import ffi, lib  # type: ignore
 
-from ._compat import URANDOM, to_bytearray
+from ._compat import URANDOM, to_bytearray, to_native_str
 from ._finalize import track_for_finalization
 
 
@@ -313,8 +313,8 @@ class PkDecryption(object):
 
         return obj
 
-    def decrypt(self, message):
-        # type (PkMessage) -> str
+    def decrypt(self, message, errors="replace"):
+        # type (PkMessage, str) -> str
         """Decrypt a previously encrypted Pk message.
 
         Returns the decrypted plaintext.
@@ -322,6 +322,13 @@ class PkDecryption(object):
 
         Args:
             message(PkMessage): the pk message to decrypt.
+            unicode_errors(str, optional): The error handling scheme to use for
+                unicode decoding errors. The default is "replace" meaning that
+                the character that was unable to decode will be replaced with
+                the unicode replacement character (U+FFFD). Other possible
+                values are "strict" and "ignore", as well as any other name
+                registered with codecs.register_error that can handle
+                UnicodeDecodeErrors.
         """
         ephemeral_key = to_bytearray(message.ephemeral_key)
         ephemeral_key_size = len(ephemeral_key)
@@ -354,7 +361,7 @@ class PkDecryption(object):
         # clear out copies of the plaintext
         lib.memset(plaintext_buffer, 0, max_plaintext_length)
 
-        return bytes_to_native_str(plaintext)
+        return to_native_str(plaintext, errors=errors)
 
 
 def _clear_pk_signing(pk_struct):
diff --git a/python/olm/sas.py b/python/olm/sas.py
index c12b7bc..bea1dd0 100644
--- a/python/olm/sas.py
+++ b/python/olm/sas.py
@@ -30,15 +30,15 @@ Examples:
 
 """
 
-from functools import wraps
 from builtins import bytes
+from functools import wraps
 from typing import Optional
 
 from future.utils import bytes_to_native_str
 
 from _libolm import ffi, lib
 
-from ._compat import URANDOM, to_bytes, to_bytearray
+from ._compat import URANDOM, to_bytearray, to_bytes
 from ._finalize import track_for_finalization
 
 
diff --git a/python/olm/session.py b/python/olm/session.py
index cba9be0..cf66582 100644
--- a/python/olm/session.py
+++ b/python/olm/session.py
@@ -40,7 +40,7 @@ from future.utils import bytes_to_native_str
 # pylint: disable=no-name-in-module
 from _libolm import ffi, lib  # type: ignore
 
-from ._compat import URANDOM, to_bytearray, to_bytes
+from ._compat import URANDOM, to_bytearray, to_bytes, to_native_str
 from ._finalize import track_for_finalization
 
 # This is imported only for type checking purposes
@@ -273,8 +273,8 @@ class Session(object):
         else:  # pragma: no cover
             raise ValueError("Unknown message type")
 
-    def decrypt(self, message):
-        # type: (_OlmMessage) -> str
+    def decrypt(self, message, errors="replace"):
+        # type: (_OlmMessage, str) -> str
         """Decrypts a message using the session. Returns the plaintext string
         on success. Raises OlmSessionError on failure. If the base64 couldn't
         be decoded then the error message will be "INVALID_BASE64". If the
@@ -285,7 +285,14 @@ class Session(object):
 
         Args:
             message(OlmMessage): The Olm message that will be decrypted. It can
-            be either an OlmPreKeyMessage or an OlmMessage.
+                be either an OlmPreKeyMessage or an OlmMessage.
+            unicode_errors(str, optional): The error handling scheme to use for
+                unicode decoding errors. The default is "replace" meaning that
+                the character that was unable to decode will be replaced with
+                the unicode replacement character (U+FFFD). Other possible
+                values are "strict" and "ignore", as well as any other name
+                registered with codecs.register_error that can handle
+                UnicodeDecodeErrors.
         """
         if not message.ciphertext:
             raise ValueError("Ciphertext can't be empty")
@@ -311,8 +318,10 @@ class Session(object):
             plaintext_buffer, max_plaintext_length
         )
         self._check_error(plaintext_length)
-        plaintext = bytes_to_native_str(
-            ffi.unpack(plaintext_buffer, plaintext_length))
+        plaintext = to_native_str(
+            ffi.unpack(plaintext_buffer, plaintext_length),
+            errors=errors
+        )
 
         # clear out copies of the plaintext
         lib.memset(plaintext_buffer, 0, max_plaintext_length)
diff --git a/python/olm/utility.py b/python/olm/utility.py
index 10d5ab4..bddef38 100644
--- a/python/olm/utility.py
+++ b/python/olm/utility.py
@@ -32,6 +32,7 @@ Examples:
 
 # pylint: disable=redefined-builtin,unused-import
 from typing import AnyStr, Type
+
 from future.utils import bytes_to_native_str
 
 # pylint: disable=no-name-in-module
diff --git a/python/tests/group_session_test.py b/python/tests/group_session_test.py
index c17e84f..3942024 100644
--- a/python/tests/group_session_test.py
+++ b/python/tests/group_session_test.py
@@ -1,3 +1,7 @@
+# -*- coding: utf-8 -*-
+
+from builtins import bytes
+
 import pytest
 
 from olm import InboundGroupSession, OlmGroupSessionError, OutboundGroupSession
@@ -112,3 +116,16 @@ class TestClass(object):
         outbound = OutboundGroupSession()
         inbound = InboundGroupSession(outbound.session_key)
         del inbound
+
+    def test_invalid_unicode_decrypt(self):
+        outbound = OutboundGroupSession()
+        inbound = InboundGroupSession(outbound.session_key)
+
+        text = outbound.encrypt(bytes([0xed]))
+        plaintext, _ = inbound.decrypt(text)
+
+        print(plaintext)
+        assert plaintext == "�"
+
+        plaintext, _ = inbound.decrypt(text, "ignore")
+        assert plaintext == ""
diff --git a/python/tests/pk_test.py b/python/tests/pk_test.py
index fe3b4b6..749d2eb 100644
--- a/python/tests/pk_test.py
+++ b/python/tests/pk_test.py
@@ -1,3 +1,6 @@
+# -*- coding: utf-8 -*-
+from builtins import bytes
+
 import pytest
 
 from olm import (PkDecryption, PkDecryptionError, PkEncryption, PkSigning,
@@ -55,3 +58,10 @@ class TestClass(object):
         message = "This statement is true"
         signature = signing.sign(message)
         ed25519_verify(signing.public_key, message, signature)
+
+    def test_invalid_unicode_decrypt(self):
+        decryption = PkDecryption()
+        encryption = PkEncryption(decryption.public_key)
+        message = encryption.encrypt(bytes([0xed]))
+        plaintext = decryption.decrypt(message)
+        assert plaintext == "�"
diff --git a/python/tests/session_test.py b/python/tests/session_test.py
index ab1c38b..56a6b83 100644
--- a/python/tests/session_test.py
+++ b/python/tests/session_test.py
@@ -1,3 +1,6 @@
+# -*- coding: utf-8 -*-
+from builtins import bytes
+
 import pytest
 
 from olm import (Account, InboundSession, OlmMessage, OlmPreKeyMessage,
@@ -141,3 +144,11 @@ class TestClass(object):
 
         new_message = new_session.encrypt(plaintext)
         assert bob_session.matches(new_message) is False
+
+    def test_invalid_unicode_decrypt(self):
+        alice, bob, session = self._create_session()
+        message = session.encrypt(bytes([0xed]))
+
+        bob_session = InboundSession(bob, message)
+        plaintext = bob_session.decrypt(message)
+        assert plaintext == "�"
-- 
cgit v1.2.3-70-g09d2


From ba65551d5f9985d947e768787ae05664514ce1e2 Mon Sep 17 00:00:00 2001
From: Damir Jelić <poljar@termina.org.uk>
Date: Wed, 19 Jun 2019 14:42:58 +0200
Subject: _compat: Remove unused import.

---
 python/olm/_compat.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'python/olm')

diff --git a/python/olm/_compat.py b/python/olm/_compat.py
index d81bdb5..762371b 100644
--- a/python/olm/_compat.py
+++ b/python/olm/_compat.py
@@ -18,7 +18,7 @@
 from builtins import bytes, str
 from typing import AnyStr
 
-from future.utils import bytes_to_native_str, native_str
+from future.utils import native_str
 
 try:
     import secrets
-- 
cgit v1.2.3-70-g09d2


From 5e24c605d2926e23273089058741fe69e1b3030a Mon Sep 17 00:00:00 2001
From: Damir Jelić <poljar@termina.org.uk>
Date: Wed, 19 Jun 2019 14:45:20 +0200
Subject: _compat: Change the to_native_str into a to_unicode_str function.

The to_native_str function was supposed to produce Unicode decoded
native strings for python2 and python3.

Upon further consideration this doesn't make much sense since under
python2 it would need to decode the bytes into a Unicode string and turn
it back into a python2 str.

The ability to use the replacement character requires us to use a
Unicode string under python2 as well.
---
 python/olm/_compat.py       | 12 ++++++------
 python/olm/group_session.py |  4 ++--
 python/olm/pk.py            |  4 ++--
 python/olm/session.py       |  4 ++--
 4 files changed, 12 insertions(+), 12 deletions(-)

(limited to 'python/olm')

diff --git a/python/olm/_compat.py b/python/olm/_compat.py
index 762371b..e1c0d63 100644
--- a/python/olm/_compat.py
+++ b/python/olm/_compat.py
@@ -48,8 +48,11 @@ def to_bytes(string):
     raise TypeError("Invalid type {}".format(type(string)))
 
 
-def to_native_str(byte_string, errors="replace"):
-    """Turn a byte string into a native string decoding it as UTF-8.
+def to_unicode_str(byte_string, errors="replace"):
+    """Turn a byte string into a unicode string.
+
+    Should be used everywhere where the input byte string might not be trusted
+    and may contain invalid unicode values.
 
     Args:
         byte_string (bytes): The bytestring that will be converted to a native
@@ -63,7 +66,4 @@ def to_native_str(byte_string, errors="replace"):
 
     Returns the decoded native string.
     """
-    try:
-        return native_str(byte_string, errors=errors)
-    except TypeError:
-        return bytes(byte_string).decode(errors=errors)
+    return byte_string.decode(errors=errors)
diff --git a/python/olm/group_session.py b/python/olm/group_session.py
index 88f87f0..313e5fa 100644
--- a/python/olm/group_session.py
+++ b/python/olm/group_session.py
@@ -33,7 +33,7 @@ from future.utils import bytes_to_native_str
 # pylint: disable=no-name-in-module
 from _libolm import ffi, lib  # type: ignore
 
-from ._compat import URANDOM, to_bytearray, to_bytes, to_native_str
+from ._compat import URANDOM, to_bytearray, to_bytes, to_unicode_str
 from ._finalize import track_for_finalization
 
 
@@ -230,7 +230,7 @@ class InboundGroupSession(object):
 
         self._check_error(plaintext_length)
 
-        plaintext = to_native_str(
+        plaintext = to_unicode_str(
             ffi.unpack(plaintext_buffer, plaintext_length),
             errors=errors
         )
diff --git a/python/olm/pk.py b/python/olm/pk.py
index 158c78d..18608b7 100644
--- a/python/olm/pk.py
+++ b/python/olm/pk.py
@@ -40,7 +40,7 @@ from future.utils import bytes_to_native_str
 
 from _libolm import ffi, lib  # type: ignore
 
-from ._compat import URANDOM, to_bytearray, to_native_str
+from ._compat import URANDOM, to_bytearray, to_unicode_str
 from ._finalize import track_for_finalization
 
 
@@ -361,7 +361,7 @@ class PkDecryption(object):
         # clear out copies of the plaintext
         lib.memset(plaintext_buffer, 0, max_plaintext_length)
 
-        return to_native_str(plaintext, errors=errors)
+        return to_unicode_str(plaintext, errors=errors)
 
 
 def _clear_pk_signing(pk_struct):
diff --git a/python/olm/session.py b/python/olm/session.py
index cf66582..f81b727 100644
--- a/python/olm/session.py
+++ b/python/olm/session.py
@@ -40,7 +40,7 @@ from future.utils import bytes_to_native_str
 # pylint: disable=no-name-in-module
 from _libolm import ffi, lib  # type: ignore
 
-from ._compat import URANDOM, to_bytearray, to_bytes, to_native_str
+from ._compat import URANDOM, to_bytearray, to_bytes, to_unicode_str
 from ._finalize import track_for_finalization
 
 # This is imported only for type checking purposes
@@ -318,7 +318,7 @@ class Session(object):
             plaintext_buffer, max_plaintext_length
         )
         self._check_error(plaintext_length)
-        plaintext = to_native_str(
+        plaintext = to_unicode_str(
             ffi.unpack(plaintext_buffer, plaintext_length),
             errors=errors
         )
-- 
cgit v1.2.3-70-g09d2


From fec41f9540665345418b9aa1184cb1a5fc7ed0ce Mon Sep 17 00:00:00 2001
From: Damir Jelić <poljar@termina.org.uk>
Date: Wed, 19 Jun 2019 15:07:14 +0200
Subject: _compat: Remove the now unused native_str.

---
 python/olm/_compat.py | 2 --
 1 file changed, 2 deletions(-)

(limited to 'python/olm')

diff --git a/python/olm/_compat.py b/python/olm/_compat.py
index e1c0d63..29a80d4 100644
--- a/python/olm/_compat.py
+++ b/python/olm/_compat.py
@@ -18,8 +18,6 @@
 from builtins import bytes, str
 from typing import AnyStr
 
-from future.utils import native_str
-
 try:
     import secrets
     URANDOM = secrets.token_bytes  # pragma: no cover
-- 
cgit v1.2.3-70-g09d2


From 7538a1eccf99106712a17cc85adacdf27c4a8e8a Mon Sep 17 00:00:00 2001
From: Damir Jelić <poljar@termina.org.uk>
Date: Thu, 20 Jun 2019 12:16:37 +0200
Subject: olm: Rename the errors function argument in the decryption functions.

---
 python/olm/group_session.py | 4 ++--
 python/olm/pk.py            | 4 ++--
 python/olm/session.py       | 4 ++--
 3 files changed, 6 insertions(+), 6 deletions(-)

(limited to 'python/olm')

diff --git a/python/olm/group_session.py b/python/olm/group_session.py
index 313e5fa..5068192 100644
--- a/python/olm/group_session.py
+++ b/python/olm/group_session.py
@@ -176,7 +176,7 @@ class InboundGroupSession(object):
 
         raise OlmGroupSessionError(last_error)
 
-    def decrypt(self, ciphertext, errors="replace"):
+    def decrypt(self, ciphertext, unicode_errors="replace"):
         # type: (AnyStr, str) -> Tuple[str, int]
         """Decrypt a message
 
@@ -232,7 +232,7 @@ class InboundGroupSession(object):
 
         plaintext = to_unicode_str(
             ffi.unpack(plaintext_buffer, plaintext_length),
-            errors=errors
+            errors=unicode_errors
         )
 
         # clear out copies of the plaintext
diff --git a/python/olm/pk.py b/python/olm/pk.py
index 18608b7..4352359 100644
--- a/python/olm/pk.py
+++ b/python/olm/pk.py
@@ -313,7 +313,7 @@ class PkDecryption(object):
 
         return obj
 
-    def decrypt(self, message, errors="replace"):
+    def decrypt(self, message, unicode_errors="replace"):
         # type (PkMessage, str) -> str
         """Decrypt a previously encrypted Pk message.
 
@@ -361,7 +361,7 @@ class PkDecryption(object):
         # clear out copies of the plaintext
         lib.memset(plaintext_buffer, 0, max_plaintext_length)
 
-        return to_unicode_str(plaintext, errors=errors)
+        return to_unicode_str(plaintext, errors=unicode_errors)
 
 
 def _clear_pk_signing(pk_struct):
diff --git a/python/olm/session.py b/python/olm/session.py
index f81b727..636eb3d 100644
--- a/python/olm/session.py
+++ b/python/olm/session.py
@@ -273,7 +273,7 @@ class Session(object):
         else:  # pragma: no cover
             raise ValueError("Unknown message type")
 
-    def decrypt(self, message, errors="replace"):
+    def decrypt(self, message, unicode_errors="replace"):
         # type: (_OlmMessage, str) -> str
         """Decrypts a message using the session. Returns the plaintext string
         on success. Raises OlmSessionError on failure. If the base64 couldn't
@@ -320,7 +320,7 @@ class Session(object):
         self._check_error(plaintext_length)
         plaintext = to_unicode_str(
             ffi.unpack(plaintext_buffer, plaintext_length),
-            errors=errors
+            errors=unicode_errors
         )
 
         # clear out copies of the plaintext
-- 
cgit v1.2.3-70-g09d2


From c4d703ac3dc6f7d4962d26d252d1ac2afbec0015 Mon Sep 17 00:00:00 2001
From: Damir Jelić <poljar@termina.org.uk>
Date: Thu, 20 Jun 2019 12:24:08 +0200
Subject: _compat: Make the encoding argument explicit in to_unicode_str().

---
 python/olm/_compat.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'python/olm')

diff --git a/python/olm/_compat.py b/python/olm/_compat.py
index 29a80d4..2ceaa33 100644
--- a/python/olm/_compat.py
+++ b/python/olm/_compat.py
@@ -64,4 +64,4 @@ def to_unicode_str(byte_string, errors="replace"):
 
     Returns the decoded native string.
     """
-    return byte_string.decode(errors=errors)
+    return byte_string.decode(encoding="utf-8", errors=errors)
-- 
cgit v1.2.3-70-g09d2