getsentry · untitaker · Nov 11, 2019 · Nov 11, 2019 · Nov 11, 2019
@@ -36,11 +36,17 @@
     # Importing ABCs from collections is deprecated, and will stop working in 3.8
     # https://github.com/python/cpython/blob/master/Lib/collections/__init__.py#L49
     from collections import Mapping, Sequence
+
+    serializable_str_types = string_types
+
 else:
     # New in 3.3
     # https://docs.python.org/3/library/collections.abc.html
     from collections.abc import Mapping, Sequence
 
+    # Bytes are technically not strings in Python 3, but we can serialize them
+    serializable_str_types = (str, bytes)
+
 MAX_DATABAG_DEPTH = 5
 MAX_DATABAG_BREADTH = 10
 CYCLE_MARKER = u"<cyclic>"
@@ -285,7 +291,7 @@ def _serialize_node_impl(
 
             return rv_dict
 
-        elif not isinstance(obj, string_types) and isinstance(obj, Sequence):
+        elif not isinstance(obj, serializable_str_types) and isinstance(obj, Sequence):
             rv_list = []
 
             for i, v in enumerate(obj):

@@ -348,32 +348,42 @@ def safe_str(value):
         return safe_repr(value)
 
 
-def safe_repr(value):
-    # type: (Any) -> str
-    try:
-        rv = repr(value)
-        if isinstance(rv, bytes):
-            rv = rv.decode("utf-8", "replace")
-
-        # At this point `rv` contains a bunch of literal escape codes, like
-        # this (exaggerated example):
-        #
-        # u"\\x2f"
-        #
-        # But we want to show this string as:
-        #
-        # u"/"
+if PY2:
+
+    def safe_repr(value):
+        # type: (Any) -> str
         try:
-            # unicode-escape does this job, but can only decode latin1. So we
-            # attempt to encode in latin1.
-            return rv.encode("latin1").decode("unicode-escape")
+            rv = repr(value).decode("utf-8", "replace")
+
+            # At this point `rv` contains a bunch of literal escape codes, like
+            # this (exaggerated example):
+            #
+            # u"\\x2f"
+            #
+            # But we want to show this string as:
+            #
+            # u"/"
+            try:
+                # unicode-escape does this job, but can only decode latin1. So we
+                # attempt to encode in latin1.
+                return rv.encode("latin1").decode("unicode-escape")
+            except Exception:
+                # Since usually strings aren't latin1 this can break. In those
+                # cases we just give up.
+                return rv
         except Exception:
-            # Since usually strings aren't latin1 this can break. In those
-            # cases we just give up.
-            return rv
-    except Exception:
-        # If e.g. the call to `repr` already fails
-        return u"<broken repr>"
+            # If e.g. the call to `repr` already fails
+            return u"<broken repr>"
+
+
+else:
+
+    def safe_repr(value):
+        # type: (Any) -> str
+        try:
+            return repr(value)
+        except Exception:
+            return "<broken repr>"
 
 
 def filename_for_module(module, abs_path):

@@ -1,5 +1,5 @@
 from datetime import datetime
-
+import sys
 
 import pytest
 
@@ -30,3 +30,39 @@ def test_datetime_precision(dt, semaphore_normalize):
         # Float glitches can happen, and more glitches can happen
         # because we try to work around some float glitches in semaphore
         assert (dt - dt2).total_seconds() < 1.0
+
+    @given(binary=st.binary(min_size=1))
+    def test_bytes_serialization_decode_many(binary, message_normalizer):
+        result = message_normalizer(binary, should_repr_strings=False)
+        assert result == binary.decode("utf-8", "replace")
+
+    @given(binary=st.binary(min_size=1))
+    def test_bytes_serialization_repr_many(binary, message_normalizer):
+        result = message_normalizer(binary, should_repr_strings=True)
+        assert result == repr(binary)
+
+
+@pytest.fixture
+def message_normalizer(semaphore_normalize):
+    if semaphore_normalize({"test": "test"}) is None:
+        pytest.skip("no semaphore available")
+
+    def inner(message, **kwargs):
+        event = serialize({"logentry": {"message": message}}, **kwargs)
+        normalized = semaphore_normalize(event)
+        return normalized["logentry"]["message"]
+
+    return inner
+
+
+def test_bytes_serialization_decode(message_normalizer):
+    binary = b"abc123\x80\xf0\x9f\x8d\x95"
+    result = message_normalizer(binary, should_repr_strings=False)
+    assert result == u"abc123\ufffd\U0001f355"
+
+
+@pytest.mark.xfail(sys.version_info < (3,), reason="Known safe_repr bugs in Py2.7")
+def test_bytes_serialization_repr(message_normalizer):
+    binary = b"abc123\x80\xf0\x9f\x8d\x95"
+    result = message_normalizer(binary, should_repr_strings=True)
+    assert result == r"b'abc123\x80\xf0\x9f\x8d\x95'"
@@ -36,6 +36,19 @@ def test_safe_repr_regressions():
     assert u"лошадь" in safe_repr(u"лошадь")
 
 
+@pytest.mark.xfail(
+    sys.version_info < (3,),
+    reason="Fixing this in Python 2 would break other behaviors",
+)
+@pytest.mark.parametrize("prefix", (u"", u"abcd", u"лошадь"))
+@pytest.mark.parametrize("character", u"\x00\x07\x1b\n")
+def test_safe_repr_non_printable(prefix, character):
+    """Check that non-printable characters are escaped"""
+    string = prefix + character
+    assert character not in safe_repr(string)
+    assert character not in safe_repr(string.encode("utf-8"))
+
+
 def test_abs_path():
     """Check if abs_path is actually an absolute path. This can happen either
     with eval/exec like here, or when the file in the frame is relative to