Skip to content

Fixed the serialization of byte-string objects in Python 3 #551

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
Nov 11, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 7 additions & 1 deletion sentry_sdk/serializer.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,11 +36,17 @@
# Importing ABCs from collections is deprecated, and will stop working in 3.8
# https://github.com/python/cpython/blob/master/Lib/collections/__init__.py#L49
from collections import Mapping, Sequence

serializable_str_types = string_types

else:
# New in 3.3
# https://docs.python.org/3/library/collections.abc.html
from collections.abc import Mapping, Sequence

# Bytes are technically not strings in Python 3, but we can serialize them
serializable_str_types = (str, bytes)

MAX_DATABAG_DEPTH = 5
MAX_DATABAG_BREADTH = 10
CYCLE_MARKER = u"<cyclic>"
Expand Down Expand Up @@ -285,7 +291,7 @@ def _serialize_node_impl(

return rv_dict

elif not isinstance(obj, string_types) and isinstance(obj, Sequence):
elif not isinstance(obj, serializable_str_types) and isinstance(obj, Sequence):
rv_list = []

for i, v in enumerate(obj):
Expand Down
58 changes: 34 additions & 24 deletions sentry_sdk/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -348,32 +348,42 @@ def safe_str(value):
return safe_repr(value)


def safe_repr(value):
# type: (Any) -> str
try:
rv = repr(value)
if isinstance(rv, bytes):
rv = rv.decode("utf-8", "replace")

# At this point `rv` contains a bunch of literal escape codes, like
# this (exaggerated example):
#
# u"\\x2f"
#
# But we want to show this string as:
#
# u"/"
if PY2:

def safe_repr(value):
# type: (Any) -> str
try:
# unicode-escape does this job, but can only decode latin1. So we
# attempt to encode in latin1.
return rv.encode("latin1").decode("unicode-escape")
rv = repr(value).decode("utf-8", "replace")

# At this point `rv` contains a bunch of literal escape codes, like
# this (exaggerated example):
#
# u"\\x2f"
#
# But we want to show this string as:
#
# u"/"
try:
# unicode-escape does this job, but can only decode latin1. So we
# attempt to encode in latin1.
return rv.encode("latin1").decode("unicode-escape")
except Exception:
# Since usually strings aren't latin1 this can break. In those
# cases we just give up.
return rv
except Exception:
# Since usually strings aren't latin1 this can break. In those
# cases we just give up.
return rv
except Exception:
# If e.g. the call to `repr` already fails
return u"<broken repr>"
# If e.g. the call to `repr` already fails
return u"<broken repr>"


else:

def safe_repr(value):
    # type: (Any) -> str
    """Return ``repr(value)``, or a placeholder when computing it fails.

    This is the variant defined on the non-PY2 branch: the result of
    ``repr`` is returned unchanged, with no decoding or unescaping step.

    :param value: Any object.
    :returns: The object's ``repr``, or ``"<broken repr>"`` if ``repr``
        raised an exception.
    """
    try:
        return repr(value)
    except Exception:
        # A custom ``__repr__`` may raise anything; report a marker instead
        # of letting the error escape to the caller.
        return "<broken repr>"


def filename_for_module(module, abs_path):
Expand Down
38 changes: 37 additions & 1 deletion tests/test_serializer.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from datetime import datetime

import sys

import pytest

Expand Down Expand Up @@ -30,3 +30,39 @@ def test_datetime_precision(dt, semaphore_normalize):
# Float glitches can happen, and more glitches can happen
# because we try to work around some float glitches in semaphore
assert (dt - dt2).total_seconds() < 1.0

@given(binary=st.binary(min_size=1))
def test_bytes_serialization_decode_many(binary, message_normalizer):
    """Generated byte strings normalize to their lossy UTF-8 decoding."""
    normalized = message_normalizer(binary, should_repr_strings=False)
    expected = binary.decode("utf-8", "replace")
    assert normalized == expected

@given(binary=st.binary(min_size=1))
def test_bytes_serialization_repr_many(binary, message_normalizer):
    """Generated byte strings normalize to ``repr(binary)`` in repr mode."""
    normalized = message_normalizer(binary, should_repr_strings=True)
    expected = repr(binary)
    assert normalized == expected


@pytest.fixture
def message_normalizer(semaphore_normalize):
    """Provide a helper that serializes and normalizes a log message.

    The returned callable wraps ``message`` in a ``logentry`` event, runs it
    through ``serialize`` (forwarding any keyword arguments) and then through
    ``semaphore_normalize``, and returns the normalized message.

    The requesting test is skipped when normalizing a probe event yields
    ``None``.
    """
    probe = semaphore_normalize({"test": "test"})
    if probe is None:
        pytest.skip("no semaphore available")

    def inner(message, **kwargs):
        payload = {"logentry": {"message": message}}
        normalized = semaphore_normalize(serialize(payload, **kwargs))
        return normalized["logentry"]["message"]

    return inner


def test_bytes_serialization_decode(message_normalizer):
    """The invalid byte becomes U+FFFD; the valid UTF-8 sequence is decoded."""
    raw = b"abc123\x80\xf0\x9f\x8d\x95"
    normalized = message_normalizer(raw, should_repr_strings=False)
    assert normalized == u"abc123\ufffd\U0001f355"


@pytest.mark.xfail(sys.version_info < (3,), reason="Known safe_repr bugs in Py2.7")
def test_bytes_serialization_repr(message_normalizer):
    """In repr mode the message is the escaped ``b'...'`` literal text."""
    raw = b"abc123\x80\xf0\x9f\x8d\x95"
    normalized = message_normalizer(raw, should_repr_strings=True)
    assert normalized == r"b'abc123\x80\xf0\x9f\x8d\x95'"
13 changes: 13 additions & 0 deletions tests/utils/test_general.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,19 @@ def test_safe_repr_regressions():
assert u"лошадь" in safe_repr(u"лошадь")


@pytest.mark.xfail(
    sys.version_info < (3,),
    reason="Fixing this in Python 2 would break other behaviors",
)
@pytest.mark.parametrize("prefix", (u"", u"abcd", u"лошадь"))
@pytest.mark.parametrize("character", u"\x00\x07\x1b\n")
def test_safe_repr_non_printable(prefix, character):
    """Check that non-printable characters are escaped.

    Covers both the text form and its UTF-8 encoded bytes form: the raw
    control character must not appear in either ``safe_repr`` result.
    """
    text = prefix + character
    text_repr = safe_repr(text)
    assert character not in text_repr
    bytes_repr = safe_repr(text.encode("utf-8"))
    assert character not in bytes_repr


def test_abs_path():
"""Check if abs_path is actually an absolute path. This can happen either
with eval/exec like here, or when the file in the frame is relative to
Expand Down