Skip to content

Commit 2f916d3

Browse files
authored
perf(profiling): Performance tweaks to profile sampler (#1789)
This contains some small tweaks to speed up the profiler. - changed from a namedtuple to a regular tuple as namedtuples were much slower but the tradeoff here is that it's more legible - moved away from `os.path.abspath` as it was doing some extra operations that were unnecessary for our use case - use the previous sample as a cache while sampling
1 parent dfb04f5 commit 2f916d3

File tree

2 files changed

+201
-129
lines changed

2 files changed

+201
-129
lines changed

sentry_sdk/profiler.py

Lines changed: 111 additions & 62 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@
2121
import threading
2222
import time
2323
import uuid
24-
from collections import deque, namedtuple
24+
from collections import deque
2525
from contextlib import contextmanager
2626

2727
import sentry_sdk
@@ -35,10 +35,6 @@
3535
nanosecond_time,
3636
)
3737

38-
RawFrameData = namedtuple(
39-
"RawFrameData", ["abs_path", "filename", "function", "lineno", "module"]
40-
)
41-
4238
if MYPY:
4339
from types import FrameType
4440
from typing import Any
@@ -54,9 +50,17 @@
5450
import sentry_sdk.scope
5551
import sentry_sdk.tracing
5652

57-
RawStack = Tuple[RawFrameData, ...]
58-
RawSample = Sequence[Tuple[str, RawStack]]
59-
RawSampleWithId = Sequence[Tuple[str, int, RawStack]]
53+
StackId = int
54+
55+
RawFrame = Tuple[
56+
str, # abs_path
57+
Optional[str], # module
58+
Optional[str], # filename
59+
str, # function
60+
int, # lineno
61+
]
62+
RawStack = Tuple[RawFrame, ...]
63+
RawSample = Sequence[Tuple[str, Tuple[StackId, RawStack]]]
6064

6165
ProcessedStack = Tuple[int, ...]
6266

@@ -155,8 +159,13 @@ def teardown_profiler():
155159
MAX_STACK_DEPTH = 128
156160

157161

158-
def extract_stack(frame, max_stack_depth=MAX_STACK_DEPTH):
159-
# type: (Optional[FrameType], int) -> Tuple[RawFrameData, ...]
162+
def extract_stack(
163+
frame, # type: Optional[FrameType]
164+
cwd, # type: str
165+
prev_cache=None, # type: Optional[Tuple[StackId, RawStack, Deque[FrameType]]]
166+
max_stack_depth=MAX_STACK_DEPTH, # type: int
167+
):
168+
# type: (...) -> Tuple[StackId, RawStack, Deque[FrameType]]
160169
"""
161170
Extracts the stack starting the specified frame. The extracted stack
162171
assumes the specified frame is the top of the stack, and works back
@@ -166,30 +175,71 @@ def extract_stack(frame, max_stack_depth=MAX_STACK_DEPTH):
166175
only the first `MAX_STACK_DEPTH` frames will be returned.
167176
"""
168177

169-
stack = deque(maxlen=max_stack_depth) # type: Deque[FrameType]
178+
frames = deque(maxlen=max_stack_depth) # type: Deque[FrameType]
170179

171180
while frame is not None:
172-
stack.append(frame)
181+
frames.append(frame)
173182
frame = frame.f_back
174183

175-
return tuple(extract_frame(frame) for frame in stack)
184+
if prev_cache is None:
185+
stack = tuple(extract_frame(frame, cwd) for frame in frames)
186+
else:
187+
_, prev_stack, prev_frames = prev_cache
188+
prev_depth = len(prev_frames)
189+
depth = len(frames)
190+
191+
# We want to match the frame found in this sample to the frames found in the
192+
# previous sample. If they are the same (using the `is` operator), we can
193+
# skip the expensive work of extracting the frame information and reuse what
194+
# we extracted during the last sample.
195+
#
196+
# Make sure to keep in mind that the stack is ordered from the inner most
197+
# from to the outer most frame so be careful with the indexing.
198+
stack = tuple(
199+
prev_stack[i]
200+
if i >= 0 and frame is prev_frames[i]
201+
else extract_frame(frame, cwd)
202+
for i, frame in zip(range(prev_depth - depth, prev_depth), frames)
203+
)
204+
205+
# Instead of mapping the stack into frame ids and hashing
206+
# that as a tuple, we can directly hash the stack.
207+
# This saves us from having to generate yet another list.
208+
# Additionally, using the stack as the key directly is
209+
# costly because the stack can be large, so we pre-hash
210+
# the stack, and use the hash as the key as this will be
211+
# needed a few times to improve performance.
212+
stack_id = hash(stack)
176213

214+
return stack_id, stack, frames
177215

178-
def extract_frame(frame):
179-
# type: (FrameType) -> RawFrameData
216+
217+
def extract_frame(frame, cwd):
218+
# type: (FrameType, str) -> RawFrame
180219
abs_path = frame.f_code.co_filename
181220

182221
try:
183222
module = frame.f_globals["__name__"]
184223
except Exception:
185224
module = None
186225

187-
return RawFrameData(
188-
abs_path=os.path.abspath(abs_path),
189-
filename=filename_for_module(module, abs_path) or None,
190-
function=get_frame_name(frame),
191-
lineno=frame.f_lineno,
192-
module=module,
226+
# namedtuples can be many times slower when initialing
227+
# and accessing attribute so we opt to use a tuple here instead
228+
return (
229+
# This originally was `os.path.abspath(abs_path)` but that had
230+
# a large performance overhead.
231+
#
232+
# According to docs, this is equivalent to
233+
# `os.path.normpath(os.path.join(os.getcwd(), path))`.
234+
# The `os.getcwd()` call is slow here, so we precompute it.
235+
#
236+
# Additionally, since we are using normalized path already,
237+
# we skip calling `os.path.normpath` entirely.
238+
os.path.join(cwd, abs_path),
239+
module,
240+
filename_for_module(module, abs_path) or None,
241+
get_frame_name(frame),
242+
frame.f_lineno,
193243
)
194244

195245

@@ -200,6 +250,8 @@ def get_frame_name(frame):
200250
# we should consider using instead where possible
201251

202252
f_code = frame.f_code
253+
co_varnames = f_code.co_varnames
254+
203255
# co_name only contains the frame name. If the frame was a method,
204256
# the class name will NOT be included.
205257
name = f_code.co_name
@@ -210,8 +262,8 @@ def get_frame_name(frame):
210262
if (
211263
# the co_varnames start with the frame's positional arguments
212264
# and we expect the first to be `self` if its an instance method
213-
f_code.co_varnames
214-
and f_code.co_varnames[0] == "self"
265+
co_varnames
266+
and co_varnames[0] == "self"
215267
and "self" in frame.f_locals
216268
):
217269
for cls in frame.f_locals["self"].__class__.__mro__:
@@ -226,8 +278,8 @@ def get_frame_name(frame):
226278
if (
227279
# the co_varnames start with the frame's positional arguments
228280
# and we expect the first to be `cls` if its a class method
229-
f_code.co_varnames
230-
and f_code.co_varnames[0] == "cls"
281+
co_varnames
282+
and co_varnames[0] == "cls"
231283
and "cls" in frame.f_locals
232284
):
233285
for cls in frame.f_locals["cls"].__mro__:
@@ -338,13 +390,11 @@ class SampleBuffer(object):
338390
def __init__(self, capacity):
339391
# type: (int) -> None
340392

341-
self.buffer = [
342-
None
343-
] * capacity # type: List[Optional[Tuple[int, RawSampleWithId]]]
393+
self.buffer = [None] * capacity # type: List[Optional[Tuple[int, RawSample]]]
344394
self.capacity = capacity # type: int
345395
self.idx = 0 # type: int
346396

347-
def write(self, ts, raw_sample):
397+
def write(self, ts, sample):
348398
# type: (int, RawSample) -> None
349399
"""
350400
Writing to the buffer is not thread safe. There is the possibility
@@ -359,40 +409,24 @@ def write(self, ts, raw_sample):
359409
"""
360410
idx = self.idx
361411

362-
sample = [
363-
(
364-
thread_id,
365-
# Instead of mapping the stack into frame ids and hashing
366-
# that as a tuple, we can directly hash the stack.
367-
# This saves us from having to generate yet another list.
368-
# Additionally, using the stack as the key directly is
369-
# costly because the stack can be large, so we pre-hash
370-
# the stack, and use the hash as the key as this will be
371-
# needed a few times to improve performance.
372-
hash(stack),
373-
stack,
374-
)
375-
for thread_id, stack in raw_sample
376-
]
377-
378412
self.buffer[idx] = (ts, sample)
379413
self.idx = (idx + 1) % self.capacity
380414

381415
def slice_profile(self, start_ns, stop_ns):
382416
# type: (int, int) -> ProcessedProfile
383417
samples = [] # type: List[ProcessedSample]
384-
stacks = dict() # type: Dict[int, int]
385-
stacks_list = list() # type: List[ProcessedStack]
386-
frames = dict() # type: Dict[RawFrameData, int]
387-
frames_list = list() # type: List[ProcessedFrame]
418+
stacks = {} # type: Dict[StackId, int]
419+
stacks_list = [] # type: List[ProcessedStack]
420+
frames = {} # type: Dict[RawFrame, int]
421+
frames_list = [] # type: List[ProcessedFrame]
388422

389423
for ts, sample in filter(None, self.buffer):
390424
if start_ns > ts or ts > stop_ns:
391425
continue
392426

393427
elapsed_since_start_ns = str(ts - start_ns)
394428

395-
for tid, hashed_stack, stack in sample:
429+
for tid, (hashed_stack, stack) in sample:
396430
# Check if the stack is indexed first, this lets us skip
397431
# indexing frames if it's not necessary
398432
if hashed_stack not in stacks:
@@ -401,11 +435,11 @@ def slice_profile(self, start_ns, stop_ns):
401435
frames[frame] = len(frames)
402436
frames_list.append(
403437
{
404-
"abs_path": frame.abs_path,
405-
"function": frame.function or "<unknown>",
406-
"filename": frame.filename,
407-
"lineno": frame.lineno,
408-
"module": frame.module,
438+
"abs_path": frame[0],
439+
"module": frame[1],
440+
"filename": frame[2],
441+
"function": frame[3],
442+
"lineno": frame[4],
409443
}
410444
)
411445

@@ -439,6 +473,14 @@ def slice_profile(self, start_ns, stop_ns):
439473

440474
def make_sampler(self):
441475
# type: () -> Callable[..., None]
476+
cwd = os.getcwd()
477+
478+
# In Python3+, we can use the `nonlocal` keyword to rebind the value,
479+
# but this is not possible in Python2. To get around this, we wrap
480+
# the value in a list to allow updating this value each sample.
481+
last_sample = [
482+
{}
483+
] # type: List[Dict[int, Tuple[StackId, RawStack, Deque[FrameType]]]]
442484

443485
def _sample_stack(*args, **kwargs):
444486
# type: (*Any, **Any) -> None
@@ -447,13 +489,20 @@ def _sample_stack(*args, **kwargs):
447489
This should be called at a regular interval to collect samples.
448490
"""
449491

450-
self.write(
451-
nanosecond_time(),
452-
[
453-
(str(tid), extract_stack(frame))
454-
for tid, frame in sys._current_frames().items()
455-
],
456-
)
492+
now = nanosecond_time()
493+
raw_sample = {
494+
tid: extract_stack(frame, cwd, last_sample[0].get(tid))
495+
for tid, frame in sys._current_frames().items()
496+
}
497+
498+
last_sample[0] = raw_sample
499+
500+
sample = [
501+
(str(tid), (stack_id, stack))
502+
for tid, (stack_id, stack, _) in raw_sample.items()
503+
]
504+
505+
self.write(now, sample)
457506

458507
return _sample_stack
459508

0 commit comments

Comments
 (0)