
Commit afdcc96

tsan: optimize memory access functions
The optimization is two-fold. First, the algorithm now uses SSE instructions to handle all 4 shadow slots at once, which makes processing faster. Second, if the shadow already contains the same access, we do not store the event into the trace. This increases the effective trace size, that is, tsan can remember up to 10x more previous memory accesses.

Performance impact:

Before:
[ OK ] DISABLED_BENCH.Mop8Read (2461 ms)
[ OK ] DISABLED_BENCH.Mop8Write (1836 ms)

After:
[ OK ] DISABLED_BENCH.Mop8Read (1204 ms)
[ OK ] DISABLED_BENCH.Mop8Write (976 ms)

This measures only the fast path, though; on large real applications the speedup is ~20%.

Trace size impact:

On app1:
Memory accesses: 1163265870
Including same: 791312905 (68%)

On app2:
Memory accesses: 166875345
Including same: 150449689 (90%)

90% of filtered events means that the trace size is effectively 10x larger.

llvm-svn: 209897
1 parent: a233242 · commit: afdcc96
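To make the trace-size claim above concrete, here is a small standalone C++ program (an illustration added here, not part of the commit; the file name trace_coverage.cc is hypothetical) that computes the effective trace coverage implied by the quoted filtering rates: if a fraction f of accesses is filtered out before being stored, a trace of fixed capacity covers 1/(1-f) times as many accesses.

// trace_coverage.cc -- hedged illustration of the commit-message arithmetic.
// Not tsan code; 0.68 and 0.90 are the app1/app2 filtering rates quoted above.
#include <cstdio>

int main() {
  const double filtered[] = {0.68, 0.90};  // fraction of accesses not traced
  for (double f : filtered) {
    // A trace that holds N events now covers N / (1 - f) accesses.
    std::printf("filtered %2.0f%% -> effective trace coverage: %.1fx\n",
                100.0 * f, 1.0 / (1.0 - f));
  }
  return 0;
}

Running it prints roughly 3.1x for app1 and 10.0x for app2, which is where the "up to 10x" figure comes from.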

10 files changed (+202, -83 lines)


compiler-rt/lib/sanitizer_common/tests/sanitizer_deadlock_detector_test.cc

Lines changed: 5 additions & 0 deletions
@@ -268,9 +268,14 @@ void RunMultipleEpochsTest() {
   }
   EXPECT_EQ(d.testOnlyGetEpoch(), 4 * d.size());
 
+#if TSAN_DEBUG == 0
+  // EXPECT_DEATH clones a thread with 4K stack,
+  // which is overflown by tsan memory accesses functions in debug mode.
+
   // Can not handle the locks from the previous epoch.
   // The caller should update the lock id.
   EXPECT_DEATH(d.onLock(&dtls, l0), "CHECK failed.*current_epoch_");
+#endif
 }
 
 TEST(DeadlockDetector, MultipleEpochsTest) {

compiler-rt/lib/tsan/check_analyze.sh

Lines changed: 13 additions & 7 deletions
@@ -8,11 +8,11 @@ PrintRes() {
 
 PrintRes
 
-mops="write1 \
+wmops="write1 \
   write2 \
   write4 \
-  write8 \
-  read1 \
+  write8"
+rmops="read1 \
   read2 \
   read4 \
   read8"
@@ -27,10 +27,16 @@ check() {
   fi
 }
 
-for f in $mops; do
-  check $f rsp 1 # To read caller pc.
-  check $f push 0
-  check $f pop 0
+for f in $wmops; do
+  check $f rsp 3
+  check $f push 1
+  check $f pop 5
+done
+
+for f in $rmops; do
+  check $f rsp 3
+  check $f push 1
+  check $f pop 4
 done
 
 for f in $func; do

compiler-rt/lib/tsan/rtl/Makefile.old

Lines changed: 1 addition & 1 deletion
@@ -1,4 +1,4 @@
-CXXFLAGS = -std=c++11 -fPIE -g -Wall -Werror -fno-builtin -DTSAN_DEBUG=$(DEBUG) -DSANITIZER_DEBUG=$(DEBUG)
+CXXFLAGS = -std=c++11 -fPIE -g -Wall -Werror -fno-builtin -msse3 -DTSAN_DEBUG=$(DEBUG) -DSANITIZER_DEBUG=$(DEBUG)
 CLANG=clang
 ifeq ($(DEBUG), 0)
   CXXFLAGS += -O3

compiler-rt/lib/tsan/rtl/tsan_defs.h

Lines changed: 1 addition & 0 deletions
@@ -54,6 +54,7 @@ const uptr kShadowCnt = TSAN_SHADOW_COUNT;
 # endif
 #else
 // Count of shadow values in a shadow cell.
+#define TSAN_SHADOW_COUNT 4
 const uptr kShadowCnt = 4;
 #endif

compiler-rt/lib/tsan/rtl/tsan_rtl.cc

Lines changed: 137 additions & 15 deletions
@@ -25,6 +25,16 @@
 #include "tsan_suppressions.h"
 #include "tsan_symbolize.h"
 
+#ifdef __SSE3__
+// <emmintrin.h> transitively includes <stdlib.h>,
+// and it's prohibited to include std headers into tsan runtime.
+// So we do this dirty trick.
+#define _MM_MALLOC_H_INCLUDED
+#define __MM_MALLOC_H
+#include <emmintrin.h>
+typedef __m128i m128;
+#endif
+
 volatile int __tsan_resumed = 0;
 
 extern "C" void __tsan_resume() {
@@ -471,7 +481,8 @@ void StoreIfNotYetStored(u64 *sp, u64 *s) {
   *s = 0;
 }
 
-static inline void HandleRace(ThreadState *thr, u64 *shadow_mem,
+ALWAYS_INLINE
+void HandleRace(ThreadState *thr, u64 *shadow_mem,
                               Shadow cur, Shadow old) {
   thr->racy_state[0] = cur.raw();
   thr->racy_state[1] = old.raw();
@@ -483,16 +494,12 @@ static inline void HandleRace(ThreadState *thr, u64 *shadow_mem,
 #endif
 }
 
-static inline bool OldIsInSameSynchEpoch(Shadow old, ThreadState *thr) {
-  return old.epoch() >= thr->fast_synch_epoch;
-}
-
 static inline bool HappensBefore(Shadow old, ThreadState *thr) {
   return thr->clock.get(old.TidWithIgnore()) >= old.epoch();
 }
 
-ALWAYS_INLINE USED
-void MemoryAccessImpl(ThreadState *thr, uptr addr,
+ALWAYS_INLINE
+void MemoryAccessImpl1(ThreadState *thr, uptr addr,
     int kAccessSizeLog, bool kAccessIsWrite, bool kIsAtomic,
     u64 *shadow_mem, Shadow cur) {
   StatInc(thr, StatMop);
@@ -586,6 +593,90 @@ void UnalignedMemoryAccess(ThreadState *thr, uptr pc, uptr addr,
   }
 }
 
+ALWAYS_INLINE
+bool ContainsSameAccessSlow(u64 *s, u64 a, u64 sync_epoch, bool is_write) {
+  Shadow cur(a);
+  for (uptr i = 0; i < kShadowCnt; i++) {
+    Shadow old(LoadShadow(&s[i]));
+    if (Shadow::Addr0AndSizeAreEqual(cur, old) &&
+        old.TidWithIgnore() == cur.TidWithIgnore() &&
+        old.epoch() > sync_epoch &&
+        old.IsAtomic() == cur.IsAtomic() &&
+        old.IsRead() <= cur.IsRead())
+      return true;
+  }
+  return false;
+}
+
+#if defined(__SSE3__) && TSAN_SHADOW_COUNT == 4
+#define SHUF(v0, v1, i0, i1, i2, i3) _mm_castps_si128(_mm_shuffle_ps( \
+    _mm_castsi128_ps(v0), _mm_castsi128_ps(v1), \
+    (i0)*1 + (i1)*4 + (i2)*16 + (i3)*64))
+ALWAYS_INLINE
+bool ContainsSameAccessFast(u64 *s, u64 a, u64 sync_epoch, bool is_write) {
+  // This is an optimized version of ContainsSameAccessSlow.
+  // load current access into access[0:63]
+  const m128 access = _mm_cvtsi64_si128(a);
+  // duplicate high part of access in addr0:
+  // addr0[0:31] = access[32:63]
+  // addr0[32:63] = access[32:63]
+  // addr0[64:95] = access[32:63]
+  // addr0[96:127] = access[32:63]
+  const m128 addr0 = SHUF(access, access, 1, 1, 1, 1);
+  // load 4 shadow slots
+  const m128 shadow0 = _mm_load_si128((__m128i*)s);
+  const m128 shadow1 = _mm_load_si128((__m128i*)s + 1);
+  // load high parts of 4 shadow slots into addr_vect:
+  // addr_vect[0:31] = shadow0[32:63]
+  // addr_vect[32:63] = shadow0[96:127]
+  // addr_vect[64:95] = shadow1[32:63]
+  // addr_vect[96:127] = shadow1[96:127]
+  m128 addr_vect = SHUF(shadow0, shadow1, 1, 3, 1, 3);
+  if (!is_write) {
+    // set IsRead bit in addr_vect
+    const m128 rw_mask1 = _mm_cvtsi64_si128(1<<15);
+    const m128 rw_mask = SHUF(rw_mask1, rw_mask1, 0, 0, 0, 0);
+    addr_vect = _mm_or_si128(addr_vect, rw_mask);
+  }
+  // addr0 == addr_vect?
+  const m128 addr_res = _mm_cmpeq_epi32(addr0, addr_vect);
+  // epoch1[0:63] = sync_epoch
+  const m128 epoch1 = _mm_cvtsi64_si128(sync_epoch);
+  // epoch[0:31] = sync_epoch[0:31]
+  // epoch[32:63] = sync_epoch[0:31]
+  // epoch[64:95] = sync_epoch[0:31]
+  // epoch[96:127] = sync_epoch[0:31]
+  const m128 epoch = SHUF(epoch1, epoch1, 0, 0, 0, 0);
+  // load low parts of shadow cell epochs into epoch_vect:
+  // epoch_vect[0:31] = shadow0[0:31]
+  // epoch_vect[32:63] = shadow0[64:95]
+  // epoch_vect[64:95] = shadow1[0:31]
+  // epoch_vect[96:127] = shadow1[64:95]
+  const m128 epoch_vect = SHUF(shadow0, shadow1, 0, 2, 0, 2);
+  // epoch_vect >= sync_epoch?
+  const m128 epoch_res = _mm_cmpgt_epi32(epoch_vect, epoch);
+  // addr_res & epoch_res
+  const m128 res = _mm_and_si128(addr_res, epoch_res);
+  // mask[0] = res[7]
+  // mask[1] = res[15]
+  // ...
+  // mask[15] = res[127]
+  const int mask = _mm_movemask_epi8(res);
+  return mask != 0;
+}
+#endif
+
+ALWAYS_INLINE
+bool ContainsSameAccess(u64 *s, u64 a, u64 sync_epoch, bool is_write) {
+#if defined(__SSE3__) && TSAN_SHADOW_COUNT == 4
+  bool res = ContainsSameAccessFast(s, a, sync_epoch, is_write);
+  DCHECK_EQ(res, ContainsSameAccessSlow(s, a, sync_epoch, is_write));
+  return res;
+#else
+  return ContainsSameAccessSlow(s, a, sync_epoch, is_write);
+#endif
+}
+
 ALWAYS_INLINE USED
 void MemoryAccess(ThreadState *thr, uptr pc, uptr addr,
     int kAccessSizeLog, bool kAccessIsWrite, bool kIsAtomic) {
@@ -618,22 +709,53 @@ void MemoryAccess(ThreadState *thr, uptr pc, uptr addr,
   }
 
   FastState fast_state = thr->fast_state;
-  if (fast_state.GetIgnoreBit())
+  if (fast_state.GetIgnoreBit()) {
+    StatInc(thr, StatMop);
+    StatInc(thr, kAccessIsWrite ? StatMopWrite : StatMopRead);
+    StatInc(thr, (StatType)(StatMop1 + kAccessSizeLog));
+    StatInc(thr, StatMopIgnored);
     return;
-  if (kCollectHistory) {
-    fast_state.IncrementEpoch();
-    thr->fast_state = fast_state;
-    // We must not store to the trace if we do not store to the shadow.
-    // That is, this call must be moved somewhere below.
-    TraceAddEvent(thr, fast_state, EventTypeMop, pc);
   }
 
   Shadow cur(fast_state);
   cur.SetAddr0AndSizeLog(addr & 7, kAccessSizeLog);
   cur.SetWrite(kAccessIsWrite);
   cur.SetAtomic(kIsAtomic);
 
-  MemoryAccessImpl(thr, addr, kAccessSizeLog, kAccessIsWrite, kIsAtomic,
+  if (LIKELY(ContainsSameAccess(shadow_mem, cur.raw(),
+      thr->fast_synch_epoch, kAccessIsWrite))) {
+    StatInc(thr, StatMop);
+    StatInc(thr, kAccessIsWrite ? StatMopWrite : StatMopRead);
+    StatInc(thr, (StatType)(StatMop1 + kAccessSizeLog));
+    StatInc(thr, StatMopSame);
+    return;
+  }
+
+  if (kCollectHistory) {
+    fast_state.IncrementEpoch();
+    TraceAddEvent(thr, fast_state, EventTypeMop, pc);
+    thr->fast_state = fast_state;
+    cur.IncrementEpoch();
+  }
+
+  MemoryAccessImpl1(thr, addr, kAccessSizeLog, kAccessIsWrite, kIsAtomic,
+      shadow_mem, cur);
+}
+
+// Called by MemoryAccessRange in tsan_rtl_thread.cc
+void MemoryAccessImpl(ThreadState *thr, uptr addr,
+    int kAccessSizeLog, bool kAccessIsWrite, bool kIsAtomic,
+    u64 *shadow_mem, Shadow cur) {
+  if (LIKELY(ContainsSameAccess(shadow_mem, cur.raw(),
+      thr->fast_synch_epoch, kAccessIsWrite))) {
+    StatInc(thr, StatMop);
+    StatInc(thr, kAccessIsWrite ? StatMopWrite : StatMopRead);
+    StatInc(thr, (StatType)(StatMop1 + kAccessSizeLog));
+    StatInc(thr, StatMopSame);
+    return;
+  }
+
+  MemoryAccessImpl1(thr, addr, kAccessSizeLog, kAccessIsWrite, kIsAtomic,
       shadow_mem, cur);
 }
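As an aid to reading the SSE code in ContainsSameAccessFast above, here is a small self-contained demo (added for illustration, not part of the commit; it assumes an SSE2-capable x86-64 target, and the file name shuf_demo.cc is hypothetical) showing what the SHUF macro computes: a vector of 32-bit lanes { v0[i0], v0[i1], v1[i2], v1[i3] }, which is how the fast path gathers the high halves (addr/tid bits) and low halves (epoch bits) of the four 64-bit shadow slots.

// shuf_demo.cc -- standalone illustration of the SHUF lane gather.
// Build sketch (assumption): clang++ -std=c++11 -msse2 shuf_demo.cc
#include <emmintrin.h>
#include <cstdint>
#include <cstdio>

#define SHUF(v0, v1, i0, i1, i2, i3) _mm_castps_si128(_mm_shuffle_ps( \
    _mm_castsi128_ps(v0), _mm_castsi128_ps(v1), \
    (i0)*1 + (i1)*4 + (i2)*16 + (i3)*64))

int main() {
  // Four 64-bit "shadow slots" laid out as in the runtime: two 128-bit
  // vectors, each slot written here as {low 32 bits, high 32 bits}.
  alignas(16) std::uint32_t s01[4] = {0x00000001, 0xAAAAAAAA,
                                      0x00000002, 0xBBBBBBBB};
  alignas(16) std::uint32_t s23[4] = {0x00000003, 0xCCCCCCCC,
                                      0x00000004, 0xDDDDDDDD};
  const __m128i shadow0 = _mm_load_si128((const __m128i *)s01);
  const __m128i shadow1 = _mm_load_si128((const __m128i *)s23);

  // High halves of all four slots (what the fast path calls addr_vect).
  const __m128i high = SHUF(shadow0, shadow1, 1, 3, 1, 3);
  // Low halves of all four slots (what the fast path calls epoch_vect).
  const __m128i low  = SHUF(shadow0, shadow1, 0, 2, 0, 2);

  alignas(16) std::uint32_t h[4], l[4];
  _mm_store_si128((__m128i *)h, high);
  _mm_store_si128((__m128i *)l, low);
  std::printf("high: %08x %08x %08x %08x\n", h[0], h[1], h[2], h[3]);
  std::printf("low : %08x %08x %08x %08x\n", l[0], l[1], l[2], l[3]);
  // Expected output:
  //   high: aaaaaaaa bbbbbbbb cccccccc dddddddd
  //   low : 00000001 00000002 00000003 00000004
  return 0;
}

The immediate (i0)*1 + (i1)*4 + (i2)*16 + (i3)*64 packs the four 2-bit lane selectors of _mm_shuffle_ps, so the result takes its first two lanes from v0 and its last two from v1; since each 64-bit shadow slot spans two 32-bit lanes, selecting lanes 1 and 3 extracts the upper halves and lanes 0 and 2 the lower halves.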
