Skip to content

Commit fda28ba

Browse files
authored
[DevTSAN] Refine thread mapping algorithm for better performance (#19010)
We select up to 4 work items in each work group to do detection; the total number of selected work items is no more than kThreadSlotCount.
1 parent 5e2d619 commit fda28ba

File tree

2 files changed

+52
-21
lines changed

2 files changed

+52
-21
lines changed

libdevice/include/group_utils.hpp

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,13 @@ static inline size_t WorkGroupLinearId() {
1818
__spirv_BuiltInWorkgroupId.z;
1919
}
2020

21+
// Linear index of this work item within its work group, computed from the
// SPIR-V local-invocation-id builtins in row-major (x, y, z) order.
static inline size_t LocalLinearId() {
  return __spirv_BuiltInLocalInvocationId.x * __spirv_BuiltInWorkgroupSize.y *
             __spirv_BuiltInWorkgroupSize.z +
         __spirv_BuiltInLocalInvocationId.y * __spirv_BuiltInWorkgroupSize.z +
         __spirv_BuiltInLocalInvocationId.z;
}
27+
2128
// For GPU device, each sub group is a hardware thread
2229
static inline size_t SubGroupLinearId() {
2330
return __spirv_BuiltInGlobalLinearId / __spirv_BuiltInSubgroupSize;

libdevice/sanitizer/tsan_rtl.cpp

Lines changed: 45 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -144,9 +144,21 @@ inline __SYCL_GLOBAL__ RawShadow *MemToShadow(uptr addr, uint32_t as) {
144144
return shadow_ptr;
145145
}
146146

147-
inline Sid GetCurrentSid() {
148-
const auto lid = __spirv_BuiltInGlobalLinearId;
149-
return lid % kThreadSlotCount;
147+
// We selected up to 4 work items in each work group to do detection, the whole
148+
// number of selected work items no more than kThreadSlotCount. This may cause
149+
// some false negtive cases in non-uniform memory access which has data race.
150+
// Since the cases are very rare and the change will greatly reduce runtime
151+
// overhead, it should be worthwhile.
152+
inline int GetCurrentSid() {
153+
const size_t lid = LocalLinearId();
154+
const size_t ThreadPerWorkGroup =
155+
Min(4, __spirv_BuiltInWorkgroupSize.x * __spirv_BuiltInWorkgroupSize.y *
156+
__spirv_BuiltInWorkgroupSize.z);
157+
if (lid >= ThreadPerWorkGroup)
158+
return -1;
159+
160+
const size_t Id = lid + WorkGroupLinearId() * ThreadPerWorkGroup;
161+
return Id < kThreadSlotCount ? Id : -1;
150162
}
151163

152164
inline RawShadow LoadShadow(const __SYCL_GLOBAL__ RawShadow *p) {
@@ -315,7 +327,9 @@ inline bool ContainsSameAccess(__SYCL_GLOBAL__ RawShadow *s, Shadow cur,
315327
__SYCL_GLOBAL__ RawShadow *shadow_mem = MemToShadow(addr, as); \
316328
if (!shadow_mem) \
317329
return; \
318-
Sid sid = GetCurrentSid(); \
330+
int sid = GetCurrentSid(); \
331+
if (sid == -1) \
332+
return; \
319333
uint16_t current_clock = IncrementEpoch(sid) + 1; \
320334
TSAN_DEBUG(__spirv_ocl_printf(__tsan_print_raw_shadow, (void *)addr, as, \
321335
(void *)shadow_mem, shadow_mem[0], \
@@ -360,7 +374,9 @@ __tsan_read16(uptr addr, uint32_t as, const char __SYCL_CONSTANT__ *file,
360374
__SYCL_GLOBAL__ RawShadow *shadow_mem = MemToShadow(addr, as); \
361375
if (!shadow_mem) \
362376
return; \
363-
Sid sid = GetCurrentSid(); \
377+
int sid = GetCurrentSid(); \
378+
if (sid == -1) \
379+
return; \
364380
uint16_t current_clock = IncrementEpoch(sid) + 1; \
365381
AccessType type = is_write ? kAccessWrite : kAccessRead; \
366382
uptr size1 = Min(size, RoundUpTo(addr + 1, kShadowCell) - addr); \
@@ -499,39 +515,47 @@ DEVICE_EXTERN_C_NOINLINE void __tsan_cleanup_dynamic_local(uptr ptr,
499515
}
500516

501517
DEVICE_EXTERN_C_INLINE void __tsan_device_barrier() {
  int sid = GetCurrentSid();

  // Work items without a detection slot (sid == -1) skip the clock exchange
  // but must still participate in the barrier itself.
  if (sid != -1) {
    // sync current thread clock to global state
    TsanLaunchInfo->Clock[kThreadSlotCount].clk_[sid] =
        TsanLaunchInfo->Clock[sid].clk_[sid];
  }

  __spirv_ControlBarrier(__spv::Scope::Device, __spv::Scope::Device,
                         __spv::MemorySemanticsMask::SequentiallyConsistent |
                             __spv::MemorySemanticsMask::CrossWorkgroupMemory |
                             __spv::MemorySemanticsMask::WorkgroupMemory);

  if (sid != -1) {
    // sync global state back
    for (uptr i = 0; i < kThreadSlotCount; i++)
      TsanLaunchInfo->Clock[sid].clk_[i] =
          TsanLaunchInfo->Clock[kThreadSlotCount].clk_[i];
  }
}
518538

519539
DEVICE_EXTERN_C_INLINE void __tsan_group_barrier() {
  int sid = GetCurrentSid();

  // Work items without a detection slot (sid == -1) skip the clock exchange
  // but must still participate in the barrier itself.
  if (sid != -1) {
    // sync current thread clock to global state
    TsanLaunchInfo->Clock[kThreadSlotCount].clk_[sid] =
        TsanLaunchInfo->Clock[sid].clk_[sid];
  }

  __spirv_ControlBarrier(__spv::Scope::Workgroup, __spv::Scope::Workgroup,
                         __spv::MemorySemanticsMask::SequentiallyConsistent |
                             __spv::MemorySemanticsMask::CrossWorkgroupMemory |
                             __spv::MemorySemanticsMask::WorkgroupMemory);

  if (sid != -1) {
    // sync global state back
    for (uptr i = 0; i < kThreadSlotCount; i++)
      TsanLaunchInfo->Clock[sid].clk_[i] =
          TsanLaunchInfo->Clock[kThreadSlotCount].clk_[i];
  }
}
536560

537561
#endif // __SPIR__ || __SPIRV__

0 commit comments

Comments
 (0)