Skip to content

Commit 3db62cc

Browse files
zhaomaosuKornevNikita
authored andcommitted
[DevTSAN] Sync thread clocks before/after barrier call (#17900)
Barrier instructions will synchronize work items execution status, we need to sync thread clocks before/after barrier call to avoid false positive reports.
1 parent e855a3b commit 3db62cc

File tree

7 files changed

+324
-2
lines changed

7 files changed

+324
-2
lines changed

libdevice/sanitizer/tsan_rtl.cpp

Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -358,4 +358,61 @@ DEVICE_EXTERN_C_NOINLINE void __tsan_cleanup_private(uptr addr, uint32_t size) {
358358
}
359359
}
360360

361+
DEVICE_EXTERN_C_INLINE void __tsan_device_barrier() {
362+
Sid sid = GetCurrentSid();
363+
__spirv_ControlBarrier(__spv::Scope::Device, __spv::Scope::Device,
364+
__spv::MemorySemanticsMask::SequentiallyConsistent |
365+
__spv::MemorySemanticsMask::CrossWorkgroupMemory |
366+
__spv::MemorySemanticsMask::WorkgroupMemory);
367+
368+
// sync current thread clock to global state
369+
TsanLaunchInfo->Clock[kThreadSlotCount].clk_[sid] =
370+
TsanLaunchInfo->Clock[sid].clk_[sid];
371+
372+
__spirv_ControlBarrier(__spv::Scope::Device, __spv::Scope::Device,
373+
__spv::MemorySemanticsMask::SequentiallyConsistent |
374+
__spv::MemorySemanticsMask::CrossWorkgroupMemory |
375+
__spv::MemorySemanticsMask::WorkgroupMemory);
376+
377+
// sync global state back
378+
for (uptr i = 0; i < kThreadSlotCount; i++)
379+
TsanLaunchInfo->Clock[sid].clk_[i] =
380+
TsanLaunchInfo->Clock[kThreadSlotCount].clk_[i];
381+
382+
__spirv_ControlBarrier(__spv::Scope::Device, __spv::Scope::Device,
383+
__spv::MemorySemanticsMask::SequentiallyConsistent |
384+
__spv::MemorySemanticsMask::CrossWorkgroupMemory |
385+
__spv::MemorySemanticsMask::WorkgroupMemory);
386+
}
387+
388+
DEVICE_EXTERN_C_INLINE void __tsan_group_barrier() {
389+
if (TsanLaunchInfo->DeviceTy == DeviceType::CPU)
390+
return;
391+
392+
Sid sid = GetCurrentSid();
393+
__spirv_ControlBarrier(__spv::Scope::Workgroup, __spv::Scope::Workgroup,
394+
__spv::MemorySemanticsMask::SequentiallyConsistent |
395+
__spv::MemorySemanticsMask::CrossWorkgroupMemory |
396+
__spv::MemorySemanticsMask::WorkgroupMemory);
397+
398+
// sync current thread clock to global state
399+
TsanLaunchInfo->Clock[kThreadSlotCount].clk_[sid] =
400+
TsanLaunchInfo->Clock[sid].clk_[sid];
401+
402+
__spirv_ControlBarrier(__spv::Scope::Workgroup, __spv::Scope::Workgroup,
403+
__spv::MemorySemanticsMask::SequentiallyConsistent |
404+
__spv::MemorySemanticsMask::CrossWorkgroupMemory |
405+
__spv::MemorySemanticsMask::WorkgroupMemory);
406+
407+
// sync global state back
408+
for (uptr i = 0; i < kThreadSlotCount; i++)
409+
TsanLaunchInfo->Clock[sid].clk_[i] =
410+
TsanLaunchInfo->Clock[kThreadSlotCount].clk_[i];
411+
412+
__spirv_ControlBarrier(__spv::Scope::Workgroup, __spv::Scope::Workgroup,
413+
__spv::MemorySemanticsMask::SequentiallyConsistent |
414+
__spv::MemorySemanticsMask::CrossWorkgroupMemory |
415+
__spv::MemorySemanticsMask::WorkgroupMemory);
416+
}
417+
361418
#endif // __SPIR__ || __SPIRV__

llvm/lib/Transforms/Instrumentation/ThreadSanitizer.cpp

Lines changed: 42 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -123,6 +123,8 @@ struct ThreadSanitizerOnSpirv {
123123
bool instrumentAllocInst(Function *F,
124124
SmallVectorImpl<Instruction *> &AllocaInsts);
125125

126+
bool instrumentControlBarrier(CallInst *CI);
127+
126128
void appendDebugInfoToArgs(Instruction *I, SmallVectorImpl<Value *> &Args);
127129

128130
private:
@@ -148,6 +150,8 @@ struct ThreadSanitizerOnSpirv {
148150
// Accesses sizes are powers of two: 1, 2, 4, 8, 16.
149151
static const size_t kNumberOfAccessSizes = 5;
150152
FunctionCallee TsanCleanupPrivate;
153+
FunctionCallee TsanDeviceBarrier;
154+
FunctionCallee TsanGroupBarrier;
151155
FunctionCallee TsanRead[kNumberOfAccessSizes];
152156
FunctionCallee TsanWrite[kNumberOfAccessSizes];
153157

@@ -269,6 +273,14 @@ void ThreadSanitizerOnSpirv::initialize() {
269273
M.getOrInsertFunction("__tsan_cleanup_private", Attr, IRB.getVoidTy(),
270274
IntptrTy, IRB.getInt32Ty());
271275

276+
TsanDeviceBarrier = M.getOrInsertFunction(
277+
"__tsan_device_barrier", Attr.addFnAttribute(C, Attribute::Convergent),
278+
IRB.getVoidTy());
279+
280+
TsanGroupBarrier = M.getOrInsertFunction(
281+
"__tsan_group_barrier", Attr.addFnAttribute(C, Attribute::Convergent),
282+
IRB.getVoidTy());
283+
272284
for (size_t i = 0; i < kNumberOfAccessSizes; ++i) {
273285
const unsigned ByteSize = 1U << i;
274286
std::string ByteSizeStr = utostr(ByteSize);
@@ -312,6 +324,21 @@ bool ThreadSanitizerOnSpirv::instrumentAllocInst(
312324
return Changed;
313325
}
314326

327+
bool ThreadSanitizerOnSpirv::instrumentControlBarrier(CallInst *CI) {
328+
assert(isa<ConstantInt>(CI->getArgOperand(0)));
329+
uint64_t Scope = cast<ConstantInt>(CI->getArgOperand(0))->getZExtValue();
330+
// is not device scope or work group scope
331+
if (Scope != 1 && Scope != 2)
332+
return false;
333+
334+
InstrumentationIRBuilder IRB(CI);
335+
CallInst *NewCI =
336+
IRB.CreateCall(Scope == 1 ? TsanDeviceBarrier : TsanGroupBarrier, {});
337+
NewCI->setAttributes(NewCI->getCalledFunction()->getAttributes());
338+
CI->eraseFromParent();
339+
return true;
340+
}
341+
315342
void ThreadSanitizerOnSpirv::appendDebugInfoToArgs(
316343
Instruction *I, SmallVectorImpl<Value *> &Args) {
317344
auto &Loc = I->getDebugLoc();
@@ -824,6 +851,7 @@ bool ThreadSanitizer::sanitizeFunction(Function &F,
824851
SmallVector<Instruction*, 8> AtomicAccesses;
825852
SmallVector<Instruction*, 8> MemIntrinCalls;
826853
SmallVector<Instruction *, 8> Allocas;
854+
SmallVector<CallInst *, 8> SpirControlBarrierCalls;
827855
bool Res = false;
828856
bool HasCalls = false;
829857
bool SanitizeFunction = F.hasFnAttribute(Attribute::SanitizeThread);
@@ -844,8 +872,16 @@ bool ThreadSanitizer::sanitizeFunction(Function &F,
844872
Allocas.push_back(&Inst);
845873
else if ((isa<CallInst>(Inst) && !isa<DbgInfoIntrinsic>(Inst)) ||
846874
isa<InvokeInst>(Inst)) {
847-
if (CallInst *CI = dyn_cast<CallInst>(&Inst))
875+
if (CallInst *CI = dyn_cast<CallInst>(&Inst)) {
848876
maybeMarkSanitizerLibraryCallNoBuiltin(CI, &TLI);
877+
if (Spirv) {
878+
Function *CalledFn = CI->getCalledFunction();
879+
if (CalledFn &&
880+
CalledFn->getName() == "_Z22__spirv_ControlBarrieriii") {
881+
SpirControlBarrierCalls.push_back(CI);
882+
}
883+
}
884+
}
849885
if (isa<MemIntrinsic>(Inst))
850886
MemIntrinCalls.push_back(&Inst);
851887
HasCalls = true;
@@ -884,6 +920,11 @@ bool ThreadSanitizer::sanitizeFunction(Function &F,
884920
InsertRuntimeIgnores(F);
885921
}
886922

923+
if (Spirv)
924+
for (auto *CI : SpirControlBarrierCalls) {
925+
Res |= Spirv->instrumentControlBarrier(CI);
926+
}
927+
887928
// FIXME: We need to skip the check for private memory, otherwise OpenCL CPU
888929
// device may generate false positive reports due to stack re-use in different
889930
// threads. However, SPIR-V builts 'ToPrivate' doesn't work as expected on
Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
; RUN: opt < %s -passes='function(tsan),module(tsan-module)' -tsan-instrument-func-entry-exit=0 -tsan-instrument-memintrinsics=0 -S | FileCheck %s
2+
target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-n8:16:32:64-G1"
3+
target triple = "spir64-unknown-unknown"
4+
5+
define spir_kernel void @CheckDeviceBarrier() {
6+
; CHECK-LABEL: void @CheckDeviceBarrier
7+
entry:
8+
call spir_func void @_Z22__spirv_ControlBarrieriii(i32 noundef 1, i32 noundef 1, i32 noundef 912)
9+
; CHECK: call void @__tsan_device_barrier
10+
br label %exit
11+
12+
exit: ; preds = %entry
13+
ret void
14+
}
15+
16+
define spir_kernel void @CheckGroupBarrier() {
17+
; CHECK-LABEL: void @CheckGroupBarrier
18+
entry:
19+
call spir_func void @_Z22__spirv_ControlBarrieriii(i32 noundef 2, i32 noundef 2, i32 noundef 912)
20+
; CHECK: call void @__tsan_group_barrier
21+
br label %exit
22+
23+
exit: ; preds = %entry
24+
ret void
25+
}
26+
27+
declare spir_func void @_Z22__spirv_ControlBarrieriii(i32 noundef, i32 noundef, i32 noundef)
28+
Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,60 @@
1+
// REQUIRES: linux, cpu || (gpu && level_zero)
2+
// RUN: %{build} %device_tsan_flags -O2 -g -o %t1.out
3+
// RUN: %{run} %t1.out 2>&1 | FileCheck %s
4+
// UNSUPPORTED: cpu
5+
// UNSUPPORTED-TRACKER: CMPLRLLVM-66827
6+
#include "sycl/detail/core.hpp"
7+
#include "sycl/ext/oneapi/experimental/root_group.hpp"
8+
#include "sycl/group_barrier.hpp"
9+
#include "sycl/usm.hpp"
10+
11+
const size_t N = 32;
12+
13+
struct TestKernel {
14+
int *m_array;
15+
TestKernel(int *array) : m_array(array) {}
16+
17+
void operator()(sycl::nd_item<1> item) const {
18+
auto root = item.ext_oneapi_get_root_group();
19+
if (item.get_group_linear_id() == 0 && item.get_local_linear_id() == 0)
20+
m_array[0]++;
21+
22+
sycl::group_barrier(root);
23+
24+
if (item.get_group_linear_id() == 1 && item.get_local_linear_id() == 0)
25+
m_array[0]++;
26+
27+
sycl::group_barrier(root);
28+
29+
if (item.get_group_linear_id() == 2 && item.get_local_linear_id() == 0)
30+
m_array[0]++;
31+
32+
sycl::group_barrier(root);
33+
34+
if (item.get_group_linear_id() == 3 && item.get_local_linear_id() == 0)
35+
m_array[0]++;
36+
}
37+
38+
auto get(sycl::ext::oneapi::experimental::properties_tag) const {
39+
return sycl::ext::oneapi::experimental::properties{
40+
sycl::ext::oneapi::experimental::use_root_sync};
41+
}
42+
};
43+
44+
int main() {
45+
sycl::queue queue;
46+
int *array = sycl::malloc_shared<int>(1, queue);
47+
array[0] = 0;
48+
49+
queue
50+
.submit([&](sycl::handler &h) {
51+
h.parallel_for<class Test>(sycl::nd_range<1>(N, N / 4),
52+
TestKernel(array));
53+
})
54+
.wait();
55+
// CHECK-NOT: WARNING: DeviceSanitizer: data race
56+
57+
assert(array[0] == 4);
58+
59+
return 0;
60+
}
Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,64 @@
1+
// REQUIRES: linux, cpu || (gpu && level_zero)
2+
// RUN: %{build} %device_tsan_flags -O2 -g -o %t1.out
3+
// RUN: %{run} %t1.out 2>&1 | FileCheck %s
4+
#include "sycl/detail/core.hpp"
5+
#include "sycl/sub_group.hpp"
6+
#include "sycl/usm.hpp"
7+
#include <algorithm>
8+
9+
int main() {
10+
sycl::queue queue;
11+
constexpr size_t reqd_sg_size = 32;
12+
constexpr size_t N = reqd_sg_size * 4;
13+
14+
auto device = queue.get_device();
15+
auto supported_sg_sizes =
16+
device.get_info<sycl::info::device::sub_group_sizes>();
17+
if (std::none_of(supported_sg_sizes.begin(), supported_sg_sizes.end(),
18+
[](size_t size) { return size == reqd_sg_size; }))
19+
return 0;
20+
21+
int *array = sycl::malloc_shared<int>(1, queue);
22+
array[0] = 0;
23+
24+
queue
25+
.submit([&](sycl::handler &h) {
26+
h.parallel_for<class Test>(
27+
sycl::nd_range<1>(N, N),
28+
[=](sycl::nd_item<1> item)
29+
[[sycl::reqd_sub_group_size(reqd_sg_size)]] {
30+
auto sg = item.get_sub_group();
31+
if (item.get_group_linear_id() == 0 &&
32+
sg.get_group_linear_id() == 0 &&
33+
sg.get_local_linear_id() == 0)
34+
array[0]++;
35+
36+
item.barrier();
37+
38+
if (item.get_group_linear_id() == 0 &&
39+
sg.get_group_linear_id() == 1 &&
40+
sg.get_local_linear_id() == 0)
41+
array[0]++;
42+
43+
item.barrier();
44+
45+
if (item.get_group_linear_id() == 0 &&
46+
sg.get_group_linear_id() == 2 &&
47+
sg.get_local_linear_id() == 0)
48+
array[0]++;
49+
50+
item.barrier();
51+
52+
if (item.get_group_linear_id() == 0 &&
53+
sg.get_group_linear_id() == 3 &&
54+
sg.get_local_linear_id() == 0)
55+
array[0]++;
56+
});
57+
})
58+
.wait();
59+
// CHECK-NOT: WARNING: DeviceSanitizer: data race
60+
61+
assert(array[0] == 4);
62+
63+
return 0;
64+
}

0 commit comments

Comments
 (0)