Skip to content

Commit a4a7950

Browse files
authored
[SYCL] Add barrier before leader guard in LowerWGScope pass (#2208)
Currently a barrier is added only to the merge basic block, but a barrier must be added before the leader guard too. As an example, consider the following pseudo code, which is generated by LowerWGScope:

 1 __local int *done;
 2 kernel test() {
 3   int done_wi;
 4   int count = 0;
 5   do {
 6     count++;
 7     if (get_local_linear_id() == 0)
 8       *done = (count == 2);
 9     barrier(CLK_LOCAL_MEM_FENCE);
10     done_wi = *done;
11   } while (!done_wi);
12 }

Step 1. All WIs execute stmt 3 - stmt 9. This is the first time all WIs encounter the barrier.
Step 2. WI0 executes stmt 10 - stmt 11; this time done is false, so it then executes stmt 5 - stmt 9 and sets done to true. This is the second time WI0 encounters the barrier.
Step 3. The other WIs begin executing stmt 10 - stmt 11. They see that done is true, so they leave the loop and never reach the barrier that WI0 is now waiting on.

To resolve this problem, a barrier must be added before the leader guard:

  ...
  barrier(CLK_LOCAL_MEM_FENCE);
  if (get_local_linear_id() == 0)
  ...

Signed-off-by: Artur Gainullin <[email protected]>
1 parent a3c3425 commit a4a7950

File tree

6 files changed

+74
-2
lines changed

6 files changed

+74
-2
lines changed

llvm/lib/SYCLLowerIR/LowerWGScope.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -265,6 +265,7 @@ static void guardBlockWithIsLeaderCheck(BasicBlock *IfBB, BasicBlock *TrueBB,
265265
auto *Ty = LinearLocalID->getType();
266266
Value *Zero = Constant::getNullValue(Ty);
267267
IRBuilder<> Builder(IfBB->getContext());
268+
spirv::genWGBarrier(*(IfBB->getTerminator()), TT);
268269
Builder.SetInsertPoint(IfBB->getTerminator());
269270
Value *Cmp = Builder.CreateICmpEQ(LinearLocalID, Zero, "cmpz");
270271
Builder.SetCurrentDebugLocation(DbgLoc);

llvm/test/SYCLLowerIR/byval_arg.ll

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
define internal spir_func void @wibble(%struct.baz* byval(%struct.baz) %arg1) !work_group_scope !0 {
1212
; CHECK-LABEL: @wibble(
1313
; CHECK-NEXT: [[TMP1:%.*]] = load i64, i64 addrspace(1)* @__spirv_BuiltInLocalInvocationIndex
14+
; CHECK-NEXT: call void @_Z22__spirv_ControlBarrierjjj(i32 2, i32 2, i32 272)
1415
; CHECK-NEXT: [[CMPZ:%.*]] = icmp eq i64 [[TMP1]], 0
1516
; CHECK-NEXT: br i1 [[CMPZ]], label [[LEADER:%.*]], label [[MERGE:%.*]]
1617
; CHECK: leader:

llvm/test/SYCLLowerIR/byval_arg_cast.ll

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ define dso_local spir_func void @wombat(%struct.widget* byval(%struct.widget) al
2020
; CHECK-LABEL: @wombat(
2121
; CHECK-NEXT: bb:
2222
; CHECK-NEXT: [[TMP0:%.*]] = load i64, i64 addrspace(1)* @__spirv_BuiltInLocalInvocationIndex, align 4
23+
; CHECK-NEXT: call void @_Z22__spirv_ControlBarrierjjj(i32 2, i32 2, i32 272)
2324
; CHECK-NEXT: [[CMPZ1:%.*]] = icmp eq i64 [[TMP0]], 0
2425
; CHECK-NEXT: br i1 [[CMPZ1]], label [[LEADER:%.*]], label [[MERGE:%.*]]
2526
; CHECK: leader:
@@ -31,6 +32,7 @@ define dso_local spir_func void @wombat(%struct.widget* byval(%struct.widget) al
3132
; CHECK-NEXT: [[TMP2:%.*]] = bitcast %struct.widget* [[ARG]] to i8*
3233
; CHECK-NEXT: call void @llvm.memcpy.p0i8.p3i8.i64(i8* align 8 [[TMP2]], i8 addrspace(3)* align 16 bitcast (%struct.widget addrspace(3)* @[[SHADOW]] to i8 addrspace(3)*), i64 32, i1 false)
3334
; CHECK-NEXT: [[TMP3:%.*]] = load i64, i64 addrspace(1)* @__spirv_BuiltInLocalInvocationIndex, align 4
35+
; CHECK-NEXT: call void @_Z22__spirv_ControlBarrierjjj(i32 2, i32 2, i32 272)
3436
; CHECK-NEXT: [[CMPZ:%.*]] = icmp eq i64 [[TMP3]], 0
3537
; CHECK-NEXT: br i1 [[CMPZ]], label [[WG_LEADER:%.*]], label [[WG_CF:%.*]]
3638
; CHECK: wg_leader:

llvm/test/SYCLLowerIR/convergent.ll

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -19,8 +19,8 @@ define internal spir_func void @wibble(%struct.baz* byval(%struct.baz) %arg1) !w
1919
; CHECK-PTX: declare i64 @_Z27__spirv_LocalInvocationId_zv()
2020

2121
; CHECK: ; Function Attrs: convergent
22-
; CHECK: declare void @_Z22__spirv_ControlBarrierjjj(i32, i32, i32) #1
22+
; CHECK: declare void @_Z22__spirv_ControlBarrierjjj(i32, i32, i32) #[[ATTR_NUM:[0-9]+]]
2323

24-
; CHECK: attributes #1 = { convergent }
24+
; CHECK: attributes #[[ATTR_NUM]] = { convergent }
2525

2626
!0 = !{}

llvm/test/SYCLLowerIR/pfwg_and_pfwi.ll

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ define internal spir_func void @wibble(%struct.bar addrspace(4)* %arg, %struct.z
2525
; CHECK-NEXT: [[TMP:%.*]] = alloca [[STRUCT_BAR:%.*]] addrspace(4)*, align 8
2626
; CHECK-NEXT: [[TMP2:%.*]] = alloca [[STRUCT_FOO_0:%.*]], align 1
2727
; CHECK-NEXT: [[TMP0:%.*]] = load i64, i64 addrspace(1)* @__spirv_BuiltInLocalInvocationIndex
28+
; CHECK-NEXT: call void @_Z22__spirv_ControlBarrierjjj(i32 2, i32 2, i32 272)
2829
; CHECK-NEXT: [[CMPZ3:%.*]] = icmp eq i64 [[TMP0]], 0
2930
; CHECK-NEXT: br i1 [[CMPZ3]], label [[LEADER:%.*]], label [[MERGE:%.*]]
3031
; CHECK: leader:
@@ -40,6 +41,7 @@ define internal spir_func void @wibble(%struct.bar addrspace(4)* %arg, %struct.z
4041
; CHECK-NEXT: [[TMP4:%.*]] = bitcast [[STRUCT_BAR]] addrspace(4)* [[ARG]] to i8 addrspace(4)*
4142
; CHECK-NEXT: call void @llvm.memcpy.p4i8.p3i8.i64(i8 addrspace(4)* align 8 [[TMP4]], i8 addrspace(3)* align 8 getelementptr inbounds (%struct.bar, [[STRUCT_BAR]] addrspace(3)* @[[PFWG_SHADOW]], i32 0, i32 0), i64 1, i1 false)
4243
; CHECK-NEXT: [[TMP5:%.*]] = load i64, i64 addrspace(1)* @__spirv_BuiltInLocalInvocationIndex
44+
; CHECK-NEXT: call void @_Z22__spirv_ControlBarrierjjj(i32 2, i32 2, i32 272)
4345
; CHECK-NEXT: [[CMPZ:%.*]] = icmp eq i64 [[TMP5]], 0
4446
; CHECK-NEXT: br i1 [[CMPZ]], label [[WG_LEADER:%.*]], label [[WG_CF:%.*]]
4547
; CHECK: wg_leader:
@@ -50,6 +52,7 @@ define internal spir_func void @wibble(%struct.bar addrspace(4)* %arg, %struct.z
5052
; CHECK-NEXT: br label [[WG_CF]]
5153
; CHECK: wg_cf:
5254
; CHECK-NEXT: [[TMP4:%.*]] = load i64, i64 addrspace(1)* @__spirv_BuiltInLocalInvocationIndex
55+
; CHECK-NEXT: call void @_Z22__spirv_ControlBarrierjjj(i32 2, i32 2, i32 272)
5356
; CHECK-NEXT: [[CMPZ2:%.*]] = icmp eq i64 [[TMP4]], 0
5457
; CHECK-NEXT: br i1 [[CMPZ2]], label [[TESTMAT:%.*]], label [[LEADERMAT:%.*]]
5558
; CHECK: TestMat:
Lines changed: 65 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,65 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
2+
; RUN: opt < %s -LowerWGScope -S | FileCheck %s
3+
4+
%struct.snork = type { i32 }
5+
%struct.eggs = type { i8 }
6+
%struct.snork.0 = type { %struct.widget, %struct.widget, %struct.widget, %struct.ham }
7+
%struct.widget = type { %struct.wibble }
8+
%struct.wibble = type { [3 x i64] }
9+
%struct.ham = type { %struct.wibble }
10+
11+
@global = internal addrspace(3) global [12 x %struct.snork] zeroinitializer, align 4
12+
13+
; CHECK: @[[WG_NEXT:[a-zA-Z0-9_.]+]] = internal unnamed_addr addrspace(3) global %struct.snork addrspace(4)* undef, align 8
14+
; CHECK: @[[WG_DONE:[a-zA-Z0-9_.]+]] = internal unnamed_addr addrspace(3) global i1 undef, align 1
15+
16+
define internal spir_func void @spam(%struct.eggs addrspace(4)* %arg, %struct.snork.0* byval(%struct.snork.0) align 8 %arg1) align 2 !work_group_scope !0 {
17+
; CHECK: arrayctor.loop:
18+
; CHECK-NEXT: [[ARRAYCTOR_CUR:%.*]] = phi [[STRUCT_SNORK:%.*]] addrspace(4)* [ getelementptr inbounds ([12 x %struct.snork], [12 x %struct.snork] addrspace(4)* addrspacecast ([12 x %struct.snork] addrspace(3)* @global to [12 x %struct.snork] addrspace(4)*), i32 0, i32 0), [[WG_CF:%.*]] ], [ [[WG_VAL_ARRAYCTOR_NEXT:%.*]], [[WG_CF2:%.*]] ]
19+
; CHECK-NEXT: [[TMP4:%.*]] = load i64, i64 addrspace(1)* @__spirv_BuiltInLocalInvocationIndex, align 4
20+
; CHECK-NEXT: call void @_Z22__spirv_ControlBarrierjjj(i32 2, i32 2, i32 272)
21+
; CHECK-NEXT: [[CMPZ3:%.*]] = icmp eq i64 [[TMP4]], 0
22+
; CHECK-NEXT: br i1 [[CMPZ3]], label [[WG_LEADER1:%.*]], label [[WG_CF2]]
23+
; CHECK: wg_leader1:
24+
; CHECK-NEXT: call spir_func void @bar(%struct.snork addrspace(4)* [[ARRAYCTOR_CUR]])
25+
; CHECK-NEXT: [[ARRAYCTOR_NEXT:%.*]] = getelementptr inbounds [[STRUCT_SNORK]], [[STRUCT_SNORK]] addrspace(4)* [[ARRAYCTOR_CUR]], i64 1
26+
; CHECK-NEXT: store [[STRUCT_SNORK]] addrspace(4)* [[ARRAYCTOR_NEXT]], [[STRUCT_SNORK]] addrspace(4)* addrspace(3)* @[[WG_NEXT]], align 8
27+
; CHECK-NEXT: [[ARRAYCTOR_DONE:%.*]] = icmp eq [[STRUCT_SNORK]] addrspace(4)* [[ARRAYCTOR_NEXT]], getelementptr inbounds (%struct.snork, [[STRUCT_SNORK]] addrspace(4)* getelementptr inbounds ([12 x %struct.snork], [12 x %struct.snork] addrspace(4)* addrspacecast ([12 x %struct.snork] addrspace(3)* @global to [12 x %struct.snork] addrspace(4)*), i32 0, i32 0), i64 12)
28+
; CHECK-NEXT: store i1 [[ARRAYCTOR_DONE]], i1 addrspace(3)* @[[WG_DONE]], align 1
29+
; CHECK-NEXT: br label [[WG_CF2]]
30+
; CHECK: wg_cf2:
31+
; CHECK-NEXT: call void @_Z22__spirv_ControlBarrierjjj(i32 2, i32 2, i32 272) #0
32+
; CHECK-NEXT: [[WG_VAL_ARRAYCTOR_DONE:%.*]] = load i1, i1 addrspace(3)* @[[WG_DONE]], align 1
33+
; CHECK-NEXT: [[WG_VAL_ARRAYCTOR_NEXT]] = load [[STRUCT_SNORK]] addrspace(4)*, [[STRUCT_SNORK]] addrspace(4)* addrspace(3)* @[[WG_NEXT]], align 8
34+
; CHECK-NEXT: br i1 [[WG_VAL_ARRAYCTOR_DONE]], label [[ARRAYCTOR_CONT:%.*]], label [[ARRAYCTOR_LOOP:%.*]]
35+
; CHECK: arrayctor.cont:
36+
; CHECK-NEXT: ret void
37+
;
38+
entry:
39+
%tmp = alloca %struct.eggs addrspace(4)*, align 8
40+
store %struct.eggs addrspace(4)* %arg, %struct.eggs addrspace(4)** %tmp, align 8
41+
%tmp2 = load %struct.eggs addrspace(4)*, %struct.eggs addrspace(4)** %tmp, align 8
42+
br label %arrayctor.loop
43+
44+
arrayctor.loop: ; preds = %arrayctor.loop, %entry
45+
%arrayctor.cur = phi %struct.snork addrspace(4)* [ getelementptr inbounds ([12 x %struct.snork], [12 x %struct.snork] addrspace(4)* addrspacecast ([12 x %struct.snork] addrspace(3)* @global to [12 x %struct.snork] addrspace(4)*), i32 0, i32 0), %entry ], [ %arrayctor.next, %arrayctor.loop ]
46+
call spir_func void @bar(%struct.snork addrspace(4)* %arrayctor.cur)
47+
%arrayctor.next = getelementptr inbounds %struct.snork, %struct.snork addrspace(4)* %arrayctor.cur, i64 1
48+
%arrayctor.done = icmp eq %struct.snork addrspace(4)* %arrayctor.next, getelementptr inbounds (%struct.snork, %struct.snork addrspace(4)* getelementptr inbounds ([12 x %struct.snork], [12 x %struct.snork] addrspace(4)* addrspacecast ([12 x %struct.snork] addrspace(3)* @global to [12 x %struct.snork] addrspace(4)*), i32 0, i32 0), i64 12)
49+
br i1 %arrayctor.done, label %arrayctor.cont, label %arrayctor.loop
50+
51+
arrayctor.cont: ; preds = %arrayctor.loop
52+
ret void
53+
}
54+
55+
define linkonce_odr dso_local spir_func void @bar(%struct.snork addrspace(4)* %arg) unnamed_addr align 2 {
56+
bb:
57+
%tmp = alloca %struct.snork addrspace(4)*, align 8
58+
store %struct.snork addrspace(4)* %arg, %struct.snork addrspace(4)** %tmp, align 8
59+
%tmp1 = load %struct.snork addrspace(4)*, %struct.snork addrspace(4)** %tmp, align 8
60+
%tmp2 = getelementptr inbounds %struct.snork, %struct.snork addrspace(4)* %tmp1, i32 0, i32 0
61+
store i32 0, i32 addrspace(4)* %tmp2, align 4
62+
ret void
63+
}
64+
65+
!0 = !{}

0 commit comments

Comments
 (0)