Skip to content

Commit 7f55d7d

Browse files
[AMDGPU] GFX12: Add Split Workgroup Barrier (#74836)
Co-authored-by: Vang Thao <[email protected]>
1 parent 79524ba commit 7f55d7d

27 files changed

+2306
-4
lines changed

clang/include/clang/Basic/BuiltinsAMDGPU.def

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -406,5 +406,21 @@ TARGET_BUILTIN(__builtin_amdgcn_cvt_pk_fp8_f32, "iffiIb", "nc", "fp8-insts")
406406
TARGET_BUILTIN(__builtin_amdgcn_cvt_sr_bf8_f32, "ifiiIi", "nc", "fp8-insts")
407407
TARGET_BUILTIN(__builtin_amdgcn_cvt_sr_fp8_f32, "ifiiIi", "nc", "fp8-insts")
408408

409+
//===----------------------------------------------------------------------===//
410+
// GFX12+ only builtins.
411+
//===----------------------------------------------------------------------===//
412+
// Split workgroup barrier builtins, all gated on the "gfx12-insts" feature.
// In the type strings below an 'I' prefix marks an argument that must be an
// integer constant expression: signal, wait and signal_isfirst require a
// constant, while the *_var variants take the same argument as a plain int.
413+
TARGET_BUILTIN(__builtin_amdgcn_s_barrier_signal, "vIi", "n", "gfx12-insts")
414+
TARGET_BUILTIN(__builtin_amdgcn_s_barrier_signal_var, "vi", "n", "gfx12-insts")
415+
TARGET_BUILTIN(__builtin_amdgcn_s_barrier_wait, "vIs", "n", "gfx12-insts")
416+
TARGET_BUILTIN(__builtin_amdgcn_s_barrier_signal_isfirst, "bIi", "n", "gfx12-insts")
417+
TARGET_BUILTIN(__builtin_amdgcn_s_barrier_signal_isfirst_var, "bi", "n", "gfx12-insts")
418+
TARGET_BUILTIN(__builtin_amdgcn_s_barrier_init, "vii", "n", "gfx12-insts")
419+
TARGET_BUILTIN(__builtin_amdgcn_s_barrier_join, "vi", "n", "gfx12-insts")
420+
TARGET_BUILTIN(__builtin_amdgcn_s_wakeup_barrier, "vi", "n", "gfx12-insts")
421+
TARGET_BUILTIN(__builtin_amdgcn_s_barrier_leave, "b", "n", "gfx12-insts")
422+
TARGET_BUILTIN(__builtin_amdgcn_s_get_barrier_state, "Uii", "n", "gfx12-insts")
423+
424+
409425
#undef BUILTIN
410426
#undef TARGET_BUILTIN
Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
// REQUIRES: amdgpu-registered-target
2+
3+
// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1200 -verify -S -emit-llvm -o - %s
4+
5+
// Negative test: 'barrier' is a run-time kernel argument, so passing it to
// __builtin_amdgcn_s_barrier_signal must produce the diagnostic noted on
// that line.
kernel void builtins_amdgcn_s_barrier_signal_err(global int* in, global int* out, int barrier) {
6+
7+
__builtin_amdgcn_s_barrier_signal(barrier); // expected-error {{'__builtin_amdgcn_s_barrier_signal' must be a constant integer}}
8+
// The literal argument here carries no diagnostic annotation.
__builtin_amdgcn_s_barrier_wait(-1);
9+
*out = *in;
10+
}
11+
12+
// Negative test: 'barrier' is a run-time kernel argument, so passing it to
// __builtin_amdgcn_s_barrier_wait must produce the diagnostic noted on
// that line.
kernel void builtins_amdgcn_s_barrier_wait_err(global int* in, global int* out, int barrier) {
13+
14+
// The literal argument here carries no diagnostic annotation.
__builtin_amdgcn_s_barrier_signal(-1);
15+
__builtin_amdgcn_s_barrier_wait(barrier); // expected-error {{'__builtin_amdgcn_s_barrier_wait' must be a constant integer}}
16+
*out = *in;
17+
}
18+
19+
// Negative test: 'barrier' is a run-time kernel argument, so passing it to
// __builtin_amdgcn_s_barrier_signal_isfirst must produce the diagnostic noted
// on that line. The builtin's boolean result is discarded.
kernel void builtins_amdgcn_s_barrier_signal_isfirst_err(global int* in, global int* out, int barrier) {
20+
21+
__builtin_amdgcn_s_barrier_signal_isfirst(barrier); // expected-error {{'__builtin_amdgcn_s_barrier_signal_isfirst' must be a constant integer}}
22+
// The literal argument here carries no diagnostic annotation.
__builtin_amdgcn_s_barrier_wait(-1);
23+
*out = *in;
24+
}
Lines changed: 174 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,174 @@
1+
// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
2+
// RUN: %clang_cc1 -cl-std=CL2.0 -O0 -triple amdgcn-unknown-unknown -target-cpu gfx1200 -S -emit-llvm -o - %s | FileCheck %s
3+
4+
// CHECK-LABEL: @test_s_barrier_signal(
5+
// CHECK-NEXT: entry:
6+
// CHECK-NEXT: call void @llvm.amdgcn.s.barrier.signal(i32 -1)
7+
// CHECK-NEXT: call void @llvm.amdgcn.s.barrier.wait(i16 -1)
8+
// CHECK-NEXT: ret void
9+
//
10+
void test_s_barrier_signal()
11+
{
12+
// Both builtins receive the literal -1, which appears as an immediate
// operand in the expected IR (i32 for signal, i16 for wait).
__builtin_amdgcn_s_barrier_signal(-1);
13+
__builtin_amdgcn_s_barrier_wait(-1);
14+
}
15+
16+
// CHECK-LABEL: @test_s_barrier_signal_var(
17+
// CHECK-NEXT: entry:
18+
// CHECK-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
19+
// CHECK-NEXT: store i32 [[A:%.*]], ptr addrspace(5) [[A_ADDR]], align 4
20+
// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(5) [[A_ADDR]], align 4
21+
// CHECK-NEXT: call void @llvm.amdgcn.s.barrier.signal.var(i32 [[TMP0]])
22+
// CHECK-NEXT: ret void
23+
//
24+
void test_s_barrier_signal_var(int a)
25+
{
26+
// The _var form accepts a run-time value: 'a' is loaded and passed as an i32.
__builtin_amdgcn_s_barrier_signal_var(a);
27+
}
28+
29+
// CHECK-LABEL: @test_s_barrier_signal_isfirst(
30+
// CHECK-NEXT: entry:
31+
// CHECK-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
32+
// CHECK-NEXT: [[B_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
33+
// CHECK-NEXT: [[C_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
34+
// CHECK-NEXT: store ptr [[A:%.*]], ptr addrspace(5) [[A_ADDR]], align 8
35+
// CHECK-NEXT: store ptr [[B:%.*]], ptr addrspace(5) [[B_ADDR]], align 8
36+
// CHECK-NEXT: store ptr [[C:%.*]], ptr addrspace(5) [[C_ADDR]], align 8
37+
// CHECK-NEXT: [[TMP0:%.*]] = call i1 @llvm.amdgcn.s.barrier.signal.isfirst(i32 1)
38+
// CHECK-NEXT: br i1 [[TMP0]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]]
39+
// CHECK: if.then:
40+
// CHECK-NEXT: [[TMP1:%.*]] = load ptr, ptr addrspace(5) [[B_ADDR]], align 8
41+
// CHECK-NEXT: store ptr [[TMP1]], ptr addrspace(5) [[A_ADDR]], align 8
42+
// CHECK-NEXT: br label [[IF_END:%.*]]
43+
// CHECK: if.else:
44+
// CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr addrspace(5) [[C_ADDR]], align 8
45+
// CHECK-NEXT: store ptr [[TMP2]], ptr addrspace(5) [[A_ADDR]], align 8
46+
// CHECK-NEXT: br label [[IF_END]]
47+
// CHECK: if.end:
48+
// CHECK-NEXT: call void @llvm.amdgcn.s.barrier.wait(i16 1)
49+
// CHECK-NEXT: ret void
50+
//
51+
void test_s_barrier_signal_isfirst(int* a, int* b, int *c)
52+
{
53+
// The builtin's boolean result is used directly as the branch condition;
// both arms only reassign the local pointer parameter 'a'.
if(__builtin_amdgcn_s_barrier_signal_isfirst(1))
54+
a = b;
55+
else
56+
a = c;
57+
58+
// The wait uses the same literal (1) that was passed to the signal above.
__builtin_amdgcn_s_barrier_wait(1);
59+
}
60+
61+
// CHECK-LABEL: @test_s_barrier_isfirst_var(
62+
// CHECK-NEXT: entry:
63+
// CHECK-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
64+
// CHECK-NEXT: [[B_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
65+
// CHECK-NEXT: [[C_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
66+
// CHECK-NEXT: [[D_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
67+
// CHECK-NEXT: store ptr [[A:%.*]], ptr addrspace(5) [[A_ADDR]], align 8
68+
// CHECK-NEXT: store ptr [[B:%.*]], ptr addrspace(5) [[B_ADDR]], align 8
69+
// CHECK-NEXT: store ptr [[C:%.*]], ptr addrspace(5) [[C_ADDR]], align 8
70+
// CHECK-NEXT: store i32 [[D:%.*]], ptr addrspace(5) [[D_ADDR]], align 4
71+
// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(5) [[D_ADDR]], align 4
72+
// CHECK-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.s.barrier.signal.isfirst.var(i32 [[TMP0]])
73+
// CHECK-NEXT: br i1 [[TMP1]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]]
74+
// CHECK: if.then:
75+
// CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr addrspace(5) [[B_ADDR]], align 8
76+
// CHECK-NEXT: store ptr [[TMP2]], ptr addrspace(5) [[A_ADDR]], align 8
77+
// CHECK-NEXT: br label [[IF_END:%.*]]
78+
// CHECK: if.else:
79+
// CHECK-NEXT: [[TMP3:%.*]] = load ptr, ptr addrspace(5) [[C_ADDR]], align 8
80+
// CHECK-NEXT: store ptr [[TMP3]], ptr addrspace(5) [[A_ADDR]], align 8
81+
// CHECK-NEXT: br label [[IF_END]]
82+
// CHECK: if.end:
83+
// CHECK-NEXT: call void @llvm.amdgcn.s.barrier.wait(i16 1)
84+
// CHECK-NEXT: ret void
85+
//
86+
void test_s_barrier_isfirst_var(int* a, int* b, int *c, int d)
87+
{
88+
// Run-time argument form: 'd' is loaded and passed to the _var builtin,
// whose boolean result is used directly as the branch condition. Both arms
// only reassign the local pointer parameter 'a'.
if ( __builtin_amdgcn_s_barrier_signal_isfirst_var(d))
89+
a = b;
90+
else
91+
a = c;
92+
93+
__builtin_amdgcn_s_barrier_wait(1);
94+
95+
}
96+
97+
// CHECK-LABEL: @test_s_barrier_init(
98+
// CHECK-NEXT: entry:
99+
// CHECK-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
100+
// CHECK-NEXT: store i32 [[A:%.*]], ptr addrspace(5) [[A_ADDR]], align 4
101+
// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(5) [[A_ADDR]], align 4
102+
// CHECK-NEXT: call void @llvm.amdgcn.s.barrier.init(i32 1, i32 [[TMP0]])
103+
// CHECK-NEXT: ret void
104+
//
105+
void test_s_barrier_init(int a)
106+
{
107+
// First argument is the literal 1; the second comes from the run-time
// parameter 'a'. Both reach the intrinsic as i32 operands.
__builtin_amdgcn_s_barrier_init(1, a);
108+
}
109+
110+
// CHECK-LABEL: @test_s_barrier_join(
111+
// CHECK-NEXT: entry:
112+
// CHECK-NEXT: call void @llvm.amdgcn.s.barrier.join(i32 1)
113+
// CHECK-NEXT: ret void
114+
//
115+
void test_s_barrier_join()
116+
{
117+
// Passes the literal 1; the expected IR is a single call to the
// llvm.amdgcn.s.barrier.join intrinsic with an i32 operand.
__builtin_amdgcn_s_barrier_join(1);
118+
}
119+
120+
// Exercises __builtin_amdgcn_s_wakeup_barrier with the literal argument 1.
// The expected IR is a single call to that builtin's own intrinsic,
// llvm.amdgcn.s.wakeup.barrier, rather than the barrier-join intrinsic.
// CHECK-LABEL: @test_s_wakeup_barrier(
121+
// CHECK-NEXT: entry:
122+
// CHECK-NEXT: call void @llvm.amdgcn.s.wakeup.barrier(i32 1)
123+
// CHECK-NEXT: ret void
124+
//
125+
void test_s_wakeup_barrier()
126+
{
127+
__builtin_amdgcn_s_wakeup_barrier(1);
128+
}
129+
130+
// CHECK-LABEL: @test_s_barrier_leave(
131+
// CHECK-NEXT: entry:
132+
// CHECK-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
133+
// CHECK-NEXT: [[B_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
134+
// CHECK-NEXT: [[C_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
135+
// CHECK-NEXT: store ptr [[A:%.*]], ptr addrspace(5) [[A_ADDR]], align 8
136+
// CHECK-NEXT: store ptr [[B:%.*]], ptr addrspace(5) [[B_ADDR]], align 8
137+
// CHECK-NEXT: store ptr [[C:%.*]], ptr addrspace(5) [[C_ADDR]], align 8
138+
// CHECK-NEXT: [[TMP0:%.*]] = call i1 @llvm.amdgcn.s.barrier.leave()
139+
// CHECK-NEXT: br i1 [[TMP0]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]]
140+
// CHECK: if.then:
141+
// CHECK-NEXT: [[TMP1:%.*]] = load ptr, ptr addrspace(5) [[B_ADDR]], align 8
142+
// CHECK-NEXT: store ptr [[TMP1]], ptr addrspace(5) [[A_ADDR]], align 8
143+
// CHECK-NEXT: br label [[IF_END:%.*]]
144+
// CHECK: if.else:
145+
// CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr addrspace(5) [[C_ADDR]], align 8
146+
// CHECK-NEXT: store ptr [[TMP2]], ptr addrspace(5) [[A_ADDR]], align 8
147+
// CHECK-NEXT: br label [[IF_END]]
148+
// CHECK: if.end:
149+
// CHECK-NEXT: ret void
150+
//
151+
void test_s_barrier_leave(int* a, int* b, int *c)
152+
{
153+
// The builtin takes no arguments; its boolean result is used directly as
// the branch condition. Both arms only reassign the local parameter 'a'.
if (__builtin_amdgcn_s_barrier_leave())
154+
a = b;
155+
else
156+
a = c;
157+
}
158+
159+
// CHECK-LABEL: @test_s_get_barrier_state(
160+
// CHECK-NEXT: entry:
161+
// CHECK-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
162+
// CHECK-NEXT: [[STATE:%.*]] = alloca i32, align 4, addrspace(5)
163+
// CHECK-NEXT: store i32 [[A:%.*]], ptr addrspace(5) [[A_ADDR]], align 4
164+
// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(5) [[A_ADDR]], align 4
165+
// CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.s.get.barrier.state(i32 [[TMP0]])
166+
// CHECK-NEXT: store i32 [[TMP1]], ptr addrspace(5) [[STATE]], align 4
167+
// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(5) [[STATE]], align 4
168+
// CHECK-NEXT: ret i32 [[TMP2]]
169+
//
170+
unsigned test_s_get_barrier_state(int a)
171+
{
172+
// The run-time argument 'a' is passed through; the builtin's result is
// stored in a local and returned unchanged.
unsigned State = __builtin_amdgcn_s_get_barrier_state(a);
173+
return State;
174+
}

llvm/include/llvm/IR/IntrinsicsAMDGPU.td

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -227,6 +227,45 @@ def int_amdgcn_s_sendmsg_rtn : Intrinsic <[llvm_anyint_ty], [llvm_i32_ty],
227227
def int_amdgcn_s_barrier : ClangBuiltin<"__builtin_amdgcn_s_barrier">,
228228
Intrinsic<[], [], [IntrNoMem, IntrHasSideEffects, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>;
229229

230+
// Split workgroup barrier intrinsics. Each def is tied to the Clang builtin
// named in its ClangBuiltin<> wrapper, and all share the same attribute set
// (IntrNoMem, IntrHasSideEffects, IntrConvergent, IntrWillReturn,
// IntrNoCallback, IntrNoFree). The signal, signal_isfirst and wait forms
// additionally mark operand 0 as an immediate (ImmArg); the *_var forms take
// the same i32 operand without that restriction.
def int_amdgcn_s_barrier_signal : ClangBuiltin<"__builtin_amdgcn_s_barrier_signal">,
231+
Intrinsic<[], [llvm_i32_ty], [ImmArg<ArgIndex<0>>, IntrNoMem, IntrHasSideEffects, IntrConvergent, IntrWillReturn,
232+
IntrNoCallback, IntrNoFree]>;
233+
234+
def int_amdgcn_s_barrier_signal_var : ClangBuiltin<"__builtin_amdgcn_s_barrier_signal_var">,
235+
Intrinsic<[], [llvm_i32_ty], [IntrNoMem, IntrHasSideEffects, IntrConvergent, IntrWillReturn,
236+
IntrNoCallback, IntrNoFree]>;
237+
238+
// Same operand as s_barrier_signal, but also produces an i1 result.
def int_amdgcn_s_barrier_signal_isfirst : ClangBuiltin<"__builtin_amdgcn_s_barrier_signal_isfirst">,
239+
Intrinsic<[llvm_i1_ty], [llvm_i32_ty], [ImmArg<ArgIndex<0>>, IntrNoMem, IntrHasSideEffects, IntrConvergent,
240+
IntrWillReturn, IntrNoCallback, IntrNoFree]>;
241+
242+
def int_amdgcn_s_barrier_signal_isfirst_var : ClangBuiltin<"__builtin_amdgcn_s_barrier_signal_isfirst_var">,
243+
Intrinsic<[llvm_i1_ty], [llvm_i32_ty], [IntrNoMem, IntrHasSideEffects, IntrConvergent, IntrWillReturn,
244+
IntrNoCallback, IntrNoFree]>;
245+
246+
// Two i32 operands, neither marked ImmArg.
def int_amdgcn_s_barrier_init : ClangBuiltin<"__builtin_amdgcn_s_barrier_init">,
247+
Intrinsic<[], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem, IntrHasSideEffects, IntrConvergent,
248+
IntrWillReturn, IntrNoCallback, IntrNoFree]>;
249+
250+
def int_amdgcn_s_barrier_join : ClangBuiltin<"__builtin_amdgcn_s_barrier_join">,
251+
Intrinsic<[], [llvm_i32_ty], [IntrNoMem, IntrHasSideEffects, IntrConvergent, IntrWillReturn,
252+
IntrNoCallback, IntrNoFree]>;
253+
254+
def int_amdgcn_s_wakeup_barrier : ClangBuiltin<"__builtin_amdgcn_s_wakeup_barrier">,
255+
Intrinsic<[], [llvm_i32_ty], [IntrNoMem, IntrHasSideEffects, IntrConvergent, IntrWillReturn,
256+
IntrNoCallback, IntrNoFree]>;
257+
258+
// Unlike the signal forms, the wait operand is an i16 immediate.
def int_amdgcn_s_barrier_wait : ClangBuiltin<"__builtin_amdgcn_s_barrier_wait">,
259+
Intrinsic<[], [llvm_i16_ty], [ImmArg<ArgIndex<0>>, IntrNoMem, IntrHasSideEffects, IntrConvergent,
260+
IntrWillReturn, IntrNoCallback, IntrNoFree]>;
261+
262+
// No operands; produces an i1 result.
def int_amdgcn_s_barrier_leave : ClangBuiltin<"__builtin_amdgcn_s_barrier_leave">,
263+
Intrinsic<[llvm_i1_ty], [], [IntrNoMem, IntrHasSideEffects, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>;
264+
265+
// Takes a non-immediate i32 operand and produces an i32 result.
def int_amdgcn_s_get_barrier_state : ClangBuiltin<"__builtin_amdgcn_s_get_barrier_state">,
266+
Intrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrNoMem, IntrHasSideEffects, IntrConvergent, IntrWillReturn,
267+
IntrNoCallback, IntrNoFree]>;
268+
230269
def int_amdgcn_wave_barrier : ClangBuiltin<"__builtin_amdgcn_wave_barrier">,
231270
Intrinsic<[], [], [IntrNoMem, IntrHasSideEffects, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>;
232271

0 commit comments

Comments
 (0)