Skip to content

Commit 39ecce1

Browse files
committed
clang/AMDGPU: Emit atomicrmw for global/flat fadd v2bf16 builtins
1 parent 5df69e0 commit 39ecce1

File tree

4 files changed

+38
-32
lines changed

4 files changed

+38
-32
lines changed

clang/lib/CodeGen/CGBuiltin.cpp

Lines changed: 8 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -18659,22 +18659,6 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
1865918659
CGM.getIntrinsic(IID, {ArgTy, Addr->getType(), Val->getType()});
1866018660
return Builder.CreateCall(F, {Addr, Val});
1866118661
}
18662-
case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_v2bf16:
18663-
case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_v2bf16: {
18664-
Intrinsic::ID IID;
18665-
switch (BuiltinID) {
18666-
case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_v2bf16:
18667-
IID = Intrinsic::amdgcn_global_atomic_fadd_v2bf16;
18668-
break;
18669-
case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_v2bf16:
18670-
IID = Intrinsic::amdgcn_flat_atomic_fadd_v2bf16;
18671-
break;
18672-
}
18673-
llvm::Value *Addr = EmitScalarExpr(E->getArg(0));
18674-
llvm::Value *Val = EmitScalarExpr(E->getArg(1));
18675-
llvm::Function *F = CGM.getIntrinsic(IID, {Addr->getType()});
18676-
return Builder.CreateCall(F, {Addr, Val});
18677-
}
1867818662
case AMDGPU::BI__builtin_amdgcn_global_load_tr_b64_i32:
1867918663
case AMDGPU::BI__builtin_amdgcn_global_load_tr_b64_v2i32:
1868018664
case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v4i16:
@@ -19048,7 +19032,9 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
1904819032
case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_v2f16:
1904919033
case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_v2f16:
1905019034
case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_f32:
19051-
case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_f64: {
19035+
case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_f64:
19036+
case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_v2bf16:
19037+
case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_v2bf16: {
1905219038
llvm::AtomicRMWInst::BinOp BinOp;
1905319039
switch (BuiltinID) {
1905419040
case AMDGPU::BI__builtin_amdgcn_atomic_inc32:
@@ -19070,6 +19056,8 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
1907019056
case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_v2f16:
1907119057
case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_f32:
1907219058
case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_f64:
19059+
case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_v2bf16:
19060+
case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_v2bf16:
1907319061
BinOp = llvm::AtomicRMWInst::FAdd;
1907419062
break;
1907519063
case AMDGPU::BI__builtin_amdgcn_ds_fminf:
@@ -19114,7 +19102,9 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
1911419102
AO = AtomicOrdering::Monotonic;
1911519103

1911619104
// The v2bf16 builtin uses i16 instead of a natural bfloat type.
19117-
if (BuiltinID == AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2bf16) {
19105+
if (BuiltinID == AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2bf16 ||
19106+
BuiltinID == AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_v2bf16 ||
19107+
BuiltinID == AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_v2bf16) {
1911819108
llvm::Type *V2BF16Ty = FixedVectorType::get(
1911919109
llvm::Type::getBFloatTy(Builder.getContext()), 2);
1912019110
Val = Builder.CreateBitCast(Val, V2BF16Ty);

clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx12.cl

Lines changed: 17 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ typedef short __attribute__((ext_vector_type(2))) short2;
1111

1212
// CHECK-LABEL: test_local_add_2bf16
1313
// CHECK: [[BC0:%.+]] = bitcast <2 x i16> {{.+}} to <2 x bfloat>
14-
// CHECK: [[RMW:%.+]] = atomicrmw fadd ptr addrspace(3) %{{.+}}, <2 x bfloat> [[BC0]] syncscope("agent") monotonic, align 4
14+
// CHECK-NEXT: [[RMW:%.+]] = atomicrmw fadd ptr addrspace(3) %{{.+}}, <2 x bfloat> [[BC0]] syncscope("agent") monotonic, align 4
1515
// CHECK-NEXT: bitcast <2 x bfloat> [[RMW]] to <2 x i16>
1616

1717
// GFX12-LABEL: test_local_add_2bf16
@@ -48,7 +48,7 @@ void test_local_add_2f16_noret(__local half2 *addr, half2 x) {
4848
}
4949

5050
// CHECK-LABEL: test_flat_add_2f16
51-
// CHECK: [[RMW:%.+]] = atomicrmw fadd ptr %{{.+}}, <2 x half> %{{.+}} syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !{{[0-9]+$}}
51+
// CHECK: [[RMW:%.+]] = atomicrmw fadd ptr %{{.+}}, <2 x half> %{{.+}} syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory !{{[0-9]+$}}
5252

5353
// GFX12-LABEL: test_flat_add_2f16
5454
// GFX12: flat_atomic_pk_add_f16
@@ -57,15 +57,18 @@ half2 test_flat_add_2f16(__generic half2 *addr, half2 x) {
5757
}
5858

5959
// CHECK-LABEL: test_flat_add_2bf16
60-
// CHECK: call <2 x i16> @llvm.amdgcn.flat.atomic.fadd.v2bf16.p0(ptr %{{.*}}, <2 x i16> %{{.*}})
60+
// CHECK: [[BC:%.+]] = bitcast <2 x i16> %{{.+}} to <2 x bfloat>
61+
// CHECK: [[RMW:%.+]] = atomicrmw fadd ptr %{{.+}}, <2 x bfloat> [[BC]] syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory !{{[0-9]+$}}
62+
// CHECK: bitcast <2 x bfloat> [[RMW]] to <2 x i16>
63+
6164
// GFX12-LABEL: test_flat_add_2bf16
6265
// GFX12: flat_atomic_pk_add_bf16
6366
short2 test_flat_add_2bf16(__generic short2 *addr, short2 x) {
6467
return __builtin_amdgcn_flat_atomic_fadd_v2bf16(addr, x);
6568
}
6669

6770
// CHECK-LABEL: test_global_add_half2
68-
// CHECK: [[RMW:%.+]] = atomicrmw fadd ptr addrspace(1) %{{.+}}, <2 x half> %{{.+}} syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !{{[0-9]+$}}
71+
// CHECK: [[RMW:%.+]] = atomicrmw fadd ptr addrspace(1) %{{.+}}, <2 x half> %{{.+}} syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory !{{[0-9]+$}}
6972

7073
// GFX12-LABEL: test_global_add_half2
7174
// GFX12: global_atomic_pk_add_f16 v2, v[0:1], v2, off th:TH_ATOMIC_RETURN
@@ -75,7 +78,7 @@ void test_global_add_half2(__global half2 *addr, half2 x) {
7578
}
7679

7780
// CHECK-LABEL: test_global_add_half2_noret
78-
// CHECK: [[RMW:%.+]] = atomicrmw fadd ptr addrspace(1) %{{.+}}, <2 x half> %{{.+}} syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !{{[0-9]+$}}
81+
// CHECK: [[RMW:%.+]] = atomicrmw fadd ptr addrspace(1) %{{.+}}, <2 x half> %{{.+}} syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory !{{[0-9]+$}}
7982

8083
// GFX12-LABEL: test_global_add_half2_noret
8184
// GFX12: global_atomic_pk_add_f16 v[0:1], v2, off
@@ -84,7 +87,11 @@ void test_global_add_half2_noret(__global half2 *addr, half2 x) {
8487
}
8588

8689
// CHECK-LABEL: test_global_add_2bf16
87-
// CHECK: call <2 x i16> @llvm.amdgcn.global.atomic.fadd.v2bf16.p1(ptr addrspace(1) %{{.*}}, <2 x i16> %{{.*}})
90+
// CHECK: [[BC:%.+]] = bitcast <2 x i16> %{{.+}} to <2 x bfloat>
91+
// CHECK: [[RMW:%.+]] = atomicrmw fadd ptr addrspace(1) %{{.+}}, <2 x bfloat> [[BC]] syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory !{{[0-9]+$}}
92+
// CHECK: bitcast <2 x bfloat> [[RMW]] to <2 x i16>
93+
94+
8895
// GFX12-LABEL: test_global_add_2bf16
8996
// GFX12: global_atomic_pk_add_bf16 v2, v[0:1], v2, off th:TH_ATOMIC_RETURN
9097
void test_global_add_2bf16(__global short2 *addr, short2 x) {
@@ -93,7 +100,10 @@ void test_global_add_2bf16(__global short2 *addr, short2 x) {
93100
}
94101

95102
// CHECK-LABEL: test_global_add_2bf16_noret
96-
// CHECK: call <2 x i16> @llvm.amdgcn.global.atomic.fadd.v2bf16.p1(ptr addrspace(1) %{{.*}}, <2 x i16> %{{.*}})
103+
// CHECK: [[BC:%.+]] = bitcast <2 x i16> %{{.+}} to <2 x bfloat>
104+
// CHECK: [[RMW:%.+]] = atomicrmw fadd ptr addrspace(1) %{{.+}}, <2 x bfloat> [[BC]] syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory !{{[0-9]+$}}
105+
// CHECK: bitcast <2 x bfloat> [[RMW]] to <2 x i16>
106+
97107
// GFX12-LABEL: test_global_add_2bf16_noret
98108
// GFX12: global_atomic_pk_add_bf16 v[0:1], v2, off
99109
void test_global_add_2bf16_noret(__global short2 *addr, short2 x) {

clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx90a.cl

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ void test_global_add_f64(__global double *addr, double x) {
1818
}
1919

2020
// CHECK-LABEL: test_global_add_half2
21-
// CHECK: = atomicrmw fadd ptr addrspace(1) %{{.+}}, <2 x half> %{{.+}} syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !{{[0-9]+$}}
21+
// CHECK: = atomicrmw fadd ptr addrspace(1) %{{.+}}, <2 x half> %{{.+}} syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory !{{[0-9]+$}}
2222
// GFX90A-LABEL: test_global_add_half2
2323
// GFX90A: global_atomic_pk_add_f16 v2, v[0:1], v2, off glc
2424
void test_global_add_half2(__global half2 *addr, half2 x) {
@@ -45,7 +45,7 @@ void test_global_max_f64(__global double *addr, double x){
4545
}
4646

4747
// CHECK-LABEL: test_flat_add_local_f64
48-
// CHECK: = atomicrmw fadd ptr addrspace(3) %{{.+}}, double %{{.+}} syncscope("agent") seq_cst, align 8{{$}}
48+
// CHECK: = atomicrmw fadd ptr addrspace(3) %{{.+}}, double %{{.+}} syncscope("agent") monotonic, align 8{{$}}
4949

5050
// GFX90A-LABEL: test_flat_add_local_f64$local
5151
// GFX90A: ds_add_rtn_f64
@@ -55,7 +55,7 @@ void test_flat_add_local_f64(__local double *addr, double x){
5555
}
5656

5757
// CHECK-LABEL: test_flat_global_add_f64
58-
// CHECK: = atomicrmw fadd ptr addrspace(1) {{.+}}, double %{{.+}} syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory !{{[0-9]+$}}
58+
// CHECK: = atomicrmw fadd ptr addrspace(1) {{.+}}, double %{{.+}} syncscope("agent") monotonic, align 8, !amdgpu.no.fine.grained.memory !{{[0-9]+$}}
5959

6060
// GFX90A-LABEL: test_flat_global_add_f64$local
6161
// GFX90A: global_atomic_add_f64

clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx940.cl

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@ typedef half __attribute__((ext_vector_type(2))) half2;
1010
typedef short __attribute__((ext_vector_type(2))) short2;
1111

1212
// CHECK-LABEL: test_flat_add_f32
13-
// CHECK: [[RMW:%.+]] = atomicrmw fadd ptr %{{.+}}, float %{{.+}} syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !{{[0-9]+}}, !amdgpu.ignore.denormal.mode !{{[0-9]+$}}
13+
// CHECK: [[RMW:%.+]] = atomicrmw fadd ptr %{{.+}}, float %{{.+}} syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory !{{[0-9]+}}, !amdgpu.ignore.denormal.mode !{{[0-9]+$}}
1414

1515
// GFX940-LABEL: test_flat_add_f32
1616
// GFX940: flat_atomic_add_f32
@@ -19,7 +19,7 @@ half2 test_flat_add_f32(__generic float *addr, float x) {
1919
}
2020

2121
// CHECK-LABEL: test_flat_add_2f16
22-
// CHECK: [[RMW:%.+]] = atomicrmw fadd ptr %{{.+}}, <2 x half> %{{.+}} syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !{{[0-9]+$}}
22+
// CHECK: [[RMW:%.+]] = atomicrmw fadd ptr %{{.+}}, <2 x half> %{{.+}} syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory !{{[0-9]+$}}
2323

2424
// GFX940-LABEL: test_flat_add_2f16
2525
// GFX940: flat_atomic_pk_add_f16
@@ -28,15 +28,21 @@ half2 test_flat_add_2f16(__generic half2 *addr, half2 x) {
2828
}
2929

3030
// CHECK-LABEL: test_flat_add_2bf16
31-
// CHECK: call <2 x i16> @llvm.amdgcn.flat.atomic.fadd.v2bf16.p0(ptr %{{.*}}, <2 x i16> %{{.*}})
31+
// CHECK: [[BC0:%.+]] = bitcast <2 x i16> {{.+}} to <2 x bfloat>
32+
// CHECK: [[RMW:%.+]] = atomicrmw fadd ptr %{{.+}}, <2 x bfloat> [[BC0]] syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory !{{[0-9]+$}}
33+
// CHECK-NEXT: bitcast <2 x bfloat> [[RMW]] to <2 x i16>
34+
3235
// GFX940-LABEL: test_flat_add_2bf16
3336
// GFX940: flat_atomic_pk_add_bf16
3437
short2 test_flat_add_2bf16(__generic short2 *addr, short2 x) {
3538
return __builtin_amdgcn_flat_atomic_fadd_v2bf16(addr, x);
3639
}
3740

3841
// CHECK-LABEL: test_global_add_2bf16
39-
// CHECK: call <2 x i16> @llvm.amdgcn.global.atomic.fadd.v2bf16.p1(ptr addrspace(1) %{{.*}}, <2 x i16> %{{.*}})
42+
// CHECK: [[BC0:%.+]] = bitcast <2 x i16> {{.+}} to <2 x bfloat>
43+
// CHECK: [[RMW:%.+]] = atomicrmw fadd ptr addrspace(1) %{{.+}}, <2 x bfloat> [[BC0]] syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory !{{[0-9]+$}}
44+
// CHECK-NEXT: bitcast <2 x bfloat> [[RMW]] to <2 x i16>
45+
4046
// GFX940-LABEL: test_global_add_2bf16
4147
// GFX940: global_atomic_pk_add_bf16
4248
short2 test_global_add_2bf16(__global short2 *addr, short2 x) {

0 commit comments

Comments
 (0)