Skip to content

Commit f162998

Browse files
committed
clang/AMDGPU: Emit atomicrmw for global/flat fadd v2bf16 builtins
1 parent e10cf56 commit f162998

File tree

3 files changed

+30
-24
lines changed

3 files changed

+30
-24
lines changed

clang/lib/CodeGen/CGBuiltin.cpp

Lines changed: 8 additions & 18 deletions
Original file line number | Diff line number | Diff line change
@@ -18946,22 +18946,6 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
1894618946
CGM.getIntrinsic(IID, {ArgTy, Addr->getType(), Val->getType()});
1894718947
return Builder.CreateCall(F, {Addr, Val});
1894818948
}
18949-
case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_v2bf16:
18950-
case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_v2bf16: {
18951-
Intrinsic::ID IID;
18952-
switch (BuiltinID) {
18953-
case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_v2bf16:
18954-
IID = Intrinsic::amdgcn_global_atomic_fadd_v2bf16;
18955-
break;
18956-
case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_v2bf16:
18957-
IID = Intrinsic::amdgcn_flat_atomic_fadd_v2bf16;
18958-
break;
18959-
}
18960-
llvm::Value *Addr = EmitScalarExpr(E->getArg(0));
18961-
llvm::Value *Val = EmitScalarExpr(E->getArg(1));
18962-
llvm::Function *F = CGM.getIntrinsic(IID, {Addr->getType()});
18963-
return Builder.CreateCall(F, {Addr, Val});
18964-
}
1896518949
case AMDGPU::BI__builtin_amdgcn_global_load_tr_b64_i32:
1896618950
case AMDGPU::BI__builtin_amdgcn_global_load_tr_b64_v2i32:
1896718951
case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v4i16:
@@ -19343,7 +19327,9 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
1934319327
case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_v2f16:
1934419328
case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_v2f16:
1934519329
case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_f32:
19346-
case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_f64: {
19330+
case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_f64:
19331+
case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_v2bf16:
19332+
case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_v2bf16: {
1934719333
llvm::AtomicRMWInst::BinOp BinOp;
1934819334
switch (BuiltinID) {
1934919335
case AMDGPU::BI__builtin_amdgcn_atomic_inc32:
@@ -19365,6 +19351,8 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
1936519351
case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_v2f16:
1936619352
case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_f32:
1936719353
case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_f64:
19354+
case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_v2bf16:
19355+
case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_v2bf16:
1936819356
BinOp = llvm::AtomicRMWInst::FAdd;
1936919357
break;
1937019358
case AMDGPU::BI__builtin_amdgcn_ds_fminf:
@@ -19409,7 +19397,9 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
1940919397
AO = AtomicOrdering::Monotonic;
1941019398

1941119399
// The v2bf16 builtin uses i16 instead of a natural bfloat type.
19412-
if (BuiltinID == AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2bf16) {
19400+
if (BuiltinID == AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2bf16 ||
19401+
BuiltinID == AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_v2bf16 ||
19402+
BuiltinID == AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_v2bf16) {
1941319403
llvm::Type *V2BF16Ty = FixedVectorType::get(
1941419404
llvm::Type::getBFloatTy(Builder.getContext()), 2);
1941519405
Val = Builder.CreateBitCast(Val, V2BF16Ty);

clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx12.cl

Lines changed: 14 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ typedef short __attribute__((ext_vector_type(2))) short2;
1111

1212
// CHECK-LABEL: test_local_add_2bf16
1313
// CHECK: [[BC0:%.+]] = bitcast <2 x i16> {{.+}} to <2 x bfloat>
14-
// CHECK: [[RMW:%.+]] = atomicrmw fadd ptr addrspace(3) %{{.+}}, <2 x bfloat> [[BC0]] syncscope("agent") monotonic, align 4
14+
// CHECK-NEXT: [[RMW:%.+]] = atomicrmw fadd ptr addrspace(3) %{{.+}}, <2 x bfloat> [[BC0]] syncscope("agent") monotonic, align 4
1515
// CHECK-NEXT: bitcast <2 x bfloat> [[RMW]] to <2 x i16>
1616

1717
// GFX12-LABEL: test_local_add_2bf16
@@ -57,7 +57,10 @@ half2 test_flat_add_2f16(__generic half2 *addr, half2 x) {
5757
}
5858

5959
// CHECK-LABEL: test_flat_add_2bf16
60-
// CHECK: call <2 x i16> @llvm.amdgcn.flat.atomic.fadd.v2bf16.p0(ptr %{{.*}}, <2 x i16> %{{.*}})
60+
// CHECK: [[BC:%.+]] = bitcast <2 x i16> %{{.+}} to <2 x bfloat>
61+
// CHECK: [[RMW:%.+]] = atomicrmw fadd ptr %{{.+}}, <2 x bfloat> [[BC]] syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory !{{[0-9]+$}}
62+
// CHECK: bitcast <2 x bfloat> [[RMW]] to <2 x i16>
63+
6164
// GFX12-LABEL: test_flat_add_2bf16
6265
// GFX12: flat_atomic_pk_add_bf16
6366
short2 test_flat_add_2bf16(__generic short2 *addr, short2 x) {
@@ -84,7 +87,11 @@ void test_global_add_half2_noret(__global half2 *addr, half2 x) {
8487
}
8588

8689
// CHECK-LABEL: test_global_add_2bf16
87-
// CHECK: call <2 x i16> @llvm.amdgcn.global.atomic.fadd.v2bf16.p1(ptr addrspace(1) %{{.*}}, <2 x i16> %{{.*}})
90+
// CHECK: [[BC:%.+]] = bitcast <2 x i16> %{{.+}} to <2 x bfloat>
91+
// CHECK: [[RMW:%.+]] = atomicrmw fadd ptr addrspace(1) %{{.+}}, <2 x bfloat> [[BC]] syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory !{{[0-9]+$}}
92+
// CHECK: bitcast <2 x bfloat> [[RMW]] to <2 x i16>
93+
94+
8895
// GFX12-LABEL: test_global_add_2bf16
8996
// GFX12: global_atomic_pk_add_bf16 v2, v[0:1], v2, off th:TH_ATOMIC_RETURN
9097
void test_global_add_2bf16(__global short2 *addr, short2 x) {
@@ -93,7 +100,10 @@ void test_global_add_2bf16(__global short2 *addr, short2 x) {
93100
}
94101

95102
// CHECK-LABEL: test_global_add_2bf16_noret
96-
// CHECK: call <2 x i16> @llvm.amdgcn.global.atomic.fadd.v2bf16.p1(ptr addrspace(1) %{{.*}}, <2 x i16> %{{.*}})
103+
// CHECK: [[BC:%.+]] = bitcast <2 x i16> %{{.+}} to <2 x bfloat>
104+
// CHECK: [[RMW:%.+]] = atomicrmw fadd ptr addrspace(1) %{{.+}}, <2 x bfloat> [[BC]] syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory !{{[0-9]+$}}
105+
// CHECK: bitcast <2 x bfloat> [[RMW]] to <2 x i16>
106+
97107
// GFX12-LABEL: test_global_add_2bf16_noret
98108
// GFX12: global_atomic_pk_add_bf16 v[0:1], v2, off
99109
void test_global_add_2bf16_noret(__global short2 *addr, short2 x) {

clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx940.cl

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -28,15 +28,21 @@ half2 test_flat_add_2f16(__generic half2 *addr, half2 x) {
2828
}
2929

3030
// CHECK-LABEL: test_flat_add_2bf16
31-
// CHECK: call <2 x i16> @llvm.amdgcn.flat.atomic.fadd.v2bf16.p0(ptr %{{.*}}, <2 x i16> %{{.*}})
31+
// CHECK: [[BC0:%.+]] = bitcast <2 x i16> {{.+}} to <2 x bfloat>
32+
// CHECK: [[RMW:%.+]] = atomicrmw fadd ptr %{{.+}}, <2 x bfloat> [[BC0]] syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory !{{[0-9]+$}}
33+
// CHECK-NEXT: bitcast <2 x bfloat> [[RMW]] to <2 x i16>
34+
3235
// GFX940-LABEL: test_flat_add_2bf16
3336
// GFX940: flat_atomic_pk_add_bf16
3437
short2 test_flat_add_2bf16(__generic short2 *addr, short2 x) {
3538
return __builtin_amdgcn_flat_atomic_fadd_v2bf16(addr, x);
3639
}
3740

3841
// CHECK-LABEL: test_global_add_2bf16
39-
// CHECK: call <2 x i16> @llvm.amdgcn.global.atomic.fadd.v2bf16.p1(ptr addrspace(1) %{{.*}}, <2 x i16> %{{.*}})
42+
// CHECK: [[BC0:%.+]] = bitcast <2 x i16> {{.+}} to <2 x bfloat>
43+
// CHECK: [[RMW:%.+]] = atomicrmw fadd ptr addrspace(1) %{{.+}}, <2 x bfloat> [[BC0]] syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory !{{[0-9]+$}}
44+
// CHECK-NEXT: bitcast <2 x bfloat> [[RMW]] to <2 x i16>
45+
4046
// GFX940-LABEL: test_global_add_2bf16
4147
// GFX940: global_atomic_pk_add_bf16
4248
short2 test_global_add_2bf16(__global short2 *addr, short2 x) {

0 commit comments

Comments
 (0)