Skip to content

Commit 94d04eb

Browse files
committed
clang/AMDGPU: Emit atomicrmw for global/flat fadd v2bf16 builtins
1 parent 9347154 commit 94d04eb

File tree

3 files changed

+30
-24
lines changed

3 files changed

+30
-24
lines changed

clang/lib/CodeGen/CGBuiltin.cpp

Lines changed: 8 additions & 18 deletions
Original file line number | Diff line number | Diff line change
@@ -18681,22 +18681,6 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
1868118681
CGM.getIntrinsic(IID, {ArgTy, Addr->getType(), Val->getType()});
1868218682
return Builder.CreateCall(F, {Addr, Val});
1868318683
}
18684-
case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_v2bf16:
18685-
case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_v2bf16: {
18686-
Intrinsic::ID IID;
18687-
switch (BuiltinID) {
18688-
case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_v2bf16:
18689-
IID = Intrinsic::amdgcn_global_atomic_fadd_v2bf16;
18690-
break;
18691-
case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_v2bf16:
18692-
IID = Intrinsic::amdgcn_flat_atomic_fadd_v2bf16;
18693-
break;
18694-
}
18695-
llvm::Value *Addr = EmitScalarExpr(E->getArg(0));
18696-
llvm::Value *Val = EmitScalarExpr(E->getArg(1));
18697-
llvm::Function *F = CGM.getIntrinsic(IID, {Addr->getType()});
18698-
return Builder.CreateCall(F, {Addr, Val});
18699-
}
1870018684
case AMDGPU::BI__builtin_amdgcn_global_load_tr_b64_i32:
1870118685
case AMDGPU::BI__builtin_amdgcn_global_load_tr_b64_v2i32:
1870218686
case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v4i16:
@@ -19068,7 +19052,9 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
1906819052
case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_v2f16:
1906919053
case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_v2f16:
1907019054
case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_f32:
19071-
case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_f64: {
19055+
case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_f64:
19056+
case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_v2bf16:
19057+
case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_v2bf16: {
1907219058
llvm::AtomicRMWInst::BinOp BinOp;
1907319059
switch (BuiltinID) {
1907419060
case AMDGPU::BI__builtin_amdgcn_atomic_inc32:
@@ -19090,6 +19076,8 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
1909019076
case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_v2f16:
1909119077
case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_f32:
1909219078
case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_f64:
19079+
case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_v2bf16:
19080+
case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_v2bf16:
1909319081
BinOp = llvm::AtomicRMWInst::FAdd;
1909419082
break;
1909519083
}
@@ -19126,7 +19114,9 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
1912619114
AO = AtomicOrdering::SequentiallyConsistent;
1912719115

1912819116
// The v2bf16 builtin uses i16 instead of a natural bfloat type.
19129-
if (BuiltinID == AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2bf16) {
19117+
if (BuiltinID == AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2bf16 ||
19118+
BuiltinID == AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_v2bf16 ||
19119+
BuiltinID == AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_v2bf16) {
1913019120
llvm::Type *V2BF16Ty = FixedVectorType::get(
1913119121
llvm::Type::getBFloatTy(Builder.getContext()), 2);
1913219122
Val = Builder.CreateBitCast(Val, V2BF16Ty);

clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx12.cl

Lines changed: 14 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ typedef short __attribute__((ext_vector_type(2))) short2;
1111

1212
// CHECK-LABEL: test_local_add_2bf16
1313
// CHECK: [[BC0:%.+]] = bitcast <2 x i16> {{.+}} to <2 x bfloat>
14-
// CHECK: [[RMW:%.+]] = atomicrmw fadd ptr addrspace(3) %{{.+}}, <2 x bfloat> [[BC0]] syncscope("agent") seq_cst, align 4
14+
// CHECK-NEXT: [[RMW:%.+]] = atomicrmw fadd ptr addrspace(3) %{{.+}}, <2 x bfloat> [[BC0]] syncscope("agent") seq_cst, align 4
1515
// CHECK-NEXT: bitcast <2 x bfloat> [[RMW]] to <2 x i16>
1616

1717
// GFX12-LABEL: test_local_add_2bf16
@@ -57,7 +57,10 @@ half2 test_flat_add_2f16(__generic half2 *addr, half2 x) {
5757
}
5858

5959
// CHECK-LABEL: test_flat_add_2bf16
60-
// CHECK: call <2 x i16> @llvm.amdgcn.flat.atomic.fadd.v2bf16.p0(ptr %{{.*}}, <2 x i16> %{{.*}})
60+
// CHECK: [[BC:%.+]] = bitcast <2 x i16> %{{.+}} to <2 x bfloat>
61+
// CHECK: [[RMW:%.+]] = atomicrmw fadd ptr %{{.+}}, <2 x bfloat> [[BC]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !{{[0-9]+$}}
62+
// CHECK: bitcast <2 x bfloat> [[RMW]] to <2 x i16>
63+
6164
// GFX12-LABEL: test_flat_add_2bf16
6265
// GFX12: flat_atomic_pk_add_bf16
6366
short2 test_flat_add_2bf16(__generic short2 *addr, short2 x) {
@@ -84,7 +87,11 @@ void test_global_add_half2_noret(__global half2 *addr, half2 x) {
8487
}
8588

8689
// CHECK-LABEL: test_global_add_2bf16
87-
// CHECK: call <2 x i16> @llvm.amdgcn.global.atomic.fadd.v2bf16.p1(ptr addrspace(1) %{{.*}}, <2 x i16> %{{.*}})
90+
// CHECK: [[BC:%.+]] = bitcast <2 x i16> %{{.+}} to <2 x bfloat>
91+
// CHECK: [[RMW:%.+]] = atomicrmw fadd ptr addrspace(1) %{{.+}}, <2 x bfloat> [[BC]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !{{[0-9]+$}}
92+
// CHECK: bitcast <2 x bfloat> [[RMW]] to <2 x i16>
93+
94+
8895
// GFX12-LABEL: test_global_add_2bf16
8996
// GFX12: global_atomic_pk_add_bf16 v2, v[0:1], v2, off th:TH_ATOMIC_RETURN
9097
void test_global_add_2bf16(__global short2 *addr, short2 x) {
@@ -93,7 +100,10 @@ void test_global_add_2bf16(__global short2 *addr, short2 x) {
93100
}
94101

95102
// CHECK-LABEL: test_global_add_2bf16_noret
96-
// CHECK: call <2 x i16> @llvm.amdgcn.global.atomic.fadd.v2bf16.p1(ptr addrspace(1) %{{.*}}, <2 x i16> %{{.*}})
103+
// CHECK: [[BC:%.+]] = bitcast <2 x i16> %{{.+}} to <2 x bfloat>
104+
// CHECK: [[RMW:%.+]] = atomicrmw fadd ptr addrspace(1) %{{.+}}, <2 x bfloat> [[BC]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !{{[0-9]+$}}
105+
// CHECK: bitcast <2 x bfloat> [[RMW]] to <2 x i16>
106+
97107
// GFX12-LABEL: test_global_add_2bf16_noret
98108
// GFX12: global_atomic_pk_add_bf16 v[0:1], v2, off
99109
void test_global_add_2bf16_noret(__global short2 *addr, short2 x) {

clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx940.cl

Lines changed: 8 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -28,15 +28,21 @@ half2 test_flat_add_2f16(__generic half2 *addr, half2 x) {
2828
}
2929

3030
// CHECK-LABEL: test_flat_add_2bf16
31-
// CHECK: call <2 x i16> @llvm.amdgcn.flat.atomic.fadd.v2bf16.p0(ptr %{{.*}}, <2 x i16> %{{.*}})
31+
// CHECK: [[BC0:%.+]] = bitcast <2 x i16> {{.+}} to <2 x bfloat>
32+
// CHECK: [[RMW:%.+]] = atomicrmw fadd ptr %{{.+}}, <2 x bfloat> [[BC0]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !{{[0-9]+$}}
33+
// CHECK-NEXT: bitcast <2 x bfloat> [[RMW]] to <2 x i16>
34+
3235
// GFX940-LABEL: test_flat_add_2bf16
3336
// GFX940: flat_atomic_pk_add_bf16
3437
short2 test_flat_add_2bf16(__generic short2 *addr, short2 x) {
3538
return __builtin_amdgcn_flat_atomic_fadd_v2bf16(addr, x);
3639
}
3740

3841
// CHECK-LABEL: test_global_add_2bf16
39-
// CHECK: call <2 x i16> @llvm.amdgcn.global.atomic.fadd.v2bf16.p1(ptr addrspace(1) %{{.*}}, <2 x i16> %{{.*}})
42+
// CHECK: [[BC0:%.+]] = bitcast <2 x i16> {{.+}} to <2 x bfloat>
43+
// CHECK: [[RMW:%.+]] = atomicrmw fadd ptr addrspace(1) %{{.+}}, <2 x bfloat> [[BC0]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !{{[0-9]+$}}
44+
// CHECK-NEXT: bitcast <2 x bfloat> [[RMW]] to <2 x i16>
45+
4046
// GFX940-LABEL: test_global_add_2bf16
4147
// GFX940: global_atomic_pk_add_bf16
4248
short2 test_global_add_2bf16(__global short2 *addr, short2 x) {

0 commit comments

Comments (0)