Skip to content

Commit 5fe43ec

Browse files
committed
AMDGPU: Remove flat/global atomic fadd v2bf16 intrinsics
These are now fully covered by atomicrmw.
1 parent 953322d commit 5fe43ec

File tree

11 files changed: 33 additions and 368 deletions.

llvm/include/llvm/IR/IntrinsicsAMDGPU.td

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2907,10 +2907,6 @@ multiclass AMDGPUMFp8SmfmacIntrinsic<LLVMType DestTy> {
29072907
def NAME#"_"#kind : AMDGPUMFp8SmfmacIntrinsic<DestTy>;
29082908
}
29092909

2910-
// bf16 atomics use v2i16 argument since there is no bf16 data type in the llvm.
2911-
def int_amdgcn_global_atomic_fadd_v2bf16 : AMDGPUAtomicRtn<llvm_v2i16_ty>;
2912-
def int_amdgcn_flat_atomic_fadd_v2bf16 : AMDGPUAtomicRtn<llvm_v2i16_ty>;
2913-
29142910
defset list<Intrinsic> AMDGPUMFMAIntrinsics940 = {
29152911
def int_amdgcn_mfma_i32_16x16x32_i8 : AMDGPUMfmaIntrinsic<llvm_v4i32_ty, llvm_i64_ty>;
29162912
def int_amdgcn_mfma_i32_32x32x16_i8 : AMDGPUMfmaIntrinsic<llvm_v16i32_ty, llvm_i64_ty>;

llvm/lib/IR/AutoUpgrade.cpp

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1034,7 +1034,9 @@ static bool upgradeIntrinsicFunction1(Function *F, Function *&NewFn,
10341034
}
10351035

10361036
if (Name.starts_with("ds.fadd") || Name.starts_with("ds.fmin") ||
1037-
Name.starts_with("ds.fmax")) {
1037+
Name.starts_with("ds.fmax") ||
1038+
Name.starts_with("global.atomic.fadd.v2bf16") ||
1039+
Name.starts_with("flat.atomic.fadd.v2bf16")) {
10381040
// Replaced with atomicrmw fadd/fmin/fmax, so there's no new
10391041
// declaration.
10401042
NewFn = nullptr;
@@ -2352,7 +2354,9 @@ static Value *upgradeAMDGCNIntrinsicCall(StringRef Name, CallBase *CI,
23522354
.StartsWith("ds.fmin", AtomicRMWInst::FMin)
23532355
.StartsWith("ds.fmax", AtomicRMWInst::FMax)
23542356
.StartsWith("atomic.inc.", AtomicRMWInst::UIncWrap)
2355-
.StartsWith("atomic.dec.", AtomicRMWInst::UDecWrap);
2357+
.StartsWith("atomic.dec.", AtomicRMWInst::UDecWrap)
2358+
.StartsWith("global.atomic.fadd", AtomicRMWInst::FAdd)
2359+
.StartsWith("flat.atomic.fadd", AtomicRMWInst::FAdd);
23562360

23572361
unsigned NumOperands = CI->getNumOperands();
23582362
if (NumOperands < 3) // Malformed bitcode.
@@ -2407,8 +2411,10 @@ static Value *upgradeAMDGCNIntrinsicCall(StringRef Name, CallBase *CI,
24072411
Builder.CreateAtomicRMW(RMWOp, Ptr, Val, std::nullopt, Order, SSID);
24082412

24092413
if (PtrTy->getAddressSpace() != 3) {
2410-
RMW->setMetadata("amdgpu.no.fine.grained.memory",
2411-
MDNode::get(F->getContext(), {}));
2414+
MDNode *EmptyMD = MDNode::get(F->getContext(), {});
2415+
RMW->setMetadata("amdgpu.no.fine.grained.memory", EmptyMD);
2416+
if (RMWOp == AtomicRMWInst::FAdd && RetTy->isFloatTy())
2417+
RMW->setMetadata("amdgpu.ignore.denormal.mode", EmptyMD);
24122418
}
24132419

24142420
if (IsVolatile)

llvm/lib/Target/AMDGPU/AMDGPUInstructions.td

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -620,12 +620,10 @@ multiclass local_addr_space_atomic_op {
620620

621621
defm int_amdgcn_flat_atomic_fadd : noret_op;
622622
defm int_amdgcn_flat_atomic_fadd : flat_addr_space_atomic_op;
623-
defm int_amdgcn_flat_atomic_fadd_v2bf16 : noret_op;
624623
defm int_amdgcn_flat_atomic_fmin : noret_op;
625624
defm int_amdgcn_flat_atomic_fmax : noret_op;
626625
defm int_amdgcn_global_atomic_fadd : global_addr_space_atomic_op;
627626
defm int_amdgcn_flat_atomic_fadd : global_addr_space_atomic_op;
628-
defm int_amdgcn_global_atomic_fadd_v2bf16 : noret_op;
629627
defm int_amdgcn_global_atomic_fmin : noret_op;
630628
defm int_amdgcn_global_atomic_fmax : noret_op;
631629
defm int_amdgcn_global_atomic_csub : noret_op;

llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4897,8 +4897,6 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
48974897
case Intrinsic::amdgcn_flat_atomic_fmax:
48984898
case Intrinsic::amdgcn_flat_atomic_fmin_num:
48994899
case Intrinsic::amdgcn_flat_atomic_fmax_num:
4900-
case Intrinsic::amdgcn_global_atomic_fadd_v2bf16:
4901-
case Intrinsic::amdgcn_flat_atomic_fadd_v2bf16:
49024900
case Intrinsic::amdgcn_atomic_cond_sub_u32:
49034901
case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
49044902
case Intrinsic::amdgcn_global_load_tr_b64:

llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -250,8 +250,6 @@ def : SourceOfDivergence<int_amdgcn_flat_atomic_fmin>;
250250
def : SourceOfDivergence<int_amdgcn_flat_atomic_fmax>;
251251
def : SourceOfDivergence<int_amdgcn_flat_atomic_fmin_num>;
252252
def : SourceOfDivergence<int_amdgcn_flat_atomic_fmax_num>;
253-
def : SourceOfDivergence<int_amdgcn_global_atomic_fadd_v2bf16>;
254-
def : SourceOfDivergence<int_amdgcn_flat_atomic_fadd_v2bf16>;
255253
def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_swap>;
256254
def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_add>;
257255
def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_sub>;

llvm/lib/Target/AMDGPU/FLATInstructions.td

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1670,13 +1670,11 @@ defm : FlatAtomicIntrPat <"FLAT_ATOMIC_ADD_F32", "int_amdgcn_flat_atomic_fadd",
16701670

16711671
let OtherPredicates = [HasAtomicFlatPkAdd16Insts] in {
16721672
defm : FlatAtomicIntrPat <"FLAT_ATOMIC_PK_ADD_F16", "int_amdgcn_flat_atomic_fadd", v2f16>;
1673-
defm : FlatAtomicIntrPat <"FLAT_ATOMIC_PK_ADD_BF16", "int_amdgcn_flat_atomic_fadd_v2bf16", v2i16>;
16741673
defm : FlatAtomicPat <"FLAT_ATOMIC_PK_ADD_F16", "atomic_load_fadd_flat", v2f16>;
16751674
defm : FlatAtomicPat <"FLAT_ATOMIC_PK_ADD_BF16", "atomic_load_fadd_flat", v2bf16>;
16761675
}
16771676

16781677
let OtherPredicates = [HasAtomicGlobalPkAddBF16Inst] in
1679-
defm : GlobalFLATAtomicIntrPats <"GLOBAL_ATOMIC_PK_ADD_BF16", "int_amdgcn_global_atomic_fadd_v2bf16", v2i16>;
16801678
defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_PK_ADD_BF16", "atomic_load_fadd_global", v2bf16>;
16811679
} // End OtherPredicates = [HasFlatGlobalInsts], AddedComplexity = 10
16821680

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 1 addition & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1346,9 +1346,7 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
13461346
case Intrinsic::amdgcn_flat_atomic_fmax:
13471347
case Intrinsic::amdgcn_flat_atomic_fmin_num:
13481348
case Intrinsic::amdgcn_flat_atomic_fmax_num:
1349-
case Intrinsic::amdgcn_global_atomic_fadd_v2bf16:
1350-
case Intrinsic::amdgcn_atomic_cond_sub_u32:
1351-
case Intrinsic::amdgcn_flat_atomic_fadd_v2bf16: {
1349+
case Intrinsic::amdgcn_atomic_cond_sub_u32: {
13521350
Info.opc = ISD::INTRINSIC_W_CHAIN;
13531351
Info.memVT = MVT::getVT(CI.getType());
13541352
Info.ptrVal = CI.getOperand(0);
@@ -1451,14 +1449,12 @@ bool SITargetLowering::getAddrModeArguments(IntrinsicInst *II,
14511449
case Intrinsic::amdgcn_ds_ordered_add:
14521450
case Intrinsic::amdgcn_ds_ordered_swap:
14531451
case Intrinsic::amdgcn_flat_atomic_fadd:
1454-
case Intrinsic::amdgcn_flat_atomic_fadd_v2bf16:
14551452
case Intrinsic::amdgcn_flat_atomic_fmax:
14561453
case Intrinsic::amdgcn_flat_atomic_fmax_num:
14571454
case Intrinsic::amdgcn_flat_atomic_fmin:
14581455
case Intrinsic::amdgcn_flat_atomic_fmin_num:
14591456
case Intrinsic::amdgcn_global_atomic_csub:
14601457
case Intrinsic::amdgcn_global_atomic_fadd:
1461-
case Intrinsic::amdgcn_global_atomic_fadd_v2bf16:
14621458
case Intrinsic::amdgcn_global_atomic_fmax:
14631459
case Intrinsic::amdgcn_global_atomic_fmax_num:
14641460
case Intrinsic::amdgcn_global_atomic_fmin:

llvm/test/Bitcode/amdgcn-atomic.ll

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -300,4 +300,26 @@ define float @upgrade_amdgcn_ds_fmax_f32_no_suffix(ptr addrspace(3) %ptr, float
300300
ret float %result0
301301
}
302302

303+
declare <2 x i16> @llvm.amdgcn.flat.atomic.fadd.v2bf16.p0(ptr, <2 x i16>)
304+
305+
define <2 x i16> @upgrade_amdgcn_flat_atomic_fadd_v2bf16_p0(ptr %ptr, <2 x i16> %data) {
306+
; CHECK: [[BC0:%.+]] = bitcast <2 x i16> %data to <2 x bfloat>
307+
; CHECK-NEXT: [[ATOMIC:%.+]] = atomicrmw fadd ptr %ptr, <2 x bfloat> [[BC0]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !{{[0-9]+$}}
308+
; CHECK-NEXT: [[BC1:%.+]] = bitcast <2 x bfloat> [[ATOMIC]] to <2 x i16>
309+
; CHECK-NEXT: ret <2 x i16> [[BC1]]
310+
%result = call <2 x i16> @llvm.amdgcn.flat.atomic.fadd.v2bf16.p0(ptr %ptr, <2 x i16> %data)
311+
ret <2 x i16> %result
312+
}
313+
314+
declare <2 x i16> @llvm.amdgcn.global.atomic.fadd.v2bf16.p1(ptr addrspace(1), <2 x i16>)
315+
316+
define <2 x i16> @upgrade_amdgcn_global_atomic_fadd_v2bf16_p1(ptr addrspace(1) %ptr, <2 x i16> %data) {
317+
; CHECK: [[BC0:%.+]] = bitcast <2 x i16> %data to <2 x bfloat>
318+
; CHECK-NEXT: [[ATOMIC:%.+]] = atomicrmw fadd ptr addrspace(1) %ptr, <2 x bfloat> [[BC0]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !{{[0-9]+$}}
319+
; CHECK-NEXT: [[BC1:%.+]] = bitcast <2 x bfloat> [[ATOMIC]] to <2 x i16>
320+
; CHECK-NEXT: ret <2 x i16> [[BC1]]
321+
%result = call <2 x i16> @llvm.amdgcn.global.atomic.fadd.v2bf16.p1(ptr addrspace(1) %ptr, <2 x i16> %data)
322+
ret <2 x i16> %result
323+
}
324+
303325
attributes #0 = { argmemonly nounwind willreturn }

llvm/test/CodeGen/AMDGPU/GlobalISel/fp-atomics-gfx940.ll

Lines changed: 0 additions & 54 deletions
Original file line numberDiff line numberDiff line change
@@ -4,10 +4,6 @@
44
declare float @llvm.amdgcn.flat.atomic.fadd.f32.p0.f32(ptr %ptr, float %data)
55
declare <2 x half> @llvm.amdgcn.flat.atomic.fadd.v2f16.p0.v2f16(ptr %ptr, <2 x half> %data)
66

7-
; bf16 atomics use v2i16 argument since there is no bf16 data type in the llvm.
8-
declare <2 x i16> @llvm.amdgcn.flat.atomic.fadd.v2bf16.p0(ptr %ptr, <2 x i16> %data)
9-
declare <2 x i16> @llvm.amdgcn.global.atomic.fadd.v2bf16.p1(ptr addrspace(1) %ptr, <2 x i16> %data)
10-
117
define amdgpu_kernel void @flat_atomic_fadd_f32_noret(ptr %ptr, float %data) {
128
; GFX940-LABEL: flat_atomic_fadd_f32_noret:
139
; GFX940: ; %bb.0:
@@ -104,56 +100,6 @@ define <2 x half> @flat_atomic_fadd_v2f16_rtn(ptr %ptr, <2 x half> %data) {
104100
ret <2 x half> %ret
105101
}
106102

107-
define amdgpu_kernel void @flat_atomic_fadd_v2bf16_noret(ptr %ptr, <2 x i16> %data) {
108-
; GFX940-LABEL: flat_atomic_fadd_v2bf16_noret:
109-
; GFX940: ; %bb.0:
110-
; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
111-
; GFX940-NEXT: s_load_dword s4, s[0:1], 0x2c
112-
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
113-
; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
114-
; GFX940-NEXT: v_mov_b32_e32 v2, s4
115-
; GFX940-NEXT: flat_atomic_pk_add_bf16 v[0:1], v2
116-
; GFX940-NEXT: s_endpgm
117-
%ret = call <2 x i16> @llvm.amdgcn.flat.atomic.fadd.v2bf16.p0(ptr %ptr, <2 x i16> %data)
118-
ret void
119-
}
120-
121-
define <2 x i16> @flat_atomic_fadd_v2bf16_rtn(ptr %ptr, <2 x i16> %data) {
122-
; GFX940-LABEL: flat_atomic_fadd_v2bf16_rtn:
123-
; GFX940: ; %bb.0:
124-
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
125-
; GFX940-NEXT: flat_atomic_pk_add_bf16 v0, v[0:1], v2 sc0
126-
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
127-
; GFX940-NEXT: s_setpc_b64 s[30:31]
128-
%ret = call <2 x i16> @llvm.amdgcn.flat.atomic.fadd.v2bf16.p0(ptr %ptr, <2 x i16> %data)
129-
ret <2 x i16> %ret
130-
}
131-
132-
define amdgpu_kernel void @global_atomic_fadd_v2bf16_noret(ptr addrspace(1) %ptr, <2 x i16> %data) {
133-
; GFX940-LABEL: global_atomic_fadd_v2bf16_noret:
134-
; GFX940: ; %bb.0:
135-
; GFX940-NEXT: s_load_dword s4, s[0:1], 0x2c
136-
; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
137-
; GFX940-NEXT: v_mov_b32_e32 v1, 0
138-
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
139-
; GFX940-NEXT: v_mov_b32_e32 v0, s4
140-
; GFX940-NEXT: global_atomic_pk_add_bf16 v1, v0, s[2:3]
141-
; GFX940-NEXT: s_endpgm
142-
%ret = call <2 x i16> @llvm.amdgcn.global.atomic.fadd.v2bf16.p1(ptr addrspace(1) %ptr, <2 x i16> %data)
143-
ret void
144-
}
145-
146-
define <2 x i16> @global_atomic_fadd_v2bf16_rtn(ptr addrspace(1) %ptr, <2 x i16> %data) {
147-
; GFX940-LABEL: global_atomic_fadd_v2bf16_rtn:
148-
; GFX940: ; %bb.0:
149-
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
150-
; GFX940-NEXT: global_atomic_pk_add_bf16 v0, v[0:1], v2, off sc0
151-
; GFX940-NEXT: s_waitcnt vmcnt(0)
152-
; GFX940-NEXT: s_setpc_b64 s[30:31]
153-
%ret = call <2 x i16> @llvm.amdgcn.global.atomic.fadd.v2bf16.p1(ptr addrspace(1) %ptr, <2 x i16> %data)
154-
ret <2 x i16> %ret
155-
}
156-
157103
define <2 x half> @local_atomic_fadd_ret_v2f16_offset(ptr addrspace(3) %ptr, <2 x half> %val) {
158104
; GFX940-LABEL: local_atomic_fadd_ret_v2f16_offset:
159105
; GFX940: ; %bb.0:

llvm/test/CodeGen/AMDGPU/fp-atomics-gfx1200.ll

Lines changed: 0 additions & 100 deletions
Original file line numberDiff line numberDiff line change
@@ -7,9 +7,7 @@ declare <2 x bfloat> @llvm.amdgcn.struct.buffer.atomic.fadd.v2bf16(<2 x bfloat>,
77
declare <2 x half> @llvm.amdgcn.raw.buffer.atomic.fadd.v2f16(<2 x half>, <4 x i32>, i32, i32, i32)
88
declare <2 x bfloat> @llvm.amdgcn.raw.buffer.atomic.fadd.v2bf16(<2 x bfloat> %val, <4 x i32> %rsrc, i32, i32, i32)
99
declare <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1.v2f16(ptr addrspace(1) %ptr, <2 x half> %data)
10-
declare <2 x i16> @llvm.amdgcn.global.atomic.fadd.v2bf16.p1(ptr addrspace(1) %ptr, <2 x i16> %data)
1110
declare <2 x half> @llvm.amdgcn.flat.atomic.fadd.v2f16.p0.v2f16(ptr %ptr, <2 x half> %data)
12-
declare <2 x i16> @llvm.amdgcn.flat.atomic.fadd.v2bf16.p0(ptr %ptr, <2 x i16> %data)
1311

1412
define amdgpu_kernel void @flat_atomic_fadd_v2f16_noret(ptr %ptr, <2 x half> %data) {
1513
; GFX12-SDAG-LABEL: flat_atomic_fadd_v2f16_noret:
@@ -59,104 +57,6 @@ define <2 x half> @flat_atomic_fadd_v2f16_rtn(ptr %ptr, <2 x half> %data) {
5957
ret <2 x half> %ret
6058
}
6159

62-
define amdgpu_kernel void @flat_atomic_fadd_v2bf16_noret(ptr %ptr, <2 x i16> %data) {
63-
; GFX12-SDAG-LABEL: flat_atomic_fadd_v2bf16_noret:
64-
; GFX12-SDAG: ; %bb.0:
65-
; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[0:1], 0x24
66-
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
67-
; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
68-
; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, s2
69-
; GFX12-SDAG-NEXT: flat_atomic_pk_add_bf16 v[0:1], v2
70-
; GFX12-SDAG-NEXT: s_endpgm
71-
;
72-
; GFX12-GISEL-LABEL: flat_atomic_fadd_v2bf16_noret:
73-
; GFX12-GISEL: ; %bb.0:
74-
; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[0:1], 0x24
75-
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
76-
; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
77-
; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, s2
78-
; GFX12-GISEL-NEXT: flat_atomic_pk_add_bf16 v[0:1], v2
79-
; GFX12-GISEL-NEXT: s_endpgm
80-
%ret = call <2 x i16> @llvm.amdgcn.flat.atomic.fadd.v2bf16.p0(ptr %ptr, <2 x i16> %data)
81-
ret void
82-
}
83-
84-
define <2 x i16> @flat_atomic_fadd_v2bf16_rtn(ptr %ptr, <2 x i16> %data) {
85-
; GFX12-SDAG-LABEL: flat_atomic_fadd_v2bf16_rtn:
86-
; GFX12-SDAG: ; %bb.0:
87-
; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
88-
; GFX12-SDAG-NEXT: s_wait_expcnt 0x0
89-
; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0
90-
; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0
91-
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
92-
; GFX12-SDAG-NEXT: flat_atomic_pk_add_bf16 v0, v[0:1], v2 th:TH_ATOMIC_RETURN
93-
; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
94-
; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
95-
;
96-
; GFX12-GISEL-LABEL: flat_atomic_fadd_v2bf16_rtn:
97-
; GFX12-GISEL: ; %bb.0:
98-
; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
99-
; GFX12-GISEL-NEXT: s_wait_expcnt 0x0
100-
; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0
101-
; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0
102-
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
103-
; GFX12-GISEL-NEXT: flat_atomic_pk_add_bf16 v0, v[0:1], v2 th:TH_ATOMIC_RETURN
104-
; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
105-
; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
106-
%ret = call <2 x i16> @llvm.amdgcn.flat.atomic.fadd.v2bf16.p0(ptr %ptr, <2 x i16> %data)
107-
ret <2 x i16> %ret
108-
}
109-
110-
define amdgpu_kernel void @global_atomic_fadd_v2bf16_noret(ptr addrspace(1) %ptr, <2 x i16> %data) {
111-
; GFX12-SDAG-LABEL: global_atomic_fadd_v2bf16_noret:
112-
; GFX12-SDAG: ; %bb.0:
113-
; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[0:1], 0x24
114-
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
115-
; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
116-
; GFX12-SDAG-NEXT: global_atomic_pk_add_bf16 v0, v1, s[0:1]
117-
; GFX12-SDAG-NEXT: s_nop 0
118-
; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
119-
; GFX12-SDAG-NEXT: s_endpgm
120-
;
121-
; GFX12-GISEL-LABEL: global_atomic_fadd_v2bf16_noret:
122-
; GFX12-GISEL: ; %bb.0:
123-
; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[0:1], 0x24
124-
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
125-
; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
126-
; GFX12-GISEL-NEXT: global_atomic_pk_add_bf16 v1, v0, s[0:1]
127-
; GFX12-GISEL-NEXT: s_nop 0
128-
; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
129-
; GFX12-GISEL-NEXT: s_endpgm
130-
%ret = call <2 x i16> @llvm.amdgcn.global.atomic.fadd.v2bf16.p1(ptr addrspace(1) %ptr, <2 x i16> %data)
131-
ret void
132-
}
133-
134-
define <2 x i16> @global_atomic_fadd_v2bf16_rtn(ptr addrspace(1) %ptr, <2 x i16> %data) {
135-
; GFX12-SDAG-LABEL: global_atomic_fadd_v2bf16_rtn:
136-
; GFX12-SDAG: ; %bb.0:
137-
; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
138-
; GFX12-SDAG-NEXT: s_wait_expcnt 0x0
139-
; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0
140-
; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0
141-
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
142-
; GFX12-SDAG-NEXT: global_atomic_pk_add_bf16 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN
143-
; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
144-
; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
145-
;
146-
; GFX12-GISEL-LABEL: global_atomic_fadd_v2bf16_rtn:
147-
; GFX12-GISEL: ; %bb.0:
148-
; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
149-
; GFX12-GISEL-NEXT: s_wait_expcnt 0x0
150-
; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0
151-
; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0
152-
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
153-
; GFX12-GISEL-NEXT: global_atomic_pk_add_bf16 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN
154-
; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
155-
; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
156-
%ret = call <2 x i16> @llvm.amdgcn.global.atomic.fadd.v2bf16.p1(ptr addrspace(1) %ptr, <2 x i16> %data)
157-
ret <2 x i16> %ret
158-
}
159-
16060
define void @global_atomic_pk_add_v2f16(ptr addrspace(1) %ptr, <2 x half> %data) {
16161
; GFX12-SDAG-LABEL: global_atomic_pk_add_v2f16:
16262
; GFX12-SDAG: ; %bb.0: ; %main_body

0 commit comments

Comments
 (0)