Skip to content

Commit 3567d2e

Browse files
committed
Revert "[RELAND] AMDGPU: Remove global/flat atomic fadd intrinics (llvm#97051)"
This reverts commit 20a9c40. Reason for revert: i think this is causing hangs in 532.sph_ex hpc2021 Change-Id: I29410accb55343e2e3826a5018812b16f0224139
1 parent 17e31ab commit 3567d2e

39 files changed

+2222
-375
lines changed

clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp

Lines changed: 11 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -2890,21 +2890,23 @@ CGOpenMPRuntimeGPU::emitFastFPAtomicCall(CodeGenFunction &CGF, LValue X,
28902890
RValue Update, BinaryOperatorKind BO,
28912891
bool IsXBinopExpr) {
28922892
CGBuilderTy &Bld = CGF.Builder;
2893-
llvm::AtomicRMWInst::BinOp Kind = llvm::AtomicRMWInst::FAdd;
2893+
unsigned int IID = -1;
28942894
RValue UpdateFixed = Update;
28952895
switch (BO) {
28962896
case BO_Sub:
28972897
UpdateFixed = RValue::get(Bld.CreateFNeg(Update.getScalarVal()));
2898-
Kind = llvm::AtomicRMWInst::FAdd;
2898+
IID = llvm::Intrinsic::amdgcn_flat_atomic_fadd;
28992899
break;
29002900
case BO_Add:
2901-
Kind = llvm::AtomicRMWInst::FAdd;
2901+
IID = llvm::Intrinsic::amdgcn_flat_atomic_fadd;
29022902
break;
29032903
case BO_LT:
2904-
Kind = IsXBinopExpr ? llvm::AtomicRMWInst::FMax : llvm::AtomicRMWInst::FMin;
2904+
IID = IsXBinopExpr ? llvm::Intrinsic::amdgcn_flat_atomic_fmax
2905+
: llvm::Intrinsic::amdgcn_flat_atomic_fmin;
29052906
break;
29062907
case BO_GT:
2907-
Kind = IsXBinopExpr ? llvm::AtomicRMWInst::FMin : llvm::AtomicRMWInst::FMax;
2908+
IID = IsXBinopExpr ? llvm::Intrinsic::amdgcn_flat_atomic_fmin
2909+
: llvm::Intrinsic::amdgcn_flat_atomic_fmax;
29082910
break;
29092911
default:
29102912
// remaining operations are not supported yet
@@ -2928,9 +2930,10 @@ CGOpenMPRuntimeGPU::emitFastFPAtomicCall(CodeGenFunction &CGF, LValue X,
29282930
CGM.getModule(), OMPRTL___kmpc_unsafeAtomicAdd),
29292931
FPAtomicArgs);
29302932
} else {
2931-
CallInst =
2932-
Bld.CreateAtomicRMW(Kind, X.getAddress(), FPAtomicArgs[1],
2933-
llvm::AtomicOrdering::SequentiallyConsistent);
2933+
llvm::Function *AtomicF = CGM.getIntrinsic(
2934+
IID, {FPAtomicArgs[1]->getType(), FPAtomicArgs[0]->getType(),
2935+
FPAtomicArgs[1]->getType()});
2936+
CallInst = CGF.EmitNounwindRuntimeCall(AtomicF, FPAtomicArgs);
29342937
}
29352938
return std::make_pair(true, RValue::get(CallInst));
29362939
}

clang/test/OpenMP/amdgcn_target_fast_fp_apu.cpp

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -52,7 +52,7 @@ int main(){
5252
// CHECK-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1
5353
// CHECK-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
5454
// CHECK: user_code.entry:
55-
// CHECK-NEXT: [[TMP2:%.*]] = atomicrmw fadd ptr [[TMP0]], float 1.000000e+00 seq_cst, align 4
55+
// CHECK-NEXT: [[TMP2:%.*]] = call float @llvm.amdgcn.flat.atomic.fadd.f32.p0.f32(ptr [[TMP0]], float 1.000000e+00) #[[ATTR2:[0-9]+]]
5656
// CHECK-NEXT: call void @__kmpc_target_deinit()
5757
// CHECK-NEXT: ret void
5858
// CHECK: worker.exit:
@@ -73,7 +73,7 @@ int main(){
7373
// CHECK-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1
7474
// CHECK-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
7575
// CHECK: user_code.entry:
76-
// CHECK-NEXT: [[TMP2:%.*]] = atomicrmw fadd ptr [[TMP0]], float 1.000000e+00 seq_cst, align 4
76+
// CHECK-NEXT: [[TMP2:%.*]] = call float @llvm.amdgcn.flat.atomic.fadd.f32.p0.f32(ptr [[TMP0]], float 1.000000e+00) #[[ATTR2]]
7777
// CHECK-NEXT: call void @__kmpc_target_deinit()
7878
// CHECK-NEXT: ret void
7979
// CHECK: worker.exit:
@@ -94,7 +94,7 @@ int main(){
9494
// CHECK-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1
9595
// CHECK-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
9696
// CHECK: user_code.entry:
97-
// CHECK-NEXT: [[TMP2:%.*]] = atomicrmw fadd ptr [[TMP0]], float 1.000000e+00 seq_cst, align 4
97+
// CHECK-NEXT: [[TMP2:%.*]] = call float @llvm.amdgcn.flat.atomic.fadd.f32.p0.f32(ptr [[TMP0]], float 1.000000e+00) #[[ATTR2]]
9898
// CHECK-NEXT: call void @__kmpc_target_deinit()
9999
// CHECK-NEXT: ret void
100100
// CHECK: worker.exit:

clang/test/OpenMP/amdgcn_usm_atomics_hint.cpp

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@ double test_amdgcn_target_atomic_hints() {
2626

2727
#pragma omp target teams distribute parallel for map(tofrom:a,b)
2828
for (int i = 0; i < N; i++) {
29-
// CHECK-HINTS: = atomicrmw fadd
29+
// CHECK-HINTS: call {{.*}} @llvm.amdgcn.flat.atomic.fadd.f64.p0.f64
3030
#pragma omp atomic hint(amd_fast_fp_atomics)
3131
a+=(double)i;
3232

@@ -49,11 +49,11 @@ double test_amdgcn_target_atomic_unsafe_opt() {
4949

5050
#pragma omp target teams distribute parallel for map(tofrom:a,b,c)
5151
for (int i = 0; i < N; i++) {
52-
// CHECK-FLAG-UNSAFE: = atomicrmw fadd
52+
// CHECK-FLAG-UNSAFE: call {{.*}} @llvm.amdgcn.flat.atomic.fadd.f64.p0.f64
5353
#pragma omp atomic
5454
a+=(double)i;
5555

56-
// CHECK-FLAG-UNSAFE: = atomicrmw fadd
56+
// CHECK-FLAG-UNSAFE: call {{.*}} @llvm.amdgcn.flat.atomic.fadd.f64.p0.f64
5757
#pragma omp atomic hint(amd_fast_fp_atomics)
5858
b+=(double)i;
5959

llvm/docs/ReleaseNotes.rst

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -78,11 +78,6 @@ Changes to the AArch64 Backend
7878
Changes to the AMDGPU Backend
7979
-----------------------------
8080

81-
* Removed ``llvm.amdgcn.flat.atomic.fadd`` and
82-
``llvm.amdgcn.global.atomic.fadd`` intrinsics. Users should use the
83-
:ref:`atomicrmw <i_atomicrmw>` instruction with `fadd` and
84-
addrspace(0) or addrspace(1) instead.
85-
8681
Changes to the ARM Backend
8782
--------------------------
8883

llvm/include/llvm/IR/IntrinsicsAMDGPU.td

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3023,6 +3023,8 @@ def int_amdgcn_dot4_f32_bf8_bf8 : AMDGPU8bitFloatDot4Intrinsic;
30233023
// gfx908 intrinsics
30243024
// ===----------------------------------------------------------------------===//
30253025

3026+
def int_amdgcn_global_atomic_fadd : AMDGPUAtomicRtn<llvm_anyfloat_ty>;
3027+
30263028
// llvm.amdgcn.mfma.*.* vdst, srcA, srcB, srcC, cbsz, abid, blgp
30273029
class AMDGPUMfmaIntrinsic<LLVMType DestTy, LLVMType SrcABTy> :
30283030
ClangBuiltin<!subst("int", "__builtin", NAME)>,
@@ -3061,6 +3063,7 @@ def int_amdgcn_mfma_f32_16x16x8bf16 : AMDGPUMfmaIntrinsic<llvm_v4f32_ty, llvm_v
30613063

30623064
def int_amdgcn_global_atomic_fmin : AMDGPUAtomicRtn<llvm_anyfloat_ty>;
30633065
def int_amdgcn_global_atomic_fmax : AMDGPUAtomicRtn<llvm_anyfloat_ty>;
3066+
def int_amdgcn_flat_atomic_fadd : AMDGPUAtomicRtn<llvm_anyfloat_ty>;
30643067
def int_amdgcn_flat_atomic_fmin : AMDGPUAtomicRtn<llvm_anyfloat_ty>;
30653068
def int_amdgcn_flat_atomic_fmax : AMDGPUAtomicRtn<llvm_anyfloat_ty>;
30663069

llvm/lib/IR/AutoUpgrade.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1035,8 +1035,8 @@ static bool upgradeIntrinsicFunction1(Function *F, Function *&NewFn,
10351035

10361036
if (Name.starts_with("ds.fadd") || Name.starts_with("ds.fmin") ||
10371037
Name.starts_with("ds.fmax") ||
1038-
Name.starts_with("global.atomic.fadd") ||
1039-
Name.starts_with("flat.atomic.fadd")) {
1038+
Name.starts_with("global.atomic.fadd.v2bf16") ||
1039+
Name.starts_with("flat.atomic.fadd.v2bf16")) {
10401040
// Replaced with atomicrmw fadd/fmin/fmax, so there's no new
10411041
// declaration.
10421042
NewFn = nullptr;

llvm/lib/Target/AMDGPU/AMDGPUInstructions.td

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -618,11 +618,16 @@ multiclass local_addr_space_atomic_op {
618618
}
619619
}
620620

621+
defm int_amdgcn_flat_atomic_fadd : noret_op;
622+
defm int_amdgcn_flat_atomic_fadd : flat_addr_space_atomic_op;
621623
defm int_amdgcn_flat_atomic_fmin : noret_op;
622624
defm int_amdgcn_flat_atomic_fmax : noret_op;
625+
defm int_amdgcn_global_atomic_fadd : global_addr_space_atomic_op;
626+
defm int_amdgcn_flat_atomic_fadd : global_addr_space_atomic_op;
623627
defm int_amdgcn_global_atomic_fmin : noret_op;
624628
defm int_amdgcn_global_atomic_fmax : noret_op;
625629
defm int_amdgcn_global_atomic_csub : noret_op;
630+
defm int_amdgcn_flat_atomic_fadd : local_addr_space_atomic_op;
626631
defm int_amdgcn_global_atomic_ordered_add_b64 : noret_op;
627632
defm int_amdgcn_flat_atomic_fmin_num : noret_op;
628633
defm int_amdgcn_flat_atomic_fmax_num : noret_op;

llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4914,11 +4914,13 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
49144914
OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
49154915
break;
49164916
}
4917+
case Intrinsic::amdgcn_global_atomic_fadd:
49174918
case Intrinsic::amdgcn_global_atomic_csub:
49184919
case Intrinsic::amdgcn_global_atomic_fmin:
49194920
case Intrinsic::amdgcn_global_atomic_fmax:
49204921
case Intrinsic::amdgcn_global_atomic_fmin_num:
49214922
case Intrinsic::amdgcn_global_atomic_fmax_num:
4923+
case Intrinsic::amdgcn_flat_atomic_fadd:
49224924
case Intrinsic::amdgcn_flat_atomic_fmin:
49234925
case Intrinsic::amdgcn_flat_atomic_fmax:
49244926
case Intrinsic::amdgcn_flat_atomic_fmin_num:

llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -239,11 +239,13 @@ def : SourceOfDivergence<int_r600_read_tidig_y>;
239239
def : SourceOfDivergence<int_r600_read_tidig_z>;
240240
def : SourceOfDivergence<int_amdgcn_atomic_cond_sub_u32>;
241241
def : SourceOfDivergence<int_amdgcn_global_atomic_csub>;
242+
def : SourceOfDivergence<int_amdgcn_global_atomic_fadd>;
242243
def : SourceOfDivergence<int_amdgcn_global_atomic_fmin>;
243244
def : SourceOfDivergence<int_amdgcn_global_atomic_fmax>;
244245
def : SourceOfDivergence<int_amdgcn_global_atomic_fmin_num>;
245246
def : SourceOfDivergence<int_amdgcn_global_atomic_fmax_num>;
246247
def : SourceOfDivergence<int_amdgcn_global_atomic_ordered_add_b64>;
248+
def : SourceOfDivergence<int_amdgcn_flat_atomic_fadd>;
247249
def : SourceOfDivergence<int_amdgcn_flat_atomic_fmin>;
248250
def : SourceOfDivergence<int_amdgcn_flat_atomic_fmax>;
249251
def : SourceOfDivergence<int_amdgcn_flat_atomic_fmin_num>;

llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1045,6 +1045,7 @@ bool GCNTTIImpl::collectFlatAddressOperands(SmallVectorImpl<int> &OpIndexes,
10451045
switch (IID) {
10461046
case Intrinsic::amdgcn_is_shared:
10471047
case Intrinsic::amdgcn_is_private:
1048+
case Intrinsic::amdgcn_flat_atomic_fadd:
10481049
case Intrinsic::amdgcn_flat_atomic_fmax:
10491050
case Intrinsic::amdgcn_flat_atomic_fmin:
10501051
case Intrinsic::amdgcn_flat_atomic_fmax_num:
@@ -1106,6 +1107,7 @@ Value *GCNTTIImpl::rewriteIntrinsicWithAddressSpace(IntrinsicInst *II,
11061107
return B.CreateIntrinsic(Intrinsic::ptrmask, {NewV->getType(), MaskTy},
11071108
{NewV, MaskOp});
11081109
}
1110+
case Intrinsic::amdgcn_flat_atomic_fadd:
11091111
case Intrinsic::amdgcn_flat_atomic_fmax:
11101112
case Intrinsic::amdgcn_flat_atomic_fmin:
11111113
case Intrinsic::amdgcn_flat_atomic_fmax_num:

llvm/lib/Target/AMDGPU/DSInstructions.td

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1135,7 +1135,11 @@ class DSAtomicRetPatIntrinsic<DS_Pseudo inst, ValueType vt, PatFrag frag,
11351135
(vt (frag (DS1Addr1Offset i32:$ptr, i32:$offset), vt:$value)),
11361136
(inst $ptr, getVregSrcForVT<vt>.ret:$value, Offset:$offset, (i1 gds))> {
11371137
}
1138-
} // End SubtargetPredicate = HasLdsAtomicAddF64
1138+
1139+
def : DSAtomicRetPatIntrinsic<DS_ADD_RTN_F64, f64, int_amdgcn_flat_atomic_fadd_local_addrspace>;
1140+
let AddedComplexity = 1 in
1141+
def : DSAtomicRetPatIntrinsic<DS_ADD_F64, f64, int_amdgcn_flat_atomic_fadd_noret_local_addrspace>;
1142+
}
11391143

11401144
let SubtargetPredicate = HasAtomicDsPkAdd16Insts in {
11411145
defm : DSAtomicRetNoRetPat_mc<DS_PK_ADD_RTN_F16, DS_PK_ADD_F16, v2f16, "atomic_load_fadd">;

llvm/lib/Target/AMDGPU/FLATInstructions.td

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1625,17 +1625,25 @@ let OtherPredicates = [isGFX12Only] in {
16251625

16261626
let OtherPredicates = [HasAtomicFaddNoRtnInsts] in {
16271627
defm : GlobalFLATAtomicPatsNoRtn <"GLOBAL_ATOMIC_ADD_F32", "atomic_load_fadd_global", f32>;
1628+
defm : GlobalFLATAtomicPatsNoRtnWithAddrSpace <"GLOBAL_ATOMIC_ADD_F32", "int_amdgcn_flat_atomic_fadd", "global_addrspace", f32>;
1629+
defm : GlobalFLATAtomicPatsNoRtnWithAddrSpace <"GLOBAL_ATOMIC_ADD_F32", "int_amdgcn_global_atomic_fadd", "global_addrspace", f32>;
16281630
}
16291631

16301632
let OtherPredicates = [HasAtomicBufferGlobalPkAddF16NoRtnInsts] in {
16311633
defm : GlobalFLATAtomicPatsNoRtn <"GLOBAL_ATOMIC_PK_ADD_F16", "atomic_load_fadd_global", v2f16>;
1634+
defm : GlobalFLATAtomicPatsNoRtnWithAddrSpace <"GLOBAL_ATOMIC_PK_ADD_F16", "int_amdgcn_flat_atomic_fadd", "global_addrspace", v2f16>;
1635+
defm : GlobalFLATAtomicPatsNoRtnWithAddrSpace <"GLOBAL_ATOMIC_PK_ADD_F16", "int_amdgcn_global_atomic_fadd", "global_addrspace", v2f16>;
16321636
}
16331637

16341638
let OtherPredicates = [HasAtomicFaddRtnInsts] in {
16351639
defm : GlobalFLATAtomicPatsRtn <"GLOBAL_ATOMIC_ADD_F32", "atomic_load_fadd_global", f32>;
1640+
defm : GlobalFLATAtomicPatsRtnWithAddrSpace <"GLOBAL_ATOMIC_ADD_F32", "int_amdgcn_flat_atomic_fadd", "global_addrspace", f32>;
1641+
defm : GlobalFLATAtomicPatsRtnWithAddrSpace <"GLOBAL_ATOMIC_ADD_F32", "int_amdgcn_global_atomic_fadd", "global_addrspace", f32>;
16361642
}
16371643

16381644
let OtherPredicates = [HasAtomicBufferGlobalPkAddF16Insts] in {
1645+
defm : GlobalFLATAtomicPatsRtnWithAddrSpace <"GLOBAL_ATOMIC_PK_ADD_F16", "int_amdgcn_flat_atomic_fadd", "global_addrspace", v2f16>;
1646+
defm : GlobalFLATAtomicPatsRtnWithAddrSpace <"GLOBAL_ATOMIC_PK_ADD_F16", "int_amdgcn_global_atomic_fadd", "global_addrspace", v2f16>;
16391647
defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_PK_ADD_F16", "atomic_load_fadd_global", v2f16>;
16401648
}
16411649

@@ -1653,14 +1661,19 @@ defm : FlatAtomicIntrPat <"FLAT_ATOMIC_MAX_F64", "int_amdgcn_flat_atomic_fmax",
16531661

16541662
let OtherPredicates = [HasFlatBufferGlobalAtomicFaddF64Inst] in {
16551663
defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_ADD_F64", "atomic_load_fadd_global", f64>;
1664+
defm : GlobalFLATAtomicPatsWithAddrSpace<"GLOBAL_ATOMIC_ADD_F64", "int_amdgcn_flat_atomic_fadd", "global_addrspace", f64>;
1665+
defm : GlobalFLATAtomicPatsWithAddrSpace<"GLOBAL_ATOMIC_ADD_F64", "int_amdgcn_global_atomic_fadd", "global_addrspace", f64>;
16561666
defm : FlatAtomicPat <"FLAT_ATOMIC_ADD_F64", "atomic_load_fadd_flat", f64>;
1667+
defm : FlatAtomicIntrPat <"FLAT_ATOMIC_ADD_F64", "int_amdgcn_flat_atomic_fadd", f64>;
16571668
}
16581669

16591670
let OtherPredicates = [HasFlatAtomicFaddF32Inst] in {
16601671
defm : FlatAtomicPat <"FLAT_ATOMIC_ADD_F32", "atomic_load_fadd_flat", f32>;
1672+
defm : FlatAtomicIntrPat <"FLAT_ATOMIC_ADD_F32", "int_amdgcn_flat_atomic_fadd", f32>;
16611673
}
16621674

16631675
let OtherPredicates = [HasAtomicFlatPkAdd16Insts] in {
1676+
defm : FlatAtomicIntrPat <"FLAT_ATOMIC_PK_ADD_F16", "int_amdgcn_flat_atomic_fadd", v2f16>;
16641677
defm : FlatAtomicPat <"FLAT_ATOMIC_PK_ADD_F16", "atomic_load_fadd_flat", v2f16>;
16651678
defm : FlatAtomicPat <"FLAT_ATOMIC_PK_ADD_BF16", "atomic_load_fadd_flat", v2bf16>;
16661679
}

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1370,11 +1370,13 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
13701370
MachineMemOperand::MODereferenceable;
13711371
return true;
13721372
}
1373+
case Intrinsic::amdgcn_global_atomic_fadd:
13731374
case Intrinsic::amdgcn_global_atomic_fmin:
13741375
case Intrinsic::amdgcn_global_atomic_fmax:
13751376
case Intrinsic::amdgcn_global_atomic_fmin_num:
13761377
case Intrinsic::amdgcn_global_atomic_fmax_num:
13771378
case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
1379+
case Intrinsic::amdgcn_flat_atomic_fadd:
13781380
case Intrinsic::amdgcn_flat_atomic_fmin:
13791381
case Intrinsic::amdgcn_flat_atomic_fmax:
13801382
case Intrinsic::amdgcn_flat_atomic_fmin_num:
@@ -1488,11 +1490,13 @@ bool SITargetLowering::getAddrModeArguments(IntrinsicInst *II,
14881490
case Intrinsic::amdgcn_ds_consume:
14891491
case Intrinsic::amdgcn_ds_ordered_add:
14901492
case Intrinsic::amdgcn_ds_ordered_swap:
1493+
case Intrinsic::amdgcn_flat_atomic_fadd:
14911494
case Intrinsic::amdgcn_flat_atomic_fmax:
14921495
case Intrinsic::amdgcn_flat_atomic_fmax_num:
14931496
case Intrinsic::amdgcn_flat_atomic_fmin:
14941497
case Intrinsic::amdgcn_flat_atomic_fmin_num:
14951498
case Intrinsic::amdgcn_global_atomic_csub:
1499+
case Intrinsic::amdgcn_global_atomic_fadd:
14961500
case Intrinsic::amdgcn_global_atomic_fmax:
14971501
case Intrinsic::amdgcn_global_atomic_fmax_num:
14981502
case Intrinsic::amdgcn_global_atomic_fmin:

llvm/test/Bitcode/amdgcn-atomic.ll

Lines changed: 0 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -322,36 +322,4 @@ define <2 x i16> @upgrade_amdgcn_global_atomic_fadd_v2bf16_p1(ptr addrspace(1) %
322322
ret <2 x i16> %result
323323
}
324324

325-
declare <2 x half> @llvm.amdgcn.flat.atomic.fadd.v2f16.p0.v2f16(ptr nocapture, <2 x half>) #0
326-
327-
define <2 x half> @upgrade_amdgcn_flat_atomic_fadd_v2f16_p0_v2f16(ptr %ptr, <2 x half> %data) {
328-
; CHECK: %{{.+}} = atomicrmw fadd ptr %ptr, <2 x half> %data syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !{{[0-9]+$}}
329-
%result = call <2 x half> @llvm.amdgcn.flat.atomic.fadd.v2f16.p0.v2f16(ptr %ptr, <2 x half> %data)
330-
ret <2 x half> %result
331-
}
332-
333-
declare <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1.v2f16(ptr addrspace(1) nocapture, <2 x half>) #0
334-
335-
define <2 x half> @upgrade_amdgcn_global_atomic_fadd_v2f16_p1_v2f16(ptr addrspace(1) %ptr, <2 x half> %data) {
336-
; CHECK: %{{.+}} = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %data syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !{{[0-9]+$}}
337-
%result = call <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1.v2f16(ptr addrspace(1) %ptr, <2 x half> %data)
338-
ret <2 x half> %result
339-
}
340-
341-
declare float @llvm.amdgcn.flat.atomic.fadd.f32.p0.f32(ptr nocapture, float) #0
342-
343-
define float @upgrade_amdgcn_flat_atomic_fadd_f32_p0_f32(ptr %ptr, float %data) {
344-
; CHECK: %{{.+}} = atomicrmw fadd ptr %ptr, float %data syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !{{[0-9]+}}, !amdgpu.ignore.denormal.mode !{{[0-9]+$}}
345-
%result = call float @llvm.amdgcn.flat.atomic.fadd.f32.p0.f32(ptr %ptr, float %data)
346-
ret float %result
347-
}
348-
349-
declare float @llvm.amdgcn.global.atomic.fadd.f32.p1.f32(ptr addrspace(1) nocapture, float) #0
350-
351-
define float @upgrade_amdgcn_global_atomic_fadd_f32_p1_f32(ptr addrspace(1) %ptr, float %data) {
352-
; CHECK: %{{.+}} = atomicrmw fadd ptr addrspace(1) %ptr, float %data syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !{{[0-9]+}}, !amdgpu.ignore.denormal.mode !{{[0-9]+$}}
353-
%result = call float @llvm.amdgcn.global.atomic.fadd.f32.p1.f32(ptr addrspace(1) %ptr, float %data)
354-
ret float %result
355-
}
356-
357325
attributes #0 = { argmemonly nounwind willreturn }

llvm/test/CodeGen/AMDGPU/GlobalISel/flat-atomic-fadd.f32.ll

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ define amdgpu_ps void @flat_atomic_fadd_f32_no_rtn_intrinsic(ptr %ptr, float %da
1212
; GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
1313
; GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
1414
; GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
15-
; GFX940-NEXT: FLAT_ATOMIC_ADD_F32 [[REG_SEQUENCE]], [[COPY2]], 0, 0, implicit $exec, implicit $flat_scr :: (load store syncscope("agent") seq_cst (s32) on %ir.ptr)
15+
; GFX940-NEXT: FLAT_ATOMIC_ADD_F32 [[REG_SEQUENCE]], [[COPY2]], 0, 0, implicit $exec, implicit $flat_scr :: (volatile dereferenceable load store (s32) on %ir.ptr)
1616
; GFX940-NEXT: S_ENDPGM 0
1717
;
1818
; GFX11-LABEL: name: flat_atomic_fadd_f32_no_rtn_intrinsic
@@ -23,7 +23,7 @@ define amdgpu_ps void @flat_atomic_fadd_f32_no_rtn_intrinsic(ptr %ptr, float %da
2323
; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
2424
; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
2525
; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
26-
; GFX11-NEXT: FLAT_ATOMIC_ADD_F32 [[REG_SEQUENCE]], [[COPY2]], 0, 0, implicit $exec, implicit $flat_scr :: (load store syncscope("agent") seq_cst (s32) on %ir.ptr)
26+
; GFX11-NEXT: FLAT_ATOMIC_ADD_F32 [[REG_SEQUENCE]], [[COPY2]], 0, 0, implicit $exec, implicit $flat_scr :: (volatile dereferenceable load store (s32) on %ir.ptr)
2727
; GFX11-NEXT: S_ENDPGM 0
2828
%ret = call float @llvm.amdgcn.flat.atomic.fadd.f32.p1.f32(ptr %ptr, float %data)
2929
ret void
@@ -38,7 +38,7 @@ define amdgpu_ps float @flat_atomic_fadd_f32_rtn_intrinsic(ptr %ptr, float %data
3838
; GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
3939
; GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
4040
; GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
41-
; GFX940-NEXT: [[FLAT_ATOMIC_ADD_F32_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_F32_RTN [[REG_SEQUENCE]], [[COPY2]], 0, 1, implicit $exec, implicit $flat_scr :: (load store syncscope("agent") seq_cst (s32) on %ir.ptr)
41+
; GFX940-NEXT: [[FLAT_ATOMIC_ADD_F32_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_F32_RTN [[REG_SEQUENCE]], [[COPY2]], 0, 1, implicit $exec, implicit $flat_scr :: (volatile dereferenceable load store (s32) on %ir.ptr)
4242
; GFX940-NEXT: $vgpr0 = COPY [[FLAT_ATOMIC_ADD_F32_RTN]]
4343
; GFX940-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
4444
;
@@ -50,7 +50,7 @@ define amdgpu_ps float @flat_atomic_fadd_f32_rtn_intrinsic(ptr %ptr, float %data
5050
; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
5151
; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
5252
; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
53-
; GFX11-NEXT: [[FLAT_ATOMIC_ADD_F32_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_F32_RTN [[REG_SEQUENCE]], [[COPY2]], 0, 1, implicit $exec, implicit $flat_scr :: (load store syncscope("agent") seq_cst (s32) on %ir.ptr)
53+
; GFX11-NEXT: [[FLAT_ATOMIC_ADD_F32_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_F32_RTN [[REG_SEQUENCE]], [[COPY2]], 0, 1, implicit $exec, implicit $flat_scr :: (volatile dereferenceable load store (s32) on %ir.ptr)
5454
; GFX11-NEXT: $vgpr0 = COPY [[FLAT_ATOMIC_ADD_F32_RTN]]
5555
; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
5656
%ret = call float @llvm.amdgcn.flat.atomic.fadd.f32.p1.f32(ptr %ptr, float %data)

0 commit comments

Comments
 (0)