Skip to content

Commit 2bf5ae6

Browse files
arsenmronlieb
authored andcommitted
AMDGPU: Remove global/flat atomic fadd intrinics (llvm#97051)
These have been replaced with atomicrmw. Change-Id: If94fed28e8be6fd8be51964adc6082ce512ae79b
1 parent 1765d73 commit 2bf5ae6

39 files changed

+396
-2241
lines changed

clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp

Lines changed: 26 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -2890,51 +2890,47 @@ CGOpenMPRuntimeGPU::emitFastFPAtomicCall(CodeGenFunction &CGF, LValue X,
28902890
RValue Update, BinaryOperatorKind BO,
28912891
bool IsXBinopExpr) {
28922892
CGBuilderTy &Bld = CGF.Builder;
2893-
unsigned int IID = -1;
2894-
RValue UpdateFixed = Update;
2893+
llvm::AtomicRMWInst::BinOp Kind = llvm::AtomicRMWInst::FAdd;
28952894
switch (BO) {
28962895
case BO_Sub:
2897-
UpdateFixed = RValue::get(Bld.CreateFNeg(Update.getScalarVal()));
2898-
IID = llvm::Intrinsic::amdgcn_flat_atomic_fadd;
2896+
Kind = llvm::AtomicRMWInst::FSub;
28992897
break;
29002898
case BO_Add:
2901-
IID = llvm::Intrinsic::amdgcn_flat_atomic_fadd;
2899+
Kind = llvm::AtomicRMWInst::FAdd;
29022900
break;
29032901
case BO_LT:
2904-
IID = IsXBinopExpr ? llvm::Intrinsic::amdgcn_flat_atomic_fmax
2905-
: llvm::Intrinsic::amdgcn_flat_atomic_fmin;
2902+
Kind = IsXBinopExpr ? llvm::AtomicRMWInst::FMax : llvm::AtomicRMWInst::FMin;
29062903
break;
29072904
case BO_GT:
2908-
IID = IsXBinopExpr ? llvm::Intrinsic::amdgcn_flat_atomic_fmin
2909-
: llvm::Intrinsic::amdgcn_flat_atomic_fmax;
2905+
Kind = IsXBinopExpr ? llvm::AtomicRMWInst::FMin : llvm::AtomicRMWInst::FMax;
29102906
break;
29112907
default:
29122908
// remaining operations are not supported yet
29132909
return std::make_pair(false, RValue::get(nullptr));
29142910
}
29152911

2916-
SmallVector<llvm::Value *> FPAtomicArgs;
2917-
FPAtomicArgs.reserve(2);
2918-
FPAtomicArgs.push_back(X.getPointer(CGF));
2919-
FPAtomicArgs.push_back(UpdateFixed.getScalarVal());
2912+
llvm::Value *UpdateVal = Update.getScalarVal();
2913+
2914+
// The scope of the atomic, currently set to 'agent'. By default, if this
2915+
// scope is not specified the scope will be 'system' scope.
2916+
llvm::SyncScope::ID SSID =
2917+
CGM.getLLVMContext().getOrInsertSyncScopeID("agent");
2918+
llvm::AtomicRMWInst *CallInst = Bld.CreateAtomicRMW(
2919+
Kind, X.getAddress(), UpdateVal, llvm::AtomicOrdering::Monotonic, SSID);
2920+
2921+
// The following settings are used to get the atomicrmw instruction to
2922+
// be closer in spirit to the previous use of the intrinsic.
2923+
// Setting of amdgpu.no.fine.grained.memory property
2924+
llvm::MDTuple *EmptyMD = llvm::MDNode::get(CGM.getLLVMContext(), {});
2925+
CallInst->setMetadata("amdgpu.no.fine.grained.memory", EmptyMD);
2926+
2927+
// Setting of amdgpu.ignore.denormal.mode
2928+
if (Kind == llvm::AtomicRMWInst::FAdd && UpdateVal->getType()->isFloatTy())
2929+
CallInst->setMetadata("amdgpu.ignore.denormal.mode", EmptyMD);
2930+
2931+
// Note: breaks fp_atomics test so volatile cannot be used
2932+
// CallInst->setVolatile(true);
29202933

2921-
llvm::Value *CallInst = nullptr;
2922-
if (Update.getScalarVal()->getType()->isFloatTy() &&
2923-
(getOffloadArch(CGF.CGM) == OffloadArch::GFX90a)) {
2924-
// Fast FP atomics are not available for single precision address located in
2925-
// FLAT address space.
2926-
// We need to check the address space at runtime to determine
2927-
// which function we can call. This is done in the OpenMP runtime.
2928-
CallInst =
2929-
CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
2930-
CGM.getModule(), OMPRTL___kmpc_unsafeAtomicAdd),
2931-
FPAtomicArgs);
2932-
} else {
2933-
llvm::Function *AtomicF = CGM.getIntrinsic(
2934-
IID, {FPAtomicArgs[1]->getType(), FPAtomicArgs[0]->getType(),
2935-
FPAtomicArgs[1]->getType()});
2936-
CallInst = CGF.EmitNounwindRuntimeCall(AtomicF, FPAtomicArgs);
2937-
}
29382934
return std::make_pair(true, RValue::get(CallInst));
29392935
}
29402936

clang/test/OpenMP/amdgcn_target_fast_fp_apu.cpp

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -52,7 +52,7 @@ int main(){
5252
// CHECK-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1
5353
// CHECK-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
5454
// CHECK: user_code.entry:
55-
// CHECK-NEXT: [[TMP2:%.*]] = call float @llvm.amdgcn.flat.atomic.fadd.f32.p0.f32(ptr [[TMP0]], float 1.000000e+00) #[[ATTR2:[0-9]+]]
55+
// CHECK-NEXT: [[TMP2:%.*]] = atomicrmw fadd ptr [[TMP0]], float 1.000000e+00 syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META11:![0-9]+]], !amdgpu.ignore.denormal.mode [[META11]]
5656
// CHECK-NEXT: call void @__kmpc_target_deinit()
5757
// CHECK-NEXT: ret void
5858
// CHECK: worker.exit:
@@ -73,7 +73,7 @@ int main(){
7373
// CHECK-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1
7474
// CHECK-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
7575
// CHECK: user_code.entry:
76-
// CHECK-NEXT: [[TMP2:%.*]] = call float @llvm.amdgcn.flat.atomic.fadd.f32.p0.f32(ptr [[TMP0]], float 1.000000e+00) #[[ATTR2]]
76+
// CHECK-NEXT: [[TMP2:%.*]] = atomicrmw fadd ptr [[TMP0]], float 1.000000e+00 syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META11]], !amdgpu.ignore.denormal.mode [[META11]]
7777
// CHECK-NEXT: call void @__kmpc_target_deinit()
7878
// CHECK-NEXT: ret void
7979
// CHECK: worker.exit:
@@ -94,9 +94,12 @@ int main(){
9494
// CHECK-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1
9595
// CHECK-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
9696
// CHECK: user_code.entry:
97-
// CHECK-NEXT: [[TMP2:%.*]] = call float @llvm.amdgcn.flat.atomic.fadd.f32.p0.f32(ptr [[TMP0]], float 1.000000e+00) #[[ATTR2]]
97+
// CHECK-NEXT: [[TMP2:%.*]] = atomicrmw fadd ptr [[TMP0]], float 1.000000e+00 syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META11]], !amdgpu.ignore.denormal.mode [[META11]]
9898
// CHECK-NEXT: call void @__kmpc_target_deinit()
9999
// CHECK-NEXT: ret void
100100
// CHECK: worker.exit:
101101
// CHECK-NEXT: ret void
102102
//
103+
//.
104+
// CHECK: [[META11]] = !{}
105+
//.

clang/test/OpenMP/amdgcn_usm_atomics_hint.cpp

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@ double test_amdgcn_target_atomic_hints() {
2626

2727
#pragma omp target teams distribute parallel for map(tofrom:a,b)
2828
for (int i = 0; i < N; i++) {
29-
// CHECK-HINTS: call {{.*}} @llvm.amdgcn.flat.atomic.fadd.f64.p0.f64
29+
// CHECK-HINTS: = atomicrmw fadd
3030
#pragma omp atomic hint(amd_fast_fp_atomics)
3131
a+=(double)i;
3232

@@ -49,11 +49,11 @@ double test_amdgcn_target_atomic_unsafe_opt() {
4949

5050
#pragma omp target teams distribute parallel for map(tofrom:a,b,c)
5151
for (int i = 0; i < N; i++) {
52-
// CHECK-FLAG-UNSAFE: call {{.*}} @llvm.amdgcn.flat.atomic.fadd.f64.p0.f64
52+
// CHECK-FLAG-UNSAFE: = atomicrmw fadd
5353
#pragma omp atomic
5454
a+=(double)i;
5555

56-
// CHECK-FLAG-UNSAFE: call {{.*}} @llvm.amdgcn.flat.atomic.fadd.f64.p0.f64
56+
// CHECK-FLAG-UNSAFE: = atomicrmw fadd
5757
#pragma omp atomic hint(amd_fast_fp_atomics)
5858
b+=(double)i;
5959

llvm/docs/ReleaseNotes.rst

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -78,6 +78,11 @@ Changes to the AArch64 Backend
7878
Changes to the AMDGPU Backend
7979
-----------------------------
8080

81+
* Removed ``llvm.amdgcn.flat.atomic.fadd`` and
82+
``llvm.amdgcn.global.atomic.fadd`` intrinsics. Users should use the
83+
:ref:`atomicrmw <i_atomicrmw>` instruction with `fadd` and
84+
addrspace(0) or addrspace(1) instead.
85+
8186
Changes to the ARM Backend
8287
--------------------------
8388

llvm/include/llvm/IR/IntrinsicsAMDGPU.td

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3023,8 +3023,6 @@ def int_amdgcn_dot4_f32_bf8_bf8 : AMDGPU8bitFloatDot4Intrinsic;
30233023
// gfx908 intrinsics
30243024
// ===----------------------------------------------------------------------===//
30253025

3026-
def int_amdgcn_global_atomic_fadd : AMDGPUAtomicRtn<llvm_anyfloat_ty>;
3027-
30283026
// llvm.amdgcn.mfma.*.* vdst, srcA, srcB, srcC, cbsz, abid, blgp
30293027
class AMDGPUMfmaIntrinsic<LLVMType DestTy, LLVMType SrcABTy> :
30303028
ClangBuiltin<!subst("int", "__builtin", NAME)>,
@@ -3063,7 +3061,6 @@ def int_amdgcn_mfma_f32_16x16x8bf16 : AMDGPUMfmaIntrinsic<llvm_v4f32_ty, llvm_v
30633061

30643062
def int_amdgcn_global_atomic_fmin : AMDGPUAtomicRtn<llvm_anyfloat_ty>;
30653063
def int_amdgcn_global_atomic_fmax : AMDGPUAtomicRtn<llvm_anyfloat_ty>;
3066-
def int_amdgcn_flat_atomic_fadd : AMDGPUAtomicRtn<llvm_anyfloat_ty>;
30673064
def int_amdgcn_flat_atomic_fmin : AMDGPUAtomicRtn<llvm_anyfloat_ty>;
30683065
def int_amdgcn_flat_atomic_fmax : AMDGPUAtomicRtn<llvm_anyfloat_ty>;
30693066

llvm/lib/IR/AutoUpgrade.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1035,8 +1035,8 @@ static bool upgradeIntrinsicFunction1(Function *F, Function *&NewFn,
10351035

10361036
if (Name.starts_with("ds.fadd") || Name.starts_with("ds.fmin") ||
10371037
Name.starts_with("ds.fmax") ||
1038-
Name.starts_with("global.atomic.fadd.v2bf16") ||
1039-
Name.starts_with("flat.atomic.fadd.v2bf16")) {
1038+
Name.starts_with("global.atomic.fadd") ||
1039+
Name.starts_with("flat.atomic.fadd")) {
10401040
// Replaced with atomicrmw fadd/fmin/fmax, so there's no new
10411041
// declaration.
10421042
NewFn = nullptr;

llvm/lib/Target/AMDGPU/AMDGPUInstructions.td

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -618,16 +618,11 @@ multiclass local_addr_space_atomic_op {
618618
}
619619
}
620620

621-
defm int_amdgcn_flat_atomic_fadd : noret_op;
622-
defm int_amdgcn_flat_atomic_fadd : flat_addr_space_atomic_op;
623621
defm int_amdgcn_flat_atomic_fmin : noret_op;
624622
defm int_amdgcn_flat_atomic_fmax : noret_op;
625-
defm int_amdgcn_global_atomic_fadd : global_addr_space_atomic_op;
626-
defm int_amdgcn_flat_atomic_fadd : global_addr_space_atomic_op;
627623
defm int_amdgcn_global_atomic_fmin : noret_op;
628624
defm int_amdgcn_global_atomic_fmax : noret_op;
629625
defm int_amdgcn_global_atomic_csub : noret_op;
630-
defm int_amdgcn_flat_atomic_fadd : local_addr_space_atomic_op;
631626
defm int_amdgcn_global_atomic_ordered_add_b64 : noret_op;
632627
defm int_amdgcn_flat_atomic_fmin_num : noret_op;
633628
defm int_amdgcn_flat_atomic_fmax_num : noret_op;

llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4914,13 +4914,11 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
49144914
OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
49154915
break;
49164916
}
4917-
case Intrinsic::amdgcn_global_atomic_fadd:
49184917
case Intrinsic::amdgcn_global_atomic_csub:
49194918
case Intrinsic::amdgcn_global_atomic_fmin:
49204919
case Intrinsic::amdgcn_global_atomic_fmax:
49214920
case Intrinsic::amdgcn_global_atomic_fmin_num:
49224921
case Intrinsic::amdgcn_global_atomic_fmax_num:
4923-
case Intrinsic::amdgcn_flat_atomic_fadd:
49244922
case Intrinsic::amdgcn_flat_atomic_fmin:
49254923
case Intrinsic::amdgcn_flat_atomic_fmax:
49264924
case Intrinsic::amdgcn_flat_atomic_fmin_num:

llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -239,13 +239,11 @@ def : SourceOfDivergence<int_r600_read_tidig_y>;
239239
def : SourceOfDivergence<int_r600_read_tidig_z>;
240240
def : SourceOfDivergence<int_amdgcn_atomic_cond_sub_u32>;
241241
def : SourceOfDivergence<int_amdgcn_global_atomic_csub>;
242-
def : SourceOfDivergence<int_amdgcn_global_atomic_fadd>;
243242
def : SourceOfDivergence<int_amdgcn_global_atomic_fmin>;
244243
def : SourceOfDivergence<int_amdgcn_global_atomic_fmax>;
245244
def : SourceOfDivergence<int_amdgcn_global_atomic_fmin_num>;
246245
def : SourceOfDivergence<int_amdgcn_global_atomic_fmax_num>;
247246
def : SourceOfDivergence<int_amdgcn_global_atomic_ordered_add_b64>;
248-
def : SourceOfDivergence<int_amdgcn_flat_atomic_fadd>;
249247
def : SourceOfDivergence<int_amdgcn_flat_atomic_fmin>;
250248
def : SourceOfDivergence<int_amdgcn_flat_atomic_fmax>;
251249
def : SourceOfDivergence<int_amdgcn_flat_atomic_fmin_num>;

llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1045,7 +1045,6 @@ bool GCNTTIImpl::collectFlatAddressOperands(SmallVectorImpl<int> &OpIndexes,
10451045
switch (IID) {
10461046
case Intrinsic::amdgcn_is_shared:
10471047
case Intrinsic::amdgcn_is_private:
1048-
case Intrinsic::amdgcn_flat_atomic_fadd:
10491048
case Intrinsic::amdgcn_flat_atomic_fmax:
10501049
case Intrinsic::amdgcn_flat_atomic_fmin:
10511050
case Intrinsic::amdgcn_flat_atomic_fmax_num:
@@ -1107,7 +1106,6 @@ Value *GCNTTIImpl::rewriteIntrinsicWithAddressSpace(IntrinsicInst *II,
11071106
return B.CreateIntrinsic(Intrinsic::ptrmask, {NewV->getType(), MaskTy},
11081107
{NewV, MaskOp});
11091108
}
1110-
case Intrinsic::amdgcn_flat_atomic_fadd:
11111109
case Intrinsic::amdgcn_flat_atomic_fmax:
11121110
case Intrinsic::amdgcn_flat_atomic_fmin:
11131111
case Intrinsic::amdgcn_flat_atomic_fmax_num:

llvm/lib/Target/AMDGPU/DSInstructions.td

Lines changed: 1 addition & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1135,11 +1135,7 @@ class DSAtomicRetPatIntrinsic<DS_Pseudo inst, ValueType vt, PatFrag frag,
11351135
(vt (frag (DS1Addr1Offset i32:$ptr, i32:$offset), vt:$value)),
11361136
(inst $ptr, getVregSrcForVT<vt>.ret:$value, Offset:$offset, (i1 gds))> {
11371137
}
1138-
1139-
def : DSAtomicRetPatIntrinsic<DS_ADD_RTN_F64, f64, int_amdgcn_flat_atomic_fadd_local_addrspace>;
1140-
let AddedComplexity = 1 in
1141-
def : DSAtomicRetPatIntrinsic<DS_ADD_F64, f64, int_amdgcn_flat_atomic_fadd_noret_local_addrspace>;
1142-
}
1138+
} // End SubtargetPredicate = HasLdsAtomicAddF64
11431139

11441140
let SubtargetPredicate = HasAtomicDsPkAdd16Insts in {
11451141
defm : DSAtomicRetNoRetPat_mc<DS_PK_ADD_RTN_F16, DS_PK_ADD_F16, v2f16, "atomic_load_fadd">;

llvm/lib/Target/AMDGPU/FLATInstructions.td

Lines changed: 0 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1625,25 +1625,17 @@ let OtherPredicates = [isGFX12Only] in {
16251625

16261626
let OtherPredicates = [HasAtomicFaddNoRtnInsts] in {
16271627
defm : GlobalFLATAtomicPatsNoRtn <"GLOBAL_ATOMIC_ADD_F32", "atomic_load_fadd_global", f32>;
1628-
defm : GlobalFLATAtomicPatsNoRtnWithAddrSpace <"GLOBAL_ATOMIC_ADD_F32", "int_amdgcn_flat_atomic_fadd", "global_addrspace", f32>;
1629-
defm : GlobalFLATAtomicPatsNoRtnWithAddrSpace <"GLOBAL_ATOMIC_ADD_F32", "int_amdgcn_global_atomic_fadd", "global_addrspace", f32>;
16301628
}
16311629

16321630
let OtherPredicates = [HasAtomicBufferGlobalPkAddF16NoRtnInsts] in {
16331631
defm : GlobalFLATAtomicPatsNoRtn <"GLOBAL_ATOMIC_PK_ADD_F16", "atomic_load_fadd_global", v2f16>;
1634-
defm : GlobalFLATAtomicPatsNoRtnWithAddrSpace <"GLOBAL_ATOMIC_PK_ADD_F16", "int_amdgcn_flat_atomic_fadd", "global_addrspace", v2f16>;
1635-
defm : GlobalFLATAtomicPatsNoRtnWithAddrSpace <"GLOBAL_ATOMIC_PK_ADD_F16", "int_amdgcn_global_atomic_fadd", "global_addrspace", v2f16>;
16361632
}
16371633

16381634
let OtherPredicates = [HasAtomicFaddRtnInsts] in {
16391635
defm : GlobalFLATAtomicPatsRtn <"GLOBAL_ATOMIC_ADD_F32", "atomic_load_fadd_global", f32>;
1640-
defm : GlobalFLATAtomicPatsRtnWithAddrSpace <"GLOBAL_ATOMIC_ADD_F32", "int_amdgcn_flat_atomic_fadd", "global_addrspace", f32>;
1641-
defm : GlobalFLATAtomicPatsRtnWithAddrSpace <"GLOBAL_ATOMIC_ADD_F32", "int_amdgcn_global_atomic_fadd", "global_addrspace", f32>;
16421636
}
16431637

16441638
let OtherPredicates = [HasAtomicBufferGlobalPkAddF16Insts] in {
1645-
defm : GlobalFLATAtomicPatsRtnWithAddrSpace <"GLOBAL_ATOMIC_PK_ADD_F16", "int_amdgcn_flat_atomic_fadd", "global_addrspace", v2f16>;
1646-
defm : GlobalFLATAtomicPatsRtnWithAddrSpace <"GLOBAL_ATOMIC_PK_ADD_F16", "int_amdgcn_global_atomic_fadd", "global_addrspace", v2f16>;
16471639
defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_PK_ADD_F16", "atomic_load_fadd_global", v2f16>;
16481640
}
16491641

@@ -1661,19 +1653,14 @@ defm : FlatAtomicIntrPat <"FLAT_ATOMIC_MAX_F64", "int_amdgcn_flat_atomic_fmax",
16611653

16621654
let OtherPredicates = [HasFlatBufferGlobalAtomicFaddF64Inst] in {
16631655
defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_ADD_F64", "atomic_load_fadd_global", f64>;
1664-
defm : GlobalFLATAtomicPatsWithAddrSpace<"GLOBAL_ATOMIC_ADD_F64", "int_amdgcn_flat_atomic_fadd", "global_addrspace", f64>;
1665-
defm : GlobalFLATAtomicPatsWithAddrSpace<"GLOBAL_ATOMIC_ADD_F64", "int_amdgcn_global_atomic_fadd", "global_addrspace", f64>;
16661656
defm : FlatAtomicPat <"FLAT_ATOMIC_ADD_F64", "atomic_load_fadd_flat", f64>;
1667-
defm : FlatAtomicIntrPat <"FLAT_ATOMIC_ADD_F64", "int_amdgcn_flat_atomic_fadd", f64>;
16681657
}
16691658

16701659
let OtherPredicates = [HasFlatAtomicFaddF32Inst] in {
16711660
defm : FlatAtomicPat <"FLAT_ATOMIC_ADD_F32", "atomic_load_fadd_flat", f32>;
1672-
defm : FlatAtomicIntrPat <"FLAT_ATOMIC_ADD_F32", "int_amdgcn_flat_atomic_fadd", f32>;
16731661
}
16741662

16751663
let OtherPredicates = [HasAtomicFlatPkAdd16Insts] in {
1676-
defm : FlatAtomicIntrPat <"FLAT_ATOMIC_PK_ADD_F16", "int_amdgcn_flat_atomic_fadd", v2f16>;
16771664
defm : FlatAtomicPat <"FLAT_ATOMIC_PK_ADD_F16", "atomic_load_fadd_flat", v2f16>;
16781665
defm : FlatAtomicPat <"FLAT_ATOMIC_PK_ADD_BF16", "atomic_load_fadd_flat", v2bf16>;
16791666
}

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1370,13 +1370,11 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
13701370
MachineMemOperand::MODereferenceable;
13711371
return true;
13721372
}
1373-
case Intrinsic::amdgcn_global_atomic_fadd:
13741373
case Intrinsic::amdgcn_global_atomic_fmin:
13751374
case Intrinsic::amdgcn_global_atomic_fmax:
13761375
case Intrinsic::amdgcn_global_atomic_fmin_num:
13771376
case Intrinsic::amdgcn_global_atomic_fmax_num:
13781377
case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
1379-
case Intrinsic::amdgcn_flat_atomic_fadd:
13801378
case Intrinsic::amdgcn_flat_atomic_fmin:
13811379
case Intrinsic::amdgcn_flat_atomic_fmax:
13821380
case Intrinsic::amdgcn_flat_atomic_fmin_num:
@@ -1490,13 +1488,11 @@ bool SITargetLowering::getAddrModeArguments(IntrinsicInst *II,
14901488
case Intrinsic::amdgcn_ds_consume:
14911489
case Intrinsic::amdgcn_ds_ordered_add:
14921490
case Intrinsic::amdgcn_ds_ordered_swap:
1493-
case Intrinsic::amdgcn_flat_atomic_fadd:
14941491
case Intrinsic::amdgcn_flat_atomic_fmax:
14951492
case Intrinsic::amdgcn_flat_atomic_fmax_num:
14961493
case Intrinsic::amdgcn_flat_atomic_fmin:
14971494
case Intrinsic::amdgcn_flat_atomic_fmin_num:
14981495
case Intrinsic::amdgcn_global_atomic_csub:
1499-
case Intrinsic::amdgcn_global_atomic_fadd:
15001496
case Intrinsic::amdgcn_global_atomic_fmax:
15011497
case Intrinsic::amdgcn_global_atomic_fmax_num:
15021498
case Intrinsic::amdgcn_global_atomic_fmin:

llvm/test/Bitcode/amdgcn-atomic.ll

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -322,4 +322,36 @@ define <2 x i16> @upgrade_amdgcn_global_atomic_fadd_v2bf16_p1(ptr addrspace(1) %
322322
ret <2 x i16> %result
323323
}
324324

325+
declare <2 x half> @llvm.amdgcn.flat.atomic.fadd.v2f16.p0.v2f16(ptr nocapture, <2 x half>) #0
326+
327+
define <2 x half> @upgrade_amdgcn_flat_atomic_fadd_v2f16_p0_v2f16(ptr %ptr, <2 x half> %data) {
328+
; CHECK: %{{.+}} = atomicrmw fadd ptr %ptr, <2 x half> %data syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !{{[0-9]+$}}
329+
%result = call <2 x half> @llvm.amdgcn.flat.atomic.fadd.v2f16.p0.v2f16(ptr %ptr, <2 x half> %data)
330+
ret <2 x half> %result
331+
}
332+
333+
declare <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1.v2f16(ptr addrspace(1) nocapture, <2 x half>) #0
334+
335+
define <2 x half> @upgrade_amdgcn_global_atomic_fadd_v2f16_p1_v2f16(ptr addrspace(1) %ptr, <2 x half> %data) {
336+
; CHECK: %{{.+}} = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %data syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !{{[0-9]+$}}
337+
%result = call <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1.v2f16(ptr addrspace(1) %ptr, <2 x half> %data)
338+
ret <2 x half> %result
339+
}
340+
341+
declare float @llvm.amdgcn.flat.atomic.fadd.f32.p0.f32(ptr nocapture, float) #0
342+
343+
define float @upgrade_amdgcn_flat_atomic_fadd_f32_p0_f32(ptr %ptr, float %data) {
344+
; CHECK: %{{.+}} = atomicrmw fadd ptr %ptr, float %data syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !{{[0-9]+}}, !amdgpu.ignore.denormal.mode !{{[0-9]+$}}
345+
%result = call float @llvm.amdgcn.flat.atomic.fadd.f32.p0.f32(ptr %ptr, float %data)
346+
ret float %result
347+
}
348+
349+
declare float @llvm.amdgcn.global.atomic.fadd.f32.p1.f32(ptr addrspace(1) nocapture, float) #0
350+
351+
define float @upgrade_amdgcn_global_atomic_fadd_f32_p1_f32(ptr addrspace(1) %ptr, float %data) {
352+
; CHECK: %{{.+}} = atomicrmw fadd ptr addrspace(1) %ptr, float %data syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !{{[0-9]+}}, !amdgpu.ignore.denormal.mode !{{[0-9]+$}}
353+
%result = call float @llvm.amdgcn.global.atomic.fadd.f32.p1.f32(ptr addrspace(1) %ptr, float %data)
354+
ret float %result
355+
}
356+
325357
attributes #0 = { argmemonly nounwind willreturn }

0 commit comments

Comments
 (0)