Skip to content

Commit 5b6db43

Browse files
authored
AMDGPU: Simplify DS atomicrmw fadd handling (llvm#89468)
DS atomic fadd F32 does respect the denormal mode, so we do not need to consider the expected FP mode or unsafe-fp-atomics attribute. They don't respect the rounding mode, but we don't care outside of strictfp. This also reveals the fp-mode-is-flush check has been missing in the cases that should be considering it alongside amdgpu-unsafe-fp-atomics. This also stops considering the case where flushing is enabled for f64, as flushing isn't mandated and we barely handle this case.
1 parent 087b33b commit 5b6db43

File tree

9 files changed

+342
-155
lines changed

9 files changed

+342
-155
lines changed

llvm/lib/Target/AMDGPU/AMDGPU.td

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1896,7 +1896,7 @@ def HasVINTERPEncoding : Predicate<"Subtarget->hasVINTERPEncoding()">,
18961896
def HasDSAddTid : Predicate<"Subtarget->getGeneration() >= AMDGPUSubtarget::GFX9">,
18971897
AssemblerPredicate<(all_of FeatureGFX9Insts)>;
18981898

1899-
def HasLDSFPAtomicAdd : Predicate<"Subtarget->hasLDSFPAtomicAdd()">,
1899+
def HasLDSFPAtomicAddF32 : Predicate<"Subtarget->hasLDSFPAtomicAddF32()">,
19001900
AssemblerPredicate<(all_of FeatureGFX8Insts)>;
19011901

19021902
def HasAddNoCarryInsts : Predicate<"Subtarget->hasAddNoCarry()">,

llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1624,7 +1624,7 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
16241624
}
16251625

16261626
auto &Atomic = getActionDefinitionsBuilder(G_ATOMICRMW_FADD);
1627-
if (ST.hasLDSFPAtomicAdd()) {
1627+
if (ST.hasLDSFPAtomicAddF32()) {
16281628
Atomic.legalFor({{S32, LocalPtr}, {S32, RegionPtr}});
16291629
if (ST.hasLdsAtomicAddF64())
16301630
Atomic.legalFor({{S64, LocalPtr}});

llvm/lib/Target/AMDGPU/DSInstructions.td

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -443,7 +443,7 @@ defm DS_AND_B32 : DS_1A1D_NORET_mc<"ds_and_b32">;
443443
defm DS_OR_B32 : DS_1A1D_NORET_mc<"ds_or_b32">;
444444
defm DS_XOR_B32 : DS_1A1D_NORET_mc<"ds_xor_b32">;
445445

446-
let SubtargetPredicate = HasLDSFPAtomicAdd in {
446+
let SubtargetPredicate = HasLDSFPAtomicAddF32 in {
447447
defm DS_ADD_F32 : DS_1A1D_NORET_mc<"ds_add_f32">;
448448
}
449449

@@ -523,7 +523,7 @@ defm DS_MAX_F64 : DS_1A1D_NORET_mc<"ds_max_f64", VReg_64>;
523523

524524
defm DS_ADD_RTN_U32 : DS_1A1D_RET_mc<"ds_add_rtn_u32", VGPR_32>;
525525

526-
let SubtargetPredicate = HasLDSFPAtomicAdd in {
526+
let SubtargetPredicate = HasLDSFPAtomicAddF32 in {
527527
defm DS_ADD_RTN_F32 : DS_1A1D_RET_mc<"ds_add_rtn_f32", VGPR_32>;
528528
}
529529
defm DS_SUB_RTN_U32 : DS_1A1D_RET_mc<"ds_sub_rtn_u32", VGPR_32>;
@@ -697,7 +697,7 @@ def DS_BPERMUTE_B32 : DS_1A1D_PERMUTE <"ds_bpermute_b32",
697697

698698
} // let SubtargetPredicate = isGFX8Plus
699699

700-
let SubtargetPredicate = HasLDSFPAtomicAdd, OtherPredicates = [HasDsSrc2Insts] in {
700+
let SubtargetPredicate = HasLDSFPAtomicAddF32, OtherPredicates = [HasDsSrc2Insts] in {
701701
def DS_ADD_SRC2_F32 : DS_1A<"ds_add_src2_f32">;
702702
}
703703

@@ -1088,7 +1088,7 @@ let SubtargetPredicate = isGFX11Plus in {
10881088
defm : DSAtomicCmpXChg_mc<DS_CMPSTORE_RTN_B32, DS_CMPSTORE_B32, i32, "atomic_cmp_swap">;
10891089
}
10901090

1091-
let SubtargetPredicate = HasLDSFPAtomicAdd in {
1091+
let SubtargetPredicate = HasLDSFPAtomicAddF32 in {
10921092
defm : DSAtomicRetNoRetPat_mc<DS_ADD_RTN_F32, DS_ADD_F32, f32, "atomic_load_fadd">;
10931093
}
10941094

llvm/lib/Target/AMDGPU/GCNSubtarget.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -960,7 +960,8 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
960960
return HasScalarAtomics;
961961
}
962962

963-
bool hasLDSFPAtomicAdd() const { return GFX8Insts; }
963+
bool hasLDSFPAtomicAddF32() const { return GFX8Insts; }
964+
bool hasLDSFPAtomicAddF64() const { return GFX90AInsts; }
964965

965966
/// \returns true if the subtarget has the v_permlanex16_b32 instruction.
966967
bool hasPermLaneX16() const { return getGeneration() >= GFX10; }

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 28 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -15977,6 +15977,8 @@ bool SITargetLowering::isKnownNeverNaNForTargetNode(SDValue Op,
1597715977
SNaN, Depth);
1597815978
}
1597915979

15980+
#if 0
15981+
// FIXME: This should be checked before unsafe fp atomics are enabled
1598015982
// Global FP atomic instructions have a hardcoded FP mode and do not support
1598115983
// FP32 denormals, and only support v2f16 denormals.
1598215984
static bool fpModeMatchesGlobalFPAtomicMode(const AtomicRMWInst *RMW) {
@@ -15986,6 +15988,7 @@ static bool fpModeMatchesGlobalFPAtomicMode(const AtomicRMWInst *RMW) {
1598615988
return DenormMode == DenormalMode::getPreserveSign();
1598715989
return DenormMode == DenormalMode::getIEEE();
1598815990
}
15991+
#endif
1598915992

1599015993
// The amdgpu-unsafe-fp-atomics attribute enables generation of unsafe
1599115994
// floating point atomic instructions. May generate more efficient code,
@@ -16046,8 +16049,31 @@ SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
1604616049
case AtomicRMWInst::FAdd: {
1604716050
Type *Ty = RMW->getType();
1604816051

16049-
if (Ty->isHalfTy())
16052+
// TODO: Handle REGION_ADDRESS
16053+
if (AS == AMDGPUAS::LOCAL_ADDRESS) {
16054+
// DS F32 FP atomics do respect the denormal mode, but the rounding mode
16055+
// is fixed to round-to-nearest-even.
16056+
//
16057+
// F64 / PK_F16 / PK_BF16 never flush and are also fixed to
16058+
// round-to-nearest-even.
16059+
//
16060+
// We ignore the rounding mode problem, even in strictfp. The C++ standard
16061+
// suggests it is OK if the floating-point mode may not match the calling
16062+
// thread.
16063+
if (Ty->isFloatTy()) {
16064+
return Subtarget->hasLDSFPAtomicAddF32() ? AtomicExpansionKind::None
16065+
: AtomicExpansionKind::CmpXChg;
16066+
}
16067+
16068+
if (Ty->isDoubleTy()) {
16069+
// Ignores denormal mode, but we don't consider flushing mandatory.
16070+
return Subtarget->hasLDSFPAtomicAddF64() ? AtomicExpansionKind::None
16071+
: AtomicExpansionKind::CmpXChg;
16072+
}
16073+
16074+
// TODO: Handle v2f16/v2bf16 cases for gfx940
1605016075
return AtomicExpansionKind::CmpXChg;
16076+
}
1605116077

1605216078
if (!Ty->isFloatTy() && (!Subtarget->hasGFX90AInsts() || !Ty->isDoubleTy()))
1605316079
return AtomicExpansionKind::CmpXChg;
@@ -16091,7 +16117,7 @@ SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
1609116117
// space. If it is in global address space, we emit the global atomic
1609216118
// fadd; if it is in shared address space, we emit the LDS atomic fadd.
1609316119
if (AS == AMDGPUAS::FLAT_ADDRESS && Ty->isFloatTy() &&
16094-
Subtarget->hasLDSFPAtomicAdd()) {
16120+
Subtarget->hasLDSFPAtomicAddF32()) {
1609516121
if (RMW->use_empty() && Subtarget->hasAtomicFaddNoRtnInsts())
1609616122
return AtomicExpansionKind::Expand;
1609716123
if (!RMW->use_empty() && Subtarget->hasAtomicFaddRtnInsts())
@@ -16101,23 +16127,6 @@ SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
1610116127
return AtomicExpansionKind::CmpXChg;
1610216128
}
1610316129

16104-
// DS FP atomics do respect the denormal mode, but the rounding mode is
16105-
// fixed to round-to-nearest-even.
16106-
// The only exception is DS_ADD_F64 which never flushes regardless of mode.
16107-
if (AS == AMDGPUAS::LOCAL_ADDRESS && Subtarget->hasLDSFPAtomicAdd()) {
16108-
if (!Ty->isDoubleTy())
16109-
return AtomicExpansionKind::None;
16110-
16111-
if (fpModeMatchesGlobalFPAtomicMode(RMW))
16112-
return AtomicExpansionKind::None;
16113-
16114-
return RMW->getFunction()
16115-
->getFnAttribute("amdgpu-unsafe-fp-atomics")
16116-
.getValueAsString() == "true"
16117-
? ReportUnsafeHWInst(AtomicExpansionKind::None)
16118-
: AtomicExpansionKind::CmpXChg;
16119-
}
16120-
1612116130
return AtomicExpansionKind::CmpXChg;
1612216131
}
1612316132
case AtomicRMWInst::FMin:

llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll

Lines changed: 8 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -2074,28 +2074,17 @@ define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat_flush_safe(ptr addrsp
20742074
; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s4, v0
20752075
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
20762076
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
2077-
; GFX90A-NEXT: s_cbranch_execz .LBB67_3
2077+
; GFX90A-NEXT: s_cbranch_execz .LBB67_2
20782078
; GFX90A-NEXT: ; %bb.1:
20792079
; GFX90A-NEXT: s_load_dword s0, s[0:1], 0x24
20802080
; GFX90A-NEXT: s_bcnt1_i32_b64 s1, s[2:3]
20812081
; GFX90A-NEXT: v_cvt_f64_u32_e32 v[0:1], s1
20822082
; GFX90A-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0
20832083
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
2084-
; GFX90A-NEXT: v_mov_b32_e32 v4, s0
2085-
; GFX90A-NEXT: ds_read_b64 v[2:3], v4
2086-
; GFX90A-NEXT: s_mov_b64 s[0:1], 0
2087-
; GFX90A-NEXT: .LBB67_2: ; %atomicrmw.start
2088-
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
2089-
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
2090-
; GFX90A-NEXT: v_add_f64 v[6:7], v[2:3], v[0:1]
2091-
; GFX90A-NEXT: ds_cmpst_rtn_b64 v[6:7], v4, v[2:3], v[6:7]
2084+
; GFX90A-NEXT: v_mov_b32_e32 v2, s0
2085+
; GFX90A-NEXT: ds_add_f64 v2, v[0:1]
20922086
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
2093-
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3]
2094-
; GFX90A-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
2095-
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[6:7], v[6:7] op_sel:[0,1]
2096-
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[0:1]
2097-
; GFX90A-NEXT: s_cbranch_execnz .LBB67_2
2098-
; GFX90A-NEXT: .LBB67_3:
2087+
; GFX90A-NEXT: .LBB67_2:
20992088
; GFX90A-NEXT: s_endpgm
21002089
;
21012090
; GFX940-LABEL: local_atomic_fadd_f64_noret_pat_flush_safe:
@@ -2106,28 +2095,17 @@ define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat_flush_safe(ptr addrsp
21062095
; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s4, v0
21072096
; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
21082097
; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc
2109-
; GFX940-NEXT: s_cbranch_execz .LBB67_3
2098+
; GFX940-NEXT: s_cbranch_execz .LBB67_2
21102099
; GFX940-NEXT: ; %bb.1:
21112100
; GFX940-NEXT: s_load_dword s0, s[0:1], 0x24
21122101
; GFX940-NEXT: s_bcnt1_i32_b64 s1, s[2:3]
21132102
; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s1
21142103
; GFX940-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0
21152104
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
2116-
; GFX940-NEXT: v_mov_b32_e32 v4, s0
2117-
; GFX940-NEXT: ds_read_b64 v[2:3], v4
2118-
; GFX940-NEXT: s_mov_b64 s[0:1], 0
2119-
; GFX940-NEXT: .LBB67_2: ; %atomicrmw.start
2120-
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
2121-
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
2122-
; GFX940-NEXT: v_add_f64 v[6:7], v[2:3], v[0:1]
2123-
; GFX940-NEXT: ds_cmpst_rtn_b64 v[6:7], v4, v[2:3], v[6:7]
2105+
; GFX940-NEXT: v_mov_b32_e32 v2, s0
2106+
; GFX940-NEXT: ds_add_f64 v2, v[0:1]
21242107
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
2125-
; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3]
2126-
; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
2127-
; GFX940-NEXT: v_mov_b64_e32 v[2:3], v[6:7]
2128-
; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
2129-
; GFX940-NEXT: s_cbranch_execnz .LBB67_2
2130-
; GFX940-NEXT: .LBB67_3:
2108+
; GFX940-NEXT: .LBB67_2:
21312109
; GFX940-NEXT: s_endpgm
21322110
main_body:
21332111
%ret = atomicrmw fadd ptr addrspace(3) %ptr, double 4.0 seq_cst

llvm/test/CodeGen/AMDGPU/atomics-hw-remarks-gfx90a.ll

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,6 @@
11
; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs --pass-remarks=si-lower \
22
; RUN: %s -o - 2>&1 | FileCheck %s --check-prefix=GFX90A-HW
33

4-
; GFX90A-HW: Hardware instruction generated for atomic fadd operation at memory scope system due to an unsafe request.
54
; GFX90A-HW: Hardware instruction generated for atomic fadd operation at memory scope agent due to an unsafe request.
65
; GFX90A-HW: Hardware instruction generated for atomic fadd operation at memory scope workgroup due to an unsafe request.
76
; GFX90A-HW: Hardware instruction generated for atomic fadd operation at memory scope wavefront due to an unsafe request.

llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll

Lines changed: 14 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -2213,29 +2213,17 @@ define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat_flush_safe(ptr addrsp
22132213
; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
22142214
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
22152215
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
2216-
; GFX90A-NEXT: s_cbranch_execz .LBB72_3
2216+
; GFX90A-NEXT: s_cbranch_execz .LBB72_2
22172217
; GFX90A-NEXT: ; %bb.1:
2218-
; GFX90A-NEXT: s_load_dword s4, s[0:1], 0x24
2219-
; GFX90A-NEXT: s_bcnt1_i32_b64 s0, s[2:3]
2220-
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
2221-
; GFX90A-NEXT: v_mov_b32_e32 v0, s4
2222-
; GFX90A-NEXT: ds_read_b64 v[2:3], v0
2223-
; GFX90A-NEXT: v_cvt_f64_u32_e32 v[0:1], s0
2218+
; GFX90A-NEXT: s_load_dword s0, s[0:1], 0x24
2219+
; GFX90A-NEXT: s_bcnt1_i32_b64 s1, s[2:3]
2220+
; GFX90A-NEXT: v_cvt_f64_u32_e32 v[0:1], s1
22242221
; GFX90A-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0
2225-
; GFX90A-NEXT: s_mov_b64 s[0:1], 0
2226-
; GFX90A-NEXT: v_mov_b32_e32 v4, s4
2227-
; GFX90A-NEXT: .LBB72_2: ; %atomicrmw.start
2228-
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
22292222
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
2230-
; GFX90A-NEXT: v_add_f64 v[6:7], v[2:3], v[0:1]
2231-
; GFX90A-NEXT: ds_cmpst_rtn_b64 v[6:7], v4, v[2:3], v[6:7]
2223+
; GFX90A-NEXT: v_mov_b32_e32 v2, s0
2224+
; GFX90A-NEXT: ds_add_f64 v2, v[0:1]
22322225
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
2233-
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3]
2234-
; GFX90A-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
2235-
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[6:7], v[6:7] op_sel:[0,1]
2236-
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[0:1]
2237-
; GFX90A-NEXT: s_cbranch_execnz .LBB72_2
2238-
; GFX90A-NEXT: .LBB72_3:
2226+
; GFX90A-NEXT: .LBB72_2:
22392227
; GFX90A-NEXT: s_endpgm
22402228
;
22412229
; GFX940-LABEL: local_atomic_fadd_f64_noret_pat_flush_safe:
@@ -2245,29 +2233,17 @@ define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat_flush_safe(ptr addrsp
22452233
; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
22462234
; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
22472235
; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc
2248-
; GFX940-NEXT: s_cbranch_execz .LBB72_3
2236+
; GFX940-NEXT: s_cbranch_execz .LBB72_2
22492237
; GFX940-NEXT: ; %bb.1:
2250-
; GFX940-NEXT: s_load_dword s4, s[0:1], 0x24
2251-
; GFX940-NEXT: s_bcnt1_i32_b64 s0, s[2:3]
2252-
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
2253-
; GFX940-NEXT: v_mov_b32_e32 v0, s4
2254-
; GFX940-NEXT: ds_read_b64 v[2:3], v0
2255-
; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s0
2238+
; GFX940-NEXT: s_load_dword s0, s[0:1], 0x24
2239+
; GFX940-NEXT: s_bcnt1_i32_b64 s1, s[2:3]
2240+
; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s1
22562241
; GFX940-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0
2257-
; GFX940-NEXT: s_mov_b64 s[0:1], 0
2258-
; GFX940-NEXT: v_mov_b32_e32 v4, s4
2259-
; GFX940-NEXT: .LBB72_2: ; %atomicrmw.start
2260-
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
22612242
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
2262-
; GFX940-NEXT: v_add_f64 v[6:7], v[2:3], v[0:1]
2263-
; GFX940-NEXT: ds_cmpst_rtn_b64 v[6:7], v4, v[2:3], v[6:7]
2243+
; GFX940-NEXT: v_mov_b32_e32 v2, s0
2244+
; GFX940-NEXT: ds_add_f64 v2, v[0:1]
22642245
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
2265-
; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3]
2266-
; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
2267-
; GFX940-NEXT: v_mov_b64_e32 v[2:3], v[6:7]
2268-
; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
2269-
; GFX940-NEXT: s_cbranch_execnz .LBB72_2
2270-
; GFX940-NEXT: .LBB72_3:
2246+
; GFX940-NEXT: .LBB72_2:
22712247
; GFX940-NEXT: s_endpgm
22722248
main_body:
22732249
%ret = atomicrmw fadd ptr addrspace(3) %ptr, double 4.0 seq_cst

0 commit comments

Comments
 (0)