Skip to content

Commit 9fa70d1

Browse files
committed
AMDGPU: Handle new atomicrmw metadata for fadd case
This is the most complex atomicrmw support case. Note we don't have accurate remarks for all of the cases, which I'm planning on fixing in a later change with more precise wording. Continue respecting amdgpu-unsafe-fp-atomics until its eventual removal. This also seems to fix a few cases that were not interpreting amdgpu-unsafe-fp-atomics aggressively enough.
1 parent 6341485 commit 9fa70d1

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

41 files changed

+5049
-8051
lines changed

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 82 additions & 84 deletions
Original file line numberDiff line numberDiff line change
@@ -16054,26 +16054,21 @@ bool SITargetLowering::isKnownNeverNaNForTargetNode(SDValue Op,
1605416054
SNaN, Depth);
1605516055
}
1605616056

16057-
#if 0
16058-
// FIXME: This should be checked before unsafe fp atomics are enabled
16059-
// Global FP atomic instructions have a hardcoded FP mode and do not support
16060-
// FP32 denormals, and only support v2f16 denormals.
16061-
static bool fpModeMatchesGlobalFPAtomicMode(const AtomicRMWInst *RMW) {
16057+
// On older subtargets, global FP atomic instructions have a hardcoded FP mode
16058+
// and do not support FP32 denormals, and only support v2f16/f64 denormals.
16059+
static bool atomicIgnoresDenormalModeOrFPModeIsFTZ(const AtomicRMWInst *RMW) {
16060+
if (RMW->hasMetadata("amdgpu.ignore.denormal.mode"))
16061+
return true;
16062+
1606216063
const fltSemantics &Flt = RMW->getType()->getScalarType()->getFltSemantics();
16063-
auto DenormMode = RMW->getParent()->getParent()->getDenormalMode(Flt);
16064-
if (&Flt == &APFloat::IEEEsingle())
16065-
return DenormMode == DenormalMode::getPreserveSign();
16066-
return DenormMode == DenormalMode::getIEEE();
16067-
}
16068-
#endif
16064+
auto DenormMode = RMW->getFunction()->getDenormalMode(Flt);
16065+
if (DenormMode == DenormalMode::getPreserveSign())
16066+
return true;
1606916067

16070-
// The amdgpu-unsafe-fp-atomics attribute enables generation of unsafe
16071-
// floating point atomic instructions. May generate more efficient code,
16072-
// but may not respect rounding and denormal modes, and may give incorrect
16073-
// results for certain memory destinations.
16074-
bool unsafeFPAtomicsDisabled(Function *F) {
16075-
return F->getFnAttribute("amdgpu-unsafe-fp-atomics").getValueAsString() !=
16076-
"true";
16068+
// TODO: Remove this.
16069+
return RMW->getFunction()
16070+
->getFnAttribute("amdgpu-unsafe-fp-atomics")
16071+
.getValueAsBool();
1607716072
}
1607816073

1607916074
static OptimizationRemark emitAtomicRMWLegalRemark(const AtomicRMWInst *RMW) {
@@ -16202,82 +16197,85 @@ SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
1620216197
return AtomicExpansionKind::CmpXChg;
1620316198
}
1620416199

16205-
if (!AMDGPU::isFlatGlobalAddrSpace(AS) &&
16206-
AS != AMDGPUAS::BUFFER_FAT_POINTER)
16207-
return AtomicExpansionKind::CmpXChg;
16208-
16209-
if (Subtarget->hasGFX940Insts() && (Ty->isFloatTy() || Ty->isDoubleTy()))
16210-
return AtomicExpansionKind::None;
16211-
16212-
if (AS == AMDGPUAS::FLAT_ADDRESS) {
16213-
// gfx940, gfx12
16214-
// FIXME: Needs to account for no fine-grained memory
16215-
if (Subtarget->hasAtomicFlatPkAdd16Insts() && isHalf2OrBFloat2(Ty))
16216-
return AtomicExpansionKind::None;
16217-
} else if (AMDGPU::isExtendedGlobalAddrSpace(AS)) {
16218-
// gfx90a, gfx940, gfx12
16219-
// FIXME: Needs to account for no fine-grained memory
16220-
if (Subtarget->hasAtomicBufferGlobalPkAddF16Insts() && isHalf2(Ty))
16221-
return AtomicExpansionKind::None;
16222-
16223-
// gfx940, gfx12
16224-
// FIXME: Needs to account for no fine-grained memory
16225-
if (Subtarget->hasAtomicGlobalPkAddBF16Inst() && isBFloat2(Ty))
16226-
return AtomicExpansionKind::None;
16227-
} else if (AS == AMDGPUAS::BUFFER_FAT_POINTER) {
16228-
// gfx90a, gfx940, gfx12
16229-
// FIXME: Needs to account for no fine-grained memory
16230-
if (Subtarget->hasAtomicBufferGlobalPkAddF16Insts() && isHalf2(Ty))
16231-
return AtomicExpansionKind::None;
16232-
16233-
// While gfx90a/gfx940 supports v2bf16 for global/flat, it does not for
16234-
// buffer. gfx12 does have the buffer version.
16235-
if (Subtarget->hasAtomicBufferPkAddBF16Inst() && isBFloat2(Ty))
16236-
return AtomicExpansionKind::None;
16237-
}
16238-
16239-
if (unsafeFPAtomicsDisabled(RMW->getFunction()))
16240-
return AtomicExpansionKind::CmpXChg;
16241-
16242-
// Always expand system scope fp atomics.
16243-
if (HasSystemScope)
16200+
// LDS atomics respect the denormal mode from the mode register.
16201+
//
16202+
// Traditionally f32 global/buffer memory atomics would unconditionally
16203+
// flush denormals, but newer targets do not flush. f64/f16/bf16 cases never
16204+
// flush.
16205+
//
16206+
// On targets with flat atomic fadd, denormals would flush depending on
16207+
// whether the target address resides in LDS or global memory. We consider
16208+
// this flat-maybe-flush as will-flush.
16209+
if (Ty->isFloatTy() &&
16210+
!Subtarget->hasMemoryAtomicFaddF32DenormalSupport() &&
16211+
!atomicIgnoresDenormalModeOrFPModeIsFTZ(RMW))
1624416212
return AtomicExpansionKind::CmpXChg;
1624516213

16246-
// global and flat atomic fadd f64: gfx90a, gfx940.
16247-
if (Subtarget->hasFlatBufferGlobalAtomicFaddF64Inst() && Ty->isDoubleTy())
16248-
return ReportUnsafeHWInst(AtomicExpansionKind::None);
16214+
// FIXME: These ReportUnsafeHWInsts are imprecise. Some of these cases are
16215+
// safe. The message phrasing also should be better.
16216+
if (globalMemoryFPAtomicIsLegal(*Subtarget, RMW, HasSystemScope)) {
16217+
if (AS == AMDGPUAS::FLAT_ADDRESS) {
16218+
// gfx940, gfx12
16219+
if (Subtarget->hasAtomicFlatPkAdd16Insts() && isHalf2OrBFloat2(Ty))
16220+
return ReportUnsafeHWInst(AtomicExpansionKind::None);
16221+
} else if (AMDGPU::isExtendedGlobalAddrSpace(AS)) {
16222+
// gfx90a, gfx940, gfx12
16223+
if (Subtarget->hasAtomicBufferGlobalPkAddF16Insts() && isHalf2(Ty))
16224+
return ReportUnsafeHWInst(AtomicExpansionKind::None);
1624916225

16250-
if (AS != AMDGPUAS::FLAT_ADDRESS) {
16251-
if (Ty->isFloatTy()) {
16252-
// global/buffer atomic fadd f32 no-rtn: gfx908, gfx90a, gfx940, gfx11+.
16253-
if (RMW->use_empty() && Subtarget->hasAtomicFaddNoRtnInsts())
16226+
// gfx940, gfx12
16227+
if (Subtarget->hasAtomicGlobalPkAddBF16Inst() && isBFloat2(Ty))
1625416228
return ReportUnsafeHWInst(AtomicExpansionKind::None);
16255-
// global/buffer atomic fadd f32 rtn: gfx90a, gfx940, gfx11+.
16256-
if (!RMW->use_empty() && Subtarget->hasAtomicFaddRtnInsts())
16229+
} else if (AS == AMDGPUAS::BUFFER_FAT_POINTER) {
16230+
// gfx90a, gfx940, gfx12
16231+
if (Subtarget->hasAtomicBufferGlobalPkAddF16Insts() && isHalf2(Ty))
1625716232
return ReportUnsafeHWInst(AtomicExpansionKind::None);
16258-
} else {
16259-
// gfx908
16260-
if (RMW->use_empty() &&
16261-
Subtarget->hasAtomicBufferGlobalPkAddF16NoRtnInsts() && isHalf2(Ty))
16233+
16234+
// While gfx90a/gfx940 supports v2bf16 for global/flat, it does not for
16235+
// buffer. gfx12 does have the buffer version.
16236+
if (Subtarget->hasAtomicBufferPkAddBF16Inst() && isBFloat2(Ty))
1626216237
return ReportUnsafeHWInst(AtomicExpansionKind::None);
1626316238
}
16264-
}
1626516239

16266-
// flat atomic fadd f32: gfx940, gfx11+.
16267-
if (AS == AMDGPUAS::FLAT_ADDRESS && Ty->isFloatTy()) {
16268-
if (Subtarget->hasFlatAtomicFaddF32Inst())
16240+
// global and flat atomic fadd f64: gfx90a, gfx940.
16241+
if (Subtarget->hasFlatBufferGlobalAtomicFaddF64Inst() && Ty->isDoubleTy())
1626916242
return ReportUnsafeHWInst(AtomicExpansionKind::None);
1627016243

16271-
// If it is in flat address space, and the type is float, we will try to
16272-
// expand it, if the target supports global and lds atomic fadd. The
16273-
// reason we need that is, in the expansion, we emit the check of address
16274-
// space. If it is in global address space, we emit the global atomic
16275-
// fadd; if it is in shared address space, we emit the LDS atomic fadd.
16276-
if (Subtarget->hasLDSFPAtomicAddF32()) {
16277-
if (RMW->use_empty() && Subtarget->hasAtomicFaddNoRtnInsts())
16278-
return AtomicExpansionKind::Expand;
16279-
if (!RMW->use_empty() && Subtarget->hasAtomicFaddRtnInsts())
16280-
return AtomicExpansionKind::Expand;
16244+
if (AS != AMDGPUAS::FLAT_ADDRESS) {
16245+
if (Ty->isFloatTy()) {
16246+
// global/buffer atomic fadd f32 no-rtn: gfx908, gfx90a, gfx940,
16247+
// gfx11+.
16248+
if (RMW->use_empty() && Subtarget->hasAtomicFaddNoRtnInsts())
16249+
return ReportUnsafeHWInst(AtomicExpansionKind::None);
16250+
// global/buffer atomic fadd f32 rtn: gfx90a, gfx940, gfx11+.
16251+
if (!RMW->use_empty() && Subtarget->hasAtomicFaddRtnInsts())
16252+
return ReportUnsafeHWInst(AtomicExpansionKind::None);
16253+
} else {
16254+
// gfx908
16255+
if (RMW->use_empty() &&
16256+
Subtarget->hasAtomicBufferGlobalPkAddF16NoRtnInsts() &&
16257+
isHalf2(Ty))
16258+
return ReportUnsafeHWInst(AtomicExpansionKind::None);
16259+
}
16260+
}
16261+
16262+
// flat atomic fadd f32: gfx940, gfx11+.
16263+
if (AS == AMDGPUAS::FLAT_ADDRESS && Ty->isFloatTy()) {
16264+
if (Subtarget->hasFlatAtomicFaddF32Inst())
16265+
return ReportUnsafeHWInst(AtomicExpansionKind::None);
16266+
16267+
// If it is in flat address space, and the type is float, we will try to
16268+
// expand it, if the target supports global and lds atomic fadd. The
16269+
// reason we need that is, in the expansion, we emit the check of
16270+
// address space. If it is in global address space, we emit the global
16271+
// atomic fadd; if it is in shared address space, we emit the LDS atomic
16272+
// fadd.
16273+
if (Subtarget->hasLDSFPAtomicAddF32()) {
16274+
if (RMW->use_empty() && Subtarget->hasAtomicFaddNoRtnInsts())
16275+
return AtomicExpansionKind::Expand;
16276+
if (!RMW->use_empty() && Subtarget->hasAtomicFaddRtnInsts())
16277+
return AtomicExpansionKind::Expand;
16278+
}
1628116279
}
1628216280
}
1628316281

llvm/test/CodeGen/AMDGPU/GlobalISel/flat-atomic-fadd.f32.ll

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -57,7 +57,7 @@ define amdgpu_ps float @flat_atomic_fadd_f32_rtn_intrinsic(ptr %ptr, float %data
5757
ret float %ret
5858
}
5959

60-
define amdgpu_ps void @flat_atomic_fadd_f32_no_rtn_atomicrmw(ptr %ptr, float %data) #0 {
60+
define amdgpu_ps void @flat_atomic_fadd_f32_no_rtn_atomicrmw(ptr %ptr, float %data) {
6161
; GFX940-LABEL: name: flat_atomic_fadd_f32_no_rtn_atomicrmw
6262
; GFX940: bb.1 (%ir-block.0):
6363
; GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2
@@ -79,11 +79,11 @@ define amdgpu_ps void @flat_atomic_fadd_f32_no_rtn_atomicrmw(ptr %ptr, float %da
7979
; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
8080
; GFX11-NEXT: FLAT_ATOMIC_ADD_F32 [[REG_SEQUENCE]], [[COPY2]], 0, 0, implicit $exec, implicit $flat_scr :: (load store syncscope("wavefront") monotonic (s32) on %ir.ptr)
8181
; GFX11-NEXT: S_ENDPGM 0
82-
%ret = atomicrmw fadd ptr %ptr, float %data syncscope("wavefront") monotonic
82+
%ret = atomicrmw fadd ptr %ptr, float %data syncscope("wavefront") monotonic, !amdgpu.no.fine.grained.memory !0
8383
ret void
8484
}
8585

86-
define amdgpu_ps float @flat_atomic_fadd_f32_rtn_atomicrmw(ptr %ptr, float %data) #0 {
86+
define amdgpu_ps float @flat_atomic_fadd_f32_rtn_atomicrmw(ptr %ptr, float %data) {
8787
; GFX940-LABEL: name: flat_atomic_fadd_f32_rtn_atomicrmw
8888
; GFX940: bb.1 (%ir-block.0):
8989
; GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2
@@ -107,10 +107,10 @@ define amdgpu_ps float @flat_atomic_fadd_f32_rtn_atomicrmw(ptr %ptr, float %data
107107
; GFX11-NEXT: [[FLAT_ATOMIC_ADD_F32_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_F32_RTN [[REG_SEQUENCE]], [[COPY2]], 0, 1, implicit $exec, implicit $flat_scr :: (load store syncscope("wavefront") monotonic (s32) on %ir.ptr)
108108
; GFX11-NEXT: $vgpr0 = COPY [[FLAT_ATOMIC_ADD_F32_RTN]]
109109
; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
110-
%ret = atomicrmw fadd ptr %ptr, float %data syncscope("wavefront") monotonic
110+
%ret = atomicrmw fadd ptr %ptr, float %data syncscope("wavefront") monotonic, !amdgpu.no.fine.grained.memory !0
111111
ret float %ret
112112
}
113113

114114
declare float @llvm.amdgcn.flat.atomic.fadd.f32.p1.f32(ptr, float)
115115

116-
attributes #0 = {"amdgpu-unsafe-fp-atomics"="true" }
116+
!0 = !{}

llvm/test/CodeGen/AMDGPU/GlobalISel/flat-atomic-fadd.f64.ll

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,7 @@ define amdgpu_ps double @flat_atomic_fadd_f64_rtn_intrinsic(ptr %ptr, double %da
4242
ret double %ret
4343
}
4444

45-
define amdgpu_ps void @flat_atomic_fadd_f64_no_rtn_atomicrmw(ptr %ptr, double %data) #0 {
45+
define amdgpu_ps void @flat_atomic_fadd_f64_no_rtn_atomicrmw(ptr %ptr, double %data) {
4646
; GFX90A_GFX940-LABEL: name: flat_atomic_fadd_f64_no_rtn_atomicrmw
4747
; GFX90A_GFX940: bb.1 (%ir-block.0):
4848
; GFX90A_GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
@@ -55,11 +55,11 @@ define amdgpu_ps void @flat_atomic_fadd_f64_no_rtn_atomicrmw(ptr %ptr, double %d
5555
; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1
5656
; GFX90A_GFX940-NEXT: FLAT_ATOMIC_ADD_F64 [[REG_SEQUENCE]], [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (load store syncscope("wavefront") monotonic (s64) on %ir.ptr)
5757
; GFX90A_GFX940-NEXT: S_ENDPGM 0
58-
%ret = atomicrmw fadd ptr %ptr, double %data syncscope("wavefront") monotonic
58+
%ret = atomicrmw fadd ptr %ptr, double %data syncscope("wavefront") monotonic, !amdgpu.no.fine.grained.memory !0
5959
ret void
6060
}
6161

62-
define amdgpu_ps double @flat_atomic_fadd_f64_rtn_atomicrmw(ptr %ptr, double %data) #0 {
62+
define amdgpu_ps double @flat_atomic_fadd_f64_rtn_atomicrmw(ptr %ptr, double %data) {
6363
; GFX90A_GFX940-LABEL: name: flat_atomic_fadd_f64_rtn_atomicrmw
6464
; GFX90A_GFX940: bb.1 (%ir-block.0):
6565
; GFX90A_GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
@@ -78,10 +78,10 @@ define amdgpu_ps double @flat_atomic_fadd_f64_rtn_atomicrmw(ptr %ptr, double %da
7878
; GFX90A_GFX940-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec
7979
; GFX90A_GFX940-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]]
8080
; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1
81-
%ret = atomicrmw fadd ptr %ptr, double %data syncscope("wavefront") monotonic
81+
%ret = atomicrmw fadd ptr %ptr, double %data syncscope("wavefront") monotonic, !amdgpu.no.fine.grained.memory !0
8282
ret double %ret
8383
}
8484

8585
declare double @llvm.amdgcn.flat.atomic.fadd.f64.p1.f64(ptr, double)
8686

87-
attributes #0 = {"amdgpu-unsafe-fp-atomics"="true" }
87+
!0 = !{}

llvm/test/CodeGen/AMDGPU/GlobalISel/fp-atomics-gfx940.ll

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,7 @@ define amdgpu_kernel void @flat_atomic_fadd_f32_noret_pat(ptr %ptr) {
3434
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3535
; GFX940-NEXT: buffer_inv sc0 sc1
3636
; GFX940-NEXT: s_endpgm
37-
%ret = atomicrmw fadd ptr %ptr, float 4.0 seq_cst
37+
%ret = atomicrmw fadd ptr %ptr, float 4.0 seq_cst, !amdgpu.no.remote.memory !0
3838
ret void
3939
}
4040

@@ -50,7 +50,7 @@ define amdgpu_kernel void @flat_atomic_fadd_f32_noret_pat_ieee(ptr %ptr) #0 {
5050
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5151
; GFX940-NEXT: buffer_inv sc0 sc1
5252
; GFX940-NEXT: s_endpgm
53-
%ret = atomicrmw fadd ptr %ptr, float 4.0 seq_cst
53+
%ret = atomicrmw fadd ptr %ptr, float 4.0 seq_cst, !amdgpu.no.remote.memory !0
5454
ret void
5555
}
5656

@@ -75,7 +75,7 @@ define float @flat_atomic_fadd_f32_rtn_pat(ptr %ptr, float %data) {
7575
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7676
; GFX940-NEXT: buffer_inv sc0 sc1
7777
; GFX940-NEXT: s_setpc_b64 s[30:31]
78-
%ret = atomicrmw fadd ptr %ptr, float 4.0 seq_cst
78+
%ret = atomicrmw fadd ptr %ptr, float 4.0 seq_cst, !amdgpu.no.remote.memory !0
7979
ret float %ret
8080
}
8181

@@ -235,3 +235,5 @@ define void @flat_atomic_fadd_noret_v2f16_agent_offset(ptr %ptr, <2 x half> %val
235235
}
236236

237237
attributes #0 = { "denormal-fp-math-f32"="ieee,ieee" }
238+
239+
!0 = !{}

0 commit comments

Comments
 (0)