Skip to content

Commit 9df089b

Browse files
committed
AMDGPU: Handle remote/fine-grained memory in atomicrmw fmin/fmax lowering
Consider the new atomic metadata when choosing to expand as cmpxchg instead.
1 parent 0381e27 commit 9df089b

14 files changed

+7019
-5890
lines changed

clang/test/CodeGenCUDA/amdgpu-atomic-ops.cu

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -57,8 +57,8 @@ __global__ void ffp2(double *p) {
5757
// SAFE: global_atomic_cmpswap_b64
5858
// UNSAFE-LABEL: @_Z4ffp2Pd
5959
// UNSAFE: global_atomic_cmpswap_x2
60-
// UNSAFE: global_atomic_cmpswap_x2
61-
// UNSAFE: global_atomic_cmpswap_x2
60+
// UNSAFE: global_atomic_max_f64
61+
// UNSAFE: global_atomic_min_f64
6262
// UNSAFE: global_atomic_max_f64
6363
// UNSAFE: global_atomic_min_f64
6464
__atomic_fetch_sub(p, 1.0, memory_order_relaxed);
@@ -84,8 +84,8 @@ __global__ void ffp3(long double *p) {
8484
// SAFE: global_atomic_cmpswap_b64
8585
// UNSAFE-LABEL: @_Z4ffp3Pe
8686
// UNSAFE: global_atomic_cmpswap_x2
87-
// UNSAFE: global_atomic_cmpswap_x2
88-
// UNSAFE: global_atomic_cmpswap_x2
87+
// UNSAFE: global_atomic_max_f64
88+
// UNSAFE: global_atomic_min_f64
8989
// UNSAFE: global_atomic_max_f64
9090
// UNSAFE: global_atomic_min_f64
9191
__atomic_fetch_sub(p, 1.0L, memory_order_relaxed);

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 53 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -16123,6 +16123,34 @@ static bool isBFloat2(Type *Ty) {
1612316123
return VT && VT->getNumElements() == 2 && VT->getElementType()->isBFloatTy();
1612416124
}
1612516125

16126+
/// \returns true if it's valid to emit a native instruction for \p RMW, based
16127+
/// on the properties of the target memory.
16128+
static bool globalMemoryFPAtomicIsLegal(const GCNSubtarget &Subtarget,
16129+
const AtomicRMWInst *RMW,
16130+
bool HasSystemScope) {
16131+
// The remote/fine-grained access logic is different from the integer
16132+
// atomics. Without AgentScopeFineGrainedRemoteMemoryAtomics support,
16133+
// fine-grained access does not work, even for a device local allocation.
16134+
//
16135+
// With AgentScopeFineGrainedRemoteMemoryAtomics, system scoped device local
16136+
// allocations work.
16137+
if (HasSystemScope) {
16138+
if (Subtarget.supportsAgentScopeFineGrainedRemoteMemoryAtomics() &&
16139+
RMW->hasMetadata("amdgpu.no.remote.memory"))
16140+
return true;
16141+
} else if (Subtarget.supportsAgentScopeFineGrainedRemoteMemoryAtomics())
16142+
return true;
16143+
16144+
if (RMW->hasMetadata("amdgpu.no.fine.grained.memory"))
16145+
return true;
16146+
16147+
// TODO: Auto-upgrade this attribute to the metadata in function body and stop
16148+
// checking it.
16149+
return RMW->getFunction()
16150+
->getFnAttribute("amdgpu-unsafe-fp-atomics")
16151+
.getValueAsBool();
16152+
}
16153+
1612616154
TargetLowering::AtomicExpansionKind
1612716155
SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
1612816156
unsigned AS = RMW->getPointerAddressSpace();
@@ -16266,37 +16294,32 @@ SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
1626616294
Type *Ty = RMW->getType();
1626716295

1626816296
// LDS float and double fmin/fmax were always supported.
16269-
if (AS == AMDGPUAS::LOCAL_ADDRESS && (Ty->isFloatTy() || Ty->isDoubleTy()))
16270-
return AtomicExpansionKind::None;
16271-
16272-
if (unsafeFPAtomicsDisabled(RMW->getFunction()))
16273-
return AtomicExpansionKind::CmpXChg;
16274-
16275-
// Always expand system scope fp atomics.
16276-
if (HasSystemScope)
16277-
return AtomicExpansionKind::CmpXChg;
16297+
if (AS == AMDGPUAS::LOCAL_ADDRESS) {
16298+
return Ty->isFloatTy() || Ty->isDoubleTy() ? AtomicExpansionKind::None
16299+
: AtomicExpansionKind::CmpXChg;
16300+
}
1627816301

16279-
// For flat and global cases:
16280-
// float, double in gfx7. Manual claims denormal support.
16281-
// Removed in gfx8.
16282-
// float, double restored in gfx10.
16283-
// double removed again in gfx11, so only f32 for gfx11/gfx12.
16284-
//
16285-
// For gfx9, gfx90a and gfx940 support f64 for global (same as fadd), but no
16286-
// f32.
16287-
//
16288-
// FIXME: Check scope and fine grained memory
16289-
if (AS == AMDGPUAS::FLAT_ADDRESS) {
16290-
if (Subtarget->hasAtomicFMinFMaxF32FlatInsts() && Ty->isFloatTy())
16291-
return ReportUnsafeHWInst(AtomicExpansionKind::None);
16292-
if (Subtarget->hasAtomicFMinFMaxF64FlatInsts() && Ty->isDoubleTy())
16293-
return ReportUnsafeHWInst(AtomicExpansionKind::None);
16294-
} else if (AMDGPU::isExtendedGlobalAddrSpace(AS) ||
16295-
AS == AMDGPUAS::BUFFER_FAT_POINTER) {
16296-
if (Subtarget->hasAtomicFMinFMaxF32GlobalInsts() && Ty->isFloatTy())
16297-
return ReportUnsafeHWInst(AtomicExpansionKind::None);
16298-
if (Subtarget->hasAtomicFMinFMaxF64GlobalInsts() && Ty->isDoubleTy())
16299-
return ReportUnsafeHWInst(AtomicExpansionKind::None);
16302+
if (globalMemoryFPAtomicIsLegal(*Subtarget, RMW, HasSystemScope)) {
16303+
// For flat and global cases:
16304+
// float, double in gfx7. Manual claims denormal support.
16305+
// Removed in gfx8.
16306+
// float, double restored in gfx10.
16307+
// double removed again in gfx11, so only f32 for gfx11/gfx12.
16308+
//
16309+
// For gfx9, gfx90a and gfx940 support f64 for global (same as fadd), but
16310+
// no f32.
16311+
if (AS == AMDGPUAS::FLAT_ADDRESS) {
16312+
if (Subtarget->hasAtomicFMinFMaxF32FlatInsts() && Ty->isFloatTy())
16313+
return ReportUnsafeHWInst(AtomicExpansionKind::None);
16314+
if (Subtarget->hasAtomicFMinFMaxF64FlatInsts() && Ty->isDoubleTy())
16315+
return ReportUnsafeHWInst(AtomicExpansionKind::None);
16316+
} else if (AMDGPU::isExtendedGlobalAddrSpace(AS) ||
16317+
AS == AMDGPUAS::BUFFER_FAT_POINTER) {
16318+
if (Subtarget->hasAtomicFMinFMaxF32GlobalInsts() && Ty->isFloatTy())
16319+
return ReportUnsafeHWInst(AtomicExpansionKind::None);
16320+
if (Subtarget->hasAtomicFMinFMaxF64GlobalInsts() && Ty->isDoubleTy())
16321+
return ReportUnsafeHWInst(AtomicExpansionKind::None);
16322+
}
1630016323
}
1630116324

1630216325
return AtomicExpansionKind::CmpXChg;

0 commit comments

Comments
 (0)