Skip to content

Commit bb497b5

Browse files
committed
AMDGPU: Handle remote/fine-grained memory in atomicrmw fmin/fmax lowering
Consider the new atomic metadata when choosing to expand as cmpxchg instead.
1 parent 78bc1b6 commit bb497b5

16 files changed

+17277
-17788
lines changed

clang/test/CodeGenCUDA/amdgpu-atomic-ops.cu

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -57,8 +57,8 @@ __global__ void ffp2(double *p) {
5757
// SAFE: global_atomic_cmpswap_b64
5858
// UNSAFE-LABEL: @_Z4ffp2Pd
5959
// UNSAFE: global_atomic_cmpswap_x2
60-
// UNSAFE: global_atomic_cmpswap_x2
61-
// UNSAFE: global_atomic_cmpswap_x2
60+
// UNSAFE: global_atomic_max_f64
61+
// UNSAFE: global_atomic_min_f64
6262
// UNSAFE: global_atomic_max_f64
6363
// UNSAFE: global_atomic_min_f64
6464
__atomic_fetch_sub(p, 1.0, memory_order_relaxed);
@@ -84,8 +84,8 @@ __global__ void ffp3(long double *p) {
8484
// SAFE: global_atomic_cmpswap_b64
8585
// UNSAFE-LABEL: @_Z4ffp3Pe
8686
// UNSAFE: global_atomic_cmpswap_x2
87-
// UNSAFE: global_atomic_cmpswap_x2
88-
// UNSAFE: global_atomic_cmpswap_x2
87+
// UNSAFE: global_atomic_max_f64
88+
// UNSAFE: global_atomic_min_f64
8989
// UNSAFE: global_atomic_max_f64
9090
// UNSAFE: global_atomic_min_f64
9191
__atomic_fetch_sub(p, 1.0L, memory_order_relaxed);

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 53 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -16110,6 +16110,34 @@ static bool isBFloat2(Type *Ty) {
1611016110
return VT && VT->getNumElements() == 2 && VT->getElementType()->isBFloatTy();
1611116111
}
1611216112

16113+
/// \returns true if it's valid to emit a native instruction for \p RMW, based
16114+
/// on the properties of the target memory.
16115+
static bool globalMemoryFPAtomicIsLegal(const GCNSubtarget &Subtarget,
16116+
const AtomicRMWInst *RMW,
16117+
bool HasSystemScope) {
16118+
// The remote/fine-grained access logic is different from the integer
16119+
// atomics. Without AgentScopeFineGrainedRemoteMemoryAtomics support,
16120+
// fine-grained access does not work, even for a device local allocation.
16121+
//
16122+
// With AgentScopeFineGrainedRemoteMemoryAtomics, system scoped device local
16123+
// allocations work.
16124+
if (HasSystemScope) {
16125+
if (Subtarget.supportsAgentScopeFineGrainedRemoteMemoryAtomics() &&
16126+
RMW->hasMetadata("amdgpu.no.remote.memory"))
16127+
return true;
16128+
} else if (Subtarget.supportsAgentScopeFineGrainedRemoteMemoryAtomics())
16129+
return true;
16130+
16131+
if (RMW->hasMetadata("amdgpu.no.fine.grained.memory"))
16132+
return true;
16133+
16134+
// TODO: Auto-upgrade this attribute to the metadata in function body and stop
16135+
// checking it.
16136+
return RMW->getFunction()
16137+
->getFnAttribute("amdgpu-unsafe-fp-atomics")
16138+
.getValueAsBool();
16139+
}
16140+
1611316141
TargetLowering::AtomicExpansionKind
1611416142
SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
1611516143
unsigned AS = RMW->getPointerAddressSpace();
@@ -16260,37 +16288,32 @@ SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
1626016288
Type *Ty = RMW->getType();
1626116289

1626216290
// LDS float and double fmin/fmax were always supported.
16263-
if (AS == AMDGPUAS::LOCAL_ADDRESS && (Ty->isFloatTy() || Ty->isDoubleTy()))
16264-
return AtomicExpansionKind::None;
16265-
16266-
if (unsafeFPAtomicsDisabled(RMW->getFunction()))
16267-
return AtomicExpansionKind::CmpXChg;
16268-
16269-
// Always expand system scope fp atomics.
16270-
if (HasSystemScope)
16271-
return AtomicExpansionKind::CmpXChg;
16291+
if (AS == AMDGPUAS::LOCAL_ADDRESS) {
16292+
return Ty->isFloatTy() || Ty->isDoubleTy() ? AtomicExpansionKind::None
16293+
: AtomicExpansionKind::CmpXChg;
16294+
}
1627216295

16273-
// For flat and global cases:
16274-
// float, double in gfx7. Manual claims denormal support.
16275-
// Removed in gfx8.
16276-
// float, double restored in gfx10.
16277-
// double removed again in gfx11, so only f32 for gfx11/gfx12.
16278-
//
16279-
// For gfx9, gfx90a and gfx940 support f64 for global (same as fadd), but no
16280-
// f32.
16281-
//
16282-
// FIXME: Check scope and fine grained memory
16283-
if (AS == AMDGPUAS::FLAT_ADDRESS) {
16284-
if (Subtarget->hasAtomicFMinFMaxF32FlatInsts() && Ty->isFloatTy())
16285-
return ReportUnsafeHWInst(AtomicExpansionKind::None);
16286-
if (Subtarget->hasAtomicFMinFMaxF64FlatInsts() && Ty->isDoubleTy())
16287-
return ReportUnsafeHWInst(AtomicExpansionKind::None);
16288-
} else if (AMDGPU::isExtendedGlobalAddrSpace(AS) ||
16289-
AS == AMDGPUAS::BUFFER_FAT_POINTER) {
16290-
if (Subtarget->hasAtomicFMinFMaxF32GlobalInsts() && Ty->isFloatTy())
16291-
return ReportUnsafeHWInst(AtomicExpansionKind::None);
16292-
if (Subtarget->hasAtomicFMinFMaxF64GlobalInsts() && Ty->isDoubleTy())
16293-
return ReportUnsafeHWInst(AtomicExpansionKind::None);
16296+
if (globalMemoryFPAtomicIsLegal(*Subtarget, RMW, HasSystemScope)) {
16297+
// For flat and global cases:
16298+
// float, double in gfx7. Manual claims denormal support.
16299+
// Removed in gfx8.
16300+
// float, double restored in gfx10.
16301+
// double removed again in gfx11, so only f32 for gfx11/gfx12.
16302+
//
16303+
// For gfx9, gfx90a and gfx940 support f64 for global (same as fadd), but
16304+
// no f32.
16305+
if (AS == AMDGPUAS::FLAT_ADDRESS) {
16306+
if (Subtarget->hasAtomicFMinFMaxF32FlatInsts() && Ty->isFloatTy())
16307+
return ReportUnsafeHWInst(AtomicExpansionKind::None);
16308+
if (Subtarget->hasAtomicFMinFMaxF64FlatInsts() && Ty->isDoubleTy())
16309+
return ReportUnsafeHWInst(AtomicExpansionKind::None);
16310+
} else if (AMDGPU::isExtendedGlobalAddrSpace(AS) ||
16311+
AS == AMDGPUAS::BUFFER_FAT_POINTER) {
16312+
if (Subtarget->hasAtomicFMinFMaxF32GlobalInsts() && Ty->isFloatTy())
16313+
return ReportUnsafeHWInst(AtomicExpansionKind::None);
16314+
if (Subtarget->hasAtomicFMinFMaxF64GlobalInsts() && Ty->isDoubleTy())
16315+
return ReportUnsafeHWInst(AtomicExpansionKind::None);
16316+
}
1629416317
}
1629516318

1629616319
return AtomicExpansionKind::CmpXChg;

0 commit comments

Comments
 (0)