Skip to content

Commit 581f9cb

Browse files
committed
AMDGPU: Handle remote/fine-grained memory in atomicrmw fmin/fmax lowering
Consider the new atomic metadata when choosing to expand as cmpxchg instead.
1 parent 234b772 commit 581f9cb

14 files changed

+7019
-5890
lines changed

clang/test/CodeGenCUDA/amdgpu-atomic-ops.cu

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -57,8 +57,8 @@ __global__ void ffp2(double *p) {
5757
// SAFE: global_atomic_cmpswap_b64
5858
// UNSAFE-LABEL: @_Z4ffp2Pd
5959
// UNSAFE: global_atomic_cmpswap_x2
60-
// UNSAFE: global_atomic_cmpswap_x2
61-
// UNSAFE: global_atomic_cmpswap_x2
60+
// UNSAFE: global_atomic_max_f64
61+
// UNSAFE: global_atomic_min_f64
6262
// UNSAFE: global_atomic_max_f64
6363
// UNSAFE: global_atomic_min_f64
6464
__atomic_fetch_sub(p, 1.0, memory_order_relaxed);
@@ -84,8 +84,8 @@ __global__ void ffp3(long double *p) {
8484
// SAFE: global_atomic_cmpswap_b64
8585
// UNSAFE-LABEL: @_Z4ffp3Pe
8686
// UNSAFE: global_atomic_cmpswap_x2
87-
// UNSAFE: global_atomic_cmpswap_x2
88-
// UNSAFE: global_atomic_cmpswap_x2
87+
// UNSAFE: global_atomic_max_f64
88+
// UNSAFE: global_atomic_min_f64
8989
// UNSAFE: global_atomic_max_f64
9090
// UNSAFE: global_atomic_min_f64
9191
__atomic_fetch_sub(p, 1.0L, memory_order_relaxed);

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 53 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -16110,6 +16110,34 @@ static bool isBFloat2(Type *Ty) {
1611016110
return VT && VT->getNumElements() == 2 && VT->getElementType()->isBFloatTy();
1611116111
}
1611216112

16113+
/// \returns true if it's valid to emit a native instruction for \p RMW, based
16114+
/// on the properties of the target memory.
16115+
static bool globalMemoryFPAtomicIsLegal(const GCNSubtarget &Subtarget,
16116+
const AtomicRMWInst *RMW,
16117+
bool HasSystemScope) {
16118+
// The remote/fine-grained access logic is different from the integer
16119+
// atomics. Without AgentScopeFineGrainedRemoteMemoryAtomics support,
16120+
// fine-grained access does not work, even for a device local allocation.
16121+
//
16122+
// With AgentScopeFineGrainedRemoteMemoryAtomics, system scoped device local
16123+
// allocations work.
16124+
if (HasSystemScope) {
16125+
if (Subtarget.supportsAgentScopeFineGrainedRemoteMemoryAtomics() &&
16126+
RMW->hasMetadata("amdgpu.no.remote.memory"))
16127+
return true;
16128+
} else if (Subtarget.supportsAgentScopeFineGrainedRemoteMemoryAtomics())
16129+
return true;
16130+
16131+
if (RMW->hasMetadata("amdgpu.no.fine.grained.memory"))
16132+
return true;
16133+
16134+
// TODO: Auto-upgrade this attribute to the metadata in function body and stop
16135+
// checking it.
16136+
return RMW->getFunction()
16137+
->getFnAttribute("amdgpu-unsafe-fp-atomics")
16138+
.getValueAsBool();
16139+
}
16140+
1611316141
TargetLowering::AtomicExpansionKind
1611416142
SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
1611516143
unsigned AS = RMW->getPointerAddressSpace();
@@ -16253,37 +16281,32 @@ SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
1625316281
Type *Ty = RMW->getType();
1625416282

1625516283
// LDS float and double fmin/fmax were always supported.
16256-
if (AS == AMDGPUAS::LOCAL_ADDRESS && (Ty->isFloatTy() || Ty->isDoubleTy()))
16257-
return AtomicExpansionKind::None;
16258-
16259-
if (unsafeFPAtomicsDisabled(RMW->getFunction()))
16260-
return AtomicExpansionKind::CmpXChg;
16261-
16262-
// Always expand system scope fp atomics.
16263-
if (HasSystemScope)
16264-
return AtomicExpansionKind::CmpXChg;
16284+
if (AS == AMDGPUAS::LOCAL_ADDRESS) {
16285+
return Ty->isFloatTy() || Ty->isDoubleTy() ? AtomicExpansionKind::None
16286+
: AtomicExpansionKind::CmpXChg;
16287+
}
1626516288

16266-
// For flat and global cases:
16267-
// float, double in gfx7. Manual claims denormal support.
16268-
// Removed in gfx8.
16269-
// float, double restored in gfx10.
16270-
// double removed again in gfx11, so only f32 for gfx11/gfx12.
16271-
//
16272-
// For gfx9, gfx90a and gfx940 support f64 for global (same as fadd), but no
16273-
// f32.
16274-
//
16275-
// FIXME: Check scope and fine grained memory
16276-
if (AS == AMDGPUAS::FLAT_ADDRESS) {
16277-
if (Subtarget->hasAtomicFMinFMaxF32FlatInsts() && Ty->isFloatTy())
16278-
return ReportUnsafeHWInst(AtomicExpansionKind::None);
16279-
if (Subtarget->hasAtomicFMinFMaxF64FlatInsts() && Ty->isDoubleTy())
16280-
return ReportUnsafeHWInst(AtomicExpansionKind::None);
16281-
} else if (AMDGPU::isExtendedGlobalAddrSpace(AS) ||
16282-
AS == AMDGPUAS::BUFFER_FAT_POINTER) {
16283-
if (Subtarget->hasAtomicFMinFMaxF32GlobalInsts() && Ty->isFloatTy())
16284-
return ReportUnsafeHWInst(AtomicExpansionKind::None);
16285-
if (Subtarget->hasAtomicFMinFMaxF64GlobalInsts() && Ty->isDoubleTy())
16286-
return ReportUnsafeHWInst(AtomicExpansionKind::None);
16289+
if (globalMemoryFPAtomicIsLegal(*Subtarget, RMW, HasSystemScope)) {
16290+
// For flat and global cases:
16291+
// float, double in gfx7. Manual claims denormal support.
16292+
// Removed in gfx8.
16293+
// float, double restored in gfx10.
16294+
// double removed again in gfx11, so only f32 for gfx11/gfx12.
16295+
//
16296+
// For gfx9, gfx90a and gfx940 support f64 for global (same as fadd), but
16297+
// no f32.
16298+
if (AS == AMDGPUAS::FLAT_ADDRESS) {
16299+
if (Subtarget->hasAtomicFMinFMaxF32FlatInsts() && Ty->isFloatTy())
16300+
return ReportUnsafeHWInst(AtomicExpansionKind::None);
16301+
if (Subtarget->hasAtomicFMinFMaxF64FlatInsts() && Ty->isDoubleTy())
16302+
return ReportUnsafeHWInst(AtomicExpansionKind::None);
16303+
} else if (AMDGPU::isExtendedGlobalAddrSpace(AS) ||
16304+
AS == AMDGPUAS::BUFFER_FAT_POINTER) {
16305+
if (Subtarget->hasAtomicFMinFMaxF32GlobalInsts() && Ty->isFloatTy())
16306+
return ReportUnsafeHWInst(AtomicExpansionKind::None);
16307+
if (Subtarget->hasAtomicFMinFMaxF64GlobalInsts() && Ty->isDoubleTy())
16308+
return ReportUnsafeHWInst(AtomicExpansionKind::None);
16309+
}
1628716310
}
1628816311

1628916312
return AtomicExpansionKind::CmpXChg;

0 commit comments

Comments
 (0)