@@ -16110,6 +16110,34 @@ static bool isBFloat2(Type *Ty) {
   return VT && VT->getNumElements() == 2 && VT->getElementType()->isBFloatTy();
 }
 
+/// \returns true if it's valid to emit a native instruction for \p RMW, based
+/// on the properties of the target memory.
+static bool globalMemoryFPAtomicIsLegal(const GCNSubtarget &Subtarget,
+                                        const AtomicRMWInst *RMW,
+                                        bool HasSystemScope) {
+  // The remote/fine-grained access logic is different from the integer
+  // atomics. Without AgentScopeFineGrainedRemoteMemoryAtomics support,
+  // fine-grained access does not work, even for a device local allocation.
+  //
+  // With AgentScopeFineGrainedRemoteMemoryAtomics, system scoped device local
+  // allocations work.
+  if (HasSystemScope) {
+    if (Subtarget.supportsAgentScopeFineGrainedRemoteMemoryAtomics() &&
+        RMW->hasMetadata("amdgpu.no.remote.memory"))
+      return true;
+  } else if (Subtarget.supportsAgentScopeFineGrainedRemoteMemoryAtomics())
+    return true;
+
+  if (RMW->hasMetadata("amdgpu.no.fine.grained.memory"))
+    return true;
+
+  // TODO: Auto-upgrade this attribute to the metadata in function body and
+  // stop checking it.
+  return RMW->getFunction()
+      ->getFnAttribute("amdgpu-unsafe-fp-atomics")
+      .getValueAsBool();
+}
+
 TargetLowering::AtomicExpansionKind
 SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
   unsigned AS = RMW->getPointerAddressSpace();
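For reference, the two metadata annotations and the legacy function attribute that globalMemoryFPAtomicIsLegal checks are attached at the LLVM IR level. A minimal hand-written sketch, not IR from this patch; the pointer, operand, and alignment are illustrative:

  %old = atomicrmw fmax ptr addrspace(1) %ptr, double %val syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0

  attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" } ; legacy form the TODO above wants to retire

  !0 = !{}

With a non-system scope such as syncscope("agent"), supportsAgentScopeFineGrainedRemoteMemoryAtomics() alone makes the helper return true; at system scope that path additionally requires the amdgpu.no.remote.memory annotation, while amdgpu.no.fine.grained.memory or the attribute suffices at any scope.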
@@ -16253,37 +16281,32 @@ SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
   Type *Ty = RMW->getType();
 
   // LDS float and double fmin/fmax were always supported.
-  if (AS == AMDGPUAS::LOCAL_ADDRESS && (Ty->isFloatTy() || Ty->isDoubleTy()))
-    return AtomicExpansionKind::None;
-
-  if (unsafeFPAtomicsDisabled(RMW->getFunction()))
-    return AtomicExpansionKind::CmpXChg;
-
-  // Always expand system scope fp atomics.
-  if (HasSystemScope)
-    return AtomicExpansionKind::CmpXChg;
+  if (AS == AMDGPUAS::LOCAL_ADDRESS) {
+    return Ty->isFloatTy() || Ty->isDoubleTy() ? AtomicExpansionKind::None
+                                               : AtomicExpansionKind::CmpXChg;
+  }
 
-  // For flat and global cases:
-  // float, double in gfx7. Manual claims denormal support.
-  // Removed in gfx8.
-  // float, double restored in gfx10.
-  // double removed again in gfx11, so only f32 for gfx11/gfx12.
-  //
-  // For gfx9, gfx90a and gfx940 support f64 for global (same as fadd), but no
-  // f32.
-  //
-  // FIXME: Check scope and fine grained memory
-  if (AS == AMDGPUAS::FLAT_ADDRESS) {
-    if (Subtarget->hasAtomicFMinFMaxF32FlatInsts() && Ty->isFloatTy())
-      return ReportUnsafeHWInst(AtomicExpansionKind::None);
-    if (Subtarget->hasAtomicFMinFMaxF64FlatInsts() && Ty->isDoubleTy())
-      return ReportUnsafeHWInst(AtomicExpansionKind::None);
-  } else if (AMDGPU::isExtendedGlobalAddrSpace(AS) ||
-             AS == AMDGPUAS::BUFFER_FAT_POINTER) {
-    if (Subtarget->hasAtomicFMinFMaxF32GlobalInsts() && Ty->isFloatTy())
-      return ReportUnsafeHWInst(AtomicExpansionKind::None);
-    if (Subtarget->hasAtomicFMinFMaxF64GlobalInsts() && Ty->isDoubleTy())
-      return ReportUnsafeHWInst(AtomicExpansionKind::None);
+  if (globalMemoryFPAtomicIsLegal(*Subtarget, RMW, HasSystemScope)) {
+    // For flat and global cases:
+    // float, double in gfx7. Manual claims denormal support.
+    // Removed in gfx8.
+    // float, double restored in gfx10.
+    // double removed again in gfx11, so only f32 for gfx11/gfx12.
+    //
+    // For gfx9, gfx90a and gfx940 support f64 for global (same as fadd), but
+    // no f32.
+    if (AS == AMDGPUAS::FLAT_ADDRESS) {
+      if (Subtarget->hasAtomicFMinFMaxF32FlatInsts() && Ty->isFloatTy())
+        return ReportUnsafeHWInst(AtomicExpansionKind::None);
+      if (Subtarget->hasAtomicFMinFMaxF64FlatInsts() && Ty->isDoubleTy())
+        return ReportUnsafeHWInst(AtomicExpansionKind::None);
+    } else if (AMDGPU::isExtendedGlobalAddrSpace(AS) ||
+               AS == AMDGPUAS::BUFFER_FAT_POINTER) {
+      if (Subtarget->hasAtomicFMinFMaxF32GlobalInsts() && Ty->isFloatTy())
+        return ReportUnsafeHWInst(AtomicExpansionKind::None);
+      if (Subtarget->hasAtomicFMinFMaxF64GlobalInsts() && Ty->isDoubleTy())
+        return ReportUnsafeHWInst(AtomicExpansionKind::None);
+    }
   }
 
   return AtomicExpansionKind::CmpXChg;
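When none of the legality conditions hold, the function falls through to AtomicExpansionKind::CmpXChg, and AtomicExpandPass later rewrites the atomicrmw as a compare-exchange loop. Roughly, for an fmax on double in global memory, the generic expansion looks like the following sketch (block and value names are illustrative, not output of this patch; the pass goes through an integer type because cmpxchg does not take floats):

entry:
  %init = load double, ptr addrspace(1) %ptr, align 8
  br label %loop

loop:
  %loaded = phi double [ %init, %entry ], [ %new, %loop ]
  ; atomicrmw fmax is defined with llvm.maxnum semantics
  %max = call double @llvm.maxnum.f64(double %loaded, double %val)
  %max.i = bitcast double %max to i64
  %loaded.i = bitcast double %loaded to i64
  %pair = cmpxchg ptr addrspace(1) %ptr, i64 %loaded.i, i64 %max.i seq_cst seq_cst, align 8
  %new.i = extractvalue { i64, i1 } %pair, 0
  %ok = extractvalue { i64, i1 } %pair, 1
  %new = bitcast i64 %new.i to double
  br i1 %ok, label %done, label %loop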