@@ -16054,26 +16054,21 @@ bool SITargetLowering::isKnownNeverNaNForTargetNode(SDValue Op,
16054
16054
SNaN, Depth);
16055
16055
}
16056
16056
16057
- #if 0
16058
- // FIXME: This should be checked before unsafe fp atomics are enabled
16059
- // Global FP atomic instructions have a hardcoded FP mode and do not support
16060
- // FP32 denormals, and only support v2f16 denormals.
16061
- static bool fpModeMatchesGlobalFPAtomicMode(const AtomicRMWInst *RMW) {
16057
+ // On older subtargets, global FP atomic instructions have a hardcoded FP mode
16058
+ // and do not support FP32 denormals, and only support v2f16/f64 denormals.
16059
+ static bool atomicIgnoresDenormalModeOrFPModeIsFTZ(const AtomicRMWInst *RMW) {
16060
+ if (RMW->hasMetadata("amdgpu.ignore.denormal.mode"))
16061
+ return true;
16062
+
16062
16063
const fltSemantics &Flt = RMW->getType()->getScalarType()->getFltSemantics();
16063
- auto DenormMode = RMW->getParent()->getParent()->getDenormalMode(Flt);
16064
- if (&Flt == &APFloat::IEEEsingle())
16065
- return DenormMode == DenormalMode::getPreserveSign();
16066
- return DenormMode == DenormalMode::getIEEE();
16067
- }
16068
- #endif
16064
+ auto DenormMode = RMW->getFunction()->getDenormalMode(Flt);
16065
+ if (DenormMode == DenormalMode::getPreserveSign())
16066
+ return true;
16069
16067
16070
- // The amdgpu-unsafe-fp-atomics attribute enables generation of unsafe
16071
- // floating point atomic instructions. May generate more efficient code,
16072
- // but may not respect rounding and denormal modes, and may give incorrect
16073
- // results for certain memory destinations.
16074
- bool unsafeFPAtomicsDisabled(Function *F) {
16075
- return F->getFnAttribute("amdgpu-unsafe-fp-atomics").getValueAsString() !=
16076
- "true";
16068
+ // TODO: Remove this.
16069
+ return RMW->getFunction()
16070
+ ->getFnAttribute("amdgpu-unsafe-fp-atomics")
16071
+ .getValueAsBool();
16077
16072
}
16078
16073
16079
16074
static OptimizationRemark emitAtomicRMWLegalRemark(const AtomicRMWInst *RMW) {
@@ -16202,82 +16197,85 @@ SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
16202
16197
return AtomicExpansionKind::CmpXChg;
16203
16198
}
16204
16199
16205
- if (!AMDGPU::isFlatGlobalAddrSpace(AS) &&
16206
- AS != AMDGPUAS::BUFFER_FAT_POINTER)
16207
- return AtomicExpansionKind::CmpXChg;
16208
-
16209
- if (Subtarget->hasGFX940Insts() && (Ty->isFloatTy() || Ty->isDoubleTy()))
16210
- return AtomicExpansionKind::None;
16211
-
16212
- if (AS == AMDGPUAS::FLAT_ADDRESS) {
16213
- // gfx940, gfx12
16214
- // FIXME: Needs to account for no fine-grained memory
16215
- if (Subtarget->hasAtomicFlatPkAdd16Insts() && isHalf2OrBFloat2(Ty))
16216
- return AtomicExpansionKind::None;
16217
- } else if (AMDGPU::isExtendedGlobalAddrSpace(AS)) {
16218
- // gfx90a, gfx940, gfx12
16219
- // FIXME: Needs to account for no fine-grained memory
16220
- if (Subtarget->hasAtomicBufferGlobalPkAddF16Insts() && isHalf2(Ty))
16221
- return AtomicExpansionKind::None;
16222
-
16223
- // gfx940, gfx12
16224
- // FIXME: Needs to account for no fine-grained memory
16225
- if (Subtarget->hasAtomicGlobalPkAddBF16Inst() && isBFloat2(Ty))
16226
- return AtomicExpansionKind::None;
16227
- } else if (AS == AMDGPUAS::BUFFER_FAT_POINTER) {
16228
- // gfx90a, gfx940, gfx12
16229
- // FIXME: Needs to account for no fine-grained memory
16230
- if (Subtarget->hasAtomicBufferGlobalPkAddF16Insts() && isHalf2(Ty))
16231
- return AtomicExpansionKind::None;
16232
-
16233
- // While gfx90a/gfx940 supports v2bf16 for global/flat, it does not for
16234
- // buffer. gfx12 does have the buffer version.
16235
- if (Subtarget->hasAtomicBufferPkAddBF16Inst() && isBFloat2(Ty))
16236
- return AtomicExpansionKind::None;
16237
- }
16238
-
16239
- if (unsafeFPAtomicsDisabled(RMW->getFunction()))
16240
- return AtomicExpansionKind::CmpXChg;
16241
-
16242
- // Always expand system scope fp atomics.
16243
- if (HasSystemScope)
16200
+ // LDS atomics respect the denormal mode from the mode register.
16201
+ //
16202
+ // Traditionally f32 global/buffer memory atomics would unconditionally
16203
+ // flush denormals, but newer targets do not flush. f64/f16/bf16 cases never
16204
+ // flush.
16205
+ //
16206
+ // On targets with flat atomic fadd, denormals would flush depending on
16207
+ // whether the target address resides in LDS or global memory. We consider
16208
+ // this flat-maybe-flush as will-flush.
16209
+ if (Ty->isFloatTy() &&
16210
+ !Subtarget->hasMemoryAtomicFaddF32DenormalSupport() &&
16211
+ !atomicIgnoresDenormalModeOrFPModeIsFTZ(RMW))
16244
16212
return AtomicExpansionKind::CmpXChg;
16245
16213
16246
- // global and flat atomic fadd f64: gfx90a, gfx940.
16247
- if (Subtarget->hasFlatBufferGlobalAtomicFaddF64Inst() && Ty->isDoubleTy())
16248
- return ReportUnsafeHWInst(AtomicExpansionKind::None);
16214
+ // FIXME: These ReportUnsafeHWInsts are imprecise. Some of these cases are
16215
+ // safe. The message phrasing also should be better.
16216
+ if (globalMemoryFPAtomicIsLegal(*Subtarget, RMW, HasSystemScope)) {
16217
+ if (AS == AMDGPUAS::FLAT_ADDRESS) {
16218
+ // gfx940, gfx12
16219
+ if (Subtarget->hasAtomicFlatPkAdd16Insts() && isHalf2OrBFloat2(Ty))
16220
+ return ReportUnsafeHWInst(AtomicExpansionKind::None);
16221
+ } else if (AMDGPU::isExtendedGlobalAddrSpace(AS)) {
16222
+ // gfx90a, gfx940, gfx12
16223
+ if (Subtarget->hasAtomicBufferGlobalPkAddF16Insts() && isHalf2(Ty))
16224
+ return ReportUnsafeHWInst(AtomicExpansionKind::None);
16249
16225
16250
- if (AS != AMDGPUAS::FLAT_ADDRESS) {
16251
- if (Ty->isFloatTy()) {
16252
- // global/buffer atomic fadd f32 no-rtn: gfx908, gfx90a, gfx940, gfx11+.
16253
- if (RMW->use_empty() && Subtarget->hasAtomicFaddNoRtnInsts())
16226
+ // gfx940, gfx12
16227
+ if (Subtarget->hasAtomicGlobalPkAddBF16Inst() && isBFloat2(Ty))
16254
16228
return ReportUnsafeHWInst(AtomicExpansionKind::None);
16255
- // global/buffer atomic fadd f32 rtn: gfx90a, gfx940, gfx11+.
16256
- if (!RMW->use_empty() && Subtarget->hasAtomicFaddRtnInsts())
16229
+ } else if (AS == AMDGPUAS::BUFFER_FAT_POINTER) {
16230
+ // gfx90a, gfx940, gfx12
16231
+ if (Subtarget->hasAtomicBufferGlobalPkAddF16Insts() && isHalf2(Ty))
16257
16232
return ReportUnsafeHWInst(AtomicExpansionKind::None);
16258
- } else {
16259
- // gfx908
16260
- if (RMW->use_empty() &&
16261
- Subtarget->hasAtomicBufferGlobalPkAddF16NoRtnInsts() && isHalf2(Ty))
16233
+
16234
+ // While gfx90a/gfx940 supports v2bf16 for global/flat, it does not for
16235
+ // buffer. gfx12 does have the buffer version.
16236
+ if (Subtarget->hasAtomicBufferPkAddBF16Inst() && isBFloat2(Ty))
16262
16237
return ReportUnsafeHWInst(AtomicExpansionKind::None);
16263
16238
}
16264
- }
16265
16239
16266
- // flat atomic fadd f32: gfx940, gfx11+.
16267
- if (AS == AMDGPUAS::FLAT_ADDRESS && Ty->isFloatTy()) {
16268
- if (Subtarget->hasFlatAtomicFaddF32Inst())
16240
+ // global and flat atomic fadd f64: gfx90a, gfx940.
16241
+ if (Subtarget->hasFlatBufferGlobalAtomicFaddF64Inst() && Ty->isDoubleTy())
16269
16242
return ReportUnsafeHWInst(AtomicExpansionKind::None);
16270
16243
16271
- // If it is in flat address space, and the type is float, we will try to
16272
- // expand it, if the target supports global and lds atomic fadd. The
16273
- // reason we need that is, in the expansion, we emit the check of address
16274
- // space. If it is in global address space, we emit the global atomic
16275
- // fadd; if it is in shared address space, we emit the LDS atomic fadd.
16276
- if (Subtarget->hasLDSFPAtomicAddF32()) {
16277
- if (RMW->use_empty() && Subtarget->hasAtomicFaddNoRtnInsts())
16278
- return AtomicExpansionKind::Expand;
16279
- if (!RMW->use_empty() && Subtarget->hasAtomicFaddRtnInsts())
16280
- return AtomicExpansionKind::Expand;
16244
+ if (AS != AMDGPUAS::FLAT_ADDRESS) {
16245
+ if (Ty->isFloatTy()) {
16246
+ // global/buffer atomic fadd f32 no-rtn: gfx908, gfx90a, gfx940,
16247
+ // gfx11+.
16248
+ if (RMW->use_empty() && Subtarget->hasAtomicFaddNoRtnInsts())
16249
+ return ReportUnsafeHWInst(AtomicExpansionKind::None);
16250
+ // global/buffer atomic fadd f32 rtn: gfx90a, gfx940, gfx11+.
16251
+ if (!RMW->use_empty() && Subtarget->hasAtomicFaddRtnInsts())
16252
+ return ReportUnsafeHWInst(AtomicExpansionKind::None);
16253
+ } else {
16254
+ // gfx908
16255
+ if (RMW->use_empty() &&
16256
+ Subtarget->hasAtomicBufferGlobalPkAddF16NoRtnInsts() &&
16257
+ isHalf2(Ty))
16258
+ return ReportUnsafeHWInst(AtomicExpansionKind::None);
16259
+ }
16260
+ }
16261
+
16262
+ // flat atomic fadd f32: gfx940, gfx11+.
16263
+ if (AS == AMDGPUAS::FLAT_ADDRESS && Ty->isFloatTy()) {
16264
+ if (Subtarget->hasFlatAtomicFaddF32Inst())
16265
+ return ReportUnsafeHWInst(AtomicExpansionKind::None);
16266
+
16267
+ // If it is in flat address space, and the type is float, we will try to
16268
+ // expand it, if the target supports global and lds atomic fadd. The
16269
+ // reason we need that is, in the expansion, we emit the check of
16270
+ // address space. If it is in global address space, we emit the global
16271
+ // atomic fadd; if it is in shared address space, we emit the LDS atomic
16272
+ // fadd.
16273
+ if (Subtarget->hasLDSFPAtomicAddF32()) {
16274
+ if (RMW->use_empty() && Subtarget->hasAtomicFaddNoRtnInsts())
16275
+ return AtomicExpansionKind::Expand;
16276
+ if (!RMW->use_empty() && Subtarget->hasAtomicFaddRtnInsts())
16277
+ return AtomicExpansionKind::Expand;
16278
+ }
16281
16279
}
16282
16280
}
16283
16281
0 commit comments