@@ -16037,26 +16037,15 @@ bool SITargetLowering::isKnownNeverNaNForTargetNode(SDValue Op,
16037
16037
SNaN, Depth);
16038
16038
}
16039
16039
16040
- #if 0
16041
- // FIXME: This should be checked before unsafe fp atomics are enabled
16042
- // Global FP atomic instructions have a hardcoded FP mode and do not support
16043
- // FP32 denormals, and only support v2f16 denormals.
16044
- static bool fpModeMatchesGlobalFPAtomicMode(const AtomicRMWInst *RMW) {
16045
- const fltSemantics &Flt = RMW->getType()->getScalarType()->getFltSemantics();
16046
- auto DenormMode = RMW->getParent()->getParent()->getDenormalMode(Flt);
16047
- if (&Flt == &APFloat::IEEEsingle())
16048
- return DenormMode == DenormalMode::getPreserveSign();
16049
- return DenormMode == DenormalMode::getIEEE();
16050
- }
16051
- #endif
16040
+ // On older subtargets, global FP atomic instructions have a hardcoded FP mode
16041
+ // and do not support FP32 denormals, and only support v2f16/f64 denormals.
16042
+ static bool atomicIgnoresDenormalModeOrFPModeIsFTZ(const AtomicRMWInst *RMW) {
16043
+ if (RMW->hasMetadata("amdgpu.ignore.denormal.mode"))
16044
+ return true;
16052
16045
16053
- // The amdgpu-unsafe-fp-atomics attribute enables generation of unsafe
16054
- // floating point atomic instructions. May generate more efficient code,
16055
- // but may not respect rounding and denormal modes, and may give incorrect
16056
- // results for certain memory destinations.
16057
- bool unsafeFPAtomicsDisabled(Function *F) {
16058
- return F->getFnAttribute("amdgpu-unsafe-fp-atomics").getValueAsString() !=
16059
- "true";
16046
+ const fltSemantics &Flt = RMW->getType()->getScalarType()->getFltSemantics();
16047
+ auto DenormMode = RMW->getFunction()->getDenormalMode(Flt);
16048
+ return DenormMode == DenormalMode::getPreserveSign();
16060
16049
}
16061
16050
16062
16051
static OptimizationRemark emitAtomicRMWLegalRemark(const AtomicRMWInst *RMW) {
@@ -16185,75 +16174,74 @@ SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
16185
16174
return AtomicExpansionKind::CmpXChg;
16186
16175
}
16187
16176
16188
- if (!AMDGPU::isFlatGlobalAddrSpace(AS) &&
16189
- AS != AMDGPUAS::BUFFER_FAT_POINTER)
16190
- return AtomicExpansionKind::CmpXChg;
16191
-
16192
- if (Subtarget->hasGFX940Insts() && (Ty->isFloatTy() || Ty->isDoubleTy()))
16193
- return AtomicExpansionKind::None;
16194
-
16195
- if (AS == AMDGPUAS::FLAT_ADDRESS) {
16196
- // gfx940, gfx12
16197
- // FIXME: Needs to account for no fine-grained memory
16198
- if (Subtarget->hasAtomicFlatPkAdd16Insts() && isHalf2OrBFloat2(Ty))
16199
- return AtomicExpansionKind::None;
16200
- } else if (AMDGPU::isExtendedGlobalAddrSpace(AS)) {
16201
- // gfx90a, gfx940, gfx12
16202
- // FIXME: Needs to account for no fine-grained memory
16203
- if (Subtarget->hasAtomicBufferGlobalPkAddF16Insts() && isHalf2(Ty))
16204
- return AtomicExpansionKind::None;
16205
-
16206
- // gfx940, gfx12
16207
- // FIXME: Needs to account for no fine-grained memory
16208
- if (Subtarget->hasAtomicGlobalPkAddBF16Inst() && isBFloat2(Ty))
16209
- return AtomicExpansionKind::None;
16210
- } else if (AS == AMDGPUAS::BUFFER_FAT_POINTER) {
16211
- // gfx90a, gfx940, gfx12
16212
- // FIXME: Needs to account for no fine-grained memory
16213
- if (Subtarget->hasAtomicBufferGlobalPkAddF16Insts() && isHalf2(Ty))
16214
- return AtomicExpansionKind::None;
16215
-
16216
- // While gfx90a/gfx940 supports v2bf16 for global/flat, it does not for
16217
- // buffer. gfx12 does have the buffer version.
16218
- if (Subtarget->hasAtomicBufferPkAddBF16Inst() && isBFloat2(Ty))
16219
- return AtomicExpansionKind::None;
16220
- }
16221
-
16222
- if (unsafeFPAtomicsDisabled(RMW->getFunction()))
16223
- return AtomicExpansionKind::CmpXChg;
16224
-
16225
- // Always expand system scope fp atomics.
16226
- if (HasSystemScope)
16177
+ // LDS atomics respect the denormal mode from the mode register.
16178
+ //
16179
+ // Traditionally f32 global/buffer memory atomics would unconditionally
16180
+ // flush denormals, but newer targets do not flush. f64/f16/bf16 cases never
16181
+ // flush.
16182
+ //
16183
+ // On targets with flat atomic fadd, denormals would flush depending on
16184
+ // whether the target address resides in LDS or global memory. We consider
16185
+ // this flat-maybe-flush as will-flush.
16186
+ if (Ty->isFloatTy() &&
16187
+ !Subtarget->hasMemoryAtomicFaddF32DenormalSupport() &&
16188
+ !atomicIgnoresDenormalModeOrFPModeIsFTZ(RMW))
16227
16189
return AtomicExpansionKind::CmpXChg;
16228
16190
16229
- // global and flat atomic fadd f64: gfx90a, gfx940.
16230
- if (Subtarget->hasFlatBufferGlobalAtomicFaddF64Inst() && Ty->isDoubleTy())
16231
- return ReportUnsafeHWInst(AtomicExpansionKind::None);
16232
-
16233
- if (AS != AMDGPUAS::FLAT_ADDRESS && Ty->isFloatTy()) {
16234
- // global/buffer atomic fadd f32 no-rtn: gfx908, gfx90a, gfx940, gfx11+.
16235
- if (RMW->use_empty() && Subtarget->hasAtomicFaddNoRtnInsts())
16236
- return ReportUnsafeHWInst(AtomicExpansionKind::None);
16237
- // global/buffer atomic fadd f32 rtn: gfx90a, gfx940, gfx11+.
16238
- if (!RMW->use_empty() && Subtarget->hasAtomicFaddRtnInsts())
16239
- return ReportUnsafeHWInst(AtomicExpansionKind::None);
16240
- }
16191
+ if (globalMemoryFPAtomicIsLegal(*Subtarget, RMW, HasSystemScope)) {
16192
+ if (AS == AMDGPUAS::FLAT_ADDRESS) {
16193
+ // gfx940, gfx12
16194
+ if (Subtarget->hasAtomicFlatPkAdd16Insts() && isHalf2OrBFloat2(Ty))
16195
+ return AtomicExpansionKind::None;
16196
+ } else if (AMDGPU::isExtendedGlobalAddrSpace(AS)) {
16197
+ // gfx90a, gfx940, gfx12
16198
+ if (Subtarget->hasAtomicBufferGlobalPkAddF16Insts() && isHalf2(Ty))
16199
+ return AtomicExpansionKind::None;
16200
+
16201
+ // gfx940, gfx12
16202
+ if (Subtarget->hasAtomicGlobalPkAddBF16Inst() && isBFloat2(Ty))
16203
+ return AtomicExpansionKind::None;
16204
+ } else if (AS == AMDGPUAS::BUFFER_FAT_POINTER) {
16205
+ // gfx90a, gfx940, gfx12
16206
+ if (Subtarget->hasAtomicBufferGlobalPkAddF16Insts() && isHalf2(Ty))
16207
+ return AtomicExpansionKind::None;
16208
+
16209
+ // While gfx90a/gfx940 supports v2bf16 for global/flat, it does not for
16210
+ // buffer. gfx12 does have the buffer version.
16211
+ if (Subtarget->hasAtomicBufferPkAddBF16Inst() && isBFloat2(Ty))
16212
+ return AtomicExpansionKind::None;
16213
+ }
16241
16214
16242
- // flat atomic fadd f32: gfx940, gfx11+.
16243
- if (AS == AMDGPUAS::FLAT_ADDRESS && Ty->isFloatTy()) {
16244
- if (Subtarget->hasFlatAtomicFaddF32Inst())
16215
+ // global and flat atomic fadd f64: gfx90a, gfx940.
16216
+ if (Subtarget->hasFlatBufferGlobalAtomicFaddF64Inst() && Ty->isDoubleTy())
16245
16217
return ReportUnsafeHWInst(AtomicExpansionKind::None);
16246
16218
16247
- // If it is in flat address space, and the type is float, we will try to
16248
- // expand it, if the target supports global and lds atomic fadd. The
16249
- // reason we need that is, in the expansion, we emit the check of address
16250
- // space. If it is in global address space, we emit the global atomic
16251
- // fadd; if it is in shared address space, we emit the LDS atomic fadd.
16252
- if (Subtarget->hasLDSFPAtomicAddF32()) {
16219
+ if (AS != AMDGPUAS::FLAT_ADDRESS && Ty->isFloatTy()) {
16220
+ // global/buffer atomic fadd f32 no-rtn: gfx908, gfx90a, gfx940, gfx11+.
16253
16221
if (RMW->use_empty() && Subtarget->hasAtomicFaddNoRtnInsts())
16254
- return AtomicExpansionKind::Expand;
16222
+ return ReportUnsafeHWInst(AtomicExpansionKind::None);
16223
+ // global/buffer atomic fadd f32 rtn: gfx90a, gfx940, gfx11+.
16255
16224
if (!RMW->use_empty() && Subtarget->hasAtomicFaddRtnInsts())
16256
- return AtomicExpansionKind::Expand;
16225
+ return ReportUnsafeHWInst(AtomicExpansionKind::None);
16226
+ }
16227
+
16228
+ // flat atomic fadd f32: gfx940, gfx11+.
16229
+ if (AS == AMDGPUAS::FLAT_ADDRESS && Ty->isFloatTy()) {
16230
+ if (Subtarget->hasFlatAtomicFaddF32Inst())
16231
+ return ReportUnsafeHWInst(AtomicExpansionKind::None);
16232
+
16233
+ // If it is in flat address space, and the type is float, we will try to
16234
+ // expand it, if the target supports global and lds atomic fadd. The
16235
+ // reason we need that is, in the expansion, we emit the check of
16236
+ // address space. If it is in global address space, we emit the global
16237
+ // atomic fadd; if it is in shared address space, we emit the LDS atomic
16238
+ // fadd.
16239
+ if (Subtarget->hasLDSFPAtomicAddF32()) {
16240
+ if (RMW->use_empty() && Subtarget->hasAtomicFaddNoRtnInsts())
16241
+ return AtomicExpansionKind::Expand;
16242
+ if (!RMW->use_empty() && Subtarget->hasAtomicFaddRtnInsts())
16243
+ return AtomicExpansionKind::Expand;
16244
+ }
16257
16245
}
16258
16246
}
16259
16247
0 commit comments