
AMDGPU: Handle new atomicrmw metadata for fadd case #96760

Merged: 1 commit, Aug 2, 2024

166 changes: 82 additions & 84 deletions llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -16062,26 +16062,21 @@ bool SITargetLowering::isKnownNeverNaNForTargetNode(SDValue Op,
SNaN, Depth);
}

-#if 0
-// FIXME: This should be checked before unsafe fp atomics are enabled
-// Global FP atomic instructions have a hardcoded FP mode and do not support
-// FP32 denormals, and only support v2f16 denormals.
-static bool fpModeMatchesGlobalFPAtomicMode(const AtomicRMWInst *RMW) {
+// On older subtargets, global FP atomic instructions have a hardcoded FP mode
+// and do not support FP32 denormals, and only support v2f16/f64 denormals.
+static bool atomicIgnoresDenormalModeOrFPModeIsFTZ(const AtomicRMWInst *RMW) {
+  if (RMW->hasMetadata("amdgpu.ignore.denormal.mode"))
+    return true;
+
   const fltSemantics &Flt = RMW->getType()->getScalarType()->getFltSemantics();
-  auto DenormMode = RMW->getParent()->getParent()->getDenormalMode(Flt);
-  if (&Flt == &APFloat::IEEEsingle())
-    return DenormMode == DenormalMode::getPreserveSign();
-  return DenormMode == DenormalMode::getIEEE();
-}
-#endif
+  auto DenormMode = RMW->getFunction()->getDenormalMode(Flt);
+  if (DenormMode == DenormalMode::getPreserveSign())
+    return true;

-// The amdgpu-unsafe-fp-atomics attribute enables generation of unsafe
-// floating point atomic instructions. May generate more efficient code,
-// but may not respect rounding and denormal modes, and may give incorrect
-// results for certain memory destinations.
-bool unsafeFPAtomicsDisabled(Function *F) {
-  return F->getFnAttribute("amdgpu-unsafe-fp-atomics").getValueAsString() !=
-         "true";
+  // TODO: Remove this.
Contributor: Can you elaborate (in a comment) on why/when this should be removed?

Contributor Author: The patch to remove this is already waiting for review further up the stack.

+  return RMW->getFunction()
+      ->getFnAttribute("amdgpu-unsafe-fp-atomics")
+      .getValueAsBool();
}

static OptimizationRemark emitAtomicRMWLegalRemark(const AtomicRMWInst *RMW) {
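
In IR terms, the renamed helper answers true in two cases, sketched below (a minimal sketch; the function names and bodies are illustrative, not taken from this patch): an atomicrmw explicitly annotated with !amdgpu.ignore.denormal.mode, or any FP atomic in a function whose f32 denormal mode is preserve-sign, i.e. flush-to-zero.

define void @ignore_denormal_mode(ptr addrspace(1) %p) {
  ; Annotated: the atomic is allowed to ignore the denormal mode.
  %r0 = atomicrmw fadd ptr addrspace(1) %p, float 4.0 monotonic, !amdgpu.ignore.denormal.mode !0
  ret void
}

define void @ftz_function(ptr addrspace(1) %p) #0 {
  ; Unannotated, but the enclosing function's f32 denormal mode is FTZ.
  %r1 = atomicrmw fadd ptr addrspace(1) %p, float 4.0 monotonic
  ret void
}

attributes #0 = { "denormal-fp-math-f32"="preserve-sign,preserve-sign" }
!0 = !{}
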
@@ -16210,82 +16205,85 @@ SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
return AtomicExpansionKind::CmpXChg;
}

if (!AMDGPU::isFlatGlobalAddrSpace(AS) &&
AS != AMDGPUAS::BUFFER_FAT_POINTER)
return AtomicExpansionKind::CmpXChg;

if (Subtarget->hasGFX940Insts() && (Ty->isFloatTy() || Ty->isDoubleTy()))
return AtomicExpansionKind::None;

if (AS == AMDGPUAS::FLAT_ADDRESS) {
// gfx940, gfx12
// FIXME: Needs to account for no fine-grained memory
if (Subtarget->hasAtomicFlatPkAdd16Insts() && isHalf2OrBFloat2(Ty))
return AtomicExpansionKind::None;
} else if (AMDGPU::isExtendedGlobalAddrSpace(AS)) {
// gfx90a, gfx940, gfx12
// FIXME: Needs to account for no fine-grained memory
if (Subtarget->hasAtomicBufferGlobalPkAddF16Insts() && isHalf2(Ty))
return AtomicExpansionKind::None;

// gfx940, gfx12
// FIXME: Needs to account for no fine-grained memory
if (Subtarget->hasAtomicGlobalPkAddBF16Inst() && isBFloat2(Ty))
return AtomicExpansionKind::None;
} else if (AS == AMDGPUAS::BUFFER_FAT_POINTER) {
// gfx90a, gfx940, gfx12
// FIXME: Needs to account for no fine-grained memory
if (Subtarget->hasAtomicBufferGlobalPkAddF16Insts() && isHalf2(Ty))
return AtomicExpansionKind::None;

// While gfx90a/gfx940 supports v2bf16 for global/flat, it does not for
// buffer. gfx12 does have the buffer version.
if (Subtarget->hasAtomicBufferPkAddBF16Inst() && isBFloat2(Ty))
return AtomicExpansionKind::None;
}

if (unsafeFPAtomicsDisabled(RMW->getFunction()))
return AtomicExpansionKind::CmpXChg;

// Always expand system scope fp atomics.
if (HasSystemScope)
// LDS atomics respect the denormal mode from the mode register.
//
// Traditionally f32 global/buffer memory atomics would unconditionally
// flush denormals, but newer targets do not flush. f64/f16/bf16 cases never
// flush.
//
// On targets with flat atomic fadd, denormals would flush depending on
// whether the target address resides in LDS or global memory. We consider
// this flat-maybe-flush as will-flush.
if (Ty->isFloatTy() &&
!Subtarget->hasMemoryAtomicFaddF32DenormalSupport() &&
!atomicIgnoresDenormalModeOrFPModeIsFTZ(RMW))
return AtomicExpansionKind::CmpXChg;

// global and flat atomic fadd f64: gfx90a, gfx940.
if (Subtarget->hasFlatBufferGlobalAtomicFaddF64Inst() && Ty->isDoubleTy())
return ReportUnsafeHWInst(AtomicExpansionKind::None);
// FIXME: These ReportUnsafeHWInsts are imprecise. Some of these cases are
// safe. The message phrasing also should be better.
if (globalMemoryFPAtomicIsLegal(*Subtarget, RMW, HasSystemScope)) {
if (AS == AMDGPUAS::FLAT_ADDRESS) {
// gfx940, gfx12
if (Subtarget->hasAtomicFlatPkAdd16Insts() && isHalf2OrBFloat2(Ty))
return ReportUnsafeHWInst(AtomicExpansionKind::None);
} else if (AMDGPU::isExtendedGlobalAddrSpace(AS)) {
// gfx90a, gfx940, gfx12
if (Subtarget->hasAtomicBufferGlobalPkAddF16Insts() && isHalf2(Ty))
return ReportUnsafeHWInst(AtomicExpansionKind::None);

if (AS != AMDGPUAS::FLAT_ADDRESS) {
if (Ty->isFloatTy()) {
// global/buffer atomic fadd f32 no-rtn: gfx908, gfx90a, gfx940, gfx11+.
if (RMW->use_empty() && Subtarget->hasAtomicFaddNoRtnInsts())
// gfx940, gfx12
if (Subtarget->hasAtomicGlobalPkAddBF16Inst() && isBFloat2(Ty))
return ReportUnsafeHWInst(AtomicExpansionKind::None);
// global/buffer atomic fadd f32 rtn: gfx90a, gfx940, gfx11+.
if (!RMW->use_empty() && Subtarget->hasAtomicFaddRtnInsts())
} else if (AS == AMDGPUAS::BUFFER_FAT_POINTER) {
// gfx90a, gfx940, gfx12
if (Subtarget->hasAtomicBufferGlobalPkAddF16Insts() && isHalf2(Ty))
return ReportUnsafeHWInst(AtomicExpansionKind::None);
} else {
// gfx908
if (RMW->use_empty() &&
Subtarget->hasAtomicBufferGlobalPkAddF16NoRtnInsts() && isHalf2(Ty))

// While gfx90a/gfx940 supports v2bf16 for global/flat, it does not for
// buffer. gfx12 does have the buffer version.
if (Subtarget->hasAtomicBufferPkAddBF16Inst() && isBFloat2(Ty))
return ReportUnsafeHWInst(AtomicExpansionKind::None);
}
}

// flat atomic fadd f32: gfx940, gfx11+.
if (AS == AMDGPUAS::FLAT_ADDRESS && Ty->isFloatTy()) {
if (Subtarget->hasFlatAtomicFaddF32Inst())
// global and flat atomic fadd f64: gfx90a, gfx940.
if (Subtarget->hasFlatBufferGlobalAtomicFaddF64Inst() && Ty->isDoubleTy())
return ReportUnsafeHWInst(AtomicExpansionKind::None);

// If it is in flat address space, and the type is float, we will try to
// expand it, if the target supports global and lds atomic fadd. The
// reason we need that is, in the expansion, we emit the check of address
// space. If it is in global address space, we emit the global atomic
// fadd; if it is in shared address space, we emit the LDS atomic fadd.
if (Subtarget->hasLDSFPAtomicAddF32()) {
if (RMW->use_empty() && Subtarget->hasAtomicFaddNoRtnInsts())
return AtomicExpansionKind::Expand;
if (!RMW->use_empty() && Subtarget->hasAtomicFaddRtnInsts())
return AtomicExpansionKind::Expand;
if (AS != AMDGPUAS::FLAT_ADDRESS) {
if (Ty->isFloatTy()) {
// global/buffer atomic fadd f32 no-rtn: gfx908, gfx90a, gfx940,
// gfx11+.
if (RMW->use_empty() && Subtarget->hasAtomicFaddNoRtnInsts())
return ReportUnsafeHWInst(AtomicExpansionKind::None);
// global/buffer atomic fadd f32 rtn: gfx90a, gfx940, gfx11+.
if (!RMW->use_empty() && Subtarget->hasAtomicFaddRtnInsts())
Comment on lines +16256 to +16259
Contributor: nit: use || and only one if

return ReportUnsafeHWInst(AtomicExpansionKind::None);
} else {
// gfx908
if (RMW->use_empty() &&
Subtarget->hasAtomicBufferGlobalPkAddF16NoRtnInsts() &&
isHalf2(Ty))
return ReportUnsafeHWInst(AtomicExpansionKind::None);
}
Comment on lines +16261 to +16267
Contributor: else if ?
Contributor Author: It will just be undone later up the stack.

}

// flat atomic fadd f32: gfx940, gfx11+.
if (AS == AMDGPUAS::FLAT_ADDRESS && Ty->isFloatTy()) {
if (Subtarget->hasFlatAtomicFaddF32Inst())
return ReportUnsafeHWInst(AtomicExpansionKind::None);

// If it is in flat address space, and the type is float, we will try to
// expand it, if the target supports global and lds atomic fadd. The
// reason we need that is, in the expansion, we emit the check of
// address space. If it is in global address space, we emit the global
// atomic fadd; if it is in shared address space, we emit the LDS atomic
// fadd.
if (Subtarget->hasLDSFPAtomicAddF32()) {
if (RMW->use_empty() && Subtarget->hasAtomicFaddNoRtnInsts())
return AtomicExpansionKind::Expand;
if (!RMW->use_empty() && Subtarget->hasAtomicFaddRtnInsts())
Comment on lines +16282 to +16284
Contributor: nit: use || and only one if
Contributor Author: I had that at one point but decided it's less readable as all the conditions get added.

return AtomicExpansionKind::Expand;
}
}
}
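
The AtomicExpansionKind::Expand path described in the comment above emits a runtime address-space test. A sketch of the resulting IR shape, assuming the expansion uses the llvm.amdgcn.is.shared intrinsic (block and value names are illustrative, not copied from the expansion code):

define float @flat_fadd_expansion_sketch(ptr %ptr, float %val) {
entry:
  ; Dispatch on whether the flat pointer refers to LDS or global memory.
  %is.shared = call i1 @llvm.amdgcn.is.shared(ptr %ptr)
  br i1 %is.shared, label %lds, label %global

lds:
  %lds.ptr = addrspacecast ptr %ptr to ptr addrspace(3)
  %lds.ret = atomicrmw fadd ptr addrspace(3) %lds.ptr, float %val monotonic
  br label %done

global:
  %global.ptr = addrspacecast ptr %ptr to ptr addrspace(1)
  %global.ret = atomicrmw fadd ptr addrspace(1) %global.ptr, float %val monotonic
  br label %done

done:
  %ret = phi float [ %lds.ret, %lds ], [ %global.ret, %global ]
  ret float %ret
}

declare i1 @llvm.amdgcn.is.shared(ptr)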

10 changes: 5 additions & 5 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/flat-atomic-fadd.f32.ll
@@ -57,7 +57,7 @@ define amdgpu_ps float @flat_atomic_fadd_f32_rtn_intrinsic(ptr %ptr, float %data
ret float %ret
}

-define amdgpu_ps void @flat_atomic_fadd_f32_no_rtn_atomicrmw(ptr %ptr, float %data) #0 {
+define amdgpu_ps void @flat_atomic_fadd_f32_no_rtn_atomicrmw(ptr %ptr, float %data) {
; GFX940-LABEL: name: flat_atomic_fadd_f32_no_rtn_atomicrmw
; GFX940: bb.1 (%ir-block.0):
; GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2
@@ -79,11 +79,11 @@ define amdgpu_ps void @flat_atomic_fadd_f32_no_rtn_atomicrmw(ptr %ptr, float %da
; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX11-NEXT: FLAT_ATOMIC_ADD_F32 [[REG_SEQUENCE]], [[COPY2]], 0, 0, implicit $exec, implicit $flat_scr :: (load store syncscope("wavefront") monotonic (s32) on %ir.ptr)
; GFX11-NEXT: S_ENDPGM 0
-  %ret = atomicrmw fadd ptr %ptr, float %data syncscope("wavefront") monotonic
+  %ret = atomicrmw fadd ptr %ptr, float %data syncscope("wavefront") monotonic, !amdgpu.no.fine.grained.memory !0
ret void
}

-define amdgpu_ps float @flat_atomic_fadd_f32_rtn_atomicrmw(ptr %ptr, float %data) #0 {
+define amdgpu_ps float @flat_atomic_fadd_f32_rtn_atomicrmw(ptr %ptr, float %data) {
; GFX940-LABEL: name: flat_atomic_fadd_f32_rtn_atomicrmw
; GFX940: bb.1 (%ir-block.0):
; GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2
@@ -107,10 +107,10 @@ define amdgpu_ps float @flat_atomic_fadd_f32_rtn_atomicrmw(ptr %ptr, float %data
; GFX11-NEXT: [[FLAT_ATOMIC_ADD_F32_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_F32_RTN [[REG_SEQUENCE]], [[COPY2]], 0, 1, implicit $exec, implicit $flat_scr :: (load store syncscope("wavefront") monotonic (s32) on %ir.ptr)
; GFX11-NEXT: $vgpr0 = COPY [[FLAT_ATOMIC_ADD_F32_RTN]]
; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
-  %ret = atomicrmw fadd ptr %ptr, float %data syncscope("wavefront") monotonic
+  %ret = atomicrmw fadd ptr %ptr, float %data syncscope("wavefront") monotonic, !amdgpu.no.fine.grained.memory !0
ret float %ret
}

declare float @llvm.amdgcn.flat.atomic.fadd.f32.p1.f32(ptr, float)

attributes #0 = {"amdgpu-unsafe-fp-atomics"="true" }
!0 = !{}
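
The test updates above show the migration this patch builds on: the function-wide amdgpu-unsafe-fp-atomics attribute gives way to per-instruction metadata. A minimal before/after sketch (function names illustrative):

; Before: function-wide opt-in via attribute.
define void @old_style(ptr %p) #0 {
  %r0 = atomicrmw fadd ptr %p, float 1.0 monotonic
  ret void
}

; After: per-instruction annotation.
define void @new_style(ptr %p) {
  %r1 = atomicrmw fadd ptr %p, float 1.0 monotonic, !amdgpu.no.fine.grained.memory !0
  ret void
}

attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" }
!0 = !{}
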
10 changes: 5 additions & 5 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/flat-atomic-fadd.f64.ll
@@ -42,7 +42,7 @@ define amdgpu_ps double @flat_atomic_fadd_f64_rtn_intrinsic(ptr %ptr, double %da
ret double %ret
}

-define amdgpu_ps void @flat_atomic_fadd_f64_no_rtn_atomicrmw(ptr %ptr, double %data) #0 {
+define amdgpu_ps void @flat_atomic_fadd_f64_no_rtn_atomicrmw(ptr %ptr, double %data) {
; GFX90A_GFX940-LABEL: name: flat_atomic_fadd_f64_no_rtn_atomicrmw
; GFX90A_GFX940: bb.1 (%ir-block.0):
; GFX90A_GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
@@ -55,11 +55,11 @@ define amdgpu_ps void @flat_atomic_fadd_f64_no_rtn_atomicrmw(ptr %ptr, double %d
; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1
; GFX90A_GFX940-NEXT: FLAT_ATOMIC_ADD_F64 [[REG_SEQUENCE]], [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (load store syncscope("wavefront") monotonic (s64) on %ir.ptr)
; GFX90A_GFX940-NEXT: S_ENDPGM 0
-  %ret = atomicrmw fadd ptr %ptr, double %data syncscope("wavefront") monotonic
+  %ret = atomicrmw fadd ptr %ptr, double %data syncscope("wavefront") monotonic, !amdgpu.no.fine.grained.memory !0
ret void
}

-define amdgpu_ps double @flat_atomic_fadd_f64_rtn_atomicrmw(ptr %ptr, double %data) #0 {
+define amdgpu_ps double @flat_atomic_fadd_f64_rtn_atomicrmw(ptr %ptr, double %data) {
; GFX90A_GFX940-LABEL: name: flat_atomic_fadd_f64_rtn_atomicrmw
; GFX90A_GFX940: bb.1 (%ir-block.0):
; GFX90A_GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
@@ -78,10 +78,10 @@ define amdgpu_ps double @flat_atomic_fadd_f64_rtn_atomicrmw(ptr %ptr, double %da
; GFX90A_GFX940-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec
; GFX90A_GFX940-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]]
; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1
-  %ret = atomicrmw fadd ptr %ptr, double %data syncscope("wavefront") monotonic
+  %ret = atomicrmw fadd ptr %ptr, double %data syncscope("wavefront") monotonic, !amdgpu.no.fine.grained.memory !0
ret double %ret
}

declare double @llvm.amdgcn.flat.atomic.fadd.f64.p1.f64(ptr, double)

attributes #0 = {"amdgpu-unsafe-fp-atomics"="true" }
!0 = !{}
8 changes: 5 additions & 3 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/fp-atomics-gfx940.ll
@@ -36,7 +36,7 @@ define amdgpu_kernel void @flat_atomic_fadd_f32_noret_pat(ptr %ptr) {
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_inv sc0 sc1
; GFX940-NEXT: s_endpgm
-  %ret = atomicrmw fadd ptr %ptr, float 4.0 seq_cst
+  %ret = atomicrmw fadd ptr %ptr, float 4.0 seq_cst, !amdgpu.no.remote.memory !0
ret void
}

@@ -52,7 +52,7 @@ define amdgpu_kernel void @flat_atomic_fadd_f32_noret_pat_ieee(ptr %ptr) #0 {
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_inv sc0 sc1
; GFX940-NEXT: s_endpgm
-  %ret = atomicrmw fadd ptr %ptr, float 4.0 seq_cst
+  %ret = atomicrmw fadd ptr %ptr, float 4.0 seq_cst, !amdgpu.no.remote.memory !0
ret void
}

@@ -77,7 +77,7 @@ define float @flat_atomic_fadd_f32_rtn_pat(ptr %ptr, float %data) {
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_inv sc0 sc1
; GFX940-NEXT: s_setpc_b64 s[30:31]
-  %ret = atomicrmw fadd ptr %ptr, float 4.0 seq_cst
+  %ret = atomicrmw fadd ptr %ptr, float 4.0 seq_cst, !amdgpu.no.remote.memory !0
ret float %ret
}

@@ -287,3 +287,5 @@ define void @flat_atomic_fadd_noret_v2f16_agent_offset(ptr %ptr, <2 x half> %val
}

attributes #0 = { "denormal-fp-math-f32"="ieee,ieee" }

+!0 = !{}
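
Note that the gfx940 tests opt in with !amdgpu.no.remote.memory rather than !amdgpu.no.fine.grained.memory. Summarizing the AMDGPUUsage description of these annotations (treat this wording as an approximation): no.remote.memory asserts the accessed address is not in remote memory reached over a bus, such as host or peer-device memory, while no.fine.grained.memory asserts it is not in fine-grained memory that other agents may access concurrently. An atomic may carry both, as in this sketch:

define float @both_annotations(ptr %p, float %v) {
  %r = atomicrmw fadd ptr %p, float %v seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
  ret float %r
}
!0 = !{}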