[AMDGPU] CodeGen for SMEM instructions #75579

Merged (2 commits) on Dec 15, 2023
12 changes: 12 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -828,6 +828,12 @@ def FeaturePseudoScalarTrans : SubtargetFeature<"pseudo-scalar-trans",
"Has Pseudo Scalar Transcendental instructions"
>;

def FeatureHasRestrictedSOffset : SubtargetFeature<"restricted-soffset",
"HasRestrictedSOffset",
"true",
"Has restricted SOffset (immediate not supported)."
>;

//===------------------------------------------------------------===//
// Subtarget Features (options and debugging)
//===------------------------------------------------------------===//
@@ -1474,6 +1480,7 @@ def FeatureISAVersion12 : FeatureSet<
FeatureVcmpxPermlaneHazard,
FeatureSALUFloatInsts,
FeaturePseudoScalarTrans,
FeatureHasRestrictedSOffset,
FeatureVGPRSingleUseHintInsts,
FeatureMADIntraFwdBug,
FeatureScalarDwordx3Loads]>;
@@ -1787,6 +1794,11 @@ def HasUnpackedD16VMem : Predicate<"Subtarget->hasUnpackedD16VMem()">,
def HasPackedD16VMem : Predicate<"!Subtarget->hasUnpackedD16VMem()">,
AssemblerPredicate<(all_of (not FeatureUnpackedD16VMem))>;

def HasRestrictedSOffset : Predicate<"Subtarget->hasRestrictedSOffset()">,
AssemblerPredicate<(all_of FeatureHasRestrictedSOffset)>;
def HasUnrestrictedSOffset : Predicate<"!Subtarget->hasRestrictedSOffset()">,
AssemblerPredicate<(all_of (not FeatureHasRestrictedSOffset))>;

def D16PreservesUnusedBits :
Predicate<"Subtarget->d16PreservesUnusedBits()">,
AssemblerPredicate<(all_of FeatureGFX9Insts, (not FeatureSRAMECC))>;
8 changes: 4 additions & 4 deletions llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -460,8 +460,8 @@ static bool shouldWidenLoad(const GCNSubtarget &ST, LLT MemoryTy,
return false;

// If we have 96-bit memory operations, we shouldn't touch them. Note we may
// end up widening these for a scalar load during RegBankSelect, since there
// aren't 96-bit scalar loads.
// end up widening these for a scalar load during RegBankSelect, if we don't
// have 96-bit scalar loads.
if (SizeInBits == 96 && ST.hasDwordx3LoadStores())
return false;

@@ -6467,10 +6467,10 @@ bool AMDGPULegalizerInfo::legalizeSBufferLoad(
MemSize, MemAlign);
MI.addMemOperand(MF, MMO);

// There are no 96-bit result scalar loads, but widening to 128-bit should
// If we don't have 96-bit result scalar loads, widening to 128-bit should
// always be legal. We may need to restore this to a 96-bit result if it turns
// out this needs to be converted to a vector load during RegBankSelect.
if (!isPowerOf2_32(Size)) {
if (!isPowerOf2_32(Size) && (Size != 96 || !ST.hasScalarDwordx3Loads())) {
if (Ty.isVector())
Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0);
else
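The widening decision added to legalizeSBufferLoad above can be modeled standalone. This is only a sketch of the condition in the hunk; isPow2, pow2Ceil, and widenedSBufferResultSize are invented names for illustration and are not LLVM APIs (LLVM uses isPowerOf2_32 and getPow2VectorType).

#include <cassert>
#include <cstdint>

// Illustrative only: models the result-widening decision from
// legalizeSBufferLoad. The helpers mirror LLVM's MathExtras utilities but are
// re-implemented here so the sketch compiles on its own.
static bool isPow2(uint32_t X) { return X && (X & (X - 1)) == 0; }
static uint32_t pow2Ceil(uint32_t X) {
  uint32_t P = 1;
  while (P < X)
    P <<= 1;
  return P;
}

// Returns the register size (in bits) used for an s_buffer_load result.
static uint32_t widenedSBufferResultSize(uint32_t SizeInBits,
                                         bool HasScalarDwordx3Loads) {
  // Non-power-of-2 results are rounded up, except that a 96-bit result can be
  // kept as-is when the subtarget provides dwordx3 scalar loads.
  if (!isPow2(SizeInBits) && (SizeInBits != 96 || !HasScalarDwordx3Loads))
    return pow2Ceil(SizeInBits);
  return SizeInBits;
}

int main() {
  assert(widenedSBufferResultSize(96, /*HasScalarDwordx3Loads=*/false) == 128);
  assert(widenedSBufferResultSize(96, /*HasScalarDwordx3Loads=*/true) == 96);
  assert(widenedSBufferResultSize(64, /*HasScalarDwordx3Loads=*/false) == 64);
  return 0;
}

The matching change in AMDGPURegisterBankInfo.cpp below applies the same idea on the RegBankSelect path: 96-bit SGPR loads only need special handling when the subtarget lacks native dwordx3 scalar loads.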
2 changes: 1 addition & 1 deletion llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -1061,7 +1061,7 @@ bool AMDGPURegisterBankInfo::applyMappingLoad(
if (DstBank == &AMDGPU::SGPRRegBank) {
// There are some special cases that we need to look at for 32 bit and 96
// bit SGPR loads otherwise we have nothing to do.
if (LoadSize != 32 && LoadSize != 96)
if (LoadSize != 32 && (LoadSize != 96 || Subtarget.hasScalarDwordx3Loads()))
return false;

MachineMemOperand *MMO = *MI.memoperands_begin();
3 changes: 3 additions & 0 deletions llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -199,6 +199,7 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
bool HasSALUFloatInsts = false;
bool HasVGPRSingleUseHintInsts = false;
bool HasPseudoScalarTrans = false;
bool HasRestrictedSOffset = false;

bool HasVcmpxPermlaneHazard = false;
bool HasVMEMtoScalarWriteHazard = false;
@@ -1163,6 +1164,8 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,

bool hasPseudoScalarTrans() const { return HasPseudoScalarTrans; }

bool hasRestrictedSOffset() const { return HasRestrictedSOffset; }

/// Return the maximum number of waves per SIMD for kernels using \p SGPRs
/// SGPRs
unsigned getOccupancyWithNumSGPRs(unsigned SGPRs) const;
90 changes: 64 additions & 26 deletions llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -1438,11 +1438,15 @@ bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL,
// On VI, these use the SMEM format and the offset is 20-bit in bytes.
if (!isUInt<20>(AM.BaseOffs))
return false;
} else {
} else if (Subtarget->getGeneration() < AMDGPUSubtarget::GFX12) {
// On GFX9 the offset is signed 21-bit in bytes (but must not be negative
// for S_BUFFER_* instructions).
if (!isInt<21>(AM.BaseOffs))
return false;
} else {
// On GFX12, all offsets are signed 24-bit in bytes.
if (!isInt<24>(AM.BaseOffs))
return false;
}

if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg.
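The SMEM immediate-offset ranges checked above can be summarized in a small standalone model. This is a sketch only; the Gen enum and isLegalSMEMImmOffset helper are invented for illustration and do not correspond to LLVM's subtarget queries, and the SI/CI dword-offset encodings are left out.

#include <cassert>
#include <cstdint>

// Illustrative only: models the byte-offset ranges from isLegalAddressingMode.
// On GFX9 the S_BUFFER_* forms additionally require a non-negative offset,
// which this sketch does not distinguish.
enum class Gen { VI, GFX9, GFX12 };

static bool isLegalSMEMImmOffset(Gen G, int64_t ByteOffset) {
  switch (G) {
  case Gen::VI: // 20-bit unsigned byte offset.
    return ByteOffset >= 0 && ByteOffset < (1ll << 20);
  case Gen::GFX9: // GFX9..GFX11: 21-bit signed byte offset.
    return ByteOffset >= -(1ll << 20) && ByteOffset < (1ll << 20);
  case Gen::GFX12: // 24-bit signed byte offset.
    return ByteOffset >= -(1ll << 23) && ByteOffset < (1ll << 23);
  }
  return false;
}

int main() {
  assert(isLegalSMEMImmOffset(Gen::GFX9, (1 << 20) - 1)); // fits in int21
  assert(!isLegalSMEMImmOffset(Gen::GFX9, 1 << 20));      // out of range
  assert(isLegalSMEMImmOffset(Gen::GFX12, -(1 << 22)));   // fits in int24
  assert(!isLegalSMEMImmOffset(Gen::VI, -4));             // unsigned on VI
  return 0;
}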
@@ -7497,7 +7501,8 @@ SDValue SITargetLowering::lowerSBuffer(EVT VT, SDLoc DL, SDValue Rsrc,
};

// Widen vec3 load to vec4.
if (VT.isVector() && VT.getVectorNumElements() == 3) {
if (VT.isVector() && VT.getVectorNumElements() == 3 &&
!Subtarget->hasScalarDwordx3Loads()) {
EVT WidenedVT =
EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(), 4);
auto WidenedOp = DAG.getMemIntrinsicNode(
@@ -7913,6 +7918,19 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
}
}

// On targets not supporting constant in soffset field, turn zero to
// SGPR_NULL to avoid generating an extra s_mov with zero.
static SDValue selectSOffset(SDValue SOffset, SelectionDAG &DAG,
const GCNSubtarget *Subtarget) {
if (Subtarget->hasRestrictedSOffset())
if (auto SOffsetConst = dyn_cast<ConstantSDNode>(SOffset)) {
if (SOffsetConst->isZero()) {
return DAG.getRegister(AMDGPU::SGPR_NULL, MVT::i32);
}
}
return SOffset;
}
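A minimal standalone model of the decision selectSOffset makes above, assuming a std::optional<int64_t> stands in for a ConstantSDNode operand and a plain bool for Subtarget->hasRestrictedSOffset(); the names here are illustrative only, not LLVM APIs.

#include <cassert>
#include <cstdint>
#include <optional>

// Illustrative only: on subtargets whose soffset field cannot hold an
// immediate, a known zero offset is encoded as the null scalar register
// instead of materializing 0 with an extra s_mov.
enum class SOffsetKind { NullReg, KeepOriginal };

static SOffsetKind chooseSOffset(std::optional<int64_t> KnownConst,
                                 bool HasRestrictedSOffset) {
  if (HasRestrictedSOffset && KnownConst && *KnownConst == 0)
    return SOffsetKind::NullReg;
  return SOffsetKind::KeepOriginal;
}

int main() {
  assert(chooseSOffset(0, true) == SOffsetKind::NullReg);
  assert(chooseSOffset(0, false) == SOffsetKind::KeepOriginal);  // immediate 0 is fine
  assert(chooseSOffset(16, true) == SOffsetKind::KeepOriginal);  // non-zero constant kept, materialized later
  assert(chooseSOffset(std::nullopt, true) == SOffsetKind::KeepOriginal); // SGPR operand kept
  return 0;
}

The remaining SIISelLowering hunks below are mechanical: each buffer and tbuffer lowering routine routes its soffset operand through selectSOffset before building the operand list.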

SDValue SITargetLowering::lowerRawBufferAtomicIntrin(SDValue Op,
SelectionDAG &DAG,
unsigned NewOpcode) const {
@@ -7921,13 +7939,14 @@ SDValue SITargetLowering::lowerRawBufferAtomicIntrin(SDValue Op,
SDValue VData = Op.getOperand(2);
SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG);
auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
SDValue Ops[] = {
Op.getOperand(0), // Chain
VData, // vdata
Rsrc, // rsrc
DAG.getConstant(0, DL, MVT::i32), // vindex
Offsets.first, // voffset
Op.getOperand(5), // soffset
SOffset, // soffset
Offsets.second, // offset
Op.getOperand(6), // cachepolicy
DAG.getTargetConstant(0, DL, MVT::i1), // idxen
@@ -7954,13 +7973,14 @@ SITargetLowering::lowerStructBufferAtomicIntrin(SDValue Op, SelectionDAG &DAG,
SDValue VData = Op.getOperand(2);
SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
auto Offsets = splitBufferOffsets(Op.getOperand(5), DAG);
auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget);
SDValue Ops[] = {
Op.getOperand(0), // Chain
VData, // vdata
Rsrc, // rsrc
Op.getOperand(4), // vindex
Offsets.first, // voffset
Op.getOperand(6), // soffset
SOffset, // soffset
Offsets.second, // offset
Op.getOperand(7), // cachepolicy
DAG.getTargetConstant(1, DL, MVT::i1), // idxen
@@ -8116,12 +8136,13 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,

SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
auto Offsets = splitBufferOffsets(Op.getOperand(3), DAG);
auto SOffset = selectSOffset(Op.getOperand(4), DAG, Subtarget);
SDValue Ops[] = {
Op.getOperand(0), // Chain
Rsrc, // rsrc
DAG.getConstant(0, DL, MVT::i32), // vindex
Offsets.first, // voffset
Op.getOperand(4), // soffset
SOffset, // soffset
Offsets.second, // offset
Op.getOperand(5), // cachepolicy, swizzled buffer
DAG.getTargetConstant(0, DL, MVT::i1), // idxen
@@ -8140,12 +8161,13 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,

SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG);
auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
SDValue Ops[] = {
Op.getOperand(0), // Chain
Rsrc, // rsrc
Op.getOperand(3), // vindex
Offsets.first, // voffset
Op.getOperand(5), // soffset
SOffset, // soffset
Offsets.second, // offset
Op.getOperand(6), // cachepolicy, swizzled buffer
DAG.getTargetConstant(1, DL, MVT::i1), // idxen
@@ -8157,21 +8179,22 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
MemSDNode *M = cast<MemSDNode>(Op);
EVT LoadVT = Op.getValueType();

auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
unsigned Dfmt = cast<ConstantSDNode>(Op.getOperand(7))->getZExtValue();
unsigned Nfmt = cast<ConstantSDNode>(Op.getOperand(8))->getZExtValue();
unsigned Glc = cast<ConstantSDNode>(Op.getOperand(9))->getZExtValue();
unsigned Slc = cast<ConstantSDNode>(Op.getOperand(10))->getZExtValue();
unsigned IdxEn = getIdxEn(Op.getOperand(3));
SDValue Ops[] = {
Op.getOperand(0), // Chain
Op.getOperand(2), // rsrc
Op.getOperand(3), // vindex
Op.getOperand(4), // voffset
Op.getOperand(5), // soffset
Op.getOperand(6), // offset
DAG.getTargetConstant(Dfmt | (Nfmt << 4), DL, MVT::i32), // format
DAG.getTargetConstant(Glc | (Slc << 1), DL, MVT::i32), // cachepolicy
DAG.getTargetConstant(IdxEn, DL, MVT::i1) // idxen
Op.getOperand(0), // Chain
Op.getOperand(2), // rsrc
Op.getOperand(3), // vindex
Op.getOperand(4), // voffset
SOffset, // soffset
Op.getOperand(6), // offset
DAG.getTargetConstant(Dfmt | (Nfmt << 4), DL, MVT::i32), // format
DAG.getTargetConstant(Glc | (Slc << 1), DL, MVT::i32), // cachepolicy
DAG.getTargetConstant(IdxEn, DL, MVT::i1) // idxen
};

if (LoadVT.getScalarType() == MVT::f16)
Expand All @@ -8187,13 +8210,14 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
EVT LoadVT = Op.getValueType();
SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
auto Offsets = splitBufferOffsets(Op.getOperand(3), DAG);
auto SOffset = selectSOffset(Op.getOperand(4), DAG, Subtarget);

SDValue Ops[] = {
Op.getOperand(0), // Chain
Rsrc, // rsrc
DAG.getConstant(0, DL, MVT::i32), // vindex
Offsets.first, // voffset
Op.getOperand(4), // soffset
SOffset, // soffset
Offsets.second, // offset
Op.getOperand(5), // format
Op.getOperand(6), // cachepolicy, swizzled buffer
@@ -8213,13 +8237,14 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
EVT LoadVT = Op.getValueType();
SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG);
auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);

SDValue Ops[] = {
Op.getOperand(0), // Chain
Rsrc, // rsrc
Op.getOperand(3), // vindex
Offsets.first, // voffset
Op.getOperand(5), // soffset
SOffset, // soffset
Offsets.second, // offset
Op.getOperand(6), // format
Op.getOperand(7), // cachepolicy, swizzled buffer
@@ -8432,14 +8457,15 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap: {
SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(4), DAG);
auto Offsets = splitBufferOffsets(Op.getOperand(5), DAG);
auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget);
SDValue Ops[] = {
Op.getOperand(0), // Chain
Op.getOperand(2), // src
Op.getOperand(3), // cmp
Rsrc, // rsrc
DAG.getConstant(0, DL, MVT::i32), // vindex
Offsets.first, // voffset
Op.getOperand(6), // soffset
SOffset, // soffset
Offsets.second, // offset
Op.getOperand(7), // cachepolicy
DAG.getTargetConstant(0, DL, MVT::i1), // idxen
@@ -8454,14 +8480,15 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap: {
SDValue Rsrc = bufferRsrcPtrToVector(Op->getOperand(4), DAG);
auto Offsets = splitBufferOffsets(Op.getOperand(6), DAG);
auto SOffset = selectSOffset(Op.getOperand(7), DAG, Subtarget);
SDValue Ops[] = {
Op.getOperand(0), // Chain
Op.getOperand(2), // src
Op.getOperand(3), // cmp
Rsrc, // rsrc
Op.getOperand(5), // vindex
Offsets.first, // voffset
Op.getOperand(7), // soffset
SOffset, // soffset
Offsets.second, // offset
Op.getOperand(8), // cachepolicy
DAG.getTargetConstant(1, DL, MVT::i1), // idxen
@@ -8893,13 +8920,14 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
VData = handleD16VData(VData, DAG);
SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
auto Offsets = splitBufferOffsets(Op.getOperand(5), DAG);
auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget);
SDValue Ops[] = {
Chain,
VData, // vdata
Rsrc, // rsrc
Op.getOperand(4), // vindex
Offsets.first, // voffset
Op.getOperand(6), // soffset
SOffset, // soffset
Offsets.second, // offset
Op.getOperand(7), // format
Op.getOperand(8), // cachepolicy, swizzled buffer
@@ -8920,13 +8948,14 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
VData = handleD16VData(VData, DAG);
SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG);
auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
SDValue Ops[] = {
Chain,
VData, // vdata
Rsrc, // rsrc
DAG.getConstant(0, DL, MVT::i32), // vindex
Offsets.first, // voffset
Op.getOperand(5), // soffset
SOffset, // soffset
Offsets.second, // offset
Op.getOperand(6), // format
Op.getOperand(7), // cachepolicy, swizzled buffer
@@ -9000,13 +9029,14 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,

SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG);
auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
SDValue Ops[] = {
Chain,
VData,
Rsrc,
DAG.getConstant(0, DL, MVT::i32), // vindex
Offsets.first, // voffset
Op.getOperand(5), // soffset
SOffset, // soffset
Offsets.second, // offset
Op.getOperand(6), // cachepolicy, swizzled buffer
DAG.getTargetConstant(0, DL, MVT::i1), // idxen
@@ -9050,13 +9080,14 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,

auto Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
auto Offsets = splitBufferOffsets(Op.getOperand(5), DAG);
auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget);
SDValue Ops[] = {
Chain,
VData,
Rsrc,
Op.getOperand(4), // vindex
Offsets.first, // voffset
Op.getOperand(6), // soffset
SOffset, // soffset
Offsets.second, // offset
Op.getOperand(7), // cachepolicy, swizzled buffer
DAG.getTargetConstant(1, DL, MVT::i1), // idxen
@@ -9404,8 +9435,13 @@ void SITargetLowering::setBufferOffsets(SDValue CombinedOffset,
return;
}
}

SDValue SOffsetZero = Subtarget->hasRestrictedSOffset()
? DAG.getRegister(AMDGPU::SGPR_NULL, MVT::i32)
: DAG.getConstant(0, DL, MVT::i32);

Offsets[0] = CombinedOffset;
Offsets[1] = DAG.getConstant(0, DL, MVT::i32);
Offsets[1] = SOffsetZero;
Offsets[2] = DAG.getTargetConstant(0, DL, MVT::i32);
}

@@ -9663,7 +9699,8 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
if (!Op->isDivergent() && Alignment >= Align(4) && NumElements < 32) {
if (MemVT.isPow2VectorType())
if (MemVT.isPow2VectorType() ||
(Subtarget->hasScalarDwordx3Loads() && NumElements == 3))
return SDValue();
return WidenOrSplitVectorLoad(Op, DAG);
}
Expand All @@ -9679,7 +9716,8 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
if (Subtarget->getScalarizeGlobalBehavior() && !Op->isDivergent() &&
Load->isSimple() && isMemOpHasNoClobberedMemOperand(Load) &&
Alignment >= Align(4) && NumElements < 32) {
if (MemVT.isPow2VectorType())
if (MemVT.isPow2VectorType() ||
(Subtarget->hasScalarDwordx3Loads() && NumElements == 3))
return SDValue();
return WidenOrSplitVectorLoad(Op, DAG);
}
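The condition added to LowerLOAD above can likewise be modeled in isolation. This sketch assumes the caller has already established that the load is uniform, aligned to at least 4 bytes, and has fewer than 32 elements; keepUniformVectorLoad is an invented name used only for illustration.

#include <cassert>

// Illustrative only: models when a uniform vector load from the constant
// address space can be kept as-is for scalar selection, versus widened or
// split first via WidenOrSplitVectorLoad.
static bool keepUniformVectorLoad(unsigned NumElements, bool IsPow2VectorType,
                                  bool HasScalarDwordx3Loads) {
  // Power-of-2 types always have a matching scalar load; 3-element types do
  // too once the subtarget provides dwordx3 scalar loads (GFX12 in this patch).
  return IsPow2VectorType || (HasScalarDwordx3Loads && NumElements == 3);
}

int main() {
  assert(keepUniformVectorLoad(3, false, true));   // v3 stays with dwordx3 loads
  assert(!keepUniformVectorLoad(3, false, false)); // otherwise widened/split
  assert(keepUniformVectorLoad(4, true, false));   // power-of-2 always fine
  return 0;
}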