Skip to content

Commit a278ac5

Browse files
authored
[AMDGPU] CodeGen for SMEM instructions (llvm#75579)
1 parent 70579c9 commit a278ac5

30 files changed

+10826
-667
lines changed

llvm/lib/Target/AMDGPU/AMDGPU.td

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -828,6 +828,12 @@ def FeaturePseudoScalarTrans : SubtargetFeature<"pseudo-scalar-trans",
828828
"Has Pseudo Scalar Transcendental instructions"
829829
>;
830830

831+
// Subtarget feature "restricted-soffset": sets HasRestrictedSOffset to true.
// Per its description, the SOffset operand does not support an immediate.
def FeatureHasRestrictedSOffset : SubtargetFeature<"restricted-soffset",
832+
"HasRestrictedSOffset",
833+
"true",
834+
"Has restricted SOffset (immediate not supported)."
835+
>;
836+
831837
//===------------------------------------------------------------===//
832838
// Subtarget Features (options and debugging)
833839
//===------------------------------------------------------------===//
@@ -1474,6 +1480,7 @@ def FeatureISAVersion12 : FeatureSet<
14741480
FeatureVcmpxPermlaneHazard,
14751481
FeatureSALUFloatInsts,
14761482
FeaturePseudoScalarTrans,
1483+
FeatureHasRestrictedSOffset,
14771484
FeatureVGPRSingleUseHintInsts,
14781485
FeatureMADIntraFwdBug,
14791486
FeatureScalarDwordx3Loads]>;
@@ -1787,6 +1794,11 @@ def HasUnpackedD16VMem : Predicate<"Subtarget->hasUnpackedD16VMem()">,
17871794
def HasPackedD16VMem : Predicate<"!Subtarget->hasUnpackedD16VMem()">,
17881795
AssemblerPredicate<(all_of (not FeatureUnpackedD16VMem))>;
17891796

1797+
// Holds when the subtarget reports hasRestrictedSOffset(); for the assembler,
// when FeatureHasRestrictedSOffset is enabled.
def HasRestrictedSOffset : Predicate<"Subtarget->hasRestrictedSOffset()">,
1798+
AssemblerPredicate<(all_of FeatureHasRestrictedSOffset)>;
1799+
// Exact negation of HasRestrictedSOffset: holds when the feature is absent.
def HasUnrestrictedSOffset : Predicate<"!Subtarget->hasRestrictedSOffset()">,
1800+
AssemblerPredicate<(all_of (not FeatureHasRestrictedSOffset))>;
1801+
17901802
def D16PreservesUnusedBits :
17911803
Predicate<"Subtarget->d16PreservesUnusedBits()">,
17921804
AssemblerPredicate<(all_of FeatureGFX9Insts, (not FeatureSRAMECC))>;

llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -460,8 +460,8 @@ static bool shouldWidenLoad(const GCNSubtarget &ST, LLT MemoryTy,
460460
return false;
461461

462462
// If we have 96-bit memory operations, we shouldn't touch them. Note we may
463-
// end up widening these for a scalar load during RegBankSelect, since there
464-
// aren't 96-bit scalar loads.
463+
// end up widening these for a scalar load during RegBankSelect, if we don't
464+
// have 96-bit scalar loads.
465465
if (SizeInBits == 96 && ST.hasDwordx3LoadStores())
466466
return false;
467467

@@ -6467,10 +6467,10 @@ bool AMDGPULegalizerInfo::legalizeSBufferLoad(
64676467
MemSize, MemAlign);
64686468
MI.addMemOperand(MF, MMO);
64696469

6470-
// There are no 96-bit result scalar loads, but widening to 128-bit should
6470+
// If we don't have 96-bit result scalar loads, widening to 128-bit should
64716471
// always be legal. We may need to restore this to a 96-bit result if it turns
64726472
// out this needs to be converted to a vector load during RegBankSelect.
6473-
if (!isPowerOf2_32(Size)) {
6473+
if (!isPowerOf2_32(Size) && (Size != 96 || !ST.hasScalarDwordx3Loads())) {
64746474
if (Ty.isVector())
64756475
Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0);
64766476
else

llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1061,7 +1061,7 @@ bool AMDGPURegisterBankInfo::applyMappingLoad(
10611061
if (DstBank == &AMDGPU::SGPRRegBank) {
10621062
// There are some special cases that we need to look at for 32-bit and 96-bit
10631063
// SGPR loads; otherwise we have nothing to do.
1064-
if (LoadSize != 32 && LoadSize != 96)
1064+
if (LoadSize != 32 && (LoadSize != 96 || Subtarget.hasScalarDwordx3Loads()))
10651065
return false;
10661066

10671067
MachineMemOperand *MMO = *MI.memoperands_begin();

llvm/lib/Target/AMDGPU/GCNSubtarget.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -199,6 +199,7 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
199199
bool HasSALUFloatInsts = false;
200200
bool HasVGPRSingleUseHintInsts = false;
201201
bool HasPseudoScalarTrans = false;
202+
bool HasRestrictedSOffset = false;
202203

203204
bool HasVcmpxPermlaneHazard = false;
204205
bool HasVMEMtoScalarWriteHazard = false;
@@ -1163,6 +1164,8 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
11631164

11641165
bool hasPseudoScalarTrans() const { return HasPseudoScalarTrans; }
11651166

1167+
/// \returns true if the subtarget has a restricted SOffset (immediate not
/// supported).
bool hasRestrictedSOffset() const { return HasRestrictedSOffset; }
1168+
11661169
/// Return the maximum number of waves per SIMD for kernels using \p SGPRs
11671170
/// SGPRs
11681171
unsigned getOccupancyWithNumSGPRs(unsigned SGPRs) const;

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 64 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -1438,11 +1438,15 @@ bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL,
14381438
// On VI, these use the SMEM format and the offset is 20-bit in bytes.
14391439
if (!isUInt<20>(AM.BaseOffs))
14401440
return false;
1441-
} else {
1441+
} else if (Subtarget->getGeneration() < AMDGPUSubtarget::GFX12) {
14421442
// On GFX9 and later generations before GFX12, the offset is signed 21-bit in
14431443
// bytes (but must not be negative for S_BUFFER_* instructions).
14441444
if (!isInt<21>(AM.BaseOffs))
14451445
return false;
1446+
} else {
1447+
// On GFX12, all offsets are signed 24-bit in bytes.
1448+
if (!isInt<24>(AM.BaseOffs))
1449+
return false;
14461450
}
14471451

14481452
if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg.
@@ -7497,7 +7501,8 @@ SDValue SITargetLowering::lowerSBuffer(EVT VT, SDLoc DL, SDValue Rsrc,
74977501
};
74987502

74997503
// Widen vec3 load to vec4 unless the subtarget has scalar dwordx3 loads.
7500-
if (VT.isVector() && VT.getVectorNumElements() == 3) {
7504+
if (VT.isVector() && VT.getVectorNumElements() == 3 &&
7505+
!Subtarget->hasScalarDwordx3Loads()) {
75017506
EVT WidenedVT =
75027507
EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(), 4);
75037508
auto WidenedOp = DAG.getMemIntrinsicNode(
@@ -7913,6 +7918,19 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
79137918
}
79147919
}
79157920

7921+
// On targets that do not support a constant in the soffset field, turn a
7922+
// constant-zero soffset into SGPR_NULL to avoid generating an extra s_mov
// with zero. Any other soffset (non-constant, or a non-zero constant) and any
// soffset on targets without the restriction is returned unchanged.
7923+
static SDValue selectSOffset(SDValue SOffset, SelectionDAG &DAG,
7924+
const GCNSubtarget *Subtarget) {
7925+
if (Subtarget->hasRestrictedSOffset())
7926+
if (auto SOffsetConst = dyn_cast<ConstantSDNode>(SOffset)) {
7927+
if (SOffsetConst->isZero()) {
7928+
return DAG.getRegister(AMDGPU::SGPR_NULL, MVT::i32);
7929+
}
7930+
}
7931+
return SOffset;
7932+
}
7933+
79167934
SDValue SITargetLowering::lowerRawBufferAtomicIntrin(SDValue Op,
79177935
SelectionDAG &DAG,
79187936
unsigned NewOpcode) const {
@@ -7921,13 +7939,14 @@ SDValue SITargetLowering::lowerRawBufferAtomicIntrin(SDValue Op,
79217939
SDValue VData = Op.getOperand(2);
79227940
SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
79237941
auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG);
7942+
auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
79247943
SDValue Ops[] = {
79257944
Op.getOperand(0), // Chain
79267945
VData, // vdata
79277946
Rsrc, // rsrc
79287947
DAG.getConstant(0, DL, MVT::i32), // vindex
79297948
Offsets.first, // voffset
7930-
Op.getOperand(5), // soffset
7949+
SOffset, // soffset
79317950
Offsets.second, // offset
79327951
Op.getOperand(6), // cachepolicy
79337952
DAG.getTargetConstant(0, DL, MVT::i1), // idxen
@@ -7954,13 +7973,14 @@ SITargetLowering::lowerStructBufferAtomicIntrin(SDValue Op, SelectionDAG &DAG,
79547973
SDValue VData = Op.getOperand(2);
79557974
SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
79567975
auto Offsets = splitBufferOffsets(Op.getOperand(5), DAG);
7976+
auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget);
79577977
SDValue Ops[] = {
79587978
Op.getOperand(0), // Chain
79597979
VData, // vdata
79607980
Rsrc, // rsrc
79617981
Op.getOperand(4), // vindex
79627982
Offsets.first, // voffset
7963-
Op.getOperand(6), // soffset
7983+
SOffset, // soffset
79647984
Offsets.second, // offset
79657985
Op.getOperand(7), // cachepolicy
79667986
DAG.getTargetConstant(1, DL, MVT::i1), // idxen
@@ -8116,12 +8136,13 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
81168136

81178137
SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
81188138
auto Offsets = splitBufferOffsets(Op.getOperand(3), DAG);
8139+
auto SOffset = selectSOffset(Op.getOperand(4), DAG, Subtarget);
81198140
SDValue Ops[] = {
81208141
Op.getOperand(0), // Chain
81218142
Rsrc, // rsrc
81228143
DAG.getConstant(0, DL, MVT::i32), // vindex
81238144
Offsets.first, // voffset
8124-
Op.getOperand(4), // soffset
8145+
SOffset, // soffset
81258146
Offsets.second, // offset
81268147
Op.getOperand(5), // cachepolicy, swizzled buffer
81278148
DAG.getTargetConstant(0, DL, MVT::i1), // idxen
@@ -8140,12 +8161,13 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
81408161

81418162
SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
81428163
auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG);
8164+
auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
81438165
SDValue Ops[] = {
81448166
Op.getOperand(0), // Chain
81458167
Rsrc, // rsrc
81468168
Op.getOperand(3), // vindex
81478169
Offsets.first, // voffset
8148-
Op.getOperand(5), // soffset
8170+
SOffset, // soffset
81498171
Offsets.second, // offset
81508172
Op.getOperand(6), // cachepolicy, swizzled buffer
81518173
DAG.getTargetConstant(1, DL, MVT::i1), // idxen
@@ -8157,21 +8179,22 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
81578179
MemSDNode *M = cast<MemSDNode>(Op);
81588180
EVT LoadVT = Op.getValueType();
81598181

8182+
auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
81608183
unsigned Dfmt = cast<ConstantSDNode>(Op.getOperand(7))->getZExtValue();
81618184
unsigned Nfmt = cast<ConstantSDNode>(Op.getOperand(8))->getZExtValue();
81628185
unsigned Glc = cast<ConstantSDNode>(Op.getOperand(9))->getZExtValue();
81638186
unsigned Slc = cast<ConstantSDNode>(Op.getOperand(10))->getZExtValue();
81648187
unsigned IdxEn = getIdxEn(Op.getOperand(3));
81658188
SDValue Ops[] = {
8166-
Op.getOperand(0), // Chain
8167-
Op.getOperand(2), // rsrc
8168-
Op.getOperand(3), // vindex
8169-
Op.getOperand(4), // voffset
8170-
Op.getOperand(5), // soffset
8171-
Op.getOperand(6), // offset
8172-
DAG.getTargetConstant(Dfmt | (Nfmt << 4), DL, MVT::i32), // format
8173-
DAG.getTargetConstant(Glc | (Slc << 1), DL, MVT::i32), // cachepolicy
8174-
DAG.getTargetConstant(IdxEn, DL, MVT::i1) // idxen
8189+
Op.getOperand(0), // Chain
8190+
Op.getOperand(2), // rsrc
8191+
Op.getOperand(3), // vindex
8192+
Op.getOperand(4), // voffset
8193+
SOffset, // soffset
8194+
Op.getOperand(6), // offset
8195+
DAG.getTargetConstant(Dfmt | (Nfmt << 4), DL, MVT::i32), // format
8196+
DAG.getTargetConstant(Glc | (Slc << 1), DL, MVT::i32), // cachepolicy
8197+
DAG.getTargetConstant(IdxEn, DL, MVT::i1) // idxen
81758198
};
81768199

81778200
if (LoadVT.getScalarType() == MVT::f16)
@@ -8187,13 +8210,14 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
81878210
EVT LoadVT = Op.getValueType();
81888211
SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
81898212
auto Offsets = splitBufferOffsets(Op.getOperand(3), DAG);
8213+
auto SOffset = selectSOffset(Op.getOperand(4), DAG, Subtarget);
81908214

81918215
SDValue Ops[] = {
81928216
Op.getOperand(0), // Chain
81938217
Rsrc, // rsrc
81948218
DAG.getConstant(0, DL, MVT::i32), // vindex
81958219
Offsets.first, // voffset
8196-
Op.getOperand(4), // soffset
8220+
SOffset, // soffset
81978221
Offsets.second, // offset
81988222
Op.getOperand(5), // format
81998223
Op.getOperand(6), // cachepolicy, swizzled buffer
@@ -8213,13 +8237,14 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
82138237
EVT LoadVT = Op.getValueType();
82148238
SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
82158239
auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG);
8240+
auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
82168241

82178242
SDValue Ops[] = {
82188243
Op.getOperand(0), // Chain
82198244
Rsrc, // rsrc
82208245
Op.getOperand(3), // vindex
82218246
Offsets.first, // voffset
8222-
Op.getOperand(5), // soffset
8247+
SOffset, // soffset
82238248
Offsets.second, // offset
82248249
Op.getOperand(6), // format
82258250
Op.getOperand(7), // cachepolicy, swizzled buffer
@@ -8432,14 +8457,15 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
84328457
case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap: {
84338458
SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(4), DAG);
84348459
auto Offsets = splitBufferOffsets(Op.getOperand(5), DAG);
8460+
auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget);
84358461
SDValue Ops[] = {
84368462
Op.getOperand(0), // Chain
84378463
Op.getOperand(2), // src
84388464
Op.getOperand(3), // cmp
84398465
Rsrc, // rsrc
84408466
DAG.getConstant(0, DL, MVT::i32), // vindex
84418467
Offsets.first, // voffset
8442-
Op.getOperand(6), // soffset
8468+
SOffset, // soffset
84438469
Offsets.second, // offset
84448470
Op.getOperand(7), // cachepolicy
84458471
DAG.getTargetConstant(0, DL, MVT::i1), // idxen
@@ -8454,14 +8480,15 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
84548480
case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap: {
84558481
SDValue Rsrc = bufferRsrcPtrToVector(Op->getOperand(4), DAG);
84568482
auto Offsets = splitBufferOffsets(Op.getOperand(6), DAG);
8483+
auto SOffset = selectSOffset(Op.getOperand(7), DAG, Subtarget);
84578484
SDValue Ops[] = {
84588485
Op.getOperand(0), // Chain
84598486
Op.getOperand(2), // src
84608487
Op.getOperand(3), // cmp
84618488
Rsrc, // rsrc
84628489
Op.getOperand(5), // vindex
84638490
Offsets.first, // voffset
8464-
Op.getOperand(7), // soffset
8491+
SOffset, // soffset
84658492
Offsets.second, // offset
84668493
Op.getOperand(8), // cachepolicy
84678494
DAG.getTargetConstant(1, DL, MVT::i1), // idxen
@@ -8893,13 +8920,14 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
88938920
VData = handleD16VData(VData, DAG);
88948921
SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
88958922
auto Offsets = splitBufferOffsets(Op.getOperand(5), DAG);
8923+
auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget);
88968924
SDValue Ops[] = {
88978925
Chain,
88988926
VData, // vdata
88998927
Rsrc, // rsrc
89008928
Op.getOperand(4), // vindex
89018929
Offsets.first, // voffset
8902-
Op.getOperand(6), // soffset
8930+
SOffset, // soffset
89038931
Offsets.second, // offset
89048932
Op.getOperand(7), // format
89058933
Op.getOperand(8), // cachepolicy, swizzled buffer
@@ -8920,13 +8948,14 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
89208948
VData = handleD16VData(VData, DAG);
89218949
SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
89228950
auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG);
8951+
auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
89238952
SDValue Ops[] = {
89248953
Chain,
89258954
VData, // vdata
89268955
Rsrc, // rsrc
89278956
DAG.getConstant(0, DL, MVT::i32), // vindex
89288957
Offsets.first, // voffset
8929-
Op.getOperand(5), // soffset
8958+
SOffset, // soffset
89308959
Offsets.second, // offset
89318960
Op.getOperand(6), // format
89328961
Op.getOperand(7), // cachepolicy, swizzled buffer
@@ -9000,13 +9029,14 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
90009029

90019030
SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
90029031
auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG);
9032+
auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
90039033
SDValue Ops[] = {
90049034
Chain,
90059035
VData,
90069036
Rsrc,
90079037
DAG.getConstant(0, DL, MVT::i32), // vindex
90089038
Offsets.first, // voffset
9009-
Op.getOperand(5), // soffset
9039+
SOffset, // soffset
90109040
Offsets.second, // offset
90119041
Op.getOperand(6), // cachepolicy, swizzled buffer
90129042
DAG.getTargetConstant(0, DL, MVT::i1), // idxen
@@ -9050,13 +9080,14 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
90509080

90519081
auto Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
90529082
auto Offsets = splitBufferOffsets(Op.getOperand(5), DAG);
9083+
auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget);
90539084
SDValue Ops[] = {
90549085
Chain,
90559086
VData,
90569087
Rsrc,
90579088
Op.getOperand(4), // vindex
90589089
Offsets.first, // voffset
9059-
Op.getOperand(6), // soffset
9090+
SOffset, // soffset
90609091
Offsets.second, // offset
90619092
Op.getOperand(7), // cachepolicy, swizzled buffer
90629093
DAG.getTargetConstant(1, DL, MVT::i1), // idxen
@@ -9404,8 +9435,13 @@ void SITargetLowering::setBufferOffsets(SDValue CombinedOffset,
94049435
return;
94059436
}
94069437
}
9438+
9439+
SDValue SOffsetZero = Subtarget->hasRestrictedSOffset()
9440+
? DAG.getRegister(AMDGPU::SGPR_NULL, MVT::i32)
9441+
: DAG.getConstant(0, DL, MVT::i32);
9442+
94079443
Offsets[0] = CombinedOffset;
9408-
Offsets[1] = DAG.getConstant(0, DL, MVT::i32);
9444+
Offsets[1] = SOffsetZero;
94099445
Offsets[2] = DAG.getTargetConstant(0, DL, MVT::i32);
94109446
}
94119447

@@ -9663,7 +9699,8 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
96639699
if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
96649700
AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
96659701
if (!Op->isDivergent() && Alignment >= Align(4) && NumElements < 32) {
9666-
if (MemVT.isPow2VectorType())
9702+
if (MemVT.isPow2VectorType() ||
9703+
(Subtarget->hasScalarDwordx3Loads() && NumElements == 3))
96679704
return SDValue();
96689705
return WidenOrSplitVectorLoad(Op, DAG);
96699706
}
@@ -9679,7 +9716,8 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
96799716
if (Subtarget->getScalarizeGlobalBehavior() && !Op->isDivergent() &&
96809717
Load->isSimple() && isMemOpHasNoClobberedMemOperand(Load) &&
96819718
Alignment >= Align(4) && NumElements < 32) {
9682-
if (MemVT.isPow2VectorType())
9719+
if (MemVT.isPow2VectorType() ||
9720+
(Subtarget->hasScalarDwordx3Loads() && NumElements == 3))
96839721
return SDValue();
96849722
return WidenOrSplitVectorLoad(Op, DAG);
96859723
}

0 commit comments

Comments
 (0)