Skip to content

Commit 4a77414

Browse files
authored
[AMDGPU] CodeGen for GFX12 8/16-bit SMEM loads (llvm#77633)
1 parent 42b9ea8 commit 4a77414

22 files changed

+1330
-302
lines changed

llvm/lib/Target/AMDGPU/AMDGPUGISel.td

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -265,6 +265,10 @@ def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_FMIN, SIbuffer_atomic_fmin>;
265265
def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_FMAX, SIbuffer_atomic_fmax>;
266266
def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_CMPSWAP, SIbuffer_atomic_cmpswap>;
267267
def : GINodeEquiv<G_AMDGPU_S_BUFFER_LOAD, SIsbuffer_load>;
268+
def : GINodeEquiv<G_AMDGPU_S_BUFFER_LOAD_SBYTE, SIsbuffer_load_byte>;
269+
def : GINodeEquiv<G_AMDGPU_S_BUFFER_LOAD_UBYTE, SIsbuffer_load_ubyte>;
270+
def : GINodeEquiv<G_AMDGPU_S_BUFFER_LOAD_SSHORT, SIsbuffer_load_short>;
271+
def : GINodeEquiv<G_AMDGPU_S_BUFFER_LOAD_USHORT, SIsbuffer_load_ushort>;
268272

269273
def : GINodeEquiv<G_FPTRUNC_ROUND_UPWARD, SIfptrunc_round_upward>;
270274
def : GINodeEquiv<G_FPTRUNC_ROUND_DOWNWARD, SIfptrunc_round_downward>;

llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3186,10 +3186,11 @@ bool AMDGPUDAGToDAGISel::isVGPRImm(const SDNode * N) const {
31863186
bool AMDGPUDAGToDAGISel::isUniformLoad(const SDNode * N) const {
31873187
auto Ld = cast<LoadSDNode>(N);
31883188

3189-
if (N->isDivergent() && !AMDGPUInstrInfo::isUniformMMO(Ld->getMemOperand()))
3189+
const MachineMemOperand *MMO = Ld->getMemOperand();
3190+
if (N->isDivergent() && !AMDGPUInstrInfo::isUniformMMO(MMO))
31903191
return false;
31913192

3192-
return Ld->getAlign() >= Align(4) &&
3193+
return Ld->getAlign() >= Align(std::min(MMO->getSize(), uint64_t(4))) &&
31933194
((Ld->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
31943195
Ld->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) ||
31953196
(Subtarget->getScalarizeGlobalBehavior() &&

llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -784,6 +784,7 @@ bool AMDGPUTargetLowering::shouldReduceLoadWidth(SDNode *N,
784784
unsigned AS = MN->getAddressSpace();
785785
// Do not shrink an aligned scalar load to sub-dword.
786786
// Scalar engine cannot do sub-dword loads.
787+
// TODO: Update this for GFX12 which does have scalar sub-dword loads.
787788
if (OldSize >= 32 && NewSize < 32 && MN->getAlign() >= Align(4) &&
788789
(AS == AMDGPUAS::CONSTANT_ADDRESS ||
789790
AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
@@ -5453,6 +5454,10 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
54535454
NODE_NAME_CASE(BUFFER_LOAD_FORMAT_TFE)
54545455
NODE_NAME_CASE(BUFFER_LOAD_FORMAT_D16)
54555456
NODE_NAME_CASE(SBUFFER_LOAD)
5457+
NODE_NAME_CASE(SBUFFER_LOAD_BYTE)
5458+
NODE_NAME_CASE(SBUFFER_LOAD_UBYTE)
5459+
NODE_NAME_CASE(SBUFFER_LOAD_SHORT)
5460+
NODE_NAME_CASE(SBUFFER_LOAD_USHORT)
54565461
NODE_NAME_CASE(BUFFER_STORE)
54575462
NODE_NAME_CASE(BUFFER_STORE_BYTE)
54585463
NODE_NAME_CASE(BUFFER_STORE_SHORT)

llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -567,6 +567,10 @@ enum NodeType : unsigned {
567567
BUFFER_LOAD_FORMAT_TFE,
568568
BUFFER_LOAD_FORMAT_D16,
569569
SBUFFER_LOAD,
570+
SBUFFER_LOAD_BYTE,
571+
SBUFFER_LOAD_UBYTE,
572+
SBUFFER_LOAD_SHORT,
573+
SBUFFER_LOAD_USHORT,
570574
BUFFER_STORE,
571575
BUFFER_STORE_BYTE,
572576
BUFFER_STORE_SHORT,

llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -90,6 +90,8 @@ bool AMDGPULateCodeGenPrepare::runOnFunction(Function &F) {
9090
if (skipFunction(F))
9191
return false;
9292

93+
// TODO: Skip this on GFX12 which does have scalar sub-dword loads.
94+
9395
AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
9496
UA = &getAnalysis<UniformityInfoWrapperPass>().getUniformityInfo();
9597

llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp

Lines changed: 24 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -6443,15 +6443,28 @@ bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
64436443
return true;
64446444
}
64456445

6446-
bool AMDGPULegalizerInfo::legalizeSBufferLoad(
6447-
LegalizerHelper &Helper, MachineInstr &MI) const {
6446+
bool AMDGPULegalizerInfo::legalizeSBufferLoad(LegalizerHelper &Helper,
6447+
MachineInstr &MI) const {
64486448
MachineIRBuilder &B = Helper.MIRBuilder;
64496449
GISelChangeObserver &Observer = Helper.Observer;
64506450

6451-
Register Dst = MI.getOperand(0).getReg();
6452-
LLT Ty = B.getMRI()->getType(Dst);
6451+
Register OrigDst = MI.getOperand(0).getReg();
6452+
Register Dst;
6453+
LLT Ty = B.getMRI()->getType(OrigDst);
64536454
unsigned Size = Ty.getSizeInBits();
64546455
MachineFunction &MF = B.getMF();
6456+
unsigned Opc = 0;
6457+
if (Size < 32 && ST.hasScalarSubwordLoads()) {
6458+
assert(Size == 8 || Size == 16);
6459+
Opc = Size == 8 ? AMDGPU::G_AMDGPU_S_BUFFER_LOAD_UBYTE
6460+
: AMDGPU::G_AMDGPU_S_BUFFER_LOAD_USHORT;
6461+
// The 8-bit and 16-bit scalar buffer load instructions have a 32-bit
6462+
// destination register.
6463+
Dst = B.getMRI()->createGenericVirtualRegister(LLT::scalar(32));
6464+
} else {
6465+
Opc = AMDGPU::G_AMDGPU_S_BUFFER_LOAD;
6466+
Dst = OrigDst;
6467+
}
64556468

64566469
Observer.changingInstr(MI);
64576470

@@ -6469,19 +6482,24 @@ bool AMDGPULegalizerInfo::legalizeSBufferLoad(
64696482
// FIXME: We don't really need this intermediate instruction. The intrinsic
64706483
// should be fixed to have a memory operand. Since it's readnone, we're not
64716484
// allowed to add one.
6472-
MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD));
6485+
MI.setDesc(B.getTII().get(Opc));
64736486
MI.removeOperand(1); // Remove intrinsic ID
64746487

64756488
// FIXME: When intrinsic definition is fixed, this should have an MMO already.
64766489
// TODO: Should this use datalayout alignment?
64776490
const unsigned MemSize = (Size + 7) / 8;
6478-
const Align MemAlign(4);
6491+
const Align MemAlign(std::min(MemSize, 4u));
64796492
MachineMemOperand *MMO = MF.getMachineMemOperand(
64806493
MachinePointerInfo(),
64816494
MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
64826495
MachineMemOperand::MOInvariant,
64836496
MemSize, MemAlign);
64846497
MI.addMemOperand(MF, MMO);
6498+
if (Dst != OrigDst) {
6499+
MI.getOperand(0).setReg(Dst);
6500+
B.setInsertPt(B.getMBB(), ++B.getInsertPt());
6501+
B.buildTrunc(OrigDst, Dst);
6502+
}
64856503

64866504
// If we don't have 96-bit result scalar loads, widening to 128-bit should
64876505
// always be legal. We may need to restore this to a 96-bit result if it turns

llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -202,6 +202,7 @@ static bool lowerKernelArguments(Function &F, const TargetMachine &TM) {
202202
// Since we don't have sub-dword scalar loads, avoid doing an extload by
203203
// loading earlier than the argument address, and extracting the relevant
204204
// bits.
205+
// TODO: Update this for GFX12 which does have scalar sub-dword loads.
205206
//
206207
// Additionally widen any sub-dword load to i32 even if suitably aligned,
207208
// so that CSE between different argument loads works easily.

llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -411,6 +411,12 @@ bool AMDGPUPostLegalizerCombinerImpl::matchCombineSignExtendInReg(
411411
case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
412412
MatchData = {LoadMI, AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT};
413413
return Width == 16;
414+
case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_UBYTE:
415+
MatchData = {LoadMI, AMDGPU::G_AMDGPU_S_BUFFER_LOAD_SBYTE};
416+
return Width == 8;
417+
case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_USHORT:
418+
MatchData = {LoadMI, AMDGPU::G_AMDGPU_S_BUFFER_LOAD_SSHORT};
419+
return Width == 16;
414420
}
415421
return false;
416422
}

llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp

Lines changed: 23 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -449,8 +449,13 @@ bool AMDGPURegisterBankInfo::isScalarLoadLegal(const MachineInstr &MI) const {
449449
const unsigned AS = MMO->getAddrSpace();
450450
const bool IsConst = AS == AMDGPUAS::CONSTANT_ADDRESS ||
451451
AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT;
452+
const unsigned MemSize = 8 * MMO->getSize();
453+
452454
// Require 4-byte alignment, or natural alignment for 8-bit and 16-bit
// accesses when the subtarget has scalar subword loads.
453-
return MMO->getAlign() >= Align(4) &&
455+
return (MMO->getAlign() >= Align(4) ||
456+
(Subtarget.hasScalarSubwordLoads() &&
457+
((MemSize == 16 && MMO->getAlign() >= Align(2)) ||
458+
(MemSize == 8 && MMO->getAlign() >= Align(1))))) &&
454459
// Can't do a scalar atomic load.
455460
!MMO->isAtomic() &&
456461
// Don't use scalar loads for volatile accesses to non-constant address
@@ -1074,6 +1079,13 @@ bool AMDGPURegisterBankInfo::applyMappingLoad(
10741079
(MemSize == 32 || LoadTy.isVector() || !isScalarLoadLegal(MI)))
10751080
return false;
10761081

1082+
if (LoadSize == 32 &&
1083+
((MemSize == 8 && MMO->getAlign() >= Align(1)) ||
1084+
(MemSize == 16 && MMO->getAlign() >= Align(2))) &&
1085+
isScalarLoadLegal(MI) &&
1086+
Subtarget.getGeneration() >= AMDGPUSubtarget::GFX12)
1087+
return false;
1088+
10771089
Register PtrReg = MI.getOperand(1).getReg();
10781090

10791091
ApplyRegBankMapping ApplyBank(B, *this, MRI, DstBank);
@@ -3073,7 +3085,11 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
30733085
executeInWaterfallLoop(B, MI, {3, 6});
30743086
return;
30753087
}
3076-
case AMDGPU::G_AMDGPU_S_BUFFER_LOAD: {
3088+
case AMDGPU::G_AMDGPU_S_BUFFER_LOAD:
3089+
case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_UBYTE:
3090+
case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_SBYTE:
3091+
case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_USHORT:
3092+
case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_SSHORT: {
30773093
applyMappingSBufferLoad(B, OpdMapper);
30783094
return;
30793095
}
@@ -4396,7 +4412,11 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
43964412
// initialized.
43974413
break;
43984414
}
4399-
case AMDGPU::G_AMDGPU_S_BUFFER_LOAD: {
4415+
case AMDGPU::G_AMDGPU_S_BUFFER_LOAD:
4416+
case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_UBYTE:
4417+
case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_SBYTE:
4418+
case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_USHORT:
4419+
case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_SSHORT: {
44004420
// Lie and claim everything is legal, even though some need to be
44014421
// SGPRs. applyMapping will have to deal with it as a waterfall loop.
44024422
OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);

llvm/lib/Target/AMDGPU/GCNSubtarget.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -429,6 +429,8 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
429429
return GFX9Insts;
430430
}
431431

432+
/// \returns true if this subtarget has scalar sub-dword loads, which is the
/// case from the GFX12 generation onward.
bool hasScalarSubwordLoads() const {
  const bool IsGFX12OrLater = getGeneration() >= GFX12;
  return IsGFX12OrLater;
}
433+
432434
TrapHandlerAbi getTrapHandlerAbi() const {
433435
return isAmdHsaOS() ? TrapHandlerAbi::AMDHSA : TrapHandlerAbi::NONE;
434436
}

0 commit comments

Comments
 (0)