Skip to content

Commit 65eb443

Browse files
committed
[AMDGPU][SILoadStoreOptimizer] Merge constrained sloads
Consider constrained multi-dword loads when merging individual loads into a single multi-dword load.
1 parent 37495cb commit 65eb443

File tree

116 files changed

+5687
-5549
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

116 files changed

+5687
-5549
lines changed

llvm/lib/Target/AMDGPU/GCNSubtarget.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -967,6 +967,7 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
967967

968968
bool hasLDSFPAtomicAddF32() const { return GFX8Insts; }
969969
bool hasLDSFPAtomicAddF64() const { return GFX90AInsts; }
970+
bool hasXnackReplay() const { return GFX8Insts; }
970971

971972
/// \returns true if the subtarget has the v_permlanex16_b32 instruction.
972973
bool hasPermLaneX16() const { return getGeneration() >= GFX10; }

llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp

Lines changed: 63 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -216,7 +216,8 @@ class SILoadStoreOptimizer : public MachineFunctionPass {
216216
CombineInfo &Paired, bool Modify = false);
217217
static bool widthsFit(const GCNSubtarget &STI, const CombineInfo &CI,
218218
const CombineInfo &Paired);
219-
static unsigned getNewOpcode(const CombineInfo &CI, const CombineInfo &Paired);
219+
static unsigned getNewOpcode(const CombineInfo &CI, const CombineInfo &Paired,
220+
const GCNSubtarget *STI = nullptr);
220221
static std::pair<unsigned, unsigned> getSubRegIdxs(const CombineInfo &CI,
221222
const CombineInfo &Paired);
222223
const TargetRegisterClass *
@@ -343,6 +344,7 @@ static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) {
343344
case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
344345
case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
345346
case AMDGPU::S_LOAD_DWORD_IMM:
347+
case AMDGPU::S_LOAD_DWORD_IMM_ec:
346348
case AMDGPU::GLOBAL_LOAD_DWORD:
347349
case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
348350
case AMDGPU::GLOBAL_STORE_DWORD:
@@ -353,6 +355,7 @@ static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) {
353355
case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
354356
case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
355357
case AMDGPU::S_LOAD_DWORDX2_IMM:
358+
case AMDGPU::S_LOAD_DWORDX2_IMM_ec:
356359
case AMDGPU::GLOBAL_LOAD_DWORDX2:
357360
case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
358361
case AMDGPU::GLOBAL_STORE_DWORDX2:
@@ -363,6 +366,7 @@ static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) {
363366
case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM:
364367
case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM:
365368
case AMDGPU::S_LOAD_DWORDX3_IMM:
369+
case AMDGPU::S_LOAD_DWORDX3_IMM_ec:
366370
case AMDGPU::GLOBAL_LOAD_DWORDX3:
367371
case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
368372
case AMDGPU::GLOBAL_STORE_DWORDX3:
@@ -373,6 +377,7 @@ static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) {
373377
case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
374378
case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
375379
case AMDGPU::S_LOAD_DWORDX4_IMM:
380+
case AMDGPU::S_LOAD_DWORDX4_IMM_ec:
376381
case AMDGPU::GLOBAL_LOAD_DWORDX4:
377382
case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
378383
case AMDGPU::GLOBAL_STORE_DWORDX4:
@@ -383,6 +388,7 @@ static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) {
383388
case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
384389
case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
385390
case AMDGPU::S_LOAD_DWORDX8_IMM:
391+
case AMDGPU::S_LOAD_DWORDX8_IMM_ec:
386392
return 8;
387393
case AMDGPU::DS_READ_B32:
388394
case AMDGPU::DS_READ_B32_gfx9:
@@ -507,6 +513,11 @@ static InstClassEnum getInstClass(unsigned Opc, const SIInstrInfo &TII) {
507513
case AMDGPU::S_LOAD_DWORDX3_IMM:
508514
case AMDGPU::S_LOAD_DWORDX4_IMM:
509515
case AMDGPU::S_LOAD_DWORDX8_IMM:
516+
case AMDGPU::S_LOAD_DWORD_IMM_ec:
517+
case AMDGPU::S_LOAD_DWORDX2_IMM_ec:
518+
case AMDGPU::S_LOAD_DWORDX3_IMM_ec:
519+
case AMDGPU::S_LOAD_DWORDX4_IMM_ec:
520+
case AMDGPU::S_LOAD_DWORDX8_IMM_ec:
510521
return S_LOAD_IMM;
511522
case AMDGPU::DS_READ_B32:
512523
case AMDGPU::DS_READ_B32_gfx9:
@@ -591,6 +602,11 @@ static unsigned getInstSubclass(unsigned Opc, const SIInstrInfo &TII) {
591602
case AMDGPU::S_LOAD_DWORDX3_IMM:
592603
case AMDGPU::S_LOAD_DWORDX4_IMM:
593604
case AMDGPU::S_LOAD_DWORDX8_IMM:
605+
case AMDGPU::S_LOAD_DWORD_IMM_ec:
606+
case AMDGPU::S_LOAD_DWORDX2_IMM_ec:
607+
case AMDGPU::S_LOAD_DWORDX3_IMM_ec:
608+
case AMDGPU::S_LOAD_DWORDX4_IMM_ec:
609+
case AMDGPU::S_LOAD_DWORDX8_IMM_ec:
594610
return AMDGPU::S_LOAD_DWORD_IMM;
595611
case AMDGPU::GLOBAL_LOAD_DWORD:
596612
case AMDGPU::GLOBAL_LOAD_DWORDX2:
@@ -703,6 +719,11 @@ static AddressRegs getRegs(unsigned Opc, const SIInstrInfo &TII) {
703719
case AMDGPU::S_LOAD_DWORDX3_IMM:
704720
case AMDGPU::S_LOAD_DWORDX4_IMM:
705721
case AMDGPU::S_LOAD_DWORDX8_IMM:
722+
case AMDGPU::S_LOAD_DWORD_IMM_ec:
723+
case AMDGPU::S_LOAD_DWORDX2_IMM_ec:
724+
case AMDGPU::S_LOAD_DWORDX3_IMM_ec:
725+
case AMDGPU::S_LOAD_DWORDX4_IMM_ec:
726+
case AMDGPU::S_LOAD_DWORDX8_IMM_ec:
706727
Result.SBase = true;
707728
return Result;
708729
case AMDGPU::DS_READ_B32:
@@ -1212,8 +1233,17 @@ void SILoadStoreOptimizer::copyToDestRegs(
12121233

12131234
// Copy to the old destination registers.
12141235
const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
1215-
const auto *Dest0 = TII->getNamedOperand(*CI.I, OpName);
1216-
const auto *Dest1 = TII->getNamedOperand(*Paired.I, OpName);
1236+
auto *Dest0 = TII->getNamedOperand(*CI.I, OpName);
1237+
auto *Dest1 = TII->getNamedOperand(*Paired.I, OpName);
1238+
1239+
// The constrained sload instructions in the S_LOAD_IMM class have the
1240+
// `early-clobber` flag set on the dst operand. Clear the flag before using
1241+
// the MOs in copies.
1242+
if (Dest0->isEarlyClobber())
1243+
Dest0->setIsEarlyClobber(false);
1244+
1245+
if (Dest1->isEarlyClobber())
1246+
Dest1->setIsEarlyClobber(false);
12171247

12181248
BuildMI(*MBB, InsertBefore, DL, CopyDesc)
12191249
.add(*Dest0) // Copy to same destination including flags and sub reg.
@@ -1446,7 +1476,7 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeSMemLoadImmPair(
14461476
MachineBasicBlock::iterator InsertBefore) {
14471477
MachineBasicBlock *MBB = CI.I->getParent();
14481478
DebugLoc DL = CI.I->getDebugLoc();
1449-
const unsigned Opcode = getNewOpcode(CI, Paired);
1479+
const unsigned Opcode = getNewOpcode(CI, Paired, STM);
14501480

14511481
const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
14521482

@@ -1658,7 +1688,8 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeFlatStorePair(
16581688
}
16591689

16601690
unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI,
1661-
const CombineInfo &Paired) {
1691+
const CombineInfo &Paired,
1692+
const GCNSubtarget *STI) {
16621693
const unsigned Width = CI.Width + Paired.Width;
16631694

16641695
switch (getCommonInstClass(CI, Paired)) {
@@ -1701,17 +1732,33 @@ unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI,
17011732
return AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM;
17021733
}
17031734
case S_LOAD_IMM:
1704-
switch (Width) {
1705-
default:
1706-
return 0;
1707-
case 2:
1708-
return AMDGPU::S_LOAD_DWORDX2_IMM;
1709-
case 3:
1710-
return AMDGPU::S_LOAD_DWORDX3_IMM;
1711-
case 4:
1712-
return AMDGPU::S_LOAD_DWORDX4_IMM;
1713-
case 8:
1714-
return AMDGPU::S_LOAD_DWORDX8_IMM;
1735+
// For targets that support XNACK replay, use the constrained load opcode.
1736+
if (STI && STI->hasXnackReplay()) {
1737+
switch (Width) {
1738+
default:
1739+
return 0;
1740+
case 2:
1741+
return AMDGPU::S_LOAD_DWORDX2_IMM_ec;
1742+
case 3:
1743+
return AMDGPU::S_LOAD_DWORDX3_IMM_ec;
1744+
case 4:
1745+
return AMDGPU::S_LOAD_DWORDX4_IMM_ec;
1746+
case 8:
1747+
return AMDGPU::S_LOAD_DWORDX8_IMM_ec;
1748+
}
1749+
} else {
1750+
switch (Width) {
1751+
default:
1752+
return 0;
1753+
case 2:
1754+
return AMDGPU::S_LOAD_DWORDX2_IMM;
1755+
case 3:
1756+
return AMDGPU::S_LOAD_DWORDX3_IMM;
1757+
case 4:
1758+
return AMDGPU::S_LOAD_DWORDX4_IMM;
1759+
case 8:
1760+
return AMDGPU::S_LOAD_DWORDX8_IMM;
1761+
}
17151762
}
17161763
case GLOBAL_LOAD:
17171764
switch (Width) {

0 commit comments

Comments
 (0)