-
Notifications
You must be signed in to change notification settings - Fork 14.3k
[AMDGPU] Rematerialize scalar loads #68778
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
Extend the list of instructions that can be rematerialized in SIInstrInfo::isReallyTriviallyReMaterializable() to support scalar loads. Try shrinking instructions to remat only the part needed for current context. Add SIInstrInfo::reMaterialize target hook, and handle shrinking of S_LOAD_DWORDX16_IMM to S_LOAD_DWORDX8_IMM as a proof of concept.
Ported from https://reviews.llvm.org/D154083 (and rebased). |
@llvm/pr-subscribers-backend-amdgpu Author: Piotr Sobczak (piotrAMD) ChangesExtend the list of instructions that can be rematerialized in SIInstrInfo::isReallyTriviallyReMaterializable() to support scalar loads. Try shrinking instructions to remat only the part needed for current context. Add SIInstrInfo::reMaterialize target hook, and handle shrinking of S_LOAD_DWORDX16_IMM to S_LOAD_DWORDX8_IMM as a proof of concept. Patch is 44.21 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/68778.diff 5 Files Affected:
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 51397cbb791469d..31a086cd2a61820 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -108,7 +108,18 @@ static bool nodesHaveSameOperandValue(SDNode *N0, SDNode* N1, unsigned OpName) {
bool SIInstrInfo::isReallyTriviallyReMaterializable(
const MachineInstr &MI) const {
+
+ bool CanRemat = false;
if (isVOP1(MI) || isVOP2(MI) || isVOP3(MI) || isSDWA(MI) || isSALU(MI)) {
+ CanRemat = true;
+ } else if (isSMRD(MI)) {
+ CanRemat = !MI.memoperands_empty() &&
+ llvm::all_of(MI.memoperands(), [](const MachineMemOperand *MMO) {
+ return MMO->isLoad() && MMO->isInvariant();
+ });
+ }
+
+ if (CanRemat) {
// Normally VALU use of exec would block the rematerialization, but that
// is OK in this case to have an implicit exec read as all VALU do.
// We really want all of the generic logic for this except for this.
@@ -2434,6 +2445,105 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
return true;
}
+void SIInstrInfo::reMaterialize(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I, Register DestReg,
+ unsigned SubIdx, const MachineInstr &Orig,
+ const TargetRegisterInfo &RI) const {
+
+ // Try shrinking the instruction to remat only the part needed for current
+ // context.
+ // TODO: Handle more cases.
+ unsigned Opcode = Orig.getOpcode();
+ switch (Opcode) {
+ case AMDGPU::S_LOAD_DWORDX16_IMM:
+ case AMDGPU::S_LOAD_DWORDX8_IMM: {
+ if (SubIdx != 0)
+ break;
+
+ if (I == MBB.end())
+ break;
+
+ if (I->isBundled())
+ break;
+
+ // Look for a single use of the register that is also a subreg.
+ Register RegToFind = Orig.getOperand(0).getReg();
+ int SingleUseIdx = -1;
+ for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i) {
+ const MachineOperand &CandMO = I->getOperand(i);
+ if (!CandMO.isReg())
+ continue;
+ Register CandReg = CandMO.getReg();
+ if (!CandReg)
+ continue;
+
+ if (CandReg == RegToFind || RI.regsOverlap(CandReg, RegToFind)) {
+ if (SingleUseIdx == -1 && CandMO.isUse()) {
+ SingleUseIdx = i;
+ } else {
+ SingleUseIdx = -1;
+ break;
+ }
+ }
+ }
+ if (SingleUseIdx == -1)
+ break;
+ MachineOperand *UseMO = &I->getOperand(SingleUseIdx);
+ if (UseMO->getSubReg() == AMDGPU::NoSubRegister)
+ break;
+
+ unsigned Offset = RI.getSubRegIdxOffset(UseMO->getSubReg());
+ unsigned SubregSize = RI.getSubRegIdxSize(UseMO->getSubReg());
+
+ MachineFunction *MF = MBB.getParent();
+ MachineRegisterInfo &MRI = MF->getRegInfo();
+ assert(MRI.hasAtMostUserInstrs(DestReg, 0) &&
+ "DestReg should have no users yet.");
+
+ unsigned NewOpcode = -1;
+ if (SubregSize == 256)
+ NewOpcode = AMDGPU::S_LOAD_DWORDX8_IMM;
+ else if (SubregSize == 128)
+ NewOpcode = AMDGPU::S_LOAD_DWORDX4_IMM;
+ else
+ break;
+
+ const MCInstrDesc &TID = get(NewOpcode);
+ const TargetRegisterClass *NewRC =
+ RI.getAllocatableClass(getRegClass(TID, 0, &RI, *MF));
+ MRI.setRegClass(DestReg, NewRC);
+
+ UseMO->setReg(DestReg);
+ UseMO->setSubReg(AMDGPU::NoSubRegister);
+
+ // Use a smaller load with the desired size, possibly with updated offset.
+ MachineInstr *MI = MF->CloneMachineInstr(&Orig);
+ MI->setDesc(TID);
+ MI->getOperand(0).setReg(DestReg);
+ MI->getOperand(0).setSubReg(AMDGPU::NoSubRegister);
+ if (Offset) {
+ MachineOperand *OffsetMO = getNamedOperand(*MI, AMDGPU::OpName::offset);
+ int64_t FinalOffset = OffsetMO->getImm() + Offset / 8;
+ OffsetMO->setImm(FinalOffset);
+ }
+ SmallVector<MachineMemOperand *> NewMMOs;
+ for (const MachineMemOperand *MemOp : Orig.memoperands())
+ NewMMOs.push_back(MF->getMachineMemOperand(MemOp, MemOp->getPointerInfo(),
+ SubregSize / 8));
+ MI->setMemRefs(*MF, NewMMOs);
+
+ MBB.insert(I, MI);
+ return;
+ }
+
+ default:
+ break;
+ }
+ MachineInstr *MI = MBB.getParent()->CloneMachineInstr(&Orig);
+ MI->substituteRegister(MI->getOperand(0).getReg(), DestReg, SubIdx, RI);
+ MBB.insert(I, MI);
+}
+
std::pair<MachineInstr*, MachineInstr*>
SIInstrInfo::expandMovDPP64(MachineInstr &MI) const {
assert (MI.getOpcode() == AMDGPU::V_MOV_B64_DPP_PSEUDO);
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index 5ef17c44f7de389..a64cf0244e4c08c 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -275,6 +275,11 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
bool expandPostRAPseudo(MachineInstr &MI) const override;
+ void reMaterialize(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
+ Register DestReg, unsigned SubIdx,
+ const MachineInstr &Orig,
+ const TargetRegisterInfo &TRI) const override;
+
// Splits a V_MOV_B64_DPP_PSEUDO opcode into a pair of v_mov_b32_dpp
// instructions. Returns a pair of generated instructions.
// Can split either post-RA with physical registers or pre-RA with
diff --git a/llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props-v3.ll b/llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props-v3.ll
index d6f7a92af9dcb6f..1999c7b065e6854 100644
--- a/llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props-v3.ll
+++ b/llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props-v3.ll
@@ -47,8 +47,8 @@ entry:
}
; CHECK: .name: num_spilled_sgprs
-; GFX700: .sgpr_spill_count: 38
-; GFX803: .sgpr_spill_count: 22
+; GFX700: .sgpr_spill_count: 12
+; GFX803: .sgpr_spill_count: 12
; GFX900: .sgpr_spill_count: 48
; GFX1010: .sgpr_spill_count: 48
; CHECK: .symbol: num_spilled_sgprs.kd
diff --git a/llvm/test/CodeGen/AMDGPU/remat-smrd.mir b/llvm/test/CodeGen/AMDGPU/remat-smrd.mir
index 753cb135fdae9c0..95eac12a65389ef 100644
--- a/llvm/test/CodeGen/AMDGPU/remat-smrd.mir
+++ b/llvm/test/CodeGen/AMDGPU/remat-smrd.mir
@@ -12,15 +12,11 @@ body: |
; GCN: liveins: $sgpr8_sgpr9
; GCN-NEXT: {{ $}}
; GCN-NEXT: renamable $sgpr2_sgpr3 = COPY $sgpr8_sgpr9
- ; GCN-NEXT: renamable $sgpr0 = S_LOAD_DWORD_IMM renamable $sgpr2_sgpr3, 0, 0 :: (dereferenceable invariant load (s32), addrspace 4)
- ; GCN-NEXT: SI_SPILL_S32_SAVE killed renamable $sgpr0, %stack.0, implicit $exec, implicit $sp_reg :: (store (s32) into %stack.0, addrspace 5)
; GCN-NEXT: renamable $sgpr1 = S_LOAD_DWORD_IMM renamable $sgpr2_sgpr3, 4, 0 :: (dereferenceable invariant load (s32), addrspace 4)
- ; GCN-NEXT: renamable $sgpr0 = S_LOAD_DWORD_IMM renamable $sgpr2_sgpr3, 8, 0 :: (dereferenceable invariant load (s32), addrspace 4)
- ; GCN-NEXT: SI_SPILL_S32_SAVE killed renamable $sgpr0, %stack.1, implicit $exec, implicit $sp_reg :: (store (s32) into %stack.1, addrspace 5)
- ; GCN-NEXT: renamable $sgpr0 = SI_SPILL_S32_RESTORE %stack.0, implicit $exec, implicit $sp_reg :: (load (s32) from %stack.0, addrspace 5)
+ ; GCN-NEXT: renamable $sgpr0 = S_LOAD_DWORD_IMM renamable $sgpr2_sgpr3, 0, 0 :: (dereferenceable invariant load (s32), addrspace 4)
; GCN-NEXT: S_NOP 0, implicit killed renamable $sgpr0
; GCN-NEXT: S_NOP 0, implicit killed renamable $sgpr1
- ; GCN-NEXT: renamable $sgpr0 = SI_SPILL_S32_RESTORE %stack.1, implicit $exec, implicit $sp_reg :: (load (s32) from %stack.1, addrspace 5)
+ ; GCN-NEXT: renamable $sgpr0 = S_LOAD_DWORD_IMM renamable $sgpr2_sgpr3, 8, 0 :: (dereferenceable invariant load (s32), addrspace 4)
; GCN-NEXT: S_NOP 0, implicit killed renamable $sgpr0
; GCN-NEXT: S_ENDPGM 0, implicit killed renamable $sgpr2_sgpr3
%0:sreg_64_xexec = COPY $sgpr8_sgpr9
@@ -44,16 +40,10 @@ body: |
; GCN-NEXT: {{ $}}
; GCN-NEXT: renamable $sgpr0_sgpr1 = COPY $sgpr8_sgpr9
; GCN-NEXT: renamable $sgpr2_sgpr3 = S_LOAD_DWORDX2_IMM renamable $sgpr0_sgpr1, 0, 0 :: (dereferenceable invariant load (s64), addrspace 4)
- ; GCN-NEXT: SI_SPILL_S64_SAVE killed renamable $sgpr2_sgpr3, %stack.1, implicit $exec, implicit $sp_reg :: (store (s64) into %stack.1, align 4, addrspace 5)
- ; GCN-NEXT: renamable $sgpr2_sgpr3 = S_LOAD_DWORDX2_IMM renamable $sgpr0_sgpr1, 4, 0 :: (dereferenceable invariant load (s64), addrspace 4)
- ; GCN-NEXT: SI_SPILL_S64_SAVE killed renamable $sgpr2_sgpr3, %stack.0, implicit $exec, implicit $sp_reg :: (store (s64) into %stack.0, align 4, addrspace 5)
- ; GCN-NEXT: renamable $sgpr2_sgpr3 = S_LOAD_DWORDX2_IMM renamable $sgpr0_sgpr1, 8, 0 :: (dereferenceable invariant load (s64), addrspace 4)
- ; GCN-NEXT: SI_SPILL_S64_SAVE killed renamable $sgpr2_sgpr3, %stack.2, implicit $exec, implicit $sp_reg :: (store (s64) into %stack.2, align 4, addrspace 5)
- ; GCN-NEXT: renamable $sgpr2_sgpr3 = SI_SPILL_S64_RESTORE %stack.1, implicit $exec, implicit $sp_reg :: (load (s64) from %stack.1, align 4, addrspace 5)
; GCN-NEXT: S_NOP 0, implicit killed renamable $sgpr2_sgpr3
- ; GCN-NEXT: renamable $sgpr2_sgpr3 = SI_SPILL_S64_RESTORE %stack.0, implicit $exec, implicit $sp_reg :: (load (s64) from %stack.0, align 4, addrspace 5)
+ ; GCN-NEXT: renamable $sgpr2_sgpr3 = S_LOAD_DWORDX2_IMM renamable $sgpr0_sgpr1, 4, 0 :: (dereferenceable invariant load (s64), addrspace 4)
; GCN-NEXT: S_NOP 0, implicit killed renamable $sgpr2_sgpr3
- ; GCN-NEXT: renamable $sgpr2_sgpr3 = SI_SPILL_S64_RESTORE %stack.2, implicit $exec, implicit $sp_reg :: (load (s64) from %stack.2, align 4, addrspace 5)
+ ; GCN-NEXT: renamable $sgpr2_sgpr3 = S_LOAD_DWORDX2_IMM renamable $sgpr0_sgpr1, 8, 0 :: (dereferenceable invariant load (s64), addrspace 4)
; GCN-NEXT: S_NOP 0, implicit killed renamable $sgpr2_sgpr3
; GCN-NEXT: S_ENDPGM 0, implicit killed renamable $sgpr0_sgpr1
%0:sreg_64_xexec = COPY $sgpr8_sgpr9
@@ -77,16 +67,10 @@ body: |
; GCN-NEXT: {{ $}}
; GCN-NEXT: renamable $sgpr0_sgpr1 = COPY $sgpr8_sgpr9
; GCN-NEXT: renamable $sgpr4_sgpr5_sgpr6_sgpr7 = S_LOAD_DWORDX4_IMM renamable $sgpr0_sgpr1, 0, 0 :: (dereferenceable invariant load (s128), addrspace 4)
- ; GCN-NEXT: SI_SPILL_S128_SAVE killed renamable $sgpr4_sgpr5_sgpr6_sgpr7, %stack.1, implicit $exec, implicit $sp_reg :: (store (s128) into %stack.1, align 4, addrspace 5)
- ; GCN-NEXT: renamable $sgpr4_sgpr5_sgpr6_sgpr7 = S_LOAD_DWORDX4_IMM renamable $sgpr0_sgpr1, 4, 0 :: (dereferenceable invariant load (s128), addrspace 4)
- ; GCN-NEXT: SI_SPILL_S128_SAVE killed renamable $sgpr4_sgpr5_sgpr6_sgpr7, %stack.0, implicit $exec, implicit $sp_reg :: (store (s128) into %stack.0, align 4, addrspace 5)
- ; GCN-NEXT: renamable $sgpr4_sgpr5_sgpr6_sgpr7 = S_LOAD_DWORDX4_IMM renamable $sgpr0_sgpr1, 8, 0 :: (dereferenceable invariant load (s128), addrspace 4)
- ; GCN-NEXT: SI_SPILL_S128_SAVE killed renamable $sgpr4_sgpr5_sgpr6_sgpr7, %stack.2, implicit $exec, implicit $sp_reg :: (store (s128) into %stack.2, align 4, addrspace 5)
- ; GCN-NEXT: renamable $sgpr4_sgpr5_sgpr6_sgpr7 = SI_SPILL_S128_RESTORE %stack.1, implicit $exec, implicit $sp_reg :: (load (s128) from %stack.1, align 4, addrspace 5)
; GCN-NEXT: S_NOP 0, implicit killed renamable $sgpr4_sgpr5_sgpr6_sgpr7
- ; GCN-NEXT: renamable $sgpr4_sgpr5_sgpr6_sgpr7 = SI_SPILL_S128_RESTORE %stack.0, implicit $exec, implicit $sp_reg :: (load (s128) from %stack.0, align 4, addrspace 5)
+ ; GCN-NEXT: renamable $sgpr4_sgpr5_sgpr6_sgpr7 = S_LOAD_DWORDX4_IMM renamable $sgpr0_sgpr1, 4, 0 :: (dereferenceable invariant load (s128), addrspace 4)
; GCN-NEXT: S_NOP 0, implicit killed renamable $sgpr4_sgpr5_sgpr6_sgpr7
- ; GCN-NEXT: renamable $sgpr4_sgpr5_sgpr6_sgpr7 = SI_SPILL_S128_RESTORE %stack.2, implicit $exec, implicit $sp_reg :: (load (s128) from %stack.2, align 4, addrspace 5)
+ ; GCN-NEXT: renamable $sgpr4_sgpr5_sgpr6_sgpr7 = S_LOAD_DWORDX4_IMM renamable $sgpr0_sgpr1, 8, 0 :: (dereferenceable invariant load (s128), addrspace 4)
; GCN-NEXT: S_NOP 0, implicit killed renamable $sgpr4_sgpr5_sgpr6_sgpr7
; GCN-NEXT: S_ENDPGM 0, implicit killed renamable $sgpr0_sgpr1
%0:sreg_64_xexec = COPY $sgpr8_sgpr9
@@ -110,16 +94,10 @@ body: |
; GCN-NEXT: {{ $}}
; GCN-NEXT: renamable $sgpr0_sgpr1 = COPY $sgpr8_sgpr9
; GCN-NEXT: renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11 = S_LOAD_DWORDX8_IMM renamable $sgpr0_sgpr1, 0, 0 :: (dereferenceable invariant load (s256), addrspace 4)
- ; GCN-NEXT: SI_SPILL_S256_SAVE killed renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11, %stack.1, implicit $exec, implicit $sp_reg :: (store (s256) into %stack.1, align 4, addrspace 5)
- ; GCN-NEXT: renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11 = S_LOAD_DWORDX8_IMM renamable $sgpr0_sgpr1, 4, 0 :: (dereferenceable invariant load (s256), addrspace 4)
- ; GCN-NEXT: SI_SPILL_S256_SAVE killed renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11, %stack.0, implicit $exec, implicit $sp_reg :: (store (s256) into %stack.0, align 4, addrspace 5)
- ; GCN-NEXT: renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11 = S_LOAD_DWORDX8_IMM renamable $sgpr0_sgpr1, 8, 0 :: (dereferenceable invariant load (s256), addrspace 4)
- ; GCN-NEXT: SI_SPILL_S256_SAVE killed renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11, %stack.2, implicit $exec, implicit $sp_reg :: (store (s256) into %stack.2, align 4, addrspace 5)
- ; GCN-NEXT: renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11 = SI_SPILL_S256_RESTORE %stack.1, implicit $exec, implicit $sp_reg :: (load (s256) from %stack.1, align 4, addrspace 5)
; GCN-NEXT: S_NOP 0, implicit killed renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11
- ; GCN-NEXT: renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11 = SI_SPILL_S256_RESTORE %stack.0, implicit $exec, implicit $sp_reg :: (load (s256) from %stack.0, align 4, addrspace 5)
+ ; GCN-NEXT: renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11 = S_LOAD_DWORDX8_IMM renamable $sgpr0_sgpr1, 4, 0 :: (dereferenceable invariant load (s256), addrspace 4)
; GCN-NEXT: S_NOP 0, implicit killed renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11
- ; GCN-NEXT: renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11 = SI_SPILL_S256_RESTORE %stack.2, implicit $exec, implicit $sp_reg :: (load (s256) from %stack.2, align 4, addrspace 5)
+ ; GCN-NEXT: renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11 = S_LOAD_DWORDX8_IMM renamable $sgpr0_sgpr1, 8, 0 :: (dereferenceable invariant load (s256), addrspace 4)
; GCN-NEXT: S_NOP 0, implicit killed renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11
; GCN-NEXT: S_ENDPGM 0, implicit killed renamable $sgpr0_sgpr1
%0:sreg_64_xexec = COPY $sgpr8_sgpr9
@@ -143,16 +121,10 @@ body: |
; GCN-NEXT: {{ $}}
; GCN-NEXT: renamable $sgpr0_sgpr1 = COPY $sgpr8_sgpr9
; GCN-NEXT: renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19 = S_LOAD_DWORDX16_IMM renamable $sgpr0_sgpr1, 0, 0 :: (dereferenceable invariant load (s512), addrspace 4)
- ; GCN-NEXT: SI_SPILL_S512_SAVE killed renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19, %stack.1, implicit $exec, implicit $sp_reg :: (store (s512) into %stack.1, align 4, addrspace 5)
- ; GCN-NEXT: renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19 = S_LOAD_DWORDX16_IMM renamable $sgpr0_sgpr1, 4, 0 :: (dereferenceable invariant load (s512), addrspace 4)
- ; GCN-NEXT: SI_SPILL_S512_SAVE killed renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19, %stack.0, implicit $exec, implicit $sp_reg :: (store (s512) into %stack.0, align 4, addrspace 5)
- ; GCN-NEXT: renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19 = S_LOAD_DWORDX16_IMM renamable $sgpr0_sgpr1, 8, 0 :: (dereferenceable invariant load (s512), addrspace 4)
- ; GCN-NEXT: SI_SPILL_S512_SAVE killed renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19, %stack.2, implicit $exec, implicit $sp_reg :: (store (s512) into %stack.2, align 4, addrspace 5)
- ; GCN-NEXT: renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19 = SI_SPILL_S512_RESTORE %stack.1, implicit $exec, implicit $sp_reg :: (load (s512) from %stack.1, align 4, addrspace 5)
; GCN-NEXT: S_NOP 0, implicit killed renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19
- ; GCN-NEXT: renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19 = SI_SPILL_S512_RESTORE %stack.0, implicit $exec, implicit $sp_reg :: (load (s512) from %stack.0, align 4, addrspace 5)
+ ; GCN-NEXT: renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19 = S_LOAD_DWORDX16_IMM renamable $sgpr0_sgpr1, 4, 0 :: (dereferenceable invariant load (s512), addrspace 4)
; GCN-NEXT: S_NOP 0, implicit killed renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19
- ; GCN-NEXT: renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19 = SI_SPILL_S512_RESTORE %stack.2, implicit $exec, implicit $sp_reg :: (load (s512) from %stack.2, align 4, addrspace 5)
+ ; GCN-NEXT: renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19 = S_LOAD_DWORDX16_IMM renamable $sgpr0_sgpr1, 8, 0 :: (dereferenceable invariant load (s512), addrspace 4)
; GCN-NEXT: S_NOP 0, implicit killed renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19
; GCN-NEXT: S_ENDPGM 0, implicit killed renamable $sgpr0_sgpr1
%0:sreg_64_xexec = COPY $sgpr8_sgpr9
@@ -175,13 +147,12 @@ body: |
; GCN: liveins: $sgpr8_sgpr9, $vgpr0_vgpr1
; GCN-NEXT: {{ $}}
; GCN-NEXT: renamable $sgpr0_sgpr1 = COPY $sgpr8_sgpr9
- ; GCN-NEXT: renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19 = S_LOAD_DWORDX16_IMM renamable $sgpr0_sgpr1, 0, 0 :: (dereferenceable invariant load (s512), align 4, addrspace 4)
- ; GCN-NEXT: SI_SPILL_S512_SAVE killed renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19, %stack.0, implicit $exec, implicit $sp_reg :: (store (s512) into %stack.0, align 4, addrspace 5)
; GCN-NEXT: renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19 = S_LOAD_DWORDX16_IMM renamable $sgpr0_sgpr1, 128, 0 :: (dereferenceable invariant load (s512), align 4, addrspace 4)
- ; GCN-NEXT: renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19 = SI_SPILL_S512_RESTORE %stack.0, implicit $exec, implicit $sp_reg :: (load (s512) from %stack.0, align 4, addrspace 5)
+ ; GCN-NEXT: renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11 = S_LOAD_DWORDX8_IMM renamable $sgpr0_sgpr1, 32, 0 :: (dereferenceable invariant load (s256), align 4, addrspace 4)
+ ; GCN-NEXT: renamable $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19 = COPY killed renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11
; GCN-NEXT: S_NOP 0, implicit killed renamable $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19
; GCN-NEXT: renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19 = S_LOAD_DWORDX16_IMM renamable $sgpr0_sgpr1, 128, 0 :: (dereferenceable invariant load (s512), align 4, addrspace 4)
- ; GCN-NEXT: renamable...
[truncated]
|
Addressing review comments. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM, I'm just nitpicking now.
Transformed index-based loop into operand-based one. |
Thanks. If there are no more comments, I will merge the change tomorrow. |
Extend the list of instructions that can be rematerialized in SIInstrInfo::isReallyTriviallyReMaterializable() to support scalar loads. Try shrinking instructions to remat only the part needed for current context. Add SIInstrInfo::reMaterialize target hook, and handle shrinking of S_LOAD_DWORDX16_IMM to S_LOAD_DWORDX8_IMM as a proof of concept.
Extend the list of instructions that can be rematerialized in SIInstrInfo::isReallyTriviallyReMaterializable() to support scalar loads.
Try shrinking instructions to remat only the part needed for current context. Add SIInstrInfo::reMaterialize target hook, and handle shrinking of S_LOAD_DWORDX16_IMM to S_LOAD_DWORDX8_IMM as a proof of concept.