
[AMDGPU] Rematerialize scalar loads #68778


Merged
merged 6 commits into llvm:main on Oct 26, 2023

Conversation

piotrAMD (Collaborator)

Extend the list of instructions that can be rematerialized in SIInstrInfo::isReallyTriviallyReMaterializable() to support scalar loads.

Try shrinking instructions to remat only the part needed for the current context. Add a SIInstrInfo::reMaterialize target hook, and handle shrinking of S_LOAD_DWORDX16_IMM to S_LOAD_DWORDX8_IMM as a proof of concept.
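
For concreteness, here is a minimal, hedged sketch (not the PR's code) of the offset arithmetic behind the shrink, with hypothetical values matching the remat-smrd.mir test changes below: subregister index offsets and sizes are reported in bits, while SMEM immediate offsets are in bytes, so rematerializing only the high 256-bit half of a 512-bit load bumps the immediate by 256/8 = 32 bytes.

#include <cstdint>
#include <cstdio>

// Sketch only: if just the upper 256-bit half of a 512-bit
// S_LOAD_DWORDX16_IMM result is used, the remat hook can instead emit an
// S_LOAD_DWORDX8_IMM with the immediate offset adjusted accordingly.
int main() {
  unsigned SubRegOffsetBits = 256; // hypothetical RI.getSubRegIdxOffset(SubIdx)
  unsigned SubRegSizeBits = 256;   // hypothetical RI.getSubRegIdxSize(SubIdx)
  int64_t OrigImmOffset = 0;       // byte offset of the original wide load
  int64_t NewImmOffset = OrigImmOffset + SubRegOffsetBits / 8;
  std::printf("emit a %u-bit load at byte offset %lld\n", SubRegSizeBits,
              (long long)NewImmOffset); // 256-bit load at byte offset 32
  return 0;
}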

@piotrAMD (Collaborator, Author)

Ported from https://reviews.llvm.org/D154083 (and rebased).

@llvmbot (Member) commented Oct 11, 2023

@llvm/pr-subscribers-backend-amdgpu

Author: Piotr Sobczak (piotrAMD)

Changes

Extend the list of instructions that can be rematerialized in SIInstrInfo::isReallyTriviallyReMaterializable() to support scalar loads.

Try shrinking instructions to remat only the part needed for the current context. Add a SIInstrInfo::reMaterialize target hook, and handle shrinking of S_LOAD_DWORDX16_IMM to S_LOAD_DWORDX8_IMM as a proof of concept.


Patch is 44.21 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/68778.diff

5 Files Affected:

  • (modified) llvm/lib/Target/AMDGPU/SIInstrInfo.cpp (+110)
  • (modified) llvm/lib/Target/AMDGPU/SIInstrInfo.h (+5)
  • (modified) llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props-v3.ll (+2-2)
  • (modified) llvm/test/CodeGen/AMDGPU/remat-smrd.mir (+29-87)
  • (modified) llvm/test/CodeGen/AMDGPU/snippet-copy-bundle-regression.mir (+5-37)
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 51397cbb791469d..31a086cd2a61820 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -108,7 +108,18 @@ static bool nodesHaveSameOperandValue(SDNode *N0, SDNode* N1, unsigned OpName) {
 
 bool SIInstrInfo::isReallyTriviallyReMaterializable(
     const MachineInstr &MI) const {
+
+  bool CanRemat = false;
   if (isVOP1(MI) || isVOP2(MI) || isVOP3(MI) || isSDWA(MI) || isSALU(MI)) {
+    CanRemat = true;
+  } else if (isSMRD(MI)) {
+    CanRemat = !MI.memoperands_empty() &&
+               llvm::all_of(MI.memoperands(), [](const MachineMemOperand *MMO) {
+                 return MMO->isLoad() && MMO->isInvariant();
+               });
+  }
+
+  if (CanRemat) {
     // Normally VALU use of exec would block the rematerialization, but that
     // is OK in this case to have an implicit exec read as all VALU do.
     // We really want all of the generic logic for this except for this.
@@ -2434,6 +2445,105 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
   return true;
 }
 
+void SIInstrInfo::reMaterialize(MachineBasicBlock &MBB,
+                                MachineBasicBlock::iterator I, Register DestReg,
+                                unsigned SubIdx, const MachineInstr &Orig,
+                                const TargetRegisterInfo &RI) const {
+
+  // Try shrinking the instruction to remat only the part needed for current
+  // context.
+  // TODO: Handle more cases.
+  unsigned Opcode = Orig.getOpcode();
+  switch (Opcode) {
+  case AMDGPU::S_LOAD_DWORDX16_IMM:
+  case AMDGPU::S_LOAD_DWORDX8_IMM: {
+    if (SubIdx != 0)
+      break;
+
+    if (I == MBB.end())
+      break;
+
+    if (I->isBundled())
+      break;
+
+    // Look for a single use of the register that is also a subreg.
+    Register RegToFind = Orig.getOperand(0).getReg();
+    int SingleUseIdx = -1;
+    for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i) {
+      const MachineOperand &CandMO = I->getOperand(i);
+      if (!CandMO.isReg())
+        continue;
+      Register CandReg = CandMO.getReg();
+      if (!CandReg)
+        continue;
+
+      if (CandReg == RegToFind || RI.regsOverlap(CandReg, RegToFind)) {
+        if (SingleUseIdx == -1 && CandMO.isUse()) {
+          SingleUseIdx = i;
+        } else {
+          SingleUseIdx = -1;
+          break;
+        }
+      }
+    }
+    if (SingleUseIdx == -1)
+      break;
+    MachineOperand *UseMO = &I->getOperand(SingleUseIdx);
+    if (UseMO->getSubReg() == AMDGPU::NoSubRegister)
+      break;
+
+    unsigned Offset = RI.getSubRegIdxOffset(UseMO->getSubReg());
+    unsigned SubregSize = RI.getSubRegIdxSize(UseMO->getSubReg());
+
+    MachineFunction *MF = MBB.getParent();
+    MachineRegisterInfo &MRI = MF->getRegInfo();
+    assert(MRI.hasAtMostUserInstrs(DestReg, 0) &&
+           "DestReg should have no users yet.");
+
+    unsigned NewOpcode = -1;
+    if (SubregSize == 256)
+      NewOpcode = AMDGPU::S_LOAD_DWORDX8_IMM;
+    else if (SubregSize == 128)
+      NewOpcode = AMDGPU::S_LOAD_DWORDX4_IMM;
+    else
+      break;
+
+    const MCInstrDesc &TID = get(NewOpcode);
+    const TargetRegisterClass *NewRC =
+        RI.getAllocatableClass(getRegClass(TID, 0, &RI, *MF));
+    MRI.setRegClass(DestReg, NewRC);
+
+    UseMO->setReg(DestReg);
+    UseMO->setSubReg(AMDGPU::NoSubRegister);
+
+    // Use a smaller load with the desired size, possibly with updated offset.
+    MachineInstr *MI = MF->CloneMachineInstr(&Orig);
+    MI->setDesc(TID);
+    MI->getOperand(0).setReg(DestReg);
+    MI->getOperand(0).setSubReg(AMDGPU::NoSubRegister);
+    if (Offset) {
+      MachineOperand *OffsetMO = getNamedOperand(*MI, AMDGPU::OpName::offset);
+      int64_t FinalOffset = OffsetMO->getImm() + Offset / 8;
+      OffsetMO->setImm(FinalOffset);
+    }
+    SmallVector<MachineMemOperand *> NewMMOs;
+    for (const MachineMemOperand *MemOp : Orig.memoperands())
+      NewMMOs.push_back(MF->getMachineMemOperand(MemOp, MemOp->getPointerInfo(),
+                                                 SubregSize / 8));
+    MI->setMemRefs(*MF, NewMMOs);
+
+    MBB.insert(I, MI);
+    return;
+  }
+
+  default:
+    break;
+  }
+  MachineInstr *MI = MBB.getParent()->CloneMachineInstr(&Orig);
+  MI->substituteRegister(MI->getOperand(0).getReg(), DestReg, SubIdx, RI);
+  MBB.insert(I, MI);
+}
+
 std::pair<MachineInstr*, MachineInstr*>
 SIInstrInfo::expandMovDPP64(MachineInstr &MI) const {
   assert (MI.getOpcode() == AMDGPU::V_MOV_B64_DPP_PSEUDO);
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index 5ef17c44f7de389..a64cf0244e4c08c 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -275,6 +275,11 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
 
   bool expandPostRAPseudo(MachineInstr &MI) const override;
 
+  void reMaterialize(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
+                     Register DestReg, unsigned SubIdx,
+                     const MachineInstr &Orig,
+                     const TargetRegisterInfo &TRI) const override;
+
   // Splits a V_MOV_B64_DPP_PSEUDO opcode into a pair of v_mov_b32_dpp
   // instructions. Returns a pair of generated instructions.
   // Can split either post-RA with physical registers or pre-RA with
diff --git a/llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props-v3.ll b/llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props-v3.ll
index d6f7a92af9dcb6f..1999c7b065e6854 100644
--- a/llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props-v3.ll
+++ b/llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props-v3.ll
@@ -47,8 +47,8 @@ entry:
 }
 
 ; CHECK:   .name:       num_spilled_sgprs
-; GFX700:   .sgpr_spill_count: 38
-; GFX803:   .sgpr_spill_count: 22
+; GFX700:   .sgpr_spill_count: 12
+; GFX803:   .sgpr_spill_count: 12
 ; GFX900:   .sgpr_spill_count: 48
 ; GFX1010:  .sgpr_spill_count: 48
 ; CHECK:   .symbol:     num_spilled_sgprs.kd
diff --git a/llvm/test/CodeGen/AMDGPU/remat-smrd.mir b/llvm/test/CodeGen/AMDGPU/remat-smrd.mir
index 753cb135fdae9c0..95eac12a65389ef 100644
--- a/llvm/test/CodeGen/AMDGPU/remat-smrd.mir
+++ b/llvm/test/CodeGen/AMDGPU/remat-smrd.mir
@@ -12,15 +12,11 @@ body:             |
     ; GCN: liveins: $sgpr8_sgpr9
     ; GCN-NEXT: {{  $}}
     ; GCN-NEXT: renamable $sgpr2_sgpr3 = COPY $sgpr8_sgpr9
-    ; GCN-NEXT: renamable $sgpr0 = S_LOAD_DWORD_IMM renamable $sgpr2_sgpr3, 0, 0 :: (dereferenceable invariant load (s32), addrspace 4)
-    ; GCN-NEXT: SI_SPILL_S32_SAVE killed renamable $sgpr0, %stack.0, implicit $exec, implicit $sp_reg :: (store (s32) into %stack.0, addrspace 5)
     ; GCN-NEXT: renamable $sgpr1 = S_LOAD_DWORD_IMM renamable $sgpr2_sgpr3, 4, 0 :: (dereferenceable invariant load (s32), addrspace 4)
-    ; GCN-NEXT: renamable $sgpr0 = S_LOAD_DWORD_IMM renamable $sgpr2_sgpr3, 8, 0 :: (dereferenceable invariant load (s32), addrspace 4)
-    ; GCN-NEXT: SI_SPILL_S32_SAVE killed renamable $sgpr0, %stack.1, implicit $exec, implicit $sp_reg :: (store (s32) into %stack.1, addrspace 5)
-    ; GCN-NEXT: renamable $sgpr0 = SI_SPILL_S32_RESTORE %stack.0, implicit $exec, implicit $sp_reg :: (load (s32) from %stack.0, addrspace 5)
+    ; GCN-NEXT: renamable $sgpr0 = S_LOAD_DWORD_IMM renamable $sgpr2_sgpr3, 0, 0 :: (dereferenceable invariant load (s32), addrspace 4)
     ; GCN-NEXT: S_NOP 0, implicit killed renamable $sgpr0
     ; GCN-NEXT: S_NOP 0, implicit killed renamable $sgpr1
-    ; GCN-NEXT: renamable $sgpr0 = SI_SPILL_S32_RESTORE %stack.1, implicit $exec, implicit $sp_reg :: (load (s32) from %stack.1, addrspace 5)
+    ; GCN-NEXT: renamable $sgpr0 = S_LOAD_DWORD_IMM renamable $sgpr2_sgpr3, 8, 0 :: (dereferenceable invariant load (s32), addrspace 4)
     ; GCN-NEXT: S_NOP 0, implicit killed renamable $sgpr0
     ; GCN-NEXT: S_ENDPGM 0, implicit killed renamable $sgpr2_sgpr3
     %0:sreg_64_xexec = COPY $sgpr8_sgpr9
@@ -44,16 +40,10 @@ body:             |
     ; GCN-NEXT: {{  $}}
     ; GCN-NEXT: renamable $sgpr0_sgpr1 = COPY $sgpr8_sgpr9
     ; GCN-NEXT: renamable $sgpr2_sgpr3 = S_LOAD_DWORDX2_IMM renamable $sgpr0_sgpr1, 0, 0 :: (dereferenceable invariant load (s64), addrspace 4)
-    ; GCN-NEXT: SI_SPILL_S64_SAVE killed renamable $sgpr2_sgpr3, %stack.1, implicit $exec, implicit $sp_reg :: (store (s64) into %stack.1, align 4, addrspace 5)
-    ; GCN-NEXT: renamable $sgpr2_sgpr3 = S_LOAD_DWORDX2_IMM renamable $sgpr0_sgpr1, 4, 0 :: (dereferenceable invariant load (s64), addrspace 4)
-    ; GCN-NEXT: SI_SPILL_S64_SAVE killed renamable $sgpr2_sgpr3, %stack.0, implicit $exec, implicit $sp_reg :: (store (s64) into %stack.0, align 4, addrspace 5)
-    ; GCN-NEXT: renamable $sgpr2_sgpr3 = S_LOAD_DWORDX2_IMM renamable $sgpr0_sgpr1, 8, 0 :: (dereferenceable invariant load (s64), addrspace 4)
-    ; GCN-NEXT: SI_SPILL_S64_SAVE killed renamable $sgpr2_sgpr3, %stack.2, implicit $exec, implicit $sp_reg :: (store (s64) into %stack.2, align 4, addrspace 5)
-    ; GCN-NEXT: renamable $sgpr2_sgpr3 = SI_SPILL_S64_RESTORE %stack.1, implicit $exec, implicit $sp_reg :: (load (s64) from %stack.1, align 4, addrspace 5)
     ; GCN-NEXT: S_NOP 0, implicit killed renamable $sgpr2_sgpr3
-    ; GCN-NEXT: renamable $sgpr2_sgpr3 = SI_SPILL_S64_RESTORE %stack.0, implicit $exec, implicit $sp_reg :: (load (s64) from %stack.0, align 4, addrspace 5)
+    ; GCN-NEXT: renamable $sgpr2_sgpr3 = S_LOAD_DWORDX2_IMM renamable $sgpr0_sgpr1, 4, 0 :: (dereferenceable invariant load (s64), addrspace 4)
     ; GCN-NEXT: S_NOP 0, implicit killed renamable $sgpr2_sgpr3
-    ; GCN-NEXT: renamable $sgpr2_sgpr3 = SI_SPILL_S64_RESTORE %stack.2, implicit $exec, implicit $sp_reg :: (load (s64) from %stack.2, align 4, addrspace 5)
+    ; GCN-NEXT: renamable $sgpr2_sgpr3 = S_LOAD_DWORDX2_IMM renamable $sgpr0_sgpr1, 8, 0 :: (dereferenceable invariant load (s64), addrspace 4)
     ; GCN-NEXT: S_NOP 0, implicit killed renamable $sgpr2_sgpr3
     ; GCN-NEXT: S_ENDPGM 0, implicit killed renamable $sgpr0_sgpr1
     %0:sreg_64_xexec = COPY $sgpr8_sgpr9
@@ -77,16 +67,10 @@ body:             |
     ; GCN-NEXT: {{  $}}
     ; GCN-NEXT: renamable $sgpr0_sgpr1 = COPY $sgpr8_sgpr9
     ; GCN-NEXT: renamable $sgpr4_sgpr5_sgpr6_sgpr7 = S_LOAD_DWORDX4_IMM renamable $sgpr0_sgpr1, 0, 0 :: (dereferenceable invariant load (s128), addrspace 4)
-    ; GCN-NEXT: SI_SPILL_S128_SAVE killed renamable $sgpr4_sgpr5_sgpr6_sgpr7, %stack.1, implicit $exec, implicit $sp_reg :: (store (s128) into %stack.1, align 4, addrspace 5)
-    ; GCN-NEXT: renamable $sgpr4_sgpr5_sgpr6_sgpr7 = S_LOAD_DWORDX4_IMM renamable $sgpr0_sgpr1, 4, 0 :: (dereferenceable invariant load (s128), addrspace 4)
-    ; GCN-NEXT: SI_SPILL_S128_SAVE killed renamable $sgpr4_sgpr5_sgpr6_sgpr7, %stack.0, implicit $exec, implicit $sp_reg :: (store (s128) into %stack.0, align 4, addrspace 5)
-    ; GCN-NEXT: renamable $sgpr4_sgpr5_sgpr6_sgpr7 = S_LOAD_DWORDX4_IMM renamable $sgpr0_sgpr1, 8, 0 :: (dereferenceable invariant load (s128), addrspace 4)
-    ; GCN-NEXT: SI_SPILL_S128_SAVE killed renamable $sgpr4_sgpr5_sgpr6_sgpr7, %stack.2, implicit $exec, implicit $sp_reg :: (store (s128) into %stack.2, align 4, addrspace 5)
-    ; GCN-NEXT: renamable $sgpr4_sgpr5_sgpr6_sgpr7 = SI_SPILL_S128_RESTORE %stack.1, implicit $exec, implicit $sp_reg :: (load (s128) from %stack.1, align 4, addrspace 5)
     ; GCN-NEXT: S_NOP 0, implicit killed renamable $sgpr4_sgpr5_sgpr6_sgpr7
-    ; GCN-NEXT: renamable $sgpr4_sgpr5_sgpr6_sgpr7 = SI_SPILL_S128_RESTORE %stack.0, implicit $exec, implicit $sp_reg :: (load (s128) from %stack.0, align 4, addrspace 5)
+    ; GCN-NEXT: renamable $sgpr4_sgpr5_sgpr6_sgpr7 = S_LOAD_DWORDX4_IMM renamable $sgpr0_sgpr1, 4, 0 :: (dereferenceable invariant load (s128), addrspace 4)
     ; GCN-NEXT: S_NOP 0, implicit killed renamable $sgpr4_sgpr5_sgpr6_sgpr7
-    ; GCN-NEXT: renamable $sgpr4_sgpr5_sgpr6_sgpr7 = SI_SPILL_S128_RESTORE %stack.2, implicit $exec, implicit $sp_reg :: (load (s128) from %stack.2, align 4, addrspace 5)
+    ; GCN-NEXT: renamable $sgpr4_sgpr5_sgpr6_sgpr7 = S_LOAD_DWORDX4_IMM renamable $sgpr0_sgpr1, 8, 0 :: (dereferenceable invariant load (s128), addrspace 4)
     ; GCN-NEXT: S_NOP 0, implicit killed renamable $sgpr4_sgpr5_sgpr6_sgpr7
     ; GCN-NEXT: S_ENDPGM 0, implicit killed renamable $sgpr0_sgpr1
     %0:sreg_64_xexec = COPY $sgpr8_sgpr9
@@ -110,16 +94,10 @@ body:             |
     ; GCN-NEXT: {{  $}}
     ; GCN-NEXT: renamable $sgpr0_sgpr1 = COPY $sgpr8_sgpr9
     ; GCN-NEXT: renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11 = S_LOAD_DWORDX8_IMM renamable $sgpr0_sgpr1, 0, 0 :: (dereferenceable invariant load (s256), addrspace 4)
-    ; GCN-NEXT: SI_SPILL_S256_SAVE killed renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11, %stack.1, implicit $exec, implicit $sp_reg :: (store (s256) into %stack.1, align 4, addrspace 5)
-    ; GCN-NEXT: renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11 = S_LOAD_DWORDX8_IMM renamable $sgpr0_sgpr1, 4, 0 :: (dereferenceable invariant load (s256), addrspace 4)
-    ; GCN-NEXT: SI_SPILL_S256_SAVE killed renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11, %stack.0, implicit $exec, implicit $sp_reg :: (store (s256) into %stack.0, align 4, addrspace 5)
-    ; GCN-NEXT: renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11 = S_LOAD_DWORDX8_IMM renamable $sgpr0_sgpr1, 8, 0 :: (dereferenceable invariant load (s256), addrspace 4)
-    ; GCN-NEXT: SI_SPILL_S256_SAVE killed renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11, %stack.2, implicit $exec, implicit $sp_reg :: (store (s256) into %stack.2, align 4, addrspace 5)
-    ; GCN-NEXT: renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11 = SI_SPILL_S256_RESTORE %stack.1, implicit $exec, implicit $sp_reg :: (load (s256) from %stack.1, align 4, addrspace 5)
     ; GCN-NEXT: S_NOP 0, implicit killed renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11
-    ; GCN-NEXT: renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11 = SI_SPILL_S256_RESTORE %stack.0, implicit $exec, implicit $sp_reg :: (load (s256) from %stack.0, align 4, addrspace 5)
+    ; GCN-NEXT: renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11 = S_LOAD_DWORDX8_IMM renamable $sgpr0_sgpr1, 4, 0 :: (dereferenceable invariant load (s256), addrspace 4)
     ; GCN-NEXT: S_NOP 0, implicit killed renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11
-    ; GCN-NEXT: renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11 = SI_SPILL_S256_RESTORE %stack.2, implicit $exec, implicit $sp_reg :: (load (s256) from %stack.2, align 4, addrspace 5)
+    ; GCN-NEXT: renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11 = S_LOAD_DWORDX8_IMM renamable $sgpr0_sgpr1, 8, 0 :: (dereferenceable invariant load (s256), addrspace 4)
     ; GCN-NEXT: S_NOP 0, implicit killed renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11
     ; GCN-NEXT: S_ENDPGM 0, implicit killed renamable $sgpr0_sgpr1
     %0:sreg_64_xexec = COPY $sgpr8_sgpr9
@@ -143,16 +121,10 @@ body:             |
     ; GCN-NEXT: {{  $}}
     ; GCN-NEXT: renamable $sgpr0_sgpr1 = COPY $sgpr8_sgpr9
     ; GCN-NEXT: renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19 = S_LOAD_DWORDX16_IMM renamable $sgpr0_sgpr1, 0, 0 :: (dereferenceable invariant load (s512), addrspace 4)
-    ; GCN-NEXT: SI_SPILL_S512_SAVE killed renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19, %stack.1, implicit $exec, implicit $sp_reg :: (store (s512) into %stack.1, align 4, addrspace 5)
-    ; GCN-NEXT: renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19 = S_LOAD_DWORDX16_IMM renamable $sgpr0_sgpr1, 4, 0 :: (dereferenceable invariant load (s512), addrspace 4)
-    ; GCN-NEXT: SI_SPILL_S512_SAVE killed renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19, %stack.0, implicit $exec, implicit $sp_reg :: (store (s512) into %stack.0, align 4, addrspace 5)
-    ; GCN-NEXT: renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19 = S_LOAD_DWORDX16_IMM renamable $sgpr0_sgpr1, 8, 0 :: (dereferenceable invariant load (s512), addrspace 4)
-    ; GCN-NEXT: SI_SPILL_S512_SAVE killed renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19, %stack.2, implicit $exec, implicit $sp_reg :: (store (s512) into %stack.2, align 4, addrspace 5)
-    ; GCN-NEXT: renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19 = SI_SPILL_S512_RESTORE %stack.1, implicit $exec, implicit $sp_reg :: (load (s512) from %stack.1, align 4, addrspace 5)
     ; GCN-NEXT: S_NOP 0, implicit killed renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19
-    ; GCN-NEXT: renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19 = SI_SPILL_S512_RESTORE %stack.0, implicit $exec, implicit $sp_reg :: (load (s512) from %stack.0, align 4, addrspace 5)
+    ; GCN-NEXT: renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19 = S_LOAD_DWORDX16_IMM renamable $sgpr0_sgpr1, 4, 0 :: (dereferenceable invariant load (s512), addrspace 4)
     ; GCN-NEXT: S_NOP 0, implicit killed renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19
-    ; GCN-NEXT: renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19 = SI_SPILL_S512_RESTORE %stack.2, implicit $exec, implicit $sp_reg :: (load (s512) from %stack.2, align 4, addrspace 5)
+    ; GCN-NEXT: renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19 = S_LOAD_DWORDX16_IMM renamable $sgpr0_sgpr1, 8, 0 :: (dereferenceable invariant load (s512), addrspace 4)
     ; GCN-NEXT: S_NOP 0, implicit killed renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19
     ; GCN-NEXT: S_ENDPGM 0, implicit killed renamable $sgpr0_sgpr1
     %0:sreg_64_xexec = COPY $sgpr8_sgpr9
@@ -175,13 +147,12 @@ body:             |
     ; GCN: liveins: $sgpr8_sgpr9, $vgpr0_vgpr1
     ; GCN-NEXT: {{  $}}
     ; GCN-NEXT: renamable $sgpr0_sgpr1 = COPY $sgpr8_sgpr9
-    ; GCN-NEXT: renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19 = S_LOAD_DWORDX16_IMM renamable $sgpr0_sgpr1, 0, 0 :: (dereferenceable invariant load (s512), align 4, addrspace 4)
-    ; GCN-NEXT: SI_SPILL_S512_SAVE killed renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19, %stack.0, implicit $exec, implicit $sp_reg :: (store (s512) into %stack.0, align 4, addrspace 5)
     ; GCN-NEXT: renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19 = S_LOAD_DWORDX16_IMM renamable $sgpr0_sgpr1, 128, 0 :: (dereferenceable invariant load (s512), align 4, addrspace 4)
-    ; GCN-NEXT: renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19 = SI_SPILL_S512_RESTORE %stack.0, implicit $exec, implicit $sp_reg :: (load (s512) from %stack.0, align 4, addrspace 5)
+    ; GCN-NEXT: renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11 = S_LOAD_DWORDX8_IMM renamable $sgpr0_sgpr1, 32, 0 :: (dereferenceable invariant load (s256), align 4, addrspace 4)
+    ; GCN-NEXT: renamable $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19 = COPY killed renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11
     ; GCN-NEXT: S_NOP 0, implicit killed renamable $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19
     ; GCN-NEXT: renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19 = S_LOAD_DWORDX16_IMM renamable $sgpr0_sgpr1, 128, 0 :: (dereferenceable invariant load (s512), align 4, addrspace 4)
-    ; GCN-NEXT: renamable...
[truncated]

@piotrAMD (Collaborator, Author)

Addressing review comments.

@jayfoad (Contributor) left a comment

LGTM, I'm just nitpicking now.

@piotrAMD (Collaborator, Author)

Transformed the index-based loop into an operand-based one.
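
For reference, a minimal sketch of what the operand-based form of the single-use scan from the diff above can look like. Assumptions: LLVM's MachineInstr::operands() range and TargetRegisterInfo::regsOverlap(); the helper name findSingleUse is hypothetical, and the committed code may differ.

#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"

using namespace llvm;

// Return the sole operand of MI that reads RegToFind (or an overlapping
// register); return nullptr if RegToFind is defined by MI or referenced
// more than once.
static MachineOperand *findSingleUse(MachineInstr &MI, Register RegToFind,
                                     const TargetRegisterInfo &RI) {
  MachineOperand *UseMO = nullptr;
  for (MachineOperand &CandMO : MI.operands()) {
    if (!CandMO.isReg() || !CandMO.getReg())
      continue;
    if (CandMO.getReg() != RegToFind &&
        !RI.regsOverlap(CandMO.getReg(), RegToFind))
      continue;
    if (UseMO || !CandMO.isUse())
      return nullptr; // a def, or a second reference: not a single use
    UseMO = &CandMO;
  }
  return UseMO;
}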

@piotrAMD (Collaborator, Author)

Thanks. If there are no more comments, I will merge the change tomorrow.

@piotrAMD piotrAMD merged commit ba3d6e0 into llvm:main Oct 26, 2023
zahiraam pushed a commit to zahiraam/llvm-project that referenced this pull request Oct 26, 2023