[AMDGPU] Compiler should synthesize private buffer resource descriptor from flat_scratch_init #79586

Merged: 11 commits, Feb 8, 2024

Conversation

@alex-t (Contributor) commented Jan 26, 2024

This change synthesizes the private buffer resource descriptor in the kernel prolog from flat_scratch_init, instead of using the preloaded kernel argument.
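
In generated code, the essence of the change, taken from the test updates in this patch (the exact registers vary per test), is roughly:

  ; before: add the scratch wave offset into the preloaded descriptor base in s[0:1]
  s_add_u32  s0, s0, s7
  s_addc_u32 s1, s1, 0

  ; after: synthesize the descriptor from flat_scratch instead
  s_mov_b64 s[0:1], flat_scratch    ; low 64 bits copied from flat_scratch
  s_mov_b64 s[2:3], 0xf0000000      ; high 64 bits are a known constant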

@llvmbot (Member) commented Jan 26, 2024

@llvm/pr-subscribers-llvm-globalisel

@llvm/pr-subscribers-backend-amdgpu

Author: None (alex-t)

Changes

Patch is 100.56 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/79586.diff

24 Files Affected:

  • (modified) llvm/lib/Target/AMDGPU/SIFrameLowering.cpp (+47-29)
  • (modified) llvm/lib/Target/AMDGPU/SIFrameLowering.h (+7-7)
  • (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll (+4-4)
  • (modified) llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll (+8-8)
  • (modified) llvm/test/CodeGen/AMDGPU/blender-no-live-segment-at-def-implicit-def.ll (+2-2)
  • (modified) llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll (+3-3)
  • (modified) llvm/test/CodeGen/AMDGPU/call-argument-types.ll (+140-140)
  • (modified) llvm/test/CodeGen/AMDGPU/call-reqd-group-size.ll (+12-12)
  • (modified) llvm/test/CodeGen/AMDGPU/call-waitcnt.ll (+12-12)
  • (modified) llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll (+3-3)
  • (modified) llvm/test/CodeGen/AMDGPU/cc-update.ll (+32-32)
  • (modified) llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll (+4-4)
  • (modified) llvm/test/CodeGen/AMDGPU/indirect-call-known-callees.ll (+4-4)
  • (modified) llvm/test/CodeGen/AMDGPU/indirect-call.ll (+8-8)
  • (modified) llvm/test/CodeGen/AMDGPU/kernel-vgpr-spill-mubuf-with-voffset.ll (+2-2)
  • (modified) llvm/test/CodeGen/AMDGPU/lds-frame-extern.ll (+24-24)
  • (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.lds.kernel.id.ll (+2-2)
  • (modified) llvm/test/CodeGen/AMDGPU/lower-module-lds-via-hybrid.ll (+6-6)
  • (modified) llvm/test/CodeGen/AMDGPU/lower-module-lds-via-table.ll (+6-6)
  • (modified) llvm/test/CodeGen/AMDGPU/machine-sink-temporal-divergence-swdev407790.ll (+6-6)
  • (modified) llvm/test/CodeGen/AMDGPU/need-fp-from-vgpr-spills.ll (+6-6)
  • (modified) llvm/test/CodeGen/AMDGPU/simple-indirect-call.ll (+2-2)
  • (modified) llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll (+6-6)
  • (modified) llvm/test/CodeGen/AMDGPU/vgpr_constant_to_sgpr.ll (+2-2)
diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
index 0f89df14448667..4842cf83af0e18 100644
--- a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
@@ -378,7 +378,7 @@ class PrologEpilogSGPRSpillBuilder {
 } // namespace llvm
 
 // Emit flat scratch setup code, assuming `MFI->hasFlatScratchInit()`
-void SIFrameLowering::emitEntryFunctionFlatScratchInit(
+Register SIFrameLowering::emitEntryFunctionFlatScratchInit(
     MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
     const DebugLoc &DL, Register ScratchWaveOffsetReg) const {
   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
@@ -398,6 +398,7 @@ void SIFrameLowering::emitEntryFunctionFlatScratchInit(
 
   Register FlatScrInitLo;
   Register FlatScrInitHi;
+  Register FlatScratchInitReg = AMDGPU::NoRegister;
 
   if (ST.isAmdPalOS()) {
     // Extract the scratch offset from the descriptor in the GIT
@@ -407,7 +408,7 @@ void SIFrameLowering::emitEntryFunctionFlatScratchInit(
 
     // Find unused reg to load flat scratch init into
     MachineRegisterInfo &MRI = MF.getRegInfo();
-    Register FlatScrInit = AMDGPU::NoRegister;
+    Register FlatScratchInitReg = AMDGPU::NoRegister;
     ArrayRef<MCPhysReg> AllSGPR64s = TRI->getAllSGPR64(MF);
     unsigned NumPreloaded = (MFI->getNumPreloadedSGPRs() + 1) / 2;
     AllSGPR64s = AllSGPR64s.slice(
@@ -416,16 +417,16 @@ void SIFrameLowering::emitEntryFunctionFlatScratchInit(
     for (MCPhysReg Reg : AllSGPR64s) {
       if (LiveUnits.available(Reg) && !MRI.isReserved(Reg) &&
           MRI.isAllocatable(Reg) && !TRI->isSubRegisterEq(Reg, GITPtrLoReg)) {
-        FlatScrInit = Reg;
+        FlatScratchInitReg = Reg;
         break;
       }
     }
-    assert(FlatScrInit && "Failed to find free register for scratch init");
+    assert(FlatScratchInitReg && "Failed to find free register for scratch init");
 
-    FlatScrInitLo = TRI->getSubReg(FlatScrInit, AMDGPU::sub0);
-    FlatScrInitHi = TRI->getSubReg(FlatScrInit, AMDGPU::sub1);
+    FlatScrInitLo = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub0);
+    FlatScrInitHi = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub1);
 
-    buildGitPtr(MBB, I, DL, TII, FlatScrInit);
+    buildGitPtr(MBB, I, DL, TII, FlatScratchInitReg);
 
     // We now have the GIT ptr - now get the scratch descriptor from the entry
     // at offset 0 (or offset 16 for a compute shader).
@@ -440,8 +441,8 @@ void SIFrameLowering::emitEntryFunctionFlatScratchInit(
         MF.getFunction().getCallingConv() == CallingConv::AMDGPU_CS ? 16 : 0;
     const GCNSubtarget &Subtarget = MF.getSubtarget<GCNSubtarget>();
     unsigned EncodedOffset = AMDGPU::convertSMRDOffsetUnits(Subtarget, Offset);
-    BuildMI(MBB, I, DL, LoadDwordX2, FlatScrInit)
-        .addReg(FlatScrInit)
+    BuildMI(MBB, I, DL, LoadDwordX2, FlatScratchInitReg)
+        .addReg(FlatScratchInitReg)
         .addImm(EncodedOffset) // offset
         .addImm(0)             // cpol
         .addMemOperand(MMO);
@@ -453,7 +454,7 @@ void SIFrameLowering::emitEntryFunctionFlatScratchInit(
         .addImm(0xffff);
     And->getOperand(3).setIsDead(); // Mark SCC as dead.
   } else {
-    Register FlatScratchInitReg =
+    FlatScratchInitReg =
         MFI->getPreloadedReg(AMDGPUFunctionArgInfo::FLAT_SCRATCH_INIT);
     assert(FlatScratchInitReg);
 
@@ -485,7 +486,7 @@ void SIFrameLowering::emitEntryFunctionFlatScratchInit(
         addReg(FlatScrInitHi).
         addImm(int16_t(AMDGPU::Hwreg::ID_FLAT_SCR_HI |
                        (31 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_)));
-      return;
+      return FlatScratchInitReg;
     }
 
     // For GFX9.
@@ -498,7 +499,7 @@ void SIFrameLowering::emitEntryFunctionFlatScratchInit(
       .addImm(0);
     Addc->getOperand(3).setIsDead(); // Mark SCC as dead.
 
-    return;
+    return AMDGPU::FLAT_SCR;
   }
 
   assert(ST.getGeneration() < AMDGPUSubtarget::GFX9);
@@ -519,6 +520,7 @@ void SIFrameLowering::emitEntryFunctionFlatScratchInit(
     .addReg(FlatScrInitLo, RegState::Kill)
     .addImm(8);
   LShr->getOperand(3).setIsDead(); // Mark SCC as dead.
+  return AMDGPU::FLAT_SCR;
 }
 
 // Note SGPRSpill stack IDs should only be used for SGPR spilling to VGPRs, not
@@ -610,11 +612,15 @@ void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF,
   const SIInstrInfo *TII = ST.getInstrInfo();
   const SIRegisterInfo *TRI = &TII->getRegisterInfo();
   MachineRegisterInfo &MRI = MF.getRegInfo();
-  const Function &F = MF.getFunction();
   MachineFrameInfo &FrameInfo = MF.getFrameInfo();
 
   assert(MFI->isEntryFunction());
 
+  bool NeedsFlatScratchInit =
+      MFI->getUserSGPRInfo().hasFlatScratchInit() &&
+      (MRI.isPhysRegUsed(AMDGPU::FLAT_SCR) || FrameInfo.hasCalls() ||
+       (!allStackObjectsAreDead(FrameInfo) && ST.enableFlatScratch()));
+
   Register PreloadedScratchWaveOffsetReg = MFI->getPreloadedReg(
       AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);
 
@@ -640,7 +646,7 @@ void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF,
   // Now that we have fixed the reserved SRSRC we need to locate the
   // (potentially) preloaded SRSRC.
   Register PreloadedScratchRsrcReg;
-  if (ST.isAmdHsaOrMesa(F)) {
+  if (ST.isAmdHsaOrMesa(MF.getFunction()) && !NeedsFlatScratchInit) {
     PreloadedScratchRsrcReg =
         MFI->getPreloadedReg(AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_BUFFER);
     if (ScratchRsrcReg && PreloadedScratchRsrcReg) {
@@ -696,10 +702,6 @@ void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF,
     BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), FPReg).addImm(0);
   }
 
-  bool NeedsFlatScratchInit =
-      MFI->getUserSGPRInfo().hasFlatScratchInit() &&
-      (MRI.isPhysRegUsed(AMDGPU::FLAT_SCR) || FrameInfo.hasCalls() ||
-       (!allStackObjectsAreDead(FrameInfo) && ST.enableFlatScratch()));
 
   if ((NeedsFlatScratchInit || ScratchRsrcReg) &&
       PreloadedScratchWaveOffsetReg && !ST.flatScratchIsArchitected()) {
@@ -707,22 +709,24 @@ void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF,
     MBB.addLiveIn(PreloadedScratchWaveOffsetReg);
   }
 
+  Register FlatScratchInit = AMDGPU::NoRegister;
   if (NeedsFlatScratchInit) {
-    emitEntryFunctionFlatScratchInit(MF, MBB, I, DL, ScratchWaveOffsetReg);
+    FlatScratchInit =
+        emitEntryFunctionFlatScratchInit(MF, MBB, I, DL, ScratchWaveOffsetReg);
   }
 
   if (ScratchRsrcReg) {
-    emitEntryFunctionScratchRsrcRegSetup(MF, MBB, I, DL,
-                                         PreloadedScratchRsrcReg,
-                                         ScratchRsrcReg, ScratchWaveOffsetReg);
+    emitEntryFunctionScratchRsrcRegSetup(
+        MF, MBB, I, DL, FlatScratchInit, ScratchRsrcReg,
+        PreloadedScratchRsrcReg, ScratchWaveOffsetReg);
   }
 }
 
 // Emit scratch RSRC setup code, assuming `ScratchRsrcReg != AMDGPU::NoReg`
 void SIFrameLowering::emitEntryFunctionScratchRsrcRegSetup(
     MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
-    const DebugLoc &DL, Register PreloadedScratchRsrcReg,
-    Register ScratchRsrcReg, Register ScratchWaveOffsetReg) const {
+    const DebugLoc &DL, Register FlatScratchInit, Register ScratchRsrcReg,
+    Register PreloadedScratchRsrcReg, Register ScratchWaveOffsetReg) const {
 
   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
   const SIInstrInfo *TII = ST.getInstrInfo();
@@ -770,7 +774,8 @@ void SIFrameLowering::emitEntryFunctionScratchRsrcRegSetup(
           .addImm(21)
           .addReg(Rsrc03);
     }
-  } else if (ST.isMesaGfxShader(Fn) || !PreloadedScratchRsrcReg) {
+  } else if (ST.isMesaGfxShader(Fn) ||
+             (!FlatScratchInit.isValid() && !PreloadedScratchRsrcReg)) {
     assert(!ST.isAmdHsaOrMesa(Fn));
     const MCInstrDesc &SMovB32 = TII->get(AMDGPU::S_MOV_B32);
 
@@ -829,11 +834,24 @@ void SIFrameLowering::emitEntryFunctionScratchRsrcRegSetup(
       .addImm(Rsrc23 >> 32)
       .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
   } else if (ST.isAmdHsaOrMesa(Fn)) {
-    assert(PreloadedScratchRsrcReg);
 
-    if (ScratchRsrcReg != PreloadedScratchRsrcReg) {
-      BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), ScratchRsrcReg)
-          .addReg(PreloadedScratchRsrcReg, RegState::Kill);
+    if (FlatScratchInit) {
+      I = BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY),
+                  TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0_sub1))
+              .addReg(FlatScratchInit)
+              .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
+      I = BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B64),
+                  TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub2_sub3))
+              .addImm(0xf0000000)
+              .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
+      return;
+    } else {
+      assert(PreloadedScratchRsrcReg);
+
+      if (ScratchRsrcReg != PreloadedScratchRsrcReg) {
+        BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), ScratchRsrcReg)
+            .addReg(PreloadedScratchRsrcReg, RegState::Kill);
+      }
     }
   }
 
diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.h b/llvm/lib/Target/AMDGPU/SIFrameLowering.h
index b3feb759ed811f..f706d48b2dc101 100644
--- a/llvm/lib/Target/AMDGPU/SIFrameLowering.h
+++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.h
@@ -67,19 +67,19 @@ class SIFrameLowering final : public AMDGPUFrameLowering {
                                 MachineBasicBlock::iterator MI) const override;
 
 private:
-  void emitEntryFunctionFlatScratchInit(MachineFunction &MF,
-                                        MachineBasicBlock &MBB,
-                                        MachineBasicBlock::iterator I,
-                                        const DebugLoc &DL,
-                                        Register ScratchWaveOffsetReg) const;
+  Register
+  emitEntryFunctionFlatScratchInit(MachineFunction &MF, MachineBasicBlock &MBB,
+                                   MachineBasicBlock::iterator I,
+                                   const DebugLoc &DL,
+                                   Register ScratchWaveOffsetReg) const;
 
   Register getEntryFunctionReservedScratchRsrcReg(MachineFunction &MF) const;
 
   void emitEntryFunctionScratchRsrcRegSetup(
       MachineFunction &MF, MachineBasicBlock &MBB,
       MachineBasicBlock::iterator I, const DebugLoc &DL,
-      Register PreloadedPrivateBufferReg, Register ScratchRsrcReg,
-      Register ScratchWaveOffsetReg) const;
+      Register FlatScratchInit, Register ScratchRsrcReg,
+      Register PreloadedScratchRsrcReg, Register ScratchWaveOffsetReg) const;
 
 public:
   bool hasFP(const MachineFunction &MF) const override;
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll
index e597ce6f114a6b..a7277414391cb2 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll
@@ -14,9 +14,9 @@ define amdgpu_kernel void @kernel_caller_stack() {
 ; MUBUF:       ; %bb.0:
 ; MUBUF-NEXT:    s_add_u32 flat_scratch_lo, s4, s7
 ; MUBUF-NEXT:    s_addc_u32 flat_scratch_hi, s5, 0
-; MUBUF-NEXT:    s_add_u32 s0, s0, s7
+; MUBUF-NEXT:    s_mov_b64 s[2:3], 0xf0000000
 ; MUBUF-NEXT:    s_mov_b32 s32, 0
-; MUBUF-NEXT:    s_addc_u32 s1, s1, 0
+; MUBUF-NEXT:    s_mov_b64 s[0:1], flat_scratch
 ; MUBUF-NEXT:    v_mov_b32_e32 v0, 9
 ; MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:4
 ; MUBUF-NEXT:    v_mov_b32_e32 v0, 10
@@ -62,8 +62,8 @@ define amdgpu_kernel void @kernel_caller_byval() {
 ; MUBUF:       ; %bb.0:
 ; MUBUF-NEXT:    s_add_u32 flat_scratch_lo, s4, s7
 ; MUBUF-NEXT:    s_addc_u32 flat_scratch_hi, s5, 0
-; MUBUF-NEXT:    s_add_u32 s0, s0, s7
-; MUBUF-NEXT:    s_addc_u32 s1, s1, 0
+; MUBUF-NEXT:    s_mov_b64 s[2:3], 0xf0000000
+; MUBUF-NEXT:    s_mov_b64 s[0:1], flat_scratch
 ; MUBUF-NEXT:    v_mov_b32_e32 v0, 0
 ; MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:8
 ; MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:12
diff --git a/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll b/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll
index a439c0f51ffe9c..bda25cda4c5f9c 100644
--- a/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll
+++ b/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll
@@ -48,19 +48,19 @@ define amdgpu_kernel void @parent_kernel_missing_inputs() #0 {
 ; FIXEDABI-SDAG-LABEL: parent_kernel_missing_inputs:
 ; FIXEDABI-SDAG:       ; %bb.0:
 ; FIXEDABI-SDAG-NEXT:    s_add_i32 s4, s4, s9
-; FIXEDABI-SDAG-NEXT:    s_lshr_b32 flat_scratch_hi, s4, 8
 ; FIXEDABI-SDAG-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
-; FIXEDABI-SDAG-NEXT:    s_add_u32 s0, s0, s9
+; FIXEDABI-SDAG-NEXT:    s_mov_b32 flat_scratch_lo, s5
+; FIXEDABI-SDAG-NEXT:    s_lshr_b32 flat_scratch_hi, s4, 8
+; FIXEDABI-SDAG-NEXT:    s_mov_b64 s[2:3], 0xf0000000
 ; FIXEDABI-SDAG-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
 ; FIXEDABI-SDAG-NEXT:    v_or_b32_e32 v0, v0, v1
-; FIXEDABI-SDAG-NEXT:    s_addc_u32 s1, s1, 0
+; FIXEDABI-SDAG-NEXT:    s_mov_b64 s[0:1], flat_scratch
 ; FIXEDABI-SDAG-NEXT:    s_mov_b32 s14, s8
 ; FIXEDABI-SDAG-NEXT:    v_or_b32_e32 v31, v0, v2
 ; FIXEDABI-SDAG-NEXT:    s_mov_b64 s[8:9], 0
 ; FIXEDABI-SDAG-NEXT:    s_mov_b32 s12, s6
 ; FIXEDABI-SDAG-NEXT:    s_mov_b32 s13, s7
 ; FIXEDABI-SDAG-NEXT:    s_mov_b32 s32, 0
-; FIXEDABI-SDAG-NEXT:    s_mov_b32 flat_scratch_lo, s5
 ; FIXEDABI-SDAG-NEXT:    s_getpc_b64 s[4:5]
 ; FIXEDABI-SDAG-NEXT:    s_add_u32 s4, s4, requires_all_inputs@rel32@lo+4
 ; FIXEDABI-SDAG-NEXT:    s_addc_u32 s5, s5, requires_all_inputs@rel32@hi+12
@@ -70,19 +70,19 @@ define amdgpu_kernel void @parent_kernel_missing_inputs() #0 {
 ; FIXEDABI-GISEL-LABEL: parent_kernel_missing_inputs:
 ; FIXEDABI-GISEL:       ; %bb.0:
 ; FIXEDABI-GISEL-NEXT:    s_add_i32 s4, s4, s9
-; FIXEDABI-GISEL-NEXT:    s_lshr_b32 flat_scratch_hi, s4, 8
 ; FIXEDABI-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
-; FIXEDABI-GISEL-NEXT:    s_add_u32 s0, s0, s9
+; FIXEDABI-GISEL-NEXT:    s_mov_b32 flat_scratch_lo, s5
+; FIXEDABI-GISEL-NEXT:    s_lshr_b32 flat_scratch_hi, s4, 8
+; FIXEDABI-GISEL-NEXT:    s_mov_b64 s[2:3], 0xf0000000
 ; FIXEDABI-GISEL-NEXT:    v_or_b32_e32 v0, v0, v1
 ; FIXEDABI-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 20, v2
-; FIXEDABI-GISEL-NEXT:    s_addc_u32 s1, s1, 0
+; FIXEDABI-GISEL-NEXT:    s_mov_b64 s[0:1], flat_scratch
 ; FIXEDABI-GISEL-NEXT:    s_mov_b32 s14, s8
 ; FIXEDABI-GISEL-NEXT:    v_or_b32_e32 v31, v0, v1
 ; FIXEDABI-GISEL-NEXT:    s_mov_b64 s[8:9], 0
 ; FIXEDABI-GISEL-NEXT:    s_mov_b32 s12, s6
 ; FIXEDABI-GISEL-NEXT:    s_mov_b32 s13, s7
 ; FIXEDABI-GISEL-NEXT:    s_mov_b32 s32, 0
-; FIXEDABI-GISEL-NEXT:    s_mov_b32 flat_scratch_lo, s5
 ; FIXEDABI-GISEL-NEXT:    s_getpc_b64 s[4:5]
 ; FIXEDABI-GISEL-NEXT:    s_add_u32 s4, s4, requires_all_inputs@rel32@lo+4
 ; FIXEDABI-GISEL-NEXT:    s_addc_u32 s5, s5, requires_all_inputs@rel32@hi+12
diff --git a/llvm/test/CodeGen/AMDGPU/blender-no-live-segment-at-def-implicit-def.ll b/llvm/test/CodeGen/AMDGPU/blender-no-live-segment-at-def-implicit-def.ll
index 7c8d40c49bb805..20f60d1db7fb5f 100644
--- a/llvm/test/CodeGen/AMDGPU/blender-no-live-segment-at-def-implicit-def.ll
+++ b/llvm/test/CodeGen/AMDGPU/blender-no-live-segment-at-def-implicit-def.ll
@@ -10,8 +10,8 @@ define amdgpu_kernel void @blender_no_live_segment_at_def_error(<4 x float> %ext
 ; CHECK-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
 ; CHECK-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; CHECK-NEXT:    s_load_dwordx8 s[36:43], s[6:7], 0x0
-; CHECK-NEXT:    s_add_u32 s0, s0, s15
-; CHECK-NEXT:    s_addc_u32 s1, s1, 0
+; CHECK-NEXT:    s_mov_b64 s[2:3], 0xf0000000
+; CHECK-NEXT:    s_mov_b64 s[0:1], s[10:11]
 ; CHECK-NEXT:    s_mov_b64 s[10:11], s[8:9]
 ; CHECK-NEXT:    s_mov_b32 s8, 0
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll b/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll
index 5a128c7541d1ec..2baaefb76acb3b 100644
--- a/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll
+++ b/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll
@@ -5,13 +5,13 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
   ; GFX90A-LABEL: name: f1
   ; GFX90A: bb.0.bb:
   ; GFX90A-NEXT:   successors: %bb.1(0x40000000), %bb.2(0x40000000)
-  ; GFX90A-NEXT:   liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr15, $sgpr10_sgpr11
+  ; GFX90A-NEXT:   liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr15, $sgpr10_sgpr11
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   $sgpr32 = S_MOV_B32 0
   ; GFX90A-NEXT:   $flat_scr_lo = S_ADD_U32 $sgpr10, $sgpr15, implicit-def $scc
   ; GFX90A-NEXT:   $flat_scr_hi = S_ADDC_U32 $sgpr11, 0, implicit-def dead $scc, implicit $scc
-  ; GFX90A-NEXT:   $sgpr0 = S_ADD_U32 $sgpr0, $sgpr15, implicit-def $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3
-  ; GFX90A-NEXT:   $sgpr1 = S_ADDC_U32 $sgpr1, 0, implicit-def dead $scc, implicit $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT:   $sgpr2_sgpr3 = S_MOV_B64 4026531840, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT:   $sgpr0_sgpr1 = COPY $flat_scr, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT:   renamable $sgpr10_sgpr11 = COPY $sgpr8_sgpr9
   ; GFX90A-NEXT:   renamable $vgpr31 = COPY $vgpr0, implicit $exec
   ; GFX90A-NEXT:   renamable $sgpr33 = S_LOAD_DWORD_IMM renamable $sgpr6_sgpr7, 24, 0 :: (dereferenceable invariant load (s32) from %ir.arg4.kernarg.offset.align.down, align 8, addrspace 4)
diff --git a/llvm/test/CodeGen/AMDGPU/call-argument-types.ll b/llvm/test/CodeGen/AMDGPU/call-argument-types.ll
index a192a1b8dff935..0fe54349215ba3 100644
--- a/llvm/test/CodeGen/AMDGPU/call-argument-types.ll
+++ b/llvm/test/CodeGen/AMDGPU/call-argument-types.ll
@@ -129,12 +129,12 @@ define amdgpu_kernel void @test_call_external_void_func_i1_imm() #0 {
 ; HSA-LABEL: test_call_external_void_func_i1_imm:
 ; HSA:       ; %bb.0:
 ; HSA-NEXT:    s_add_i32 s4, s4, s7
+; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s5
 ; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s4, 8
-; HSA-NEXT:    s_add_u32 s0, s0, s7
-; HSA-NEXT:    s_addc_u32 s1, s1, 0
+; HSA-NEXT:    s_mov_b64 s[2:3], 0xf0000000
+; HSA-NEXT:    s_mov_b64 s[0:1], flat_scratch
 ; HSA-NEXT:    v_mov_b32_e32 v0, 1
 ; HSA-NEXT:    s_mov_b32 s32, 0
-; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s5
 ; HSA-NEXT:    s_getpc_b64 s[4:5]
 ; HSA-NEXT:    s_add_u32 s4, s4, external_void_func_i1@rel32@lo+4
 ; HSA-NEXT:    s_addc_u32 s5, s5, external_void_func_i1@rel32@hi+12
@@ -234,8 +234,8 @@ define amdgpu_kernel void @test_call_external_void_func_i1_signext(i32) #0 {
 ; HSA-NEXT:    s_mov_b32 s6, -1
 ; HSA-NEXT:    buffer_load_ubyte v0, off, s[4:7], 0 glc
 ; HSA-NEXT:    s_waitcnt vmcnt(0)
-; HSA-NEXT:    s_add_u32 s0, s0, s9
-; HSA-NEXT:    s_addc_u32 s1, s1, 0
+; HSA-NEXT:    s_mov_b64 s[2:3], 0xf0000000
+; HSA-NEXT:    s_mov_b64 s[0:1], flat_scratch
 ; HSA-NEXT:    s_mov_b32 s32, 0
 ; HSA-NEXT:    s_getpc_b64 s[4:5]
 ; HSA-NEXT:    s_add_u32 s4, s4, external_void_func_i1_signext@rel32@lo+4
@@ -339,8 +339,8 @@ define amdgpu_kernel void @test_call_external_void_func_i1_zeroext(i32) #0 {
 ; HSA-NEXT:    s_mov_b32 s6, -1
 ; HSA-NEXT:    buffer_load_ubyte v0, off, s[4:7], 0 glc
 ; HSA-NEXT:    s_waitcnt vmcnt(0)
-; HSA-NEXT:    s_add_u32 s0, s0, s9
-; HSA-NEXT:    s_addc_u32 s1, s1, 0
+; HSA-NEXT:    s_mov_b64 s[2:3], 0xf0000000
+; HSA-NEXT:    s_mov_b64 s[0:1], flat_scratch
 ; HSA-NEXT:    s_mov_b32 s32, 0
 ; HSA-NEXT:    s_getpc_b64 s[4:5]
 ; HSA-NEXT:    s_add_u32 s4, s4, external_void_func_i1_zeroext@rel32@lo+4
@@ -422,12 +422,12 @@ define amdgpu_kernel void @test_call_external_void_func_i8_imm(i32) #0 {
 ; HSA-LABEL: test_call_external_void_func_i8_imm:
 ; HSA:       ; %bb.0:
 ; HSA-NEXT:    s_add_i32 s6, s6, s9
+; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s7
 ; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s6, 8
-; HSA-NEXT:    s_add_u32 s0, s0, s9
-; HSA-NEXT:    s_addc_u32 s1, s1, 0
+; HSA-NEXT:    s_mov_b64 s[2:3], 0xf0000000
+; HSA-NEXT:    s_mov_b64 s[0:1], flat_scratch
 ; HSA-NEXT:    v_mov_b32_e32 v0, 0x7b
 ; HSA-NEXT:    s_mov_b32 s32, 0
-; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s7
 ; HSA-NEXT:    s_getpc_b64...
[truncated]

alex-t requested a review from arsenm on January 26, 2024
github-actions bot commented Jan 26, 2024

✅ With the latest revision this PR passed the C/C++ code formatter.

@alex-t (Contributor, Author) commented Feb 5, 2024

Ping

@slinder1 (Contributor) left a comment

(Sorry for adding more nits after initial review)

I'm a bit confused about the goal; does this patch just pave the way to actually stop requesting the preloaded RSRC in some cases in the future?

@arsenm (Contributor) left a comment

We do need a follow-up change to prune this from the calling convention lowering. It's going to cause a huge amount of test churn, so it should be done separately.

@alex-t (Contributor, Author) commented Feb 6, 2024

> (Sorry for adding more nits after initial review)
>
> I'm a bit confused about the goal; does this patch just pave the way to actually stop requesting the preloaded RSRC in some cases in the future?

Yep. It currently avoids using the preloaded RSRC when FlatScratchInit is available, since the RSRC's low 64 bits duplicate FlatScratchInit and its high 64 bits are a known constant value. However, I still have no idea how to instruct the runtime not to set up s[0:3] with the preloaded values when the whole 128 bits are effectively synthesized at compile time.

@slinder1 (Contributor) commented Feb 6, 2024

> (Sorry for adding more nits after initial review)
>
> I'm a bit confused about the goal; does this patch just pave the way to actually stop requesting the preloaded RSRC in some cases in the future?
>
> Yep. It currently avoids using the preloaded RSRC when FlatScratchInit is available, since the RSRC's low 64 bits duplicate FlatScratchInit and its high 64 bits are a known constant value. However, I still have no idea how to instruct the runtime not to set up s[0:3] with the preloaded values when the whole 128 bits are effectively synthesized at compile time.

Ah, makes sense; I'm assuming you've seen this, but just in case you haven't we do have ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER (see https://llvm.org/docs/AMDGPUUsage.html#code-object-v3-kernel-descriptor) which might help. If this simply doesn't work (or isn't always available) then I'm similarly not certain how one achieves this uniformly.
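
For concreteness, that bit surfaces in the assembler as a field of the .amdhsa_kernel directive; a minimal sketch (directive names per the AMDGPUUsage kernel-descriptor documentation; the kernel name and register counts are illustrative, and other fields are left at their defaults):

  .amdhsa_kernel example_kernel
    .amdhsa_user_sgpr_private_segment_buffer 0   ; don't request the preloaded SRSRC in s[0:3]
    .amdhsa_user_sgpr_flat_scratch_init 1        ; still request flat_scratch_init
    .amdhsa_next_free_vgpr 1
    .amdhsa_next_free_sgpr 16
  .end_amdhsa_kernel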

Also, linking to AMDGPUUsage reminded me that we do document the initial SRSRC setup at https://llvm.org/docs/AMDGPUUsage.html#private-segment-buffer . Would you be able to update that as part of the patch? It lives at llvm/docs/AMDGPUUsage.rst. At the very least a FIXME/WARNING in there would be good so we don't mislead anyone while we work out the specifics.

@alex-t (Contributor, Author) commented Feb 6, 2024

> We do need a follow-up change to prune this from the calling convention lowering. It's going to cause a huge amount of test churn, so it should be done separately.

So, could you please formalize the request a bit and possibly open a Jira ticket?
Just to clarify: what exactly did you mean to prune? The Rsrc23 constants "black magic", or something else?

@alex-t (Contributor, Author) commented Feb 6, 2024

> Ah, makes sense; I'm assuming you've seen this, but just in case you haven't we do have ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER (see https://llvm.org/docs/AMDGPUUsage.html#code-object-v3-kernel-descriptor) which might help. If this simply doesn't work (or isn't always available) then I'm similarly not certain how one achieves this uniformly.

I knew we had a bit for this. I was mostly curious how one could set/clear the appropriate bit, since I don't have experience with the runtime code and API.

@alex-t (Contributor, Author) commented Feb 6, 2024

> Also, linking to AMDGPUUsage reminded me that we do document the initial SRSRC setup at https://llvm.org/docs/AMDGPUUsage.html#private-segment-buffer . Would you be able to update that as part of the patch? It lives at llvm/docs/AMDGPUUsage.rst. At the very least a FIXME/WARNING in there would be good so we don't mislead anyone while we work out the specifics.

Good idea. I promise I will do what I can :)

@slinder1 (Contributor) commented Feb 7, 2024

Thank you for the nit changes! This LGTM, as long as you at least put the FIXME in the docs in the initial commit. I'm happy to update it after-the-fact if it unblocks you

@alex-t (Contributor, Author) commented Feb 7, 2024

> Thank you for the nit changes! This LGTM, as long as you at least put the FIXME in the docs in the initial commit. I'm happy to update it after-the-fact if it unblocks you

I have updated llvm/docs/AMDGPUUsage.rst. Could you please check whether it looks like what you want?

@slinder1 (Contributor) left a comment

Thank you, this LGTM!

We can revisit the wording of the whole section when we no longer request the Private Segment Buffer SGPRs unconditionally

@arsenm (Contributor) commented Feb 8, 2024

> We do need a follow-up change to prune this from the calling convention lowering. It's going to cause a huge amount of test churn, so it should be done separately.
>
> So, could you please formalize the request a bit and possibly open a Jira ticket? Just to clarify: what exactly did you mean to prune? The Rsrc23 constants "black magic", or something else?

It's really the same ticket. As it stands, this PR is pointless and just increases the instruction count for no benefit. The point was to prune out the resource descriptor in the kernel user SGPRs, which this isn't doing yet. All of the kernel uses of s[0:3] should be removed, and the remaining user SGPRs shifted down by 4.

@alex-t (Contributor, Author) commented Feb 8, 2024

> As it stands, this PR is pointless and just increases the instruction count for no benefit. The point was to prune out the resource descriptor in the kernel user SGPRs, which this isn't doing yet. All of the kernel uses of s[0:3] should be removed, and the remaining user SGPRs shifted down by 4.

We had a long email conversation regarding this. Let's try one more time.
My question is: how did you intend to get rid of using s[0:3]?
The buffer_load/store instructions require the V# in 4 adjacent SGPRs. How are you supposed to save SGPRs on SRD uses? We could shorten the SGPR live ranges by rematerializing the SRD before each use, but we are not guaranteed to have free SGPRs after regalloc is done. We could explicitly construct the SRD from the FlatScratchInit SGPR pair on each use, but we would still need two more SGPRs for the high 64 bits.
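
For illustration, such per-use rematerialization would look something like this (register numbers are purely illustrative; the point is that four adjacent free SGPRs are needed at every use):

  s_mov_b64 s[8:9], flat_scratch      ; low 64 bits from the FlatScratchInit pair
  s_mov_b64 s[10:11], 0xf0000000      ; high 64 bits, a compile-time constant
  buffer_store_dword v0, off, s[8:11], s32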

alex-t merged commit 88e5251 into llvm:main on Feb 8, 2024
@ronlieb (Contributor) commented Feb 9, 2024

@alex-t, could you try applying your patch to the AMD internal Gerrit? It would help us with some downstream merges. Attn: @saiislam. Thanks.

@jplehr (Contributor) commented Feb 9, 2024

Hi,
this patch may have introduced quite a few breakages in OpenMP GPU Offload buildbots.
https://lab.llvm.org/buildbot/#/builders/193/builds/46392
https://lab.llvm.org/staging/#/builders/185/builds/1650
https://lab.llvm.org/staging/#/builders/140/builds/2167

I'd appreciate it if someone could look at this. If you need help with reproducing, configs, etc., please reach out. Happy to help.

@jplehr (Contributor) commented Feb 9, 2024

I reverted locally and the problems went away.

jplehr added a commit that referenced this pull request Feb 9, 2024
…escriptor from flat_scratch_init" (#81234)

Reverts #79586

This broke the AMDGPU OpenMP Offload buildbot.
The typical error message was that the GPU attempted to read beyond the largest legal address.

Error message:
AMDGPU fatal error 1: Received error in queue 0x7f8363f22000:
HSA_STATUS_ERROR_MEMORY_APERTURE_VIOLATION: The agent attempted to
access memory beyond the largest legal address.
@ronlieb (Contributor) commented Feb 9, 2024

Our downstream CI is seeing issues with this patch as well; thanks for the revert.

@alex-t (Contributor, Author) commented Feb 9, 2024

Thanks for catching this. The patch managed to pass PSDB when applied to amd-staging, so I thought it was safe to commit.
I will look at the OpenMP issues closely later today, but at first glance it seems like the private segment buffer gets initialized with the wrong value.

@alex-t (Contributor, Author) commented Feb 9, 2024

> Hi, this patch may have introduced quite a few breakages in OpenMP GPU Offload buildbots. https://lab.llvm.org/buildbot/#/builders/193/builds/46392 https://lab.llvm.org/staging/#/builders/185/builds/1650 https://lab.llvm.org/staging/#/builders/140/builds/2167
>
> I'd appreciate it if someone could look at this. If you need help with reproducing, configs, etc., please reach out. Happy to help.

Could you possibly provide a reproducer from the failed build? Or suggest how I can reproduce the failure, preferably without cloning/configuring/building the whole project.
The error that occurred suggests the private segment buffer descriptor was initialized with an incorrect value. The initialization depends on the target being compiled, so I need at least to understand the code path on which things go wrong. I would appreciate any help with that.

@jplehr (Contributor) commented Feb 9, 2024

Hi Alex, do I understand correctly that the preferred reproducer is intermediate IR + final binary?

@alex-t (Contributor, Author) commented Feb 10, 2024

> Hi Alex, do I understand correctly that the preferred reproducer is intermediate IR + final binary?

As far as I understand, the error happens at runtime while attempting to read memory beyond the aperture bounds.
So, ideally, what I would like is guidance on how to get/configure/build the code that causes the error, or just the resulting assembly, so I can inspect the exact value of the V# (SRD) being used. I would also need the IR and the compiler invocation command line to debug why the SRD is set incorrectly.
Literally, I need to track down the compilation of the code that caused the incorrect memory access. I know almost nothing about the OpenMP offload project, though, and would appreciate a pointer to docs that would help me understand better what is going on.

@JonChesterfield (Collaborator) commented:

> Literally, I need to track down the compilation of the code that caused the incorrect memory access. I know almost nothing about the OpenMP offload project, though, and would appreciate a pointer to docs that would help me understand better what is going on.

It's quite obfuscated by the OpenMP toolchain providing a user-friendly experience. Ultimately there's a single IR blob that gets fed into the backend to build a single code object containing some number of kernels. -save-temps might give you that IR, or it might give you some unrelated IR; both have been true at various points in the past.

clang -fopenmp -### should tell you the steps. Some are OpenMP-specific IR rewriting tools. Somewhere in that pipeline is the IR you seek.
