[AMDGPU] Compiler should synthesize private buffer resource descriptor from flat_scratch_init #79586
Conversation
@llvm/pr-subscribers-llvm-globalisel @llvm/pr-subscribers-backend-amdgpu
Author: None (alex-t)
Changes
Patch is 100.56 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/79586.diff
24 Files Affected:
diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
index 0f89df14448667..4842cf83af0e18 100644
--- a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
@@ -378,7 +378,7 @@ class PrologEpilogSGPRSpillBuilder {
} // namespace llvm
// Emit flat scratch setup code, assuming `MFI->hasFlatScratchInit()`
-void SIFrameLowering::emitEntryFunctionFlatScratchInit(
+Register SIFrameLowering::emitEntryFunctionFlatScratchInit(
MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
const DebugLoc &DL, Register ScratchWaveOffsetReg) const {
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
@@ -398,6 +398,7 @@ void SIFrameLowering::emitEntryFunctionFlatScratchInit(
Register FlatScrInitLo;
Register FlatScrInitHi;
+ Register FlatScratchInitReg = AMDGPU::NoRegister;
if (ST.isAmdPalOS()) {
// Extract the scratch offset from the descriptor in the GIT
@@ -407,7 +408,7 @@ void SIFrameLowering::emitEntryFunctionFlatScratchInit(
// Find unused reg to load flat scratch init into
MachineRegisterInfo &MRI = MF.getRegInfo();
- Register FlatScrInit = AMDGPU::NoRegister;
+ Register FlatScratchInitReg = AMDGPU::NoRegister;
ArrayRef<MCPhysReg> AllSGPR64s = TRI->getAllSGPR64(MF);
unsigned NumPreloaded = (MFI->getNumPreloadedSGPRs() + 1) / 2;
AllSGPR64s = AllSGPR64s.slice(
@@ -416,16 +417,16 @@ void SIFrameLowering::emitEntryFunctionFlatScratchInit(
for (MCPhysReg Reg : AllSGPR64s) {
if (LiveUnits.available(Reg) && !MRI.isReserved(Reg) &&
MRI.isAllocatable(Reg) && !TRI->isSubRegisterEq(Reg, GITPtrLoReg)) {
- FlatScrInit = Reg;
+ FlatScratchInitReg = Reg;
break;
}
}
- assert(FlatScrInit && "Failed to find free register for scratch init");
+ assert(FlatScratchInitReg && "Failed to find free register for scratch init");
- FlatScrInitLo = TRI->getSubReg(FlatScrInit, AMDGPU::sub0);
- FlatScrInitHi = TRI->getSubReg(FlatScrInit, AMDGPU::sub1);
+ FlatScrInitLo = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub0);
+ FlatScrInitHi = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub1);
- buildGitPtr(MBB, I, DL, TII, FlatScrInit);
+ buildGitPtr(MBB, I, DL, TII, FlatScratchInitReg);
// We now have the GIT ptr - now get the scratch descriptor from the entry
// at offset 0 (or offset 16 for a compute shader).
@@ -440,8 +441,8 @@ void SIFrameLowering::emitEntryFunctionFlatScratchInit(
MF.getFunction().getCallingConv() == CallingConv::AMDGPU_CS ? 16 : 0;
const GCNSubtarget &Subtarget = MF.getSubtarget<GCNSubtarget>();
unsigned EncodedOffset = AMDGPU::convertSMRDOffsetUnits(Subtarget, Offset);
- BuildMI(MBB, I, DL, LoadDwordX2, FlatScrInit)
- .addReg(FlatScrInit)
+ BuildMI(MBB, I, DL, LoadDwordX2, FlatScratchInitReg)
+ .addReg(FlatScratchInitReg)
.addImm(EncodedOffset) // offset
.addImm(0) // cpol
.addMemOperand(MMO);
@@ -453,7 +454,7 @@ void SIFrameLowering::emitEntryFunctionFlatScratchInit(
.addImm(0xffff);
And->getOperand(3).setIsDead(); // Mark SCC as dead.
} else {
- Register FlatScratchInitReg =
+ FlatScratchInitReg =
MFI->getPreloadedReg(AMDGPUFunctionArgInfo::FLAT_SCRATCH_INIT);
assert(FlatScratchInitReg);
@@ -485,7 +486,7 @@ void SIFrameLowering::emitEntryFunctionFlatScratchInit(
addReg(FlatScrInitHi).
addImm(int16_t(AMDGPU::Hwreg::ID_FLAT_SCR_HI |
(31 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_)));
- return;
+ return FlatScratchInitReg;
}
// For GFX9.
@@ -498,7 +499,7 @@ void SIFrameLowering::emitEntryFunctionFlatScratchInit(
.addImm(0);
Addc->getOperand(3).setIsDead(); // Mark SCC as dead.
- return;
+ return AMDGPU::FLAT_SCR;
}
assert(ST.getGeneration() < AMDGPUSubtarget::GFX9);
@@ -519,6 +520,7 @@ void SIFrameLowering::emitEntryFunctionFlatScratchInit(
.addReg(FlatScrInitLo, RegState::Kill)
.addImm(8);
LShr->getOperand(3).setIsDead(); // Mark SCC as dead.
+ return AMDGPU::FLAT_SCR;
}
// Note SGPRSpill stack IDs should only be used for SGPR spilling to VGPRs, not
@@ -610,11 +612,15 @@ void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF,
const SIInstrInfo *TII = ST.getInstrInfo();
const SIRegisterInfo *TRI = &TII->getRegisterInfo();
MachineRegisterInfo &MRI = MF.getRegInfo();
- const Function &F = MF.getFunction();
MachineFrameInfo &FrameInfo = MF.getFrameInfo();
assert(MFI->isEntryFunction());
+ bool NeedsFlatScratchInit =
+ MFI->getUserSGPRInfo().hasFlatScratchInit() &&
+ (MRI.isPhysRegUsed(AMDGPU::FLAT_SCR) || FrameInfo.hasCalls() ||
+ (!allStackObjectsAreDead(FrameInfo) && ST.enableFlatScratch()));
+
Register PreloadedScratchWaveOffsetReg = MFI->getPreloadedReg(
AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);
@@ -640,7 +646,7 @@ void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF,
// Now that we have fixed the reserved SRSRC we need to locate the
// (potentially) preloaded SRSRC.
Register PreloadedScratchRsrcReg;
- if (ST.isAmdHsaOrMesa(F)) {
+ if (ST.isAmdHsaOrMesa(MF.getFunction()) && !NeedsFlatScratchInit) {
PreloadedScratchRsrcReg =
MFI->getPreloadedReg(AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_BUFFER);
if (ScratchRsrcReg && PreloadedScratchRsrcReg) {
@@ -696,10 +702,6 @@ void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF,
BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), FPReg).addImm(0);
}
- bool NeedsFlatScratchInit =
- MFI->getUserSGPRInfo().hasFlatScratchInit() &&
- (MRI.isPhysRegUsed(AMDGPU::FLAT_SCR) || FrameInfo.hasCalls() ||
- (!allStackObjectsAreDead(FrameInfo) && ST.enableFlatScratch()));
if ((NeedsFlatScratchInit || ScratchRsrcReg) &&
PreloadedScratchWaveOffsetReg && !ST.flatScratchIsArchitected()) {
@@ -707,22 +709,24 @@ void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF,
MBB.addLiveIn(PreloadedScratchWaveOffsetReg);
}
+ Register FlatScratchInit = AMDGPU::NoRegister;
if (NeedsFlatScratchInit) {
- emitEntryFunctionFlatScratchInit(MF, MBB, I, DL, ScratchWaveOffsetReg);
+ FlatScratchInit =
+ emitEntryFunctionFlatScratchInit(MF, MBB, I, DL, ScratchWaveOffsetReg);
}
if (ScratchRsrcReg) {
- emitEntryFunctionScratchRsrcRegSetup(MF, MBB, I, DL,
- PreloadedScratchRsrcReg,
- ScratchRsrcReg, ScratchWaveOffsetReg);
+ emitEntryFunctionScratchRsrcRegSetup(
+ MF, MBB, I, DL, FlatScratchInit, ScratchRsrcReg,
+ PreloadedScratchRsrcReg, ScratchWaveOffsetReg);
}
}
// Emit scratch RSRC setup code, assuming `ScratchRsrcReg != AMDGPU::NoReg`
void SIFrameLowering::emitEntryFunctionScratchRsrcRegSetup(
MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
- const DebugLoc &DL, Register PreloadedScratchRsrcReg,
- Register ScratchRsrcReg, Register ScratchWaveOffsetReg) const {
+ const DebugLoc &DL, Register FlatScratchInit, Register ScratchRsrcReg,
+ Register PreloadedScratchRsrcReg, Register ScratchWaveOffsetReg) const {
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
const SIInstrInfo *TII = ST.getInstrInfo();
@@ -770,7 +774,8 @@ void SIFrameLowering::emitEntryFunctionScratchRsrcRegSetup(
.addImm(21)
.addReg(Rsrc03);
}
- } else if (ST.isMesaGfxShader(Fn) || !PreloadedScratchRsrcReg) {
+ } else if (ST.isMesaGfxShader(Fn) ||
+ (!FlatScratchInit.isValid() && !PreloadedScratchRsrcReg)) {
assert(!ST.isAmdHsaOrMesa(Fn));
const MCInstrDesc &SMovB32 = TII->get(AMDGPU::S_MOV_B32);
@@ -829,11 +834,24 @@ void SIFrameLowering::emitEntryFunctionScratchRsrcRegSetup(
.addImm(Rsrc23 >> 32)
.addReg(ScratchRsrcReg, RegState::ImplicitDefine);
} else if (ST.isAmdHsaOrMesa(Fn)) {
- assert(PreloadedScratchRsrcReg);
- if (ScratchRsrcReg != PreloadedScratchRsrcReg) {
- BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), ScratchRsrcReg)
- .addReg(PreloadedScratchRsrcReg, RegState::Kill);
+ if (FlatScratchInit) {
+ I = BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY),
+ TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0_sub1))
+ .addReg(FlatScratchInit)
+ .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
+ I = BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B64),
+ TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub2_sub3))
+ .addImm(0xf0000000)
+ .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
+ return;
+ } else {
+ assert(PreloadedScratchRsrcReg);
+
+ if (ScratchRsrcReg != PreloadedScratchRsrcReg) {
+ BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), ScratchRsrcReg)
+ .addReg(PreloadedScratchRsrcReg, RegState::Kill);
+ }
}
}
diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.h b/llvm/lib/Target/AMDGPU/SIFrameLowering.h
index b3feb759ed811f..f706d48b2dc101 100644
--- a/llvm/lib/Target/AMDGPU/SIFrameLowering.h
+++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.h
@@ -67,19 +67,19 @@ class SIFrameLowering final : public AMDGPUFrameLowering {
MachineBasicBlock::iterator MI) const override;
private:
- void emitEntryFunctionFlatScratchInit(MachineFunction &MF,
- MachineBasicBlock &MBB,
- MachineBasicBlock::iterator I,
- const DebugLoc &DL,
- Register ScratchWaveOffsetReg) const;
+ Register
+ emitEntryFunctionFlatScratchInit(MachineFunction &MF, MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I,
+ const DebugLoc &DL,
+ Register ScratchWaveOffsetReg) const;
Register getEntryFunctionReservedScratchRsrcReg(MachineFunction &MF) const;
void emitEntryFunctionScratchRsrcRegSetup(
MachineFunction &MF, MachineBasicBlock &MBB,
MachineBasicBlock::iterator I, const DebugLoc &DL,
- Register PreloadedPrivateBufferReg, Register ScratchRsrcReg,
- Register ScratchWaveOffsetReg) const;
+ Register FlatScratchInit, Register ScratchRsrcReg,
+ Register PreloadedScratchRsrcReg, Register ScratchWaveOffsetReg) const;
public:
bool hasFP(const MachineFunction &MF) const override;
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll
index e597ce6f114a6b..a7277414391cb2 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll
@@ -14,9 +14,9 @@ define amdgpu_kernel void @kernel_caller_stack() {
; MUBUF: ; %bb.0:
; MUBUF-NEXT: s_add_u32 flat_scratch_lo, s4, s7
; MUBUF-NEXT: s_addc_u32 flat_scratch_hi, s5, 0
-; MUBUF-NEXT: s_add_u32 s0, s0, s7
+; MUBUF-NEXT: s_mov_b64 s[2:3], 0xf0000000
; MUBUF-NEXT: s_mov_b32 s32, 0
-; MUBUF-NEXT: s_addc_u32 s1, s1, 0
+; MUBUF-NEXT: s_mov_b64 s[0:1], flat_scratch
; MUBUF-NEXT: v_mov_b32_e32 v0, 9
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4
; MUBUF-NEXT: v_mov_b32_e32 v0, 10
@@ -62,8 +62,8 @@ define amdgpu_kernel void @kernel_caller_byval() {
; MUBUF: ; %bb.0:
; MUBUF-NEXT: s_add_u32 flat_scratch_lo, s4, s7
; MUBUF-NEXT: s_addc_u32 flat_scratch_hi, s5, 0
-; MUBUF-NEXT: s_add_u32 s0, s0, s7
-; MUBUF-NEXT: s_addc_u32 s1, s1, 0
+; MUBUF-NEXT: s_mov_b64 s[2:3], 0xf0000000
+; MUBUF-NEXT: s_mov_b64 s[0:1], flat_scratch
; MUBUF-NEXT: v_mov_b32_e32 v0, 0
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:8
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:12
diff --git a/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll b/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll
index a439c0f51ffe9c..bda25cda4c5f9c 100644
--- a/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll
+++ b/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll
@@ -48,19 +48,19 @@ define amdgpu_kernel void @parent_kernel_missing_inputs() #0 {
; FIXEDABI-SDAG-LABEL: parent_kernel_missing_inputs:
; FIXEDABI-SDAG: ; %bb.0:
; FIXEDABI-SDAG-NEXT: s_add_i32 s4, s4, s9
-; FIXEDABI-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8
; FIXEDABI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 10, v1
-; FIXEDABI-SDAG-NEXT: s_add_u32 s0, s0, s9
+; FIXEDABI-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s5
+; FIXEDABI-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8
+; FIXEDABI-SDAG-NEXT: s_mov_b64 s[2:3], 0xf0000000
; FIXEDABI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; FIXEDABI-SDAG-NEXT: v_or_b32_e32 v0, v0, v1
-; FIXEDABI-SDAG-NEXT: s_addc_u32 s1, s1, 0
+; FIXEDABI-SDAG-NEXT: s_mov_b64 s[0:1], flat_scratch
; FIXEDABI-SDAG-NEXT: s_mov_b32 s14, s8
; FIXEDABI-SDAG-NEXT: v_or_b32_e32 v31, v0, v2
; FIXEDABI-SDAG-NEXT: s_mov_b64 s[8:9], 0
; FIXEDABI-SDAG-NEXT: s_mov_b32 s12, s6
; FIXEDABI-SDAG-NEXT: s_mov_b32 s13, s7
; FIXEDABI-SDAG-NEXT: s_mov_b32 s32, 0
-; FIXEDABI-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s5
; FIXEDABI-SDAG-NEXT: s_getpc_b64 s[4:5]
; FIXEDABI-SDAG-NEXT: s_add_u32 s4, s4, requires_all_inputs@rel32@lo+4
; FIXEDABI-SDAG-NEXT: s_addc_u32 s5, s5, requires_all_inputs@rel32@hi+12
@@ -70,19 +70,19 @@ define amdgpu_kernel void @parent_kernel_missing_inputs() #0 {
; FIXEDABI-GISEL-LABEL: parent_kernel_missing_inputs:
; FIXEDABI-GISEL: ; %bb.0:
; FIXEDABI-GISEL-NEXT: s_add_i32 s4, s4, s9
-; FIXEDABI-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8
; FIXEDABI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 10, v1
-; FIXEDABI-GISEL-NEXT: s_add_u32 s0, s0, s9
+; FIXEDABI-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s5
+; FIXEDABI-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8
+; FIXEDABI-GISEL-NEXT: s_mov_b64 s[2:3], 0xf0000000
; FIXEDABI-GISEL-NEXT: v_or_b32_e32 v0, v0, v1
; FIXEDABI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 20, v2
-; FIXEDABI-GISEL-NEXT: s_addc_u32 s1, s1, 0
+; FIXEDABI-GISEL-NEXT: s_mov_b64 s[0:1], flat_scratch
; FIXEDABI-GISEL-NEXT: s_mov_b32 s14, s8
; FIXEDABI-GISEL-NEXT: v_or_b32_e32 v31, v0, v1
; FIXEDABI-GISEL-NEXT: s_mov_b64 s[8:9], 0
; FIXEDABI-GISEL-NEXT: s_mov_b32 s12, s6
; FIXEDABI-GISEL-NEXT: s_mov_b32 s13, s7
; FIXEDABI-GISEL-NEXT: s_mov_b32 s32, 0
-; FIXEDABI-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s5
; FIXEDABI-GISEL-NEXT: s_getpc_b64 s[4:5]
; FIXEDABI-GISEL-NEXT: s_add_u32 s4, s4, requires_all_inputs@rel32@lo+4
; FIXEDABI-GISEL-NEXT: s_addc_u32 s5, s5, requires_all_inputs@rel32@hi+12
diff --git a/llvm/test/CodeGen/AMDGPU/blender-no-live-segment-at-def-implicit-def.ll b/llvm/test/CodeGen/AMDGPU/blender-no-live-segment-at-def-implicit-def.ll
index 7c8d40c49bb805..20f60d1db7fb5f 100644
--- a/llvm/test/CodeGen/AMDGPU/blender-no-live-segment-at-def-implicit-def.ll
+++ b/llvm/test/CodeGen/AMDGPU/blender-no-live-segment-at-def-implicit-def.ll
@@ -10,8 +10,8 @@ define amdgpu_kernel void @blender_no_live_segment_at_def_error(<4 x float> %ext
; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
; CHECK-NEXT: s_load_dwordx8 s[36:43], s[6:7], 0x0
-; CHECK-NEXT: s_add_u32 s0, s0, s15
-; CHECK-NEXT: s_addc_u32 s1, s1, 0
+; CHECK-NEXT: s_mov_b64 s[2:3], 0xf0000000
+; CHECK-NEXT: s_mov_b64 s[0:1], s[10:11]
; CHECK-NEXT: s_mov_b64 s[10:11], s[8:9]
; CHECK-NEXT: s_mov_b32 s8, 0
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll b/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll
index 5a128c7541d1ec..2baaefb76acb3b 100644
--- a/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll
+++ b/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll
@@ -5,13 +5,13 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
; GFX90A-LABEL: name: f1
; GFX90A: bb.0.bb:
; GFX90A-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000)
- ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr15, $sgpr10_sgpr11
+ ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr15, $sgpr10_sgpr11
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: $sgpr32 = S_MOV_B32 0
; GFX90A-NEXT: $flat_scr_lo = S_ADD_U32 $sgpr10, $sgpr15, implicit-def $scc
; GFX90A-NEXT: $flat_scr_hi = S_ADDC_U32 $sgpr11, 0, implicit-def dead $scc, implicit $scc
- ; GFX90A-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, $sgpr15, implicit-def $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3
- ; GFX90A-NEXT: $sgpr1 = S_ADDC_U32 $sgpr1, 0, implicit-def dead $scc, implicit $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3
+ ; GFX90A-NEXT: $sgpr2_sgpr3 = S_MOV_B64 4026531840, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3
+ ; GFX90A-NEXT: $sgpr0_sgpr1 = COPY $flat_scr, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3
; GFX90A-NEXT: renamable $sgpr10_sgpr11 = COPY $sgpr8_sgpr9
; GFX90A-NEXT: renamable $vgpr31 = COPY $vgpr0, implicit $exec
; GFX90A-NEXT: renamable $sgpr33 = S_LOAD_DWORD_IMM renamable $sgpr6_sgpr7, 24, 0 :: (dereferenceable invariant load (s32) from %ir.arg4.kernarg.offset.align.down, align 8, addrspace 4)
diff --git a/llvm/test/CodeGen/AMDGPU/call-argument-types.ll b/llvm/test/CodeGen/AMDGPU/call-argument-types.ll
index a192a1b8dff935..0fe54349215ba3 100644
--- a/llvm/test/CodeGen/AMDGPU/call-argument-types.ll
+++ b/llvm/test/CodeGen/AMDGPU/call-argument-types.ll
@@ -129,12 +129,12 @@ define amdgpu_kernel void @test_call_external_void_func_i1_imm() #0 {
; HSA-LABEL: test_call_external_void_func_i1_imm:
; HSA: ; %bb.0:
; HSA-NEXT: s_add_i32 s4, s4, s7
+; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5
; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8
-; HSA-NEXT: s_add_u32 s0, s0, s7
-; HSA-NEXT: s_addc_u32 s1, s1, 0
+; HSA-NEXT: s_mov_b64 s[2:3], 0xf0000000
+; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch
; HSA-NEXT: v_mov_b32_e32 v0, 1
; HSA-NEXT: s_mov_b32 s32, 0
-; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5
; HSA-NEXT: s_getpc_b64 s[4:5]
; HSA-NEXT: s_add_u32 s4, s4, external_void_func_i1@rel32@lo+4
; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_i1@rel32@hi+12
@@ -234,8 +234,8 @@ define amdgpu_kernel void @test_call_external_void_func_i1_signext(i32) #0 {
; HSA-NEXT: s_mov_b32 s6, -1
; HSA-NEXT: buffer_load_ubyte v0, off, s[4:7], 0 glc
; HSA-NEXT: s_waitcnt vmcnt(0)
-; HSA-NEXT: s_add_u32 s0, s0, s9
-; HSA-NEXT: s_addc_u32 s1, s1, 0
+; HSA-NEXT: s_mov_b64 s[2:3], 0xf0000000
+; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch
; HSA-NEXT: s_mov_b32 s32, 0
; HSA-NEXT: s_getpc_b64 s[4:5]
; HSA-NEXT: s_add_u32 s4, s4, external_void_func_i1_signext@rel32@lo+4
@@ -339,8 +339,8 @@ define amdgpu_kernel void @test_call_external_void_func_i1_zeroext(i32) #0 {
; HSA-NEXT: s_mov_b32 s6, -1
; HSA-NEXT: buffer_load_ubyte v0, off, s[4:7], 0 glc
; HSA-NEXT: s_waitcnt vmcnt(0)
-; HSA-NEXT: s_add_u32 s0, s0, s9
-; HSA-NEXT: s_addc_u32 s1, s1, 0
+; HSA-NEXT: s_mov_b64 s[2:3], 0xf0000000
+; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch
; HSA-NEXT: s_mov_b32 s32, 0
; HSA-NEXT: s_getpc_b64 s[4:5]
; HSA-NEXT: s_add_u32 s4, s4, external_void_func_i1_zeroext@rel32@lo+4
@@ -422,12 +422,12 @@ define amdgpu_kernel void @test_call_external_void_func_i8_imm(i32) #0 {
; HSA-LABEL: test_call_external_void_func_i8_imm:
; HSA: ; %bb.0:
; HSA-NEXT: s_add_i32 s6, s6, s9
+; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7
; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8
-; HSA-NEXT: s_add_u32 s0, s0, s9
-; HSA-NEXT: s_addc_u32 s1, s1, 0
+; HSA-NEXT: s_mov_b64 s[2:3], 0xf0000000
+; HSA-NEXT: s_mov_b64 s[0:1], flat_scratch
; HSA-NEXT: v_mov_b32_e32 v0, 0x7b
; HSA-NEXT: s_mov_b32 s32, 0
-; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7
; HSA-NEXT: s_getpc_b64...
[truncated]
✅ With the latest revision this PR passed the C/C++ code formatter.
Ping
(Sorry for adding more nits after initial review)
I'm a bit confused about the goal; does this patch just pave the way to actually stop requesting the preloaded RSRC in some cases in the future?
We do need a follow-up change to prune this from the calling-convention lowering. It's going to cause a huge amount of test churn, so it should be done separately.
Yep. It currently avoids using the preloaded RSRC when FlatScratchInit is available, since the RSRC's low 64 bits repeat the FlatScratchInit value and the high 64 bits are a known constant. However, I still have no idea how to instruct the runtime not to set up s[0:3] with the preloaded values when the whole 128 bits are effectively synthesized at compile time.
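To make that concrete, here is a minimal sketch of the sequence the prologue now emits on HSA targets, reconstructed from the updated CHECK lines in the diff above (illustrative only; the exact registers vary by subtarget):

s_mov_b64 s[0:1], flat_scratch  ; rsrc words 0-1: the scratch base, the same bits flat_scratch_init provides
s_mov_b64 s[2:3], 0xf0000000    ; rsrc words 2-3: a compile-time constant, no preloaded value required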
Ah, makes sense; I'm assuming you've seen this, but just in case you haven't, we do have ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER (see https://llvm.org/docs/AMDGPUUsage.html#code-object-v3-kernel-descriptor), which might help. If this simply doesn't work (or isn't always available), then I'm similarly not certain how one achieves this uniformly. Also, linking to AMDGPUUsage reminded me that we do document the initial SRSRC setup at https://llvm.org/docs/AMDGPUUsage.html#private-segment-buffer. Would you be able to update that as part of the patch? It lives at …
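For reference, a rough sketch of how that kernel-descriptor bit surfaces at the assembly level once the preloaded descriptor is no longer requested (the kernel name is hypothetical, and the many other directives a complete .amdhsa_kernel block requires are omitted):

.amdhsa_kernel example_kernel                 ; hypothetical kernel name
  .amdhsa_user_sgpr_private_segment_buffer 0  ; do not preload s[0:3] with the descriptor
  .amdhsa_user_sgpr_flat_scratch_init 1       ; still preload flat_scratch_init
.end_amdhsa_kernel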
So, could you please formalize the request a bit and possibly open a Jira ticket?
I knew we had a bit for this. I was mostly curious how one could set/clear the appropriate bit, since I don't have experience with the runtime code and API.
Good idea. I promise I will do what I can :)
Thank you for the nit changes! This LGTM, as long as you at least put the FIXME in the docs in the initial commit. I'm happy to update it after the fact if it unblocks you.
I have updated llvm/docs/AMDGPUUsage.rst. Could you please see if it seems like what you want?
Thank you, this LGTM!
We can revisit the wording of the whole section when we no longer request the Private Segment Buffer SGPRs unconditionally.
It's really the same ticket. As it stands, this PR is pointless and just increases the instruction count for no benefit. The point was to prune out the resource descriptor in the kernel user SGPRs, which this isn't doing yet. All of the kernel uses of s[0:3] should be removed and shifted down by 4 SGPRs, as sketched below.
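To illustrate that shift, a sketch assuming a kernel that requests only the private segment buffer, the kernarg segment pointer, and flat_scratch_init (packing order per AMDGPUUsage; real layouts depend on which inputs a kernel enables):

; current: s[0:3] private_segment_buffer, s[4:5] kernarg_segment_ptr, s[6:7] flat_scratch_init
; goal:    s[0:1] kernarg_segment_ptr,    s[2:3] flat_scratch_init    ; four user SGPRs freed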
We had a long email conversation regarding this. Let's try one more time.
Hi, I'd appreciate it if someone could look at this. If you need help with reproducing, configs, etc., please reach out. Happy to help.
I reverted locally and the problems went away.
…escriptor from flat_scratch_init" (#81234) Reverts #79586. This broke the AMDGPU OpenMP Offload buildbot. The typical error message was that the GPU attempted to read beyond the largest legal address. Error message: AMDGPU fatal error 1: Received error in queue 0x7f8363f22000: HSA_STATUS_ERROR_MEMORY_APERTURE_VIOLATION: The agent attempted to access memory beyond the largest legal address.
Our downstream CI is seeing issues with this patch as well; thanks for the revert.
Thanks for catching this. The patch managed to pass PSDB applied to amd-staging.
Could you possibly provide a reproducer case from the failed build? Or suggest how I can reproduce the failure, preferably without cloning/configuring/building the whole project.
Hi Alex, do I understand correctly that the preferred reproducer is intermediate IR + final binary?
As far as I understand, the error happens at runtime while attempting to read memory beyond the aperture bounds.
It's quite obfuscated by the OpenMP toolchain providing a user-friendly experience. Ultimately there's a single IR blob that gets fed into the backend to build a single code object containing some number of kernels. -save-temps might give you that IR, or it might give you some unrelated IR; both have been true at various points in the past.
This change implements synthesizing the private buffer resource descriptor in the kernel prologue instead of using the preloaded kernel argument.
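For illustration, the net effect on a typical HSA kernel prologue, as seen in the test updates above (the SGPR numbers vary from kernel to kernel):

; before: adjust the preloaded descriptor's base address by the wave offset
s_add_u32  s0, s0, s7
s_addc_u32 s1, s1, 0

; after: build the whole descriptor in the prologue, ignoring the preloaded s[0:3]
s_mov_b64 s[2:3], 0xf0000000
s_mov_b64 s[0:1], flat_scratch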