Skip to content

[AMDGPU] Compiler should synthesize private buffer resource descriptor from flat_scratch_init #79586

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 11 commits into from
Feb 8, 2024
Merged
10 changes: 7 additions & 3 deletions llvm/docs/AMDGPUUsage.rst
Original file line number Diff line number Diff line change
Expand Up @@ -5530,9 +5530,13 @@ If the *Target Properties* column of :ref:`amdgpu-processor-table` specifies
Instead the flat SCRATCH instructions are used.

Otherwise, Private Segment Buffer SGPR register is used to initialize 4 SGPRs
that are used as a V# to access scratch. CP uses the value provided by the
runtime. It is used, together with Scratch Wavefront Offset as an offset, to
access the private memory space using a segment address. See
that are used as a V# to access scratch.
The compiler synthesizes the initialization value for the Private Segment
Buffer in the kernel prologue, using the Flat Scratch Init value to initialize
the low 64 bits and a known constant for the high 64 bits. If the Flat Scratch Init is not
available, CP uses the value provided by the runtime. It is used, together with
Scratch Wavefront Offset as an offset, to access the private memory space using
a segment address. See
:ref:`amdgpu-amdhsa-initial-kernel-execution-state`.

The scratch V# is a four-aligned SGPR and always selected for the kernel as
Expand Down
108 changes: 67 additions & 41 deletions llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -379,7 +379,8 @@ class PrologEpilogSGPRSpillBuilder {
} // namespace llvm

// Emit flat scratch setup code, assuming `MFI->hasFlatScratchInit()`
void SIFrameLowering::emitEntryFunctionFlatScratchInit(
// and return the FlatScratchInit register that was used
Register SIFrameLowering::emitEntryFunctionFlatScratchInit(
MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
const DebugLoc &DL, Register ScratchWaveOffsetReg) const {
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
Expand All @@ -399,6 +400,7 @@ void SIFrameLowering::emitEntryFunctionFlatScratchInit(

Register FlatScrInitLo;
Register FlatScrInitHi;
Register FlatScratchInitReg;

if (ST.isAmdPalOS()) {
// Extract the scratch offset from the descriptor in the GIT
Expand All @@ -408,7 +410,6 @@ void SIFrameLowering::emitEntryFunctionFlatScratchInit(

// Find unused reg to load flat scratch init into
MachineRegisterInfo &MRI = MF.getRegInfo();
Register FlatScrInit = AMDGPU::NoRegister;
ArrayRef<MCPhysReg> AllSGPR64s = TRI->getAllSGPR64(MF);
unsigned NumPreloaded = (MFI->getNumPreloadedSGPRs() + 1) / 2;
AllSGPR64s = AllSGPR64s.slice(
Expand All @@ -417,16 +418,28 @@ void SIFrameLowering::emitEntryFunctionFlatScratchInit(
for (MCPhysReg Reg : AllSGPR64s) {
if (LiveUnits.available(Reg) && !MRI.isReserved(Reg) &&
MRI.isAllocatable(Reg) && !TRI->isSubRegisterEq(Reg, GITPtrLoReg)) {
FlatScrInit = Reg;
FlatScratchInitReg = Reg;
break;
}
}
assert(FlatScrInit && "Failed to find free register for scratch init");

FlatScrInitLo = TRI->getSubReg(FlatScrInit, AMDGPU::sub0);
FlatScrInitHi = TRI->getSubReg(FlatScrInit, AMDGPU::sub1);
} else {
FlatScratchInitReg =
MFI->getPreloadedReg(AMDGPUFunctionArgInfo::FLAT_SCRATCH_INIT);

MachineRegisterInfo &MRI = MF.getRegInfo();
MRI.addLiveIn(FlatScratchInitReg);
MBB.addLiveIn(FlatScratchInitReg);
}

assert(FlatScratchInitReg && "Failed to find free register for scratch init");

FlatScrInitLo = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub0);
FlatScrInitHi = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub1);

if (ST.isAmdPalOS()) {

buildGitPtr(MBB, I, DL, TII, FlatScrInit);
buildGitPtr(MBB, I, DL, TII, FlatScratchInitReg);

// We now have the GIT ptr - now get the scratch descriptor from the entry
// at offset 0 (or offset 16 for a compute shader).
Expand All @@ -441,29 +454,18 @@ void SIFrameLowering::emitEntryFunctionFlatScratchInit(
MF.getFunction().getCallingConv() == CallingConv::AMDGPU_CS ? 16 : 0;
const GCNSubtarget &Subtarget = MF.getSubtarget<GCNSubtarget>();
unsigned EncodedOffset = AMDGPU::convertSMRDOffsetUnits(Subtarget, Offset);
BuildMI(MBB, I, DL, LoadDwordX2, FlatScrInit)
.addReg(FlatScrInit)
BuildMI(MBB, I, DL, LoadDwordX2, FlatScratchInitReg)
.addReg(FlatScratchInitReg)
.addImm(EncodedOffset) // offset
.addImm(0) // cpol
.addMemOperand(MMO);

// Mask the offset in [47:0] of the descriptor
const MCInstrDesc &SAndB32 = TII->get(AMDGPU::S_AND_B32);
auto And = BuildMI(MBB, I, DL, SAndB32, FlatScrInitHi)
.addReg(FlatScrInitHi)
.addImm(0xffff);
.addReg(FlatScrInitHi)
.addImm(0xffff);
And->getOperand(3).setIsDead(); // Mark SCC as dead.
} else {
Register FlatScratchInitReg =
MFI->getPreloadedReg(AMDGPUFunctionArgInfo::FLAT_SCRATCH_INIT);
assert(FlatScratchInitReg);

MachineRegisterInfo &MRI = MF.getRegInfo();
MRI.addLiveIn(FlatScratchInitReg);
MBB.addLiveIn(FlatScratchInitReg);

FlatScrInitLo = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub0);
FlatScrInitHi = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub1);
}

// Do a 64-bit pointer add.
Expand All @@ -486,20 +488,21 @@ void SIFrameLowering::emitEntryFunctionFlatScratchInit(
addReg(FlatScrInitHi).
addImm(int16_t(AMDGPU::Hwreg::ID_FLAT_SCR_HI |
(31 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_)));
return;
return FlatScratchInitReg;
}

// For GFX9.
assert(ST.getGeneration() == AMDGPUSubtarget::GFX9);

BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_U32), AMDGPU::FLAT_SCR_LO)
.addReg(FlatScrInitLo)
.addReg(ScratchWaveOffsetReg);
.addReg(FlatScrInitLo)
.addReg(ScratchWaveOffsetReg);
auto Addc = BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADDC_U32),
AMDGPU::FLAT_SCR_HI)
.addReg(FlatScrInitHi)
.addImm(0);
Addc->getOperand(3).setIsDead(); // Mark SCC as dead.

return;
return AMDGPU::FLAT_SCR;
}

assert(ST.getGeneration() < AMDGPUSubtarget::GFX9);
Expand All @@ -520,6 +523,7 @@ void SIFrameLowering::emitEntryFunctionFlatScratchInit(
.addReg(FlatScrInitLo, RegState::Kill)
.addImm(8);
LShr->getOperand(3).setIsDead(); // Mark SCC as dead.
return AMDGPU::FLAT_SCR;
}

// Note SGPRSpill stack IDs should only be used for SGPR spilling to VGPRs, not
Expand Down Expand Up @@ -611,11 +615,15 @@ void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF,
const SIInstrInfo *TII = ST.getInstrInfo();
const SIRegisterInfo *TRI = &TII->getRegisterInfo();
MachineRegisterInfo &MRI = MF.getRegInfo();
const Function &F = MF.getFunction();
MachineFrameInfo &FrameInfo = MF.getFrameInfo();

assert(MFI->isEntryFunction());

bool NeedsFlatScratchInit =
MFI->getUserSGPRInfo().hasFlatScratchInit() &&
(MRI.isPhysRegUsed(AMDGPU::FLAT_SCR) || FrameInfo.hasCalls() ||
(!allStackObjectsAreDead(FrameInfo) && ST.enableFlatScratch()));

Register PreloadedScratchWaveOffsetReg = MFI->getPreloadedReg(
AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);

Expand All @@ -641,7 +649,7 @@ void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF,
// Now that we have fixed the reserved SRSRC we need to locate the
// (potentially) preloaded SRSRC.
Register PreloadedScratchRsrcReg;
if (ST.isAmdHsaOrMesa(F)) {
if (ST.isAmdHsaOrMesa(MF.getFunction()) && !NeedsFlatScratchInit) {
PreloadedScratchRsrcReg =
MFI->getPreloadedReg(AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_BUFFER);
if (ScratchRsrcReg && PreloadedScratchRsrcReg) {
Expand Down Expand Up @@ -697,33 +705,30 @@ void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF,
BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), FPReg).addImm(0);
}

bool NeedsFlatScratchInit =
MFI->getUserSGPRInfo().hasFlatScratchInit() &&
(MRI.isPhysRegUsed(AMDGPU::FLAT_SCR) || FrameInfo.hasCalls() ||
(!allStackObjectsAreDead(FrameInfo) && ST.enableFlatScratch()));

if ((NeedsFlatScratchInit || ScratchRsrcReg) &&
PreloadedScratchWaveOffsetReg && !ST.flatScratchIsArchitected()) {
MRI.addLiveIn(PreloadedScratchWaveOffsetReg);
MBB.addLiveIn(PreloadedScratchWaveOffsetReg);
}

Register FlatScratchInit;
if (NeedsFlatScratchInit) {
emitEntryFunctionFlatScratchInit(MF, MBB, I, DL, ScratchWaveOffsetReg);
FlatScratchInit =
emitEntryFunctionFlatScratchInit(MF, MBB, I, DL, ScratchWaveOffsetReg);
}

if (ScratchRsrcReg) {
emitEntryFunctionScratchRsrcRegSetup(MF, MBB, I, DL,
PreloadedScratchRsrcReg,
ScratchRsrcReg, ScratchWaveOffsetReg);
emitEntryFunctionScratchRsrcRegSetup(
MF, MBB, I, DL, FlatScratchInit, ScratchRsrcReg,
PreloadedScratchRsrcReg, ScratchWaveOffsetReg);
}
}

// Emit scratch RSRC setup code, assuming `ScratchRsrcReg != AMDGPU::NoReg`
void SIFrameLowering::emitEntryFunctionScratchRsrcRegSetup(
MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
const DebugLoc &DL, Register PreloadedScratchRsrcReg,
Register ScratchRsrcReg, Register ScratchWaveOffsetReg) const {
const DebugLoc &DL, Register FlatScratchInit, Register ScratchRsrcReg,
Register PreloadedScratchRsrcReg, Register ScratchWaveOffsetReg) const {

const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
const SIInstrInfo *TII = ST.getInstrInfo();
Expand Down Expand Up @@ -771,7 +776,8 @@ void SIFrameLowering::emitEntryFunctionScratchRsrcRegSetup(
.addImm(21)
.addReg(Rsrc03);
}
} else if (ST.isMesaGfxShader(Fn) || !PreloadedScratchRsrcReg) {
} else if (ST.isMesaGfxShader(Fn) ||
(!FlatScratchInit.isValid() && !PreloadedScratchRsrcReg)) {
assert(!ST.isAmdHsaOrMesa(Fn));
const MCInstrDesc &SMovB32 = TII->get(AMDGPU::S_MOV_B32);

Expand Down Expand Up @@ -830,6 +836,26 @@ void SIFrameLowering::emitEntryFunctionScratchRsrcRegSetup(
.addImm(Rsrc23 >> 32)
.addReg(ScratchRsrcReg, RegState::ImplicitDefine);
} else if (ST.isAmdHsaOrMesa(Fn)) {

if (FlatScratchInit) {
const MCInstrDesc &SMovB32 = TII->get(AMDGPU::S_MOV_B32);
Register Lo_32 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub2);
Register Hi_32 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub3);
uint64_t Rsrc23 = TII->getScratchRsrcWords23();
I = BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY),
TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0_sub1))
.addReg(FlatScratchInit)
.addReg(ScratchRsrcReg, RegState::ImplicitDefine);
BuildMI(MBB, I, DL, SMovB32, Lo_32)
.addImm(Rsrc23 & 0xffffffff)
.addReg(ScratchRsrcReg, RegState::ImplicitDefine);

BuildMI(MBB, I, DL, SMovB32, Hi_32)
.addImm(Rsrc23 >> 32)
.addReg(ScratchRsrcReg, RegState::ImplicitDefine);
return;
}

assert(PreloadedScratchRsrcReg);

if (ScratchRsrcReg != PreloadedScratchRsrcReg) {
Expand Down
14 changes: 7 additions & 7 deletions llvm/lib/Target/AMDGPU/SIFrameLowering.h
Original file line number Diff line number Diff line change
Expand Up @@ -67,19 +67,19 @@ class SIFrameLowering final : public AMDGPUFrameLowering {
MachineBasicBlock::iterator MI) const override;

private:
void emitEntryFunctionFlatScratchInit(MachineFunction &MF,
MachineBasicBlock &MBB,
MachineBasicBlock::iterator I,
const DebugLoc &DL,
Register ScratchWaveOffsetReg) const;
Register
emitEntryFunctionFlatScratchInit(MachineFunction &MF, MachineBasicBlock &MBB,
MachineBasicBlock::iterator I,
const DebugLoc &DL,
Register ScratchWaveOffsetReg) const;

Register getEntryFunctionReservedScratchRsrcReg(MachineFunction &MF) const;

void emitEntryFunctionScratchRsrcRegSetup(
MachineFunction &MF, MachineBasicBlock &MBB,
MachineBasicBlock::iterator I, const DebugLoc &DL,
Register PreloadedPrivateBufferReg, Register ScratchRsrcReg,
Register ScratchWaveOffsetReg) const;
Register FlatScratchInit, Register ScratchRsrcReg,
Register PreloadedScratchRsrcReg, Register ScratchWaveOffsetReg) const;

public:
bool hasFP(const MachineFunction &MF) const override;
Expand Down
10 changes: 6 additions & 4 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll
Original file line number Diff line number Diff line change
Expand Up @@ -13,10 +13,11 @@ define amdgpu_kernel void @kernel_caller_stack() {
; MUBUF-LABEL: kernel_caller_stack:
; MUBUF: ; %bb.0:
; MUBUF-NEXT: s_add_u32 flat_scratch_lo, s4, s7
; MUBUF-NEXT: s_mov_b32 s2, -1
; MUBUF-NEXT: s_addc_u32 flat_scratch_hi, s5, 0
; MUBUF-NEXT: s_add_u32 s0, s0, s7
; MUBUF-NEXT: s_mov_b32 s3, 0xe00000
; MUBUF-NEXT: s_mov_b32 s32, 0
; MUBUF-NEXT: s_addc_u32 s1, s1, 0
; MUBUF-NEXT: s_mov_b64 s[0:1], flat_scratch
; MUBUF-NEXT: v_mov_b32_e32 v0, 9
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4
; MUBUF-NEXT: v_mov_b32_e32 v0, 10
Expand Down Expand Up @@ -61,9 +62,10 @@ define amdgpu_kernel void @kernel_caller_byval() {
; MUBUF-LABEL: kernel_caller_byval:
; MUBUF: ; %bb.0:
; MUBUF-NEXT: s_add_u32 flat_scratch_lo, s4, s7
; MUBUF-NEXT: s_mov_b32 s2, -1
; MUBUF-NEXT: s_addc_u32 flat_scratch_hi, s5, 0
; MUBUF-NEXT: s_add_u32 s0, s0, s7
; MUBUF-NEXT: s_addc_u32 s1, s1, 0
; MUBUF-NEXT: s_mov_b32 s3, 0xe00000
; MUBUF-NEXT: s_mov_b64 s[0:1], flat_scratch
; MUBUF-NEXT: v_mov_b32_e32 v0, 0
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:8
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:12
Expand Down
18 changes: 10 additions & 8 deletions llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll
Original file line number Diff line number Diff line change
Expand Up @@ -48,19 +48,20 @@ define amdgpu_kernel void @parent_kernel_missing_inputs() #0 {
; FIXEDABI-SDAG-LABEL: parent_kernel_missing_inputs:
; FIXEDABI-SDAG: ; %bb.0:
; FIXEDABI-SDAG-NEXT: s_add_i32 s4, s4, s9
; FIXEDABI-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8
; FIXEDABI-SDAG-NEXT: s_mov_b32 s2, -1
; FIXEDABI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 10, v1
; FIXEDABI-SDAG-NEXT: s_add_u32 s0, s0, s9
; FIXEDABI-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s5
; FIXEDABI-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8
; FIXEDABI-SDAG-NEXT: s_mov_b32 s3, 0x11e80000
; FIXEDABI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; FIXEDABI-SDAG-NEXT: v_or_b32_e32 v0, v0, v1
; FIXEDABI-SDAG-NEXT: s_addc_u32 s1, s1, 0
; FIXEDABI-SDAG-NEXT: s_mov_b64 s[0:1], flat_scratch
; FIXEDABI-SDAG-NEXT: s_mov_b32 s14, s8
; FIXEDABI-SDAG-NEXT: v_or_b32_e32 v31, v0, v2
; FIXEDABI-SDAG-NEXT: s_mov_b64 s[8:9], 0
; FIXEDABI-SDAG-NEXT: s_mov_b32 s12, s6
; FIXEDABI-SDAG-NEXT: s_mov_b32 s13, s7
; FIXEDABI-SDAG-NEXT: s_mov_b32 s32, 0
; FIXEDABI-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s5
; FIXEDABI-SDAG-NEXT: s_getpc_b64 s[4:5]
; FIXEDABI-SDAG-NEXT: s_add_u32 s4, s4, requires_all_inputs@rel32@lo+4
; FIXEDABI-SDAG-NEXT: s_addc_u32 s5, s5, requires_all_inputs@rel32@hi+12
Expand All @@ -70,19 +71,20 @@ define amdgpu_kernel void @parent_kernel_missing_inputs() #0 {
; FIXEDABI-GISEL-LABEL: parent_kernel_missing_inputs:
; FIXEDABI-GISEL: ; %bb.0:
; FIXEDABI-GISEL-NEXT: s_add_i32 s4, s4, s9
; FIXEDABI-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8
; FIXEDABI-GISEL-NEXT: s_mov_b32 s2, -1
; FIXEDABI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 10, v1
; FIXEDABI-GISEL-NEXT: s_add_u32 s0, s0, s9
; FIXEDABI-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s5
; FIXEDABI-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8
; FIXEDABI-GISEL-NEXT: s_mov_b32 s3, 0x11e80000
; FIXEDABI-GISEL-NEXT: v_or_b32_e32 v0, v0, v1
; FIXEDABI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 20, v2
; FIXEDABI-GISEL-NEXT: s_addc_u32 s1, s1, 0
; FIXEDABI-GISEL-NEXT: s_mov_b64 s[0:1], flat_scratch
; FIXEDABI-GISEL-NEXT: s_mov_b32 s14, s8
; FIXEDABI-GISEL-NEXT: v_or_b32_e32 v31, v0, v1
; FIXEDABI-GISEL-NEXT: s_mov_b64 s[8:9], 0
; FIXEDABI-GISEL-NEXT: s_mov_b32 s12, s6
; FIXEDABI-GISEL-NEXT: s_mov_b32 s13, s7
; FIXEDABI-GISEL-NEXT: s_mov_b32 s32, 0
; FIXEDABI-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s5
; FIXEDABI-GISEL-NEXT: s_getpc_b64 s[4:5]
; FIXEDABI-GISEL-NEXT: s_add_u32 s4, s4, requires_all_inputs@rel32@lo+4
; FIXEDABI-GISEL-NEXT: s_addc_u32 s5, s5, requires_all_inputs@rel32@hi+12
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,9 @@ define amdgpu_kernel void @blender_no_live_segment_at_def_error(<4 x float> %ext
; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
; CHECK-NEXT: s_load_dwordx8 s[36:43], s[6:7], 0x0
; CHECK-NEXT: s_add_u32 s0, s0, s15
; CHECK-NEXT: s_addc_u32 s1, s1, 0
; CHECK-NEXT: s_mov_b32 s2, -1
; CHECK-NEXT: s_mov_b32 s3, 0x31c16000
; CHECK-NEXT: s_mov_b64 s[0:1], s[10:11]
; CHECK-NEXT: s_mov_b64 s[10:11], s[8:9]
; CHECK-NEXT: s_mov_b32 s8, 0
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,13 +5,14 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
; GFX90A-LABEL: name: f1
; GFX90A: bb.0.bb:
; GFX90A-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000)
; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr15, $sgpr10_sgpr11
; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr15, $sgpr10_sgpr11
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: $sgpr32 = S_MOV_B32 0
; GFX90A-NEXT: $flat_scr_lo = S_ADD_U32 $sgpr10, $sgpr15, implicit-def $scc
; GFX90A-NEXT: $flat_scr_hi = S_ADDC_U32 $sgpr11, 0, implicit-def dead $scc, implicit $scc
; GFX90A-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, $sgpr15, implicit-def $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3
; GFX90A-NEXT: $sgpr1 = S_ADDC_U32 $sgpr1, 0, implicit-def dead $scc, implicit $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3
; GFX90A-NEXT: $sgpr2 = S_MOV_B32 4294967295, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3
; GFX90A-NEXT: $sgpr3 = S_MOV_B32 14680064, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3
; GFX90A-NEXT: $sgpr0_sgpr1 = COPY $flat_scr, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3
; GFX90A-NEXT: renamable $sgpr10_sgpr11 = COPY $sgpr8_sgpr9
; GFX90A-NEXT: renamable $vgpr31 = COPY $vgpr0, implicit $exec
; GFX90A-NEXT: renamable $sgpr33 = S_LOAD_DWORD_IMM renamable $sgpr6_sgpr7, 24, 0 :: (dereferenceable invariant load (s32) from %ir.arg4.kernarg.offset.align.down, align 8, addrspace 4)
Expand Down
Loading