Skip to content

[AMDGPU] Convert PrologEpilogSGPRSpills from DenseMap to sorted vector #90957

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
May 3, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
32 changes: 24 additions & 8 deletions llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
Original file line number Diff line number Diff line change
Expand Up @@ -522,13 +522,13 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction,
// the serialization easier.
ReservedRegSet WWMReservedRegs;

using PrologEpilogSGPRSpillsMap =
DenseMap<Register, PrologEpilogSGPRSaveRestoreInfo>;
using PrologEpilogSGPRSpill =
std::pair<Register, PrologEpilogSGPRSaveRestoreInfo>;
// To track the SGPR spill method used for a CSR SGPR register during
// frame lowering. Even though the SGPR spills are handled during
// SILowerSGPRSpills pass, some special handling is needed later during the
// PrologEpilogInserter.
PrologEpilogSGPRSpillsMap PrologEpilogSGPRSpills;
SmallVector<PrologEpilogSGPRSpill, 3> PrologEpilogSGPRSpills;

// To save/restore EXEC MASK around WWM spills and copies.
Register SGPRForEXECCopy;
Expand Down Expand Up @@ -596,7 +596,11 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction,
const WWMSpillsMap &getWWMSpills() const { return WWMSpills; }
const ReservedRegSet &getWWMReservedRegs() const { return WWMReservedRegs; }

const PrologEpilogSGPRSpillsMap &getPrologEpilogSGPRSpills() const {
ArrayRef<PrologEpilogSGPRSpill> getPrologEpilogSGPRSpills() const {
assert(
is_sorted(PrologEpilogSGPRSpills, [](const auto &LHS, const auto &RHS) {
return LHS.first < RHS.first;
}));
return PrologEpilogSGPRSpills;
}

Expand All @@ -606,18 +610,29 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction,

void addToPrologEpilogSGPRSpills(Register Reg,
PrologEpilogSGPRSaveRestoreInfo SI) {
PrologEpilogSGPRSpills.insert(std::make_pair(Reg, SI));
assert(!hasPrologEpilogSGPRSpillEntry(Reg));

// Insert a new entry in the right place to keep the vector in sorted order.
// This should be cheap since the vector is expected to be very short.
PrologEpilogSGPRSpills.insert(
upper_bound(
PrologEpilogSGPRSpills, Reg,
[](const auto &LHS, const auto &RHS) { return LHS < RHS.first; }),
std::make_pair(Reg, SI));
}

// Check whether an entry for \p Reg exists in PrologEpilogSGPRSpills. Returns
// true if it does and false otherwise.
bool hasPrologEpilogSGPRSpillEntry(Register Reg) const {
return PrologEpilogSGPRSpills.contains(Reg);
auto I = find_if(PrologEpilogSGPRSpills,
[&Reg](const auto &Spill) { return Spill.first == Reg; });
return I != PrologEpilogSGPRSpills.end();
}

// Get the scratch SGPR if allocated to save/restore \p Reg.
Register getScratchSGPRCopyDstReg(Register Reg) const {
auto I = PrologEpilogSGPRSpills.find(Reg);
auto I = find_if(PrologEpilogSGPRSpills,
[&Reg](const auto &Spill) { return Spill.first == Reg; });
if (I != PrologEpilogSGPRSpills.end() &&
I->second.getKind() == SGPRSaveKind::COPY_TO_SCRATCH_SGPR)
return I->second.getReg();
Expand Down Expand Up @@ -646,7 +661,8 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction,

const PrologEpilogSGPRSaveRestoreInfo &
getPrologEpilogSGPRSaveRestoreInfo(Register Reg) const {
auto I = PrologEpilogSGPRSpills.find(Reg);
auto I = find_if(PrologEpilogSGPRSpills,
[&Reg](const auto &Spill) { return Spill.first == Reg; });
assert(I != PrologEpilogSGPRSpills.end());

return I->second;
Expand Down
20 changes: 10 additions & 10 deletions llvm/test/CodeGen/AMDGPU/preserve-wwm-copy-dst-reg.ll
Original file line number Diff line number Diff line change
Expand Up @@ -32,11 +32,11 @@ define void @preserve_wwm_copy_dstreg(ptr %parg0, ptr %parg1, ptr %parg2) #0 {
; GFX906-NEXT: v_writelane_b32 v2, s24, 5
; GFX906-NEXT: s_mov_b64 s[26:27], s[10:11]
; GFX906-NEXT: v_writelane_b32 v2, s26, 6
; GFX906-NEXT: v_writelane_b32 v41, s34, 2
; GFX906-NEXT: v_writelane_b32 v41, s16, 4
; GFX906-NEXT: v_writelane_b32 v2, s27, 7
; GFX906-NEXT: v_writelane_b32 v41, s35, 3
; GFX906-NEXT: v_writelane_b32 v41, s34, 2
; GFX906-NEXT: v_writelane_b32 v2, s8, 8
; GFX906-NEXT: v_writelane_b32 v41, s16, 4
; GFX906-NEXT: v_writelane_b32 v41, s35, 3
; GFX906-NEXT: v_writelane_b32 v2, s9, 9
; GFX906-NEXT: v_writelane_b32 v41, s30, 0
; GFX906-NEXT: v_writelane_b32 v2, s4, 10
Expand Down Expand Up @@ -340,9 +340,9 @@ define void @preserve_wwm_copy_dstreg(ptr %parg0, ptr %parg1, ptr %parg2) #0 {
; GFX906-NEXT: v_readlane_b32 s31, v41, 1
; GFX906-NEXT: v_readlane_b32 s30, v41, 0
; GFX906-NEXT: ; kill: killed $vgpr40
; GFX906-NEXT: v_readlane_b32 s4, v41, 4
; GFX906-NEXT: v_readlane_b32 s34, v41, 2
; GFX906-NEXT: v_readlane_b32 s35, v41, 3
; GFX906-NEXT: v_readlane_b32 s4, v41, 4
; GFX906-NEXT: s_waitcnt vmcnt(0)
; GFX906-NEXT: flat_store_dwordx4 v[0:1], v[30:33] offset:112
; GFX906-NEXT: s_waitcnt vmcnt(0)
Expand Down Expand Up @@ -383,12 +383,12 @@ define void @preserve_wwm_copy_dstreg(ptr %parg0, ptr %parg1, ptr %parg2) #0 {
; GFX908-NEXT: s_mov_b64 exec, -1
; GFX908-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:152 ; 4-byte Folded Spill
; GFX908-NEXT: s_mov_b64 exec, s[18:19]
; GFX908-NEXT: v_mov_b32_e32 v3, s16
; GFX908-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:168 ; 4-byte Folded Spill
; GFX908-NEXT: v_mov_b32_e32 v3, s34
; GFX908-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:160 ; 4-byte Folded Spill
; GFX908-NEXT: v_mov_b32_e32 v3, s35
; GFX908-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:164 ; 4-byte Folded Spill
; GFX908-NEXT: v_mov_b32_e32 v3, s16
; GFX908-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:168 ; 4-byte Folded Spill
; GFX908-NEXT: s_addk_i32 s32, 0x2c00
; GFX908-NEXT: s_mov_b64 s[16:17], exec
; GFX908-NEXT: s_mov_b64 exec, 1
Expand Down Expand Up @@ -753,16 +753,16 @@ define void @preserve_wwm_copy_dstreg(ptr %parg0, ptr %parg1, ptr %parg2) #0 {
; GFX908-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:172
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: s_mov_b64 exec, s[4:5]
; GFX908-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:160 ; 4-byte Folded Reload
; GFX908-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:168 ; 4-byte Folded Reload
; GFX908-NEXT: ; kill: killed $vgpr40
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: v_readfirstlane_b32 s4, v0
; GFX908-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:160 ; 4-byte Folded Reload
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: v_readfirstlane_b32 s34, v0
; GFX908-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:164 ; 4-byte Folded Reload
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: v_readfirstlane_b32 s35, v0
; GFX908-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:168 ; 4-byte Folded Reload
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: v_readfirstlane_b32 s4, v0
; GFX908-NEXT: s_xor_saveexec_b64 s[6:7], -1
; GFX908-NEXT: buffer_load_dword v33, off, s[0:3], s33 offset:148 ; 4-byte Folded Reload
; GFX908-NEXT: buffer_load_dword v2, off, s[0:3], s33 offset:156 ; 4-byte Folded Reload
Expand Down
16 changes: 7 additions & 9 deletions llvm/test/CodeGen/AMDGPU/stack-realign.ll
Original file line number Diff line number Diff line change
Expand Up @@ -162,21 +162,21 @@ define void @func_call_align1024_bp_gets_vgpr_spill(<32 x i32> %a, i32 %b) #0 {
; GCN-NEXT: s_or_saveexec_b64 s[18:19], -1
; GCN-NEXT: buffer_store_dword [[VGPR_REG:v[0-9]+]], off, s[0:3], s33 offset:1028 ; 4-byte Folded Spill
; GCN-NEXT: s_mov_b64 exec, s[18:19]
; GCN-NEXT: v_writelane_b32 [[VGPR_REG]], [[FP_SCRATCH_COPY]], 2
; GCN-NEXT: v_mov_b32_e32 v32, 0
; GCN-DAG: v_writelane_b32 [[VGPR_REG]], s34, 3
; GCN: s_mov_b32 s34, s32
; GCN: buffer_store_dword v32, off, s[0:3], s33 offset:1024
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_load_dword v{{[0-9]+}}, off, s[0:3], s34
; GCN-DAG: s_add_i32 s32, s32, 0x30000
; GCN: v_writelane_b32 [[VGPR_REG]], [[FP_SCRATCH_COPY]], 2
; GCN: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s32
; GCN: s_swappc_b64 s[30:31],

; GCN: v_readlane_b32 s31, [[VGPR_REG]], 1
; GCN: v_readlane_b32 s30, [[VGPR_REG]], 0
; GCN-NEXT: v_readlane_b32 s34, [[VGPR_REG]], 3
; GCN-NEXT: v_readlane_b32 [[FP_SCRATCH_COPY:s[0-9]+]], [[VGPR_REG]], 2
; GCN-NEXT: v_readlane_b32 s34, [[VGPR_REG]], 3
; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1
; GCN-NEXT: buffer_load_dword [[VGPR_REG]], off, s[0:3], s33 offset:1028 ; 4-byte Folded Reload
; GCN-NEXT: s_mov_b64 exec, s[6:7]
Expand Down Expand Up @@ -265,9 +265,9 @@ define void @no_free_regs_spill_bp_to_memory(<32 x i32> %a, i32 %b) #5 {
; GCN: s_mov_b32 [[FP_SCRATCH_COPY:s[0-9]+]], s33
; GCN: s_xor_saveexec_b64 s[6:7], -1
; GCN: buffer_store_dword v39, off, s[0:3], s33
; GCN: v_mov_b32_e32 v0, s34
; GCN: buffer_store_dword v0, off, s[0:3], s33
; GCN: v_mov_b32_e32 v0, [[FP_SCRATCH_COPY]]
; GCN: buffer_store_dword v0, off, s[0:3], s33
; GCN: v_mov_b32_e32 v0, s34
; GCN-DAG: buffer_store_dword v0, off, s[0:3], s33
%local_val = alloca i32, align 128, addrspace(5)
store volatile i32 %b, ptr addrspace(5) %local_val, align 128
Expand Down Expand Up @@ -304,13 +304,11 @@ define void @spill_bp_to_memory_scratch_reg_needed_mubuf_offset(<32 x i32> %a, i
; GCN-NEXT: s_add_i32 s5, s33, 0x42100
; GCN-NEXT: buffer_store_dword v39, off, s[0:3], s5 ; 4-byte Folded Spill
; GCN-NEXT: s_mov_b64 exec, s[6:7]
; GCN-NEXT: v_mov_b32_e32 v0, s34
; GCN-NOT: v_mov_b32_e32 v0, 0x108c
; GCN-NEXT: s_add_i32 s5, s33, 0x42300
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s5 ; 4-byte Folded Spill
; GCN-NEXT: v_mov_b32_e32 v0, [[FP_SCRATCH_COPY]]
; GCN-NOT: v_mov_b32_e32 v0, 0x1088
; GCN-NEXT: s_add_i32 s5, s33, 0x42200
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s5 ; 4-byte Folded Spill
; GCN-NEXT: v_mov_b32_e32 v0, s34
; GCN-NEXT: s_add_i32 s5, s33, 0x42300
; GCN-NEXT: s_mov_b32 s34, s32
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s5 ; 4-byte Folded Spill
%local_val = alloca i32, align 128, addrspace(5)
Expand Down
4 changes: 2 additions & 2 deletions llvm/test/CodeGen/AMDGPU/whole-wave-register-copy.ll
Original file line number Diff line number Diff line change
Expand Up @@ -18,9 +18,9 @@ define void @vector_reg_liverange_split() #0 {
; GFX90A-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX90A-NEXT: buffer_store_dword a32, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
; GFX90A-NEXT: s_mov_b64 exec, s[18:19]
; GFX90A-NEXT: v_writelane_b32 v40, s16, 4
; GFX90A-NEXT: v_writelane_b32 v40, s28, 2
; GFX90A-NEXT: v_writelane_b32 v40, s29, 3
Comment on lines +21 to 23
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Note that these spills are now in ascending order of SGPR number. The same applies to the other tests.

; GFX90A-NEXT: v_writelane_b32 v40, s16, 4
; GFX90A-NEXT: ; implicit-def: $vgpr0 : SGPR spill to VGPR lane
; GFX90A-NEXT: v_writelane_b32 v40, s30, 0
; GFX90A-NEXT: s_addk_i32 s32, 0x400
Expand Down Expand Up @@ -48,9 +48,9 @@ define void @vector_reg_liverange_split() #0 {
; GFX90A-NEXT: v_readlane_b32 s31, v40, 1
; GFX90A-NEXT: v_readlane_b32 s30, v40, 0
; GFX90A-NEXT: ; kill: killed $vgpr0
; GFX90A-NEXT: v_readlane_b32 s4, v40, 4
; GFX90A-NEXT: v_readlane_b32 s28, v40, 2
; GFX90A-NEXT: v_readlane_b32 s29, v40, 3
; GFX90A-NEXT: v_readlane_b32 s4, v40, 4
; GFX90A-NEXT: s_xor_saveexec_b64 s[6:7], -1
; GFX90A-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
; GFX90A-NEXT: s_mov_b64 exec, -1
Expand Down
8 changes: 4 additions & 4 deletions llvm/test/CodeGen/AMDGPU/whole-wave-register-spill.ll
Original file line number Diff line number Diff line change
Expand Up @@ -24,9 +24,9 @@ define void @test() #0 {
; GCN-NEXT: s_mov_b64 exec, -1
; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GCN-NEXT: s_mov_b64 exec, s[18:19]
; GCN-NEXT: v_writelane_b32 v40, s16, 4
; GCN-NEXT: v_writelane_b32 v40, s28, 2
; GCN-NEXT: v_writelane_b32 v40, s29, 3
; GCN-NEXT: v_writelane_b32 v40, s16, 4
; GCN-NEXT: ; implicit-def: $vgpr0 : SGPR spill to VGPR lane
; GCN-NEXT: v_writelane_b32 v40, s30, 0
; GCN-NEXT: s_addk_i32 s32, 0x800
Expand Down Expand Up @@ -55,9 +55,9 @@ define void @test() #0 {
; GCN-NEXT: v_readlane_b32 s31, v40, 1
; GCN-NEXT: v_readlane_b32 s30, v40, 0
; GCN-NEXT: ; kill: killed $vgpr1
; GCN-NEXT: v_readlane_b32 s4, v40, 4
; GCN-NEXT: v_readlane_b32 s28, v40, 2
; GCN-NEXT: v_readlane_b32 s29, v40, 3
; GCN-NEXT: v_readlane_b32 s4, v40, 4
; GCN-NEXT: s_xor_saveexec_b64 s[6:7], -1
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload
Expand All @@ -79,9 +79,9 @@ define void @test() #0 {
; GCN-O0-NEXT: s_mov_b64 exec, -1
; GCN-O0-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GCN-O0-NEXT: s_mov_b64 exec, s[18:19]
; GCN-O0-NEXT: v_writelane_b32 v40, s16, 4
; GCN-O0-NEXT: v_writelane_b32 v40, s28, 2
; GCN-O0-NEXT: v_writelane_b32 v40, s29, 3
; GCN-O0-NEXT: v_writelane_b32 v40, s16, 4
; GCN-O0-NEXT: s_add_i32 s32, s32, 0x400
; GCN-O0-NEXT: ; implicit-def: $vgpr0 : SGPR spill to VGPR lane
; GCN-O0-NEXT: v_writelane_b32 v40, s30, 0
Expand Down Expand Up @@ -117,9 +117,9 @@ define void @test() #0 {
; GCN-O0-NEXT: v_readlane_b32 s31, v40, 1
; GCN-O0-NEXT: v_readlane_b32 s30, v40, 0
; GCN-O0-NEXT: ; kill: killed $vgpr0
; GCN-O0-NEXT: v_readlane_b32 s4, v40, 4
; GCN-O0-NEXT: v_readlane_b32 s28, v40, 2
; GCN-O0-NEXT: v_readlane_b32 s29, v40, 3
; GCN-O0-NEXT: v_readlane_b32 s4, v40, 4
; GCN-O0-NEXT: s_xor_saveexec_b64 s[6:7], -1
; GCN-O0-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_mov_b64 exec, -1
Expand Down