Commit 5aea839

[AMDGPU] Switch to backwards scavenging in eliminateFrameIndex
Frame index elimination runs backwards, so we must use backwards scavenging. Otherwise, when a scavenged register is spilled, the scavenger remembers that the register is in use until its restore point, but the backwards walk never reaches that restore point. As a result, in some cases it keeps scavenging different registers instead of reusing the same one.

Differential Revision: https://reviews.llvm.org/D152394
1 parent b03e6e6 commit 5aea839

21 files changed: +1,167 −1,212 lines
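The change in SIRegisterInfo.cpp is the same at every call site: a forward RegScavenger::scavengeRegister call becomes a scavengeRegisterBackwards call. A minimal sketch of that pattern, using a hypothetical helper that is not part of this commit (the argument order mirrors the real call sites in the diff below):

#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/RegisterScavenging.h"

using namespace llvm;

// Hypothetical helper, for illustration only; the real code calls the
// scavenger directly at each frame-index call site.
static Register scavengeTmpReg(RegisterScavenger &RS,
                               const TargetRegisterClass &RC,
                               MachineBasicBlock::iterator MI) {
  // Before: forward scavenging. The scavenged register stays marked as used
  // until its restore point, which the backwards frame-index walk never
  // reaches, so successive frame indices may each grab a fresh register.
  //   Register R = RS.scavengeRegister(&RC, MI, /*SPAdj=*/0);
  //
  // After: backwards scavenging. Liveness is tracked in the direction the
  // walk actually moves, so later frame indices can reuse the same register.
  return RS.scavengeRegisterBackwards(RC, MI, /*RestoreAfter=*/false,
                                      /*SPAdj=*/0);
}

The test updates below show the effect: repeated scratch accesses now reuse one scavenged SGPR (s0) instead of cycling through s1..s13.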

llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp

Lines changed: 18 additions & 13 deletions
@@ -170,7 +170,8 @@ struct SGPRSpillBuilder {
     // a register as actually in use in another lane, so we need to save all
     // used lanes of the chosen VGPR.
     assert(RS && "Cannot spill SGPR to memory without RegScavenger");
-    TmpVGPR = RS->scavengeRegister(&AMDGPU::VGPR_32RegClass, MI, 0, false);
+    TmpVGPR = RS->scavengeRegisterBackwards(AMDGPU::VGPR_32RegClass, MI, false,
+                                            0, false);
 
     // Reserve temporary stack slot
     TmpVGPRIndex = MFI.getScavengeFI(MF.getFrameInfo(), TRI);
@@ -199,7 +200,7 @@ struct SGPRSpillBuilder {
     const TargetRegisterClass &RC =
         IsWave32 ? AMDGPU::SGPR_32RegClass : AMDGPU::SGPR_64RegClass;
     RS->setRegUsed(SuperReg);
-    SavedExecReg = RS->scavengeRegister(&RC, MI, 0, false);
+    SavedExecReg = RS->scavengeRegisterBackwards(RC, MI, false, 0, false);
 
     int64_t VGPRLanes = getPerVGPRData().VGPRLanes;
 
@@ -1591,7 +1592,8 @@ void SIRegisterInfo::buildSpillLoadStore(
   } else if (UseVGPROffset) {
     // FIXME: change to scavengeRegisterBackwards()
     if (!TmpOffsetVGPR) {
-      TmpOffsetVGPR = RS->scavengeRegister(&AMDGPU::VGPR_32RegClass, MI, 0);
+      TmpOffsetVGPR = RS->scavengeRegisterBackwards(AMDGPU::VGPR_32RegClass,
+                                                    MI, false, 0);
       RS->setRegUsed(TmpOffsetVGPR);
     }
   }
@@ -2282,7 +2284,8 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
       const TargetRegisterClass *RC = UseSGPR ? &AMDGPU::SReg_32_XM0RegClass
                                               : &AMDGPU::VGPR_32RegClass;
 
-      Register TmpReg = RS->scavengeRegister(RC, MI, 0, !UseSGPR);
+      Register TmpReg =
+          RS->scavengeRegisterBackwards(*RC, MI, false, 0, !UseSGPR);
       FIOp.setReg(TmpReg);
       FIOp.setIsKill();
 
@@ -2302,8 +2305,8 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
 
       Register TmpSReg =
          UseSGPR ? TmpReg
-                 : RS->scavengeRegister(&AMDGPU::SReg_32_XM0RegClass, MI, 0,
-                                        !UseSGPR);
+                 : RS->scavengeRegisterBackwards(AMDGPU::SReg_32_XM0RegClass,
+                                                 MI, false, 0, !UseSGPR);
 
       // TODO: for flat scratch another attempt can be made with a VGPR index
       // if no SGPRs can be scavenged.
@@ -2377,8 +2380,9 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
                                             : &AMDGPU::VGPR_32RegClass;
       bool IsCopy = MI->getOpcode() == AMDGPU::V_MOV_B32_e32 ||
                     MI->getOpcode() == AMDGPU::V_MOV_B32_e64;
-      Register ResultReg = IsCopy ? MI->getOperand(0).getReg()
-                                  : RS->scavengeRegister(RC, MI, 0);
+      Register ResultReg =
+          IsCopy ? MI->getOperand(0).getReg()
+                 : RS->scavengeRegisterBackwards(*RC, MI, false, 0);
 
       int64_t Offset = FrameInfo.getObjectOffset(Index);
       if (Offset == 0) {
@@ -2391,8 +2395,8 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
         if (IsSALU && !LiveSCC)
           Shift.getInstr()->getOperand(3).setIsDead(); // Mark SCC as dead.
         if (IsSALU && LiveSCC) {
-          Register NewDest =
-              RS->scavengeRegister(&AMDGPU::SReg_32RegClass, Shift, 0);
+          Register NewDest = RS->scavengeRegisterBackwards(
+              AMDGPU::SReg_32RegClass, Shift, false, 0);
           BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
                   NewDest)
               .addReg(ResultReg);
@@ -2446,8 +2450,8 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
 
           // We may have 1 free scratch SGPR even though a carry out is
          // unavailable. Only one additional mov is needed.
-          Register TmpScaledReg =
-              RS->scavengeRegister(&AMDGPU::SReg_32_XM0RegClass, MI, 0, false);
+          Register TmpScaledReg = RS->scavengeRegisterBackwards(
+              AMDGPU::SReg_32_XM0RegClass, MI, false, 0, false);
           Register ScaledReg = TmpScaledReg.isValid() ? TmpScaledReg : FrameReg;
 
           BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_LSHR_B32), ScaledReg)
@@ -2512,7 +2516,8 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
 
       FIOp.ChangeToImmediate(Offset);
       if (!TII->isImmOperandLegal(*MI, FIOperandNum, FIOp)) {
-        Register TmpReg = RS->scavengeRegister(&AMDGPU::VGPR_32RegClass, MI, 0);
+        Register TmpReg = RS->scavengeRegisterBackwards(AMDGPU::VGPR_32RegClass,
+                                                        MI, false, 0);
         BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpReg)
             .addImm(Offset);
         FIOp.ChangeToRegister(TmpReg, false, false, true);

llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll

Lines changed: 23 additions & 41 deletions
@@ -159,50 +159,32 @@ define amdgpu_kernel void @kernel_caller_byval() {
 ; FLATSCR-NEXT: v_mov_b32_e32 v0, 0
 ; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s1, 0
 ; FLATSCR-NEXT: v_mov_b32_e32 v1, 0
-; FLATSCR-NEXT: s_mov_b32 s8, 0
-; FLATSCR-NEXT: s_mov_b32 s12, 0
-; FLATSCR-NEXT: s_mov_b32 s11, 0
-; FLATSCR-NEXT: s_mov_b32 s10, 0
-; FLATSCR-NEXT: s_mov_b32 s9, 0
-; FLATSCR-NEXT: s_mov_b32 s13, 0
-; FLATSCR-NEXT: s_mov_b32 s7, 0
-; FLATSCR-NEXT: s_mov_b32 s5, 0
-; FLATSCR-NEXT: s_mov_b32 s3, 0
-; FLATSCR-NEXT: s_mov_b32 s1, 0
 ; FLATSCR-NEXT: s_mov_b32 s0, 0
-; FLATSCR-NEXT: s_mov_b32 s2, 0
-; FLATSCR-NEXT: s_mov_b32 s4, 0
-; FLATSCR-NEXT: s_mov_b32 s6, 0
-; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s8 offset:8
-; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s12 offset:16
-; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s11 offset:24
-; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s10 offset:32
-; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s9 offset:40
-; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s13 offset:48
-; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s7 offset:56
-; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s5 offset:64
-; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s3 offset:72
-; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s1 offset:80
+; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:8
+; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:16
+; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:24
+; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:32
+; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:40
+; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:48
+; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:56
+; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:64
+; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:72
+; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:80
 ; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:88
-; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s2 offset:96
-; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s4 offset:104
-; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s6 offset:112
-; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s8 offset:120
-; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s12 offset:128
-; FLATSCR-NEXT: scratch_load_dwordx2 v[0:1], off, s11 offset:8
+; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:96
+; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:104
+; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:112
+; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:120
+; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:128
+; FLATSCR-NEXT: scratch_load_dwordx2 v[0:1], off, s0 offset:8
 ; FLATSCR-NEXT: s_nop 0
-; FLATSCR-NEXT: scratch_load_dwordx2 v[2:3], off, s10 offset:16
-; FLATSCR-NEXT: scratch_load_dwordx2 v[4:5], off, s9 offset:24
-; FLATSCR-NEXT: s_mov_b32 s37, 0
-; FLATSCR-NEXT: scratch_load_dwordx2 v[6:7], off, s37 offset:32
-; FLATSCR-NEXT: s_mov_b32 s36, 0
-; FLATSCR-NEXT: scratch_load_dwordx2 v[8:9], off, s36 offset:40
-; FLATSCR-NEXT: s_mov_b32 s35, 0
-; FLATSCR-NEXT: scratch_load_dwordx2 v[10:11], off, s35 offset:48
-; FLATSCR-NEXT: s_mov_b32 s34, 0
-; FLATSCR-NEXT: scratch_load_dwordx2 v[12:13], off, s34 offset:56
-; FLATSCR-NEXT: s_mov_b32 s33, 0
-; FLATSCR-NEXT: scratch_load_dwordx2 v[14:15], off, s33 offset:64
+; FLATSCR-NEXT: scratch_load_dwordx2 v[2:3], off, s0 offset:16
+; FLATSCR-NEXT: scratch_load_dwordx2 v[4:5], off, s0 offset:24
+; FLATSCR-NEXT: scratch_load_dwordx2 v[6:7], off, s0 offset:32
+; FLATSCR-NEXT: scratch_load_dwordx2 v[8:9], off, s0 offset:40
+; FLATSCR-NEXT: scratch_load_dwordx2 v[10:11], off, s0 offset:48
+; FLATSCR-NEXT: scratch_load_dwordx2 v[12:13], off, s0 offset:56
+; FLATSCR-NEXT: scratch_load_dwordx2 v[14:15], off, s0 offset:64
 ; FLATSCR-NEXT: s_movk_i32 s32, 0x50
 ; FLATSCR-NEXT: s_getpc_b64 s[0:1]
 ; FLATSCR-NEXT: s_add_u32 s0, s0, external_void_func_byval@rel32@lo+4

llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement-stack-lower.ll

Lines changed: 3 additions & 3 deletions
@@ -239,6 +239,8 @@ define i16 @v_extract_v128i16_varidx(ptr addrspace(1) %ptr, i32 %idx) {
 ; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s33 offset:572 ; 4-byte Folded Spill
 ; GCN-NEXT: global_load_dwordx4 v[8:11], v[0:1], off offset:224
 ; GCN-NEXT: global_load_dwordx4 v[12:15], v[0:1], off offset:240
+; GCN-NEXT: v_lshrrev_b32_e64 v1, 6, s33
+; GCN-NEXT: v_add_u32_e32 v1, 0x100, v1
 ; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s33 offset:256
 ; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:260
 ; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s33 offset:264
@@ -308,10 +310,8 @@ define i16 @v_extract_v128i16_varidx(ptr addrspace(1) %ptr, i32 %idx) {
 ; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s33 offset:568 ; 4-byte Folded Reload
 ; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s33 offset:572 ; 4-byte Folded Reload
 ; GCN-NEXT: v_bfe_u32 v0, v6, 1, 6
-; GCN-NEXT: v_lshrrev_b32_e64 v2, 6, s33
 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GCN-NEXT: v_add_u32_e32 v2, 0x100, v2
-; GCN-NEXT: v_add_u32_e32 v0, v2, v0
+; GCN-NEXT: v_add_u32_e32 v0, v1, v0
 ; GCN-NEXT: v_and_b32_e32 v1, 1, v6
 ; GCN-NEXT: v_lshlrev_b32_e32 v1, 4, v1
 ; GCN-NEXT: s_waitcnt vmcnt(0)

llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll

Lines changed: 21 additions & 22 deletions
@@ -279,8 +279,8 @@ define amdgpu_kernel void @store_load_sindex_small_offset_kernel(i32 %idx) {
 ; GFX9-NEXT: s_load_dword s0, s[0:1], 0x24
 ; GFX9-NEXT: s_add_u32 flat_scratch_lo, s2, s5
 ; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s3, 0
-; GFX9-NEXT: s_mov_b32 s2, 0
-; GFX9-NEXT: scratch_load_dword v0, off, s2 offset:4 glc
+; GFX9-NEXT: s_mov_b32 s1, 0
+; GFX9-NEXT: scratch_load_dword v0, off, s1 offset:4 glc
 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX9-NEXT: s_lshl_b32 s1, s0, 2
 ; GFX9-NEXT: s_and_b32 s0, s0, 15
@@ -459,9 +459,9 @@ define void @store_load_vindex_small_offset_foo(i32 %idx) {
 ; GFX9-NEXT: scratch_load_dword v1, off, s32 glc
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 2, v0
-; GFX9-NEXT: s_add_i32 s1, s32, 0x100
+; GFX9-NEXT: s_add_i32 s0, s32, 0x100
 ; GFX9-NEXT: v_and_b32_e32 v0, 15, v0
-; GFX9-NEXT: v_add_u32_e32 v1, s1, v1
+; GFX9-NEXT: v_add_u32_e32 v1, s0, v1
 ; GFX9-NEXT: v_mov_b32_e32 v2, 15
 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT: s_add_i32 s0, s32, 0x100
@@ -478,13 +478,13 @@ define void @store_load_vindex_small_offset_foo(i32 %idx) {
 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT: v_and_b32_e32 v1, 15, v0
 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-NEXT: s_add_i32 s1, s32, 0x100
 ; GFX10-NEXT: s_add_i32 s0, s32, 0x100
 ; GFX10-NEXT: v_mov_b32_e32 v2, 15
-; GFX10-NEXT: v_lshlrev_b32_e32 v1, 2, v1
-; GFX10-NEXT: v_add_nc_u32_e32 v0, s1, v0
 ; GFX10-NEXT: scratch_load_dword v3, off, s32 glc dlc
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_lshlrev_b32_e32 v1, 2, v1
+; GFX10-NEXT: v_add_nc_u32_e32 v0, s0, v0
+; GFX10-NEXT: s_add_i32 s0, s32, 0x100
 ; GFX10-NEXT: v_add_nc_u32_e32 v1, s0, v1
 ; GFX10-NEXT: scratch_store_dword v0, v2, off
 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
@@ -544,8 +544,8 @@ define amdgpu_kernel void @store_load_sindex_large_offset_kernel(i32 %idx) {
 ; GFX9-NEXT: s_load_dword s0, s[0:1], 0x24
 ; GFX9-NEXT: s_add_u32 flat_scratch_lo, s2, s5
 ; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s3, 0
-; GFX9-NEXT: s_mov_b32 s2, 0
-; GFX9-NEXT: scratch_load_dword v0, off, s2 offset:4 glc
+; GFX9-NEXT: s_mov_b32 s1, 0
+; GFX9-NEXT: scratch_load_dword v0, off, s1 offset:4 glc
 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX9-NEXT: s_lshl_b32 s1, s0, 2
 ; GFX9-NEXT: s_and_b32 s0, s0, 15
@@ -728,9 +728,9 @@ define void @store_load_vindex_large_offset_foo(i32 %idx) {
 ; GFX9-NEXT: scratch_load_dword v1, off, s32 offset:4 glc
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 2, v0
-; GFX9-NEXT: s_add_i32 s1, s32, 0x4004
+; GFX9-NEXT: s_add_i32 s0, s32, 0x4004
 ; GFX9-NEXT: v_and_b32_e32 v0, 15, v0
-; GFX9-NEXT: v_add_u32_e32 v1, s1, v1
+; GFX9-NEXT: v_add_u32_e32 v1, s0, v1
 ; GFX9-NEXT: v_mov_b32_e32 v2, 15
 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT: s_add_i32 s0, s32, 0x4004
@@ -747,13 +747,13 @@ define void @store_load_vindex_large_offset_foo(i32 %idx) {
 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT: v_and_b32_e32 v1, 15, v0
 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-NEXT: s_add_i32 s1, s32, 0x4004
 ; GFX10-NEXT: s_add_i32 s0, s32, 0x4004
 ; GFX10-NEXT: v_mov_b32_e32 v2, 15
-; GFX10-NEXT: v_lshlrev_b32_e32 v1, 2, v1
-; GFX10-NEXT: v_add_nc_u32_e32 v0, s1, v0
 ; GFX10-NEXT: scratch_load_dword v3, off, s32 offset:4 glc dlc
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_lshlrev_b32_e32 v1, 2, v1
+; GFX10-NEXT: v_add_nc_u32_e32 v0, s0, v0
+; GFX10-NEXT: s_add_i32 s0, s32, 0x4004
 ; GFX10-NEXT: v_add_nc_u32_e32 v1, s0, v1
 ; GFX10-NEXT: scratch_store_dword v0, v2, off
 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
@@ -767,8 +767,8 @@ define void @store_load_vindex_large_offset_foo(i32 %idx) {
 ; GFX940-NEXT: scratch_load_dword v1, off, s32 offset:4 sc0 sc1
 ; GFX940-NEXT: s_waitcnt vmcnt(0)
 ; GFX940-NEXT: v_lshlrev_b32_e32 v1, 2, v0
-; GFX940-NEXT: s_add_i32 s1, s32, 0x4004
-; GFX940-NEXT: v_add_u32_e32 v1, s1, v1
+; GFX940-NEXT: s_add_i32 s0, s32, 0x4004
+; GFX940-NEXT: v_add_u32_e32 v1, s0, v1
 ; GFX940-NEXT: v_mov_b32_e32 v2, 15
 ; GFX940-NEXT: v_and_b32_e32 v0, 15, v0
 ; GFX940-NEXT: scratch_store_dword v1, v2, off sc0 sc1
@@ -785,12 +785,11 @@ define void @store_load_vindex_large_offset_foo(i32 %idx) {
 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT: v_dual_mov_b32 v2, 15 :: v_dual_lshlrev_b32 v1, 2, v0
 ; GFX11-NEXT: v_and_b32_e32 v0, 15, v0
-; GFX11-NEXT: s_add_i32 s1, s32, 0x4004
 ; GFX11-NEXT: s_add_i32 s0, s32, 0x4004
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT: v_add_nc_u32_e32 v1, s1, v1
 ; GFX11-NEXT: scratch_load_b32 v3, off, s32 offset:4 glc dlc
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_add_nc_u32_e32 v1, s0, v1
+; GFX11-NEXT: s_add_i32 s0, s32, 0x4004
 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
 ; GFX11-NEXT: scratch_store_b32 v1, v2, off dlc
 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
@@ -816,10 +815,10 @@ define amdgpu_kernel void @store_load_large_imm_offset_kernel() {
 ; GFX9-NEXT: s_add_u32 flat_scratch_lo, s0, s3
 ; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0
 ; GFX9-NEXT: v_mov_b32_e32 v0, 13
-; GFX9-NEXT: s_mov_b32 s1, 0
-; GFX9-NEXT: s_movk_i32 s0, 0x3e80
-; GFX9-NEXT: scratch_store_dword off, v0, s1 offset:4
+; GFX9-NEXT: s_mov_b32 s0, 0
+; GFX9-NEXT: scratch_store_dword off, v0, s0 offset:4
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_movk_i32 s0, 0x3e80
 ; GFX9-NEXT: v_mov_b32_e32 v0, 15
 ; GFX9-NEXT: s_add_i32 s0, s0, 4
 ; GFX9-NEXT: scratch_store_dword off, v0, s0

llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll

Lines changed: 8 additions & 14 deletions
@@ -481,25 +481,22 @@ define amdgpu_kernel void @vload2_private(ptr addrspace(1) nocapture readonly %i
 ; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s3, 0
 ; FLATSCR-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
 ; FLATSCR-NEXT: v_mov_b32_e32 v2, 0
-; FLATSCR-NEXT: s_mov_b32 s6, 0
-; FLATSCR-NEXT: s_mov_b32 s5, 0
 ; FLATSCR-NEXT: s_mov_b32 s4, 0
 ; FLATSCR-NEXT: s_waitcnt lgkmcnt(0)
 ; FLATSCR-NEXT: global_load_ushort v0, v2, s[0:1]
 ; FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; FLATSCR-NEXT: scratch_store_short off, v0, s6 offset:4
+; FLATSCR-NEXT: scratch_store_short off, v0, s4 offset:4
 ; FLATSCR-NEXT: s_waitcnt vmcnt(0)
 ; FLATSCR-NEXT: global_load_ushort v0, v2, s[0:1] offset:2
 ; FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; FLATSCR-NEXT: scratch_store_short off, v0, s5 offset:6
+; FLATSCR-NEXT: scratch_store_short off, v0, s4 offset:6
 ; FLATSCR-NEXT: s_waitcnt vmcnt(0)
 ; FLATSCR-NEXT: global_load_ushort v0, v2, s[0:1] offset:4
-; FLATSCR-NEXT: s_mov_b32 s1, 0
 ; FLATSCR-NEXT: s_mov_b32 s0, 0
 ; FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; FLATSCR-NEXT: scratch_store_short off, v0, s4 offset:8
+; FLATSCR-NEXT: scratch_store_short off, v0, s0 offset:8
 ; FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; FLATSCR-NEXT: scratch_load_dword v0, off, s1 offset:4
+; FLATSCR-NEXT: scratch_load_dword v0, off, s0 offset:4
 ; FLATSCR-NEXT: scratch_load_dword v1, off, s0 offset:6
 ; FLATSCR-NEXT: s_waitcnt vmcnt(0)
 ; FLATSCR-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
@@ -544,27 +541,24 @@ define amdgpu_kernel void @vload2_private(ptr addrspace(1) nocapture readonly %i
 ; FLATSCR_GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
 ; FLATSCR_GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
 ; FLATSCR_GFX10-NEXT: v_mov_b32_e32 v2, 0
-; FLATSCR_GFX10-NEXT: s_mov_b32 s6, 0
-; FLATSCR_GFX10-NEXT: s_mov_b32 s5, 0
 ; FLATSCR_GFX10-NEXT: s_mov_b32 s4, 0
 ; FLATSCR_GFX10-NEXT: s_waitcnt lgkmcnt(0)
 ; FLATSCR_GFX10-NEXT: global_load_ushort v0, v2, s[0:1]
 ; FLATSCR_GFX10-NEXT: s_waitcnt vmcnt(0)
-; FLATSCR_GFX10-NEXT: scratch_store_short off, v0, s6 offset:4
+; FLATSCR_GFX10-NEXT: scratch_store_short off, v0, s4 offset:4
 ; FLATSCR_GFX10-NEXT: s_waitcnt_vscnt null, 0x0
 ; FLATSCR_GFX10-NEXT: global_load_ushort v0, v2, s[0:1] offset:2
 ; FLATSCR_GFX10-NEXT: s_waitcnt vmcnt(0)
-; FLATSCR_GFX10-NEXT: scratch_store_short off, v0, s5 offset:6
+; FLATSCR_GFX10-NEXT: scratch_store_short off, v0, s4 offset:6
 ; FLATSCR_GFX10-NEXT: s_waitcnt_vscnt null, 0x0
 ; FLATSCR_GFX10-NEXT: global_load_ushort v0, v2, s[0:1] offset:4
 ; FLATSCR_GFX10-NEXT: s_waitcnt_depctr 0xffe3
-; FLATSCR_GFX10-NEXT: s_mov_b32 s1, 0
 ; FLATSCR_GFX10-NEXT: s_mov_b32 s0, 0
 ; FLATSCR_GFX10-NEXT: s_waitcnt vmcnt(0)
-; FLATSCR_GFX10-NEXT: scratch_store_short off, v0, s4 offset:8
+; FLATSCR_GFX10-NEXT: scratch_store_short off, v0, s0 offset:8
 ; FLATSCR_GFX10-NEXT: s_waitcnt_vscnt null, 0x0
 ; FLATSCR_GFX10-NEXT: s_clause 0x1
-; FLATSCR_GFX10-NEXT: scratch_load_dword v0, off, s1 offset:4
+; FLATSCR_GFX10-NEXT: scratch_load_dword v0, off, s0 offset:4
 ; FLATSCR_GFX10-NEXT: scratch_load_dword v1, off, s0 offset:6
 ; FLATSCR_GFX10-NEXT: s_waitcnt vmcnt(0)
 ; FLATSCR_GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]

llvm/test/CodeGen/AMDGPU/extract-load-i1.ll

Lines changed: 2 additions & 2 deletions
@@ -9,8 +9,8 @@ define i1 @extractloadi1(ptr %ptr, i32 %idx) {
 ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT: flat_load_ubyte v0, v[0:1]
 ; CHECK-NEXT: v_and_b32_e32 v1, 7, v2
-; CHECK-NEXT: v_lshr_b32_e64 v9, s32, 6
-; CHECK-NEXT: v_or_b32_e32 v1, v9, v1
+; CHECK-NEXT: v_lshr_b32_e64 v2, s32, 6
+; CHECK-NEXT: v_or_b32_e32 v1, v2, v1
 ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; CHECK-NEXT: v_bfe_u32 v2, v0, 1, 1
 ; CHECK-NEXT: v_bfe_u32 v3, v0, 2, 2
