Skip to content

Commit 6fbcfa7

Browse files
cdevadas, zhang2amd
authored and committed
(Reland) [fastalloc] Support allocating specific register class in fastalloc
This reverts commit 853bb19. Change-Id: I4807fc9d7ddac1401b75e6f031755b00125ad321
1 parent 876f99a commit 6fbcfa7

15 files changed

+1678 -933 lines changed

llvm/lib/CodeGen/RegAllocFast.cpp

Lines changed: 29 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -1022,6 +1022,8 @@ void RegAllocFast::allocVirtRegUndef(MachineOperand &MO) {
10221022
/// (tied or earlyclobber) that may interfere with preassigned uses.
10231023
void RegAllocFast::defineLiveThroughVirtReg(MachineInstr &MI, unsigned OpNum,
10241024
Register VirtReg) {
1025+
if (!shouldAllocateRegister(VirtReg))
1026+
return;
10251027
LiveRegMap::iterator LRI = findLiveVirtReg(VirtReg);
10261028
if (LRI != LiveVirtRegs.end()) {
10271029
MCPhysReg PrevReg = LRI->PhysReg;
@@ -1054,6 +1056,8 @@ void RegAllocFast::defineLiveThroughVirtReg(MachineInstr &MI, unsigned OpNum,
10541056
void RegAllocFast::defineVirtReg(MachineInstr &MI, unsigned OpNum,
10551057
Register VirtReg, bool LookAtPhysRegUses) {
10561058
assert(VirtReg.isVirtual() && "Not a virtual register");
1059+
if (!shouldAllocateRegister(VirtReg))
1060+
return;
10571061
MachineOperand &MO = MI.getOperand(OpNum);
10581062
LiveRegMap::iterator LRI;
10591063
bool New;
@@ -1121,6 +1125,8 @@ void RegAllocFast::defineVirtReg(MachineInstr &MI, unsigned OpNum,
11211125
void RegAllocFast::useVirtReg(MachineInstr &MI, unsigned OpNum,
11221126
Register VirtReg) {
11231127
assert(VirtReg.isVirtual() && "Not a virtual register");
1128+
if (!shouldAllocateRegister(VirtReg))
1129+
return;
11241130
MachineOperand &MO = MI.getOperand(OpNum);
11251131
LiveRegMap::iterator LRI;
11261132
bool New;
@@ -1145,8 +1151,13 @@ void RegAllocFast::useVirtReg(MachineInstr &MI, unsigned OpNum,
11451151
Register Hint;
11461152
if (MI.isCopy() && MI.getOperand(1).getSubReg() == 0) {
11471153
Hint = MI.getOperand(0).getReg();
1148-
assert(Hint.isPhysical() &&
1149-
"Copy destination should already be assigned");
1154+
if (Hint.isVirtual()) {
1155+
assert(!shouldAllocateRegister(Hint));
1156+
Hint = Register();
1157+
} else {
1158+
assert(Hint.isPhysical() &&
1159+
"Copy destination should already be assigned");
1160+
}
11501161
}
11511162
allocVirtReg(MI, *LRI, Hint, false);
11521163
if (LRI->Error) {
@@ -1254,6 +1265,8 @@ void RegAllocFast::addRegClassDefCounts(std::vector<unsigned> &RegClassDefCounts
12541265
assert(RegClassDefCounts.size() == TRI->getNumRegClasses());
12551266

12561267
if (Reg.isVirtual()) {
1268+
if (!shouldAllocateRegister(Reg))
1269+
return;
12571270
const TargetRegisterClass *OpRC = MRI->getRegClass(Reg);
12581271
for (unsigned RCIdx = 0, RCIdxEnd = TRI->getNumRegClasses();
12591272
RCIdx != RCIdxEnd; ++RCIdx) {
@@ -1313,6 +1326,8 @@ void RegAllocFast::allocateInstruction(MachineInstr &MI) {
13131326
if (MO.isReg()) {
13141327
Register Reg = MO.getReg();
13151328
if (Reg.isVirtual()) {
1329+
if (!shouldAllocateRegister(Reg))
1330+
continue;
13161331
if (MO.isDef()) {
13171332
HasDef = true;
13181333
HasVRegDef = true;
@@ -1376,7 +1391,7 @@ void RegAllocFast::allocateInstruction(MachineInstr &MI) {
13761391
}
13771392

13781393
if (MO.isDef()) {
1379-
if (Reg.isVirtual())
1394+
if (Reg.isVirtual() && shouldAllocateRegister(Reg))
13801395
DefOperandIndexes.push_back(I);
13811396

13821397
addRegClassDefCounts(RegClassDefCounts, Reg);
@@ -1466,6 +1481,10 @@ void RegAllocFast::allocateInstruction(MachineInstr &MI) {
14661481
Register Reg = MO.getReg();
14671482
if (!Reg)
14681483
continue;
1484+
if (Reg.isVirtual()) {
1485+
assert(!shouldAllocateRegister(Reg));
1486+
continue;
1487+
}
14691488
assert(Reg.isPhysical());
14701489
if (MRI->isReserved(Reg))
14711490
continue;
@@ -1512,7 +1531,7 @@ void RegAllocFast::allocateInstruction(MachineInstr &MI) {
15121531
if (!MO.isReg() || !MO.isUse())
15131532
continue;
15141533
Register Reg = MO.getReg();
1515-
if (!Reg.isVirtual())
1534+
if (!Reg.isVirtual() || !shouldAllocateRegister(Reg))
15161535
continue;
15171536

15181537
if (MO.isUndef()) {
@@ -1539,7 +1558,7 @@ void RegAllocFast::allocateInstruction(MachineInstr &MI) {
15391558
if (!MO.isReg() || !MO.isUse())
15401559
continue;
15411560
Register Reg = MO.getReg();
1542-
if (!Reg.isVirtual())
1561+
if (!Reg.isVirtual() || !shouldAllocateRegister(Reg))
15431562
continue;
15441563

15451564
assert(MO.isUndef() && "Should only have undef virtreg uses left");
@@ -1557,6 +1576,10 @@ void RegAllocFast::allocateInstruction(MachineInstr &MI) {
15571576
Register Reg = MO.getReg();
15581577
if (!Reg)
15591578
continue;
1579+
if (Reg.isVirtual()) {
1580+
assert(!shouldAllocateRegister(Reg));
1581+
continue;
1582+
}
15601583
assert(Reg.isPhysical() && "should have register assigned");
15611584

15621585
// We sometimes get odd situations like:
@@ -1702,7 +1725,7 @@ void RegAllocFast::handleBundle(MachineInstr &MI) {
17021725
continue;
17031726

17041727
Register Reg = MO.getReg();
1705-
if (!Reg.isVirtual())
1728+
if (!Reg.isVirtual() || !shouldAllocateRegister(Reg))
17061729
continue;
17071730

17081731
DenseMap<Register, MCPhysReg>::iterator DI;

llvm/test/CodeGen/AMDGPU/GlobalISel/image-waterfall-loop-O0.ll

Lines changed: 57 additions & 71 deletions
Original file line number | Diff line number | Diff line change
@@ -17,98 +17,93 @@ define <4 x float> @waterfall_loop(<8 x i32> %vgpr_srd) {
1717
; CHECK-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
1818
; CHECK-NEXT: s_mov_b32 exec_lo, s4
1919
; CHECK-NEXT: ; implicit-def: $vgpr8
20-
; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
21-
; CHECK-NEXT: v_mov_b32_e32 v14, v1
20+
; CHECK-NEXT: v_mov_b32_e32 v8, v0
21+
; CHECK-NEXT: s_or_saveexec_b32 s21, -1
22+
; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
23+
; CHECK-NEXT: s_mov_b32 exec_lo, s21
24+
; CHECK-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
25+
; CHECK-NEXT: v_mov_b32_e32 v15, v1
26+
; CHECK-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
27+
; CHECK-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
28+
; CHECK-NEXT: v_mov_b32_e32 v14, v2
2229
; CHECK-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
23-
; CHECK-NEXT: v_mov_b32_e32 v13, v2
30+
; CHECK-NEXT: v_mov_b32_e32 v13, v3
2431
; CHECK-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
25-
; CHECK-NEXT: v_mov_b32_e32 v12, v3
32+
; CHECK-NEXT: v_mov_b32_e32 v12, v4
2633
; CHECK-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
27-
; CHECK-NEXT: v_mov_b32_e32 v11, v4
34+
; CHECK-NEXT: v_mov_b32_e32 v11, v5
2835
; CHECK-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
29-
; CHECK-NEXT: v_mov_b32_e32 v10, v5
36+
; CHECK-NEXT: v_mov_b32_e32 v10, v6
3037
; CHECK-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
31-
; CHECK-NEXT: v_mov_b32_e32 v9, v6
38+
; CHECK-NEXT: v_mov_b32_e32 v9, v7
3239
; CHECK-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
33-
; CHECK-NEXT: v_mov_b32_e32 v8, v7
34-
; CHECK-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
35-
; CHECK-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 killed $exec
36-
; CHECK-NEXT: v_mov_b32_e32 v1, v14
37-
; CHECK-NEXT: v_mov_b32_e32 v2, v13
38-
; CHECK-NEXT: v_mov_b32_e32 v3, v12
39-
; CHECK-NEXT: v_mov_b32_e32 v4, v11
40-
; CHECK-NEXT: v_mov_b32_e32 v5, v10
41-
; CHECK-NEXT: v_mov_b32_e32 v6, v9
42-
; CHECK-NEXT: v_mov_b32_e32 v7, v8
43-
; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
40+
; CHECK-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8 killed $exec
41+
; CHECK-NEXT: v_mov_b32_e32 v2, v15
42+
; CHECK-NEXT: v_mov_b32_e32 v3, v14
43+
; CHECK-NEXT: v_mov_b32_e32 v4, v13
44+
; CHECK-NEXT: v_mov_b32_e32 v5, v12
45+
; CHECK-NEXT: v_mov_b32_e32 v6, v11
46+
; CHECK-NEXT: v_mov_b32_e32 v7, v10
47+
; CHECK-NEXT: v_mov_b32_e32 v8, v9
48+
; CHECK-NEXT: s_waitcnt vmcnt(0)
4449
; CHECK-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
4550
; CHECK-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
4651
; CHECK-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
4752
; CHECK-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
4853
; CHECK-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
4954
; CHECK-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
5055
; CHECK-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
51-
; CHECK-NEXT: s_or_saveexec_b32 s21, -1
52-
; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
53-
; CHECK-NEXT: s_mov_b32 exec_lo, s21
56+
; CHECK-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
5457
; CHECK-NEXT: s_mov_b32 s8, 0
5558
; CHECK-NEXT: s_mov_b32 s4, s8
5659
; CHECK-NEXT: s_mov_b32 s5, s8
5760
; CHECK-NEXT: s_mov_b32 s6, s8
5861
; CHECK-NEXT: s_mov_b32 s7, s8
59-
; CHECK-NEXT: s_waitcnt vmcnt(0)
6062
; CHECK-NEXT: v_writelane_b32 v0, s4, 0
6163
; CHECK-NEXT: v_writelane_b32 v0, s5, 1
6264
; CHECK-NEXT: v_writelane_b32 v0, s6, 2
6365
; CHECK-NEXT: v_writelane_b32 v0, s7, 3
64-
; CHECK-NEXT: s_or_saveexec_b32 s21, -1
65-
; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
66-
; CHECK-NEXT: s_mov_b32 exec_lo, s21
6766
; CHECK-NEXT: s_mov_b32 s6, 0
6867
; CHECK-NEXT: s_mov_b32 s4, s6
6968
; CHECK-NEXT: s_mov_b32 s5, s6
70-
; CHECK-NEXT: v_mov_b32_e32 v0, s4
71-
; CHECK-NEXT: v_mov_b32_e32 v1, s5
72-
; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
69+
; CHECK-NEXT: v_mov_b32_e32 v1, s4
70+
; CHECK-NEXT: v_mov_b32_e32 v2, s5
7371
; CHECK-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
74-
; CHECK-NEXT: s_or_saveexec_b32 s21, -1
75-
; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
76-
; CHECK-NEXT: s_mov_b32 exec_lo, s21
72+
; CHECK-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
7773
; CHECK-NEXT: s_mov_b32 s4, exec_lo
78-
; CHECK-NEXT: s_waitcnt vmcnt(0)
7974
; CHECK-NEXT: v_writelane_b32 v0, s4, 4
8075
; CHECK-NEXT: s_or_saveexec_b32 s21, -1
81-
; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
76+
; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
8277
; CHECK-NEXT: s_mov_b32 exec_lo, s21
8378
; CHECK-NEXT: .LBB0_1: ; =>This Inner Loop Header: Depth=1
84-
; CHECK-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
79+
; CHECK-NEXT: s_or_saveexec_b32 s21, -1
80+
; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
81+
; CHECK-NEXT: s_mov_b32 exec_lo, s21
8582
; CHECK-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
8683
; CHECK-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
8784
; CHECK-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
8885
; CHECK-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
8986
; CHECK-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
9087
; CHECK-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
9188
; CHECK-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
92-
; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
89+
; CHECK-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
9390
; CHECK-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
9491
; CHECK-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
9592
; CHECK-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
9693
; CHECK-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
9794
; CHECK-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
9895
; CHECK-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
9996
; CHECK-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
97+
; CHECK-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
10098
; CHECK-NEXT: s_waitcnt vmcnt(0)
101-
; CHECK-NEXT: v_readfirstlane_b32 s12, v7
102-
; CHECK-NEXT: v_readfirstlane_b32 s10, v6
103-
; CHECK-NEXT: v_readfirstlane_b32 s9, v5
104-
; CHECK-NEXT: v_readfirstlane_b32 s8, v4
105-
; CHECK-NEXT: v_readfirstlane_b32 s7, v3
106-
; CHECK-NEXT: v_readfirstlane_b32 s6, v2
107-
; CHECK-NEXT: v_readfirstlane_b32 s5, v1
108-
; CHECK-NEXT: v_readfirstlane_b32 s4, v0
109-
; CHECK-NEXT: s_or_saveexec_b32 s21, -1
110-
; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
111-
; CHECK-NEXT: s_mov_b32 exec_lo, s21
99+
; CHECK-NEXT: v_readfirstlane_b32 s12, v8
100+
; CHECK-NEXT: v_readfirstlane_b32 s10, v7
101+
; CHECK-NEXT: v_readfirstlane_b32 s9, v6
102+
; CHECK-NEXT: v_readfirstlane_b32 s8, v5
103+
; CHECK-NEXT: v_readfirstlane_b32 s7, v4
104+
; CHECK-NEXT: v_readfirstlane_b32 s6, v3
105+
; CHECK-NEXT: v_readfirstlane_b32 s5, v2
106+
; CHECK-NEXT: v_readfirstlane_b32 s4, v1
112107
; CHECK-NEXT: ; kill: def $sgpr12 killed $sgpr12 def $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19
113108
; CHECK-NEXT: s_mov_b32 s13, s10
114109
; CHECK-NEXT: s_mov_b32 s14, s9
@@ -117,7 +112,6 @@ define <4 x float> @waterfall_loop(<8 x i32> %vgpr_srd) {
117112
; CHECK-NEXT: s_mov_b32 s17, s6
118113
; CHECK-NEXT: s_mov_b32 s18, s5
119114
; CHECK-NEXT: s_mov_b32 s19, s4
120-
; CHECK-NEXT: s_waitcnt vmcnt(0)
121115
; CHECK-NEXT: v_writelane_b32 v0, s12, 5
122116
; CHECK-NEXT: v_writelane_b32 v0, s13, 6
123117
; CHECK-NEXT: v_writelane_b32 v0, s14, 7
@@ -126,45 +120,38 @@ define <4 x float> @waterfall_loop(<8 x i32> %vgpr_srd) {
126120
; CHECK-NEXT: v_writelane_b32 v0, s17, 10
127121
; CHECK-NEXT: v_writelane_b32 v0, s18, 11
128122
; CHECK-NEXT: v_writelane_b32 v0, s19, 12
129-
; CHECK-NEXT: s_or_saveexec_b32 s21, -1
130-
; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
131-
; CHECK-NEXT: s_mov_b32 exec_lo, s21
132-
; CHECK-NEXT: v_mov_b32_e32 v6, v8
133123
; CHECK-NEXT: v_mov_b32_e32 v7, v9
134-
; CHECK-NEXT: v_mov_b32_e32 v4, v10
124+
; CHECK-NEXT: v_mov_b32_e32 v8, v10
135125
; CHECK-NEXT: v_mov_b32_e32 v5, v11
136-
; CHECK-NEXT: v_mov_b32_e32 v2, v12
126+
; CHECK-NEXT: v_mov_b32_e32 v6, v12
137127
; CHECK-NEXT: v_mov_b32_e32 v3, v13
138-
; CHECK-NEXT: v_mov_b32_e32 v0, v14
128+
; CHECK-NEXT: v_mov_b32_e32 v4, v14
139129
; CHECK-NEXT: v_mov_b32_e32 v1, v15
130+
; CHECK-NEXT: v_mov_b32_e32 v2, v16
140131
; CHECK-NEXT: s_mov_b64 s[4:5], s[12:13]
141132
; CHECK-NEXT: s_mov_b64 s[10:11], s[14:15]
142133
; CHECK-NEXT: s_mov_b64 s[8:9], s[16:17]
143134
; CHECK-NEXT: s_mov_b64 s[6:7], s[18:19]
144-
; CHECK-NEXT: v_cmp_eq_u64_e64 s4, s[4:5], v[6:7]
145-
; CHECK-NEXT: v_cmp_eq_u64_e64 s5, s[10:11], v[4:5]
135+
; CHECK-NEXT: v_cmp_eq_u64_e64 s4, s[4:5], v[7:8]
136+
; CHECK-NEXT: v_cmp_eq_u64_e64 s5, s[10:11], v[5:6]
146137
; CHECK-NEXT: s_and_b32 s4, s4, s5
147-
; CHECK-NEXT: v_cmp_eq_u64_e64 s5, s[8:9], v[2:3]
138+
; CHECK-NEXT: v_cmp_eq_u64_e64 s5, s[8:9], v[3:4]
148139
; CHECK-NEXT: s_and_b32 s4, s4, s5
149-
; CHECK-NEXT: v_cmp_eq_u64_e64 s5, s[6:7], v[0:1]
150-
; CHECK-NEXT: s_or_saveexec_b32 s21, -1
151-
; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
152-
; CHECK-NEXT: s_mov_b32 exec_lo, s21
140+
; CHECK-NEXT: v_cmp_eq_u64_e64 s5, s[6:7], v[1:2]
153141
; CHECK-NEXT: s_and_b32 s4, s4, s5
154142
; CHECK-NEXT: s_and_saveexec_b32 s4, s4
155-
; CHECK-NEXT: s_waitcnt vmcnt(0)
156143
; CHECK-NEXT: v_writelane_b32 v0, s4, 13
157144
; CHECK-NEXT: s_or_saveexec_b32 s21, -1
158-
; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
145+
; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
159146
; CHECK-NEXT: s_mov_b32 exec_lo, s21
160147
; CHECK-NEXT: ; %bb.2: ; in Loop: Header=BB0_1 Depth=1
148+
; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
149+
; CHECK-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
161150
; CHECK-NEXT: s_or_saveexec_b32 s21, -1
162-
; CHECK-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
151+
; CHECK-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload
163152
; CHECK-NEXT: s_mov_b32 exec_lo, s21
164153
; CHECK-NEXT: s_waitcnt vmcnt(0)
165154
; CHECK-NEXT: v_readlane_b32 s4, v2, 13
166-
; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
167-
; CHECK-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
168155
; CHECK-NEXT: v_readlane_b32 s8, v2, 5
169156
; CHECK-NEXT: v_readlane_b32 s9, v2, 6
170157
; CHECK-NEXT: v_readlane_b32 s10, v2, 7
@@ -177,24 +164,23 @@ define <4 x float> @waterfall_loop(<8 x i32> %vgpr_srd) {
177164
; CHECK-NEXT: v_readlane_b32 s17, v2, 1
178165
; CHECK-NEXT: v_readlane_b32 s18, v2, 2
179166
; CHECK-NEXT: v_readlane_b32 s19, v2, 3
180-
; CHECK-NEXT: s_waitcnt vmcnt(0)
181167
; CHECK-NEXT: image_sample v0, v[0:1], s[8:15], s[16:19] dmask:0x1 dim:SQ_RSRC_IMG_2D
182168
; CHECK-NEXT: s_waitcnt vmcnt(0)
183-
; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
169+
; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
184170
; CHECK-NEXT: s_xor_b32 exec_lo, exec_lo, s4
185171
; CHECK-NEXT: s_cbranch_execnz .LBB0_1
186172
; CHECK-NEXT: ; %bb.3:
187173
; CHECK-NEXT: s_or_saveexec_b32 s21, -1
188-
; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
174+
; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
189175
; CHECK-NEXT: s_mov_b32 exec_lo, s21
190176
; CHECK-NEXT: s_waitcnt vmcnt(0)
191177
; CHECK-NEXT: v_readlane_b32 s4, v0, 4
192178
; CHECK-NEXT: s_mov_b32 exec_lo, s4
193179
; CHECK-NEXT: ; %bb.4:
194180
; CHECK-NEXT: s_or_saveexec_b32 s21, -1
195-
; CHECK-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
181+
; CHECK-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload
196182
; CHECK-NEXT: s_mov_b32 exec_lo, s21
197-
; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
183+
; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
198184
; CHECK-NEXT: ; implicit-def: $sgpr4
199185
; CHECK-NEXT: v_mov_b32_e32 v1, s4
200186
; CHECK-NEXT: v_mov_b32_e32 v2, s4

0 commit comments

Comments
 (0)