Skip to content

Commit 3359ea6

Browse files
author
QingShan Zhang
committed
[Scheduling] Create the missing dependency edges for store cluster
If it is a load cluster, we don't need to create the dependency edges
(SUb->reg) from SUb to SUa, as they both already depend on the base
register "reg":

          +-------+
    +---->|  reg  |
    |     +---+---+
    |         ^
    |         |
    |     +---+---+
    |     |  SUa  |  Load 0(reg)
    |     +---+---+
    |         ^
    |         |
    |     +---+---+
    +-----+  SUb  |  Load 4(reg)
          +-------+

But if it is a store cluster, we need to create them as shown below, to
avoid an instruction that the store depends on being scheduled in-between
SUb and SUa:

          +-------+
    +---->|  reg  |
    |     +---+---+
    |         ^
    |         |   Missing     +-------+
    |         | +------------>|   y   |
    |         | |             +---+---+
    |     +---+-+-+               ^
    |     |  SUa  |  Store x 0(reg)
    |     +---+---+               |
    |         ^                   |
    |         |  +----------------+
    |     +---+--++
    +-----+  SUb  |  Store y 4(reg)
          +-------+

Reviewed By: evandro, arsenm, rampitec, foad, fhahn

Differential Revision: https://reviews.llvm.org/D72031
1 parent 96b0280 commit 3359ea6

File tree

10 files changed

+596
-554
lines changed

10 files changed

+596
-554
lines changed

llvm/lib/CodeGen/MachineScheduler.cpp

Lines changed: 26 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1624,16 +1624,32 @@ void BaseMemOpClusterMutation::clusterNeighboringMemOps(
16241624
LLVM_DEBUG(dbgs() << "Cluster ld/st SU(" << SUa->NodeNum << ") - SU("
16251625
<< SUb->NodeNum << ")\n");
16261626

1627-
// Copy successor edges from SUa to SUb. Interleaving computation
1628-
// dependent on SUa can prevent load combining due to register reuse.
1629-
// Predecessor edges do not need to be copied from SUb to SUa since
1630-
// nearby loads should have effectively the same inputs.
1631-
for (const SDep &Succ : SUa->Succs) {
1632-
if (Succ.getSUnit() == SUb)
1633-
continue;
1634-
LLVM_DEBUG(dbgs() << " Copy Succ SU(" << Succ.getSUnit()->NodeNum
1635-
<< ")\n");
1636-
DAG->addEdge(Succ.getSUnit(), SDep(SUb, SDep::Artificial));
1627+
if (IsLoad) {
1628+
// Copy successor edges from SUa to SUb. Interleaving computation
1629+
// dependent on SUa can prevent load combining due to register reuse.
1630+
// Predecessor edges do not need to be copied from SUb to SUa since
1631+
// nearby loads should have effectively the same inputs.
1632+
for (const SDep &Succ : SUa->Succs) {
1633+
if (Succ.getSUnit() == SUb)
1634+
continue;
1635+
LLVM_DEBUG(dbgs() << " Copy Succ SU(" << Succ.getSUnit()->NodeNum
1636+
<< ")\n");
1637+
DAG->addEdge(Succ.getSUnit(), SDep(SUb, SDep::Artificial));
1638+
}
1639+
} else {
1640+
// Copy predecessor edges from SUb to SUa to avoid the SUnits that
1641+
// SUb depends on being scheduled in-between SUb and SUa. Successor edges
1642+
// do not need to be copied from SUa to SUb since no one will depend
1643+
// on stores.
1644+
// Notice that we don't need to care about the memory dependency as
1645+
// we won't try to cluster them if they have any memory dependency.
1646+
for (const SDep &Pred : SUb->Preds) {
1647+
if (Pred.getSUnit() == SUa)
1648+
continue;
1649+
LLVM_DEBUG(dbgs() << " Copy Pred SU(" << Pred.getSUnit()->NodeNum
1650+
<< ")\n");
1651+
DAG->addEdge(SUa, SDep(Pred.getSUnit(), SDep::Artificial));
1652+
}
16371653
}
16381654

16391655
LLVM_DEBUG(dbgs() << " Curr cluster length: " << ClusterLength

llvm/test/CodeGen/AArch64/aarch64-stp-cluster.ll

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -194,3 +194,22 @@ entry:
194194
store i64 %add6.3, i64* %arrayidx5.3, align 8
195195
ret void
196196
}
197+
198+
; Verify that SU(2) and SU(4) are the preds of SU(3)
199+
; CHECK: ********** MI Scheduling **********
200+
; CHECK-LABEL: stp_missing_preds_edges:%bb.0
201+
; CHECK:Cluster ld/st SU(3) - SU(5)
202+
; CHECK: Copy Pred SU(4)
203+
; CHECK: Copy Pred SU(2)
204+
; CHECK:SU(2): %0:gpr64common = COPY $x0
205+
; CHECK:SU(3): STRWui %1:gpr32, %0:gpr64common, 0
206+
; CHECK:SU(4): %3:gpr32common = nsw ADDWri %2:gpr32common, 5, 0
207+
; CHECK:SU(5): STRWui %3:gpr32common, %0:gpr64common, 1
208+
define void @stp_missing_preds_edges(i32* %p, i32 %m, i32 %n) {
209+
entry:
210+
store i32 %m, i32* %p, align 4
211+
%add = add nsw i32 %n, 5
212+
%arrayidx1 = getelementptr inbounds i32, i32* %p, i64 1
213+
store i32 %add, i32* %arrayidx1, align 4
214+
ret void
215+
}

llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement-stack-lower.ll

Lines changed: 483 additions & 474 deletions
Large diffs are not rendered by default.

llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement-stack-lower.ll

Lines changed: 50 additions & 50 deletions
Original file line numberDiff line numberDiff line change
@@ -25,15 +25,15 @@ define amdgpu_kernel void @v_insert_v64i32_varidx(<64 x i32> addrspace(1)* %out.
2525
; GCN-NEXT: v_mov_b32_e32 v6, s15
2626
; GCN-NEXT: v_mov_b32_e32 v8, s16
2727
; GCN-NEXT: v_mov_b32_e32 v10, s17
28+
; GCN-NEXT: v_mov_b32_e32 v12, s18
29+
; GCN-NEXT: v_mov_b32_e32 v14, s19
2830
; GCN-NEXT: s_movk_i32 s5, 0x60
2931
; GCN-NEXT: v_add_u32_e32 v2, 8, v0
3032
; GCN-NEXT: v_add_u32_e32 v3, 12, v0
3133
; GCN-NEXT: v_add_u32_e32 v7, 16, v0
3234
; GCN-NEXT: v_add_u32_e32 v9, 20, v0
3335
; GCN-NEXT: v_add_u32_e32 v11, 24, v0
34-
; GCN-NEXT: v_mov_b32_e32 v12, s18
3536
; GCN-NEXT: v_add_u32_e32 v13, 28, v0
36-
; GCN-NEXT: v_mov_b32_e32 v14, s19
3737
; GCN-NEXT: v_add_u32_e32 v15, 32, v0
3838
; GCN-NEXT: v_mov_b32_e32 v16, s20
3939
; GCN-NEXT: buffer_store_dword v4, v1, s[0:3], 0 offen
@@ -71,7 +71,7 @@ define amdgpu_kernel void @v_insert_v64i32_varidx(<64 x i32> addrspace(1)* %out.
7171
; GCN-NEXT: buffer_store_dword v28, v27, s[0:3], 0 offen
7272
; GCN-NEXT: buffer_store_dword v30, v29, s[0:3], 0 offen
7373
; GCN-NEXT: buffer_store_dword v32, v31, s[0:3], 0 offen
74-
; GCN-NEXT: s_movk_i32 s10, 0x70
74+
; GCN-NEXT: s_movk_i32 s13, 0x70
7575
; GCN-NEXT: v_add_u32_e32 v35, 0x48, v0
7676
; GCN-NEXT: v_mov_b32_e32 v36, s70
7777
; GCN-NEXT: v_add_u32_e32 v37, 0x4c, v0
@@ -96,19 +96,19 @@ define amdgpu_kernel void @v_insert_v64i32_varidx(<64 x i32> addrspace(1)* %out.
9696
; GCN-NEXT: v_add_u32_e32 v26, 0x64, v0
9797
; GCN-NEXT: v_mov_b32_e32 v14, s77
9898
; GCN-NEXT: v_mov_b32_e32 v4, s81
99-
; GCN-NEXT: s_movk_i32 s11, 0x90
100-
; GCN-NEXT: s_movk_i32 s13, 0xa0
99+
; GCN-NEXT: s_movk_i32 s14, 0x90
100+
; GCN-NEXT: s_movk_i32 s15, 0xa0
101101
; GCN-NEXT: v_add_u32_e32 v28, 0x68, v0
102102
; GCN-NEXT: v_mov_b32_e32 v16, s78
103103
; GCN-NEXT: v_add_u32_e32 v30, 0x6c, v0
104104
; GCN-NEXT: v_mov_b32_e32 v18, s79
105+
; GCN-NEXT: v_add_u32_e32 v32, s13, v0
105106
; GCN-NEXT: v_mov_b32_e32 v20, s80
106-
; GCN-NEXT: v_mov_b32_e32 v5, s82
107-
; GCN-NEXT: v_mov_b32_e32 v6, s83
108-
; GCN-NEXT: v_add_u32_e32 v32, s10, v0
109107
; GCN-NEXT: v_add_u32_e32 v34, 0x74, v0
110108
; GCN-NEXT: v_add_u32_e32 v36, 0x78, v0
109+
; GCN-NEXT: v_mov_b32_e32 v5, s82
111110
; GCN-NEXT: v_add_u32_e32 v43, 0x7c, v0
111+
; GCN-NEXT: v_mov_b32_e32 v6, s83
112112
; GCN-NEXT: v_add_u32_e32 v44, 0x80, v0
113113
; GCN-NEXT: v_mov_b32_e32 v8, s52
114114
; GCN-NEXT: buffer_store_dword v14, v26, s[0:3], 0 offen
@@ -121,20 +121,20 @@ define amdgpu_kernel void @v_insert_v64i32_varidx(<64 x i32> addrspace(1)* %out.
121121
; GCN-NEXT: buffer_store_dword v8, v44, s[0:3], 0 offen
122122
; GCN-NEXT: v_add_u32_e32 v45, 0x84, v0
123123
; GCN-NEXT: v_mov_b32_e32 v4, s53
124-
; GCN-NEXT: s_movk_i32 s14, 0xb0
124+
; GCN-NEXT: s_movk_i32 s16, 0xb0
125125
; GCN-NEXT: v_add_u32_e32 v46, 0x88, v0
126126
; GCN-NEXT: v_mov_b32_e32 v5, s54
127127
; GCN-NEXT: v_add_u32_e32 v47, 0x8c, v0
128128
; GCN-NEXT: v_mov_b32_e32 v6, s55
129-
; GCN-NEXT: v_add_u32_e32 v48, s11, v0
129+
; GCN-NEXT: v_add_u32_e32 v48, s14, v0
130130
; GCN-NEXT: v_mov_b32_e32 v8, s56
131131
; GCN-NEXT: v_add_u32_e32 v49, 0x94, v0
132132
; GCN-NEXT: v_mov_b32_e32 v10, s57
133133
; GCN-NEXT: v_add_u32_e32 v50, 0x98, v0
134134
; GCN-NEXT: v_mov_b32_e32 v12, s58
135135
; GCN-NEXT: v_add_u32_e32 v51, 0x9c, v0
136136
; GCN-NEXT: v_mov_b32_e32 v14, s59
137-
; GCN-NEXT: v_add_u32_e32 v52, s13, v0
137+
; GCN-NEXT: v_add_u32_e32 v52, s15, v0
138138
; GCN-NEXT: v_mov_b32_e32 v16, s60
139139
; GCN-NEXT: buffer_store_dword v4, v45, s[0:3], 0 offen
140140
; GCN-NEXT: buffer_store_dword v5, v46, s[0:3], 0 offen
@@ -146,13 +146,13 @@ define amdgpu_kernel void @v_insert_v64i32_varidx(<64 x i32> addrspace(1)* %out.
146146
; GCN-NEXT: buffer_store_dword v16, v52, s[0:3], 0 offen
147147
; GCN-NEXT: v_add_u32_e32 v53, 0xa4, v0
148148
; GCN-NEXT: v_mov_b32_e32 v4, s61
149-
; GCN-NEXT: s_movk_i32 s15, 0xd0
150-
; GCN-NEXT: s_movk_i32 s16, 0xe0
149+
; GCN-NEXT: s_movk_i32 s17, 0xd0
150+
; GCN-NEXT: s_movk_i32 s18, 0xe0
151151
; GCN-NEXT: v_add_u32_e32 v54, 0xa8, v0
152152
; GCN-NEXT: v_mov_b32_e32 v5, s62
153153
; GCN-NEXT: v_add_u32_e32 v55, 0xac, v0
154154
; GCN-NEXT: v_mov_b32_e32 v6, s63
155-
; GCN-NEXT: v_add_u32_e32 v56, s14, v0
155+
; GCN-NEXT: v_add_u32_e32 v56, s16, v0
156156
; GCN-NEXT: v_mov_b32_e32 v8, s64
157157
; GCN-NEXT: v_add_u32_e32 v57, 0xb4, v0
158158
; GCN-NEXT: v_mov_b32_e32 v10, s65
@@ -173,20 +173,20 @@ define amdgpu_kernel void @v_insert_v64i32_varidx(<64 x i32> addrspace(1)* %out.
173173
; GCN-NEXT: v_add_u32_e32 v61, 0xc4, v0
174174
; GCN-NEXT: v_mov_b32_e32 v4, s37
175175
; GCN-NEXT: s_and_b32 s7, s7, 63
176-
; GCN-NEXT: s_movk_i32 s17, 0xf0
176+
; GCN-NEXT: s_movk_i32 s19, 0xf0
177177
; GCN-NEXT: v_add_u32_e32 v62, 0xc8, v0
178178
; GCN-NEXT: v_mov_b32_e32 v5, s38
179179
; GCN-NEXT: v_add_u32_e32 v63, 0xcc, v0
180180
; GCN-NEXT: v_mov_b32_e32 v6, s39
181-
; GCN-NEXT: v_add_u32_e32 v64, s15, v0
181+
; GCN-NEXT: v_add_u32_e32 v64, s17, v0
182182
; GCN-NEXT: v_mov_b32_e32 v8, s40
183183
; GCN-NEXT: v_add_u32_e32 v65, 0xd4, v0
184184
; GCN-NEXT: v_mov_b32_e32 v10, s41
185185
; GCN-NEXT: v_add_u32_e32 v66, 0xd8, v0
186186
; GCN-NEXT: v_mov_b32_e32 v12, s42
187187
; GCN-NEXT: v_add_u32_e32 v67, 0xdc, v0
188188
; GCN-NEXT: v_mov_b32_e32 v14, s43
189-
; GCN-NEXT: v_add_u32_e32 v68, s16, v0
189+
; GCN-NEXT: v_add_u32_e32 v68, s18, v0
190190
; GCN-NEXT: v_mov_b32_e32 v16, s44
191191
; GCN-NEXT: buffer_store_dword v4, v61, s[0:3], 0 offen
192192
; GCN-NEXT: buffer_store_dword v5, v62, s[0:3], 0 offen
@@ -202,7 +202,7 @@ define amdgpu_kernel void @v_insert_v64i32_varidx(<64 x i32> addrspace(1)* %out.
202202
; GCN-NEXT: v_mov_b32_e32 v5, s46
203203
; GCN-NEXT: v_add_u32_e32 v71, 0xec, v0
204204
; GCN-NEXT: v_mov_b32_e32 v6, s47
205-
; GCN-NEXT: v_add_u32_e32 v72, s17, v0
205+
; GCN-NEXT: v_add_u32_e32 v72, s19, v0
206206
; GCN-NEXT: v_mov_b32_e32 v8, s48
207207
; GCN-NEXT: v_add_u32_e32 v73, 0xf4, v0
208208
; GCN-NEXT: v_mov_b32_e32 v10, s49
@@ -217,9 +217,9 @@ define amdgpu_kernel void @v_insert_v64i32_varidx(<64 x i32> addrspace(1)* %out.
217217
; GCN-NEXT: v_mov_b32_e32 v4, s12
218218
; GCN-NEXT: s_lshl_b32 s7, s7, 2
219219
; GCN-NEXT: v_add_u32_e32 v75, 0xfc, v0
220-
; GCN-NEXT: v_mov_b32_e32 v5, s51
220+
; GCN-NEXT: v_mov_b32_e32 v14, s51
221221
; GCN-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:256
222-
; GCN-NEXT: buffer_store_dword v5, v75, s[0:3], 0 offen
222+
; GCN-NEXT: buffer_store_dword v14, v75, s[0:3], 0 offen
223223
; GCN-NEXT: v_mov_b32_e32 v4, s6
224224
; GCN-NEXT: v_add_u32_e32 v0, s7, v0
225225
; GCN-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen
@@ -289,78 +289,78 @@ define amdgpu_kernel void @v_insert_v64i32_varidx(<64 x i32> addrspace(1)* %out.
289289
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:256
290290
; GCN-NEXT: s_add_u32 s6, s8, 16
291291
; GCN-NEXT: s_addc_u32 s7, s9, 0
292-
; GCN-NEXT: v_mov_b32_e32 v65, s9
293292
; GCN-NEXT: v_mov_b32_e32 v67, s7
294293
; GCN-NEXT: v_mov_b32_e32 v66, s6
295294
; GCN-NEXT: s_add_u32 s6, s8, 32
296-
; GCN-NEXT: v_mov_b32_e32 v64, s8
297295
; GCN-NEXT: s_addc_u32 s7, s9, 0
296+
; GCN-NEXT: v_mov_b32_e32 v65, s9
297+
; GCN-NEXT: s_add_u32 s10, s8, 48
298+
; GCN-NEXT: v_mov_b32_e32 v64, s8
299+
; GCN-NEXT: s_addc_u32 s11, s9, 0
298300
; GCN-NEXT: s_waitcnt vmcnt(0)
299301
; GCN-NEXT: global_store_dwordx4 v[64:65], v[0:3], off
300302
; GCN-NEXT: global_store_dwordx4 v[66:67], v[4:7], off
301303
; GCN-NEXT: v_mov_b32_e32 v0, s6
302304
; GCN-NEXT: v_mov_b32_e32 v1, s7
303-
; GCN-NEXT: s_add_u32 s6, s8, 48
304-
; GCN-NEXT: s_addc_u32 s7, s9, 0
305-
; GCN-NEXT: v_mov_b32_e32 v2, s6
306-
; GCN-NEXT: v_mov_b32_e32 v3, s7
307305
; GCN-NEXT: s_add_u32 s6, s8, 64
306+
; GCN-NEXT: v_mov_b32_e32 v2, s10
307+
; GCN-NEXT: s_addc_u32 s7, s9, 0
308+
; GCN-NEXT: v_mov_b32_e32 v3, s11
309+
; GCN-NEXT: s_add_u32 s10, s8, s4
310+
; GCN-NEXT: s_addc_u32 s11, s9, 0
311+
; GCN-NEXT: s_add_u32 s4, s8, s5
308312
; GCN-NEXT: global_store_dwordx4 v[0:1], v[8:11], off
309313
; GCN-NEXT: global_store_dwordx4 v[2:3], v[12:15], off
310-
; GCN-NEXT: s_addc_u32 s7, s9, 0
311314
; GCN-NEXT: v_mov_b32_e32 v0, s6
315+
; GCN-NEXT: s_addc_u32 s5, s9, 0
312316
; GCN-NEXT: v_mov_b32_e32 v1, s7
313-
; GCN-NEXT: s_add_u32 s6, s8, s4
314-
; GCN-NEXT: s_addc_u32 s7, s9, 0
315-
; GCN-NEXT: s_add_u32 s4, s8, s5
316-
; GCN-NEXT: v_mov_b32_e32 v2, s6
317-
; GCN-NEXT: v_mov_b32_e32 v3, s7
317+
; GCN-NEXT: s_add_u32 s6, s8, s13
318+
; GCN-NEXT: v_mov_b32_e32 v2, s10
319+
; GCN-NEXT: v_mov_b32_e32 v3, s11
318320
; GCN-NEXT: global_store_dwordx4 v[0:1], v[16:19], off
319321
; GCN-NEXT: global_store_dwordx4 v[2:3], v[20:23], off
320-
; GCN-NEXT: s_addc_u32 s5, s9, 0
322+
; GCN-NEXT: s_addc_u32 s7, s9, 0
321323
; GCN-NEXT: v_mov_b32_e32 v0, s4
322324
; GCN-NEXT: v_mov_b32_e32 v1, s5
323-
; GCN-NEXT: s_add_u32 s4, s8, s10
324-
; GCN-NEXT: s_addc_u32 s5, s9, 0
325-
; GCN-NEXT: v_mov_b32_e32 v2, s4
326-
; GCN-NEXT: v_mov_b32_e32 v3, s5
327325
; GCN-NEXT: s_add_u32 s4, s8, 0x80
326+
; GCN-NEXT: v_mov_b32_e32 v2, s6
327+
; GCN-NEXT: s_addc_u32 s5, s9, 0
328+
; GCN-NEXT: v_mov_b32_e32 v3, s7
329+
; GCN-NEXT: s_add_u32 s6, s8, s14
328330
; GCN-NEXT: global_store_dwordx4 v[0:1], v[24:27], off
329331
; GCN-NEXT: global_store_dwordx4 v[2:3], v[28:31], off
330-
; GCN-NEXT: s_addc_u32 s5, s9, 0
332+
; GCN-NEXT: s_addc_u32 s7, s9, 0
331333
; GCN-NEXT: v_mov_b32_e32 v0, s4
332334
; GCN-NEXT: v_mov_b32_e32 v1, s5
333-
; GCN-NEXT: s_add_u32 s4, s8, s11
335+
; GCN-NEXT: s_add_u32 s4, s8, s15
336+
; GCN-NEXT: v_mov_b32_e32 v2, s6
334337
; GCN-NEXT: s_addc_u32 s5, s9, 0
335-
; GCN-NEXT: v_mov_b32_e32 v2, s4
336-
; GCN-NEXT: v_mov_b32_e32 v3, s5
337-
; GCN-NEXT: s_add_u32 s4, s8, s13
338+
; GCN-NEXT: v_mov_b32_e32 v3, s7
339+
; GCN-NEXT: s_add_u32 s6, s8, s16
338340
; GCN-NEXT: global_store_dwordx4 v[0:1], v[32:35], off
339341
; GCN-NEXT: global_store_dwordx4 v[2:3], v[36:39], off
340-
; GCN-NEXT: s_addc_u32 s5, s9, 0
342+
; GCN-NEXT: s_addc_u32 s7, s9, 0
341343
; GCN-NEXT: v_mov_b32_e32 v0, s4
342344
; GCN-NEXT: v_mov_b32_e32 v1, s5
343-
; GCN-NEXT: s_add_u32 s4, s8, s14
344-
; GCN-NEXT: s_addc_u32 s5, s9, 0
345-
; GCN-NEXT: v_mov_b32_e32 v2, s4
346-
; GCN-NEXT: v_mov_b32_e32 v3, s5
347345
; GCN-NEXT: s_add_u32 s4, s8, 0xc0
346+
; GCN-NEXT: v_mov_b32_e32 v2, s6
347+
; GCN-NEXT: v_mov_b32_e32 v3, s7
348348
; GCN-NEXT: global_store_dwordx4 v[0:1], v[40:43], off
349349
; GCN-NEXT: global_store_dwordx4 v[2:3], v[44:47], off
350350
; GCN-NEXT: s_addc_u32 s5, s9, 0
351351
; GCN-NEXT: v_mov_b32_e32 v0, s4
352352
; GCN-NEXT: v_mov_b32_e32 v1, s5
353-
; GCN-NEXT: s_add_u32 s4, s8, s15
353+
; GCN-NEXT: s_add_u32 s4, s8, s17
354354
; GCN-NEXT: s_addc_u32 s5, s9, 0
355355
; GCN-NEXT: v_mov_b32_e32 v2, s4
356356
; GCN-NEXT: v_mov_b32_e32 v3, s5
357-
; GCN-NEXT: s_add_u32 s4, s8, s16
357+
; GCN-NEXT: s_add_u32 s4, s8, s18
358358
; GCN-NEXT: global_store_dwordx4 v[0:1], v[48:51], off
359359
; GCN-NEXT: global_store_dwordx4 v[2:3], v[52:55], off
360360
; GCN-NEXT: s_addc_u32 s5, s9, 0
361361
; GCN-NEXT: v_mov_b32_e32 v0, s4
362362
; GCN-NEXT: v_mov_b32_e32 v1, s5
363-
; GCN-NEXT: s_add_u32 s4, s8, s17
363+
; GCN-NEXT: s_add_u32 s4, s8, s19
364364
; GCN-NEXT: s_addc_u32 s5, s9, 0
365365
; GCN-NEXT: v_mov_b32_e32 v2, s4
366366
; GCN-NEXT: v_mov_b32_e32 v3, s5

llvm/test/CodeGen/AMDGPU/call-argument-types.ll

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -744,13 +744,13 @@ entry:
744744

745745
; GCN-LABEL: {{^}}tail_call_byval_align16:
746746
; GCN-NOT: s32
747-
; GCN: buffer_load_dword v32, off, s[0:3], s32 offset:12
748-
; GCN: buffer_load_dword v33, off, s[0:3], s32 offset:8
747+
; GCN: buffer_load_dword v32, off, s[0:3], s32 offset:8
748+
; GCN: buffer_load_dword v33, off, s[0:3], s32 offset:12
749749

750750
; GCN: s_getpc_b64
751751

752-
; GCN: buffer_store_dword v32, off, s[0:3], s32 offset:4
753-
; GCN: buffer_store_dword v33, off, s[0:3], s32{{$}}
752+
; GCN: buffer_store_dword v33, off, s[0:3], s32 offset:4
753+
; GCN: buffer_store_dword v32, off, s[0:3], s32{{$}}
754754
; GCN-NOT: s32
755755
; GCN: s_setpc_b64
756756
define void @tail_call_byval_align16(<32 x i32> %val, double %tmp) #0 {

llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll

Lines changed: 3 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -624,11 +624,10 @@ define void @too_many_args_use_workitem_id_x_byval(
624624

625625

626626
; FIXEDABI: v_mov_b32_e32 [[K0:v[0-9]+]], 0x3e7
627-
; FIXEDABI: buffer_store_dword [[K0]], off, s[0:3], 0 offset:4{{$}}
628-
629627
; FIXEDABI: s_movk_i32 s32, 0x400{{$}}
630-
631628
; FIXEDABI: v_mov_b32_e32 [[K1:v[0-9]+]], 0x140
629+
; FIXEDABI: buffer_store_dword [[K0]], off, s[0:3], 0 offset:4{{$}}
630+
632631
; FIXEDABI: buffer_store_dword [[K1]], off, s[0:3], s32{{$}}
633632

634633
; FIXME: Why this reload?
@@ -670,9 +669,8 @@ define amdgpu_kernel void @kern_call_too_many_args_use_workitem_id_x_byval() #1
670669

671670
; FIXED-ABI-NOT: v31
672671
; FIXEDABI: v_mov_b32_e32 [[K0:v[0-9]+]], 0x3e7{{$}}
673-
; FIXEDABI: buffer_store_dword [[K0]], off, s[0:3], s33{{$}}
674-
675672
; FIXEDABI: v_mov_b32_e32 [[K1:v[0-9]+]], 0x140{{$}}
673+
; FIXEDABI: buffer_store_dword [[K0]], off, s[0:3], s33{{$}}
676674
; FIXEDABI: buffer_store_dword [[K1]], off, s[0:3], s32{{$}}
677675
; FIXEDABI: buffer_load_dword [[RELOAD_BYVAL:v[0-9]+]], off, s[0:3], s33{{$}}
678676

llvm/test/CodeGen/AMDGPU/fshr.ll

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1364,11 +1364,11 @@ define <2 x i24> @v_fshr_v2i24(<2 x i24> %src0, <2 x i24> %src1, <2 x i24> %src2
13641364
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
13651365
; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
13661366
; GFX9-NEXT: v_lshrrev_b32_e32 v3, 8, v2
1367-
; GFX9-NEXT: buffer_store_byte_d16_hi v2, v0, s[0:3], 0 offen offset:5
1368-
; GFX9-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:4
13691367
; GFX9-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:3
13701368
; GFX9-NEXT: buffer_store_byte_d16_hi v1, v0, s[0:3], 0 offen offset:2
13711369
; GFX9-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen
1370+
; GFX9-NEXT: buffer_store_byte_d16_hi v2, v0, s[0:3], 0 offen offset:5
1371+
; GFX9-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:4
13721372
; GFX9-NEXT: s_waitcnt vmcnt(0)
13731373
; GFX9-NEXT: s_setpc_b64 s[30:31]
13741374
;

llvm/test/CodeGen/AMDGPU/half.ll

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -312,6 +312,7 @@ define amdgpu_kernel void @global_extload_v8f16_to_v8f32(<8 x float> addrspace(1
312312
; SI: v_cvt_f32_f16_e32
313313
; SI: v_cvt_f32_f16_e32
314314
; SI: v_cvt_f32_f16_e32
315+
; SI: v_cvt_f32_f16_e32
315316

316317
; GCN: flat_store_dwordx4
317318

@@ -325,7 +326,6 @@ define amdgpu_kernel void @global_extload_v8f16_to_v8f32(<8 x float> addrspace(1
325326
; SI: v_cvt_f32_f16_e32
326327
; SI: v_cvt_f32_f16_e32
327328
; SI: v_cvt_f32_f16_e32
328-
; SI: v_cvt_f32_f16_e32
329329

330330
; VI: v_cvt_f32_f16_e32
331331
; VI: v_cvt_f32_f16_sdwa

0 commit comments

Comments
 (0)