Skip to content

Commit ff9c041

Browse files
authored
[MachineScheduler] Fix physreg dependencies of ExitSU (#123541)
Providing the correct operand index allows addPhysRegDataDeps to compute the correct latency. Pull Request: #123541
1 parent 1533682 commit ff9c041

File tree

58 files changed

+1304
-1231
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

58 files changed

+1304
-1231
lines changed

llvm/lib/CodeGen/ScheduleDAGInstrs.cpp

Lines changed: 14 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -209,13 +209,25 @@ void ScheduleDAGInstrs::addSchedBarrierDeps() {
209209
ExitSU.setInstr(ExitMI);
210210
// Add dependencies on the defs and uses of the instruction.
211211
if (ExitMI) {
212+
const MCInstrDesc &MIDesc = ExitMI->getDesc();
212213
for (const MachineOperand &MO : ExitMI->all_uses()) {
214+
unsigned OpIdx = MO.getOperandNo();
213215
Register Reg = MO.getReg();
214216
if (Reg.isPhysical()) {
217+
// addPhysRegDataDeps uses the provided operand index to retrieve
218+
// the operand use cycle from the scheduling model. If the operand
219+
// is "fake" (e.g., an operand of a call instruction used to pass
220+
// an argument to the called function), the scheduling model may not
221+
// have an entry for it. If this is the case, pass -1 as operand index,
222+
// which will cause addPhysRegDataDeps to add an artificial dependency.
223+
// FIXME: Using hasImplicitUseOfPhysReg here is inaccurate as it misses
224+
// aliases. When fixing, make sure to update addPhysRegDataDeps, too.
225+
bool IsRealUse = OpIdx < MIDesc.getNumOperands() ||
226+
MIDesc.hasImplicitUseOfPhysReg(Reg);
215227
for (MCRegUnit Unit : TRI->regunits(Reg))
216-
Uses.insert(PhysRegSUOper(&ExitSU, -1, Unit));
228+
Uses.insert(PhysRegSUOper(&ExitSU, IsRealUse ? OpIdx : -1, Unit));
217229
} else if (Reg.isVirtual() && MO.readsReg()) {
218-
addVRegUseDeps(&ExitSU, MO.getOperandNo());
230+
addVRegUseDeps(&ExitSU, OpIdx);
219231
}
220232
}
221233
}

llvm/test/CodeGen/AMDGPU/GlobalISel/assert-align.ll

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -15,12 +15,12 @@ define ptr addrspace(1) @call_assert_align() {
1515
; CHECK-NEXT: v_writelane_b32 v40, s16, 2
1616
; CHECK-NEXT: s_addk_i32 s32, 0x400
1717
; CHECK-NEXT: v_writelane_b32 v40, s30, 0
18-
; CHECK-NEXT: v_mov_b32_e32 v0, 0
19-
; CHECK-NEXT: v_mov_b32_e32 v1, 0
20-
; CHECK-NEXT: v_writelane_b32 v40, s31, 1
2118
; CHECK-NEXT: s_getpc_b64 s[16:17]
2219
; CHECK-NEXT: s_add_u32 s16, s16, ext@rel32@lo+4
2320
; CHECK-NEXT: s_addc_u32 s17, s17, ext@rel32@hi+12
21+
; CHECK-NEXT: v_mov_b32_e32 v0, 0
22+
; CHECK-NEXT: v_mov_b32_e32 v1, 0
23+
; CHECK-NEXT: v_writelane_b32 v40, s31, 1
2424
; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17]
2525
; CHECK-NEXT: v_mov_b32_e32 v2, 0
2626
; CHECK-NEXT: global_store_dword v[0:1], v2, off
@@ -45,11 +45,11 @@ define ptr addrspace(1) @tail_call_assert_align() {
4545
; CHECK-LABEL: tail_call_assert_align:
4646
; CHECK: ; %bb.0: ; %entry
4747
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
48-
; CHECK-NEXT: v_mov_b32_e32 v0, 0
49-
; CHECK-NEXT: v_mov_b32_e32 v1, 0
5048
; CHECK-NEXT: s_getpc_b64 s[16:17]
5149
; CHECK-NEXT: s_add_u32 s16, s16, ext@rel32@lo+4
5250
; CHECK-NEXT: s_addc_u32 s17, s17, ext@rel32@hi+12
51+
; CHECK-NEXT: v_mov_b32_e32 v0, 0
52+
; CHECK-NEXT: v_mov_b32_e32 v1, 0
5353
; CHECK-NEXT: s_setpc_b64 s[16:17]
5454
entry:
5555
%call = tail call align 4 ptr addrspace(1) @ext(ptr addrspace(1) null)

llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -44,8 +44,8 @@ define amdgpu_kernel void @kernel_caller_stack() {
4444
; FLATSCR-NEXT: scratch_store_dword off, v0, s0
4545
; FLATSCR-NEXT: s_add_u32 s0, s32, 12
4646
; FLATSCR-NEXT: v_mov_b32_e32 v0, 11
47-
; FLATSCR-NEXT: scratch_store_dword off, v0, s0
4847
; FLATSCR-NEXT: s_add_u32 s2, s32, 16
48+
; FLATSCR-NEXT: scratch_store_dword off, v0, s0
4949
; FLATSCR-NEXT: v_mov_b32_e32 v0, 12
5050
; FLATSCR-NEXT: s_getpc_b64 s[0:1]
5151
; FLATSCR-NEXT: s_add_u32 s0, s0, external_void_func_v16i32_v16i32_v4i32@rel32@lo+4
@@ -239,11 +239,11 @@ define void @func_caller_stack() {
239239
; MUBUF-NEXT: v_writelane_b32 v40, s30, 0
240240
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:12
241241
; MUBUF-NEXT: v_mov_b32_e32 v0, 12
242-
; MUBUF-NEXT: v_writelane_b32 v40, s31, 1
243-
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:16
244242
; MUBUF-NEXT: s_getpc_b64 s[4:5]
245243
; MUBUF-NEXT: s_add_u32 s4, s4, external_void_func_v16i32_v16i32_v4i32@rel32@lo+4
246244
; MUBUF-NEXT: s_addc_u32 s5, s5, external_void_func_v16i32_v16i32_v4i32@rel32@hi+12
245+
; MUBUF-NEXT: v_writelane_b32 v40, s31, 1
246+
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:16
247247
; MUBUF-NEXT: s_swappc_b64 s[30:31], s[4:5]
248248
; MUBUF-NEXT: v_readlane_b32 s31, v40, 1
249249
; MUBUF-NEXT: v_readlane_b32 s30, v40, 0
@@ -274,15 +274,15 @@ define void @func_caller_stack() {
274274
; FLATSCR-NEXT: scratch_store_dword off, v0, s0
275275
; FLATSCR-NEXT: s_add_u32 s0, s32, 12
276276
; FLATSCR-NEXT: v_mov_b32_e32 v0, 11
277-
; FLATSCR-NEXT: v_writelane_b32 v40, s30, 0
278277
; FLATSCR-NEXT: scratch_store_dword off, v0, s0
279278
; FLATSCR-NEXT: s_add_u32 s0, s32, 16
280279
; FLATSCR-NEXT: v_mov_b32_e32 v0, 12
281-
; FLATSCR-NEXT: v_writelane_b32 v40, s31, 1
280+
; FLATSCR-NEXT: v_writelane_b32 v40, s30, 0
282281
; FLATSCR-NEXT: scratch_store_dword off, v0, s0
283282
; FLATSCR-NEXT: s_getpc_b64 s[0:1]
284283
; FLATSCR-NEXT: s_add_u32 s0, s0, external_void_func_v16i32_v16i32_v4i32@rel32@lo+4
285284
; FLATSCR-NEXT: s_addc_u32 s1, s1, external_void_func_v16i32_v16i32_v4i32@rel32@hi+12
285+
; FLATSCR-NEXT: v_writelane_b32 v40, s31, 1
286286
; FLATSCR-NEXT: s_swappc_b64 s[30:31], s[0:1]
287287
; FLATSCR-NEXT: v_readlane_b32 s31, v40, 1
288288
; FLATSCR-NEXT: v_readlane_b32 s30, v40, 0
@@ -312,10 +312,10 @@ define void @func_caller_byval(ptr addrspace(5) %argptr) {
312312
; MUBUF-NEXT: s_addk_i32 s32, 0x400
313313
; MUBUF-NEXT: v_writelane_b32 v40, s4, 2
314314
; MUBUF-NEXT: v_writelane_b32 v40, s30, 0
315-
; MUBUF-NEXT: v_writelane_b32 v40, s31, 1
316315
; MUBUF-NEXT: s_getpc_b64 s[4:5]
317316
; MUBUF-NEXT: s_add_u32 s4, s4, external_void_func_byval@rel32@lo+4
318317
; MUBUF-NEXT: s_addc_u32 s5, s5, external_void_func_byval@rel32@hi+12
318+
; MUBUF-NEXT: v_writelane_b32 v40, s31, 1
319319
; MUBUF-NEXT: s_waitcnt vmcnt(1)
320320
; MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], s32
321321
; MUBUF-NEXT: s_waitcnt vmcnt(1)
@@ -394,8 +394,8 @@ define void @func_caller_byval(ptr addrspace(5) %argptr) {
394394
; FLATSCR-NEXT: v_add_u32_e32 v3, 8, v0
395395
; FLATSCR-NEXT: v_writelane_b32 v40, s0, 2
396396
; FLATSCR-NEXT: s_add_u32 s0, s32, 8
397-
; FLATSCR-NEXT: v_writelane_b32 v40, s30, 0
398397
; FLATSCR-NEXT: s_add_u32 s2, s32, 56
398+
; FLATSCR-NEXT: v_writelane_b32 v40, s30, 0
399399
; FLATSCR-NEXT: v_writelane_b32 v40, s31, 1
400400
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
401401
; FLATSCR-NEXT: scratch_store_dwordx2 off, v[1:2], s32

llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-used-outside-loop.ll

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -191,9 +191,9 @@ define void @divergent_i1_xor_used_outside_loop_larger_loop_body(i32 %num.elts,
191191
; GFX10-LABEL: divergent_i1_xor_used_outside_loop_larger_loop_body:
192192
; GFX10: ; %bb.0: ; %entry
193193
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
194+
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
194195
; GFX10-NEXT: s_mov_b32 s5, 0
195196
; GFX10-NEXT: s_mov_b32 s6, -1
196-
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
197197
; GFX10-NEXT: s_and_saveexec_b32 s4, vcc_lo
198198
; GFX10-NEXT: s_cbranch_execz .LBB3_6
199199
; GFX10-NEXT: ; %bb.1: ; %loop.start.preheader

llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-mui.ll

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -387,8 +387,8 @@ define amdgpu_ps void @and_i1_scc(i32 inreg %a, i32 inreg %b, ptr addrspace(1) %
387387
define amdgpu_ps void @divergent_phi_with_uniform_inputs(i32 %a, ptr addrspace(1) %out) {
388388
; OLD_RBS-LABEL: divergent_phi_with_uniform_inputs:
389389
; OLD_RBS: ; %bb.0: ; %A
390-
; OLD_RBS-NEXT: s_mov_b32 s0, 0
391390
; OLD_RBS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
391+
; OLD_RBS-NEXT: s_mov_b32 s0, 0
392392
; OLD_RBS-NEXT: s_and_saveexec_b32 s1, vcc_lo
393393
; OLD_RBS-NEXT: ; %bb.1: ; %B
394394
; OLD_RBS-NEXT: s_mov_b32 s0, 1

llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll

Lines changed: 13 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -25,10 +25,10 @@ define void @parent_func_missing_inputs() #0 {
2525
; FIXEDABI-NEXT: v_writelane_b32 v40, s16, 2
2626
; FIXEDABI-NEXT: s_addk_i32 s32, 0x400
2727
; FIXEDABI-NEXT: v_writelane_b32 v40, s30, 0
28-
; FIXEDABI-NEXT: v_writelane_b32 v40, s31, 1
2928
; FIXEDABI-NEXT: s_getpc_b64 s[16:17]
3029
; FIXEDABI-NEXT: s_add_u32 s16, s16, requires_all_inputs@rel32@lo+4
3130
; FIXEDABI-NEXT: s_addc_u32 s17, s17, requires_all_inputs@rel32@hi+12
31+
; FIXEDABI-NEXT: v_writelane_b32 v40, s31, 1
3232
; FIXEDABI-NEXT: s_swappc_b64 s[30:31], s[16:17]
3333
; FIXEDABI-NEXT: v_readlane_b32 s31, v40, 1
3434
; FIXEDABI-NEXT: v_readlane_b32 s30, v40, 0
@@ -49,43 +49,43 @@ define amdgpu_kernel void @parent_kernel_missing_inputs() #0 {
4949
; FIXEDABI-SDAG: ; %bb.0:
5050
; FIXEDABI-SDAG-NEXT: s_add_i32 s4, s4, s9
5151
; FIXEDABI-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8
52-
; FIXEDABI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 10, v1
5352
; FIXEDABI-SDAG-NEXT: s_add_u32 s0, s0, s9
53+
; FIXEDABI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 10, v1
54+
; FIXEDABI-SDAG-NEXT: s_addc_u32 s1, s1, 0
5455
; FIXEDABI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 20, v2
5556
; FIXEDABI-SDAG-NEXT: v_or_b32_e32 v0, v0, v1
56-
; FIXEDABI-SDAG-NEXT: s_addc_u32 s1, s1, 0
57+
; FIXEDABI-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s5
5758
; FIXEDABI-SDAG-NEXT: s_mov_b32 s14, s8
59+
; FIXEDABI-SDAG-NEXT: s_getpc_b64 s[4:5]
60+
; FIXEDABI-SDAG-NEXT: s_add_u32 s4, s4, requires_all_inputs@rel32@lo+4
61+
; FIXEDABI-SDAG-NEXT: s_addc_u32 s5, s5, requires_all_inputs@rel32@hi+12
5862
; FIXEDABI-SDAG-NEXT: v_or_b32_e32 v31, v0, v2
5963
; FIXEDABI-SDAG-NEXT: s_mov_b64 s[8:9], 0
6064
; FIXEDABI-SDAG-NEXT: s_mov_b32 s12, s6
6165
; FIXEDABI-SDAG-NEXT: s_mov_b32 s13, s7
6266
; FIXEDABI-SDAG-NEXT: s_mov_b32 s32, 0
63-
; FIXEDABI-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s5
64-
; FIXEDABI-SDAG-NEXT: s_getpc_b64 s[4:5]
65-
; FIXEDABI-SDAG-NEXT: s_add_u32 s4, s4, requires_all_inputs@rel32@lo+4
66-
; FIXEDABI-SDAG-NEXT: s_addc_u32 s5, s5, requires_all_inputs@rel32@hi+12
6767
; FIXEDABI-SDAG-NEXT: s_swappc_b64 s[30:31], s[4:5]
6868
; FIXEDABI-SDAG-NEXT: s_endpgm
6969
;
7070
; FIXEDABI-GISEL-LABEL: parent_kernel_missing_inputs:
7171
; FIXEDABI-GISEL: ; %bb.0:
7272
; FIXEDABI-GISEL-NEXT: s_add_i32 s4, s4, s9
7373
; FIXEDABI-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8
74-
; FIXEDABI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 10, v1
7574
; FIXEDABI-GISEL-NEXT: s_add_u32 s0, s0, s9
75+
; FIXEDABI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 10, v1
76+
; FIXEDABI-GISEL-NEXT: s_addc_u32 s1, s1, 0
7677
; FIXEDABI-GISEL-NEXT: v_or_b32_e32 v0, v0, v1
7778
; FIXEDABI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 20, v2
78-
; FIXEDABI-GISEL-NEXT: s_addc_u32 s1, s1, 0
79+
; FIXEDABI-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s5
7980
; FIXEDABI-GISEL-NEXT: s_mov_b32 s14, s8
8081
; FIXEDABI-GISEL-NEXT: v_or_b32_e32 v31, v0, v1
82+
; FIXEDABI-GISEL-NEXT: s_getpc_b64 s[4:5]
83+
; FIXEDABI-GISEL-NEXT: s_add_u32 s4, s4, requires_all_inputs@rel32@lo+4
84+
; FIXEDABI-GISEL-NEXT: s_addc_u32 s5, s5, requires_all_inputs@rel32@hi+12
8185
; FIXEDABI-GISEL-NEXT: s_mov_b64 s[8:9], 0
8286
; FIXEDABI-GISEL-NEXT: s_mov_b32 s12, s6
8387
; FIXEDABI-GISEL-NEXT: s_mov_b32 s13, s7
8488
; FIXEDABI-GISEL-NEXT: s_mov_b32 s32, 0
85-
; FIXEDABI-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s5
86-
; FIXEDABI-GISEL-NEXT: s_getpc_b64 s[4:5]
87-
; FIXEDABI-GISEL-NEXT: s_add_u32 s4, s4, requires_all_inputs@rel32@lo+4
88-
; FIXEDABI-GISEL-NEXT: s_addc_u32 s5, s5, requires_all_inputs@rel32@hi+12
8989
; FIXEDABI-GISEL-NEXT: s_swappc_b64 s[30:31], s[4:5]
9090
; FIXEDABI-GISEL-NEXT: s_endpgm
9191
call void @requires_all_inputs()

llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll

Lines changed: 12 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1286,9 +1286,9 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace(
12861286
; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1
12871287
; GFX1032_DPP-NEXT: v_writelane_b32 v3, s5, 16
12881288
; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s4
1289+
; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
12891290
; GFX1032_DPP-NEXT: s_mov_b32 s4, s6
12901291
; GFX1032_DPP-NEXT: s_mov_b32 s6, -1
1291-
; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
12921292
; GFX1032_DPP-NEXT: ; implicit-def: $vgpr0
12931293
; GFX1032_DPP-NEXT: s_and_saveexec_b32 s8, vcc_lo
12941294
; GFX1032_DPP-NEXT: s_cbranch_execz .LBB2_2
@@ -1412,9 +1412,10 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace(
14121412
; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1
14131413
; GFX1132_DPP-NEXT: v_writelane_b32 v3, s5, 16
14141414
; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4
1415+
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
1416+
; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
14151417
; GFX1132_DPP-NEXT: s_mov_b32 s4, s6
14161418
; GFX1132_DPP-NEXT: s_mov_b32 s6, -1
1417-
; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
14181419
; GFX1132_DPP-NEXT: ; implicit-def: $vgpr0
14191420
; GFX1132_DPP-NEXT: s_and_saveexec_b32 s8, vcc_lo
14201421
; GFX1132_DPP-NEXT: s_cbranch_execz .LBB2_2
@@ -1540,9 +1541,10 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace(
15401541
; GFX1232_DPP-NEXT: s_or_saveexec_b32 s4, -1
15411542
; GFX1232_DPP-NEXT: v_writelane_b32 v3, s5, 16
15421543
; GFX1232_DPP-NEXT: s_mov_b32 exec_lo, s4
1544+
; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
1545+
; GFX1232_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
15431546
; GFX1232_DPP-NEXT: s_mov_b32 s4, s6
15441547
; GFX1232_DPP-NEXT: s_mov_b32 s6, -1
1545-
; GFX1232_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
15461548
; GFX1232_DPP-NEXT: ; implicit-def: $vgpr0
15471549
; GFX1232_DPP-NEXT: s_and_saveexec_b32 s8, vcc_lo
15481550
; GFX1232_DPP-NEXT: s_cbranch_execz .LBB2_2
@@ -3129,8 +3131,8 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
31293131
; GFX1032_DPP-NEXT: v_writelane_b32 v8, s8, 16
31303132
; GFX1032_DPP-NEXT: v_writelane_b32 v7, s7, 16
31313133
; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s6
3132-
; GFX1032_DPP-NEXT: s_mov_b32 s6, -1
31333134
; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
3135+
; GFX1032_DPP-NEXT: s_mov_b32 s6, -1
31343136
; GFX1032_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10
31353137
; GFX1032_DPP-NEXT: s_and_saveexec_b32 s8, vcc_lo
31363138
; GFX1032_DPP-NEXT: s_cbranch_execz .LBB5_2
@@ -4839,9 +4841,9 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace(
48394841
; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1
48404842
; GFX1032_DPP-NEXT: v_writelane_b32 v3, s5, 16
48414843
; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s4
4844+
; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
48424845
; GFX1032_DPP-NEXT: s_mov_b32 s4, s6
48434846
; GFX1032_DPP-NEXT: s_mov_b32 s6, -1
4844-
; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
48454847
; GFX1032_DPP-NEXT: ; implicit-def: $vgpr0
48464848
; GFX1032_DPP-NEXT: s_and_saveexec_b32 s8, vcc_lo
48474849
; GFX1032_DPP-NEXT: s_cbranch_execz .LBB8_2
@@ -4965,9 +4967,10 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace(
49654967
; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1
49664968
; GFX1132_DPP-NEXT: v_writelane_b32 v3, s5, 16
49674969
; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4
4970+
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
4971+
; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
49684972
; GFX1132_DPP-NEXT: s_mov_b32 s4, s6
49694973
; GFX1132_DPP-NEXT: s_mov_b32 s6, -1
4970-
; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
49714974
; GFX1132_DPP-NEXT: ; implicit-def: $vgpr0
49724975
; GFX1132_DPP-NEXT: s_and_saveexec_b32 s8, vcc_lo
49734976
; GFX1132_DPP-NEXT: s_cbranch_execz .LBB8_2
@@ -5093,9 +5096,10 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace(
50935096
; GFX1232_DPP-NEXT: s_or_saveexec_b32 s4, -1
50945097
; GFX1232_DPP-NEXT: v_writelane_b32 v3, s5, 16
50955098
; GFX1232_DPP-NEXT: s_mov_b32 exec_lo, s4
5099+
; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
5100+
; GFX1232_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
50965101
; GFX1232_DPP-NEXT: s_mov_b32 s4, s6
50975102
; GFX1232_DPP-NEXT: s_mov_b32 s6, -1
5098-
; GFX1232_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
50995103
; GFX1232_DPP-NEXT: ; implicit-def: $vgpr0
51005104
; GFX1232_DPP-NEXT: s_and_saveexec_b32 s8, vcc_lo
51015105
; GFX1232_DPP-NEXT: s_cbranch_execz .LBB8_2
@@ -6715,8 +6719,8 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
67156719
; GFX1032_DPP-NEXT: v_writelane_b32 v8, s8, 16
67166720
; GFX1032_DPP-NEXT: v_writelane_b32 v7, s7, 16
67176721
; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s6
6718-
; GFX1032_DPP-NEXT: s_mov_b32 s6, -1
67196722
; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
6723+
; GFX1032_DPP-NEXT: s_mov_b32 s6, -1
67206724
; GFX1032_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10
67216725
; GFX1032_DPP-NEXT: s_and_saveexec_b32 s8, vcc_lo
67226726
; GFX1032_DPP-NEXT: s_cbranch_execz .LBB11_2

0 commit comments

Comments
 (0)