Skip to content

Commit 5b9cca3

Browse files
committed
[AMDGPU] Disable VALU sinking and hoisting with WWM
Machine LICM can hoist a VALU instruction from a WWM region. In this case the WQM pass will have to create yet another WWM region around the hoisted instruction, which is not desired. Unfortunately, we cannot tell whether an instruction is in a WWM region, so this patch disables hoisting if WWM is used in the function. This works around the bug SWDEV-502411.
1 parent dcc141b commit 5b9cca3

14 files changed

+7739
-7379
lines changed

llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2772,6 +2772,9 @@ void AMDGPUDAGToDAGISel::SelectINTRINSIC_WO_CHAIN(SDNode *N) {
27722772
case Intrinsic::amdgcn_wwm:
27732773
case Intrinsic::amdgcn_strict_wwm:
27742774
Opcode = AMDGPU::STRICT_WWM;
2775+
CurDAG->getMachineFunction()
2776+
.getInfo<SIMachineFunctionInfo>()
2777+
->setInitWholeWave();
27752778
break;
27762779
case Intrinsic::amdgcn_strict_wqm:
27772780
Opcode = AMDGPU::STRICT_WQM;

llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1055,8 +1055,12 @@ bool AMDGPUInstructionSelector::selectG_INTRINSIC(MachineInstr &I) const {
10551055
case Intrinsic::amdgcn_softwqm:
10561056
return constrainCopyLikeIntrin(I, AMDGPU::SOFT_WQM);
10571057
case Intrinsic::amdgcn_strict_wwm:
1058-
case Intrinsic::amdgcn_wwm:
1058+
case Intrinsic::amdgcn_wwm: {
1059+
MachineFunction *MF = I.getParent()->getParent();
1060+
SIMachineFunctionInfo *MFInfo = MF->getInfo<SIMachineFunctionInfo>();
1061+
MFInfo->setInitWholeWave();
10591062
return constrainCopyLikeIntrin(I, AMDGPU::STRICT_WWM);
1063+
}
10601064
case Intrinsic::amdgcn_strict_wqm:
10611065
return constrainCopyLikeIntrin(I, AMDGPU::STRICT_WQM);
10621066
case Intrinsic::amdgcn_writelane:

llvm/lib/Target/AMDGPU/SIInstrInfo.cpp

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -184,7 +184,11 @@ static bool resultDependsOnExec(const MachineInstr &MI) {
184184
bool SIInstrInfo::isIgnorableUse(const MachineOperand &MO) const {
185185
// Any implicit use of exec by VALU is not a real register read.
186186
return MO.getReg() == AMDGPU::EXEC && MO.isImplicit() &&
187-
isVALU(*MO.getParent()) && !resultDependsOnExec(*MO.getParent());
187+
isVALU(*MO.getParent()) && !resultDependsOnExec(*MO.getParent()) &&
188+
!MO.getParent()
189+
->getMF()
190+
->getInfo<SIMachineFunctionInfo>()
191+
->hasInitWholeWave();
188192
}
189193

190194
bool SIInstrInfo::isSafeToSink(MachineInstr &MI,

llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll

Lines changed: 700 additions & 680 deletions
Large diffs are not rendered by default.

llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll

Lines changed: 3067 additions & 2967 deletions
Large diffs are not rendered by default.

llvm/test/CodeGen/AMDGPU/cse-convergent.ll

Lines changed: 11 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -8,31 +8,33 @@ define i32 @test(i32 %val, i32 %cond) {
88
; GCN-NEXT: s_xor_saveexec_b32 s4, -1
99
; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill
1010
; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
11+
; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
1112
; GCN-NEXT: s_waitcnt_depctr 0xffe3
1213
; GCN-NEXT: s_mov_b32 exec_lo, s4
1314
; GCN-NEXT: s_or_saveexec_b32 s4, -1
1415
; GCN-NEXT: v_mov_b32_e32 v2, 0
1516
; GCN-NEXT: v_cndmask_b32_e64 v3, 0, v0, s4
16-
; GCN-NEXT: v_mov_b32_dpp v2, v3 row_xmask:1 row_mask:0xf bank_mask:0xf
17-
; GCN-NEXT: s_mov_b32 exec_lo, s4
18-
; GCN-NEXT: v_mov_b32_e32 v5, 0
1917
; GCN-NEXT: v_mov_b32_e32 v4, v2
18+
; GCN-NEXT: v_mov_b32_dpp v4, v3 row_xmask:1 row_mask:0xf bank_mask:0xf
19+
; GCN-NEXT: s_mov_b32 exec_lo, s4
20+
; GCN-NEXT: v_mov_b32_e32 v5, v4
2021
; GCN-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1
2122
; GCN-NEXT: s_and_saveexec_b32 s4, vcc_lo
2223
; GCN-NEXT: ; %bb.1: ; %if
2324
; GCN-NEXT: s_or_saveexec_b32 s5, -1
24-
; GCN-NEXT: v_mov_b32_e32 v2, 0
25-
; GCN-NEXT: v_cndmask_b32_e64 v3, 0, v0, s5
26-
; GCN-NEXT: v_mov_b32_dpp v2, v3 row_xmask:1 row_mask:0xf bank_mask:0xf
25+
; GCN-NEXT: v_mov_b32_e32 v3, 0
26+
; GCN-NEXT: v_cndmask_b32_e64 v2, 0, v0, s5
27+
; GCN-NEXT: v_mov_b32_dpp v3, v2 row_xmask:1 row_mask:0xf bank_mask:0xf
2728
; GCN-NEXT: s_mov_b32 exec_lo, s5
28-
; GCN-NEXT: v_mov_b32_e32 v5, v2
29+
; GCN-NEXT: v_mov_b32_e32 v2, v3
2930
; GCN-NEXT: ; %bb.2: ; %end
3031
; GCN-NEXT: s_or_b32 exec_lo, exec_lo, s4
31-
; GCN-NEXT: v_add_nc_u32_e32 v0, v4, v5
32+
; GCN-NEXT: v_add_nc_u32_e32 v0, v5, v2
3233
; GCN-NEXT: s_xor_saveexec_b32 s4, -1
33-
; GCN-NEXT: s_clause 0x1
34+
; GCN-NEXT: s_clause 0x2
3435
; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32
3536
; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:4
37+
; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8
3638
; GCN-NEXT: s_waitcnt_depctr 0xffe3
3739
; GCN-NEXT: s_mov_b32 exec_lo, s4
3840
; GCN-NEXT: s_waitcnt vmcnt(0)

llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll

Lines changed: 1138 additions & 1077 deletions
Large diffs are not rendered by default.

llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll

Lines changed: 792 additions & 745 deletions
Large diffs are not rendered by default.

llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll

Lines changed: 792 additions & 745 deletions
Large diffs are not rendered by default.

llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll

Lines changed: 1179 additions & 1110 deletions
Large diffs are not rendered by default.

llvm/test/CodeGen/AMDGPU/set-inactive-wwm-overwrite.ll

Lines changed: 11 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -4,31 +4,32 @@
44
define amdgpu_cs void @if_then(ptr addrspace(8) inreg %input, ptr addrspace(8) inreg %output, <3 x i32> %LocalInvocationId) {
55
; GCN-LABEL: if_then:
66
; GCN: ; %bb.0: ; %.entry
7-
; GCN-NEXT: v_mov_b32_e32 v3, 0
7+
; GCN-NEXT: v_mov_b32_e32 v4, 0
88
; GCN-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
99
; GCN-NEXT: s_and_saveexec_b32 s0, vcc_lo
1010
; GCN-NEXT: ; %bb.1: ; %.bb0
11-
; GCN-NEXT: v_mov_b32_e32 v3, 1
11+
; GCN-NEXT: v_mov_b32_e32 v4, 1
1212
; GCN-NEXT: ; %bb.2: ; %.merge
1313
; GCN-NEXT: s_or_b32 exec_lo, exec_lo, s0
1414
; GCN-NEXT: v_cmp_lt_u32_e32 vcc_lo, 3, v0
1515
; GCN-NEXT: s_and_saveexec_b32 s0, vcc_lo
1616
; GCN-NEXT: s_cbranch_execz .LBB0_4
1717
; GCN-NEXT: ; %bb.3: ; %.then
1818
; GCN-NEXT: s_or_saveexec_b32 s1, -1
19-
; GCN-NEXT: v_cndmask_b32_e64 v1, 0, v3, s1
20-
; GCN-NEXT: v_mov_b32_e32 v2, 0
21-
; GCN-NEXT: v_mov_b32_dpp v2, v1 row_shr:1 row_mask:0xf bank_mask:0xf
19+
; GCN-NEXT: v_mov_b32_e32 v1, 0
20+
; GCN-NEXT: v_cndmask_b32_e64 v2, 0, v4, s1
21+
; GCN-NEXT: v_mov_b32_e32 v3, v1
22+
; GCN-NEXT: v_mov_b32_dpp v3, v2 row_shr:1 row_mask:0xf bank_mask:0xf
2223
; GCN-NEXT: s_mov_b32 exec_lo, s1
23-
; GCN-NEXT: v_mov_b32_e32 v0, v2
24-
; GCN-NEXT: v_mov_b32_e32 v4, -1
25-
; GCN-NEXT: v_mov_b32_e32 v3, 0
26-
; GCN-NEXT: buffer_store_dword v4, v0, s[4:7], 0 offen
24+
; GCN-NEXT: v_mov_b32_e32 v0, v3
25+
; GCN-NEXT: v_mov_b32_e32 v5, -1
26+
; GCN-NEXT: v_mov_b32_e32 v4, v1
27+
; GCN-NEXT: buffer_store_dword v5, v0, s[4:7], 0 offen
2728
; GCN-NEXT: .LBB0_4: ; %.end
2829
; GCN-NEXT: s_waitcnt_depctr 0xffe3
2930
; GCN-NEXT: s_or_b32 exec_lo, exec_lo, s0
3031
; GCN-NEXT: v_mov_b32_e32 v0, -1
31-
; GCN-NEXT: buffer_store_dword v0, v3, s[4:7], 0 offen
32+
; GCN-NEXT: buffer_store_dword v0, v4, s[4:7], 0 offen
3233
; GCN-NEXT: s_endpgm
3334
.entry:
3435
%LocalInvocationId.i0 = extractelement <3 x i32> %LocalInvocationId, i32 0

llvm/test/CodeGen/AMDGPU/should-not-hoist-set-inactive.ll

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -24,11 +24,11 @@ define amdgpu_cs void @should_not_hoist_set_inactive(<4 x i32> inreg %i14, i32 i
2424
; GCN-NEXT: ; %bb.3: ; %bb1
2525
; GCN-NEXT: ; in Loop: Header=BB0_2 Depth=1
2626
; GCN-NEXT: s_or_saveexec_b32 s9, -1
27-
; GCN-NEXT: v_cndmask_b32_e64 v3, 0, s4, s9
28-
; GCN-NEXT: v_mov_b32_e32 v4, 0
29-
; GCN-NEXT: v_mov_b32_dpp v4, v3 row_xmask:1 row_mask:0xf bank_mask:0xf
27+
; GCN-NEXT: v_mov_b32_e32 v3, 0
28+
; GCN-NEXT: v_cndmask_b32_e64 v4, 0, s4, s9
29+
; GCN-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
3030
; GCN-NEXT: s_mov_b32 exec_lo, s9
31-
; GCN-NEXT: v_mov_b32_e32 v0, v4
31+
; GCN-NEXT: v_mov_b32_e32 v0, v3
3232
; GCN-NEXT: s_and_b32 exec_lo, exec_lo, s5
3333
; GCN-NEXT: s_cbranch_execz .LBB0_1
3434
; GCN-NEXT: ; %bb.4: ; %bb2

llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll

Lines changed: 12 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -250,39 +250,41 @@ define amdgpu_gfx void @strict_wwm_cfg(ptr addrspace(8) inreg %tmp14, i32 %arg)
250250
; GFX9-O3-NEXT: s_xor_saveexec_b64 s[34:35], -1
251251
; GFX9-O3-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
252252
; GFX9-O3-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
253+
; GFX9-O3-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
253254
; GFX9-O3-NEXT: s_mov_b64 exec, s[34:35]
254-
; GFX9-O3-NEXT: buffer_load_dwordx2 v[3:4], off, s[4:7], 0
255-
; GFX9-O3-NEXT: v_mov_b32_e32 v5, 0
255+
; GFX9-O3-NEXT: buffer_load_dwordx2 v[4:5], off, s[4:7], 0
256256
; GFX9-O3-NEXT: s_or_saveexec_b64 s[34:35], -1
257257
; GFX9-O3-NEXT: v_mov_b32_e32 v1, 0
258+
; GFX9-O3-NEXT: v_mov_b32_e32 v2, v1
258259
; GFX9-O3-NEXT: s_waitcnt vmcnt(0)
259-
; GFX9-O3-NEXT: v_cndmask_b32_e64 v2, 0, v3, s[34:35]
260+
; GFX9-O3-NEXT: v_cndmask_b32_e64 v3, 0, v4, s[34:35]
260261
; GFX9-O3-NEXT: s_nop 1
261-
; GFX9-O3-NEXT: v_mov_b32_dpp v1, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
262-
; GFX9-O3-NEXT: v_add_u32_e32 v1, v2, v1
262+
; GFX9-O3-NEXT: v_mov_b32_dpp v2, v3 row_bcast:31 row_mask:0xc bank_mask:0xf
263+
; GFX9-O3-NEXT: v_add_u32_e32 v2, v3, v2
263264
; GFX9-O3-NEXT: s_mov_b64 exec, s[34:35]
264-
; GFX9-O3-NEXT: v_mov_b32_e32 v3, v1
265+
; GFX9-O3-NEXT: v_mov_b32_e32 v4, v2
265266
; GFX9-O3-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
266267
; GFX9-O3-NEXT: s_and_saveexec_b64 s[34:35], vcc
267268
; GFX9-O3-NEXT: ; %bb.1: ; %if
268269
; GFX9-O3-NEXT: s_or_saveexec_b64 s[36:37], -1
269270
; GFX9-O3-NEXT: v_mov_b32_e32 v1, 0
270-
; GFX9-O3-NEXT: v_cndmask_b32_e64 v2, 0, v4, s[36:37]
271+
; GFX9-O3-NEXT: v_cndmask_b32_e64 v2, 0, v5, s[36:37]
271272
; GFX9-O3-NEXT: s_nop 1
272273
; GFX9-O3-NEXT: v_mov_b32_dpp v1, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
273-
; GFX9-O3-NEXT: v_add_u32_e32 v1, v2, v1
274+
; GFX9-O3-NEXT: v_add_u32_e32 v2, v2, v1
274275
; GFX9-O3-NEXT: s_mov_b64 exec, s[36:37]
275-
; GFX9-O3-NEXT: v_mov_b32_e32 v5, v1
276+
; GFX9-O3-NEXT: v_mov_b32_e32 v1, v2
276277
; GFX9-O3-NEXT: ; %bb.2: ; %merge
277278
; GFX9-O3-NEXT: s_or_b64 exec, exec, s[34:35]
278-
; GFX9-O3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
279+
; GFX9-O3-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1
279280
; GFX9-O3-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
280281
; GFX9-O3-NEXT: v_lshlrev_b32_e32 v0, 1, v0
281282
; GFX9-O3-NEXT: v_and_b32_e32 v0, 2, v0
282283
; GFX9-O3-NEXT: buffer_store_dword v0, off, s[4:7], 0 offset:4
283284
; GFX9-O3-NEXT: s_xor_saveexec_b64 s[34:35], -1
284285
; GFX9-O3-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
285286
; GFX9-O3-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
287+
; GFX9-O3-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
286288
; GFX9-O3-NEXT: s_mov_b64 exec, s[34:35]
287289
; GFX9-O3-NEXT: s_waitcnt vmcnt(0)
288290
; GFX9-O3-NEXT: s_setpc_b64 s[30:31]

llvm/test/CodeGen/AMDGPU/wwm-reserved.ll

Lines changed: 20 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -217,31 +217,31 @@ define amdgpu_cs void @cfg(ptr addrspace(8) inreg %tmp14, i32 %arg) {
217217
;
218218
; GFX9-O3-LABEL: cfg:
219219
; GFX9-O3: ; %bb.0: ; %entry
220-
; GFX9-O3-NEXT: buffer_load_dwordx2 v[3:4], off, s[0:3], 0
221-
; GFX9-O3-NEXT: v_mov_b32_e32 v5, 0
220+
; GFX9-O3-NEXT: buffer_load_dwordx2 v[4:5], off, s[0:3], 0
222221
; GFX9-O3-NEXT: s_or_saveexec_b64 s[4:5], -1
223222
; GFX9-O3-NEXT: v_mov_b32_e32 v1, 0
223+
; GFX9-O3-NEXT: v_mov_b32_e32 v2, v1
224224
; GFX9-O3-NEXT: s_waitcnt vmcnt(0)
225-
; GFX9-O3-NEXT: v_cndmask_b32_e64 v2, 0, v3, s[4:5]
225+
; GFX9-O3-NEXT: v_cndmask_b32_e64 v3, 0, v4, s[4:5]
226226
; GFX9-O3-NEXT: s_nop 1
227-
; GFX9-O3-NEXT: v_mov_b32_dpp v1, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
228-
; GFX9-O3-NEXT: v_add_u32_e32 v1, v2, v1
227+
; GFX9-O3-NEXT: v_mov_b32_dpp v2, v3 row_bcast:31 row_mask:0xc bank_mask:0xf
228+
; GFX9-O3-NEXT: v_add_u32_e32 v2, v3, v2
229229
; GFX9-O3-NEXT: s_mov_b64 exec, s[4:5]
230-
; GFX9-O3-NEXT: v_mov_b32_e32 v3, v1
230+
; GFX9-O3-NEXT: v_mov_b32_e32 v4, v2
231231
; GFX9-O3-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
232232
; GFX9-O3-NEXT: s_and_saveexec_b64 s[4:5], vcc
233233
; GFX9-O3-NEXT: ; %bb.1: ; %if
234234
; GFX9-O3-NEXT: s_or_saveexec_b64 s[6:7], -1
235235
; GFX9-O3-NEXT: v_mov_b32_e32 v1, 0
236-
; GFX9-O3-NEXT: v_cndmask_b32_e64 v2, 0, v4, s[6:7]
236+
; GFX9-O3-NEXT: v_cndmask_b32_e64 v2, 0, v5, s[6:7]
237237
; GFX9-O3-NEXT: s_nop 1
238238
; GFX9-O3-NEXT: v_mov_b32_dpp v1, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
239-
; GFX9-O3-NEXT: v_add_u32_e32 v1, v2, v1
239+
; GFX9-O3-NEXT: v_add_u32_e32 v2, v2, v1
240240
; GFX9-O3-NEXT: s_mov_b64 exec, s[6:7]
241-
; GFX9-O3-NEXT: v_mov_b32_e32 v5, v1
241+
; GFX9-O3-NEXT: v_mov_b32_e32 v1, v2
242242
; GFX9-O3-NEXT: ; %bb.2: ; %merge
243243
; GFX9-O3-NEXT: s_or_b64 exec, exec, s[4:5]
244-
; GFX9-O3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
244+
; GFX9-O3-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1
245245
; GFX9-O3-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
246246
; GFX9-O3-NEXT: v_lshlrev_b32_e32 v0, 1, v0
247247
; GFX9-O3-NEXT: v_and_b32_e32 v0, 2, v0
@@ -1069,31 +1069,31 @@ define amdgpu_cs void @strict_wwm_cfg(ptr addrspace(8) inreg %tmp14, i32 %arg) {
10691069
;
10701070
; GFX9-O3-LABEL: strict_wwm_cfg:
10711071
; GFX9-O3: ; %bb.0: ; %entry
1072-
; GFX9-O3-NEXT: buffer_load_dwordx2 v[3:4], off, s[0:3], 0
1073-
; GFX9-O3-NEXT: v_mov_b32_e32 v5, 0
1072+
; GFX9-O3-NEXT: buffer_load_dwordx2 v[4:5], off, s[0:3], 0
10741073
; GFX9-O3-NEXT: s_or_saveexec_b64 s[4:5], -1
10751074
; GFX9-O3-NEXT: v_mov_b32_e32 v1, 0
1075+
; GFX9-O3-NEXT: v_mov_b32_e32 v2, v1
10761076
; GFX9-O3-NEXT: s_waitcnt vmcnt(0)
1077-
; GFX9-O3-NEXT: v_cndmask_b32_e64 v2, 0, v3, s[4:5]
1077+
; GFX9-O3-NEXT: v_cndmask_b32_e64 v3, 0, v4, s[4:5]
10781078
; GFX9-O3-NEXT: s_nop 1
1079-
; GFX9-O3-NEXT: v_mov_b32_dpp v1, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
1080-
; GFX9-O3-NEXT: v_add_u32_e32 v1, v2, v1
1079+
; GFX9-O3-NEXT: v_mov_b32_dpp v2, v3 row_bcast:31 row_mask:0xc bank_mask:0xf
1080+
; GFX9-O3-NEXT: v_add_u32_e32 v2, v3, v2
10811081
; GFX9-O3-NEXT: s_mov_b64 exec, s[4:5]
1082-
; GFX9-O3-NEXT: v_mov_b32_e32 v3, v1
1082+
; GFX9-O3-NEXT: v_mov_b32_e32 v4, v2
10831083
; GFX9-O3-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
10841084
; GFX9-O3-NEXT: s_and_saveexec_b64 s[4:5], vcc
10851085
; GFX9-O3-NEXT: ; %bb.1: ; %if
10861086
; GFX9-O3-NEXT: s_or_saveexec_b64 s[6:7], -1
10871087
; GFX9-O3-NEXT: v_mov_b32_e32 v1, 0
1088-
; GFX9-O3-NEXT: v_cndmask_b32_e64 v2, 0, v4, s[6:7]
1088+
; GFX9-O3-NEXT: v_cndmask_b32_e64 v2, 0, v5, s[6:7]
10891089
; GFX9-O3-NEXT: s_nop 1
10901090
; GFX9-O3-NEXT: v_mov_b32_dpp v1, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
1091-
; GFX9-O3-NEXT: v_add_u32_e32 v1, v2, v1
1091+
; GFX9-O3-NEXT: v_add_u32_e32 v2, v2, v1
10921092
; GFX9-O3-NEXT: s_mov_b64 exec, s[6:7]
1093-
; GFX9-O3-NEXT: v_mov_b32_e32 v5, v1
1093+
; GFX9-O3-NEXT: v_mov_b32_e32 v1, v2
10941094
; GFX9-O3-NEXT: ; %bb.2: ; %merge
10951095
; GFX9-O3-NEXT: s_or_b64 exec, exec, s[4:5]
1096-
; GFX9-O3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
1096+
; GFX9-O3-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1
10971097
; GFX9-O3-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
10981098
; GFX9-O3-NEXT: v_lshlrev_b32_e32 v0, 1, v0
10991099
; GFX9-O3-NEXT: v_and_b32_e32 v0, 2, v0

0 commit comments

Comments
 (0)