Skip to content

Commit 1237d7f

Browse files
AMDGPU/GlobalISelDivergenceLowering: constrain incoming registers
Implement constrainIncomingRegisterTakenAsIs by constraining incoming registers that are taken as-is so that they carry lane mask attributes. Most often they only have an S1 LLT. This is the final step in making the PHI instructions created in this pass fully instruction-selected.
1 parent 6613489 commit 1237d7f

7 files changed

+259
-233
lines changed

llvm/lib/Target/AMDGPU/AMDGPUGlobalISelDivergenceLowering.cpp

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717

1818
#include "AMDGPU.h"
1919
#include "SILowerI1Copies.h"
20+
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
2021
#include "llvm/CodeGen/MachineFunctionPass.h"
2122
#include "llvm/CodeGen/MachineUniformityAnalysis.h"
2223
#include "llvm/InitializePasses.h"
@@ -60,6 +61,7 @@ class DivergenceLoweringHelper : public PhiLoweringHelper {
6061

6162
private:
6263
MachineUniformityInfo *MUI = nullptr;
64+
MachineIRBuilder B;
6365

6466
public:
6567
void markAsLaneMask(Register DstReg) const override;
@@ -80,7 +82,7 @@ class DivergenceLoweringHelper : public PhiLoweringHelper {
8082
DivergenceLoweringHelper::DivergenceLoweringHelper(
8183
MachineFunction *MF, MachineDominatorTree *DT,
8284
MachinePostDominatorTree *PDT, MachineUniformityInfo *MUI)
83-
: PhiLoweringHelper(MF, DT, PDT), MUI(MUI) {}
85+
: PhiLoweringHelper(MF, DT, PDT), MUI(MUI), B(*MF) {}
8486

8587
// _(s1) -> SReg_32/64(s1)
8688
void DivergenceLoweringHelper::markAsLaneMask(Register DstReg) const {
@@ -182,7 +184,16 @@ void DivergenceLoweringHelper::buildMergeLaneMasks(
182184
.addReg(CurMaskedReg);
183185
}
184186

185-
void DivergenceLoweringHelper::constrainAsLaneMask(Incoming &In) { return; }
187+
// GlobalISel has to constrain an S1 incoming value taken as-is to the lane mask
188+
// register class. Insert a copy of Incoming.Reg to a new lane mask inside
189+
// Incoming.Block; Incoming.Reg then becomes that new lane mask.
190+
void DivergenceLoweringHelper::constrainAsLaneMask(Incoming &In) {
191+
B.setInsertPt(*In.Block, In.Block->getFirstTerminator());
192+
193+
auto Copy = B.buildCopy(LLT::scalar(1), In.Reg);
194+
MRI->setRegClass(Copy.getReg(0), ST->getBoolRC());
195+
In.Reg = Copy.getReg(0);
196+
}
186197

187198
} // End anonymous namespace.
188199

llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.ll

Lines changed: 10 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
2-
; RUN: llc -global-isel -amdgpu-global-isel-risky-select -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10 %s
2+
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10 %s
33

44
; Divergent phis that don't require lowering using lane mask merging
55

@@ -147,32 +147,28 @@ define void @divergent_i1_phi_used_inside_loop_bigger_loop_body(float %val, floa
147147
; GFX10-LABEL: divergent_i1_phi_used_inside_loop_bigger_loop_body:
148148
; GFX10: ; %bb.0: ; %entry
149149
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
150-
; GFX10-NEXT: v_cmp_lt_f32_e32 vcc_lo, 1.0, v1
151-
; GFX10-NEXT: s_mov_b32 s5, 0
150+
; GFX10-NEXT: s_mov_b32 s4, 0
151+
; GFX10-NEXT: v_cmp_lt_f32_e64 s5, 1.0, v1
152152
; GFX10-NEXT: v_mov_b32_e32 v1, 0x3e8
153-
; GFX10-NEXT: v_mov_b32_e32 v8, s5
153+
; GFX10-NEXT: v_mov_b32_e32 v8, s4
154154
; GFX10-NEXT: ; implicit-def: $sgpr6
155-
; GFX10-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc_lo
156155
; GFX10-NEXT: s_branch .LBB3_2
157156
; GFX10-NEXT: .LBB3_1: ; %loop_body
158157
; GFX10-NEXT: ; in Loop: Header=BB3_2 Depth=1
159158
; GFX10-NEXT: v_cvt_f32_u32_e32 v9, v8
160-
; GFX10-NEXT: s_xor_b32 s4, s4, -1
159+
; GFX10-NEXT: s_xor_b32 s5, s5, -1
161160
; GFX10-NEXT: v_add_nc_u32_e32 v8, 1, v8
162161
; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v9, v0
163-
; GFX10-NEXT: v_cndmask_b32_e64 v9, 0, 1, s4
164-
; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5
162+
; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
165163
; GFX10-NEXT: s_andn2_b32 s6, s6, exec_lo
166-
; GFX10-NEXT: s_and_b32 s4, exec_lo, s4
167-
; GFX10-NEXT: s_or_b32 s6, s6, s4
168-
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5
164+
; GFX10-NEXT: s_and_b32 s7, exec_lo, s5
165+
; GFX10-NEXT: s_or_b32 s6, s6, s7
166+
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
169167
; GFX10-NEXT: s_cbranch_execz .LBB3_6
170168
; GFX10-NEXT: .LBB3_2: ; %loop_start
171169
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
172-
; GFX10-NEXT: v_and_b32_e32 v9, 1, v9
173170
; GFX10-NEXT: v_cmp_ge_i32_e32 vcc_lo, 0x3e8, v8
174171
; GFX10-NEXT: s_mov_b32 s7, 1
175-
; GFX10-NEXT: v_cmp_ne_u32_e64 s4, 0, v9
176172
; GFX10-NEXT: s_cbranch_vccz .LBB3_4
177173
; GFX10-NEXT: ; %bb.3: ; %else
178174
; GFX10-NEXT: ; in Loop: Header=BB3_2 Depth=1
@@ -189,7 +185,7 @@ define void @divergent_i1_phi_used_inside_loop_bigger_loop_body(float %val, floa
189185
; GFX10-NEXT: flat_store_dword v[4:5], v1
190186
; GFX10-NEXT: s_branch .LBB3_1
191187
; GFX10-NEXT: .LBB3_6: ; %exit
192-
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5
188+
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
193189
; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, s6
194190
; GFX10-NEXT: flat_store_dword v[2:3], v0
195191
; GFX10-NEXT: s_waitcnt lgkmcnt(0)

llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.mir

Lines changed: 38 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@ body: |
3333
; GFX10-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[COPY2]](s32), [[C]]
3434
; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
3535
; GFX10-NEXT: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[COPY3]](s32), [[C1]]
36+
; GFX10-NEXT: [[COPY6:%[0-9]+]]:sreg_32(s1) = COPY [[ICMP]](s1)
3637
; GFX10-NEXT: G_BRCOND [[ICMP1]](s1), %bb.2
3738
; GFX10-NEXT: G_BR %bb.1
3839
; GFX10-NEXT: {{ $}}
@@ -46,20 +47,22 @@ body: |
4647
; GFX10-NEXT: bb.2:
4748
; GFX10-NEXT: successors: %bb.4(0x80000000)
4849
; GFX10-NEXT: {{ $}}
49-
; GFX10-NEXT: [[PHI:%[0-9]+]]:sreg_32(s1) = G_PHI %14(s1), %bb.3, [[ICMP]](s1), %bb.0
50+
; GFX10-NEXT: [[PHI:%[0-9]+]]:sreg_32(s1) = PHI [[COPY6]](s1), %bb.0, %20(s1), %bb.3
51+
; GFX10-NEXT: [[COPY7:%[0-9]+]]:sreg_32(s1) = COPY [[PHI]](s1)
5052
; GFX10-NEXT: G_BR %bb.4
5153
; GFX10-NEXT: {{ $}}
5254
; GFX10-NEXT: bb.3:
5355
; GFX10-NEXT: successors: %bb.2(0x80000000)
5456
; GFX10-NEXT: {{ $}}
5557
; GFX10-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
5658
; GFX10-NEXT: [[ICMP2:%[0-9]+]]:_(s1) = G_ICMP intpred(ult), [[COPY2]](s32), [[C3]]
59+
; GFX10-NEXT: [[COPY8:%[0-9]+]]:sreg_32(s1) = COPY [[ICMP2]](s1)
5760
; GFX10-NEXT: G_BR %bb.2
5861
; GFX10-NEXT: {{ $}}
5962
; GFX10-NEXT: bb.4:
6063
; GFX10-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
6164
; GFX10-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
62-
; GFX10-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[PHI]](s1), [[C5]], [[C4]]
65+
; GFX10-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[COPY7]](s1), [[C5]], [[C4]]
6366
; GFX10-NEXT: G_STORE [[SELECT]](s32), [[MV]](p1) :: (store (s32), addrspace 1)
6467
; GFX10-NEXT: S_ENDPGM 0
6568
bb.0:
@@ -126,9 +129,10 @@ body: |
126129
; GFX10-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr0
127130
; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 6
128131
; GFX10-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[COPY2]](s32), [[C]]
129-
; GFX10-NEXT: [[COPY4:%[0-9]+]]:sreg_32(s1) = COPY [[ICMP]](s1)
130132
; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
131133
; GFX10-NEXT: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[COPY3]](s32), [[C1]]
134+
; GFX10-NEXT: [[COPY4:%[0-9]+]]:sreg_32(s1) = COPY [[ICMP]](s1)
135+
; GFX10-NEXT: [[COPY5:%[0-9]+]]:sreg_32(s1) = COPY [[COPY4]](s1)
132136
; GFX10-NEXT: G_BRCOND [[ICMP1]](s1), %bb.2
133137
; GFX10-NEXT: G_BR %bb.1
134138
; GFX10-NEXT: {{ $}}
@@ -137,17 +141,17 @@ body: |
137141
; GFX10-NEXT: {{ $}}
138142
; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
139143
; GFX10-NEXT: [[ICMP2:%[0-9]+]]:_(s1) = G_ICMP intpred(ult), [[COPY2]](s32), [[C2]]
140-
; GFX10-NEXT: [[COPY5:%[0-9]+]]:sreg_32(s1) = COPY [[ICMP2]](s1)
141-
; GFX10-NEXT: [[S_ANDN2_B32_:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY4]](s1), $exec_lo, implicit-def $scc
142-
; GFX10-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY5]](s1), implicit-def $scc
144+
; GFX10-NEXT: [[COPY6:%[0-9]+]]:sreg_32(s1) = COPY [[ICMP2]](s1)
145+
; GFX10-NEXT: [[S_ANDN2_B32_:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY5]](s1), $exec_lo, implicit-def $scc
146+
; GFX10-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY6]](s1), implicit-def $scc
143147
; GFX10-NEXT: [[S_OR_B32_:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_]](s1), [[S_AND_B32_]](s1), implicit-def $scc
144148
; GFX10-NEXT: {{ $}}
145149
; GFX10-NEXT: bb.2:
146-
; GFX10-NEXT: [[PHI:%[0-9]+]]:sreg_32(s1) = PHI [[ICMP]](s1), %bb.0, [[S_OR_B32_]](s1), %bb.1
147-
; GFX10-NEXT: [[COPY6:%[0-9]+]]:sreg_32(s1) = COPY [[PHI]](s1)
150+
; GFX10-NEXT: [[PHI:%[0-9]+]]:sreg_32(s1) = PHI [[COPY4]](s1), %bb.0, [[S_OR_B32_]](s1), %bb.1
151+
; GFX10-NEXT: [[COPY7:%[0-9]+]]:sreg_32(s1) = COPY [[PHI]](s1)
148152
; GFX10-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
149153
; GFX10-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
150-
; GFX10-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[COPY6]](s1), [[C4]], [[C3]]
154+
; GFX10-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[COPY7]](s1), [[C4]], [[C3]]
151155
; GFX10-NEXT: G_STORE [[SELECT]](s32), [[MV]](p1) :: (store (s32), addrspace 1)
152156
; GFX10-NEXT: S_ENDPGM 0
153157
bb.0:
@@ -292,19 +296,21 @@ body: |
292296
; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
293297
; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_FCONSTANT float 1.000000e+00
294298
; GFX10-NEXT: [[FCMP:%[0-9]+]]:_(s1) = G_FCMP floatpred(ogt), [[COPY1]](s32), [[C1]]
299+
; GFX10-NEXT: [[COPY8:%[0-9]+]]:sreg_32(s1) = COPY [[FCMP]](s1)
295300
; GFX10-NEXT: [[DEF:%[0-9]+]]:sreg_32(s1) = IMPLICIT_DEF
296301
; GFX10-NEXT: {{ $}}
297302
; GFX10-NEXT: bb.1:
298303
; GFX10-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000)
299304
; GFX10-NEXT: {{ $}}
300-
; GFX10-NEXT: [[PHI:%[0-9]+]]:sreg_32(s1) = PHI [[DEF]](s1), %bb.0, %39(s1), %bb.5
301-
; GFX10-NEXT: [[PHI1:%[0-9]+]]:_(s32) = G_PHI %15(s32), %bb.5, [[C]](s32), %bb.0
302-
; GFX10-NEXT: [[PHI2:%[0-9]+]]:_(s32) = G_PHI [[C]](s32), %bb.0, %17(s32), %bb.5
303-
; GFX10-NEXT: [[PHI3:%[0-9]+]]:sreg_32(s1) = G_PHI [[FCMP]](s1), %bb.0, %19(s1), %bb.5
304-
; GFX10-NEXT: [[COPY8:%[0-9]+]]:sreg_32(s1) = COPY [[PHI]](s1)
305+
; GFX10-NEXT: [[PHI:%[0-9]+]]:sreg_32(s1) = PHI [[DEF]](s1), %bb.0, %42(s1), %bb.5
306+
; GFX10-NEXT: [[PHI1:%[0-9]+]]:sreg_32(s1) = PHI [[COPY8]](s1), %bb.0, %39(s1), %bb.5
307+
; GFX10-NEXT: [[PHI2:%[0-9]+]]:_(s32) = G_PHI %15(s32), %bb.5, [[C]](s32), %bb.0
308+
; GFX10-NEXT: [[PHI3:%[0-9]+]]:_(s32) = G_PHI [[C]](s32), %bb.0, %17(s32), %bb.5
309+
; GFX10-NEXT: [[COPY9:%[0-9]+]]:sreg_32(s1) = COPY [[PHI]](s1)
310+
; GFX10-NEXT: [[COPY10:%[0-9]+]]:sreg_32(s1) = COPY [[PHI1]](s1)
305311
; GFX10-NEXT: [[C2:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
306312
; GFX10-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 1000
307-
; GFX10-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(sle), [[PHI2]](s32), [[C3]]
313+
; GFX10-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(sle), [[PHI3]](s32), [[C3]]
308314
; GFX10-NEXT: G_BRCOND [[ICMP]](s1), %bb.4
309315
; GFX10-NEXT: G_BR %bb.2
310316
; GFX10-NEXT: {{ $}}
@@ -336,26 +342,27 @@ body: |
336342
; GFX10-NEXT: successors: %bb.6(0x04000000), %bb.1(0x7c000000)
337343
; GFX10-NEXT: {{ $}}
338344
; GFX10-NEXT: [[C8:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
339-
; GFX10-NEXT: [[XOR1:%[0-9]+]]:_(s1) = G_XOR [[PHI3]], [[C8]]
340-
; GFX10-NEXT: [[COPY9:%[0-9]+]]:sreg_32(s1) = COPY [[XOR1]](s1)
341-
; GFX10-NEXT: [[UITOFP:%[0-9]+]]:_(s32) = G_UITOFP [[PHI2]](s32)
345+
; GFX10-NEXT: [[XOR1:%[0-9]+]]:_(s1) = G_XOR [[COPY10]], [[C8]]
346+
; GFX10-NEXT: [[COPY11:%[0-9]+]]:sreg_32(s1) = COPY [[XOR1]](s1)
347+
; GFX10-NEXT: [[UITOFP:%[0-9]+]]:_(s32) = G_UITOFP [[PHI3]](s32)
342348
; GFX10-NEXT: [[FCMP1:%[0-9]+]]:_(s1) = G_FCMP floatpred(ogt), [[UITOFP]](s32), [[COPY]]
343349
; GFX10-NEXT: [[C9:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
344-
; GFX10-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[PHI2]], [[C9]]
345-
; GFX10-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), [[FCMP1]](s1), [[PHI1]](s32)
346-
; GFX10-NEXT: [[S_ANDN2_B32_:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY8]](s1), $exec_lo, implicit-def $scc
347-
; GFX10-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY9]](s1), implicit-def $scc
350+
; GFX10-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[PHI3]], [[C9]]
351+
; GFX10-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), [[FCMP1]](s1), [[PHI2]](s32)
352+
; GFX10-NEXT: [[COPY12:%[0-9]+]]:sreg_32(s1) = COPY [[XOR1]](s1)
353+
; GFX10-NEXT: [[S_ANDN2_B32_:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY9]](s1), $exec_lo, implicit-def $scc
354+
; GFX10-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY11]](s1), implicit-def $scc
348355
; GFX10-NEXT: [[S_OR_B32_:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_]](s1), [[S_AND_B32_]](s1), implicit-def $scc
349356
; GFX10-NEXT: SI_LOOP [[INTRINSIC_CONVERGENT]](s32), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec
350357
; GFX10-NEXT: G_BR %bb.6
351358
; GFX10-NEXT: {{ $}}
352359
; GFX10-NEXT: bb.6:
353360
; GFX10-NEXT: [[PHI5:%[0-9]+]]:_(s32) = G_PHI [[INTRINSIC_CONVERGENT]](s32), %bb.5
354-
; GFX10-NEXT: [[COPY10:%[0-9]+]]:sreg_32(s1) = COPY [[S_OR_B32_]](s1)
361+
; GFX10-NEXT: [[COPY13:%[0-9]+]]:sreg_32(s1) = COPY [[S_OR_B32_]](s1)
355362
; GFX10-NEXT: G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[PHI5]](s32)
356363
; GFX10-NEXT: [[C10:%[0-9]+]]:_(s32) = G_FCONSTANT float 0.000000e+00
357364
; GFX10-NEXT: [[C11:%[0-9]+]]:_(s32) = G_FCONSTANT float 1.000000e+00
358-
; GFX10-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[COPY10]](s1), [[C11]], [[C10]]
365+
; GFX10-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[COPY13]](s1), [[C11]], [[C10]]
359366
; GFX10-NEXT: G_STORE [[SELECT]](s32), [[MV]](p0) :: (store (s32))
360367
; GFX10-NEXT: SI_RETURN
361368
bb.0:
@@ -475,6 +482,7 @@ body: |
475482
; GFX10-NEXT: [[TRUNC1:%[0-9]+]]:_(s1) = G_TRUNC [[AND1]](s32)
476483
; GFX10-NEXT: [[C5:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
477484
; GFX10-NEXT: [[XOR:%[0-9]+]]:_(s1) = G_XOR [[TRUNC1]], [[C5]]
485+
; GFX10-NEXT: [[COPY3:%[0-9]+]]:sreg_32(s1) = COPY [[C5]](s1)
478486
; GFX10-NEXT: G_BRCOND [[XOR]](s1), %bb.2
479487
; GFX10-NEXT: G_BR %bb.1
480488
; GFX10-NEXT: {{ $}}
@@ -487,9 +495,10 @@ body: |
487495
; GFX10-NEXT: bb.2:
488496
; GFX10-NEXT: successors: %bb.5(0x40000000), %bb.6(0x40000000)
489497
; GFX10-NEXT: {{ $}}
490-
; GFX10-NEXT: [[PHI:%[0-9]+]]:_(s32) = G_PHI %30(s32), %bb.4, [[DEF]](s32), %bb.0
491-
; GFX10-NEXT: [[PHI1:%[0-9]+]]:sreg_32(s1) = G_PHI %32(s1), %bb.4, [[C5]](s1), %bb.0
492-
; GFX10-NEXT: G_BRCOND [[PHI1]](s1), %bb.5
498+
; GFX10-NEXT: [[PHI:%[0-9]+]]:sreg_32(s1) = PHI [[COPY3]](s1), %bb.0, %58(s1), %bb.4
499+
; GFX10-NEXT: [[PHI1:%[0-9]+]]:_(s32) = G_PHI %30(s32), %bb.4, [[DEF]](s32), %bb.0
500+
; GFX10-NEXT: [[COPY4:%[0-9]+]]:sreg_32(s1) = COPY [[PHI]](s1)
501+
; GFX10-NEXT: G_BRCOND [[COPY4]](s1), %bb.5
493502
; GFX10-NEXT: G_BR %bb.6
494503
; GFX10-NEXT: {{ $}}
495504
; GFX10-NEXT: bb.3:
@@ -517,6 +526,7 @@ body: |
517526
; GFX10-NEXT: [[ICMP2:%[0-9]+]]:_(s1) = G_ICMP intpred(eq), [[PHI5]](s32), [[AMDGPU_BUFFER_LOAD]]
518527
; GFX10-NEXT: [[OR1:%[0-9]+]]:_(s1) = G_OR [[ICMP]], [[ICMP2]]
519528
; GFX10-NEXT: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s1)
529+
; GFX10-NEXT: [[COPY5:%[0-9]+]]:sreg_32(s1) = COPY [[C10]](s1)
520530
; GFX10-NEXT: G_BR %bb.2
521531
; GFX10-NEXT: {{ $}}
522532
; GFX10-NEXT: bb.5:
@@ -527,7 +537,7 @@ body: |
527537
; GFX10-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[C11]]
528538
; GFX10-NEXT: {{ $}}
529539
; GFX10-NEXT: bb.6:
530-
; GFX10-NEXT: [[PHI6:%[0-9]+]]:_(s32) = G_PHI [[PHI]](s32), %bb.2, [[OR2]](s32), %bb.5
540+
; GFX10-NEXT: [[PHI6:%[0-9]+]]:_(s32) = G_PHI [[PHI1]](s32), %bb.2, [[OR2]](s32), %bb.5
531541
; GFX10-NEXT: [[UV:%[0-9]+]]:_(<4 x s32>), [[UV1:%[0-9]+]]:_(<4 x s32>) = G_UNMERGE_VALUES [[LOAD]](<8 x s32>)
532542
; GFX10-NEXT: [[ADD3:%[0-9]+]]:_(s32) = G_ADD [[COPY2]], [[COPY1]]
533543
; GFX10-NEXT: [[C12:%[0-9]+]]:_(s32) = G_CONSTANT i32 2

llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-used-outside-loop.ll

Lines changed: 16 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
2-
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -amdgpu-global-isel-risky-select -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10 %s
2+
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10 %s
33

44
; This file contains various tests that have divergent i1s used outside of
55
; the loop. These are lane masks in sgpr and need to have correct value in
@@ -137,28 +137,24 @@ define void @divergent_i1_xor_used_outside_loop(float %val, float %pre.cond.val,
137137
; GFX10-LABEL: divergent_i1_xor_used_outside_loop:
138138
; GFX10: ; %bb.0: ; %entry
139139
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
140-
; GFX10-NEXT: v_cmp_lt_f32_e32 vcc_lo, 1.0, v1
141-
; GFX10-NEXT: s_mov_b32 s5, 0
140+
; GFX10-NEXT: s_mov_b32 s4, 0
141+
; GFX10-NEXT: v_cmp_lt_f32_e64 s5, 1.0, v1
142+
; GFX10-NEXT: v_mov_b32_e32 v1, s4
142143
; GFX10-NEXT: ; implicit-def: $sgpr6
143-
; GFX10-NEXT: v_mov_b32_e32 v1, s5
144-
; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc_lo
145144
; GFX10-NEXT: .LBB2_1: ; %loop
146145
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
147-
; GFX10-NEXT: v_and_b32_e32 v4, 1, v4
148-
; GFX10-NEXT: v_cvt_f32_u32_e32 v5, v1
146+
; GFX10-NEXT: v_cvt_f32_u32_e32 v4, v1
147+
; GFX10-NEXT: s_xor_b32 s5, s5, -1
149148
; GFX10-NEXT: v_add_nc_u32_e32 v1, 1, v1
150-
; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v4
151-
; GFX10-NEXT: v_cmp_gt_f32_e64 s4, v5, v0
152-
; GFX10-NEXT: s_xor_b32 s7, vcc_lo, -1
153-
; GFX10-NEXT: s_or_b32 s5, s4, s5
154-
; GFX10-NEXT: v_mov_b32_e32 v4, s7
155-
; GFX10-NEXT: s_andn2_b32 s4, s6, exec_lo
156-
; GFX10-NEXT: s_and_b32 s6, exec_lo, s7
157-
; GFX10-NEXT: s_or_b32 s6, s4, s6
158-
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5
149+
; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v4, v0
150+
; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
151+
; GFX10-NEXT: s_andn2_b32 s6, s6, exec_lo
152+
; GFX10-NEXT: s_and_b32 s7, exec_lo, s5
153+
; GFX10-NEXT: s_or_b32 s6, s6, s7
154+
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
159155
; GFX10-NEXT: s_cbranch_execnz .LBB2_1
160156
; GFX10-NEXT: ; %bb.2: ; %exit
161-
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5
157+
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
162158
; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, s6
163159
; GFX10-NEXT: flat_store_dword v[2:3], v0
164160
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
@@ -197,7 +193,7 @@ define void @divergent_i1_xor_used_outside_loop_larger_loop_body(i32 %num.elts,
197193
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
198194
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
199195
; GFX10-NEXT: s_mov_b32 s5, 0
200-
; GFX10-NEXT: s_mov_b32 s6, 1
196+
; GFX10-NEXT: s_mov_b32 s6, -1
201197
; GFX10-NEXT: s_and_saveexec_b32 s4, vcc_lo
202198
; GFX10-NEXT: s_cbranch_execz .LBB3_6
203199
; GFX10-NEXT: ; %bb.1: ; %loop.start.preheader
@@ -332,7 +328,7 @@ define void @divergent_i1_icmp_used_outside_loop(i32 %v0, i32 %v1, ptr addrspace
332328
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
333329
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s7
334330
; GFX10-NEXT: v_cmp_ne_u32_e64 s4, v1, v4
335-
; GFX10-NEXT: s_mov_b32 s7, 1
331+
; GFX10-NEXT: s_mov_b32 s7, -1
336332
; GFX10-NEXT: ; implicit-def: $vgpr5
337333
; GFX10-NEXT: s_and_saveexec_b32 s8, s4
338334
; GFX10-NEXT: s_cbranch_execz .LBB4_1
@@ -410,7 +406,7 @@ define amdgpu_ps void @divergent_i1_freeze_used_outside_loop(i32 %n, ptr addrspa
410406
; GFX10-LABEL: divergent_i1_freeze_used_outside_loop:
411407
; GFX10: ; %bb.0: ; %entry
412408
; GFX10-NEXT: s_mov_b32 s0, 0
413-
; GFX10-NEXT: s_mov_b32 s3, 1
409+
; GFX10-NEXT: s_mov_b32 s3, -1
414410
; GFX10-NEXT: v_mov_b32_e32 v5, s0
415411
; GFX10-NEXT: ; implicit-def: $sgpr1
416412
; GFX10-NEXT: ; implicit-def: $sgpr2

0 commit comments

Comments
 (0)