Skip to content

Commit 4bcf7e7

Browse files
AMDGPU/GlobalISel: Fix inst-selection of ballot
Both the input and the output of ballot are lane masks: the result is a lane mask with an S32/S64 LLT on the SGPR bank, and the input is a lane mask with an S1 LLT on the VCC register bank. Ballot copies the bits of the input lane mask for all active lanes and writes 0 for inactive lanes. For a non-constant input, GlobalISel did not set the result bits of inactive lanes to 0.
1 parent 22829f7 commit 4bcf7e7

File tree

64 files changed

+865
-359
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

64 files changed

+865
-359
lines changed

llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp

Lines changed: 43 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -1429,34 +1429,59 @@ bool AMDGPUInstructionSelector::selectBallot(MachineInstr &I) const {
14291429
std::optional<ValueAndVReg> Arg =
14301430
getIConstantVRegValWithLookThrough(I.getOperand(2).getReg(), *MRI);
14311431

1432-
const auto BuildCopy = [&](Register SrcReg) {
1433-
if (Size == STI.getWavefrontSize()) {
1434-
BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), DstReg)
1435-
.addReg(SrcReg);
1436-
return;
1437-
}
1432+
const auto BuildAnd = [&](unsigned Opcode, Register Dst, Register Src,
1433+
Register Exec) -> bool {
1434+
auto And = BuildMI(*BB, &I, DL, TII.get(Opcode), Dst)
1435+
.addReg(Src)
1436+
.addReg(Exec)
1437+
.setOperandDead(3); // Dead scc
1438+
return constrainSelectedInstRegOperands(*And, TII, TRI, RBI);
1439+
};
14381440

1439-
// If emitting a i64 ballot in wave32, fill the upper bits with zeroes.
1440-
Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
1441-
BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_MOV_B32), HiReg).addImm(0);
1442-
BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
1443-
.addReg(SrcReg)
1441+
const auto BuildREG_SEQUENCE = [&](Register Dst, Register Lo, Register Hi) {
1442+
BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), Dst)
1443+
.addReg(Lo)
14441444
.addImm(AMDGPU::sub0)
1445-
.addReg(HiReg)
1445+
.addReg(Hi)
14461446
.addImm(AMDGPU::sub1);
14471447
};
14481448

14491449
if (Arg) {
1450-
const int64_t Value = Arg->Value.getSExtValue();
1450+
const int64_t Value = Arg->Value.getZExtValue();
14511451
if (Value == 0) {
1452+
// DstReg(32or64) = S_MOV 0
14521453
unsigned Opcode = Is64 ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
14531454
BuildMI(*BB, &I, DL, TII.get(Opcode), DstReg).addImm(0);
1454-
} else if (Value == -1) // all ones
1455-
BuildCopy(IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC);
1456-
else
1455+
} else if (Value == 1) {
1456+
if (Size == STI.getWavefrontSize()) {
1457+
// DstReg(32or64) = COPY EXEC
1458+
BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), DstReg)
1459+
.addReg(TRI.getExec());
1460+
} else {
1461+
// DstReg(64) = REG_SEQUENCE EXEC_LO, 0
1462+
Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
1463+
BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_MOV_B32), HiReg).addImm(0);
1464+
BuildREG_SEQUENCE(DstReg, TRI.getExec(), HiReg);
1465+
}
1466+
} else
14571467
return false;
1458-
} else
1459-
BuildCopy(I.getOperand(2).getReg());
1468+
} else {
1469+
Register SrcReg = I.getOperand(2).getReg();
1470+
if (Size == STI.getWavefrontSize()) {
1471+
// DstReg(32or64) = AND SrcReg, EXEC
1472+
unsigned AndOpc = IsWave32 ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
1473+
if (!BuildAnd(AndOpc, DstReg, SrcReg, TRI.getExec()))
1474+
return false;
1475+
} else {
1476+
// DstReg(64) = REG_SEQUENCE (AND SrcReg(32), EXEC_LO), 0
1477+
Register LoReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
1478+
if (!BuildAnd(AMDGPU::S_AND_B32, LoReg, SrcReg, AMDGPU::EXEC_LO))
1479+
return false;
1480+
Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
1481+
BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_MOV_B32), HiReg).addImm(0);
1482+
BuildREG_SEQUENCE(DstReg, LoReg, HiReg);
1483+
}
1484+
}
14601485

14611486
I.eraseFromParent();
14621487
return true;

llvm/test/CodeGen/AMDGPU/GlobalISel/image-waterfall-loop-O0.ll

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -138,6 +138,7 @@ define <4 x float> @waterfall_loop(<8 x i32> %vgpr_srd) {
138138
; CHECK-NEXT: s_and_b32 s4, s4, s5
139139
; CHECK-NEXT: v_cmp_eq_u64_e64 s5, s[6:7], v[1:2]
140140
; CHECK-NEXT: s_and_b32 s4, s4, s5
141+
; CHECK-NEXT: s_and_b32 s4, s4, exec_lo
141142
; CHECK-NEXT: s_and_saveexec_b32 s4, s4
142143
; CHECK-NEXT: v_writelane_b32 v0, s4, 13
143144
; CHECK-NEXT: s_or_saveexec_b32 s21, -1

llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i32.ll

Lines changed: 27 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,8 @@ define amdgpu_cs i32 @non_compare(i32 %x) {
3333
; CHECK-LABEL: non_compare:
3434
; CHECK: ; %bb.0:
3535
; CHECK-NEXT: v_and_b32_e32 v0, 1, v0
36-
; CHECK-NEXT: v_cmp_ne_u32_e64 s0, 0, v0
36+
; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
37+
; CHECK-NEXT: s_and_b32 s0, vcc_lo, exec_lo
3738
; CHECK-NEXT: ; return to shader part epilog
3839
%trunc = trunc i32 %x to i1
3940
%ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %trunc)
@@ -45,7 +46,8 @@ define amdgpu_cs i32 @non_compare(i32 %x) {
4546
define amdgpu_cs i32 @compare_ints(i32 %x, i32 %y) {
4647
; CHECK-LABEL: compare_ints:
4748
; CHECK: ; %bb.0:
48-
; CHECK-NEXT: v_cmp_eq_u32_e64 s0, v0, v1
49+
; CHECK-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
50+
; CHECK-NEXT: s_and_b32 s0, vcc_lo, exec_lo
4951
; CHECK-NEXT: ; return to shader part epilog
5052
%cmp = icmp eq i32 %x, %y
5153
%ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %cmp)
@@ -55,7 +57,8 @@ define amdgpu_cs i32 @compare_ints(i32 %x, i32 %y) {
5557
define amdgpu_cs i32 @compare_int_with_constant(i32 %x) {
5658
; CHECK-LABEL: compare_int_with_constant:
5759
; CHECK: ; %bb.0:
58-
; CHECK-NEXT: v_cmp_le_i32_e64 s0, 0x63, v0
60+
; CHECK-NEXT: v_cmp_le_i32_e32 vcc_lo, 0x63, v0
61+
; CHECK-NEXT: s_and_b32 s0, vcc_lo, exec_lo
5962
; CHECK-NEXT: ; return to shader part epilog
6063
%cmp = icmp sge i32 %x, 99
6164
%ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %cmp)
@@ -65,7 +68,8 @@ define amdgpu_cs i32 @compare_int_with_constant(i32 %x) {
6568
define amdgpu_cs i32 @compare_floats(float %x, float %y) {
6669
; CHECK-LABEL: compare_floats:
6770
; CHECK: ; %bb.0:
68-
; CHECK-NEXT: v_cmp_gt_f32_e64 s0, v0, v1
71+
; CHECK-NEXT: v_cmp_gt_f32_e32 vcc_lo, v0, v1
72+
; CHECK-NEXT: s_and_b32 s0, vcc_lo, exec_lo
6973
; CHECK-NEXT: ; return to shader part epilog
7074
%cmp = fcmp ogt float %x, %y
7175
%ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %cmp)
@@ -76,7 +80,8 @@ define amdgpu_cs i32 @ctpop_of_ballot(float %x, float %y) {
7680
; CHECK-LABEL: ctpop_of_ballot:
7781
; CHECK: ; %bb.0:
7882
; CHECK-NEXT: v_cmp_gt_f32_e32 vcc_lo, v0, v1
79-
; CHECK-NEXT: s_bcnt1_i32_b32 s0, vcc_lo
83+
; CHECK-NEXT: s_and_b32 s0, vcc_lo, exec_lo
84+
; CHECK-NEXT: s_bcnt1_i32_b32 s0, s0
8085
; CHECK-NEXT: ; return to shader part epilog
8186
%cmp = fcmp ogt float %x, %y
8287
%ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %cmp)
@@ -89,7 +94,8 @@ define amdgpu_cs i32 @branch_divergent_ballot_ne_zero_non_compare(i32 %v) {
8994
; CHECK: ; %bb.0:
9095
; CHECK-NEXT: v_and_b32_e32 v0, 1, v0
9196
; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
92-
; CHECK-NEXT: s_cmp_eq_u32 vcc_lo, 0
97+
; CHECK-NEXT: s_and_b32 s0, vcc_lo, exec_lo
98+
; CHECK-NEXT: s_cmp_eq_u32 s0, 0
9399
; CHECK-NEXT: s_cbranch_scc1 .LBB7_2
94100
; CHECK-NEXT: ; %bb.1: ; %true
95101
; CHECK-NEXT: s_mov_b32 s0, 42
@@ -113,6 +119,7 @@ define amdgpu_cs i32 @branch_uniform_ballot_ne_zero_non_compare(i32 inreg %v) {
113119
; CHECK: ; %bb.0:
114120
; CHECK-NEXT: s_and_b32 s0, 1, s0
115121
; CHECK-NEXT: v_cmp_ne_u32_e64 s0, 0, s0
122+
; CHECK-NEXT: s_and_b32 s0, s0, exec_lo
116123
; CHECK-NEXT: s_cmp_eq_u32 s0, 0
117124
; CHECK-NEXT: s_cbranch_scc1 .LBB8_2
118125
; CHECK-NEXT: ; %bb.1: ; %true
@@ -137,7 +144,8 @@ define amdgpu_cs i32 @branch_divergent_ballot_eq_zero_non_compare(i32 %v) {
137144
; CHECK: ; %bb.0:
138145
; CHECK-NEXT: v_and_b32_e32 v0, 1, v0
139146
; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
140-
; CHECK-NEXT: s_cmp_lg_u32 vcc_lo, 0
147+
; CHECK-NEXT: s_and_b32 s0, vcc_lo, exec_lo
148+
; CHECK-NEXT: s_cmp_lg_u32 s0, 0
141149
; CHECK-NEXT: s_cbranch_scc0 .LBB9_2
142150
; CHECK-NEXT: ; %bb.1: ; %false
143151
; CHECK-NEXT: s_mov_b32 s0, 33
@@ -161,6 +169,7 @@ define amdgpu_cs i32 @branch_uniform_ballot_eq_zero_non_compare(i32 inreg %v) {
161169
; CHECK: ; %bb.0:
162170
; CHECK-NEXT: s_and_b32 s0, 1, s0
163171
; CHECK-NEXT: v_cmp_ne_u32_e64 s0, 0, s0
172+
; CHECK-NEXT: s_and_b32 s0, s0, exec_lo
164173
; CHECK-NEXT: s_cmp_lg_u32 s0, 0
165174
; CHECK-NEXT: s_cbranch_scc0 .LBB10_2
166175
; CHECK-NEXT: ; %bb.1: ; %false
@@ -184,7 +193,8 @@ define amdgpu_cs i32 @branch_divergent_ballot_ne_zero_compare(i32 %v) {
184193
; CHECK-LABEL: branch_divergent_ballot_ne_zero_compare:
185194
; CHECK: ; %bb.0:
186195
; CHECK-NEXT: v_cmp_gt_u32_e32 vcc_lo, 12, v0
187-
; CHECK-NEXT: s_cmp_eq_u32 vcc_lo, 0
196+
; CHECK-NEXT: s_and_b32 s0, vcc_lo, exec_lo
197+
; CHECK-NEXT: s_cmp_eq_u32 s0, 0
188198
; CHECK-NEXT: s_cbranch_scc1 .LBB11_2
189199
; CHECK-NEXT: ; %bb.1: ; %true
190200
; CHECK-NEXT: s_mov_b32 s0, 42
@@ -210,6 +220,7 @@ define amdgpu_cs i32 @branch_uniform_ballot_ne_zero_compare(i32 inreg %v) {
210220
; CHECK-NEXT: s_cselect_b32 s0, 1, 0
211221
; CHECK-NEXT: s_and_b32 s0, 1, s0
212222
; CHECK-NEXT: v_cmp_ne_u32_e64 s0, 0, s0
223+
; CHECK-NEXT: s_and_b32 s0, s0, exec_lo
213224
; CHECK-NEXT: s_cmp_eq_u32 s0, 0
214225
; CHECK-NEXT: s_cbranch_scc1 .LBB12_2
215226
; CHECK-NEXT: ; %bb.1: ; %true
@@ -233,7 +244,8 @@ define amdgpu_cs i32 @branch_divergent_ballot_eq_zero_compare(i32 %v) {
233244
; CHECK-LABEL: branch_divergent_ballot_eq_zero_compare:
234245
; CHECK: ; %bb.0:
235246
; CHECK-NEXT: v_cmp_gt_u32_e32 vcc_lo, 12, v0
236-
; CHECK-NEXT: s_cmp_lg_u32 vcc_lo, 0
247+
; CHECK-NEXT: s_and_b32 s0, vcc_lo, exec_lo
248+
; CHECK-NEXT: s_cmp_lg_u32 s0, 0
237249
; CHECK-NEXT: s_cbranch_scc0 .LBB13_2
238250
; CHECK-NEXT: ; %bb.1: ; %false
239251
; CHECK-NEXT: s_mov_b32 s0, 33
@@ -259,6 +271,7 @@ define amdgpu_cs i32 @branch_uniform_ballot_eq_zero_compare(i32 inreg %v) {
259271
; CHECK-NEXT: s_cselect_b32 s0, 1, 0
260272
; CHECK-NEXT: s_and_b32 s0, 1, s0
261273
; CHECK-NEXT: v_cmp_ne_u32_e64 s0, 0, s0
274+
; CHECK-NEXT: s_and_b32 s0, s0, exec_lo
262275
; CHECK-NEXT: s_cmp_lg_u32 s0, 0
263276
; CHECK-NEXT: s_cbranch_scc0 .LBB14_2
264277
; CHECK-NEXT: ; %bb.1: ; %false
@@ -284,6 +297,7 @@ define amdgpu_cs i32 @branch_divergent_ballot_ne_zero_and(i32 %v1, i32 %v2) {
284297
; CHECK-NEXT: v_cmp_gt_u32_e32 vcc_lo, 12, v0
285298
; CHECK-NEXT: v_cmp_lt_u32_e64 s0, 34, v1
286299
; CHECK-NEXT: s_and_b32 s0, vcc_lo, s0
300+
; CHECK-NEXT: s_and_b32 s0, s0, exec_lo
287301
; CHECK-NEXT: s_cmp_eq_u32 s0, 0
288302
; CHECK-NEXT: s_cbranch_scc1 .LBB15_2
289303
; CHECK-NEXT: ; %bb.1: ; %true
@@ -315,6 +329,7 @@ define amdgpu_cs i32 @branch_uniform_ballot_ne_zero_and(i32 inreg %v1, i32 inreg
315329
; CHECK-NEXT: s_and_b32 s0, s0, s1
316330
; CHECK-NEXT: s_and_b32 s0, 1, s0
317331
; CHECK-NEXT: v_cmp_ne_u32_e64 s0, 0, s0
332+
; CHECK-NEXT: s_and_b32 s0, s0, exec_lo
318333
; CHECK-NEXT: s_cmp_eq_u32 s0, 0
319334
; CHECK-NEXT: s_cbranch_scc1 .LBB16_2
320335
; CHECK-NEXT: ; %bb.1: ; %true
@@ -342,6 +357,7 @@ define amdgpu_cs i32 @branch_divergent_ballot_eq_zero_and(i32 %v1, i32 %v2) {
342357
; CHECK-NEXT: v_cmp_gt_u32_e32 vcc_lo, 12, v0
343358
; CHECK-NEXT: v_cmp_lt_u32_e64 s0, 34, v1
344359
; CHECK-NEXT: s_and_b32 s0, vcc_lo, s0
360+
; CHECK-NEXT: s_and_b32 s0, s0, exec_lo
345361
; CHECK-NEXT: s_cmp_lg_u32 s0, 0
346362
; CHECK-NEXT: s_cbranch_scc0 .LBB17_2
347363
; CHECK-NEXT: ; %bb.1: ; %false
@@ -373,6 +389,7 @@ define amdgpu_cs i32 @branch_uniform_ballot_eq_zero_and(i32 inreg %v1, i32 inreg
373389
; CHECK-NEXT: s_and_b32 s0, s0, s1
374390
; CHECK-NEXT: s_and_b32 s0, 1, s0
375391
; CHECK-NEXT: v_cmp_ne_u32_e64 s0, 0, s0
392+
; CHECK-NEXT: s_and_b32 s0, s0, exec_lo
376393
; CHECK-NEXT: s_cmp_lg_u32 s0, 0
377394
; CHECK-NEXT: s_cbranch_scc0 .LBB18_2
378395
; CHECK-NEXT: ; %bb.1: ; %false
@@ -401,6 +418,7 @@ define amdgpu_cs i32 @branch_uniform_ballot_sgt_N_compare(i32 inreg %v) {
401418
; CHECK-NEXT: s_cselect_b32 s0, 1, 0
402419
; CHECK-NEXT: s_and_b32 s0, 1, s0
403420
; CHECK-NEXT: v_cmp_ne_u32_e64 s0, 0, s0
421+
; CHECK-NEXT: s_and_b32 s0, s0, exec_lo
404422
; CHECK-NEXT: s_cmp_le_i32 s0, 22
405423
; CHECK-NEXT: s_cbranch_scc1 .LBB19_2
406424
; CHECK-NEXT: ; %bb.1: ; %true

0 commit comments

Comments
 (0)