Skip to content

Commit a0a2f8f

Browse files
AMDGPU/GlobalISel: Fix inst-selection of ballot
Both input and output of ballot are lane-masks: the result is a lane-mask with 'S32/S64 LLT and SGPR bank', and the input is a lane-mask with 'S1 LLT and VCC reg bank'. Ballot copies bits from the input lane-mask for all active lanes and puts 0 for inactive lanes. GlobalISel did not set 0 in the result for inactive lanes for non-constant input.
1 parent bfde178 commit a0a2f8f

File tree

63 files changed

+802
-306
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

63 files changed

+802
-306
lines changed

llvm/docs/AMDGPUUsage.rst

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1369,6 +1369,12 @@ The AMDGPU backend implements the following LLVM IR intrinsics.
13691369
sign-extended from the width of the underlying PC hardware register even on
13701370
processors where the s_getpc_b64 instruction returns a zero-extended value.
13711371

1372+
llvm.amdgcn.ballot Returns a bitfield(i32 or i64) containing the result of its i1 argument
1373+
in all active lanes, and zero in all inactive lanes.
1374+
Provides a way to convert i1 in LLVM IR to i32 or i64 lane mask - bitfield
1375+
used by hardware to control active lanes when used in EXEC register.
1376+
For example, ballot(i1 true) returns the EXEC mask.
1377+
13721378
============================================== ==========================================================
13731379

13741380
.. TODO::

llvm/include/llvm/IR/IntrinsicsAMDGPU.td

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2085,6 +2085,8 @@ def int_amdgcn_fcmp :
20852085
[IntrNoMem, IntrConvergent,
20862086
ImmArg<ArgIndex<2>>, IntrWillReturn, IntrNoCallback, IntrNoFree]>;
20872087

2088+
// Returns a bitfield (i32 or i64) containing the result of its i1 argument
2089+
// in all active lanes, and zero in all inactive lanes.
20882090
def int_amdgcn_ballot :
20892091
Intrinsic<[llvm_anyint_ty], [llvm_i1_ty],
20902092
[IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>;

llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp

Lines changed: 117 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,9 @@
2525
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
2626
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
2727
#include "llvm/CodeGen/MachineFrameInfo.h"
28+
#include "llvm/CodeGen/MachineInstr.h"
29+
#include "llvm/CodeGen/MachineInstrBuilder.h"
30+
#include "llvm/CodeGen/MachineOperand.h"
2831
#include "llvm/IR/DiagnosticInfo.h"
2932
#include "llvm/IR/IntrinsicsAMDGPU.h"
3033
#include <optional>
@@ -1429,34 +1432,131 @@ bool AMDGPUInstructionSelector::selectBallot(MachineInstr &I) const {
14291432
std::optional<ValueAndVReg> Arg =
14301433
getIConstantVRegValWithLookThrough(I.getOperand(2).getReg(), *MRI);
14311434

1432-
const auto BuildCopy = [&](Register SrcReg) {
1433-
if (Size == STI.getWavefrontSize()) {
1434-
BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), DstReg)
1435-
.addReg(SrcReg);
1436-
return;
1435+
const auto getCmpInput = [&]() -> MachineInstr * {
1436+
MachineInstr *SrcMI = getDefIgnoringCopies(I.getOperand(2).getReg(), *MRI);
1437+
// Try to fold sgpr compare.
1438+
if (SrcMI->getOpcode() == AMDGPU::G_TRUNC)
1439+
SrcMI = MRI->getVRegDef(SrcMI->getOperand(1).getReg());
1440+
1441+
if (SrcMI->getOpcode() == AMDGPU::G_ICMP ||
1442+
SrcMI->getOpcode() == AMDGPU::G_FCMP)
1443+
return SrcMI;
1444+
return nullptr;
1445+
};
1446+
1447+
const auto FoldCmp = [&](Register Dst, MachineInstr *CmpMI) -> bool {
1448+
// Fold ballot of a compare. Active lanes when the ballot is executed need
1449+
// to also be active when the compare is executed for this fold to be
1450+
// correct. If an inactive lane on compare becomes active for the ballot,
1451+
// divergent control flow is involved. The compare is in a divergent branch
1452+
// and needs to go through phi before being used by the ballot, the ballot
1453+
// is in a block that merged control flow. Using the compare directly in the
1454+
// ballot implies that active lanes for the ballot are a subset of active
1455+
// lanes for the compare.
1456+
auto Pred = cast<GAnyCmp>(CmpMI)->getCond();
1457+
Register Src0Reg = CmpMI->getOperand(2).getReg();
1458+
Register Src1Reg = CmpMI->getOperand(3).getReg();
1459+
unsigned OpSize = MRI->getType(Src0Reg).getSizeInBits();
1460+
1461+
int CmpOpcode = getV_CMPOpcode(Pred, OpSize, *Subtarget);
1462+
if (CmpOpcode == -1)
1463+
return false;
1464+
1465+
const auto constrainToVGPR = [&](Register Reg,
1466+
MachineInstr *InsertPt) -> Register {
1467+
if (RBI.getRegBank(Reg, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID)
1468+
return Reg;
1469+
Register VgprReg =
1470+
MRI->createVirtualRegister(TRI.getVGPRClassForBitWidth(OpSize));
1471+
BuildMI(*BB, InsertPt, DL, TII.get(AMDGPU::COPY), VgprReg).addReg(Reg);
1472+
return VgprReg;
1473+
};
1474+
1475+
MachineInstrBuilder Cmp;
1476+
if (CmpMI->getOpcode() == AMDGPU::G_ICMP) {
1477+
Cmp = BuildMI(*BB, &I, DL, TII.get(CmpOpcode), Dst);
1478+
Cmp.addReg(constrainToVGPR(Src0Reg, Cmp))
1479+
.addReg(constrainToVGPR(Src1Reg, Cmp));
1480+
} else {
1481+
auto [Src0, Src0Mods] = selectVOP3ModsImpl(Src0Reg);
1482+
auto [Src1, Src1Mods] = selectVOP3ModsImpl(Src1Reg);
1483+
Cmp = BuildMI(*BB, &I, DL, TII.get(CmpOpcode), Dst);
1484+
Cmp.addImm(Src0Mods)
1485+
.addReg(constrainToVGPR(Src0, Cmp))
1486+
.addImm(Src1Mods)
1487+
.addReg(constrainToVGPR(Src1, Cmp))
1488+
.addImm(0);
14371489
}
14381490

1439-
// If emitting a i64 ballot in wave32, fill the upper bits with zeroes.
1440-
Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
1441-
BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_MOV_B32), HiReg).addImm(0);
1442-
BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
1443-
.addReg(SrcReg)
1491+
return constrainSelectedInstRegOperands(*Cmp, TII, TRI, RBI);
1492+
};
1493+
1494+
const auto BuildAnd = [&](unsigned Opcode, Register Dst, Register Src,
1495+
Register Exec) -> bool {
1496+
auto And = BuildMI(*BB, &I, DL, TII.get(Opcode), Dst)
1497+
.addReg(Src)
1498+
.addReg(Exec)
1499+
.setOperandDead(3); // Dead scc
1500+
return constrainSelectedInstRegOperands(*And, TII, TRI, RBI);
1501+
};
1502+
1503+
const auto BuildREG_SEQUENCE = [&](Register Dst, Register Lo, Register Hi) {
1504+
BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), Dst)
1505+
.addReg(Lo)
14441506
.addImm(AMDGPU::sub0)
1445-
.addReg(HiReg)
1507+
.addReg(Hi)
14461508
.addImm(AMDGPU::sub1);
14471509
};
14481510

14491511
if (Arg) {
1450-
const int64_t Value = Arg->Value.getSExtValue();
1512+
const int64_t Value = Arg->Value.getZExtValue();
14511513
if (Value == 0) {
1514+
// DstReg(32or64) = S_MOV 0
14521515
unsigned Opcode = Is64 ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
14531516
BuildMI(*BB, &I, DL, TII.get(Opcode), DstReg).addImm(0);
1454-
} else if (Value == -1) // all ones
1455-
BuildCopy(IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC);
1456-
else
1517+
} else if (Value == 1) {
1518+
if (Size == STI.getWavefrontSize()) {
1519+
// DstReg(32or64) = COPY EXEC
1520+
BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), DstReg)
1521+
.addReg(TRI.getExec());
1522+
} else {
1523+
// DstReg(64) = REG_SEQUENCE EXEC_LO, 0
1524+
Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
1525+
BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_MOV_B32), HiReg).addImm(0);
1526+
BuildREG_SEQUENCE(DstReg, TRI.getExec(), HiReg);
1527+
}
1528+
} else
14571529
return false;
1458-
} else
1459-
BuildCopy(I.getOperand(2).getReg());
1530+
} else {
1531+
Register SrcReg = I.getOperand(2).getReg();
1532+
if (Size == STI.getWavefrontSize()) {
1533+
if (MachineInstr *Cmp = getCmpInput()) {
1534+
// DstReg(32or64) = V_CMP...
1535+
if (!FoldCmp(DstReg, Cmp))
1536+
return false;
1537+
} else {
1538+
// DstReg(32or64) = AND SrcReg, EXEC
1539+
unsigned AndOpc = IsWave32 ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
1540+
if (!BuildAnd(AndOpc, DstReg, SrcReg, TRI.getExec()))
1541+
return false;
1542+
}
1543+
} else {
1544+
Register LoReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
1545+
if (MachineInstr *Cmp = getCmpInput()) {
1546+
// LoReg(32) = V_CMP...
1547+
if (!FoldCmp(LoReg, Cmp))
1548+
return false;
1549+
} else {
1550+
// LoReg(32) = AND SrcReg, EXEC
1551+
if (!BuildAnd(AMDGPU::S_AND_B32, LoReg, SrcReg, AMDGPU::EXEC_LO))
1552+
return false;
1553+
}
1554+
// DstReg(64) = REG_SEQUENCE (LoReg(32), EXEC_LO), 0
1555+
Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
1556+
BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_MOV_B32), HiReg).addImm(0);
1557+
BuildREG_SEQUENCE(DstReg, LoReg, HiReg);
1558+
}
1559+
}
14601560

14611561
I.eraseFromParent();
14621562
return true;

llvm/test/CodeGen/AMDGPU/GlobalISel/image-waterfall-loop-O0.ll

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -138,6 +138,7 @@ define <4 x float> @waterfall_loop(<8 x i32> %vgpr_srd) {
138138
; CHECK-NEXT: s_and_b32 s4, s4, s5
139139
; CHECK-NEXT: v_cmp_eq_u64_e64 s5, s[6:7], v[1:2]
140140
; CHECK-NEXT: s_and_b32 s4, s4, s5
141+
; CHECK-NEXT: s_and_b32 s4, s4, exec_lo
141142
; CHECK-NEXT: s_and_saveexec_b32 s4, s4
142143
; CHECK-NEXT: v_writelane_b32 v0, s4, 13
143144
; CHECK-NEXT: s_or_saveexec_b32 s21, -1

llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i32.ll

Lines changed: 15 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,8 @@ define amdgpu_cs i32 @non_compare(i32 %x) {
3333
; CHECK-LABEL: non_compare:
3434
; CHECK: ; %bb.0:
3535
; CHECK-NEXT: v_and_b32_e32 v0, 1, v0
36-
; CHECK-NEXT: v_cmp_ne_u32_e64 s0, 0, v0
36+
; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
37+
; CHECK-NEXT: s_and_b32 s0, vcc_lo, exec_lo
3738
; CHECK-NEXT: ; return to shader part epilog
3839
%trunc = trunc i32 %x to i1
3940
%ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %trunc)
@@ -89,7 +90,8 @@ define amdgpu_cs i32 @branch_divergent_ballot_ne_zero_non_compare(i32 %v) {
8990
; CHECK: ; %bb.0:
9091
; CHECK-NEXT: v_and_b32_e32 v0, 1, v0
9192
; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
92-
; CHECK-NEXT: s_cmp_eq_u32 vcc_lo, 0
93+
; CHECK-NEXT: s_and_b32 s0, vcc_lo, exec_lo
94+
; CHECK-NEXT: s_cmp_eq_u32 s0, 0
9395
; CHECK-NEXT: s_cbranch_scc1 .LBB7_2
9496
; CHECK-NEXT: ; %bb.1: ; %true
9597
; CHECK-NEXT: s_mov_b32 s0, 42
@@ -113,6 +115,7 @@ define amdgpu_cs i32 @branch_uniform_ballot_ne_zero_non_compare(i32 inreg %v) {
113115
; CHECK: ; %bb.0:
114116
; CHECK-NEXT: s_and_b32 s0, 1, s0
115117
; CHECK-NEXT: v_cmp_ne_u32_e64 s0, 0, s0
118+
; CHECK-NEXT: s_and_b32 s0, s0, exec_lo
116119
; CHECK-NEXT: s_cmp_eq_u32 s0, 0
117120
; CHECK-NEXT: s_cbranch_scc1 .LBB8_2
118121
; CHECK-NEXT: ; %bb.1: ; %true
@@ -137,7 +140,8 @@ define amdgpu_cs i32 @branch_divergent_ballot_eq_zero_non_compare(i32 %v) {
137140
; CHECK: ; %bb.0:
138141
; CHECK-NEXT: v_and_b32_e32 v0, 1, v0
139142
; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
140-
; CHECK-NEXT: s_cmp_lg_u32 vcc_lo, 0
143+
; CHECK-NEXT: s_and_b32 s0, vcc_lo, exec_lo
144+
; CHECK-NEXT: s_cmp_lg_u32 s0, 0
141145
; CHECK-NEXT: s_cbranch_scc0 .LBB9_2
142146
; CHECK-NEXT: ; %bb.1: ; %false
143147
; CHECK-NEXT: s_mov_b32 s0, 33
@@ -161,6 +165,7 @@ define amdgpu_cs i32 @branch_uniform_ballot_eq_zero_non_compare(i32 inreg %v) {
161165
; CHECK: ; %bb.0:
162166
; CHECK-NEXT: s_and_b32 s0, 1, s0
163167
; CHECK-NEXT: v_cmp_ne_u32_e64 s0, 0, s0
168+
; CHECK-NEXT: s_and_b32 s0, s0, exec_lo
164169
; CHECK-NEXT: s_cmp_lg_u32 s0, 0
165170
; CHECK-NEXT: s_cbranch_scc0 .LBB10_2
166171
; CHECK-NEXT: ; %bb.1: ; %false
@@ -206,10 +211,7 @@ false:
206211
define amdgpu_cs i32 @branch_uniform_ballot_ne_zero_compare(i32 inreg %v) {
207212
; CHECK-LABEL: branch_uniform_ballot_ne_zero_compare:
208213
; CHECK: ; %bb.0:
209-
; CHECK-NEXT: s_cmp_lt_u32 s0, 12
210-
; CHECK-NEXT: s_cselect_b32 s0, 1, 0
211-
; CHECK-NEXT: s_and_b32 s0, 1, s0
212-
; CHECK-NEXT: v_cmp_ne_u32_e64 s0, 0, s0
214+
; CHECK-NEXT: v_cmp_lt_u32_e64 s0, s0, 12
213215
; CHECK-NEXT: s_cmp_eq_u32 s0, 0
214216
; CHECK-NEXT: s_cbranch_scc1 .LBB12_2
215217
; CHECK-NEXT: ; %bb.1: ; %true
@@ -255,10 +257,7 @@ false:
255257
define amdgpu_cs i32 @branch_uniform_ballot_eq_zero_compare(i32 inreg %v) {
256258
; CHECK-LABEL: branch_uniform_ballot_eq_zero_compare:
257259
; CHECK: ; %bb.0:
258-
; CHECK-NEXT: s_cmp_lt_u32 s0, 12
259-
; CHECK-NEXT: s_cselect_b32 s0, 1, 0
260-
; CHECK-NEXT: s_and_b32 s0, 1, s0
261-
; CHECK-NEXT: v_cmp_ne_u32_e64 s0, 0, s0
260+
; CHECK-NEXT: v_cmp_lt_u32_e64 s0, s0, 12
262261
; CHECK-NEXT: s_cmp_lg_u32 s0, 0
263262
; CHECK-NEXT: s_cbranch_scc0 .LBB14_2
264263
; CHECK-NEXT: ; %bb.1: ; %false
@@ -284,6 +283,7 @@ define amdgpu_cs i32 @branch_divergent_ballot_ne_zero_and(i32 %v1, i32 %v2) {
284283
; CHECK-NEXT: v_cmp_gt_u32_e32 vcc_lo, 12, v0
285284
; CHECK-NEXT: v_cmp_lt_u32_e64 s0, 34, v1
286285
; CHECK-NEXT: s_and_b32 s0, vcc_lo, s0
286+
; CHECK-NEXT: s_and_b32 s0, s0, exec_lo
287287
; CHECK-NEXT: s_cmp_eq_u32 s0, 0
288288
; CHECK-NEXT: s_cbranch_scc1 .LBB15_2
289289
; CHECK-NEXT: ; %bb.1: ; %true
@@ -315,6 +315,7 @@ define amdgpu_cs i32 @branch_uniform_ballot_ne_zero_and(i32 inreg %v1, i32 inreg
315315
; CHECK-NEXT: s_and_b32 s0, s0, s1
316316
; CHECK-NEXT: s_and_b32 s0, 1, s0
317317
; CHECK-NEXT: v_cmp_ne_u32_e64 s0, 0, s0
318+
; CHECK-NEXT: s_and_b32 s0, s0, exec_lo
318319
; CHECK-NEXT: s_cmp_eq_u32 s0, 0
319320
; CHECK-NEXT: s_cbranch_scc1 .LBB16_2
320321
; CHECK-NEXT: ; %bb.1: ; %true
@@ -342,6 +343,7 @@ define amdgpu_cs i32 @branch_divergent_ballot_eq_zero_and(i32 %v1, i32 %v2) {
342343
; CHECK-NEXT: v_cmp_gt_u32_e32 vcc_lo, 12, v0
343344
; CHECK-NEXT: v_cmp_lt_u32_e64 s0, 34, v1
344345
; CHECK-NEXT: s_and_b32 s0, vcc_lo, s0
346+
; CHECK-NEXT: s_and_b32 s0, s0, exec_lo
345347
; CHECK-NEXT: s_cmp_lg_u32 s0, 0
346348
; CHECK-NEXT: s_cbranch_scc0 .LBB17_2
347349
; CHECK-NEXT: ; %bb.1: ; %false
@@ -373,6 +375,7 @@ define amdgpu_cs i32 @branch_uniform_ballot_eq_zero_and(i32 inreg %v1, i32 inreg
373375
; CHECK-NEXT: s_and_b32 s0, s0, s1
374376
; CHECK-NEXT: s_and_b32 s0, 1, s0
375377
; CHECK-NEXT: v_cmp_ne_u32_e64 s0, 0, s0
378+
; CHECK-NEXT: s_and_b32 s0, s0, exec_lo
376379
; CHECK-NEXT: s_cmp_lg_u32 s0, 0
377380
; CHECK-NEXT: s_cbranch_scc0 .LBB18_2
378381
; CHECK-NEXT: ; %bb.1: ; %false
@@ -397,10 +400,7 @@ false:
397400
define amdgpu_cs i32 @branch_uniform_ballot_sgt_N_compare(i32 inreg %v) {
398401
; CHECK-LABEL: branch_uniform_ballot_sgt_N_compare:
399402
; CHECK: ; %bb.0:
400-
; CHECK-NEXT: s_cmp_lt_u32 s0, 12
401-
; CHECK-NEXT: s_cselect_b32 s0, 1, 0
402-
; CHECK-NEXT: s_and_b32 s0, 1, s0
403-
; CHECK-NEXT: v_cmp_ne_u32_e64 s0, 0, s0
403+
; CHECK-NEXT: v_cmp_lt_u32_e64 s0, s0, 12
404404
; CHECK-NEXT: s_cmp_le_i32 s0, 22
405405
; CHECK-NEXT: s_cbranch_scc1 .LBB19_2
406406
; CHECK-NEXT: ; %bb.1: ; %true

0 commit comments

Comments
 (0)